Fix T92985: CUDA errors with Cycles film convert kernels

rB3a4c8f406a3a3bf0627477c6183a594fa707a6e2 changed the macros that create the film
convert kernel entry points, but in the process accidentally changed the parameter definition
of one of them (which caused CUDA launch and misaligned-address errors) and changed its
implementation as well. This restores the correct implementation from before.
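For reference, the float variant of the entry point once again takes a plain `ccl_global float *pixels`
buffer (instead of a `ccl_global uchar4 *rgba` pointer that the result was then stored through as a
`float4`, which is presumably what triggered the misaligned-address errors), and the pass pixel is
written straight into that buffer. Condensed from the film convert hunk below, with the macro line
continuations dropped (`##variant` is expanded by `KERNEL_FILM_CONVERT_VARIANT`):

  /* Destination is the float pixel buffer, indexed with the film-convert pixel stride. */
  ccl_global float *pixel = pixels +
                            (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride;

  film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);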

In addition, the `ccl_gpu_kernel_threads` macro did not work as intended and caused the
generated launch bounds to end up with an incorrect second parameter (it was set to
"thread_num_registers" rather than to the result of the minimum-blocks-per-multiprocessor
calculation). I'm not entirely sure why, since the macro definition looked sound to me, so I
decided to simply go with two separate macros instead, which both simplifies things and solves
the problem.

Also slightly changed how state is captured with the `ccl_gpu_kernel_lambda` macro, to avoid
a compiler warning ("expression has no effect") that otherwise occurred.
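In practice the macro no longer ends in a trailing `ccl_gpu_kernel_lambda_pass` expression, so
call sites now assign the captured state explicitly afterwards, e.g. (taken from the
`integrator_queued_paths_array` hunk below):

  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;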

Maniphest Tasks: T92985

Differential Revision: https://developer.blender.org/D13175
Author: Patrick Mours, 2021-11-10 14:37:15 +01:00
Parent: a6e4cb092e
Commit: f565620435
Notes (blender-bot, 2023-02-14): Referenced by issue #92985, OptiX denoise error on Blender 3.1 and nvidia 495.44: Misaligned address in cuMemFree_v2
5 changed files with 62 additions and 81 deletions


@@ -379,7 +379,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_KERNEL_HEADERS}
${SRC_KERNEL_DEVICE_GPU_HEADERS}
${SRC_KERNEL_DEVICE_CUDA_HEADERS}
${SRC_KERNEL_DEVICE_METAL_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_cubins)


@@ -92,25 +92,19 @@
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
#define ccl_gpu_kernel_threads(block_num_threads) \
extern "C" __global__ void __launch_bounds__(block_num_threads)
#define ccl_gpu_kernel_threads_registers(block_num_threads, thread_num_registers) \
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
/* allow ccl_gpu_kernel to accept 1 or 2 parameters */
#define SELECT_MACRO(_1, _2, NAME, ...) NAME
#define ccl_gpu_kernel(...) \
SELECT_MACRO(__VA_ARGS__, ccl_gpu_kernel_threads_registers, ccl_gpu_kernel_threads)(__VA_ARGS__)
#define ccl_gpu_kernel_threads(block_num_threads) \
extern "C" __global__ void __launch_bounds__(block_num_threads)
#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
#define ccl_gpu_kernel_call(x) x
/* define a function object where "func" is the lambda body, and additional parameters are used to
/* Define a function object where "func" is the lambda body, and additional parameters are used to
* specify captured state */
#define ccl_gpu_kernel_lambda(func, ...) \
struct KernelLambda { \
@@ -119,8 +113,7 @@
{ \
return (func); \
} \
} ccl_gpu_kernel_lambda_pass; \
ccl_gpu_kernel_lambda_pass
} ccl_gpu_kernel_lambda_pass
/* sanity checks */


@@ -56,8 +56,7 @@
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_reset,
int num_states)
ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
const int state = ccl_gpu_global_id_x();
@@ -265,7 +264,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_queued_paths_array,
int num_states,
ccl_global int *indices,
@@ -273,14 +272,14 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
int kernel_index)
.kernel_index = kernel_index;
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
int num_states,
ccl_global int *indices,
@@ -288,25 +287,26 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
int kernel_index)
.kernel_index = kernel_index;
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_active_paths_array,
int num_states,
ccl_global int *indices,
ccl_global int *num_indices)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_terminated_paths_array,
int num_states,
ccl_global int *indices,
@@ -314,11 +314,12 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int indices_offset)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
int num_states,
ccl_global int *indices,
@@ -326,11 +327,12 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int indices_offset)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sorted_paths_array,
int num_states,
int num_states_limit,
@@ -343,37 +345,37 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
INTEGRATOR_STATE(state, path, shader_sort_key) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
int kernel_index)
.kernel_index = kernel_index;
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
const uint state_index = ccl_gpu_global_id_x();
gpu_parallel_sorted_index_array(
state_index,
num_states,
num_states_limit,
indices,
gpu_parallel_sorted_index_array(state_index,
num_states,
num_states_limit,
indices,
num_indices,
key_counter,
key_prefix_sum,
ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,
ccl_global int *indices,
ccl_global int *num_indices,
int num_active_paths)
ccl_global int *num_indices,
int num_active_paths)
{
ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
int num_active_paths)
.num_active_paths = num_active_paths;
ccl_gpu_kernel_lambda((state >= num_active_paths) &&
(INTEGRATOR_STATE(state, path, queued_kernel) != 0),
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_states,
ccl_global const int *active_terminated_states,
const int active_states_offset,
@@ -390,22 +392,23 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
}
ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
int num_states,
ccl_global int *indices,
ccl_global int *num_indices,
int num_active_paths)
ccl_global int *num_indices,
int num_active_paths)
{
ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
int num_active_paths)
.num_active_paths = num_active_paths;
ccl_gpu_kernel_lambda((state >= num_active_paths) &&
(INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_shadow_states,
ccl_global const int *active_terminated_states,
const int active_states_offset,
@@ -422,7 +425,7 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
}
ccl_gpu_kernel(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
@@ -524,7 +527,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
ccl_gpu_kernel_signature(film_convert_##variant, \
const KernelFilmConvert kfilm_convert, \
ccl_global uchar4 *rgba, \
ccl_global float *pixels, \
ccl_global float *render_buffer, \
int num_pixels, \
int width, \
@@ -544,20 +547,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
y * stride * kfilm_convert.pass_stride; \
\
float pixel[4]; \
ccl_global float *pixel = pixels + \
(render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
\
film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
\
film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
if (input_channel_count == 1) { \
pixel[1] = pixel[2] = pixel[0]; \
} \
if (input_channel_count <= 3) { \
pixel[3] = 1.0f; \
} \
\
ccl_global float *out = ((ccl_global float *)rgba) + rgba_offset + y * rgba_stride + x; \
*(ccl_global float4 *)out = make_float4(pixel[0], pixel[1], pixel[2], pixel[3]); \
} \
\
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
@@ -585,8 +578,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
\
float pixel[4]; \
film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
\
film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
if (input_channel_count == 1) { \
pixel[1] = pixel[2] = pixel[0]; \
@@ -595,7 +586,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
pixel[3] = 1.0f; \
} \
\
ccl_global half4 *out = ((ccl_global half4 *)rgba) + (rgba_offset + y * rgba_stride + x); \
film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; \
*out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
}
@@ -617,6 +610,8 @@ KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
KERNEL_FILM_CONVERT_VARIANT(combined, 4)
KERNEL_FILM_CONVERT_VARIANT(float4, 4)
#undef KERNEL_FILM_CONVERT_VARIANT
/* --------------------------------------------------------------------
* Shader evaluation.
*/


@@ -35,25 +35,19 @@
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
#define ccl_gpu_kernel_threads(block_num_threads) \
extern "C" __global__ void __launch_bounds__(block_num_threads)
#define ccl_gpu_kernel_threads_registers(block_num_threads, thread_num_registers) \
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
/* allow ccl_gpu_kernel to accept 1 or 2 parameters */
#define SELECT_MACRO(_1, _2, NAME, ...) NAME
#define ccl_gpu_kernel(...) \
SELECT_MACRO(__VA_ARGS__, ccl_gpu_kernel_threads_registers, ccl_gpu_kernel_threads)(__VA_ARGS__)
#define ccl_gpu_kernel_threads(block_num_threads) \
extern "C" __global__ void __launch_bounds__(block_num_threads)
#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
#define ccl_gpu_kernel_call(x) x
/* define a function object where "func" is the lambda body, and additional parameters are used to
/* Define a function object where "func" is the lambda body, and additional parameters are used to
* specify captured state */
#define ccl_gpu_kernel_lambda(func, ...) \
struct KernelLambda { \
@@ -62,8 +56,7 @@
{ \
return (func); \
} \
} ccl_gpu_kernel_lambda_pass; \
ccl_gpu_kernel_lambda_pass
} ccl_gpu_kernel_lambda_pass
/* sanity checks */


@@ -70,7 +70,8 @@ using namespace metal;
/* kernel.h adapters */
#define ccl_gpu_kernel(...)
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)
/* convert a comma-separated list into a semicolon-separated list (so that we can generate a struct based on kernel entrypoint parameters) */
#define FN0()
@@ -143,7 +144,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
ccl_private MetalKernelContext &context; \
__VA_ARGS__; \
int operator()(const int state) const { return (func); } \
}ccl_gpu_kernel_lambda_pass(context); ccl_gpu_kernel_lambda_pass
} ccl_gpu_kernel_lambda_pass(context)
// clang-format on
@@ -247,4 +248,4 @@ constant constexpr array<sampler, SamplerCount> metal_samplers = {
sampler(address::repeat, filter::linear),
sampler(address::clamp_to_edge, filter::linear),
sampler(address::clamp_to_zero, filter::linear),
};
};