Cycles: generalize shader sorting / locality heuristic to all GPU devices

This was added for Metal, but also gives good results with CUDA and OptiX.
Also enable it for future Apple GPUs instead of only M1 and M2: since it has
been shown to help across multiple GPUs, the better bet seems to be to enable
it rather than to disable it.

Also move some of the logic out of the Metal device code, and always enable
the code in the kernel, since other devices do not compile kernels dynamically.
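
As an illustration of what the heuristic computes (not Cycles code; all values
below are assumed), the host derives a partition count and a divisor from the
number of in-flight paths, and the kernel then sorts paths first by truncated
state index for locality and then by shader key for coherence:

    #include <cstdio>

    int main()
    {
      /* Assumed illustrative values, not taken from any real scene or device. */
      const int max_num_paths = 1 << 20;    /* concurrent integrator states */
      const int partition_elements = 65536; /* elements per sort partition */
      const int max_shaders = 64;           /* shaders in the scene */

      /* Host side: partition count and the divisor that is copied to the kernel. */
      const int num_sort_partitions = max_num_paths / partition_elements; /* 16 */
      const int sort_partition_divisor =
          (max_num_paths + num_sort_partitions - 1) / num_sort_partitions; /* 65536 */

      /* Kernel side: sort first by truncated state index (locality), then by
       * shader key (coherence), in the spirit of INTEGRATOR_SORT_KEY. */
      const int state = 200000; /* example path state index */
      const int key = 7;        /* example shader/material key */
      const int sort_key = key + max_shaders * (state / sort_partition_divisor);

      printf("partitions=%d divisor=%d sort_key=%d\n",
             num_sort_partitions, sort_partition_divisor, sort_key);
      return 0;
    }

With these assumed numbers, states 0..65535 land in partition 0, states
65536..131071 in partition 1, and so on, and paths are grouped by shader
within each partition.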

Time per sample with OptiX + RTX A6000:
                                         new                  old
barbershop_interior                      0.0730s              0.0727s
bmw27                                    0.0047s              0.0053s
classroom                                0.0428s              0.0464s
fishy_cat                                0.0102s              0.0108s
junkshop                                 0.0366s              0.0395s
koro                                     0.0567s              0.0578s
monster                                  0.0206s              0.0223s
pabellon                                 0.0158s              0.0174s
sponza                                   0.0088s              0.0100s
spring                                   0.1267s              0.1280s
victor                                   0.0524s              0.0531s
wdas_cloud                               0.0817s              0.0816s

Ref D15331, T87836
Brecht Van Lommel 2022-07-14 16:42:43 +02:00
parent da4ef05e4d
commit 523bbf7065
7 changed files with 45 additions and 52 deletions

@@ -229,10 +229,6 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
build_options += " -D__KERNEL_SORT_PARTITIONING__ ";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {

@@ -24,7 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
virtual int num_sort_partitions(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;

@@ -293,21 +293,9 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
return result;
}
int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const
int MetalDeviceQueue::num_sort_partition_elements() const
{
/* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
* more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
*/
if (metal_device_->launch_params.data.max_shaders >= 300) {
return 1;
}
const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements(
metal_device_->mtlDevice);
if (optimal_partition_elements) {
return num_concurrent_states(state_size) / optimal_partition_elements;
}
return 1;
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
void MetalDeviceQueue::init_execution()

@@ -82,10 +82,7 @@ int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
* sorting each partition by material. Partitioning into chunks of 65536 elements results in an
* overall render time speedup of up to 15%. */
if (get_device_vendor(device) == METAL_GPU_APPLE) {
AppleGPUArchitecture arch = get_apple_gpu_architecture(device);
if (arch == APPLE_M1 || arch == APPLE_M2) {
return 65536;
}
return 65536;
}
return 0;
}

@@ -105,12 +105,11 @@ class DeviceQueue {
* value. */
virtual int num_concurrent_busy_states() const = 0;
/* Number of partitions within which active indices are sorted by material ID.
* Using more partitions lets us trade off material coherence for better integrator state fetch
* locality. */
virtual int num_sort_partitions(const size_t /*state_size*/) const
/* Number of elements in a partition of sorted shaders, that improves memory locality of
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */
virtual int num_sort_partition_elements() const
{
return 1;
return 65536;
}
/* Initialize execution of kernels on this queue.
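
As a sketch of the new virtual API contract only (ExampleQueue and
NoPartitioningQueue are made-up names, not Cycles classes): queues inherit the
65536-element default unless they override it, and returning 0 disables sort
partitioning, which is how the Metal queue behaves on non-Apple GPUs.

    #include <cstdio>

    /* Stand-in for DeviceQueue; the real class has many more virtual methods. */
    class ExampleQueue {
     public:
      virtual ~ExampleQueue() = default;

      /* Default element count per sort partition. */
      virtual int num_sort_partition_elements() const
      {
        return 65536;
      }
    };

    /* A hypothetical queue that disables partitioning by returning 0. */
    class NoPartitioningQueue : public ExampleQueue {
     public:
      int num_sort_partition_elements() const override
      {
        return 0;
      }
    };

    int main()
    {
      ExampleQueue default_queue;
      NoPartitioningQueue disabled_queue;
      printf("default=%d disabled=%d\n",
             default_queue.num_sort_partition_elements(),
             disabled_queue.num_sort_partition_elements());
      return 0;
    }

Moving the partition-count math out of the individual queues keeps each
backend's override down to a single element-count value.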

@@ -181,28 +181,45 @@ void PathTraceWorkGPU::alloc_integrator_queue()
void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Compute sort partitions, to balance between memory locality and coherence.
* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
* more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
*/
num_sort_partitions_ = 1;
if (device_scene_->data.max_shaders < 300) {
const int num_elements = queue_->num_sort_partition_elements();
if (num_elements) {
num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
}
}
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Allocate arrays for shader sorting. */
num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size());
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
@@ -238,10 +255,6 @@ void PathTraceWorkGPU::init_execution()
{
queue_->init_execution();
/* Setup sort partitioning divisor for better cache utilization. */
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Copy to device side struct in constant memory. */
device_->const_copy_to(
"integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
@@ -338,8 +351,12 @@ void PathTraceWorkGPU::enqueue_reset()
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
queue_->zero_to_device(integrator_queue_counter_);
queue_->zero_to_device(integrator_shader_sort_counter_);
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
}
/* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
* counter on the host side because `zero_to_device()` is not doing it. */
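
To put numbers on the sizing above (all values assumed for illustration): with
roughly a million in-flight paths and 65536-element partitions, a scene with
under 300 shaders gets 16 partitions, while a heavier scene falls back to a
single partition, and the sort counter buffers scale with max_shaders times
the partition count.

    #include <algorithm>
    #include <cstdio>

    int main()
    {
      /* Assumed illustrative values. */
      const int max_num_paths = 1 << 20;
      const int partition_elements = 65536;

      const int shader_counts[] = {100, 400};
      for (const int max_shaders : shader_counts) {
        /* Mirror of the guard in alloc_integrator_sorting: skip partitioning
         * when the scene has many shaders, since coherence then dominates. */
        int num_sort_partitions = 1;
        if (max_shaders < 300 && partition_elements > 0) {
          num_sort_partitions = std::max(max_num_paths / partition_elements, 1);
        }
        const int sort_buckets = max_shaders * num_sort_partitions;
        printf("max_shaders=%d partitions=%d sort_buckets=%d\n",
               max_shaders, num_sort_partitions, sort_buckets);
      }
      return 0;
    }

The raytrace and MNEE counters follow the same sizing but are only allocated
and reset when the corresponding kernel features are enabled.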

@@ -99,13 +99,9 @@ ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
}
# ifdef __KERNEL_SORT_PARTITIONING__
/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
# define INTEGRATOR_SORT_KEY(key, state) \
(key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
# else
# define INTEGRATOR_SORT_KEY(key, state) (key)
# endif
# define INTEGRATOR_SORT_KEY(key, state) \
(key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
IntegratorState state,