Cycles: Use local atomics for faster shader sorting (enabled on Metal)

This patch adds two new kernels: SORT_BUCKET_PASS and SORT_WRITE_PASS. These replace PREFIX_SUM and SORTED_PATHS_ARRAY on supported devices (currently implemented only for Metal, but should be trivial to enable on the other backends). The new kernels exploit sort partitioning (see D15331) by sorting each partition separately using local atomics, which can give an overall render speedup of 2-3% depending on architecture. As before, we fall back to the original non-partitioned sorting when the shader count is "too high".
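
For context, here is a minimal serial C++ sketch of the two-pass partitioned counting sort that SORT_BUCKET_PASS and SORT_WRITE_PASS perform on the GPU (the function and variable names below are illustrative, not part of the patch): the bucket pass counts shader keys into per-partition buckets and converts them into per-partition offsets, and the write pass scatters the queued state indices using those offsets so states end up grouped by shader within each partition.

/* Serial reference sketch of the two kernels. On the device, one threadgroup of
 * GPU_PARALLEL_SORT_BLOCK_SIZE (1024) threads processes each partition, the bucket
 * counters live in threadgroup memory, and the plain increments below become
 * threadgroup atomics; the only intermediate kept in global memory is the small
 * partition_key_offsets array. */

#include <algorithm>
#include <cstdint>
#include <vector>

/* Pass 1: count keys per partition, then turn the counts into exclusive offsets.
 * Each partition gets (max_shaders + 1) slots; the extra slot holds the number of
 * queued states in that partition. */
static void sort_bucket_pass(const std::vector<uint32_t> &shader_key,
                             const std::vector<bool> &queued,
                             const int partition_size,
                             const int max_shaders,
                             std::vector<int> &partition_key_offsets)
{
  const int num_states = int(shader_key.size());
  const int num_partitions = (num_states + partition_size - 1) / partition_size;
  partition_key_offsets.assign(size_t(num_partitions) * (max_shaders + 1), 0);

  for (int p = 0; p < num_partitions; p++) {
    int *offsets = &partition_key_offsets[size_t(p) * (max_shaders + 1)];
    const int begin = p * partition_size;
    const int end = std::min(num_states, begin + partition_size);

    std::vector<int> buckets(max_shaders, 0); /* threadgroup memory on the GPU */
    for (int i = begin; i < end; i++) {
      if (queued[i]) {
        buckets[shader_key[i] % max_shaders]++; /* atomic_fetch_and_add on the GPU */
      }
    }
    int offset = 0;
    for (int key = 0; key < max_shaders; key++) {
      offsets[key] = offset;
      offset += buckets[key];
    }
    offsets[max_shaders] = offset; /* queued-state count for this partition */
  }
}

/* Pass 2: scatter state indices into the output array, partition by partition. */
static void sort_write_pass(const std::vector<uint32_t> &shader_key,
                            const std::vector<bool> &queued,
                            const int partition_size,
                            const int max_shaders,
                            const std::vector<int> &partition_key_offsets,
                            std::vector<int> &indices)
{
  const int num_states = int(shader_key.size());
  const int num_partitions = (num_states + partition_size - 1) / partition_size;
  indices.assign(size_t(num_states), -1);

  int partition_offset = 0; /* the GPU kernel re-derives this per threadgroup */
  for (int p = 0; p < num_partitions; p++) {
    const int *offsets = &partition_key_offsets[size_t(p) * (max_shaders + 1)];
    std::vector<int> cursor(max_shaders); /* threadgroup memory on the GPU */
    for (int key = 0; key < max_shaders; key++) {
      cursor[key] = partition_offset + offsets[key];
    }
    const int begin = p * partition_size;
    const int end = std::min(num_states, begin + partition_size);
    for (int i = begin; i < end; i++) {
      if (queued[i]) {
        indices[cursor[shader_key[i] % max_shaders]++] = i; /* atomic on the GPU */
      }
    }
    partition_offset += offsets[max_shaders];
  }
}

The actual kernels additionally filter states on the queued kernel index and clamp writes against num_paths_limit, but the data flow (local bucket counts, per-partition offsets, then a scatter) is the same.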

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16909
Michael Jones authored on 2023-02-06 11:16:02 +00:00, committed by Michael Jones
commit 654e1e901b (parent 46c9f7702a)
19 changed files with 359 additions and 48 deletions

@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:

@ -105,6 +105,8 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool use_local_atomic_sort() const;
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);

@ -271,6 +271,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@ -278,6 +283,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {

@ -87,6 +87,9 @@ struct ShaderCache {
break;
}
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();

@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;
virtual void init_execution() override;

@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@ -553,13 +558,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
default:
break;
}
if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);

@ -112,6 +112,13 @@ class DeviceQueue {
return 65536;
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
* INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.

@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
}
@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
compute_sorted_queued_paths(kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;
int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
return;
}
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);
@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
&d_prefix_sum,
&d_queued_kernel);
queue_->enqueue(kernel, work_size, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
}
}

@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();
@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;

@ -401,6 +401,71 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,

@ -178,7 +178,7 @@ __device__
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
(threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)
# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \

@ -20,6 +20,116 @@ CCL_NAMESPACE_BEGIN
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *buckets,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Zero the bucket sizes. */
if (local_id < max_shaders) {
atomic_store_local(&buckets[local_id], 0);
}
ccl_gpu_syncthreads();
/* Determine bucket sizes within the partitions. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
}
}
ccl_gpu_syncthreads();
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
if (local_id == 0) {
int offset = 0;
for (int i = 0; i < max_shaders; i++) {
partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
offset = offset + atomic_load_local(&buckets[i]);
}
/* Store the number of active states in this partition. */
partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
}
}
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
const int num_states_limit,
ccl_global int *indices,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *local_offset,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Calculate each partition's global offset from the prefix sum of the active state counts per
* partition. */
if (local_id < max_shaders) {
int partition_offset = 0;
for (int i = 0; i < uint(grid_id); i++) {
int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
partition_offset += partition_key_count;
}
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
ccl_gpu_syncthreads();
/* Write the sorted active indices. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
}
}
}
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
const uint num_states,

@ -105,10 +105,11 @@ struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -117,22 +118,24 @@ struct kernel_gpu_##name \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
const ushort metal_grid_id [[threadgroup_position_in_grid]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \

@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
/* Partition/key offsets used when writing sorted active indices. */
ccl_global int *sort_partition_key_offsets;
/* Divisor used to partition active indices by locality when sorting by material. */
uint sort_partition_divisor;
} IntegratorStateGPU;

@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

@ -1509,6 +1509,8 @@ typedef enum DeviceKernel : int {
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,

@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
return new_value.float_value;
}
# define atomic_fetch_and_add_uint32(p, x) \
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_sub_uint32(p, x) \
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_inc_uint32(p) \
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_dec_uint32(p) \
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_or_uint32(p, x) \
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
{
return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
{
return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
const float old_val,

@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
{
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
use_local_atomic_sort = (atoi(str) != 0);
}
DebugFlags::OptiX::OptiX()

@ -97,6 +97,9 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not. */
bool adaptive_compile = false;
/* Whether local atomic sorting is enabled or not. */
bool use_local_atomic_sort = true;
};
/* Get instance of debug flags registry. */