Cycles: Apple GPU threadgroup tuning

This patch tunes the maximum threads-per-threadgroup and threads-per-block for faster renders on Apple GPUs. The appropriate tuning is selected based on the GPU architecture (M1 or M2). We see a benchmark uplift of around 5-10% on M1 family chips; a similar uplift is expected on M2 with upcoming OS changes. (Ref T101931)

Reviewed By: brecht

Maniphest Tasks: T101931

Differential Revision: https://developer.blender.org/D16299
Michael Jones (Apple) 2022-11-04 15:59:55 +00:00
parent 671c3e1fa4
commit 74140d41b1
Notes: blender-bot 2023-04-14 09:18:04 +02:00
Referenced by issue #101931, Cycles Apple Silicon optimisations
1 changed file with 49 additions and 12 deletions
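For orientation before the diff: each tuned pair is a threadgroup-size cap used when compiling a kernel's pipeline, plus a block size used when dispatching it. Below is a minimal, illustrative sketch of the dispatch side only; queue, pipeline, tuned_threads_per_block and total_work_size are hypothetical stand-ins, not code from this patch.

/* Illustrative only: how a tuned block size typically drives a Metal
 * dispatch. All variable names here are hypothetical stand-ins. */
id<MTLCommandBuffer> commandBuffer = [queue commandBuffer];
id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
[encoder setComputePipelineState:pipeline];

/* Clamp the tuned value to what the compiled pipeline supports. */
int threads_per_block = std::min(tuned_threads_per_block,
                                 (int)pipeline.maxTotalThreadsPerThreadgroup);

/* Round the threadgroup count up so every work item is covered. */
int num_groups = (total_work_size + threads_per_block - 1) / threads_per_block;
[encoder dispatchThreadgroups:MTLSizeMake(num_groups, 1, 1)
        threadsPerThreadgroup:MTLSizeMake(threads_per_block, 1, 1)];
[encoder endEncoding];
[commandBuffer commit];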

@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
 struct ShaderCache {
   ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
   {
+    /* Initialize occupancy tuning LUT. */
+    if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+      switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+        default:
+        case APPLE_M2:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+          break;
+        case APPLE_M1:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+          break;
+      }
+    }
   }
   ~ShaderCache();
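A note on the hunk above: default: precedes case APPLE_M2: and falls through into it, so an architecture without a dedicated table (e.g. a future GPU) inherits the M2 tuning. Each {a, b} initializer fills the OccupancyTuningParameters struct declared in the next hunk in field order. An illustrative reading of one entry, with assert values taken from the M2 table above:

/* Sketch only. Aggregate initialization follows field order, so {832, 32}
 * sets threads_per_threadgroup = 832 and num_threads_per_block = 32. */
#include <cassert>

const OccupancyTuningParameters &tuning =
    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA];
assert(tuning.threads_per_threadgroup == 832); /* M2 row above */
assert(tuning.num_threads_per_block == 32);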
@@ -73,6 +103,11 @@ struct ShaderCache {
     std::function<void(MetalKernelPipeline *)> completionHandler;
   };
 
+  struct OccupancyTuningParameters {
+    int threads_per_threadgroup = 0;
+    int num_threads_per_block = 0;
+  } occupancy_tuning[DEVICE_KERNEL_NUM];
+
   std::mutex cache_mutex;
   PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
   request.pipeline->device_kernel = device_kernel;
   request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
 
+  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+    request.pipeline->threads_per_threadgroup =
+        occupancy_tuning[device_kernel].threads_per_threadgroup;
+    request.pipeline->num_threads_per_block =
+        occupancy_tuning[device_kernel].num_threads_per_block;
+  }
+
   /* metalrt options */
   request.pipeline->use_metalrt = device->use_metalrt;
   request.pipeline->metalrt_hair = device->use_metalrt &&
@@ -374,13 +416,6 @@ void MetalKernelPipeline::compile()
   const std::string function_name = std::string("cycles_metal_") +
                                     device_kernel_as_string(device_kernel);
 
-  int threads_per_threadgroup = this->threads_per_threadgroup;
-  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
-      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
-    /* Always use 512 for the sorting kernels */
-    threads_per_threadgroup = 512;
-  }
-
   NSString *entryPoint = [@(function_name.c_str()) copy];
 
   NSError *error = NULL;
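With the hard-coded 512-thread override for the sorting kernels removed, the per-kernel cap now flows from the tuning LUT into pipeline creation unchanged. A hedged sketch of how such a cap typically reaches the Metal compiler follows; variable names are illustrative, not the patch's exact code.

/* Illustrative: requesting a threadgroup size cap at pipeline-build time.
 * A lower maxTotalThreadsPerThreadgroup constrains the compiler's register
 * allocation, which is what makes the occupancy tuning possible. */
MTLComputePipelineDescriptor *desc = [[MTLComputePipelineDescriptor alloc] init];
desc.computeFunction = function; /* hypothetical id<MTLFunction> */
desc.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;

NSError *pso_error = nil;
id<MTLComputePipelineState> pso =
    [mtlDevice newComputePipelineStateWithDescriptor:desc
                                             options:MTLPipelineOptionNone
                                          reflection:nil
                                               error:&pso_error];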
@@ -644,12 +679,14 @@ void MetalKernelPipeline::compile()
     return;
   }
 
-  int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
-                                         computePipelineState.threadExecutionWidth);
-  num_threads_per_block = std::max(num_threads_per_block,
-                                   (int)computePipelineState.threadExecutionWidth);
+  if (!num_threads_per_block) {
+    num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                       computePipelineState.threadExecutionWidth);
+    num_threads_per_block = std::max(num_threads_per_block,
+                                     (int)computePipelineState.threadExecutionWidth);
+  }
 
   this->pipeline = computePipelineState;
   this->num_threads_per_block = num_threads_per_block;
 
   if (@available(macOS 11.0, *)) {
     if (creating_new_archive || recreate_archive) {
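One last note on the final hunk: the derived block size is now only a fallback, used when load_kernel did not set a tuned num_threads_per_block. A sketch of the fallback arithmetic, assuming round_down is the usual round-to-multiple helper:

/* Assumed semantics of the round_down helper used above. */
inline int round_down(int x, int multiple)
{
  return (x / multiple) * multiple;
}

/* Example: maxTotalThreadsPerThreadgroup = 896, threadExecutionWidth = 32
 * gives round_down(896, 32) = 896 (already a multiple of 32);
 * a max of 1000 would give 992 = 31 * 32. The std::max guard then
 * ensures at least one full SIMD width even if rounding yields 0. */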