Cycles: Occupancy tuning for new higher-end M2 machines

This patch adds occupancy tuning for the newly announced high-end M2 machines, giving a 10-15% render speedup over a build without this tuning.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D17037
2023-01-19 17:55:53 +00:00 · 2023-01-19 17:55:53 +00:00 · 08b3426df9
parent e7af2503c5
commit 08b3426df9
4 changed files with 16 additions and 2 deletions
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -49,6 +49,18 @@ struct ShaderCache {
    if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
      switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
        default:
+        case APPLE_M2_BIG:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768};
+          break;
        case APPLE_M2:
          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -278,7 +278,8 @@ int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
  if (metal_device_->device_vendor == METAL_GPU_APPLE) {
    result *= 4;

-    if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+    /* Increasing the state count doesn't notably benefit M1-family systems.  */
+    if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
      size_t system_ram = system_physical_ram();
      size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
      size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -29,6 +29,7 @@ enum AppleGPUArchitecture {
  APPLE_UNKNOWN,
  APPLE_M1,
  APPLE_M2,
+  APPLE_M2_BIG,
 };

 /* Contains static Metal helper functions. */
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -52,7 +52,7 @@ AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device)
    return APPLE_M1;
  }
  else if (strstr(device_name, "M2")) {
-    return APPLE_M2;
+    return get_apple_gpu_core_count(device) <= 10 ? APPLE_M2 : APPLE_M2_BIG;
  }
  return APPLE_UNKNOWN;
 }