Cycles: Additional Metal kernel specialisation exposed through UI

This patch adds a new "Kernel Optimization" dropdown menu to the Cycles preferences to control Metal kernel specialisation. It currently defaults to "Full" optimisation, on the assumption that the changes proposed in D16371 will address usability concerns around app responsiveness and shader cache housekeeping.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16514
Referenced by issue #103635: Metal backend shows everything in pink.
Referenced by issue #103424: Blender 3.5.0 crashes on Cycles when importing images, macOS Mojave 10.14.6 Intel.
2023-01-04 16:01:24 +00:00 · 2023-01-04 16:01:24 +00:00 · a7cc6e015c · 2023-02-14 10:54:29 +01:00
parent 496d736adc
commit a7cc6e015c
10 changed files with 173 additions and 86 deletions
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@ -1543,6 +1543,17 @@ class CyclesPreferences(bpy.types.AddonPreferences):
        default=False,
    )

+    # User preference controlling how aggressively Metal kernels are specialized.
+    # NOTE(review): the item order (OFF, INTERSECT, FULL) appears to mirror the C++
+    # `KernelOptimizationLevel` enum (0, 1, 2) that blender/device.cpp reads through
+    # `get_enum()` -- keep both in sync; TODO confirm `get_enum()` maps by item index.
+    kernel_optimization_level: EnumProperty(
+        name="Kernel Optimization",
+        description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. If optimized kernels are not available, rendering will proceed using generic kernels until the optimized set is available in the cache. This can result in additional CPU usage for a brief time (tens of seconds).",
+        default='FULL',
+        items=(
+            ('OFF', "Off", "Disable kernel optimization. Slowest rendering, no extra background CPU usage"),
+            ('INTERSECT', "Intersection only", "Optimize only intersection kernels. Faster rendering, negligible extra background CPU usage"),
+            ('FULL', "Full", "Optimize all kernels. Fastest rendering, may result in extra background CPU usage"),
+        ),
+    )
+
    def find_existing_device_entry(self, device):
        for device_entry in self.devices:
            if device_entry.id == device[2] and device_entry.type == device[1]:
@ -1711,10 +1722,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
        if compute_device_type == 'METAL':
            import platform
            # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
+            # Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
            if platform.machine() == 'arm64':
-                row = layout.row()
-                row.use_property_split = True
-                row.prop(self, "use_metalrt")
+                col = layout.column()
+                col.use_property_split = True
+                col.prop(self, "kernel_optimization_level")
+                col.prop(self, "use_metalrt")

    def draw(self, context):
        self.draw_impl(self.layout, context)
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@ -30,7 +30,7 @@ int blender_device_threads(BL::Scene &b_scene)
    return 0;
 }

-DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background)
+DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background, bool preview)
 {
  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");

@ -113,6 +113,18 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
    device.use_metalrt = true;
  }

+  if (preview) {
+    /* Disable specialization for preview renders. */
+    device.kernel_optimization_level = KERNEL_OPTIMIZATION_LEVEL_OFF;
+  }
+  else {
+    device.kernel_optimization_level = (KernelOptimizationLevel)get_enum(
+        cpreferences,
+        "kernel_optimization_level",
+        KERNEL_OPTIMIZATION_NUM_LEVELS,
+        KERNEL_OPTIMIZATION_LEVEL_FULL);
+  }
+
  return device;
 }

--- a/intern/cycles/blender/device.h
+++ b/intern/cycles/blender/device.h
@ -19,7 +19,8 @@ int blender_device_threads(BL::Scene &b_scene);
 /* Convert Blender settings to device specification. */
+/* When `preview` is true the returned DeviceInfo has kernel optimization disabled instead of
+ * using the level stored in the user preferences. */
 DeviceInfo blender_device_info(BL::Preferences &b_preferences,
                               BL::Scene &b_scene,
-                               bool background);
+                               bool background,
+                               bool preview);

 CCL_NAMESPACE_END

--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@ -754,7 +754,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
  RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr);
  BL::Scene b_scene(sceneptr);

-  DeviceInfo device = blender_device_info(b_preferences, b_scene, true);
+  DeviceInfo device = blender_device_info(b_preferences, b_scene, true, true);

  /* Get denoising parameters from view layer. */
  PointerRNA viewlayerptr;
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@ -866,7 +866,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,

  /* Device */
  params.threads = blender_device_threads(b_scene);
-  params.device = blender_device_info(b_preferences, b_scene, params.background);
+  params.device = blender_device_info(b_preferences, b_scene, params.background, b_engine.is_preview());

  /* samples */
  int samples = get_int(cscene, "samples");
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@ -57,6 +57,14 @@ enum DeviceTypeMask {

 #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)

+/* Requested degree of kernel specialization. The Metal device translates these into its own
+ * pipeline types; the values are also used as indices when reading the user preference. */
+enum KernelOptimizationLevel {
+  /* No specialization: the Metal device uses generic pipelines. */
+  KERNEL_OPTIMIZATION_LEVEL_OFF = 0,
+  /* The Metal device selects its intersection-specialized pipeline type. */
+  KERNEL_OPTIMIZATION_LEVEL_INTERSECT = 1,
+  /* The Metal device selects its shade-specialized pipeline type. */
+  KERNEL_OPTIMIZATION_LEVEL_FULL = 2,
+
+  /* Number of levels above; not a valid level itself. */
+  KERNEL_OPTIMIZATION_NUM_LEVELS
+};
+
 class DeviceInfo {
 public:
  DeviceType type;
@ -66,13 +74,15 @@ class DeviceInfo {
  bool display_device;        /* GPU is used as a display device. */
  bool has_nanovdb;           /* Support NanoVDB volumes. */
  bool has_light_tree;        /* Support light tree. */
-  bool has_osl;               /* Support Open Shading Language. */
-  bool has_guiding;           /* Support path guiding. */
-  bool has_profiling;         /* Supports runtime collection of profiling info. */
-  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
-  bool has_gpu_queue;         /* Device supports GPU queue. */
-  bool use_metalrt;           /* Use MetalRT to accelerate ray queries (Metal only). */
-  DenoiserTypeMask denoisers; /* Supported denoiser types. */
+  bool has_osl;         /* Support Open Shading Language. */
+  bool has_guiding;     /* Support path guiding. */
+  bool has_profiling;   /* Supports runtime collection of profiling info. */
+  bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+  bool has_gpu_queue;   /* Device supports GPU queue. */
+  bool use_metalrt;     /* Use MetalRT to accelerate ray queries (Metal only). */
+  KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
+                                                        kernels (Metal only). */
+  DenoiserTypeMask denoisers;                        /* Supported denoiser types. */
  int cpu_threads;
  vector<DeviceInfo> multi_devices;
  string error_msg;
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@ -110,10 +110,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    case METAL_GPU_APPLE: {
      max_threads_per_threadgroup = 512;
      use_metalrt = info.use_metalrt;
-
-      /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
-       * quickly. */
-      kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
      break;
    }
  }
@ -126,6 +122,22 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    capture_enabled = true;
  }

+  if (device_vendor == METAL_GPU_APPLE) {
+    /* Set kernel_specialization_level based on user prefs. */
+    switch (info.kernel_optimization_level) {
+      case KERNEL_OPTIMIZATION_LEVEL_OFF:
+        kernel_specialization_level = PSO_GENERIC;
+        break;
+      default:
+      case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
+        kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
+        break;
+      case KERNEL_OPTIMIZATION_LEVEL_FULL:
+        kernel_specialization_level = PSO_SPECIALIZED_SHADE;
+        break;
+    }
+  }
+
  if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
    kernel_specialization_level = (MetalPipelineType)atoi(envstr);
  }
@ -444,7 +456,7 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
                    source);
  }

-  const double starttime = time_dt();
+  double starttime = time_dt();

  NSError *error = NULL;
  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
@ -457,6 +469,12 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)

  [options release];

+  bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") || MetalDeviceKernels::is_benchmark_warmup());
+  if (blocking_pso_build) {
+    MetalDeviceKernels::wait_for_all();
+    starttime = 0.0;
+  }
+
  /* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
   * exists). */
  {
@ -464,6 +482,8 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
    if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
      if (mtlLibrary) {
        instance->mtlLibrary[pso_type] = mtlLibrary;
+
+        starttime = time_dt();
        MetalDeviceKernels::load(instance, pso_type);
      }
      else {
@ -472,6 +492,14 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
      }
    }
  }
+
+  if (starttime && blocking_pso_build) {
+    MetalDeviceKernels::wait_for_all();
+
+    metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+                 time_dt() - starttime,
+                 kernel_type_as_string(pso_type));
+  }
 }

 void MetalDevice::load_texture_info()
@ -832,10 +860,8 @@ void MetalDevice::optimize_for_scene(Scene *scene)
  }

  /* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
-  for (int i = 0; i < *_NSGetArgc(); i++) {
-    if (!strcmp((*_NSGetArgv())[i], "--warm-up")) {
-      specialize_in_background = false;
-    }
+  if (MetalDeviceKernels::is_benchmark_warmup()) {
+    specialize_in_background = false;
  }

  if (specialize_in_background) {
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@ -101,6 +101,8 @@ int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_typ
 bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
 bool load(MetalDevice *device, MetalPipelineType pso_type);
 const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
+/* Call wait_for_all() on every shader cache registered so far. */
+void wait_for_all();
+/* Return true when the process was launched with a "--warm-up" argument. */
+bool is_benchmark_warmup();

 } /* namespace MetalDeviceKernels */

--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@ -116,19 +116,29 @@ struct ShaderCache {
 };

 bool ShaderCache::running = true;
-std::mutex g_shaderCacheMutex;
-std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
+
+const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
+using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
+int g_shaderCacheCount = 0;
+DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];

 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
 {
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  auto it = g_shaderCache.find(mtlDevice);
-  if (it != g_shaderCache.end()) {
-    return it->second.get();
+  /* Return the ShaderCache registered for `mtlDevice`, creating and registering it on first use.
+   *
+   * Fast path: scan the already published slots without taking the lock. The acquire load pairs
+   * with the release store below, so every slot below `published` is fully constructed. */
+  const int published = __atomic_load_n(&g_shaderCacheCount, __ATOMIC_ACQUIRE);
+  for (int i = 0; i < published; i++) {
+    if (g_shaderCache[i].first == mtlDevice) {
+      return g_shaderCache[i].second.get();
+    }
  }

-  g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
-  return g_shaderCache[mtlDevice].get();
+  /* Slow path: registration is serialized. */
+  static thread_mutex g_shaderCacheCountMutex;
+  thread_scoped_lock lock(g_shaderCacheCountMutex);
+
+  /* Another thread may have registered this device while we were waiting for the lock; look
+   * again so that a device never ends up with two caches. */
+  const int count = g_shaderCacheCount;
+  for (int i = 0; i < count; i++) {
+    if (g_shaderCache[i].first == mtlDevice) {
+      return g_shaderCache[i].second.get();
+    }
+  }
+
+  assert(count < MAX_POSSIBLE_GPUS_ON_SYSTEM);
+  g_shaderCache[count].first = mtlDevice;
+  g_shaderCache[count].second = make_unique<ShaderCache>(mtlDevice);
+  ShaderCache *cache = g_shaderCache[count].second.get();
+
+  /* Publish the slot only after it is fully populated, so code that iterates up to
+   * g_shaderCacheCount never observes an entry whose cache pointer is still null. */
+  __atomic_store_n(&g_shaderCacheCount, count + 1, __ATOMIC_RELEASE);
+  return cache;
 }

 ShaderCache::~ShaderCache()
@ -145,7 +155,7 @@ ShaderCache::~ShaderCache()
    num_incomplete = int(incomplete_requests);
  }

-  if (num_incomplete) {
+  if (num_incomplete && !MetalDeviceKernels::is_benchmark_warmup()) {
    metal_printf("ShaderCache still busy (incomplete_requests = %d). Terminating...\n",
                 num_incomplete);
    std::terminate();
@ -332,12 +342,6 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,

 MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
 {
-  thread_scoped_lock lock(cache_mutex);
-  auto &collection = pipelines[kernel];
-  if (collection.empty()) {
-    return nullptr;
-  }
-
  /* metalrt options */
  bool use_metalrt = device->use_metalrt;
  bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR;
@ -349,34 +353,43 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
                               device->kernel_features & KERNEL_FEATURE_OBJECT_MOTION;

  MetalKernelPipeline *best_pipeline = nullptr;
-  for (auto &pipeline : collection) {
-    if (!pipeline->loaded) {
-      /* still loading - ignore */
-      continue;
-    }
+  while(!best_pipeline) {
+    {
+      thread_scoped_lock lock(cache_mutex);
+      for (auto &pipeline : pipelines[kernel]) {
+        if (!pipeline->loaded) {
+          /* still loading - ignore */
+          continue;
+        }

-    bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
-    bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
-    bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
-    bool pipeline_metalrt_motion = use_metalrt &&
-                                   pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;
+        bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
+        bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
+        bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
+        bool pipeline_metalrt_motion = use_metalrt &&
+                                      pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;

-    if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
-        pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
-        pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
-        pipeline_metalrt_motion != device_metalrt_motion) {
-      /* wrong combination of metalrt options */
-      continue;
-    }
+        if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
+            pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
+            pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
+            pipeline_metalrt_motion != device_metalrt_motion) {
+          /* wrong combination of metalrt options */
+          continue;
+        }

-    if (pipeline->pso_type != PSO_GENERIC) {
-      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
-          pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
-        best_pipeline = pipeline.get();
+        if (pipeline->pso_type != PSO_GENERIC) {
+          if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+              pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
+            best_pipeline = pipeline.get();
+          }
+        }
+        else if (!best_pipeline) {
+          best_pipeline = pipeline.get();
+        }
      }
    }
-    else if (!best_pipeline) {
-      best_pipeline = pipeline.get();
+
+    if (!best_pipeline) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
  }

@ -802,28 +815,26 @@ void MetalKernelPipeline::compile()

+/* Call load_kernel() on the device's shader cache for every DeviceKernel, using pipeline type
+ * `pso_type`. Always returns true. This function does not wait for the requests or report
+ * timing; the removed lines below used to do that under CYCLES_METAL_PROFILING. */
 bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
 {
-  const double starttime = time_dt();
  auto shader_cache = get_shader_cache(device->mtlDevice);
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
    shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
  }
-
-  if (getenv("CYCLES_METAL_PROFILING")) {
-    shader_cache->wait_for_all();
-    metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
-                 time_dt() - starttime,
-                 kernel_type_as_string(pso_type));
-  }
  return true;
 }

+/* Call wait_for_all() on every shader cache registered so far. */
+void MetalDeviceKernels::wait_for_all()
+{
+  for (int i = 0; i < g_shaderCacheCount; i++) {
+    /* A slot can be counted before its cache pointer has been assigned by the registering
+     * thread; skip such slots instead of dereferencing a null pointer. */
+    if (ShaderCache *cache = g_shaderCache[i].second.get()) {
+      cache->wait_for_all();
+    }
+  }
+}
+
 bool MetalDeviceKernels::any_specialization_happening_now()
 {
  /* Return true if any ShaderCaches have ongoing specialization requests (typically there will be
   * only 1). */
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  for (auto &it : g_shaderCache) {
-    if (it.second->incomplete_specialization_requests > 0) {
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    if (g_shaderCache[i].second->incomplete_specialization_requests > 0) {
      return true;
    }
  }
@ -854,6 +865,19 @@ const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevi
  return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
 }

+/* Return true when the process was launched with a "--warm-up" command line argument. */
+bool MetalDeviceKernels::is_benchmark_warmup()
+{
+  /* The argument array and its strings are autoreleased; drain them locally so that they are
+   * released promptly regardless of which thread calls this. */
+  @autoreleasepool {
+    for (NSString *arg in [[NSProcessInfo processInfo] arguments]) {
+      if ([arg isEqualToString:@"--warm-up"]) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 CCL_NAMESPACE_END

 #endif /* WITH_METAL*/
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@ -202,6 +202,9 @@ MetalDeviceQueue::~MetalDeviceQueue()
  assert(mtlCommandBuffer_ == nil);
  assert(command_buffers_submitted_ == command_buffers_completed_);

+  close_compute_encoder();
+  close_blit_encoder();
+
  if (@available(macos 10.14, *)) {
    [shared_event_listener_ release];
    [shared_event_ release];
@ -637,9 +640,7 @@ bool MetalDeviceQueue::synchronize()
    return false;
  }

-  if (mtlComputeEncoder_) {
-    close_compute_encoder();
-  }
+  close_compute_encoder();
  close_blit_encoder();

  if (mtlCommandBuffer_) {
@ -855,9 +856,7 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
  if (@available(macos 10.14, *)) {
    if (timing_shared_event_) {
      /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
-      if (mtlComputeEncoder_) {
-        close_compute_encoder();
-      }
+      close_compute_encoder();
    }

    if (mtlComputeEncoder_) {
@ -897,9 +896,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
    return mtlBlitEncoder_;
  }

-  if (mtlComputeEncoder_) {
-    close_compute_encoder();
-  }
+  close_compute_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
@ -913,12 +910,14 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()

 void MetalDeviceQueue::close_compute_encoder()
 {
-  [mtlComputeEncoder_ endEncoding];
-  mtlComputeEncoder_ = nil;
+  /* Does nothing when no compute encoder is open, so callers can invoke this unconditionally. */
+  if (mtlComputeEncoder_) {
+    [mtlComputeEncoder_ endEncoding];
+    mtlComputeEncoder_ = nil;

-  if (@available(macos 10.14, *)) {
-    if (timing_shared_event_) {
-      [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
+    /* The timing event is signalled only when an encoder was actually ended above. */
+    if (@available(macos 10.14, *)) {
+      if (timing_shared_event_) {
+        [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
+      }
    }
  }
 }