Cycles: Improved render start/stop responsiveness on Metal

All kernel specialisation is now performed in the background regardless of kernel type, meaning that the first render becomes visible a few seconds sooner. The only exception is during benchmark warm-up, in which case we wait for all kernels to be cached. When stopping a render, we call a new `cancel()` method on the device, which causes any outstanding compilation work to be cancelled, and we destroy the device in a detached thread so that any stale queued compilations can be safely purged without blocking the UI for longer than necessary.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16371
Michael Jones 2023-01-04 14:23:33 +00:00 committed by Michael Jones
parent fbc2c4c331
commit 77c3e67d3d
8 changed files with 390 additions and 194 deletions
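As a rough illustration of the pattern this change adopts, here is a minimal standalone C++ sketch (not Blender's actual classes; `Device`, `g_active_devices`, `compile_async` and the timings are illustrative assumptions): each device registers itself in a global ID map, background compilation jobs re-check that map before doing work or publishing results, `cancel()` simply erases the ID, and the caller polls a readiness flag instead of blocking on compilation.

#include <atomic>
#include <chrono>
#include <cstdio>
#include <map>
#include <mutex>
#include <thread>

class Device;

static std::mutex g_devices_mutex;
static std::map<int, Device *> g_active_devices; /* hypothetical registry of live devices */

class Device {
 public:
  Device()
  {
    std::lock_guard<std::mutex> lock(g_devices_mutex);
    static int counter = 1;
    id_ = counter++;
    g_active_devices[id_] = this;
  }

  ~Device()
  {
    cancel();
  }

  /* Remove this device from the registry; queued compilations for it become no-ops. */
  void cancel()
  {
    std::lock_guard<std::mutex> lock(g_devices_mutex);
    g_active_devices.erase(id_);
  }

  /* Kick off a "compilation" in the background, keyed by device ID rather than `this`. */
  void compile_async()
  {
    const int id = id_;
    std::thread([id] {
      {
        std::lock_guard<std::mutex> lock(g_devices_mutex);
        if (g_active_devices.find(id) == g_active_devices.end()) {
          std::printf("Device %d cancelled - skipping compile\n", id);
          return;
        }
      }
      /* The long-running work happens without holding the lock, so the device can be
       * destroyed (and its registry entry erased) while this runs. */
      std::this_thread::sleep_for(std::chrono::milliseconds(50));

      std::lock_guard<std::mutex> lock(g_devices_mutex);
      auto it = g_active_devices.find(id);
      if (it != g_active_devices.end()) {
        it->second->ready_ = true; /* publish the result only if the device is still alive */
      }
    }).detach();
  }

  bool is_ready() const
  {
    return ready_;
  }

 private:
  int id_ = 0;
  std::atomic<bool> ready_{false};
};

int main()
{
  Device device;
  device.compile_async();

  /* The caller polls readiness between updates instead of blocking inside kernel loading. */
  while (!device.is_ready()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
  std::printf("kernels ready\n");
  return 0;
}

Keying the async work by an integer ID rather than a pointer is what lets the device be torn down while compilations are still queued: a stale request simply finds no entry in the map and returns without touching freed memory.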

View File

@ -167,6 +167,17 @@ class Device {
return true;
}
/* Request cancellation of any long-running work. */
virtual void cancel()
{
}
/* Return true if device is ready for rendering, or report status if not. */
virtual bool is_ready(string &status) const
{
return true;
}
/* GPU device only functions.
* These may not be used on CPU or multi-devices. */

View File

@ -76,7 +76,20 @@ class MetalDevice : public Device {
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
std::atomic_bool async_compile_and_load = false;
int device_id = 0;
static thread_mutex existing_devices_mutex;
static std::map<int, MetalDevice *> active_device_ids;
static bool is_device_cancelled(int device_id);
static MetalDevice *get_device_by_ID(int device_id,
thread_scoped_lock &existing_devices_mutex_lock);
virtual bool is_ready(string &status) const override;
virtual void cancel() override;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@ -92,14 +105,12 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void erase_allocation(device_memory &mem);
@ -112,7 +123,7 @@ class MetalDevice : public Device {
virtual void optimize_for_scene(Scene *scene) override;
bool compile_and_load(MetalPipelineType pso_type);
static void compile_and_load(int device_id, MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */

View File

@ -13,10 +13,32 @@
# include "util/path.h"
# include "util/time.h"
# include <crt_externs.h>
CCL_NAMESPACE_BEGIN
class MetalDevice;
thread_mutex MetalDevice::existing_devices_mutex;
std::map<int, MetalDevice *> MetalDevice::active_device_ids;
/* Thread-safe device access for async work. Calling code must pass an appropriately scoped lock
* to existing_devices_mutex to safeguard against destruction of the returned instance. */
MetalDevice *MetalDevice::get_device_by_ID(int ID, thread_scoped_lock &existing_devices_mutex_lock)
{
auto it = active_device_ids.find(ID);
if (it != active_device_ids.end()) {
return it->second;
}
return nullptr;
}
bool MetalDevice::is_device_cancelled(int ID)
{
thread_scoped_lock lock(existing_devices_mutex);
return get_device_by_ID(ID, lock) == nullptr;
}
BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
{
return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
@ -40,6 +62,15 @@ void MetalDevice::set_error(const string &error)
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
{
/* Assign an ID for this device which we can use to query whether async shader compilation
* requests are still relevant. */
thread_scoped_lock lock(existing_devices_mutex);
static int existing_devices_counter = 1;
device_id = existing_devices_counter++;
active_device_ids[device_id] = this;
}
mtlDevId = info.num;
/* select chosen device */
@ -57,7 +88,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
if (@available(macos 11.0, *)) {
if ([mtlDevice hasUnifiedMemory]) {
default_storage_mode = MTLResourceStorageModeShared;
init_host_memory();
}
}
@ -181,6 +211,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
MetalDevice::~MetalDevice()
{
/* Cancel any async shader compilations that are in flight. */
cancel();
/* This lock safeguards against destruction during use (see other uses of
* existing_devices_mutex). */
thread_scoped_lock lock(existing_devices_mutex);
for (auto &tex : texture_slot_map) {
if (tex) {
[tex release];
@ -326,21 +363,66 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
bool result = compile_and_load(PSO_GENERIC);
/* Only request generic kernels if they aren't cached in memory. */
if (make_source_and_check_if_compile_needed(PSO_GENERIC)) {
/* If needed, load them asynchronously in order to responsively message progress to the user. */
int this_device_id = this->device_id;
auto compile_kernels_fn = ^() {
compile_and_load(this_device_id, PSO_GENERIC);
};
reserve_local_memory(kernel_features);
return result;
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
compile_kernels_fn);
}
return true;
}
bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
bool MetalDevice::make_source_and_check_if_compile_needed(MetalPipelineType pso_type)
{
make_source(pso_type, kernel_features);
if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
/* We already have a full set of matching pipelines which are cached or queued. */
metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
return true;
if (this->source[pso_type].empty()) {
make_source(pso_type, kernel_features);
}
return MetalDeviceKernels::should_load_kernels(this, pso_type);
}
void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
{
/* Thread-safe front-end compilation. Typically the MSL->AIR compilation can take a few seconds,
* so we avoid blocking device teardown if the user cancels a render immediately.
*/
id<MTLDevice> mtlDevice;
string source;
MetalGPUVendor device_vendor;
/* Safely gather any state required for the MSL->AIR compilation. */
{
thread_scoped_lock lock(existing_devices_mutex);
/* Check whether the device still exists. */
MetalDevice *instance = get_device_by_ID(device_id, lock);
if (!instance) {
metal_printf("Ignoring %s compilation request - device no longer exists\n",
kernel_type_as_string(pso_type));
return;
}
if (!instance->make_source_and_check_if_compile_needed(pso_type)) {
/* We already have a full set of matching pipelines which are cached or queued. Return early
* to avoid redundant MTLLibrary compilation. */
metal_printf("Ignoreing %s compilation request - kernels already requested\n",
kernel_type_as_string(pso_type));
return;
}
mtlDevice = instance->mtlDevice;
device_vendor = instance->device_vendor;
source = instance->source[pso_type];
}
/* Perform the actual compilation using our cached context. The MetalDevice can safely be
* destroyed in the meantime. */
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
@ -359,20 +441,15 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
source[pso_type]);
source);
}
const double starttime = time_dt();
NSError *error = NULL;
mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
options:options
error:&error];
if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
options:options
error:&error];
metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
@ -380,17 +457,21 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
[options release];
return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
{
/* METAL_WIP - implement this */
}
void MetalDevice::init_host_memory()
{
/* METAL_WIP - implement this */
/* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
* exists). */
{
thread_scoped_lock lock(existing_devices_mutex);
if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
if (mtlLibrary) {
instance->mtlLibrary[pso_type] = mtlLibrary;
MetalDeviceKernels::load(instance, pso_type);
}
else {
NSString *err = [error localizedDescription];
instance->set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
}
}
}
void MetalDevice::load_texture_info()
@ -700,55 +781,74 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
void MetalDevice::cancel()
{
/* Remove this device's ID from the list of active devices. Any pending compilation requests
* originating from this session will be cancelled. */
thread_scoped_lock lock(existing_devices_mutex);
if (device_id) {
active_device_ids.erase(device_id);
device_id = 0;
}
}
bool MetalDevice::is_ready(string &status) const
{
int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
if (num_loaded < DEVICE_KERNEL_NUM) {
status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
num_loaded,
DEVICE_KERNEL_NUM);
return false;
}
metal_printf("MetalDevice::is_ready(...) --> true\n");
return true;
}
void MetalDevice::optimize_for_scene(Scene *scene)
{
MetalPipelineType specialization_level = kernel_specialization_level;
if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
return;
}
/* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
* synchronously. */
compile_and_load(PSO_SPECIALIZED_INTERSECT);
if (specialization_level < PSO_SPECIALIZED_SHADE) {
return;
}
if (!scene->params.background) {
/* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
* build. */
return;
/* In live viewport, don't specialize beyond intersection kernels for responsiveness. */
specialization_level = (MetalPipelineType)min(specialization_level, PSO_SPECIALIZED_INTERSECT);
}
/* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
* only if there isn't an existing load in flight.
*/
auto specialize_shade_fn = ^() {
compile_and_load(PSO_SPECIALIZED_SHADE);
async_compile_and_load = false;
/* For responsive rendering, specialize the kernels in the background, and only if there isn't an
* existing "optimize_for_scene" request in flight. */
int this_device_id = this->device_id;
auto specialize_kernels_fn = ^() {
for (int level = 1; level <= int(specialization_level); level++) {
compile_and_load(this_device_id, MetalPipelineType(level));
}
};
bool async_specialize_shade = true;
/* In normal use, we always compile the specialized kernels in the background. */
bool specialize_in_background = true;
/* Block if per-kernel profiling is enabled (ensures a steady rendering rate). */
if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
async_specialize_shade = false;
specialize_in_background = false;
}
if (async_specialize_shade) {
if (!async_compile_and_load) {
async_compile_and_load = true;
/* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
for (int i = 0; i < *_NSGetArgc(); i++) {
if (!strcmp((*_NSGetArgv())[i], "--warm-up")) {
specialize_in_background = false;
}
}
if (specialize_in_background) {
if (!MetalDeviceKernels::any_specialization_happening_now()) {
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
specialize_shade_fn);
specialize_kernels_fn);
}
else {
metal_printf(
"Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
metal_printf("\"optimize_for_scene\" request already in flight - dropping request\n");
}
}
else {
specialize_shade_fn();
specialize_kernels_fn();
}
}

View File

@ -64,6 +64,8 @@ struct MetalKernelPipeline {
void compile();
int originating_device_id;
id<MTLLibrary> mtlLibrary = nil;
MetalPipelineType pso_type;
string source_md5;
@ -94,7 +96,9 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
bool any_specialization_happening_now();
int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_type);
bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);

View File

@ -86,23 +86,17 @@ struct ShaderCache {
void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
bool should_load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
MetalDevice const *device,
MetalPipelineType pso_type);
void wait_for_all();
private:
friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
void compile_thread_func(int thread_index);
using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
struct PipelineRequest {
MetalKernelPipeline *pipeline = nullptr;
std::function<void(MetalKernelPipeline *)> completionHandler;
};
struct OccupancyTuningParameters {
int threads_per_threadgroup = 0;
int num_threads_per_block = 0;
@ -113,13 +107,15 @@ struct ShaderCache {
PipelineCollection pipelines[DEVICE_KERNEL_NUM];
id<MTLDevice> mtlDevice;
bool running = false;
static bool running;
std::condition_variable cond_var;
std::deque<PipelineRequest> request_queue;
std::deque<MetalKernelPipeline *> request_queue;
std::vector<std::thread> compile_threads;
std::atomic_int incomplete_requests = 0;
std::atomic_int incomplete_specialization_requests = 0;
};
bool ShaderCache::running = true;
std::mutex g_shaderCacheMutex;
std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
@ -137,11 +133,25 @@ ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
ShaderCache::~ShaderCache()
{
metal_printf("ShaderCache shutting down with incomplete_requests = %d\n",
int(incomplete_requests));
running = false;
cond_var.notify_all();
int num_incomplete = int(incomplete_requests);
if (num_incomplete) {
/* Shutting down the app with incomplete shader compilation requests. Give 1 second's grace for
* clean shutdown. */
metal_printf("ShaderCache busy (incomplete_requests = %d)...\n", num_incomplete);
std::this_thread::sleep_for(std::chrono::seconds(1));
num_incomplete = int(incomplete_requests);
}
if (num_incomplete) {
metal_printf("ShaderCache still busy (incomplete_requests = %d). Terminating...\n",
num_incomplete);
std::terminate();
}
metal_printf("ShaderCache idle. Shutting down.\n");
for (auto &thread : compile_threads) {
thread.join();
}
@ -156,35 +166,69 @@ void ShaderCache::wait_for_all()
void ShaderCache::compile_thread_func(int thread_index)
{
while (1) {
while (running) {
/* wait for / acquire next request */
PipelineRequest request;
MetalKernelPipeline *pipeline;
{
thread_scoped_lock lock(cache_mutex);
cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
if (!running) {
break;
if (!running || request_queue.empty()) {
continue;
}
if (!request_queue.empty()) {
request = request_queue.front();
request_queue.pop_front();
}
pipeline = request_queue.front();
request_queue.pop_front();
}
/* service request */
if (request.pipeline) {
request.pipeline->compile();
incomplete_requests--;
/* Service the request. */
DeviceKernel device_kernel = pipeline->device_kernel;
MetalPipelineType pso_type = pipeline->pso_type;
if (MetalDevice::is_device_cancelled(pipeline->originating_device_id)) {
/* The originating MetalDevice is no longer active, so this request is obsolete. */
metal_printf("Cancelling compilation of %s (%s)\n",
device_kernel_as_string(device_kernel),
kernel_type_as_string(pso_type));
}
else {
/* Do the actual compilation. */
pipeline->compile();
thread_scoped_lock lock(cache_mutex);
auto &collection = pipelines[device_kernel];
/* Cache up to 3 kernel variants with the same pso_type in memory, purging oldest first. */
int max_entries_of_same_pso_type = 3;
for (int i = (int)collection.size() - 1; i >= 0; i--) {
if (collection[i]->pso_type == pso_type) {
max_entries_of_same_pso_type -= 1;
if (max_entries_of_same_pso_type == 0) {
metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
kernel_type_as_string(pso_type),
device_kernel_as_string(device_kernel));
collection.erase(collection.begin() + i);
break;
}
}
}
collection.push_back(unique_ptr<MetalKernelPipeline>(pipeline));
}
incomplete_requests--;
if (pso_type != PSO_GENERIC) {
incomplete_specialization_requests--;
}
}
}
bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
MetalDevice const *device,
MetalPipelineType pso_type)
{
if (!running) {
return false;
}
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
/* Skip megakernel. */
return false;
@ -240,7 +284,6 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
/* create compiler threads on first run */
thread_scoped_lock lock(cache_mutex);
if (compile_threads.empty()) {
running = true;
for (int i = 0; i < max_mtlcompiler_threads; i++) {
compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
}
@ -252,53 +295,39 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
}
incomplete_requests++;
if (pso_type != PSO_GENERIC) {
incomplete_specialization_requests++;
}
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
memcpy(&request.pipeline->kernel_data_,
&device->launch_params.data,
sizeof(request.pipeline->kernel_data_));
request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
request.pipeline->source_md5 = device->source_md5[pso_type];
request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
MetalKernelPipeline *pipeline = new MetalKernelPipeline;
/* Keep track of the originating device's ID so that we can cancel requests if the device ceases
* to be active. */
pipeline->originating_device_id = device->device_id;
memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
pipeline->pso_type = pso_type;
pipeline->mtlDevice = mtlDevice;
pipeline->source_md5 = device->source_md5[pso_type];
pipeline->mtlLibrary = device->mtlLibrary[pso_type];
pipeline->device_kernel = device_kernel;
pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
request.pipeline->threads_per_threadgroup =
pipeline->threads_per_threadgroup =
occupancy_tuning[device_kernel].threads_per_threadgroup;
request.pipeline->num_threads_per_block =
pipeline->num_threads_per_block =
occupancy_tuning[device_kernel].num_threads_per_block;
}
/* metalrt options */
request.pipeline->use_metalrt = device->use_metalrt;
request.pipeline->metalrt_features = device->use_metalrt ?
(device->kernel_features & METALRT_FEATURE_MASK) :
0;
pipeline->use_metalrt = device->use_metalrt;
pipeline->metalrt_features = device->use_metalrt ?
(device->kernel_features & METALRT_FEATURE_MASK) :
0;
{
thread_scoped_lock lock(cache_mutex);
auto &collection = pipelines[device_kernel];
/* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
int max_entries_of_same_pso_type = 3;
for (int i = (int)collection.size() - 1; i >= 0; i--) {
if (collection[i]->pso_type == pso_type) {
max_entries_of_same_pso_type -= 1;
if (max_entries_of_same_pso_type == 0) {
metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
kernel_type_as_string(pso_type),
device_kernel_as_string(device_kernel));
collection.erase(collection.begin() + i);
break;
}
}
}
collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
request_queue.push_back(pipeline);
}
cond_var.notify_one();
}
@ -664,51 +693,61 @@ void MetalKernelPipeline::compile()
double starttime = time_dt();
MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
id<MTLComputePipelineState> computePipelineState,
MTLComputePipelineReflection *reflection,
NSError *error) {
bool recreate_archive = false;
if (computePipelineState == nil && archive) {
/* Block on load to ensure we continue with a valid kernel function */
if (creating_new_archive) {
starttime = time_dt();
NSError *error;
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
error:&error]) {
NSString *errStr = [error localizedDescription];
metal_printf(
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
"(error: %s)\n",
device_kernel_as_string((DeviceKernel)device_kernel),
errStr ? [errStr UTF8String] : "nil");
computePipelineState = [mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:MTLPipelineOptionNone
reflection:nullptr
error:&error];
recreate_archive = true;
metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
}
}
double duration = time_dt() - starttime;
pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:pipelineOptions
reflection:nullptr
error:&error];
if (computePipelineState == nil) {
NSString *errStr = [error localizedDescription];
error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
device_kernel_as_string((DeviceKernel)device_kernel));
error_str += (errStr ? [errStr UTF8String] : "nil");
metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
kernel_type_as_string(pso_type),
device_kernel,
device_kernel_as_string((DeviceKernel)device_kernel),
duration);
return;
}
bool recreate_archive = false;
if (pipeline == nil && archive) {
NSString *errStr = [error localizedDescription];
metal_printf(
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
"(error: %s)\n",
device_kernel_as_string((DeviceKernel)device_kernel),
errStr ? [errStr UTF8String] : "nil");
pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:MTLPipelineOptionNone
reflection:nullptr
error:&error];
recreate_archive = true;
}
if (!num_threads_per_block) {
num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
computePipelineState.threadExecutionWidth);
num_threads_per_block = std::max(num_threads_per_block,
(int)computePipelineState.threadExecutionWidth);
}
double duration = time_dt() - starttime;
this->pipeline = computePipelineState;
if (pipeline == nil) {
NSString *errStr = [error localizedDescription];
error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
device_kernel_as_string((DeviceKernel)device_kernel));
error_str += (errStr ? [errStr UTF8String] : "nil");
metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
kernel_type_as_string(pso_type),
device_kernel,
device_kernel_as_string((DeviceKernel)device_kernel),
duration);
return;
}
if (@available(macOS 11.0, *)) {
if (!num_threads_per_block) {
num_threads_per_block = round_down(pipeline.maxTotalThreadsPerThreadgroup,
pipeline.threadExecutionWidth);
num_threads_per_block = std::max(num_threads_per_block,
(int)pipeline.threadExecutionWidth);
}
if (@available(macOS 11.0, *)) {
if (ShaderCache::running) {
if (creating_new_archive || recreate_archive) {
if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
error:&error]) {
@ -720,24 +759,7 @@ void MetalKernelPipeline::compile()
}
}
}
};
/* Block on load to ensure we continue with a valid kernel function */
if (creating_new_archive) {
starttime = time_dt();
NSError *error;
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
error:&error]) {
NSString *errStr = [error localizedDescription];
metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
}
}
id<MTLComputePipelineState> pipeline = [mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:pipelineOptions
reflection:nullptr
error:&error];
completionHandler(pipeline, nullptr, error);
this->loaded = true;
[computePipelineStateDescriptor release];
@ -763,8 +785,6 @@ void MetalKernelPipeline::compile()
}
}
double duration = time_dt() - starttime;
if (!use_binary_archive) {
metal_printf("%16s | %2d | %-55s | %7.2fs\n",
kernel_type_as_string(pso_type),
@ -791,24 +811,46 @@ bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
shader_cache->wait_for_all();
metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
if (getenv("CYCLES_METAL_PROFILING")) {
shader_cache->wait_for_all();
metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
}
return true;
}
bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
bool MetalDeviceKernels::any_specialization_happening_now()
{
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
/* Return true if any ShaderCaches have ongoing specialization requests (typically there will be
* only 1). */
thread_scoped_lock lock(g_shaderCacheMutex);
for (auto &it : g_shaderCache) {
if (it.second->incomplete_specialization_requests > 0) {
return true;
}
}
return false;
}
int MetalDeviceKernels::get_loaded_kernel_count(MetalDevice const *device,
MetalPipelineType pso_type)
{
auto shader_cache = get_shader_cache(device->mtlDevice);
int loaded_count = DEVICE_KERNEL_NUM;
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
loaded_count -= 1;
}
}
return loaded_count;
}
bool MetalDeviceKernels::should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type)
{
return get_loaded_kernel_count(device, pso_type) != DEVICE_KERNEL_NUM;
}
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
DeviceKernel kernel)
{

View File

@ -702,6 +702,10 @@ bool MetalDeviceQueue::synchronize()
void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
if (metal_device_->have_error()) {
return;
}
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
@ -729,6 +733,10 @@ void MetalDeviceQueue::zero_to_device(device_memory &mem)
void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
if (metal_device_->have_error()) {
return;
}
if (mem.memory_size() == 0) {
return;
}
@ -771,6 +779,10 @@ void MetalDeviceQueue::copy_to_device(device_memory &mem)
void MetalDeviceQueue::copy_from_device(device_memory &mem)
{
if (metal_device_->have_error()) {
return;
}
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {

View File

@ -390,6 +390,9 @@ void PathTrace::path_trace(RenderWork &render_work)
const int num_samples = render_work.path_trace.num_samples;
PathTraceWork *path_trace_work = path_trace_works_[i].get();
if (path_trace_work->get_device()->have_error()) {
return;
}
PathTraceWork::RenderStatistics statistics;
path_trace_work->render_samples(statistics,

View File

@ -113,6 +113,9 @@ void Session::start()
void Session::cancel(bool quick)
{
/* Cancel any long running device operations (e.g. shader compilations). */
device->cancel();
/* Check if session thread is rendering. */
const bool rendering = is_session_thread_rendering();
@ -401,6 +404,16 @@ RenderWork Session::run_update_for_next_iteration()
path_trace_->load_kernels();
path_trace_->alloc_work_memory();
/* Wait for device to be ready (e.g. finish any background compilations). */
string device_status;
while (!device->is_ready(device_status)) {
progress.set_status(device_status);
if (progress.get_cancel()) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
}
progress.add_skip_time(update_timer, params.background);
}