Cycles: Apple Silicon optimization to specialize intersection kernels

The Metal backend now compiles and caches a second set of kernels that are
optimized for the scene contents. This is enabled by default on Apple Silicon.

The implementation supports doing this for both intersection and shading
kernels. However, it is currently only enabled for the intersection kernels,
which are quick to compile and already give a good speedup. Enabling it for
the shading kernels would be faster still; however, it also causes long wait
times and would need a good user interface to control.
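
For context, below is a minimal sketch of the Metal function-constant mechanism
that the specialized pipelines rely on (the kernel name and constant index are
hypothetical, not code from this patch). The generated kernel source declares a
function constant in place of a KernelData member, and the host bakes the
scene's value in when building the pipeline, so the Metal compiler can
constant-fold and dead-strip around it:

#import <Metal/Metal.h>

/* Device side (MSL), generated from data_template.h:
 *   constant int kernel_data_integrator_max_bounce [[function_constant(0)]];
 */

/* Host side: bake the scene's value into the function at specialization time. */
static id<MTLFunction> specialize(id<MTLLibrary> lib, int max_bounce)
{
  MTLFunctionConstantValues *values = [MTLFunctionConstantValues new];
  [values setConstantValue:&max_bounce type:MTLDataTypeInt atIndex:0];
  NSError *error = nil;
  return [lib newFunctionWithName:@"cycles_metal_integrator_intersect_closest"
                   constantValues:values
                            error:&error];
}

The specialization level can also be forced with the
CYCLES_METAL_SPECIALIZATION_LEVEL environment variable (0 = PSO_GENERIC,
1 = PSO_SPECIALIZED_INTERSECT, 2 = PSO_SPECIALIZED_SHADE).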

M1 Max samples per minute (macOS 13.0):

                    PSO_GENERIC  PSO_SPECIALIZED_INTERSECT  PSO_SPECIALIZED_SHADE

barbershop_interior       83.4                   89.5                  93.7
bmw27                   1486.1                 1671.0                1825.8
classroom                175.2                  196.8                 206.3
fishy_cat                674.2                  704.3                 719.3
junkshop                 205.4                  212.0                 257.7
koro                     310.1                  336.1                 342.8
monster                  376.7                  418.6                 424.1
pabellon                 273.5                  325.4                 339.8
sponza                   830.6                  929.6                1142.4
victor                    86.7                   96.4                  96.3
wdas_cloud               111.8                  112.7                 183.1

Code contributed by Jason Fielder, Morteza Mostajabodaveh and Michael Jones

Differential Revision: https://developer.blender.org/D14645
Michael Jones (Apple) authored on 2022-07-12 15:32:46 +02:00; committed by Brecht Van Lommel
parent 5653c5fcdd
commit da4ef05e4d
13 changed files with 401 additions and 129 deletions

View File

@@ -29,6 +29,7 @@ class DeviceQueue;
class Progress;
class CPUKernels;
class CPUKernelThreadGlobals;
class Scene;
/* Device Types */
@@ -186,6 +187,11 @@ class Device {
return 0;
}
/* Called after kernel texture setup, and prior to integrator state setup. */
virtual void optimize_for_scene(Scene *scene)
{
}
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
/* Memory is always resident if this is not a multi device, regardless of whether the pointer

View File

@@ -75,7 +75,8 @@ class MetalDevice : public Device {
std::vector<id<MTLTexture>> texture_slot_map;
bool use_metalrt = false;
bool use_function_specialisation = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
std::atomic_bool async_compile_and_load = false;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@@ -91,9 +92,7 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
string get_source(const uint kernel_features);
string compile_kernel(const uint kernel_features, const char *name);
void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
@@ -111,7 +110,9 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
id<MTLLibrary> compile(string const &source);
virtual void optimize_for_scene(Scene *scene) override;
bool compile_and_load(MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */

View File

@@ -6,6 +6,8 @@
# include "device/metal/device_impl.h"
# include "device/metal/device.h"
# include "scene/scene.h"
# include "util/debug.h"
# include "util/md5.h"
# include "util/path.h"
@@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
/* Specialize the intersection kernels on Apple GPUs by default as these can be built very
* quickly. */
kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
}
}
@@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
capture_enabled = true;
}
if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
kernel_specialization_level = (MetalPipelineType)atoi(envstr);
}
metal_printf("kernel_specialization_level = %s\n",
kernel_type_as_string(
(MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -209,12 +222,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
string MetalDevice::get_source(const uint kernel_features)
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string build_options;
string global_defines;
if (use_adaptive_compilation()) {
build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
@@ -222,52 +234,78 @@ string MetalDevice::get_source(const uint kernel_features)
}
if (use_metalrt) {
build_options += "-D__METALRT__ ";
global_defines += "#define __METALRT__\n";
if (motion_blur) {
build_options += "-D__METALRT_MOTION__ ";
global_defines += "#define __METALRT_MOTION__\n";
}
}
# ifdef WITH_CYCLES_DEBUG
build_options += "-D__KERNEL_DEBUG__ ";
global_defines += "#define __KERNEL_DEBUG__\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
build_options += "-D__KERNEL_METAL_INTEL__ ";
global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
build_options += "-D__KERNEL_METAL_AMD__ ";
global_defines += "#define __KERNEL_METAL_AMD__\n";
break;
case METAL_GPU_APPLE:
build_options += "-D__KERNEL_METAL_APPLE__ ";
global_defines += "#define __KERNEL_METAL_APPLE__\n";
break;
}
/* reformat -D defines list into compilable form */
vector<string> components;
string_replace(build_options, "-D", "");
string_split(components, build_options, " ");
string globalDefines;
for (const string &component : components) {
vector<string> assignments;
string_split(assignments, component, "=");
if (assignments.size() == 2)
globalDefines += string_printf(
"#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
else
globalDefines += string_printf("#define %s\n", assignments[0].c_str());
}
string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
string &source = this->source[pso_type];
source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
source = path_source_replace_includes(source, path_get("source"));
metal_printf("Global defines:\n%s\n", globalDefines.c_str());
/* Perform any required specialization on the source.
* With Metal function constants we can generate a single variant of the kernel source which can
* be repeatedly respecialized.
*/
string baked_constants;
return source;
/* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
* the same character length. Build a string of all active constant values which is then hashed
* in order to identify the PSO.
*/
if (pso_type != PSO_GENERIC) {
const double starttime = time_dt();
# define KERNEL_STRUCT_BEGIN(name, parent) \
string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
/* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
baked_constants += string(#parent "." #name "=") + \
to_string(_type(launch_params.data.parent.name)) + "\n";
# include "kernel/data_template.h"
/* Opt in to all available specializations. This can be made more granular for the
* PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
* but the overhead should be negligible as these are very quick to (re)build and aren't
* serialized to disk via MTLBinaryArchives.
*/
global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
}
source = global_defines + source;
metal_printf("================\n%s================\n\%s================\n",
global_defines.c_str(),
baked_constants.c_str());
/* Generate an MD5 from the source and include any baked constants. This is used when caching
* PSOs. */
MD5Hash md5;
md5.append(baked_constants);
md5.append(source);
source_md5[pso_type] = md5.get_hex();
}
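
For illustration, each specialized KernelData member contributes one
"parent.name=value" line to baked_constants, so the PSO hash changes exactly
when a baked value changes. A hypothetical excerpt (member names and values
invented for this example):

  integrator.max_bounce=12
  background.transparent=0
  film.exposure=1.000000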
bool MetalDevice::load_kernels(const uint _kernel_features)
@@ -283,28 +321,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
source[PSO_GENERIC] = get_source(kernel_features);
const double starttime = time_dt();
mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
metal_printf("Front-end compilation finished in %.1f seconds (generic)\n",
time_dt() - starttime);
MD5Hash md5;
md5.append(source[PSO_GENERIC]);
source_md5[PSO_GENERIC] = md5.get_hex();
bool result = MetalDeviceKernels::load(this, false);
bool result = compile_and_load(PSO_GENERIC);
reserve_local_memory(kernel_features);
return result;
}
id<MTLLibrary> MetalDevice::compile(string const &source)
bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
{
make_source(pso_type, kernel_features);
if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
/* We already have a full set of matching pipelines which are cached or queued. */
metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
return true;
}
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
@@ -312,19 +344,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source)
options.languageVersion = MTLLanguageVersion2_4;
}
NSError *error = NULL;
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
options:options
error:&error];
if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
source[pso_type]);
}
if (!mtlLibrary) {
const double starttime = time_dt();
NSError *error = NULL;
mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
options:options
error:&error];
if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
[options release];
return mtlLibrary;
return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -631,6 +674,58 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
void MetalDevice::optimize_for_scene(Scene *scene)
{
MetalPipelineType specialization_level = kernel_specialization_level;
if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
return;
}
/* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
* synchronously. */
compile_and_load(PSO_SPECIALIZED_INTERSECT);
if (specialization_level < PSO_SPECIALIZED_SHADE) {
return;
}
if (!scene->params.background) {
/* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
* build. */
return;
}
/* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
* only if there isn't an existing load in flight.
*/
auto specialize_shade_fn = ^() {
compile_and_load(PSO_SPECIALIZED_SHADE);
async_compile_and_load = false;
};
bool async_specialize_shade = true;
/* Block if per-kernel profiling is enabled (to ensure a steady rendering rate). */
if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
async_specialize_shade = false;
}
if (async_specialize_shade) {
if (!async_compile_and_load) {
async_compile_and_load = true;
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
specialize_shade_fn);
}
else {
metal_printf(
"Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
}
}
else {
specialize_shade_fn();
}
}
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
if (strcmp(name, "data") == 0) {

View File

@@ -31,7 +31,7 @@ enum {
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
enum {
enum MetalPipelineType {
/* A kernel that can be used with all scenes, supporting all features.
* It is slow to compile, but only needs to be compiled once and is then
* cached for future render sessions. This allows a render to get underway
@@ -39,28 +39,33 @@ enum {
*/
PSO_GENERIC,
/* A kernel that is relatively quick to compile, but is specialized for the
* scene being rendered. It only contains the required functionality, and even
* bakes in constants for some values, which means it needs to be recompiled
* whenever a dependent setting is changed. The render performance of this
* kernel is significantly faster though, and justifies the extra compile time.
/* An intersection kernel that is very quick to specialize and results in faster intersection
* kernel performance. It uses Metal function constants to replace several KernelData variables
* with fixed constants.
*/
/* METAL_WIP: This isn't used and will require more changes to enable. */
PSO_SPECIALISED,
PSO_SPECIALIZED_INTERSECT,
/* A shading kernel that is slow to specialize, but results in faster shading kernel performance.
* It uses Metal function constants to replace several KernelData variables with fixed constants
* and to short-circuit all unused SVM node case handlers.
*/
PSO_SPECIALIZED_SHADE,
PSO_NUM
};
const char *kernel_type_as_string(int kernel_type);
const char *kernel_type_as_string(MetalPipelineType pso_type);
struct MetalKernelPipeline {
void compile();
id<MTLLibrary> mtlLibrary = nil;
bool scene_specialized;
MetalPipelineType pso_type;
string source_md5;
size_t usage_count = 0;
KernelData kernel_data_;
bool use_metalrt;
bool metalrt_hair;
bool metalrt_hair_thick;
@@ -75,6 +80,8 @@ struct MetalKernelPipeline {
id<MTLComputePipelineState> pipeline = nil;
int num_threads_per_block = 0;
bool should_use_binary_archive() const;
string error_str;
API_AVAILABLE(macos(11.0))
@@ -85,7 +92,8 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
bool load(MetalDevice *device, bool scene_specialized);
bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
} /* namespace MetalDeviceKernels */

View File

@@ -5,6 +5,7 @@
# include "device/metal/kernel.h"
# include "device/metal/device_impl.h"
# include "kernel/device/metal/function_constants.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
/* limit to 2 MTLCompiler instances */
int max_mtlcompiler_threads = 2;
const char *kernel_type_as_string(int kernel_type)
const char *kernel_type_as_string(MetalPipelineType pso_type)
{
switch (kernel_type) {
switch (pso_type) {
case PSO_GENERIC:
return "PSO_GENERIC";
case PSO_SPECIALISED:
return "PSO_SPECIALISED";
case PSO_SPECIALIZED_INTERSECT:
return "PSO_SPECIALIZED_INTERSECT";
case PSO_SPECIALIZED_SHADE:
return "PSO_SPECIALIZED_SHADE";
default:
assert(0);
}
@@ -50,7 +53,11 @@ struct ShaderCache {
/* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
* device. */
void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
bool should_load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
MetalPipelineType pso_type);
void wait_for_all();
@@ -139,9 +146,53 @@ void ShaderCache::compile_thread_func(int thread_index)
}
}
bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
MetalPipelineType pso_type)
{
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
/* Skip megakernel. */
return false;
}
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
/* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
return false;
}
}
if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
return false;
}
/* Only specialize shading / intersection kernels as requested. */
bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
if (is_shade_pso != is_shade_kernel) {
return false;
}
}
{
/* check whether the kernel has already been requested / cached */
thread_scoped_lock lock(cache_mutex);
for (auto &pipeline : pipelines[device_kernel]) {
if (pipeline->source_md5 == device->source_md5[pso_type]) {
return false;
}
}
}
return true;
}
void ShaderCache::load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
bool scene_specialized)
MetalPipelineType pso_type)
{
{
/* create compiler threads on first run */
@@ -154,52 +205,21 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
}
}
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
/* skip megakernel */
if (!should_load_kernel(device_kernel, device, pso_type)) {
return;
}
if (scene_specialized) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
return;
}
}
{
/* check whether the kernel has already been requested / cached */
thread_scoped_lock lock(cache_mutex);
for (auto &pipeline : pipelines[device_kernel]) {
if (scene_specialized) {
if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
/* we already requested a pipeline that is specialized for this kernel data */
metal_printf("Specialized kernel already requested (%s)\n",
device_kernel_as_string(device_kernel));
return;
}
}
else {
if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
/* we already requested a generic pipeline for this kernel */
metal_printf("Generic kernel already requested (%s)\n",
device_kernel_as_string(device_kernel));
return;
}
}
}
}
incomplete_requests++;
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
request.pipeline->scene_specialized = scene_specialized;
memcpy(&request.pipeline->kernel_data_,
&device->launch_params.data,
sizeof(request.pipeline->kernel_data_));
request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
request.pipeline->source_md5 =
device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
request.pipeline->mtlLibrary =
device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
request.pipeline->source_md5 = device->source_md5[pso_type];
request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
{
thread_scoped_lock lock(cache_mutex);
pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
auto &collection = pipelines[device_kernel];
/* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
int max_entries_of_same_pso_type = 3;
for (int i = (int)collection.size() - 1; i >= 0; i--) {
if (collection[i]->pso_type == pso_type) {
max_entries_of_same_pso_type -= 1;
if (max_entries_of_same_pso_type == 0) {
metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
kernel_type_as_string(pso_type),
device_kernel_as_string(device_kernel));
collection.erase(collection.begin() + i);
break;
}
}
}
collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
}
cond_var.notify_one();
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
continue;
}
if (pipeline->scene_specialized) {
if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
if (pipeline->pso_type != PSO_GENERIC) {
if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
best_pipeline = pipeline.get();
}
}
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
}
}
if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
metal_printf("Swapping in %s version of %s\n",
kernel_type_as_string(best_pipeline->pso_type),
device_kernel_as_string(kernel));
}
best_pipeline->usage_count += 1;
return best_pipeline;
}
bool MetalKernelPipeline::should_use_binary_archive() const
{
if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
if (atoi(str) != 0) {
/* Don't archive if we have opted out by env var. */
return false;
}
}
if (pso_type == PSO_GENERIC) {
/* Archive the generic kernels. */
return true;
}
if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
/* Archive all shade kernels - they take a long time to compile. */
return true;
}
/* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
* but will be quick to regenerate if not. */
return false;
}
static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
{
MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
MTLDataType MTLDataType_int = MTLDataTypeInt;
MTLDataType MTLDataType_float = MTLDataTypeFloat;
MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
KernelData zero_data = {0};
if (!data) {
data = &zero_data;
}
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
[constant_values setConstantValue:&data->parent.name \
type:MTLDataType_##_type \
atIndex:KernelData_##parent##_##name];
# include "kernel/data_template.h"
return constant_values;
}
void MetalKernelPipeline::compile()
{
int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
if (pso_type == PSO_SPECIALIZED_SHADE) {
func_desc.constantValues = GetConstantValues(&kernel_data_);
}
else if (pso_type == PSO_SPECIALIZED_INTERSECT) {
func_desc.constantValues = GetConstantValues(&kernel_data_);
}
else {
func_desc.constantValues = GetConstantValues();
}
function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
}
@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
bool use_binary_archive = true;
if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
use_binary_archive = (atoi(str) == 0);
}
bool use_binary_archive = should_use_binary_archive();
id<MTLBinaryArchive> archive = nil;
string metalbin_path;
@@ -608,17 +706,30 @@ void MetalKernelPipeline::compile()
}
}
bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
const double starttime = time_dt();
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
shader_cache->wait_for_all();
metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
return true;
}
bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
{
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
return true;
}
}
if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
shader_cache->wait_for_all();
}
return true;
return false;
}
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,

View File

@@ -79,6 +79,7 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS
device/metal/compat.h
device/metal/context_begin.h
device/metal/context_end.h
device/metal/function_constants.h
device/metal/globals.h
)

View File

@@ -0,0 +1,14 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2021-2022 Blender Foundation */
enum {
#define KERNEL_STRUCT_MEMBER(parent, type, name) KernelData_##parent##_##name,
#include "kernel/data_template.h"
};
#ifdef __KERNEL_METAL__
# define KERNEL_STRUCT_MEMBER(parent, type, name) \
constant type kernel_data_##parent##_##name \
[[function_constant(KernelData_##parent##_##name)]];
# include "kernel/data_template.h"
#endif
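
For a hypothetical data_template.h entry
KERNEL_STRUCT_MEMBER(integrator, int, max_bounce), the two passes above expand
to roughly:

enum { KernelData_integrator_max_bounce };

constant int kernel_data_integrator_max_bounce
    [[function_constant(KernelData_integrator_max_bounce)]];

This is what the same-length "kernel_data.integrator." to
"kernel_data_integrator_" patching in make_source() resolves against.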

View File

@@ -5,6 +5,7 @@
#include "kernel/device/metal/compat.h"
#include "kernel/device/metal/globals.h"
#include "kernel/device/metal/function_constants.h"
#include "kernel/device/gpu/kernel.h"
/* MetalRT intersection handlers */

View File

@@ -204,7 +204,14 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
#define SVM_CASE(node) case node:
#ifdef __KERNEL_USE_DATA_CONSTANTS__
# define SVM_CASE(node) \
case node: \
if (!kernel_data_svm_usage_##node) \
break;
#else
# define SVM_CASE(node) case node:
#endif
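
With __KERNEL_USE_DATA_CONSTANTS__ defined, a case such as
SVM_CASE(NODE_CLOSURE_BSDF) (a real SVM node type, used here only for
illustration) expands to roughly:

case NODE_CLOSURE_BSDF:
  if (!kernel_data_svm_usage_NODE_CLOSURE_BSDF)
    break;
  /* original node handler follows */

When the scene bakes the usage flag to false, the branch is a compile-time
constant and the specialized shading kernel can dead-strip the handler.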
/* Main Interpreter Loop */
template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>

View File

@@ -1136,7 +1136,13 @@ typedef enum KernelBVHLayout {
} \
; \
static_assert_align(name, 16);
#define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
#ifdef __KERNEL_USE_DATA_CONSTANTS__
# define KERNEL_STRUCT_MEMBER(parent, type, name) type __unused_##name;
#else
# define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
#endif
#include "kernel/data_template.h"
typedef struct KernelTables {

View File

@@ -369,6 +369,8 @@ void Scene::device_update(Device *device_, Progress &progress)
device->const_copy_to("data", &dscene.data, sizeof(dscene.data));
}
device->optimize_for_scene(this);
if (print_stats) {
size_t mem_used = util_guarded_get_mem_used();
size_t mem_peak = util_guarded_get_mem_peak();

View File

@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
}
}
void string_replace_same_length(string &haystack, const string &needle, const string &other)
{
assert(needle.size() == other.size());
size_t pos = 0;
while (pos != string::npos) {
pos = haystack.find(needle, pos);
if (pos != string::npos) {
memcpy(haystack.data() + pos, other.data(), other.size());
pos += other.size();
}
}
}
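
A usage sketch (member name hypothetical): the needle and its replacement are
both 23 characters long, so the substitution happens in place via memcpy and no
byte offsets in the large kernel source string shift:

string s = "kernel_data.integrator.max_bounce";
string_replace_same_length(s, "kernel_data.integrator.", "kernel_data_integrator_");
/* s is now "kernel_data_integrator_max_bounce" */
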
string string_remove_trademark(const string &s)
{
string result = s;
@@ -164,6 +177,11 @@ string to_string(const char *str)
return string(str);
}
string to_string(const float4 &v)
{
return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
}
string string_to_lower(const string &s)
{
string r = s;

View File

@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
void string_replace_same_length(string &haystack, const string &needle, const string &other);
bool string_startswith(string_view s, string_view start);
bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
string to_string(const float4 &v);
string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII