Revert "Cycles: Enable inlining on Apple Silicon for 1.1x speedup"

This reverts commit b82de02e7c. It is causing crashes in various regression tests. Ref D14763
Referenced by issue #101365, Regression: Can't save second Viewlayer as .exr Referenced by issue #99732, Regression: Crash when opening the F3 menu in the dopesheet
2022-04-28 00:46:14 +02:00 · 2022-04-28 00:46:14 +02:00 · 52a5f68562 · 2023-02-14 11:25:11 +01:00
parent 3558f565f1
commit 52a5f68562
6 changed files with 576 additions and 593 deletions
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@ -28,8 +28,7 @@ class MetalDevice : public Device {
  id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
      nil; /* encoder used for fetching device pointers from MTLBuffers */
-  string source[PSO_NUM];
-  string source_md5[PSO_NUM];
+  string source_used_for_compile[PSO_NUM];

  KernelParamsMetal launch_params = {0};

@ -111,12 +110,6 @@ class MetalDevice : public Device {

  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;

-  id<MTLLibrary> compile(string const &source);
-
-  const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const;
-
-  bool kernel_available(DeviceKernel kernel) const;
-
  /* ------------------------------------------------------------------ */
  /* low-level memory management */

--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@ -275,44 +275,96 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
   * active, but may still need to be rendered without motion blur if that isn't active as well. */
  motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;

-  source[PSO_GENERIC] = get_source(kernel_features);
-  mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
+  NSError *error = NULL;

-  MD5Hash md5;
-  md5.append(source[PSO_GENERIC]);
-  source_md5[PSO_GENERIC] = md5.get_hex();
+  for (int i = 0; i < PSO_NUM; i++) {
+    if (mtlLibrary[i]) {
+      [mtlLibrary[i] release];
+      mtlLibrary[i] = nil;
+    }
+  }

-  metal_printf("Front-end compilation finished (generic)\n");
-
-  bool result = kernels.load(this, false);
-
-  reserve_local_memory(kernel_features);
-
-  return result;
-}
-
-id<MTLLibrary> MetalDevice::compile(string const &source)
-{
  MTLCompileOptions *options = [[MTLCompileOptions alloc] init];

  options.fastMathEnabled = YES;
  if (@available(macOS 12.0, *)) {
    options.languageVersion = MTLLanguageVersion2_4;
  }
+  else {
+    return false;
+  }

-  NSError *error = NULL;
-  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
+  string metalsrc;
+
+  /* local helper: dump source to disk and return filepath */
+  auto dump_source = [&](int kernel_type) -> string {
+    string &source = source_used_for_compile[kernel_type];
+    string metalsrc = path_cache_get(path_join("kernels",
+                                               string_printf("%s.%s.metal",
+                                                             kernel_type_as_string(kernel_type),
+                                                             util_md5_string(source).c_str())));
+    path_write_text(metalsrc, source);
+    return metalsrc;
+  };
+
+  /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
+   * then compile it into a MTLLibrary */
+  auto fetch_and_compile_source = [&](int kernel_type) {
+    /* Record the source used to compile this library, for hash building later. */
+    string &source = source_used_for_compile[kernel_type];
+
+    switch (kernel_type) {
+      case PSO_GENERIC: {
+        source = get_source(kernel_features);
+        break;
+      }
+      case PSO_SPECIALISED: {
+        /* PSO_SPECIALISED derives from PSO_GENERIC */
+        string &generic_source = source_used_for_compile[PSO_GENERIC];
+        if (generic_source.empty()) {
+          generic_source = get_source(kernel_features);
+        }
+        source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
+        break;
+      }
+      default:
+        assert(0);
+    }
+
+    /* create MTLLibrary (front-end compilation) */
+    mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                      options:options
                                                        error:&error];

-  if (!mtlLibrary) {
-    NSString *err = [error localizedDescription];
-    set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+    bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
+
+    if (!mtlLibrary[kernel_type] || do_source_dump) {
+      string metalsrc = dump_source(kernel_type);
+
+      if (!mtlLibrary[kernel_type]) {
+        NSString *err = [error localizedDescription];
+        set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+
+        return false;
+      }
+    }
+    return true;
+  };
+
+  fetch_and_compile_source(PSO_GENERIC);
+
+  if (use_function_specialisation) {
+    fetch_and_compile_source(PSO_SPECIALISED);
  }

-  [options release];
+  metal_printf("Front-end compilation finished\n");

-  return mtlLibrary;
+  bool result = kernels.load(this, PSO_GENERIC);
+
+  [options release];
+  reserve_local_memory(kernel_features);
+
+  return result;
 }

 void MetalDevice::reserve_local_memory(const uint kernel_features)
@ -619,11 +671,6 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
  return 0;
 }

-const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const
-{
-  return kernels.get_best_pipeline(this, kernel);
-}
-
 void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
 {
  if (strcmp(name, "__data") == 0) {
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@ -54,41 +54,98 @@ enum {
 const char *kernel_type_as_string(int kernel_type);

 struct MetalKernelPipeline {
+  void release()
+  {
+    if (pipeline) {
+      [pipeline release];
+      pipeline = nil;
+      if (@available(macOS 11.0, *)) {
+        for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+          if (intersection_func_table[i]) {
+            [intersection_func_table[i] release];
+            intersection_func_table[i] = nil;
+          }
+        }
+      }
+    }
+    if (function) {
+      [function release];
+      function = nil;
+    }
+    if (@available(macOS 11.0, *)) {
+      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+        if (intersection_func_table[i]) {
+          [intersection_func_table[i] release];
+        }
+      }
+    }
+  }

-  void compile();
-
-  id<MTLLibrary> mtlLibrary = nil;
-  bool scene_specialized;
-  string source_md5;
-
-  bool use_metalrt;
-  bool metalrt_hair;
-  bool metalrt_hair_thick;
-  bool metalrt_pointcloud;
-
-  int threads_per_threadgroup;
-
-  DeviceKernel device_kernel;
  bool loaded = false;
-  id<MTLDevice> mtlDevice = nil;
  id<MTLFunction> function = nil;
  id<MTLComputePipelineState> pipeline = nil;
-  int num_threads_per_block = 0;
-
-  string error_str;

  API_AVAILABLE(macos(11.0))
  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
-  id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
+};
+
+struct MetalKernelLoadDesc {
+  int pso_index = 0;
+  const char *function_name = nullptr;
+  int kernel_index = 0;
+  int threads_per_threadgroup = 0;
+  MTLFunctionConstantValues *constant_values = nullptr;
+  NSArray *linked_functions = nullptr;
+
+  struct IntersectorFunctions {
+    NSArray *defaults;
+    NSArray *shadow;
+    NSArray *local;
+    NSArray *operator[](int index) const
+    {
+      if (index == METALRT_TABLE_DEFAULT)
+        return defaults;
+      if (index == METALRT_TABLE_SHADOW)
+        return shadow;
+      return local;
+    }
+  } intersector_functions = {nullptr};
+};
+
+/* Metal kernel and associate occupancy information. */
+class MetalDeviceKernel {
+ public:
+  ~MetalDeviceKernel();
+
+  bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+
+  void mark_loaded(int pso_index)
+  {
+    pso[pso_index].loaded = true;
+  }
+
+  int get_num_threads_per_block() const
+  {
+    return num_threads_per_block;
+  }
+  const MetalKernelPipeline &get_pso() const;
+
+  double load_duration = 0.0;
+
+ private:
+  MetalKernelPipeline pso[PSO_NUM];
+
+  int num_threads_per_block = 0;
 };

 /* Cache of Metal kernels for each DeviceKernel. */
 class MetalDeviceKernels {
 public:
-  bool load(MetalDevice *device, bool scene_specialized);
-  bool available(const MetalDevice *device, DeviceKernel kernel) const;
-  const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device,
-                                               DeviceKernel kernel) const;
+  bool load(MetalDevice *device, int kernel_type);
+  bool available(DeviceKernel kernel) const;
+  const MetalDeviceKernel &get(DeviceKernel kernel) const;
+
+  MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];

  id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};

--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@ -108,6 +108,9 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
          << work_size;

+  const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
+  const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
+
  id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

  /* Determine size requirement for argument buffer. */
@ -209,8 +212,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  }
  bytes_written = globals_offsets + sizeof(KernelParamsMetal);

-  const MetalKernelPipeline &metal_kernel_pso = metal_device->get_best_pipeline(kernel);
-
  /* Encode ancillaries */
  [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
  [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
@ -283,7 +284,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];

  /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel_pso.num_threads_per_block;
+  const int num_threads_per_block = metal_kernel.get_num_threads_per_block();

  int shared_mem_bytes = 0;

@ -546,8 +547,6 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];

-    [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
-
    /* declare usage of MTLBuffers etc */
    prepare_resources(kernel);
  }
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@ -29,26 +29,10 @@ using namespace metal::raytracing;

 /* Qualifiers */

-#if defined(__KERNEL_METAL_APPLE__)
-
-/* Inline everything for Apple GPUs.
- * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
- * at the cost of longer compile times (~4.5 minutes on M1 Max). */
-
-#  define ccl_device __attribute__((always_inline))
-#  define ccl_device_inline __attribute__((always_inline))
-#  define ccl_device_forceinline __attribute__((always_inline))
-#  define ccl_device_noinline __attribute__((always_inline))
-
-#else
-
-#  define ccl_device
-#  define ccl_device_inline ccl_device
-#  define ccl_device_forceinline ccl_device
-#  define ccl_device_noinline ccl_device __attribute__((noinline))
-
-#endif
-
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline ccl_device __attribute__((noinline))
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device