Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives

This patch is the same as D14763, but with a fix for unit test failures caused by ShaderCache fetch logic not working in the non-MetalRT case:

```
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index ad268ae7057..6aa1a56056e 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -203,9 +203,12 @@ bool kernel_has_intersection(DeviceKernel device_kernel)

   /* metalrt options */
   request.pipeline->use_metalrt = device->use_metalrt;
-  request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
-  request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
+  request.pipeline->metalrt_hair = device->use_metalrt &&
+                                   (device->kernel_features & KERNEL_FEATURE_HAIR);
+  request.pipeline->metalrt_hair_thick = device->use_metalrt &&
+                                         (device->kernel_features & KERNEL_FEATURE_HAIR_THICK);
+  request.pipeline->metalrt_pointcloud = device->use_metalrt &&
+                                         (device->kernel_features & KERNEL_FEATURE_POINTCLOUD);

   {
     thread_scoped_lock lock(cache_mutex);
@@ -225,9 +228,9 @@ bool kernel_has_intersection(DeviceKernel device_kernel)

   /* metalrt options */
   bool use_metalrt = device->use_metalrt;
-  bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
-  bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
+  bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR);
+  bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK);
+  bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD);

   MetalKernelPipeline *best_pipeline = nullptr;
   for (auto &pipeline : collection) {

```

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14923
This commit is contained in:
Michael Jones (Apple) 2022-05-11 14:52:49 +01:00
parent 59cd616534
commit 007184bcf2
6 changed files with 600 additions and 591 deletions

View File

@ -28,7 +28,8 @@ class MetalDevice : public Device {
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
nil; /* encoder used for fetching device pointers from MTLBuffers */
string source_used_for_compile[PSO_NUM];
string source[PSO_NUM];
string source_md5[PSO_NUM];
KernelParamsMetal launch_params = {0};
@ -72,7 +73,6 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
MetalDeviceKernels kernels;
bool use_metalrt = false;
bool use_function_specialisation = false;
@ -110,6 +110,8 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
id<MTLLibrary> compile(string const &source);
/* ------------------------------------------------------------------ */
/* low-level memory management */

View File

@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
NSError *error = NULL;
source[PSO_GENERIC] = get_source(kernel_features);
mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
for (int i = 0; i < PSO_NUM; i++) {
if (mtlLibrary[i]) {
[mtlLibrary[i] release];
mtlLibrary[i] = nil;
}
}
MD5Hash md5;
md5.append(source[PSO_GENERIC]);
source_md5[PSO_GENERIC] = md5.get_hex();
metal_printf("Front-end compilation finished (generic)\n");
bool result = MetalDeviceKernels::load(this, false);
reserve_local_memory(kernel_features);
return result;
}
id<MTLLibrary> MetalDevice::compile(string const &source)
{
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
if (@available(macOS 12.0, *)) {
options.languageVersion = MTLLanguageVersion2_4;
}
else {
return false;
}
string metalsrc;
/* local helper: dump source to disk and return filepath */
auto dump_source = [&](int kernel_type) -> string {
string &source = source_used_for_compile[kernel_type];
string metalsrc = path_cache_get(path_join("kernels",
string_printf("%s.%s.metal",
kernel_type_as_string(kernel_type),
util_md5_string(source).c_str())));
path_write_text(metalsrc, source);
return metalsrc;
};
/* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
* then compile it into a MTLLibrary */
auto fetch_and_compile_source = [&](int kernel_type) {
/* Record the source used to compile this library, for hash building later. */
string &source = source_used_for_compile[kernel_type];
switch (kernel_type) {
case PSO_GENERIC: {
source = get_source(kernel_features);
break;
}
case PSO_SPECIALISED: {
/* PSO_SPECIALISED derives from PSO_GENERIC */
string &generic_source = source_used_for_compile[PSO_GENERIC];
if (generic_source.empty()) {
generic_source = get_source(kernel_features);
}
source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
break;
}
default:
assert(0);
}
/* create MTLLibrary (front-end compilation) */
mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
NSError *error = NULL;
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
options:options
error:&error];
bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
if (!mtlLibrary[kernel_type] || do_source_dump) {
string metalsrc = dump_source(kernel_type);
if (!mtlLibrary[kernel_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
return false;
}
}
return true;
};
fetch_and_compile_source(PSO_GENERIC);
if (use_function_specialisation) {
fetch_and_compile_source(PSO_SPECIALISED);
if (!mtlLibrary) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
metal_printf("Front-end compilation finished\n");
bool result = kernels.load(this, PSO_GENERIC);
[options release];
reserve_local_memory(kernel_features);
return result;
return mtlLibrary;
}
void MetalDevice::reserve_local_memory(const uint kernel_features)

View File

@ -54,103 +54,41 @@ enum {
const char *kernel_type_as_string(int kernel_type);
struct MetalKernelPipeline {
void release()
{
if (pipeline) {
[pipeline release];
pipeline = nil;
if (@available(macOS 11.0, *)) {
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
if (intersection_func_table[i]) {
[intersection_func_table[i] release];
intersection_func_table[i] = nil;
}
}
}
}
if (function) {
[function release];
function = nil;
}
if (@available(macOS 11.0, *)) {
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
if (intersection_func_table[i]) {
[intersection_func_table[i] release];
}
}
}
}
void compile();
id<MTLLibrary> mtlLibrary = nil;
bool scene_specialized;
string source_md5;
bool use_metalrt;
bool metalrt_hair;
bool metalrt_hair_thick;
bool metalrt_pointcloud;
int threads_per_threadgroup;
DeviceKernel device_kernel;
bool loaded = false;
id<MTLDevice> mtlDevice = nil;
id<MTLFunction> function = nil;
id<MTLComputePipelineState> pipeline = nil;
int num_threads_per_block = 0;
string error_str;
API_AVAILABLE(macos(11.0))
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
};
struct MetalKernelLoadDesc {
int pso_index = 0;
const char *function_name = nullptr;
int kernel_index = 0;
int threads_per_threadgroup = 0;
MTLFunctionConstantValues *constant_values = nullptr;
NSArray *linked_functions = nullptr;
struct IntersectorFunctions {
NSArray *defaults;
NSArray *shadow;
NSArray *local;
NSArray *operator[](int index) const
{
if (index == METALRT_TABLE_DEFAULT)
return defaults;
if (index == METALRT_TABLE_SHADOW)
return shadow;
return local;
}
} intersector_functions = {nullptr};
};
/* Metal kernel and associate occupancy information. */
class MetalDeviceKernel {
public:
~MetalDeviceKernel();
bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
void mark_loaded(int pso_index)
{
pso[pso_index].loaded = true;
}
int get_num_threads_per_block() const
{
return num_threads_per_block;
}
const MetalKernelPipeline &get_pso() const;
double load_duration = 0.0;
private:
MetalKernelPipeline pso[PSO_NUM];
int num_threads_per_block = 0;
id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
};
/* Cache of Metal kernels for each DeviceKernel. */
class MetalDeviceKernels {
public:
bool load(MetalDevice *device, int kernel_type);
bool available(DeviceKernel kernel) const;
const MetalDeviceKernel &get(DeviceKernel kernel) const;
namespace MetalDeviceKernels {
MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
bool load(MetalDevice *device, bool scene_specialized);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
string loaded_md5[PSO_NUM];
};
} /* namespace MetalDeviceKernels */
CCL_NAMESPACE_END

File diff suppressed because it is too large Load Diff

View File

@ -108,9 +108,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
<< work_size;
const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
/* Determine size requirement for argument buffer. */
@ -212,6 +209,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(metal_device,
kernel);
if (!metal_kernel_pso) {
metal_device->set_error(
string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
return false;
}
/* Encode ancillaries */
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
@ -228,14 +233,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
if (metal_kernel_pso.intersection_func_table[table]) {
[metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer
offset:globals_offsets
atIndex:1];
if (metal_kernel_pso->intersection_func_table[table]) {
[metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
offset:globals_offsets
atIndex:1];
[metal_device->mtlAncillaryArgEncoder
setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table]
setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
atIndex:3 + table];
[mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table]
[mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
usage:MTLResourceUsageRead];
}
else {
@ -281,10 +286,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
}
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];
/* Compute kernel launch parameters. */
const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;
int shared_mem_bytes = 0;
@ -314,7 +319,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
threadsPerThreadgroup:size_threads_per_threadgroup];
[mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
NSString *kernel_name = metal_kernel_pso.function.label;
NSString *kernel_name = metal_kernel_pso->function.label;
/* Enhanced command buffer errors are only available in 11.0+ */
if (@available(macos 11.0, *)) {
@ -547,6 +552,8 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
MTLDispatchTypeSerial];
[mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
/* declare usage of MTLBuffers etc */
prepare_resources(kernel);
}

View File

@ -29,10 +29,26 @@ using namespace metal::raytracing;
/* Qualifiers */
#define ccl_device
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
#define ccl_device_noinline ccl_device __attribute__((noinline))
#if defined(__KERNEL_METAL_APPLE__)
/* Inline everything for Apple GPUs.
* This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
* at the cost of longer compile times (~4.5 minutes on M1 Max). */
# define ccl_device __attribute__((always_inline))
# define ccl_device_inline __attribute__((always_inline))
# define ccl_device_forceinline __attribute__((always_inline))
# define ccl_device_noinline __attribute__((always_inline))
#else
# define ccl_device
# define ccl_device_inline ccl_device
# define ccl_device_forceinline ccl_device
# define ccl_device_noinline ccl_device __attribute__((noinline))
#endif
#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_global device