Cycles: Improved render start/stop responsiveness on Metal

All kernel specialisation is now performed in the background regardless of kernel type, meaning that the first render becomes visible a few seconds sooner. The only exception is during benchmark warm-up, in which case we wait for all kernels to be cached. When stopping a render, we call a new `cancel()` method on the device, which causes any outstanding compilation work to be cancelled, and we destroy the device in a detached thread so that any stale queued compilations can be safely purged without blocking the UI for longer than necessary.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16371
Michael Jones 2023-01-04 14:23:33 +00:00 committed by Michael Jones
parent fbc2c4c331
commit 77c3e67d3d
8 changed files with 390 additions and 194 deletions
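As a rough illustration of the pattern this change adopts, here is a minimal standalone C++ sketch (not Blender's actual classes; `Device`, `g_active_devices`, `compile_async` and the timings are illustrative assumptions): each device registers itself in a global ID map, background compilation jobs re-check that map before doing work or publishing results, `cancel()` simply erases the ID, and the caller polls a readiness flag instead of blocking on compilation.

#include <atomic>
#include <chrono>
#include <cstdio>
#include <map>
#include <mutex>
#include <thread>

class Device;

static std::mutex g_devices_mutex;
static std::map<int, Device *> g_active_devices; /* hypothetical registry of live devices */

class Device {
 public:
  Device()
  {
    std::lock_guard<std::mutex> lock(g_devices_mutex);
    static int counter = 1;
    id_ = counter++;
    g_active_devices[id_] = this;
  }

  ~Device()
  {
    cancel();
  }

  /* Remove this device from the registry; queued compilations for it become no-ops. */
  void cancel()
  {
    std::lock_guard<std::mutex> lock(g_devices_mutex);
    g_active_devices.erase(id_);
  }

  /* Kick off a "compilation" in the background, keyed by device ID rather than `this`. */
  void compile_async()
  {
    const int id = id_;
    std::thread([id] {
      {
        std::lock_guard<std::mutex> lock(g_devices_mutex);
        if (g_active_devices.find(id) == g_active_devices.end()) {
          std::printf("Device %d cancelled - skipping compile\n", id);
          return;
        }
      }
      /* The long-running work happens without holding the lock, so the device can be
       * destroyed (and its registry entry erased) while this runs. */
      std::this_thread::sleep_for(std::chrono::milliseconds(50));

      std::lock_guard<std::mutex> lock(g_devices_mutex);
      auto it = g_active_devices.find(id);
      if (it != g_active_devices.end()) {
        it->second->ready_ = true; /* publish the result only if the device is still alive */
      }
    }).detach();
  }

  bool is_ready() const
  {
    return ready_;
  }

 private:
  int id_ = 0;
  std::atomic<bool> ready_{false};
};

int main()
{
  Device device;
  device.compile_async();

  /* The caller polls readiness between updates instead of blocking inside kernel loading. */
  while (!device.is_ready()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
  std::printf("kernels ready\n");
  return 0;
}

Keying the async work by an integer ID rather than a pointer is what lets the device be torn down while compilations are still queued: a stale request simply finds no entry in the map and returns without touching freed memory.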

View File

@ -167,6 +167,17 @@ class Device {
return true;
}
/* Request cancellation of any long-running work. */
virtual void cancel()
{
}
/* Return true if device is ready for rendering, or report status if not. */
virtual bool is_ready(string &status) const
{
return true;
}
/* GPU device only functions.
* These may not be used on CPU or multi-devices. */

View File

@ -76,7 +76,20 @@ class MetalDevice : public Device {
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
std::atomic_bool async_compile_and_load = false;
int device_id = 0;
static thread_mutex existing_devices_mutex;
static std::map<int, MetalDevice *> active_device_ids;
static bool is_device_cancelled(int device_id);
static MetalDevice *get_device_by_ID(int device_id,
thread_scoped_lock &existing_devices_mutex_lock);
virtual bool is_ready(string &status) const override;
virtual void cancel() override;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@ -92,14 +105,12 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void erase_allocation(device_memory &mem);
@ -112,7 +123,7 @@ class MetalDevice : public Device {
virtual void optimize_for_scene(Scene *scene) override;
bool compile_and_load(MetalPipelineType pso_type);
static void compile_and_load(int device_id, MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */

View File

@ -13,10 +13,32 @@
# include "util/path.h"
# include "util/time.h"
# include <crt_externs.h>
CCL_NAMESPACE_BEGIN
class MetalDevice;
thread_mutex MetalDevice::existing_devices_mutex;
std::map<int, MetalDevice *> MetalDevice::active_device_ids;
/* Thread-safe device access for async work. Calling code must pass an appropriately scoped lock
* to existing_devices_mutex to safeguard against destruction of the returned instance. */
MetalDevice *MetalDevice::get_device_by_ID(int ID, thread_scoped_lock &existing_devices_mutex_lock)
{
auto it = active_device_ids.find(ID);
if (it != active_device_ids.end()) {
return it->second;
}
return nullptr;
}
bool MetalDevice::is_device_cancelled(int ID)
{
thread_scoped_lock lock(existing_devices_mutex);
return get_device_by_ID(ID, lock) == nullptr;
}
BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
{
return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
@ -40,6 +62,15 @@ void MetalDevice::set_error(const string &error)
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
{
/* Assign an ID for this device which we can use to query whether async shader compilation
* requests are still relevant. */
thread_scoped_lock lock(existing_devices_mutex);
static int existing_devices_counter = 1;
device_id = existing_devices_counter++;
active_device_ids[device_id] = this;
}
mtlDevId = info.num;
/* select chosen device */
@ -57,7 +88,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
if (@available(macos 11.0, *)) {
if ([mtlDevice hasUnifiedMemory]) {
default_storage_mode = MTLResourceStorageModeShared;
init_host_memory();
}
}
@ -181,6 +211,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
MetalDevice::~MetalDevice()
{
/* Cancel any async shader compilations that are in flight. */
cancel();
/* This lock safeguards against destruction during use (see other uses of
* existing_devices_mutex). */
thread_scoped_lock lock(existing_devices_mutex);
for (auto &tex : texture_slot_map) {
if (tex) {
[tex release];
@ -326,21 +363,66 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
bool result = compile_and_load(PSO_GENERIC);
/* Only request generic kernels if they aren't cached in memory. */
if (make_source_and_check_if_compile_needed(PSO_GENERIC)) {
/* If needed, load them asynchronously in order to responsively message progress to the user. */
int this_device_id = this->device_id;
auto compile_kernels_fn = ^() {
compile_and_load(this_device_id, PSO_GENERIC);
};
reserve_local_memory(kernel_features);
return result;
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
compile_kernels_fn);
}
return true;
}
bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
bool MetalDevice::make_source_and_check_if_compile_needed(MetalPipelineType pso_type)
{
make_source(pso_type, kernel_features);
if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
/* We already have a full set of matching pipelines which are cached or queued. */
metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
return true;
if (this->source[pso_type].empty()) {
make_source(pso_type, kernel_features);
}
return MetalDeviceKernels::should_load_kernels(this, pso_type);
}
void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
{
/* Thread-safe front-end compilation. Typically the MSL->AIR compilation can take a few seconds,
* so we avoid blocking device teardown if the user cancels a render immediately.
*/
id<MTLDevice> mtlDevice;
string source;
MetalGPUVendor device_vendor;
/* Safely gather any state required for the MSL->AIR compilation. */
{
thread_scoped_lock lock(existing_devices_mutex);
/* Check whether the device still exists. */
MetalDevice *instance = get_device_by_ID(device_id, lock);
if (!instance) {
metal_printf("Ignoring %s compilation request - device no longer exists\n",
kernel_type_as_string(pso_type));
return;
}
if (!instance->make_source_and_check_if_compile_needed(pso_type)) {
/* We already have a full set of matching pipelines which are cached or queued. Return early
* to avoid redundant MTLLibrary compilation. */
metal_printf("Ignoreing %s compilation request - kernels already requested\n",
kernel_type_as_string(pso_type));
return;
}
mtlDevice = instance->mtlDevice;
device_vendor = instance->device_vendor;
source = instance->source[pso_type];
}
/* Perform the actual compilation using our cached context. The MetalDevice can safely be
* destroyed in the meantime. */
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
@ -359,20 +441,15 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
source[pso_type]);
source);
}
const double starttime = time_dt();
NSError *error = NULL;
mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
options:options
error:&error];
if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
options:options
error:&error];
metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
@ -380,17 +457,21 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
[options release];
return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
{
/* METAL_WIP - implement this */
}
void MetalDevice::init_host_memory()
{
/* METAL_WIP - implement this */
/* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
* exists). */
{
thread_scoped_lock lock(existing_devices_mutex);
if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
if (mtlLibrary) {
instance->mtlLibrary[pso_type] = mtlLibrary;
MetalDeviceKernels::load(instance, pso_type);
}
else {
NSString *err = [error localizedDescription];
instance->set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
}
}
}
void MetalDevice::load_texture_info()
@ -700,55 +781,74 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
void MetalDevice::cancel()
{
/* Remove this device's ID from the list of active devices. Any pending compilation requests
* originating from this session will be cancelled. */
thread_scoped_lock lock(existing_devices_mutex);
if (device_id) {
active_device_ids.erase(device_id);
device_id = 0;
}
}
bool MetalDevice::is_ready(string &status) const
{
int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
if (num_loaded < DEVICE_KERNEL_NUM) {
status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
num_loaded,
DEVICE_KERNEL_NUM);
return false;
}
metal_printf("MetalDevice::is_ready(...) --> true\n");
return true;
}
void MetalDevice::optimize_for_scene(Scene *scene)
{
MetalPipelineType specialization_level = kernel_specialization_level;
if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
return;
}
/* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
* synchronously. */
compile_and_load(PSO_SPECIALIZED_INTERSECT);
if (specialization_level < PSO_SPECIALIZED_SHADE) {
return;
}
if (!scene->params.background) {
/* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
* build. */
return;
/* In live viewport, don't specialize beyond intersection kernels for responsiveness. */
specialization_level = (MetalPipelineType)min(specialization_level, PSO_SPECIALIZED_INTERSECT);
}
/* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
* only if there isn't an existing load in flight.
*/
auto specialize_shade_fn = ^() {
compile_and_load(PSO_SPECIALIZED_SHADE);
async_compile_and_load = false;
/* For responsive rendering, specialize the kernels in the background, and only if there isn't an
* existing "optimize_for_scene" request in flight. */
int this_device_id = this->device_id;
auto specialize_kernels_fn = ^() {
for (int level = 1; level <= int(specialization_level); level++) {
compile_and_load(this_device_id, MetalPipelineType(level));
}
};
bool async_specialize_shade = true;
/* In normal use, we always compile the specialized kernels in the background. */
bool specialize_in_background = true;
/* Block if per-kernel profiling is enabled (ensures a steady rendering rate). */
if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
async_specialize_shade = false;
specialize_in_background = false;
}
if (async_specialize_shade) {
if (!async_compile_and_load) {
async_compile_and_load = true;
/* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
for (int i = 0; i < *_NSGetArgc(); i++) {
if (!strcmp((*_NSGetArgv())[i], "--warm-up")) {
specialize_in_background = false;
}
}
if (specialize_in_background) {
if (!MetalDeviceKernels::any_specialization_happening_now()) {
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
specialize_shade_fn);
specialize_kernels_fn);
}
else {
metal_printf(
"Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
metal_printf("\"optimize_for_scene\" request already in flight - dropping request\n");
}
}
else {
specialize_shade_fn();
specialize_kernels_fn();
}
}

View File

@ -64,6 +64,8 @@ struct MetalKernelPipeline {
void compile();
int originating_device_id;
id<MTLLibrary> mtlLibrary = nil;
MetalPipelineType pso_type;
string source_md5;
@ -94,7 +96,9 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
bool any_specialization_happening_now();
int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_type);
bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);

View File

@ -86,23 +86,17 @@ struct ShaderCache {
void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
bool should_load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
MetalDevice const *device,
MetalPipelineType pso_type);
void wait_for_all();
private:
friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
void compile_thread_func(int thread_index);
using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
struct PipelineRequest {
MetalKernelPipeline *pipeline = nullptr;
std::function<void(MetalKernelPipeline *)> completionHandler;
};
struct OccupancyTuningParameters {
int threads_per_threadgroup = 0;
int num_threads_per_block = 0;
@ -113,13 +107,15 @@ struct ShaderCache {
PipelineCollection pipelines[DEVICE_KERNEL_NUM];
id<MTLDevice> mtlDevice;
bool running = false;
static bool running;
std::condition_variable cond_var;
std::deque<PipelineRequest> request_queue;
std::deque<MetalKernelPipeline *> request_queue;
std::vector<std::thread> compile_threads;
std::atomic_int incomplete_requests = 0;
std::atomic_int incomplete_specialization_requests = 0;
};
bool ShaderCache::running = true;
std::mutex g_shaderCacheMutex;
std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
@ -137,11 +133,25 @@ ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
ShaderCache::~ShaderCache()
{
metal_printf("ShaderCache shutting down with incomplete_requests = %d\n",
int(incomplete_requests));
running = false;
cond_var.notify_all();
int num_incomplete = int(incomplete_requests);
if (num_incomplete) {
/* Shutting down the app with incomplete shader compilation requests. Give 1 second's grace for
* clean shutdown. */
metal_printf("ShaderCache busy (incomplete_requests = %d)...\n", num_incomplete);
std::this_thread::sleep_for(std::chrono::seconds(1));
num_incomplete = int(incomplete_requests);
}
if (num_incomplete) {
metal_printf("ShaderCache still busy (incomplete_requests = %d). Terminating...\n",
num_incomplete);
std::terminate();
}
metal_printf("ShaderCache idle. Shutting down.\n");
for (auto &thread : compile_threads) {
thread.join();
}
@ -156,35 +166,69 @@ void ShaderCache::wait_for_all()
void ShaderCache::compile_thread_func(int thread_index)
{
while (1) {
while (running) {
/* wait for / acquire next request */
PipelineRequest request;
MetalKernelPipeline *pipeline;
{
thread_scoped_lock lock(cache_mutex);
cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
if (!running) {
break;
if (!running || request_queue.empty()) {
continue;
}
if (!request_queue.empty()) {
request = request_queue.front();
request_queue.pop_front();
}
pipeline = request_queue.front();
request_queue.pop_front();
}
/* service request */
if (request.pipeline) {
request.pipeline->compile();
incomplete_requests--;
/* Service the request. */
DeviceKernel device_kernel = pipeline->device_kernel;
MetalPipelineType pso_type = pipeline->pso_type;
if (MetalDevice::is_device_cancelled(pipeline->originating_device_id)) {
/* The originating MetalDevice is no longer active, so this request is obsolete. */
metal_printf("Cancelling compilation of %s (%s)\n",
device_kernel_as_string(device_kernel),
kernel_type_as_string(pso_type));
}
else {
/* Do the actual compilation. */
pipeline->compile();
thread_scoped_lock lock(cache_mutex);
auto &collection = pipelines[device_kernel];
/* Cache up to 3 kernel variants with the same pso_type in memory, purging oldest first. */
int max_entries_of_same_pso_type = 3;
for (int i = (int)collection.size() - 1; i >= 0; i--) {
if (collection[i]->pso_type == pso_type) {
max_entries_of_same_pso_type -= 1;
if (max_entries_of_same_pso_type == 0) {
metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
kernel_type_as_string(pso_type),
device_kernel_as_string(device_kernel));
collection.erase(collection.begin() + i);
break;
}
}
}
collection.push_back(unique_ptr<MetalKernelPipeline>(pipeline));
}
incomplete_requests--;
if (pso_type != PSO_GENERIC) {
incomplete_specialization_requests--;
}
}
}
bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
MetalDevice *device,
MetalDevice const *device,
MetalPipelineType pso_type)
{
if (!running) {
return false;
}
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
/* Skip megakernel. */
return false;
@ -240,7 +284,6 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
/* create compiler threads on first run */
thread_scoped_lock lock(cache_mutex);
if (compile_threads.empty()) {
running = true;
for (int i = 0; i < max_mtlcompiler_threads; i++) {
compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
}
@ -252,53 +295,39 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
}
incomplete_requests++;
if (pso_type != PSO_GENERIC) {
incomplete_specialization_requests++;
}
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
memcpy(&request.pipeline->kernel_data_,
&device->launch_params.data,
sizeof(request.pipeline->kernel_data_));
request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
request.pipeline->source_md5 = device->source_md5[pso_type];
request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
MetalKernelPipeline *pipeline = new MetalKernelPipeline;
/* Keep track of the originating device's ID so that we can cancel requests if the device ceases
* to be active. */
pipeline->originating_device_id = device->device_id;
memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
pipeline->pso_type = pso_type;
pipeline->mtlDevice = mtlDevice;
pipeline->source_md5 = device->source_md5[pso_type];
pipeline->mtlLibrary = device->mtlLibrary[pso_type];
pipeline->device_kernel = device_kernel;
pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
request.pipeline->threads_per_threadgroup =
pipeline->threads_per_threadgroup =
occupancy_tuning[device_kernel].threads_per_threadgroup;
request.pipeline->num_threads_per_block =
pipeline->num_threads_per_block =
occupancy_tuning[device_kernel].num_threads_per_block;
}
/* metalrt options */
request.pipeline->use_metalrt = device->use_metalrt;
request.pipeline->metalrt_features = device->use_metalrt ?
(device->kernel_features & METALRT_FEATURE_MASK) :
0;
pipeline->use_metalrt = device->use_metalrt;
pipeline->metalrt_features = device->use_metalrt ?
(device->kernel_features & METALRT_FEATURE_MASK) :
0;
{
thread_scoped_lock lock(cache_mutex);
auto &collection = pipelines[device_kernel];
/* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
int max_entries_of_same_pso_type = 3;
for (int i = (int)collection.size() - 1; i >= 0; i--) {
if (collection[i]->pso_type == pso_type) {
max_entries_of_same_pso_type -= 1;
if (max_entries_of_same_pso_type == 0) {
metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
kernel_type_as_string(pso_type),
device_kernel_as_string(device_kernel));
collection.erase(collection.begin() + i);
break;
}
}
}
collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
request_queue.push_back(pipeline);
}
cond_var.notify_one();
}
@ -664,51 +693,61 @@ void MetalKernelPipeline::compile()
double starttime = time_dt();
MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
id<MTLComputePipelineState> computePipelineState,
MTLComputePipelineReflection *reflection,
NSError *error) {
bool recreate_archive = false;
if (computePipelineState == nil && archive) {
/* Block on load to ensure we continue with a valid kernel function */
if (creating_new_archive) {
starttime = time_dt();
NSError *error;
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
error:&error]) {
NSString *errStr = [error localizedDescription];
metal_printf(
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
"(error: %s)\n",
device_kernel_as_string((DeviceKernel)device_kernel),
errStr ? [errStr UTF8String] : "nil");
computePipelineState = [mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:MTLPipelineOptionNone
reflection:nullptr
error:&error];
recreate_archive = true;
metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
}
}
double duration = time_dt() - starttime;
pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:pipelineOptions
reflection:nullptr
error:&error];
if (computePipelineState == nil) {
NSString *errStr = [error localizedDescription];
error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
device_kernel_as_string((DeviceKernel)device_kernel));
error_str += (errStr ? [errStr UTF8String] : "nil");
metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
kernel_type_as_string(pso_type),
device_kernel,
device_kernel_as_string((DeviceKernel)device_kernel),
duration);
return;
}
bool recreate_archive = false;
if (pipeline == nil && archive) {
NSString *errStr = [error localizedDescription];
metal_printf(
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
"(error: %s)\n",
device_kernel_as_string((DeviceKernel)device_kernel),
errStr ? [errStr UTF8String] : "nil");
pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:MTLPipelineOptionNone
reflection:nullptr
error:&error];
recreate_archive = true;
}
if (!num_threads_per_block) {
num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
computePipelineState.threadExecutionWidth);
num_threads_per_block = std::max(num_threads_per_block,
(int)computePipelineState.threadExecutionWidth);
}
double duration = time_dt() - starttime;
this->pipeline = computePipelineState;
if (pipeline == nil) {
NSString *errStr = [error localizedDescription];
error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
device_kernel_as_string((DeviceKernel)device_kernel));
error_str += (errStr ? [errStr UTF8String] : "nil");
metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
kernel_type_as_string(pso_type),
device_kernel,
device_kernel_as_string((DeviceKernel)device_kernel),
duration);
return;
}
if (@available(macOS 11.0, *)) {
if (!num_threads_per_block) {
num_threads_per_block = round_down(pipeline.maxTotalThreadsPerThreadgroup,
pipeline.threadExecutionWidth);
num_threads_per_block = std::max(num_threads_per_block,
(int)pipeline.threadExecutionWidth);
}
if (@available(macOS 11.0, *)) {
if (ShaderCache::running) {
if (creating_new_archive || recreate_archive) {
if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
error:&error]) {
@ -720,24 +759,7 @@ void MetalKernelPipeline::compile()
}
}
}
};
/* Block on load to ensure we continue with a valid kernel function */
if (creating_new_archive) {
starttime = time_dt();
NSError *error;
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
error:&error]) {
NSString *errStr = [error localizedDescription];
metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
}
}
id<MTLComputePipelineState> pipeline = [mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:pipelineOptions
reflection:nullptr
error:&error];
completionHandler(pipeline, nullptr, error);
this->loaded = true;
[computePipelineStateDescriptor release];
@ -763,8 +785,6 @@ void MetalKernelPipeline::compile()
}
}
double duration = time_dt() - starttime;
if (!use_binary_archive) {
metal_printf("%16s | %2d | %-55s | %7.2fs\n",
kernel_type_as_string(pso_type),
@ -791,24 +811,46 @@ bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
shader_cache->wait_for_all();
metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
if (getenv("CYCLES_METAL_PROFILING")) {
shader_cache->wait_for_all();
metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
}
return true;
}
bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
bool MetalDeviceKernels::any_specialization_happening_now()
{
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
/* Return true if any ShaderCaches have ongoing specialization requests (typically there will be
* only 1). */
thread_scoped_lock lock(g_shaderCacheMutex);
for (auto &it : g_shaderCache) {
if (it.second->incomplete_specialization_requests > 0) {
return true;
}
}
return false;
}
int MetalDeviceKernels::get_loaded_kernel_count(MetalDevice const *device,
MetalPipelineType pso_type)
{
auto shader_cache = get_shader_cache(device->mtlDevice);
int loaded_count = DEVICE_KERNEL_NUM;
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
loaded_count -= 1;
}
}
return loaded_count;
}
bool MetalDeviceKernels::should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type)
{
return get_loaded_kernel_count(device, pso_type) != DEVICE_KERNEL_NUM;
}
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
DeviceKernel kernel)
{

View File

@ -702,6 +702,10 @@ bool MetalDeviceQueue::synchronize()
void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
if (metal_device_->have_error()) {
return;
}
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
@ -729,6 +733,10 @@ void MetalDeviceQueue::zero_to_device(device_memory &mem)
void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
if (metal_device_->have_error()) {
return;
}
if (mem.memory_size() == 0) {
return;
}
@ -771,6 +779,10 @@ void MetalDeviceQueue::copy_to_device(device_memory &mem)
void MetalDeviceQueue::copy_from_device(device_memory &mem)
{
if (metal_device_->have_error()) {
return;
}
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {

View File

@ -390,6 +390,9 @@ void PathTrace::path_trace(RenderWork &render_work)
const int num_samples = render_work.path_trace.num_samples;
PathTraceWork *path_trace_work = path_trace_works_[i].get();
if (path_trace_work->get_device()->have_error()) {
return;
}
PathTraceWork::RenderStatistics statistics;
path_trace_work->render_samples(statistics,

View File

@ -113,6 +113,9 @@ void Session::start()
void Session::cancel(bool quick)
{
/* Cancel any long running device operations (e.g. shader compilations). */
device->cancel();
/* Check if session thread is rendering. */
const bool rendering = is_session_thread_rendering();
@ -401,6 +404,16 @@ RenderWork Session::run_update_for_next_iteration()
path_trace_->load_kernels();
path_trace_->alloc_work_memory();
/* Wait for device to be ready (e.g. finish any background compilations). */
string device_status;
while (!device->is_ready(device_status)) {
progress.set_status(device_status);
if (progress.get_cancel()) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
}
progress.add_skip_time(update_timer, params.background);
}