Cycles: Useful Metal backend debug & profiling functionality

This patch adds some useful debugging & profiling env vars to the Metal backend:

- `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render
- `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose)
- `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programatic .gputrace capture for a specified kernel index

Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled:

```
---------------------------------------------------------------------------------------------------
Kernel name                                 Total threads   Dispatches     Avg. T/D    Time   Time%
---------------------------------------------------------------------------------------------------
integrator_init_from_camera                   657,407,232          161    4,083,274   0.24s   0.51%
integrator_intersect_closest                1,629,288,440          681    2,392,494  15.18s  32.12%
integrator_intersect_shadow                   751,652,291          470    1,599,260   5.80s  12.28%
integrator_shade_background                   304,612,074          263    1,158,220   1.16s   2.45%
integrator_shade_surface                    1,159,764,041          676    1,715,627  20.57s  43.52%
integrator_shade_shadow                       598,885,847          418    1,432,741   1.27s   2.69%
integrator_queued_paths_array               2,969,650,130          805    3,689,006   0.35s   0.74%
integrator_queued_shadow_paths_array          593,936,619          379    1,567,115   0.14s   0.29%
integrator_terminated_paths_array              22,205,417          155      143,260   0.05s   0.10%
integrator_sorted_paths_array               2,517,140,043          676    3,723,579   1.65s   3.50%
integrator_compact_paths_array                648,912,748          155    4,186,533   0.03s   0.07%
integrator_compact_states                      20,872,687          155      134,662   0.14s   0.29%
integrator_terminated_shadow_paths_array      374,100,675          438      854,111   0.16s   0.33%
integrator_compact_shadow_paths_array         503,768,657          438    1,150,156   0.05s   0.10%
integrator_compact_shadow_states               37,664,941          202      186,460   0.23s   0.50%
integrator_reset                               25,165,824            6    4,194,304   0.06s   0.12%
film_convert_combined_half_rgba                 3,110,400            6      518,400   0.00s   0.01%
prefix_sum                                            676          676            1   0.19s   0.40%
---------------------------------------------------------------------------------------------------
                                                                 6,760               47.27s 100.00%
---------------------------------------------------------------------------------------------------
```

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D15044
This commit is contained in:
Michael Jones 2022-06-07 11:08:21 +01:00 committed by Michael Jones
parent 4fc7e1a880
commit 4412e14708
7 changed files with 269 additions and 7 deletions

View File

@ -11,6 +11,7 @@
# include "util/progress.h"
# include "device/metal/bvh.h"
# include "device/metal/util.h"
CCL_NAMESPACE_BEGIN
@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN
{ \
string str = string_printf(__VA_ARGS__); \
progress.set_substatus(str); \
metal_printf("%s\n", str.c_str()); \
}
BVHMetal::BVHMetal(const BVHParams &params_,

View File

@ -31,6 +31,8 @@ class MetalDevice : public Device {
string source[PSO_NUM];
string source_md5[PSO_NUM];
bool capture_enabled = false;
KernelParamsMetal launch_params = {0};
/* MetalRT members ----------------------------------*/

View File

@ -86,6 +86,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
use_metalrt = (atoi(metalrt) != 0);
}
if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
capture_enabled = true;
}
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
@ -394,7 +398,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
}
if (size > 0) {
if (mem.type == MEM_DEVICE_ONLY) {
if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
options = MTLResourceStorageModePrivate;
}

View File

@ -12,8 +12,6 @@
# include "device/metal/util.h"
# include "kernel/device/metal/globals.h"
# define metal_printf VLOG(4) << string_printf
CCL_NAMESPACE_BEGIN
class MetalDevice;
@ -77,6 +75,38 @@ class MetalDeviceQueue : public DeviceQueue {
void close_compute_encoder();
void close_blit_encoder();
bool verbose_tracing = false;
/* Per-kernel profiling (see CYCLES_METAL_PROFILING). */
struct TimingData {
DeviceKernel kernel;
int work_size;
uint64_t timing_id;
};
std::vector<TimingData> command_encoder_labels;
id<MTLSharedEvent> timing_shared_event = nil;
uint64_t timing_shared_event_id;
uint64_t command_buffer_start_timing_id;
struct TimingStats {
double total_time = 0.0;
uint64_t total_work_size = 0;
uint64_t num_dispatches = 0;
};
TimingStats timing_stats[DEVICE_KERNEL_NUM];
double last_completion_time = 0.0;
/* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). */
id<MTLCaptureScope> mtlCaptureScope = nil;
DeviceKernel capture_kernel;
int capture_dispatch = 0;
int capture_dispatch_counter = 0;
bool is_capturing = false;
bool is_capturing_to_disk = false;
bool has_captured_to_disk = false;
};
CCL_NAMESPACE_END

View File

@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
}
wait_semaphore = dispatch_semaphore_create(0);
if (@available(macos 10.14, *)) {
if (getenv("CYCLES_METAL_PROFILING")) {
/* Enable per-kernel timing breakdown (shown at end of render). */
timing_shared_event = [mtlDevice newSharedEvent];
}
if (getenv("CYCLES_METAL_DEBUG")) {
/* Enable very verbose tracing (shows every dispatch). */
verbose_tracing = true;
}
timing_shared_event_id = 1;
}
capture_kernel = DeviceKernel(-1);
if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
/* Enable .gputrace capture for the specified DeviceKernel. */
MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice];
mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
[captureManager setDefaultCaptureScope:mtlCaptureScope];
capture_dispatch = -1;
if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
capture_dispatch = atoi(capture_dispatch_str);
capture_dispatch_counter = 0;
}
capture_kernel = DeviceKernel(atoi(capture_kernel_str));
printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel));
if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
if (@available(macos 10.15, *)) {
if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
captureDescriptor.captureObject = mtlCaptureScope;
captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];
NSError *error;
if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
NSString *err = [error localizedDescription];
printf("Start capture failed: %s\n", [err UTF8String]);
}
else {
printf("Capture started (URL: %s)\n", capture_url);
is_capturing_to_disk = true;
}
}
else {
printf("Capture to file is not supported\n");
}
}
}
}
}
MetalDeviceQueue::~MetalDeviceQueue()
@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue()
[mtlCommandQueue release];
mtlCommandQueue = nil;
}
if (mtlCaptureScope) {
[mtlCaptureScope release];
}
double total_time = 0.0;
/* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
int64_t total_work_size = 0;
int64_t num_dispatches = 0;
for (auto &stat : timing_stats) {
total_time += stat.total_time;
total_work_size += stat.total_work_size;
num_dispatches += stat.num_dispatches;
}
if (num_dispatches) {
printf("\nMetal dispatch stats:\n\n");
auto header = string_printf("%-40s %16s %12s %12s %7s %7s",
"Kernel name",
"Total threads",
"Dispatches",
"Avg. T/D",
"Time",
"Time%");
auto divider = string(header.length(), '-');
printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());
for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
auto &stat = timing_stats[i];
if (stat.num_dispatches > 0) {
printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
device_kernel_as_string(DeviceKernel(i)),
string_human_readable_number(stat.total_work_size).c_str(),
string_human_readable_number(stat.num_dispatches).c_str(),
string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(),
stat.total_time,
stat.total_time * 100.0 / total_time);
}
}
printf("%s\n", divider.c_str());
printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
"",
"",
string_human_readable_number(num_dispatches).c_str(),
"",
total_time,
100.0);
printf("%s\n\n", divider.c_str());
}
}
int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
const int work_size,
DeviceKernelArguments const &args)
{
if (kernel == capture_kernel) {
if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) {
/* Start gputrace capture. */
if (mtlCommandBuffer) {
synchronize();
}
[mtlCaptureScope beginScope];
printf("[mtlCaptureScope beginScope]\n");
is_capturing = true;
}
capture_dispatch_counter += 1;
}
if (metal_device->have_error()) {
return false;
}
@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
if (timing_shared_event) {
command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id});
}
/* Determine size requirement for argument buffer. */
size_t arg_buffer_length = 0;
for (size_t i = 0; i < args.count; i++) {
@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Encode KernelParamsMetal buffers */
[metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
if (verbose_tracing || timing_shared_event || is_capturing) {
/* Add human-readable labels if we're doing any form of debugging / profiling. */
mtlComputeCommandEncoder.label = [[NSString alloc]
initWithFormat:@"Metal queue launch %s, work_size %d",
device_kernel_as_string(kernel),
work_size];
}
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
sizeof(IntegratorStateGPU);
@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
int pointer_index = offset / sizeof(device_ptr);
MetalDevice::MetalMem *mmem = *(
MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset);
if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) {
if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
[metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
offset:0
atIndex:pointer_index];
@ -344,12 +474,53 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
}];
if (verbose_tracing || is_capturing) {
/* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */
synchronize();
/* Show queue counters and dispatch timing. */
if (verbose_tracing) {
if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
printf(
"_____________________________________.____________________.______________.___________"
"______________________________________\n");
}
printf("%-40s| %7d threads |%5.2fms | buckets [",
device_kernel_as_string(kernel),
work_size,
last_completion_time * 1000.0);
std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
for (auto &it : metal_device->metal_mem_map) {
const string c_integrator_queue_counter = "integrator_queue_counter";
if (it.first->name == c_integrator_queue_counter) {
/* Workaround "device_copy_from" being protected. */
struct MyDeviceMemory : device_memory {
void device_copy_from__IntegratorQueueCounter()
{
device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter));
}
};
((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter();
if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
it.first->host_pointer) {
for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++)
printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i]));
}
break;
}
}
printf("]\n");
}
}
return !(metal_device->have_error());
}
bool MetalDeviceQueue::synchronize()
{
if (metal_device->have_error()) {
if (has_captured_to_disk || metal_device->have_error()) {
return false;
}
@ -359,6 +530,28 @@ bool MetalDeviceQueue::synchronize()
close_blit_encoder();
if (mtlCommandBuffer) {
scoped_timer timer;
if (timing_shared_event) {
/* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */
__block double completion_time = 0;
for (uint64_t i = command_buffer_start_timing_id; i < timing_shared_event_id; i++) {
[timing_shared_event notifyListener:shared_event_listener
atValue:i
block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
completion_time = timer.get_time() - completion_time;
last_completion_time = completion_time;
for (auto label : command_encoder_labels) {
if (label.timing_id == value) {
TimingStats &stat = timing_stats[label.kernel];
stat.num_dispatches++;
stat.total_time += completion_time;
stat.total_work_size += label.work_size;
}
}
}];
}
}
uint64_t shared_event_id = this->shared_event_id++;
if (@available(macos 10.14, *)) {
@ -374,6 +567,22 @@ bool MetalDeviceQueue::synchronize()
dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
}
if (is_capturing) {
[mtlCaptureScope endScope];
is_capturing = false;
printf("[mtlCaptureScope endScope]\n");
if (is_capturing_to_disk) {
if (@available(macos 10.15, *)) {
[[MTLCaptureManager sharedCaptureManager] stopCapture];
has_captured_to_disk = true;
is_capturing_to_disk = false;
is_capturing = false;
printf("Capture stopped\n");
}
}
}
[mtlCommandBuffer release];
for (const CopyBack &mmem : copy_back_mem) {
@ -385,6 +594,7 @@ bool MetalDeviceQueue::synchronize()
metal_device->flush_delayed_free_list();
mtlCommandBuffer = nil;
command_encoder_labels.clear();
}
return !(metal_device->have_error());
@ -530,6 +740,13 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
{
bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
if (timing_shared_event) {
/* Close the current encoder to ensure we're able to capture per-encoder timing data. */
if (mtlComputeEncoder) {
close_compute_encoder();
}
}
if (@available(macos 10.14, *)) {
if (mtlComputeEncoder) {
if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent :
@ -575,6 +792,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
if (!mtlCommandBuffer) {
mtlCommandBuffer = [mtlCommandQueue commandBuffer];
[mtlCommandBuffer retain];
command_buffer_start_timing_id = timing_shared_event_id;
}
mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
@ -585,6 +803,10 @@ void MetalDeviceQueue::close_compute_encoder()
{
[mtlComputeEncoder endEncoding];
mtlComputeEncoder = nil;
if (timing_shared_event) {
[mtlCommandBuffer encodeSignalEvent:timing_shared_event value:timing_shared_event_id++];
}
}
void MetalDeviceQueue::close_blit_encoder()

View File

@ -14,6 +14,8 @@
# include "util/thread.h"
# define metal_printf VLOG(4) << string_printf
CCL_NAMESPACE_BEGIN
enum MetalGPUVendor {

View File

@ -241,7 +241,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel_postfix
#ifdef __KERNEL_METAL__
#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
constant int __dummy_constant [[function_constant(0)]];
#endif
@ -256,7 +256,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
#ifdef __KERNEL_METAL__
#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
KernelGlobals kg = NULL;
/* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
* Dummy offset should not affect result, but somehow fixes bug! */