Cycles: Add optional per-kernel performance statistics
When verbose level 4 is enabled, Blender prints kernel performance data for Cycles on GPU backends (except Metal, which doesn't use the debug_enqueue_* methods) for groups of kernels. These changes introduce a new CYCLES_DEBUG_PER_KERNEL_PERFORMANCE environment variable that allows getting timings for each kernel separately, rather than grouped with others. This is done by adding explicit synchronization after each kernel execution. Differential Revision: https://developer.blender.org/D15971
This commit is contained in:
parent
b145cc9d36
commit
2ead05d738
|
@ -79,7 +79,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
|
|||
return false;
|
||||
}
|
||||
|
||||
debug_enqueue(kernel, work_size);
|
||||
debug_enqueue_begin(kernel, work_size);
|
||||
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
|
||||
|
@ -121,6 +121,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
|
|||
0),
|
||||
"enqueue");
|
||||
|
||||
debug_enqueue_end();
|
||||
|
||||
return !(cuda_device_->have_error());
|
||||
}
|
||||
|
||||
|
|
|
@ -79,7 +79,7 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
return false;
|
||||
}
|
||||
|
||||
debug_enqueue(kernel, work_size);
|
||||
debug_enqueue_begin(kernel, work_size);
|
||||
|
||||
const HIPContextScope scope(hip_device_);
|
||||
const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);
|
||||
|
@ -120,6 +120,8 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
0),
|
||||
"enqueue");
|
||||
|
||||
debug_enqueue_end();
|
||||
|
||||
return !(hip_device_->have_error());
|
||||
}
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
|
||||
void **args = const_cast<void **>(_args.values);
|
||||
|
||||
debug_enqueue(kernel, signed_kernel_work_size);
|
||||
debug_enqueue_begin(kernel, signed_kernel_work_size);
|
||||
assert(signed_kernel_work_size >= 0);
|
||||
size_t kernel_work_size = (size_t)signed_kernel_work_size;
|
||||
|
||||
|
@ -97,6 +97,8 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
oneapi_device_->oneapi_error_message() + "\"");
|
||||
}
|
||||
|
||||
debug_enqueue_end();
|
||||
|
||||
return is_finished_ok;
|
||||
}
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
return false;
|
||||
}
|
||||
|
||||
debug_enqueue(kernel, work_size);
|
||||
debug_enqueue_begin(kernel, work_size);
|
||||
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
|
||||
|
@ -131,6 +131,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
1,
|
||||
1));
|
||||
|
||||
debug_enqueue_end();
|
||||
|
||||
return !(optix_device->have_error());
|
||||
}
|
||||
|
||||
|
|
|
@ -12,9 +12,13 @@
|
|||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
DeviceQueue::DeviceQueue(Device *device)
|
||||
: device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
|
||||
: device(device),
|
||||
last_kernels_enqueued_(0),
|
||||
last_sync_time_(0.0),
|
||||
is_per_kernel_performance_(false)
|
||||
{
|
||||
DCHECK_NE(device, nullptr);
|
||||
is_per_kernel_performance_ = getenv("CYCLES_DEBUG_PER_KERNEL_PERFORMANCE");
|
||||
}
|
||||
|
||||
DeviceQueue::~DeviceQueue()
|
||||
|
@ -33,11 +37,17 @@ DeviceQueue::~DeviceQueue()
|
|||
});
|
||||
|
||||
VLOG_DEVICE_STATS << "GPU queue stats:";
|
||||
double total_time = 0.0;
|
||||
for (const auto &[mask, time] : stats_sorted) {
|
||||
total_time += time;
|
||||
VLOG_DEVICE_STATS << " " << std::setfill(' ') << std::setw(10) << std::fixed
|
||||
<< std::setprecision(5) << std::right << time
|
||||
<< "s: " << device_kernel_mask_as_string(mask);
|
||||
}
|
||||
|
||||
if (is_per_kernel_performance_)
|
||||
VLOG_DEVICE_STATS << "GPU queue total time: " << std::fixed << std::setprecision(5)
|
||||
<< total_time;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -50,7 +60,7 @@ void DeviceQueue::debug_init_execution()
|
|||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
|
||||
void DeviceQueue::debug_enqueue_begin(DeviceKernel kernel, const int work_size)
|
||||
{
|
||||
if (VLOG_DEVICE_STATS_IS_ON) {
|
||||
VLOG_DEVICE_STATS << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
||||
|
@ -60,6 +70,13 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
|
|||
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_enqueue_end()
|
||||
{
|
||||
if (VLOG_DEVICE_STATS_IS_ON && is_per_kernel_performance_) {
|
||||
synchronize();
|
||||
}
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_synchronize()
|
||||
{
|
||||
if (VLOG_DEVICE_STATS_IS_ON) {
|
||||
|
@ -67,7 +84,11 @@ void DeviceQueue::debug_synchronize()
|
|||
const double elapsed_time = new_time - last_sync_time_;
|
||||
VLOG_DEVICE_STATS << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
|
||||
|
||||
stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
|
||||
/* It makes no sense to have entries in the performance data
|
||||
* container without related kernel information. */
|
||||
if (last_kernels_enqueued_ != 0) {
|
||||
stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
|
||||
}
|
||||
|
||||
last_sync_time_ = new_time;
|
||||
}
|
||||
|
|
|
@ -162,7 +162,8 @@ class DeviceQueue {
|
|||
|
||||
/* Implementations call these from the corresponding methods to generate debugging logs. */
|
||||
void debug_init_execution();
|
||||
void debug_enqueue(DeviceKernel kernel, const int work_size);
|
||||
void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
|
||||
void debug_enqueue_end();
|
||||
void debug_synchronize();
|
||||
string debug_active_kernels();
|
||||
|
||||
|
@ -172,6 +173,9 @@ class DeviceQueue {
|
|||
double last_sync_time_;
|
||||
/* Accumulated execution time for combinations of kernels launched together. */
|
||||
map<DeviceKernelMask, double> stats_kernel_time_;
|
||||
/* If true, performance statistics in the debugging logs will focus on individual kernels,
|
||||
 * and an explicit queue synchronization will be added after each kernel execution. */
|
||||
bool is_per_kernel_performance_;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
Loading…
Reference in New Issue