Cycles: print name of kernels on errors in CUDA queue, for debugging

This commit is contained in:
Brecht Van Lommel 2021-09-27 14:47:51 +02:00
parent 2bd0205215
commit a6b53ef994
4 changed files with 47 additions and 25 deletions

View File

@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
}
/* Launch kernel. */
cuda_device_assert(cuda_device_,
cuLaunchKernel(cuda_kernel.function,
num_blocks,
1,
1,
num_threads_per_block,
1,
1,
shared_mem_bytes,
cuda_stream_,
args,
0));
assert_success(cuLaunchKernel(cuda_kernel.function,
num_blocks,
1,
1,
num_threads_per_block,
1,
1,
shared_mem_bytes,
cuda_stream_,
args,
0),
"enqueue");
return !(cuda_device_->have_error());
}
@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
}
const CUDAContextScope scope(cuda_device_);
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
debug_synchronize();
return !(cuda_device_->have_error());
@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
assert(mem.device_pointer != 0);
const CUDAContextScope scope(cuda_device_);
cuda_device_assert(
cuda_device_,
cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
assert_success(
cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
"zero_to_device");
}
void CUDADeviceQueue::copy_to_device(device_memory &mem)
@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem)
/* Copy memory to device. */
const CUDAContextScope scope(cuda_device_);
cuda_device_assert(
cuda_device_,
assert_success(
cuMemcpyHtoDAsync(
(CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
(CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
"copy_to_device");
}
void CUDADeviceQueue::copy_from_device(device_memory &mem)
@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem)
/* Copy memory from device. */
const CUDAContextScope scope(cuda_device_);
cuda_device_assert(
cuda_device_,
assert_success(
cuMemcpyDtoHAsync(
mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
"copy_from_device");
}
void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
{
if (result != CUDA_SUCCESS) {
const char *name = cuewErrorString(result);
cuda_device_->set_error(string_printf(
"%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
}
}
unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()

View File

@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
protected:
CUDADevice *cuda_device_;
CUstream cuda_stream_;
void assert_success(CUresult result, const char *operation);
};
CCL_NAMESPACE_END

View File

@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution()
{
if (VLOG_IS_ON(3)) {
last_sync_time_ = time_dt();
last_kernels_enqueued_ = 0;
}
last_kernels_enqueued_ = 0;
}
void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
if (VLOG_IS_ON(3)) {
VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
<< work_size;
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
}
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
}
void DeviceQueue::debug_synchronize()
@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize()
stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
last_sync_time_ = new_time;
last_kernels_enqueued_ = 0;
}
last_kernels_enqueued_ = 0;
}
string DeviceQueue::debug_active_kernels()
{
return device_kernel_mask_as_string(last_kernels_enqueued_);
}
CCL_NAMESPACE_END

View File

@ -21,6 +21,7 @@
#include "device/device_graphics_interop.h"
#include "util/util_logging.h"
#include "util/util_map.h"
#include "util/util_string.h"
#include "util/util_unique_ptr.h"
CCL_NAMESPACE_BEGIN
@ -101,6 +102,7 @@ class DeviceQueue {
void debug_init_execution();
void debug_enqueue(DeviceKernel kernel, const int work_size);
void debug_synchronize();
string debug_active_kernels();
/* Combination of kernels enqueued together sync last synchronize. */
DeviceKernelMask last_kernels_enqueued_;