Cycles: print name of kernels on errors in CUDA queue, for debugging
This commit is contained in:
parent
2bd0205215
commit
a6b53ef994
|
@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
|
|||
}
|
||||
|
||||
/* Launch kernel. */
|
||||
cuda_device_assert(cuda_device_,
|
||||
cuLaunchKernel(cuda_kernel.function,
|
||||
num_blocks,
|
||||
1,
|
||||
1,
|
||||
num_threads_per_block,
|
||||
1,
|
||||
1,
|
||||
shared_mem_bytes,
|
||||
cuda_stream_,
|
||||
args,
|
||||
0));
|
||||
assert_success(cuLaunchKernel(cuda_kernel.function,
|
||||
num_blocks,
|
||||
1,
|
||||
1,
|
||||
num_threads_per_block,
|
||||
1,
|
||||
1,
|
||||
shared_mem_bytes,
|
||||
cuda_stream_,
|
||||
args,
|
||||
0),
|
||||
"enqueue");
|
||||
|
||||
return !(cuda_device_->have_error());
|
||||
}
|
||||
|
@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
|
|||
}
|
||||
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
|
||||
assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
|
||||
|
||||
debug_synchronize();
|
||||
|
||||
return !(cuda_device_->have_error());
|
||||
|
@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
|
|||
assert(mem.device_pointer != 0);
|
||||
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
|
||||
assert_success(
|
||||
cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
|
||||
"zero_to_device");
|
||||
}
|
||||
|
||||
void CUDADeviceQueue::copy_to_device(device_memory &mem)
|
||||
|
@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem)
|
|||
|
||||
/* Copy memory to device. */
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
assert_success(
|
||||
cuMemcpyHtoDAsync(
|
||||
(CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
|
||||
(CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
|
||||
"copy_to_device");
|
||||
}
|
||||
|
||||
void CUDADeviceQueue::copy_from_device(device_memory &mem)
|
||||
|
@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem)
|
|||
|
||||
/* Copy memory from device. */
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
assert_success(
|
||||
cuMemcpyDtoHAsync(
|
||||
mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
|
||||
mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
|
||||
"copy_from_device");
|
||||
}
|
||||
|
||||
void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
|
||||
{
|
||||
if (result != CUDA_SUCCESS) {
|
||||
const char *name = cuewErrorString(result);
|
||||
cuda_device_->set_error(string_printf(
|
||||
"%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
|
||||
|
|
|
@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
|
|||
protected:
|
||||
CUDADevice *cuda_device_;
|
||||
CUstream cuda_stream_;
|
||||
|
||||
void assert_success(CUresult result, const char *operation);
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution()
|
|||
{
|
||||
if (VLOG_IS_ON(3)) {
|
||||
last_sync_time_ = time_dt();
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
|
||||
|
@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
|
|||
if (VLOG_IS_ON(3)) {
|
||||
VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
||||
<< work_size;
|
||||
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
|
||||
}
|
||||
|
||||
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_synchronize()
|
||||
|
@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize()
|
|||
stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
|
||||
|
||||
last_sync_time_ = new_time;
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
string DeviceQueue::debug_active_kernels()
|
||||
{
|
||||
return device_kernel_mask_as_string(last_kernels_enqueued_);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "device/device_graphics_interop.h"
|
||||
#include "util/util_logging.h"
|
||||
#include "util/util_map.h"
|
||||
#include "util/util_string.h"
|
||||
#include "util/util_unique_ptr.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
@ -101,6 +102,7 @@ class DeviceQueue {
|
|||
void debug_init_execution();
|
||||
void debug_enqueue(DeviceKernel kernel, const int work_size);
|
||||
void debug_synchronize();
|
||||
string debug_active_kernels();
|
||||
|
||||
/* Combination of kernels enqueued together sync last synchronize. */
|
||||
DeviceKernelMask last_kernels_enqueued_;
|
||||
|
|
Loading…
Reference in New Issue