Code refactor: simplify CUDA context push/pop.

This makes it possible to call a function like mem_alloc() while the
context is already active. It also fixes some missing context pops on
error paths, where early returns skipped the manual cuda_pop_context().
Brecht Van Lommel 2017-09-24 00:18:28 +02:00
parent 43b4913051
commit 88520dd5b6
1 changed file with 69 additions and 109 deletions
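
Why RAII nesting matters here: cuCtxPushCurrent()/cuCtxPopCurrent()
maintain a per-thread context stack, so scopes nest, while the old
cuCtxSetCurrent(cuContext)/cuCtxSetCurrent(NULL) pair rebinds the
thread's context unconditionally, so an inner call such as mem_alloc()
would leave the caller with no context bound on return. A minimal
stand-alone sketch of the pattern (illustrative names, not the Cycles
code itself):

#include <cuda.h>
#include <stddef.h>

struct ContextScope {
	explicit ContextScope(CUcontext ctx) { cuCtxPushCurrent(ctx); }
	~ContextScope() { cuCtxPopCurrent(NULL); } /* runs on every exit path */
};

static void inner_alloc(CUcontext ctx, CUdeviceptr *ptr, size_t size)
{
	ContextScope scope(ctx); /* pushing the same context again is legal */
	cuMemAlloc(ptr, size);
} /* pop restores the caller's binding instead of clearing it */

static void outer(CUcontext ctx)
{
	ContextScope scope(ctx);
	CUdeviceptr ptr;
	inner_alloc(ctx, &ptr, 1024); /* nested scope: safe */
	cuMemFree(ptr); /* the context is still current here */
}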


@@ -111,6 +111,16 @@ public:
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
/* Utility to push/pop CUDA context. */
class CUDAContextScope {
public:
CUDAContextScope(CUDADevice *device);
~CUDAContextScope();
private:
CUDADevice *device;
};
class CUDADevice : public Device
{
public:
@@ -206,16 +216,6 @@ public:
cuda_error_documentation();
}
void cuda_push_context()
{
cuda_assert(cuCtxSetCurrent(cuContext));
}
void cuda_pop_context()
{
cuda_assert(cuCtxSetCurrent(NULL));
}
CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_)
{
@@ -263,7 +263,8 @@ public:
cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
cuDevArchitecture = major*100 + minor*10;
cuda_pop_context();
/* Pop context set by cuCtxCreate. */
cuCtxPopCurrent(NULL);
}
~CUDADevice()
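
For the hunk above: cuCtxCreate() not only creates the context but also
makes it current on the calling thread, which is why the constructor now
pops it right away; afterwards the context is only ever bound through a
CUDAContextScope. A minimal sketch of that create-then-pop pattern
(illustrative, not the actual constructor):

CUcontext ctx;
CUdevice dev;
cuDeviceGet(&dev, 0);                      /* first device; assumes cuInit(0) ran */
cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev); /* ctx becomes current on this thread */
/* ... one-time device queries while the context is current ... */
cuCtxPopCurrent(NULL);                     /* leave the thread with no bound context */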
@@ -519,7 +520,7 @@ public:
return false;
/* open module */
cuda_push_context();
CUDAContextScope scope(this);
string cubin_data;
CUresult result;
@@ -540,8 +541,6 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
cuda_pop_context();
return (result == CUDA_SUCCESS);
}
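
The "missing pops" half of the fix is easiest to see in path_trace()
further down: the old code pushed the context and then returned early if
a kernel lookup failed, skipping the matching cuda_pop_context(). With
the RAII scope the destructor pops on every exit path. A minimal sketch
(hypothetical helper, not the actual code):

static bool load_module_checked(CUDADevice *device, CUmodule *module, const char *path)
{
	CUDAContextScope scope(device); /* context pushed */
	if(cuModuleLoad(module, path) != CUDA_SUCCESS)
		return false; /* destructor still pops the context */
	return true; /* ...and pops here as well */
}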
@@ -556,36 +555,36 @@ public:
void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
{
CUDAContextScope scope(this);
if(name) {
VLOG(1) << "Buffer allocate: " << name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
}
cuda_push_context();
CUdeviceptr device_pointer;
size_t size = mem.memory_size();
cuda_assert(cuMemAlloc(&device_pointer, size));
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
cuda_pop_context();
}
void mem_copy_to(device_memory& mem)
{
cuda_push_context();
CUDAContextScope scope(this);
if(mem.device_pointer)
cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
cuda_pop_context();
}
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
{
CUDAContextScope scope(this);
size_t offset = elem*y*w;
size_t size = elem*w*h;
cuda_push_context();
if(mem.device_pointer) {
cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
(CUdeviceptr)(mem.device_pointer + offset), size));
@@ -593,7 +592,6 @@ public:
else {
memset((char*)mem.data_pointer + offset, 0, size);
}
cuda_pop_context();
}
void mem_zero(device_memory& mem)
@@ -602,18 +600,17 @@ public:
memset((void*)mem.data_pointer, 0, mem.memory_size());
}
cuda_push_context();
if(mem.device_pointer)
if(mem.device_pointer) {
CUDAContextScope scope(this);
cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
cuda_pop_context();
}
}
void mem_free(device_memory& mem)
{
if(mem.device_pointer) {
cuda_push_context();
CUDAContextScope scope(this);
cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
cuda_pop_context();
mem.device_pointer = 0;
@@ -629,14 +626,13 @@ public:
void const_copy_to(const char *name, void *host, size_t size)
{
CUDAContextScope scope(this);
CUdeviceptr mem;
size_t bytes;
cuda_push_context();
cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
//assert(bytes == size);
cuda_assert(cuMemcpyHtoD(mem, host, size));
cuda_pop_context();
}
void tex_alloc(const char *name,
@@ -644,6 +640,8 @@ public:
InterpolationType interpolation,
ExtensionType extension)
{
CUDAContextScope scope(this);
VLOG(1) << "Texture allocate: " << name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
@@ -706,9 +704,7 @@ public:
tokens[3].c_str());
}
cuda_push_context();
cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
cuda_pop_context();
if(!texref) {
return;
@@ -721,8 +717,6 @@ public:
mem_alloc(NULL, mem, MEM_READ_ONLY);
mem_copy_to(mem);
cuda_push_context();
CUdeviceptr cumem;
size_t cubytes;
@@ -738,28 +732,20 @@ public:
uint32_t ptr = (uint32_t)mem.device_pointer;
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
cuda_pop_context();
}
else {
mem_alloc(NULL, mem, MEM_READ_ONLY);
mem_copy_to(mem);
cuda_push_context();
cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
cuda_pop_context();
}
}
/* Texture Storage */
else {
CUarray handle = NULL;
cuda_push_context();
if(mem.data_depth > 1) {
CUDA_ARRAY3D_DESCRIPTOR desc;
@@ -784,7 +770,6 @@ public:
}
if(!handle) {
cuda_pop_context();
return;
}
@@ -877,14 +862,10 @@ public:
cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
}
cuda_pop_context();
}
/* Fermi, Data and Image Textures */
if(!has_bindless_textures) {
cuda_push_context();
cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
if(mem.data_depth > 1) {
@@ -892,8 +873,6 @@ public:
}
cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
cuda_pop_context();
}
/* Fermi and Kepler */
@@ -904,9 +883,8 @@ public:
{
if(mem.device_pointer) {
if(tex_interp_map[mem.device_pointer]) {
cuda_push_context();
CUDAContextScope scope(this);
cuArrayDestroy((CUarray)mem.device_pointer);
cuda_pop_context();
/* Free CUtexObject (Bindless Textures) */
if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
@@ -960,7 +938,7 @@ public:
if(have_error())
return false;
cuda_push_context();
CUDAContextScope scope(this);
int4 rect = task->rect;
int w = align_up(rect.z-rect.x, 4);
@@ -1017,7 +995,6 @@ public:
CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1026,7 +1003,7 @@ public:
if(have_error())
return false;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuFilterConstructTransform;
cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
@@ -1046,7 +1023,6 @@ public:
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1058,11 +1034,11 @@ public:
if(have_error())
return false;
CUDAContextScope scope(this);
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
cuda_push_context();
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
@@ -1150,7 +1126,6 @@ public:
CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1161,7 +1136,7 @@ public:
if(have_error())
return false;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuFilterCombineHalves;
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
@@ -1179,7 +1154,6 @@ public:
CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1190,7 +1164,7 @@ public:
if(have_error())
return false;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuFilterDivideShadow;
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
@@ -1214,7 +1188,6 @@ public:
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1227,7 +1200,7 @@ public:
if(have_error())
return false;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuFilterGetFeature;
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
@@ -1250,7 +1223,6 @@ public:
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1263,7 +1235,7 @@ public:
if(have_error())
return false;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuFilterDetectOutliers;
cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
@@ -1282,7 +1254,6 @@ public:
CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
@@ -1319,7 +1290,7 @@ public:
if(have_error())
return;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuPathTrace;
CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
@@ -1333,8 +1304,9 @@ public:
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
}
if(have_error())
if(have_error()) {
return;
}
/* pass in parameters */
void *args[] = {&d_buffer,
@@ -1370,8 +1342,6 @@ public:
0, 0, args, 0));
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
}
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -1379,7 +1349,7 @@ public:
if(have_error())
return;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuFilmConvert;
CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -1424,8 +1394,6 @@ public:
0, 0, args, 0));
unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
cuda_pop_context();
}
void shader(DeviceTask& task)
@@ -1433,7 +1401,7 @@ public:
if(have_error())
return;
cuda_push_context();
CUDAContextScope scope(this);
CUfunction cuShader;
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
@@ -1498,8 +1466,6 @@ public:
task.update_progress(NULL);
}
cuda_pop_context();
}
CUdeviceptr map_pixels(device_ptr mem)
@@ -1535,7 +1501,7 @@ public:
pmem.w = mem.data_width;
pmem.h = mem.data_height;
cuda_push_context();
CUDAContextScope scope(this);
glGenBuffers(1, &pmem.cuPBO);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
@@ -1559,8 +1525,6 @@ public:
CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
if(result == CUDA_SUCCESS) {
cuda_pop_context();
mem.device_pointer = pmem.cuTexId;
pixel_mem_map[mem.device_pointer] = pmem;
@@ -1574,8 +1538,6 @@ public:
glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId);
cuda_pop_context();
background = true;
}
}
@@ -1588,7 +1550,7 @@ public:
if(!background) {
PixelMem pmem = pixel_mem_map[mem.device_pointer];
cuda_push_context();
CUDAContextScope scope(this);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
@@ -1597,8 +1559,6 @@ public:
glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
cuda_pop_context();
return;
}
@@ -1611,14 +1571,12 @@ public:
if(!background) {
PixelMem pmem = pixel_mem_map[mem.device_pointer];
cuda_push_context();
CUDAContextScope scope(this);
cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId);
cuda_pop_context();
pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
mem.device_pointer = 0;
@@ -1639,7 +1597,7 @@ public:
PixelMem pmem = pixel_mem_map[mem.device_pointer];
float *vpointer;
cuda_push_context();
CUDAContextScope scope(this);
/* for multi devices, this assumes the inefficient method that we allocate
* all pixels on the device even though we only render to a subset */
@@ -1728,8 +1686,6 @@ public:
glBindTexture(GL_TEXTURE_2D, 0);
glDisable(GL_TEXTURE_2D);
cuda_pop_context();
return;
}
@@ -1738,6 +1694,8 @@ public:
void thread_run(DeviceTask *task)
{
CUDAContextScope scope(this);
if(task->type == DeviceTask::RENDER) {
RenderTile tile;
@@ -1805,9 +1763,7 @@ public:
shader(*task);
cuda_push_context();
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
}
}
@@ -1828,12 +1784,11 @@ public:
void task_add(DeviceTask& task)
{
if(task.type == DeviceTask::FILM_CONVERT) {
CUDAContextScope scope(this);
/* must be done in main thread due to opengl access */
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
cuda_push_context();
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
}
else {
task_pool.push(new CUDADeviceTask(this, task));
@@ -1852,6 +1807,7 @@ public:
friend class CUDASplitKernelFunction;
friend class CUDASplitKernel;
friend class CUDAContextScope;
};
/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
@@ -1872,6 +1828,20 @@ public:
} \
} (void)0
/* CUDA context scope. */
CUDAContextScope::CUDAContextScope(CUDADevice *device)
: device(device)
{
cuda_assert(cuCtxPushCurrent(device->cuContext));
}
CUDAContextScope::~CUDAContextScope()
{
cuda_assert(cuCtxPopCurrent(NULL));
}
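
Note how this composes with the coarser scope in thread_run() above: the
whole task runs inside a single CUDAContextScope, and the per-function
scopes in mem_copy_from(), path_trace() and the rest nest within it,
which is precisely what the per-thread push/pop stack permits.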
/* split kernel */
class CUDASplitKernelFunction : public SplitKernelFunction {
@@ -1889,11 +1859,11 @@ public:
/* enqueue the kernel, returns false if there is an error */
bool enqueue(const KernelDimensions &dim, void *args[])
{
device->cuda_push_context();
if(device->have_error())
return false;
CUDAContextScope scope(device);
/* we ignore dim.local_size for now, as this is faster */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
@@ -1907,8 +1877,6 @@ public:
threads_per_block, 1, 1, /* threads */
0, 0, args, 0));
device->cuda_pop_context();
return !device->have_error();
}
};
@@ -1919,12 +1887,12 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
{
CUDAContextScope scope(device);
device_vector<uint64_t> size_buffer;
size_buffer.resize(1);
device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
device->cuda_push_context();
uint threads = num_threads;
CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
@@ -1946,8 +1914,6 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory
1, 1, 1,
0, 0, (void**)&args, 0));
device->cuda_pop_context();
device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
device->mem_free(size_buffer);
@@ -1965,7 +1931,7 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
device_memory& use_queues_flag,
device_memory& work_pool_wgs)
{
device->cuda_push_context();
CUDAContextScope scope(device);
CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
@@ -2029,26 +1995,21 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
device->cuda_pop_context();
return !device->have_error();
}
SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
const DeviceRequestedFeatures&)
{
CUDAContextScope scope(device);
CUfunction func;
device->cuda_push_context();
cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
if(device->have_error()) {
device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
return NULL;
}
device->cuda_pop_context();
return new CUDASplitKernelFunction(device, func);
}
@@ -2059,12 +2020,11 @@ int2 CUDASplitKernel::split_kernel_local_size()
int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
{
CUDAContextScope scope(device);
size_t free;
size_t total;
device->cuda_push_context();
cuda_assert(cuMemGetInfo(&free, &total));
device->cuda_pop_context();
VLOG(1) << "Maximum device allocation size: "
<< string_human_readable_number(free) << " bytes. ("