Cycles Denoising: Pass tile buffers to every OpenCL kernel to conform to standard and get rid of set_tile_info

This commit is contained in:
Lukas Stockner 2018-07-04 14:02:38 +02:00
parent f1525cf534
commit c960804747
9 changed files with 55 additions and 92 deletions

View File

@ -459,18 +459,6 @@ public:
}
};
bool denoising_set_tile_info(device_ptr *buffers, DenoisingTask *task)
{
TileInfo *tile_info = (TileInfo*) task->tile_info_mem.host_pointer;
for(int i = 0; i < 9; i++) {
tile_info->buffers[i] = buffers[i];
}
task->tile_info_mem.copy_to_device();
return true;
}
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
DenoisingTask *task)
{
@ -722,7 +710,6 @@ public:
denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.functions.set_tile_info = function_bind(&CPUDevice::denoising_set_tile_info, this, _1, &denoising);
denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
denoising.render_buffer.samples = tile.sample;

View File

@ -1251,18 +1251,6 @@ public:
}
}
bool denoising_set_tile_info(device_ptr *buffers, DenoisingTask *task)
{
TileInfo *tile_info = (TileInfo*) task->tile_info_mem.host_pointer;
for(int i = 0; i < 9; i++) {
tile_info->buffers[i] = buffers[i];
}
task->tile_info_mem.copy_to_device();
return !have_error();
}
#define CUDA_GET_BLOCKSIZE(func, w, h) \
int threads_per_block; \
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
@ -1622,7 +1610,6 @@ public:
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.functions.set_tile_info = function_bind(&CUDADevice::denoising_set_tile_info, this, _1, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
denoising.render_buffer.samples = rtile.sample;

View File

@ -62,11 +62,10 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
{
tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
device_ptr buffers[9];
for(int i = 0; i < 9; i++) {
buffers[i] = rtiles[i].buffer;
tile_info->offsets[i] = rtiles[i].offset;
tile_info->strides[i] = rtiles[i].stride;
tile_info->buffers[i] = rtiles[i].buffer;
}
tile_info->x[0] = rtiles[3].x;
tile_info->x[1] = rtiles[4].x;
@ -81,7 +80,7 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
target_buffer.stride = rtiles[9].stride;
target_buffer.ptr = rtiles[9].buffer;
functions.set_tile_info(buffers);
tile_info_mem.copy_to_device();
}
void DenoisingTask::setup_denoising_buffer()

View File

@ -89,7 +89,6 @@ public:
device_ptr depth_ptr,
device_ptr output_ptr
)> detect_outliers;
function<bool(device_ptr*)> set_tile_info;
function<void(RenderTile *rtiles)> map_neighbor_tiles;
function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
} functions;

View File

@ -436,8 +436,6 @@ protected:
device_ptr depth_ptr,
device_ptr output_ptr,
DenoisingTask *task);
bool denoising_set_tile_info(device_ptr *buffers,
DenoisingTask *task);
device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
void mem_free_sub_ptr(device_ptr ptr);

View File

@ -246,7 +246,6 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
denoising_program.add_kernel(ustring("filter_nlm_normalize"));
denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
denoising_program.add_kernel(ustring("filter_finalize"));
denoising_program.add_kernel(ustring("filter_set_tile_info"));
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
@ -981,9 +980,16 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
kernel_set_args(ckFilterDivideShadow, 0,
task->render_buffer.samples,
tile_info_mem,
int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0,
task->render_buffer.samples,
tile_info_mem);
cl_mem buffers[9];
for(int i = 0; i < 9; i++) {
buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs,
buffers[i]);
}
kernel_set_args(ckFilterDivideShadow, arg_ofs,
a_mem,
b_mem,
sample_variance_mem,
@ -1012,9 +1018,16 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
kernel_set_args(ckFilterGetFeature, 0,
task->render_buffer.samples,
tile_info_mem,
int arg_ofs = kernel_set_args(ckFilterGetFeature, 0,
task->render_buffer.samples,
tile_info_mem);
cl_mem buffers[9];
for(int i = 0; i < 9; i++) {
buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs,
buffers[i]);
}
kernel_set_args(ckFilterGetFeature, arg_ofs,
mean_offset,
variance_offset,
mean_mem,
@ -1056,29 +1069,8 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
return true;
}
bool OpenCLDeviceBase::denoising_set_tile_info(device_ptr *buffers,
DenoisingTask *task)
{
task->tile_info_mem.copy_to_device();
cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
cl_kernel ckFilterSetTileInfo = denoising_program(ustring("filter_set_tile_info"));
kernel_set_args(ckFilterSetTileInfo, 0, tile_info_mem);
for(int i = 0; i < 9; i++) {
cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
kernel_set_args(ckFilterSetTileInfo, i+1, buffer_mem);
}
enqueue_kernel(ckFilterSetTileInfo, 1, 1);
return true;
}
void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.set_tile_info = function_bind(&OpenCLDeviceBase::denoising_set_tile_info, this, _1, &denoising);
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);

View File

@ -35,4 +35,29 @@ typedef struct TileInfo {
#endif
} TileInfo;
#ifdef __KERNEL_OPENCL__
# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info, \
ccl_global float *tile_buffer_1, \
ccl_global float *tile_buffer_2, \
ccl_global float *tile_buffer_3, \
ccl_global float *tile_buffer_4, \
ccl_global float *tile_buffer_5, \
ccl_global float *tile_buffer_6, \
ccl_global float *tile_buffer_7, \
ccl_global float *tile_buffer_8, \
ccl_global float *tile_buffer_9
# define CCL_FILTER_TILE_INFO_ARG tile_info, \
tile_buffer_1, tile_buffer_2, tile_buffer_3, \
tile_buffer_4, tile_buffer_5, tile_buffer_6, \
tile_buffer_7, tile_buffer_8, tile_buffer_9
# define ccl_get_tile_buffer(id) (tile_buffer_ ## id)
#else
# ifdef __KERNEL_CUDA__
# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info
# else
# define CCL_FILTER_TILE_INFO TileInfo* tile_info
# endif
# define ccl_get_tile_buffer(id) (tile_info->buffers[id])
#endif
#endif /* __FILTER_DEFINES_H__*/

View File

@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN
* bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
*/
ccl_device void kernel_filter_divide_shadow(int sample,
ccl_global TileInfo *tile_info,
CCL_FILTER_TILE_INFO,
int x, int y,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
@ -43,7 +43,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
int offset = tile_info->offsets[tile];
int stride = tile_info->strides[tile];
const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tile_info->buffers[tile];
const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) ccl_get_tile_buffer(tile);
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
center_buffer += buffer_denoising_offset + 14;
@ -79,7 +79,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
* - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
*/
ccl_device void kernel_filter_get_feature(int sample,
ccl_global TileInfo *tile_info,
CCL_FILTER_TILE_INFO,
int m_offset, int v_offset,
int x, int y,
ccl_global float *mean,
@ -90,7 +90,7 @@ ccl_device void kernel_filter_get_feature(int sample,
int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2);
int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2);
int tile = ytile*3+xtile;
ccl_global float *center_buffer = ((ccl_global float*) tile_info->buffers[tile]) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
ccl_global float *center_buffer = ((ccl_global float*) ccl_get_tile_buffer(tile)) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);

View File

@ -23,7 +23,7 @@
/* kernels */
__kernel void kernel_ocl_filter_divide_shadow(int sample,
ccl_global TileInfo *tile_info,
CCL_FILTER_TILE_INFO,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
@ -37,7 +37,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
CCL_FILTER_TILE_INFO_ARG,
x, y,
unfilteredA,
unfilteredB,
@ -51,7 +51,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
}
__kernel void kernel_ocl_filter_get_feature(int sample,
ccl_global TileInfo *tile_info,
CCL_FILTER_TILE_INFO,
int m_offset,
int v_offset,
ccl_global float *mean,
@ -64,7 +64,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
CCL_FILTER_TILE_INFO_ARG,
m_offset, v_offset,
x, y,
mean, variance,
@ -276,27 +276,3 @@ __kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
buffer_params, sample);
}
}
__kernel void kernel_ocl_filter_set_tile_info(ccl_global TileInfo* tile_info,
ccl_global float *buffer_1,
ccl_global float *buffer_2,
ccl_global float *buffer_3,
ccl_global float *buffer_4,
ccl_global float *buffer_5,
ccl_global float *buffer_6,
ccl_global float *buffer_7,
ccl_global float *buffer_8,
ccl_global float *buffer_9)
{
if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
tile_info->buffers[0] = buffer_1;
tile_info->buffers[1] = buffer_2;
tile_info->buffers[2] = buffer_3;
tile_info->buffers[3] = buffer_4;
tile_info->buffers[4] = buffer_5;
tile_info->buffers[5] = buffer_6;
tile_info->buffers[6] = buffer_7;
tile_info->buffers[7] = buffer_8;
tile_info->buffers[8] = buffer_9;
}
}