Fix T46207: Slow OpenCL GPU bake and blown out baking Cycles render

This commit is contained in:
Sergey Sharybin 2016-05-31 17:47:54 +02:00
parent ad1c3bef8b
commit d2bb0e660b
Notes: blender-bot 2023-02-14 08:38:07 +01:00
Referenced by issue #48503, Vertex Colors are not displayed in Material view
Referenced by issue #46207, Slow GPU bake and blown out baking cycles render
3 changed files with 53 additions and 44 deletions

View File

@ -1224,18 +1224,28 @@ public:
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
/* try to divide evenly over 2 dimensions */
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
/* some implementations have max size 1 on 2nd dimension */
/* Some implementations have max size 1 on 2nd dimension. */
if(local_size[1] > max_work_items[1]) {
local_size[0] = workgroup_size/max_work_items[1];
local_size[1] = max_work_items[1];
}
size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
size_t global_size[2] = {global_size_round_up(local_size[0], w),
global_size_round_up(local_size[1], h)};
/* Vertical size of 1 is coming from bake/shade kernels where we should
* not round anything up because otherwise we'll either be doing too
* much work per pixel (if we don't check global ID on Y axis) or will
* be checking for global ID to always have Y of 0.
*/
if (h == 1) {
global_size[h] = 1;
}
/* run kernel */
opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
@ -1320,48 +1330,49 @@ public:
else
kernel = ckShaderKernel;
cl_uint start_arg_index =
kernel_set_args(kernel,
0,
d_data,
d_input,
d_output);
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_output_luma);
}
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(kernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_shader_eval_type);
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_shader_filter);
}
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_shader_x,
d_shader_w,
d_offset);
for(int sample = 0; sample < task.num_samples; sample++) {
if(task.get_cancel())
break;
cl_int d_sample = sample;
cl_uint start_arg_index =
kernel_set_args(kernel,
0,
d_data,
d_input,
d_output);
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_output_luma);
}
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(kernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_shader_eval_type);
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_shader_filter);
}
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_shader_x,
d_shader_w,
d_offset,
d_sample);
kernel_set_args(kernel, start_arg_index, sample);
enqueue_kernel(kernel, task.shader_w, 1);
clFinish(cqCommandQueue);
task.update_progress(NULL);
}
}

View File

@ -482,12 +482,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
/* write output */
float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
if(sample == 0)
output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
else
output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
output[i] = (sample == 0)? scaled_result: output[i] + scaled_result;
}
#endif /* __BAKING__ */

View File

@ -177,7 +177,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
device->mem_alloc(d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
device->mem_alloc(d_output, MEM_WRITE_ONLY);
device->mem_alloc(d_output, MEM_READ_WRITE);
DeviceTask task(DeviceTask::SHADER);
task.shader_input = d_input.device_pointer;