Cycles: Implement automatic global size for CUDA split kernel

Not sure this is the best way to do things for CUDA, but it's much better than
being unimplemented.
This commit is contained in:
Mai Lavelle 2017-04-11 02:36:08 -04:00
parent 3722da3b4e
commit 1e6038a426
1 changed files with 16 additions and 3 deletions

View File

@ -1613,10 +1613,23 @@ int2 CUDASplitKernel::split_kernel_local_size()
return make_int2(32, 1);
}
/* NOTE(review): this span is a diff hunk rendered without +/- markers, so the
 * pre-change lines (the signature directly below, the TODO and the fixed
 * 256x256 return) presumably appear interleaved with their replacements. */
int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/)
/* Chooses the split kernel's global work size from the amount of device memory
 * that cuMemGetInfo() reports as free, instead of a hard-coded size.
 *
 * kg, data: forwarded unchanged to max_elements_for_max_buffer_size().
 * task:     unused.
 * Returns the (x, y) global size as an int2. */
int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
{
/* TODO(mai): implement something here to detect ideal work size */
return make_int2(256, 256);
/* Outputs of cuMemGetInfo(); only `free` is used afterwards. */
size_t free;
size_t total;
/* The query is bracketed by push/pop of the device's CUDA context
 * (presumably so the driver call targets this device -- confirm). */
device->cuda_push_context();
cuda_assert(cuMemGetInfo(&free, &total));
device->cuda_pop_context();
/* NOTE(review): the label says "Maximum device allocation size", but the value
 * logged is the free amount written by cuMemGetInfo(). */
VLOG(1) << "Maximum device allocation size: "
<< string_human_readable_number(free) << " bytes. ("
<< string_human_readable_size(free) << ").";
/* Only half of the free memory is offered as the buffer budget. The helper
 * presumably returns how many elements fit in that budget -- confirm. */
size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
/* Roughly square layout: both dimensions start from sqrt(num_elements) truncated
 * to int; x additionally goes through round_down(..., 32), presumably to make it
 * a multiple of the local size width returned by split_kernel_local_size().
 * NOTE(review): if sqrt(num_elements) is below 32, x presumably becomes 0 --
 * verify against round_down() and the callers. */
int2 global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements));
VLOG(1) << "Global size: " << global_size << ".";
return global_size;
}
bool device_cuda_init(void)