Fix T50888: Numeric overflow in split kernel state buffer size calculation

Overflow led to the state buffer being too small and the split kernel to
get stuck doing nothing forever.
This commit is contained in:
Mai Lavelle 2017-03-11 05:23:11 -05:00
parent 5afe4c787f
commit 96868a3941
Notes: blender-bot 2023-02-14 07:09:37 +01:00
Referenced by issue #50925, AO bounce simplification diffences between GPU and CPU
Referenced by issue #50927, intersect (knife) sometime crashes blender.
Referenced by issue #50888, Master Broken, Cuda kernel compat errors compile, OpenCL GPU just sits there on one tile never rendering anything.
11 changed files with 41 additions and 28 deletions

View File

@ -72,7 +72,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};
class CPUDevice : public Device
@ -860,7 +860,7 @@ int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memo
return task->requested_tile_size;
}
size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
return split_data_buffer_size(kg, num_threads);

View File

@ -89,7 +89,7 @@ class CUDASplitKernel : public DeviceSplitKernel {
public:
explicit CUDASplitKernel(CUDADevice *device);
virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
@ -1473,9 +1473,9 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
{
}
size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
{
device_vector<uint> size_buffer;
device_vector<uint64_t> size_buffer;
size_buffer.resize(1);
device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
@ -1504,7 +1504,7 @@ size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory&
device->cuda_pop_context();
device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
device->mem_free(size_buffer);
return *size_buffer.get_data();

View File

@ -48,7 +48,8 @@ enum DataType {
TYPE_UINT,
TYPE_INT,
TYPE_FLOAT,
TYPE_HALF
TYPE_HALF,
TYPE_UINT64,
};
static inline size_t datatype_size(DataType datatype)
@ -59,6 +60,7 @@ static inline size_t datatype_size(DataType datatype)
case TYPE_UINT: return sizeof(uint);
case TYPE_INT: return sizeof(int);
case TYPE_HALF: return sizeof(half);
case TYPE_UINT64: return sizeof(uint64_t);
default: return 0;
}
}
@ -160,6 +162,11 @@ template<> struct device_type_traits<half4> {
static const int num_elements = 4;
};
template<> struct device_type_traits<uint64_t> {
static const DataType data_type = TYPE_UINT64;
static const int num_elements = 1;
};
/* Device Memory */
class device_memory

View File

@ -105,9 +105,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
return true;
}
size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size)
size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
{
size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
return max_buffer_size / size_per_element;
}

View File

@ -105,8 +105,8 @@ public:
device_memory& kgbuffer,
device_memory& kernel_data);
virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size);
virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,

View File

@ -334,11 +334,11 @@ void OpenCLDeviceBase::mem_zero(device_memory& mem)
size_t num_threads = global_size[0] * global_size[1];
cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
unsigned long long d_offset = 0;
unsigned long long d_size = 0;
cl_ulong d_offset = 0;
cl_ulong d_size = 0;
while(d_offset < mem.memory_size()) {
d_size = std::min<unsigned long long>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);

View File

@ -227,9 +227,9 @@ public:
return kernel;
}
virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
{
device_vector<uint> size_buffer;
device_vector<uint64_t> size_buffer;
size_buffer.resize(1);
device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
@ -249,7 +249,7 @@ public:
device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
device->mem_free(size_buffer);
if(device->ciErr != CL_SUCCESS) {
@ -346,8 +346,8 @@ public:
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
{
size_t max_buffer_size;
clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
cl_ulong max_buffer_size;
clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
VLOG(1) << "Maximum device allocation side: "
<< string_human_readable_number(max_buffer_size) << " bytes. ("
<< string_human_readable_size(max_buffer_size) << ").";

View File

@ -46,7 +46,7 @@
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_state_buffer_size(uint num_threads, uint *size)
kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
{
*size = split_data_buffer_size(NULL, num_threads);
}

View File

@ -21,7 +21,7 @@ __kernel void kernel_ocl_path_trace_state_buffer_size(
KernelGlobals *kg,
ccl_constant KernelData *data,
uint num_threads,
ccl_global uint *size)
ccl_global uint64_t *size)
{
kg->data = data;
*size = split_data_buffer_size(kg, num_threads);

View File

@ -22,11 +22,11 @@
CCL_NAMESPACE_BEGIN
ccl_device_inline size_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
{
(void)kg; /* Unused on CPU. */
size_t size = 0;
uint64_t size = 0;
#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
size = size SPLIT_DATA_ENTRIES;
#undef SPLIT_DATA_ENTRY

View File

@ -106,10 +106,16 @@ typedef unsigned int uint;
#endif
#ifndef __KERNEL_GPU__
/* Fixed Bits Types */
#ifdef __KERNEL_OPENCL__
typedef ulong uint64_t;
#endif
#ifndef __KERNEL_GPU__
#ifdef _WIN32
typedef signed char int8_t;
@ -474,17 +480,17 @@ ccl_device_inline int4 make_int4(const float3& f)
#endif
ccl_device_inline int align_up(int offset, int alignment)
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
{
return (offset + alignment - 1) & ~(alignment - 1);
}
ccl_device_inline int round_up(int x, int multiple)
ccl_device_inline size_t round_up(size_t x, size_t multiple)
{
return ((x + multiple - 1) / multiple) * multiple;
}
ccl_device_inline int round_down(int x, int multiple)
ccl_device_inline size_t round_down(size_t x, size_t multiple)
{
return (x / multiple) * multiple;
}