Fix T90308: Cycles crash copying memory from device to host

Happens when device runs out of memory and Cycles is moving some
textures to the host memory.

The delayed memory free for OptiX BVH was moving data from one
device_memory to another, leaving the original device memory in
an invalid state. This was ruining the allocation map in the CUDA
device which is using pointer to the device_memory.

This change makes it so the memory pointer is stolen from BVH
into the delayed memory free list.

Additionally, forbid copying and moving instances of device_memory
and added sanity checks in the device implementation.

Differential Revision: https://developer.blender.org/D13316
This commit is contained in:
Sergey Sharybin 2021-11-22 17:07:55 +01:00
parent 25c83c217b
commit 336ca6796a
Notes: blender-bot 2023-02-14 03:13:26 +01:00
Referenced by issue #91433, invalid value in cuMemcpy
Referenced by issue #90308, Crash when moving data from dedicated GPU memory to shared during cycles Optix viewport rendering
8 changed files with 30 additions and 55 deletions

View File

@ -30,15 +30,17 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
: BVH(params_, geometry_, objects_),
device(device),
traversable_handle(0),
as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
motion_transform_data(device, "optix motion transform", false)
as_data(make_unique<device_only_memory<char>>(
device, params.top_level ? "optix tlas" : "optix blas", false)),
motion_transform_data(
make_unique<device_only_memory<char>>(device, "optix motion transform", false))
{
}
BVHOptiX::~BVHOptiX()
{
// Acceleration structure memory is delayed freed on device, since deleting the
// BVH may happen while still being used for rendering.
/* Acceleration structure memory is delayed freed on device, since deleting the
* BVH may happen while still being used for rendering. */
device->release_optix_bvh(this);
}

View File

@ -25,14 +25,16 @@
# include "device/memory.h"
# include "util/unique_ptr.h"
CCL_NAMESPACE_BEGIN
class BVHOptiX : public BVH {
public:
Device *device;
uint64_t traversable_handle;
device_only_memory<char> as_data;
device_only_memory<char> motion_transform_data;
unique_ptr<device_only_memory<char>> as_data;
unique_ptr<device_only_memory<char>> motion_transform_data;
protected:
friend class BVH;

View File

@ -777,6 +777,7 @@ void CUDADevice::generic_free(device_memory &mem)
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
@ -1145,6 +1146,7 @@ void CUDADevice::tex_free(device_texture &mem)
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
if (cmem.texobject) {

View File

@ -745,6 +745,7 @@ void HIPDevice::generic_free(device_memory &mem)
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
@ -1114,6 +1115,7 @@ void HIPDevice::tex_free(device_texture &mem)
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
if (cmem.texobject) {

View File

@ -44,45 +44,6 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
{
}
device_memory::device_memory(device_memory &&other) noexcept
: data_type(other.data_type),
data_elements(other.data_elements),
data_size(other.data_size),
device_size(other.device_size),
data_width(other.data_width),
data_height(other.data_height),
data_depth(other.data_depth),
type(other.type),
name(other.name),
device(other.device),
device_pointer(other.device_pointer),
host_pointer(other.host_pointer),
shared_pointer(other.shared_pointer),
shared_counter(other.shared_counter),
original_device_ptr(other.original_device_ptr),
original_device_size(other.original_device_size),
original_device(other.original_device),
need_realloc_(other.need_realloc_),
modified(other.modified)
{
other.data_elements = 0;
other.data_size = 0;
other.device_size = 0;
other.data_width = 0;
other.data_height = 0;
other.data_depth = 0;
other.device = 0;
other.device_pointer = 0;
other.host_pointer = 0;
other.shared_pointer = 0;
other.shared_counter = 0;
other.original_device_ptr = 0;
other.original_device_size = 0;
other.original_device = 0;
other.need_realloc_ = false;
other.modified = false;
}
device_memory::~device_memory()
{
assert(shared_pointer == 0);

View File

@ -281,11 +281,16 @@ class device_memory {
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);
device_memory(device_memory &&other) noexcept;
/* No copying allowed. */
/* No copying and allowed.
*
* This is because device implementation might need to register device memory in an allocation
* map of some sort and use pointer as a key to identify blocks. Moving data from one place to
* another bypassing device allocation routines will make those maps hard to maintain. */
device_memory(const device_memory &) = delete;
device_memory(device_memory &&other) noexcept = delete;
device_memory &operator=(const device_memory &) = delete;
device_memory &operator=(device_memory &&) = delete;
/* Host allocation on the device. All host_pointer memory should be
* allocated with these functions, for devices that support using

View File

@ -1032,7 +1032,7 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
return false;
}
device_only_memory<char> &out_data = bvh->as_data;
device_only_memory<char> &out_data = *bvh->as_data;
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
assert(out_data.device == this);
out_data.alloc_to_device(sizes.outputSizeInBytes);
@ -1123,7 +1123,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
operation = OPTIX_BUILD_OPERATION_UPDATE;
}
else {
bvh_optix->as_data.free();
bvh_optix->as_data->free();
bvh_optix->traversable_handle = 0;
}
@ -1344,9 +1344,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
unsigned int num_instances = 0;
unsigned int max_num_instances = 0xFFFFFFFF;
bvh_optix->as_data.free();
bvh_optix->as_data->free();
bvh_optix->traversable_handle = 0;
bvh_optix->motion_transform_data.free();
bvh_optix->motion_transform_data->free();
optixDeviceContextGetProperty(context,
OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
@ -1379,8 +1379,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
assert(bvh_optix->motion_transform_data.device == this);
bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
assert(bvh_optix->motion_transform_data->device == this);
bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
}
for (Object *ob : bvh->objects) {
@ -1441,7 +1441,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
motion_transform_offset = align_up(motion_transform_offset,
OPTIX_TRANSFORM_BYTE_ALIGNMENT);
CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
motion_transform_offset;
motion_transform_offset += motion_transform_size;

View File

@ -23,6 +23,7 @@
# include "device/optix/queue.h"
# include "device/optix/util.h"
# include "kernel/types.h"
# include "util/unique_ptr.h"
CCL_NAMESPACE_BEGIN
@ -76,7 +77,7 @@ class OptiXDevice : public CUDADevice {
device_only_memory<KernelParamsOptiX> launch_params;
OptixTraversableHandle tlas_handle = 0;
vector<device_only_memory<char>> delayed_free_bvh_memory;
vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
thread_mutex delayed_free_bvh_mutex;
class Denoiser {