Fix T90308: Cycles crash copying memory from device to host

Happens when device runs out of memory and Cycles is moving some textures to the host memory. The delayed memory free for OptiX BVH was moving data from one device_memory to another, leaving the original device memory in an invalid state. This was ruining the allocation map in the CUDA device which is using pointer to the device_memory. This change makes it so the memory pointer is stolen from BVH into the delayed memory free list. Additionally, forbid copying and moving instances of device_memory and added sanity checks in the device implementation. Differential Revision: https://developer.blender.org/D13316
Referenced by issue #91433, invalid value in cuMemcpy Referenced by issue #90308, Crash when moving data from dedicated GPU memory to shared during cycles Optix viewport rendering
2021-11-22 17:07:55 +01:00 · 2021-11-22 17:07:55 +01:00 · 336ca6796a · 2023-02-14 03:13:26 +01:00
parent 25c83c217b
commit 336ca6796a
8 changed files with 30 additions and 55 deletions
--- a/intern/cycles/bvh/optix.cpp
+++ b/intern/cycles/bvh/optix.cpp
@ -30,15 +30,17 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
    : BVH(params_, geometry_, objects_),
      device(device),
      traversable_handle(0),
-      as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
-      motion_transform_data(device, "optix motion transform", false)
+      as_data(make_unique<device_only_memory<char>>(
+          device, params.top_level ? "optix tlas" : "optix blas", false)),
+      motion_transform_data(
+          make_unique<device_only_memory<char>>(device, "optix motion transform", false))
 {
 }

 BVHOptiX::~BVHOptiX()
 {
-  // Acceleration structure memory is delayed freed on device, since deleting the
-  // BVH may happen while still being used for rendering.
+  /* Acceleration structure memory is delayed freed on device, since deleting the
+   * BVH may happen while still being used for rendering. */
  device->release_optix_bvh(this);
 }

--- a/intern/cycles/bvh/optix.h
+++ b/intern/cycles/bvh/optix.h
@ -25,14 +25,16 @@

 #  include "device/memory.h"

+#  include "util/unique_ptr.h"
+
 CCL_NAMESPACE_BEGIN

 class BVHOptiX : public BVH {
 public:
  Device *device;
  uint64_t traversable_handle;
-  device_only_memory<char> as_data;
-  device_only_memory<char> motion_transform_data;
+  unique_ptr<device_only_memory<char>> as_data;
+  unique_ptr<device_only_memory<char>> motion_transform_data;

 protected:
  friend class BVH;
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@ -777,6 +777,7 @@ void CUDADevice::generic_free(device_memory &mem)
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(cuda_mem_map_mutex);
+    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
    const CUDAMem &cmem = cuda_mem_map[&mem];

    /* If cmem.use_mapped_host is true, reference counting is used
@ -1145,6 +1146,7 @@ void CUDADevice::tex_free(device_texture &mem)
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(cuda_mem_map_mutex);
+    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
    const CUDAMem &cmem = cuda_mem_map[&mem];

    if (cmem.texobject) {
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@ -745,6 +745,7 @@ void HIPDevice::generic_free(device_memory &mem)
  if (mem.device_pointer) {
    HIPContextScope scope(this);
    thread_scoped_lock lock(hip_mem_map_mutex);
+    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
    const HIPMem &cmem = hip_mem_map[&mem];

    /* If cmem.use_mapped_host is true, reference counting is used
@ -1114,6 +1115,7 @@ void HIPDevice::tex_free(device_texture &mem)
  if (mem.device_pointer) {
    HIPContextScope scope(this);
    thread_scoped_lock lock(hip_mem_map_mutex);
+    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
    const HIPMem &cmem = hip_mem_map[&mem];

    if (cmem.texobject) {
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@ -44,45 +44,6 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
 {
 }

-device_memory::device_memory(device_memory &&other) noexcept
-    : data_type(other.data_type),
-      data_elements(other.data_elements),
-      data_size(other.data_size),
-      device_size(other.device_size),
-      data_width(other.data_width),
-      data_height(other.data_height),
-      data_depth(other.data_depth),
-      type(other.type),
-      name(other.name),
-      device(other.device),
-      device_pointer(other.device_pointer),
-      host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer),
-      shared_counter(other.shared_counter),
-      original_device_ptr(other.original_device_ptr),
-      original_device_size(other.original_device_size),
-      original_device(other.original_device),
-      need_realloc_(other.need_realloc_),
-      modified(other.modified)
-{
-  other.data_elements = 0;
-  other.data_size = 0;
-  other.device_size = 0;
-  other.data_width = 0;
-  other.data_height = 0;
-  other.data_depth = 0;
-  other.device = 0;
-  other.device_pointer = 0;
-  other.host_pointer = 0;
-  other.shared_pointer = 0;
-  other.shared_counter = 0;
-  other.original_device_ptr = 0;
-  other.original_device_size = 0;
-  other.original_device = 0;
-  other.need_realloc_ = false;
-  other.modified = false;
-}
-
 device_memory::~device_memory()
 {
  assert(shared_pointer == 0);
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@ -281,11 +281,16 @@ class device_memory {

  /* Only create through subclasses. */
  device_memory(Device *device, const char *name, MemoryType type);
-  device_memory(device_memory &&other) noexcept;

-  /* No copying allowed. */
+  /* No copying and allowed.
+   *
+   * This is because device implementation might need to register device memory in an allocation
+   * map of some sort and use pointer as a key to identify blocks. Moving data from one place to
+   * another bypassing device allocation routines will make those maps hard to maintain. */
  device_memory(const device_memory &) = delete;
+  device_memory(device_memory &&other) noexcept = delete;
  device_memory &operator=(const device_memory &) = delete;
+  device_memory &operator=(device_memory &&) = delete;

  /* Host allocation on the device. All host_pointer memory should be
   * allocated with these functions, for devices that support using
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@ -1032,7 +1032,7 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
    return false;
  }

-  device_only_memory<char> &out_data = bvh->as_data;
+  device_only_memory<char> &out_data = *bvh->as_data;
  if (operation == OPTIX_BUILD_OPERATION_BUILD) {
    assert(out_data.device == this);
    out_data.alloc_to_device(sizes.outputSizeInBytes);
@ -1123,7 +1123,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
      operation = OPTIX_BUILD_OPERATION_UPDATE;
    }
    else {
-      bvh_optix->as_data.free();
+      bvh_optix->as_data->free();
      bvh_optix->traversable_handle = 0;
    }

@ -1344,9 +1344,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
    unsigned int num_instances = 0;
    unsigned int max_num_instances = 0xFFFFFFFF;

-    bvh_optix->as_data.free();
+    bvh_optix->as_data->free();
    bvh_optix->traversable_handle = 0;
-    bvh_optix->motion_transform_data.free();
+    bvh_optix->motion_transform_data->free();

    optixDeviceContextGetProperty(context,
                                  OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
@ -1379,8 +1379,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
        }
      }

-      assert(bvh_optix->motion_transform_data.device == this);
-      bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+      assert(bvh_optix->motion_transform_data->device == this);
+      bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
    }

    for (Object *ob : bvh->objects) {
@ -1441,7 +1441,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)

        motion_transform_offset = align_up(motion_transform_offset,
                                           OPTIX_TRANSFORM_BYTE_ALIGNMENT);
-        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
                                           motion_transform_offset;
        motion_transform_offset += motion_transform_size;

--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@ -23,6 +23,7 @@
 #  include "device/optix/queue.h"
 #  include "device/optix/util.h"
 #  include "kernel/types.h"
+#  include "util/unique_ptr.h"

 CCL_NAMESPACE_BEGIN

@ -76,7 +77,7 @@ class OptiXDevice : public CUDADevice {
  device_only_memory<KernelParamsOptiX> launch_params;
  OptixTraversableHandle tlas_handle = 0;

-  vector<device_only_memory<char>> delayed_free_bvh_memory;
+  vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
  thread_mutex delayed_free_bvh_mutex;

  class Denoiser {