HIP device code cleanup and fix for high VRAM usage

This patch cleans up the HIP device code and makes it more consistent with the CUDA code.
It also fixes high VRAM usage on AMD cards using HIP, allowing better performance and usability on cards like the 6600 XT.
Adds a check in intern/cycles/kernel/bvh/bvh_util.h to prevent a compiler error with hipcc.

Reviewed By: brecht, leesonw

Maniphest Tasks: T92124

Differential Revision: https://developer.blender.org/D12834
Sayak Biswas, 2021-10-20 13:37:39 +02:00 (committed by William Leeson)
parent d28aaf6139
commit ba4e227def
Notes: blender-bot 2023-02-14 05:44:22 +01:00
Referenced by issue #92124, Fix high device memory usage on HIP device
6 changed files with 69 additions and 168 deletions

CMakeLists.txt

@@ -430,7 +430,7 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
-option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF)
+option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" ON)
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)

extern/hipew/include/hipew.h

@@ -24,9 +24,13 @@ extern "C" {
#include <stdlib.h>
#define HIP_IPC_HANDLE_SIZE 64
#define hipHostMallocDefault 0x00
#define hipHostMallocPortable 0x01
#define hipHostMallocMapped 0x02
#define hipHostMallocWriteCombined 0x04
#define hipHostMallocNumaUser 0x20000000
#define hipHostMallocCoherent 0x40000000
#define hipHostMallocNonCoherent 0x80000000
#define hipHostRegisterPortable 0x01
#define hipHostRegisterMapped 0x02
#define hipHostRegisterIoMemory 0x04
@@ -989,7 +993,7 @@ typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize);
typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr);
typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
-typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize);
+typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize, unsigned int flags);
typedef hipError_t HIPAPI thipHostFree(void* p);
typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags);
typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags);
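Since hipew resolves the HIP runtime at run time instead of linking against it, this typedef has to match the driver's real three-argument signature. A minimal sketch of that dynamic-binding pattern, not hipew's actual loader code (the loader function, library-handle handling, and the simplified hipError_t are illustrative):

#include <dlfcn.h>
#include <stddef.h>

typedef int hipError_t; /* simplified stand-in for the real enum */
typedef hipError_t (*thipHostMalloc)(void **pp, size_t bytesize, unsigned int flags);

static thipHostMalloc fn_hipHostMalloc = NULL;

/* Look up hipHostMalloc in the HIP runtime. A stale two-argument typedef
 * would still bind here, but the flags argument would be garbage at call
 * time, which is why the typedef above must track the real signature. */
static bool load_hip_host_malloc(void)
{
  void *lib = dlopen("libamdhip64.so", RTLD_NOW);
  if (lib == NULL) {
    return false;
  }
  fn_hipHostMalloc = (thipHostMalloc)dlsym(lib, "hipHostMalloc");
  return fn_hipHostMalloc != NULL;
}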

intern/cycles/device/hip/device_impl.cpp

@@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
return;
}
+/* hipDeviceMapHost for mapping host memory when out of device memory.
+ * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
hip_assert(
@@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
-mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+mem_alloc_result = hipHostMalloc(
+&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
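This flag change is the heart of the VRAM fix: the fallback host allocation is now created as device-mappable (and write-combined) rather than as a plain pinned buffer, so the GPU can read it in place instead of duplicating it in VRAM. A rough sketch of the pattern against the standard HIP runtime API, with error handling reduced (this is not the full generic_alloc logic):

/* Sketch: fall back to device-visible host memory when VRAM runs low. */
static hipError_t alloc_mapped_host(size_t size, void **host_ptr, hipDeviceptr_t *device_ptr)
{
  /* Pinned, mapped, write-combined: the host fills the buffer once and the
   * device reads it many times, so write-combining is a good fit. */
  hipError_t result = hipHostMalloc(
      host_ptr, size, hipHostMallocMapped | hipHostMallocWriteCombined);
  if (result != hipSuccess) {
    return result;
  }
  /* Device-side pointer aliasing the same physical host memory. */
  return hipHostGetDevicePointer(device_ptr, *host_ptr, 0);
}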
@@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
size_t bytes;
hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
assert(bytes == size);
hip_assert(hipMemcpyHtoD(mem, host, size));
}
@@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem)
}
}
-# if 0
-void HIPDevice::render(DeviceTask &task,
-RenderTile &rtile,
-device_vector<KernelWorkTile> &work_tiles)
-{
-scoped_timer timer(&rtile.buffers->render_time);
-if (have_error())
-return;
-HIPContextScope scope(this);
-hipFunction_t hipRender;
-/* Get kernel function. */
-if (rtile.task == RenderTile::BAKE) {
-hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
-}
-else {
-hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
-}
-if (have_error()) {
-return;
-}
-hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
-/* Allocate work tile. */
-work_tiles.alloc(1);
-KernelWorkTile *wtile = work_tiles.data();
-wtile->x = rtile.x;
-wtile->y = rtile.y;
-wtile->w = rtile.w;
-wtile->h = rtile.h;
-wtile->offset = rtile.offset;
-wtile->stride = rtile.stride;
-wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
-/* Prepare work size. More step samples render faster, but for now we
-* remain conservative for GPUs connected to a display to avoid driver
-* timeouts and display freezing. */
-int min_blocks, num_threads_per_block;
-hip_assert(
-hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
-if (!info.display_device) {
-min_blocks *= 8;
-}
-uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-/* Render all samples. */
-uint start_sample = rtile.start_sample;
-uint end_sample = rtile.start_sample + rtile.num_samples;
-for (int sample = start_sample; sample < end_sample;) {
-/* Setup and copy work tile to device. */
-wtile->start_sample = sample;
-wtile->num_samples = step_samples;
-if (task.adaptive_sampling.use) {
-wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
-}
-wtile->num_samples = min(wtile->num_samples, end_sample - sample);
-work_tiles.copy_to_device();
-hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
-uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-/* Launch kernel. */
-void *args[] = {&d_work_tiles, &total_work_size};
-hip_assert(
-hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-/* Run the adaptive sampling kernels at selected samples aligned to step samples. */
-uint filter_sample = sample + wtile->num_samples - 1;
-if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
-adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
-}
-hip_assert(hipDeviceSynchronize());
-/* Update progress. */
-sample += wtile->num_samples;
-rtile.sample = sample;
-task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-if (task.get_cancel()) {
-if (task.need_finish_queue == false)
-break;
-}
-}
-/* Finalize adaptive sampling. */
-if (task.adaptive_sampling.use) {
-hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
-adaptive_sampling_post(rtile, wtile, d_work_tiles);
-hip_assert(hipDeviceSynchronize());
-task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-}
-}
-void HIPDevice::thread_run(DeviceTask &task)
-{
-HIPContextScope scope(this);
-if (task.type == DeviceTask::RENDER) {
-device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-/* keep rendering tiles until done */
-RenderTile tile;
-DenoisingTask denoising(this, task);
-while (task.acquire_tile(this, tile, task.tile_types)) {
-if (tile.task == RenderTile::PATH_TRACE) {
-render(task, tile, work_tiles);
-}
-else if (tile.task == RenderTile::BAKE) {
-render(task, tile, work_tiles);
-}
-task.release_tile(tile);
-if (task.get_cancel()) {
-if (task.need_finish_queue == false)
-break;
-}
-}
-work_tiles.free();
-}
-}
-# endif
unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
{
return make_unique<HIPDeviceQueue>(this);

intern/cycles/device/hip/queue.cpp

@@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue()
hipStreamDestroy(hip_stream_);
}
-int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
{
-/* TODO: compute automatically. */
-/* TODO: must have at least num_threads_per_block. */
-return 14416128;
+int num_states = 0;
+const int max_num_threads = hip_device_->get_num_multiprocessors() *
+hip_device_->get_max_num_threads_per_multiprocessor();
+if (max_num_threads == 0) {
+num_states = 1048576; // 65536 * 16
+}
+else {
+num_states = max_num_threads * 16;
+}
+const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+if (factor_str) {
+float factor = atof(factor_str);
+if (!factor)
+VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+num_states = max((int)(num_states * factor), 1024);
+}
+VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+<< string_human_readable_size(num_states * state_size);
+return num_states;
}
int HIPDeviceQueue::num_concurrent_busy_states() const
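This is where the high VRAM usage came from: the old hard-coded 14,416,128 states sized the path-tracing state buffers far beyond what smaller GPUs need. The new count scales with the actual hardware. As a worked example with assumed, not queried, device figures:

/* Illustrative only: the device numbers are assumptions for this example. */
const int num_multiprocessors = 32;  /* e.g. an RX 6600 XT exposes 32 CUs */
const int max_threads_per_mp = 2048; /* assumed value for illustration */
const int num_states = num_multiprocessors * max_threads_per_mp * 16;
/* 32 * 2048 * 16 = 1,048,576 states, a small fraction of the old constant. */

Setting CYCLES_CONCURRENT_STATES_FACTOR then scales that count (clamped to at least 1024 states), giving users a memory/performance knob without rebuilding.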
@@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
}
/* Launch kernel. */
-hip_device_assert(hip_device_,
-hipModuleLaunchKernel(hip_kernel.function,
-num_blocks,
-1,
-1,
-num_threads_per_block,
-1,
-1,
-shared_mem_bytes,
-hip_stream_,
-args,
-0));
+assert_success(hipModuleLaunchKernel(hip_kernel.function,
+num_blocks,
+1,
+1,
+num_threads_per_block,
+1,
+1,
+shared_mem_bytes,
+hip_stream_,
+args,
+0),
+"enqueue");
return !(hip_device_->have_error());
}
@@ -127,7 +147,7 @@ bool HIPDeviceQueue::synchronize()
}
const HIPContextScope scope(hip_device_);
-hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+assert_success(hipStreamSynchronize(hip_stream_), "synchronize");
debug_synchronize();
return !(hip_device_->have_error());
@@ -150,9 +170,9 @@ void HIPDeviceQueue::zero_to_device(device_memory &mem)
assert(mem.device_pointer != 0);
const HIPContextScope scope(hip_device_);
-hip_device_assert(
-hip_device_,
-hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+assert_success(
+hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_),
+"zero_to_device");
}
void HIPDeviceQueue::copy_to_device(device_memory &mem)
@@ -173,10 +193,10 @@ void HIPDeviceQueue::copy_to_device(device_memory &mem)
/* Copy memory to device. */
const HIPContextScope scope(hip_device_);
-hip_device_assert(
-hip_device_,
+assert_success(
hipMemcpyHtoDAsync(
-(hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+(hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_),
+"copy_to_device");
}
void HIPDeviceQueue::copy_from_device(device_memory &mem)
@@ -192,13 +212,21 @@ void HIPDeviceQueue::copy_from_device(device_memory &mem)
/* Copy memory from device. */
const HIPContextScope scope(hip_device_);
-hip_device_assert(
-hip_device_,
+assert_success(
hipMemcpyDtoHAsync(
-mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_),
+"copy_from_device");
}
+void HIPDeviceQueue::assert_success(hipError_t result, const char *operation)
+{
+if (result != hipSuccess) {
+const char *name = hipewErrorString(result);
+hip_device_->set_error(
+string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+}
+}
// TODO : (Arya) Enable this after stabilizing dev branch
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
return make_unique<HIPDeviceGraphicsInterop>(this);

intern/cycles/device/hip/queue.h

@@ -55,12 +55,13 @@ class HIPDeviceQueue : public DeviceQueue {
return hip_stream_;
}
// TODO : (Arya) Enable this after stabilizing the dev branch
virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
protected:
HIPDevice *hip_device_;
hipStream_t hip_stream_;
+void assert_success(hipError_t result, const char *operation);
};
CCL_NAMESPACE_END

intern/cycles/kernel/bvh/bvh_util.h

@@ -98,7 +98,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
for (int j = 0; j < num_hits - 1; ++j) {
if (hits[j].t > hits[j + 1].t) {
struct Intersection tmp_hit = hits[j];
-struct float3 tmp_Ng = Ng[j];
+float3 tmp_Ng = Ng[j];
hits[j] = hits[j + 1];
Ng[j] = Ng[j + 1];
hits[j + 1] = tmp_hit;
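For reference, the hipcc error exists because HIP's headers define float3 as a typedef of a vector type rather than as a struct with that tag, so the elaborated specifier "struct float3" does not name a type under hipcc. A minimal sketch of the failure mode, with float3 stubbed as a plain typedef (the stub is illustrative, not HIP's actual definition):

/* Stub: 'float3' as a typedef without a struct tag, mimicking the HIP headers. */
typedef struct {
  float x, y, z;
} float3;

void swap_normals(float3 *Ng, int j)
{
  float3 tmp_Ng = Ng[j]; /* OK: plain typedef name, compiles everywhere. */
  /* struct float3 tmp_Ng = Ng[j];  <- error under hipcc: 'float3' is a
   *                                   typedef, not a declared struct tag. */
  Ng[j] = Ng[j + 1];
  Ng[j + 1] = tmp_Ng;
}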