Cycles: reduce kernel reserved local memory when not using shader raytracing

Ref T87836
Referenced by issue #87836, Cycles: GPU Performance
2021-10-20 14:21:01 +02:00 · 2021-10-20 14:21:01 +02:00 · 001f548227 · 2023-02-14 06:00:49 +01:00
parent 2537b32392
commit 001f548227
2 changed files with 8 additions and 4 deletions
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -454,7 +454,7 @@ bool CUDADevice::load_kernels(const uint kernel_features)
  return (result == CUDA_SUCCESS);
 }

-void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
+void CUDADevice::reserve_local_memory(const uint kernel_features)
 {
  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
   * needed for kernel launches, so that we can reliably figure out when
@@ -468,7 +468,9 @@ void CUDADevice::reserve_local_memory(const uint /* kernel_features */)

  {
    /* Use the biggest kernel for estimation. */
-    const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
+                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

    /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
     * multiprocessors. It would be good to do this in parallel for the multi GPU case
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -430,7 +430,7 @@ bool HIPDevice::load_kernels(const uint kernel_features)
  return (result == hipSuccess);
 }

-void HIPDevice::reserve_local_memory(const uint)
+void HIPDevice::reserve_local_memory(const uint kernel_features)
 {
  /* Together with hipDeviceLmemResizeToMax, this reserves local memory
   * needed for kernel launches, so that we can reliably figure out when
@@ -444,7 +444,9 @@ void HIPDevice::reserve_local_memory(const uint)

  {
    /* Use the biggest kernel for estimation. */
-    const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
+                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

    /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
     * multiprocessors. It would be good to do this in parallel for the multi GPU case