Fix T91922: Cycles artifacts with high volume nested level

Allocate the volume stack conditionally, potentially based on the actual nesting level of objects in the scene. Currently the nesting level is estimated by the number of volume objects. This is an inexpensive check which is probably enough in practice to get almost perfect memory usage and performance. The conditional allocation is a bit tricky. For the CPU we declare and define the maximum possible volume stack, because there are only so many integrator states on the CPU. On the GPU we declare the outer SoA to have all volume stack elements, but only allocate the ones actually needed. The volume stack size actually used is passed as a pre-processor definition, which seems to be the easiest and fastest option for the GPU state copy. There seems to be no speed regression in the demo files on an RTX6000. Note that scenes with a high volume nesting level will now be slower but correct. Differential Revision: https://developer.blender.org/D12759
Referenced by issue #91922, Cycles X Particle Alpha
Referenced by issue #91812, VDB overlapping volume rendering artifacts
2021-10-05 15:05:12 +02:00 · 2021-10-05 15:05:12 +02:00 · c6275da852 · 2023-02-14 10:04:50 +01:00
parent e41dddd29a
commit c6275da852
12 changed files with 112 additions and 23 deletions
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@ -23,6 +23,7 @@
 #include "render/buffers.h"
 #include "render/scene.h"
 #include "util/util_logging.h"
+#include "util/util_string.h"
 #include "util/util_tbb.h"
 #include "util/util_time.h"

@ -30,7 +31,7 @@

 CCL_NAMESPACE_BEGIN

-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(DeviceScene *device_scene)
 {
  size_t state_size = 0;

@ -45,12 +46,14 @@ static size_t estimate_single_state_size()
    break; \
  } \
  }
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene->data.volume_stack_size)
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE

  return state_size;
 }
@ -72,7 +75,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
      num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
      work_tiles_(device, "work_tiles", MEM_READ_WRITE),
      display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
-      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
+      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size(device_scene))),
      min_num_active_paths_(queue_->num_concurrent_busy_states()),
      max_active_path_index_(0)
 {
@ -125,12 +128,23 @@ void PathTraceWorkGPU::alloc_integrator_soa()
    break; \
  } \
  }
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene_->data.volume_stack_size)
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+
+  if (VLOG_IS_ON(3)) {
+    size_t total_soa_size = 0;
+    for (auto &&soa_memory : integrator_state_soa_) {
+      total_soa_size += soa_memory->memory_size();
+    }
+
+    VLOG(3) << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
+  }
 }

 void PathTraceWorkGPU::alloc_integrator_queue()
--- a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@ -38,10 +38,13 @@ ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_A
  volume_ray.P = from_P;
  volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);

+  /* Store to avoid global fetches on every intersection step. */
+  const uint volume_stack_size = kernel_data.volume_stack_size;
+
 #ifdef __VOLUME_RECORD_ALL__
-  Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+  Intersection hits[2 * volume_stack_size + 1];
  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+      kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
  if (num_hits > 0) {
    Intersection *isect = hits;

@ -55,7 +58,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_A
 #else
  Intersection isect;
  int step = 0;
-  while (step < 2 * VOLUME_STACK_SIZE &&
+  while (step < 2 * volume_stack_size &&
         scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
    shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
    volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
@ -91,12 +94,15 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
    stack_index++;
  }

+  /* Store to avoid global fetches on every intersection step. */
+  const uint volume_stack_size = kernel_data.volume_stack_size;
+
 #ifdef __VOLUME_RECORD_ALL__
-  Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+  Intersection hits[2 * volume_stack_size + 1];
  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
  if (num_hits > 0) {
-    int enclosed_volumes[VOLUME_STACK_SIZE];
+    int enclosed_volumes[volume_stack_size];
    Intersection *isect = hits;

    qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
@ -121,7 +127,7 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
            break;
          }
        }
-        if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+        if (need_add && stack_index < volume_stack_size - 1) {
          const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
          integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
          ++stack_index;
@ -136,11 +142,12 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
    }
  }
 #else
-  int enclosed_volumes[VOLUME_STACK_SIZE];
+  /* CUDA does not support the definition of variable-size arrays, so use the maximum possible size. */
+  int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
  int step = 0;

-  while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
-         step < 2 * VOLUME_STACK_SIZE) {
+  while (stack_index < volume_stack_size - 1 && enclosed_index < volume_stack_size - 1 &&
+         step < 2 * volume_stack_size) {
    Intersection isect;
    if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
      break;
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@ -59,8 +59,6 @@ CCL_NAMESPACE_BEGIN
 *
 * TODO: these could be made dynamic depending on the features used in the scene. */

-#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
-
 #define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024
 #define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4

@ -85,12 +83,14 @@ typedef struct IntegratorStateCPU {
 #define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
  name[cpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 } IntegratorStateCPU;

 /* Path Queue
@ -114,12 +114,14 @@ typedef struct IntegratorStateGPU {
 #define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
  name[gpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE

  /* Count number of queued kernels. */
  IntegratorQueueCounter *queue_counter;
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@ -107,7 +107,9 @@ KERNEL_STRUCT_END(subsurface)
 KERNEL_STRUCT_BEGIN(volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(volume_stack,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE)

 /********************************* Shadow Path State **************************/

@ -163,5 +165,5 @@ KERNEL_STRUCT_BEGIN(shadow_volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
-                        INTEGRATOR_VOLUME_STACK_SIZE,
-                        INTEGRATOR_VOLUME_STACK_SIZE)
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE)
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@ -155,7 +155,7 @@ ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_
 ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
 {
  if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
-    for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+    for (int i = 0; i < kernel_data.volume_stack_size; i++) {
      INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
          volume_stack, i, object);
      INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
@ -223,6 +223,8 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
    while (index < gpu_array_size) \
      ;

+#  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
 #  include "kernel/integrator/integrator_state_template.h"

 #  undef KERNEL_STRUCT_BEGIN
@ -230,6 +232,7 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
 #  undef KERNEL_STRUCT_ARRAY_MEMBER
 #  undef KERNEL_STRUCT_END
 #  undef KERNEL_STRUCT_END_ARRAY
+#  undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 }

 ccl_device_inline void integrator_state_move(const IntegratorState to_state,
--- a/intern/cycles/kernel/integrator/integrator_volume_stack.h
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@ -72,7 +72,7 @@ ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS,
    }

    /* If we exceed the stack limit, ignore. */
-    if (i >= VOLUME_STACK_SIZE - 1) {
+    if (i >= kernel_data.volume_stack_size - 1) {
      return;
    }

--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -61,8 +61,6 @@ CCL_NAMESPACE_BEGIN
 #define ID_NONE (0.0f)
 #define PASS_UNUSED (~0)

-#define VOLUME_STACK_SIZE 4
-
 /* Kernel features */
 #define __SOBOL__
 #define __DPDU__
@ -608,6 +606,12 @@ typedef struct AttributeDescriptor {
 #  define MAX_CLOSURE __MAX_CLOSURE__
 #endif

+#ifndef __MAX_VOLUME_STACK_SIZE__
+#  define MAX_VOLUME_STACK_SIZE 32
+#else
+#  define MAX_VOLUME_STACK_SIZE __MAX_VOLUME_STACK_SIZE__
+#endif
+
 #define MAX_VOLUME_CLOSURE 8

 /* This struct is the base class for all closures. The common members are
@ -1223,7 +1227,7 @@ typedef struct KernelData {
  uint kernel_features;
  uint max_closures;
  uint max_shaders;
-  uint pad;
+  uint volume_stack_size;

  KernelCamera cam;
  KernelFilm film;
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@ -1149,7 +1149,9 @@ int ShaderGraph::get_num_closures()
      num_closures += 8;
    }
    else if (CLOSURE_IS_VOLUME(closure_type)) {
-      num_closures += VOLUME_STACK_SIZE;
+      /* TODO(sergey): Verify this is still needed, since we have special minimized volume storage
+       * for the volume steps. */
+      num_closures += MAX_VOLUME_STACK_SIZE;
    }
    else if (closure_type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
      num_closures += 4;
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@ -366,6 +366,22 @@ float Object::compute_volume_step_size() const
  return step_size;
 }

+bool Object::check_is_volume() const
+{
+  if (geometry->geometry_type == Geometry::VOLUME) {
+    return true;
+  }
+
+  for (Node *node : get_geometry()->get_used_shaders()) {
+    const Shader *shader = static_cast<const Shader *>(node);
+    if (shader->has_volume_connected) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 int Object::get_device_index() const
 {
  return index;
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@ -109,6 +109,13 @@ class Object : public Node {
  /* Compute step size from attributes, shaders, transforms. */
  float compute_volume_step_size() const;

+  /* Check whether this object requires volume sampling (and hence might require space in the
+   * volume stack).
+   *
+   * Note that this is a naive iteration over shaders, which allows accessing this information
+   * prior to `scene_update()`. */
+  bool check_is_volume() const;
+
 protected:
  /* Specifies the position of the object in scene->objects and
   * in the device vectors. Gets set in device_update. */
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@ -527,6 +527,8 @@ void Scene::update_kernel_features()
  const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE;
  dscene.data.max_closures = max_closures;
  dscene.data.max_shaders = shaders.size();
+
+  dscene.data.volume_stack_size = get_volume_stack_size();
 }

 bool Scene::update(Progress &progress)
@ -642,6 +644,33 @@ int Scene::get_max_closure_count()
  return max_closure_global;
 }

+int Scene::get_volume_stack_size() const
+{
+  /* Quick non-expensive check. Can over-estimate maximum possible nested level, but does not
+   * require expensive calculation during pre-processing. */
+  int num_volume_objects = 0;
+  for (const Object *object : objects) {
+    if (object->check_is_volume()) {
+      ++num_volume_objects;
+    }
+
+    if (num_volume_objects == MAX_VOLUME_STACK_SIZE) {
+      break;
+    }
+  }
+
+  /* Count background world for the stack. */
+  const Shader *background_shader = background->get_shader(this);
+  if (background_shader && background_shader->has_volume_connected) {
+    ++num_volume_objects;
+  }
+
+  /* Space for terminator. */
+  ++num_volume_objects;
+
+  return min(num_volume_objects, MAX_VOLUME_STACK_SIZE);
+}
+
 bool Scene::has_shadow_catcher()
 {
  if (shadow_catcher_modified_) {
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@ -344,6 +344,9 @@ class Scene : public NodeOwner {
  /* Get maximum number of closures to be used in kernel. */
  int get_max_closure_count();

+  /* Get the size of the volume stack needed to render this scene. */
+  int get_volume_stack_size() const;
+
  template<typename T> void delete_node_impl(T *node)
  {
    delete node;