Cycles: decouple shadow paths from main path on GPU

The motivation for this is twofold: it improves performance (5-10% on most
benchmark scenes), and it will help to bring back transparency support for the
ambient occlusion pass.

* Duplicate some members from the main path state in the shadow path state.
* Add shadow paths incrementally to the array similar to what we do for
  the shadow catchers.
* For the scheduling, allow running shade surface and shade volume kernels
  as long as there is enough space in the shadow paths array. If not, execute
  shadow kernels until it is empty (see the scheduling sketch after this list).
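
As a rough illustration of the incremental allocation and scheduling idea, here
is a minimal standalone C++ sketch. The names and structure are hypothetical
simplifications for this commit message, not the actual Cycles code (the real
logic lives in PathTraceWorkGPU below):

    #include <atomic>

    /* Hypothetical stand-ins so the sketch is self-contained. */
    enum DeviceKernel { INTERSECT_SHADOW, SHADE_SHADOW, SHADE_SURFACE, SHADE_VOLUME, NUM_KERNELS };

    struct SchedulerSketch {
      int max_num_paths = 1 << 20;
      std::atomic<int> next_shadow_path_index{0}; /* Grows as shadow paths are added. */
      int num_queued[NUM_KERNELS] = {};

      /* A shade kernel claims the next slot in the shadow path array when it
       * branches off a shadow ray, similar to shadow catcher splits. */
      int alloc_shadow_path() { return next_shadow_path_index.fetch_add(1); }

      /* Shade surface/volume only run while every queued item can still get a
       * shadow slot; otherwise shadow kernels run until the array drains. */
      DeviceKernel pick_kernel(DeviceKernel wanted) {
        const bool creates_shadow = (wanted == SHADE_SURFACE || wanted == SHADE_VOLUME);
        const int space_left = max_num_paths - next_shadow_path_index.load();
        if (creates_shadow && space_left < num_queued[wanted]) {
          return num_queued[INTERSECT_SHADOW] ? INTERSECT_SHADOW : SHADE_SHADOW;
        }
        return wanted;
      }
    };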

* Add IntegratorShadowState and ConstIntegratorShadowState typedefs that
  can be different between CPU and GPU. For GPU, both main and shadow paths
  just have an integer for SoA access. But with CPU it's a different pointer
  type, so we get type safety checks in code shared between CPU and GPU.
* For CPU, add a separate IntegratorShadowStateCPU struct embedded in
  IntegratorShadowState.
* Update various functions to take the shadow state, and make SVM take either
  type of state using templates (see the sketch after this list).
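
A reduced sketch of the typedef split and the templated shared code. This is a
simplification that omits the real struct members and qualifiers; the typedef
names match the commit, while eval_nodes_sketch is a hypothetical placeholder
for shared code such as SVM evaluation:

    #ifdef __KERNEL_GPU__
    /* GPU: both states are just an index used for SoA access. */
    typedef const int ConstIntegratorState;
    typedef const int ConstIntegratorShadowState;
    #else
    /* CPU: distinct pointer types, so mixing main and shadow state in shared
     * code is a compile error. */
    struct IntegratorShadowStateCPU { /* shadow_path, shadow_ray, ... */ };
    struct IntegratorStateCPU {
      /* main path members ... */
      IntegratorShadowStateCPU shadow;
    };
    typedef const IntegratorStateCPU *ConstIntegratorState;
    typedef const IntegratorShadowStateCPU *ConstIntegratorShadowState;
    #endif

    /* Code shared between main and shadow paths takes either state type
     * through a template parameter. */
    template<typename ConstIntegratorGenericState>
    void eval_nodes_sketch(ConstIntegratorGenericState state)
    {
      (void)state; /* Would read from the given state here. */
    }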

Differential Revision: https://developer.blender.org/D12889
Brecht Van Lommel 2021-10-17 16:22:20 +02:00 committed by Brecht Van Lommel
parent 6e473a897c
commit 943e73b07e
30 changed files with 550 additions and 277 deletions

View File

@ -52,7 +52,11 @@ static size_t estimate_single_state_size()
* For until then use common value. Currently this size is only used for logging, but is weak to
* rely on this. */
#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
#include "kernel/integrator/integrator_state_template.h"
#include "kernel/integrator/integrator_shadow_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
@ -74,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
integrator_next_shadow_catcher_path_index_(
device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
queued_paths_(device, "queued_paths", MEM_READ_WRITE),
@ -138,7 +144,11 @@ void PathTraceWorkGPU::alloc_integrator_soa()
} \
}
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
#include "kernel/integrator/integrator_state_template.h"
#include "kernel/integrator/integrator_shadow_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
@ -199,16 +209,22 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
void PathTraceWorkGPU::alloc_integrator_path_split()
{
if (integrator_next_shadow_catcher_path_index_.size() != 0) {
return;
if (integrator_next_shadow_path_index_.size() == 0) {
integrator_next_shadow_path_index_.alloc(1);
integrator_next_shadow_path_index_.zero_to_device();
integrator_state_gpu_.next_shadow_path_index =
(int *)integrator_next_shadow_path_index_.device_pointer;
}
integrator_next_shadow_catcher_path_index_.alloc(1);
/* TODO(sergey): Use queue? */
integrator_next_shadow_catcher_path_index_.zero_to_device();
if (integrator_next_shadow_catcher_path_index_.size() == 0) {
integrator_next_shadow_catcher_path_index_.alloc(1);
integrator_next_shadow_path_index_.data()[0] = 0;
integrator_next_shadow_catcher_path_index_.zero_to_device();
integrator_state_gpu_.next_shadow_catcher_path_index =
(int *)integrator_next_shadow_catcher_path_index_.device_pointer;
integrator_state_gpu_.next_shadow_catcher_path_index =
(int *)integrator_next_shadow_catcher_path_index_.device_pointer;
}
}
void PathTraceWorkGPU::alloc_work_memory()
@ -341,27 +357,45 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return false;
}
/* Finish shadows before potentially adding more shadow rays. We can only
* store one shadow ray in the integrator state.
/* If the number of shadow kernels dropped to zero, set the next shadow path
* index to zero as well.
*
* When there is a shadow catcher in the scene finish shadow rays before invoking intersect
* closest kernel since so that the shadow paths are writing to the pre-split state. */
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
(has_shadow_catcher() && kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST)) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
* TODO: use shadow path compaction to lower it more often instead of letting
* it fill up entirely? */
const int num_queued_shadow =
queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
if (num_queued_shadow == 0) {
if (integrator_next_shadow_path_index_.data()[0] != 0) {
integrator_next_shadow_path_index_.data()[0] = 0;
queue_->copy_to_device(integrator_next_shadow_path_index_);
}
else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
return true;
}
/* For kernels that add shadow paths, check if there is enough space available.
* If not, schedule shadow kernels first to clear out the shadow paths. */
if (kernel_creates_shadow_paths(kernel)) {
if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
queue_counter->num_queued[kernel]) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
}
else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
return true;
}
}
}
/* Schedule kernel with maximum number of queued items. */
enqueue_path_iteration(kernel);
/* Update next shadow path index for kernels that can add shadow paths. */
if (kernel_creates_shadow_paths(kernel)) {
queue_->copy_from_device(integrator_next_shadow_path_index_);
}
return true;
}
@ -370,13 +404,12 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
void *d_path_index = (void *)NULL;
/* Create array of path indices for which this kernel is queued to be executed. */
int work_size = max_active_path_index_;
int work_size = kernel_max_active_path_index(kernel);
IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
int num_queued = queue_counter->num_queued[kernel];
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
if (kernel_uses_sorting(kernel)) {
/* Compute array of active paths, sorted by shader. */
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
@ -387,8 +420,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
if (kernel_is_shadow_path(kernel)) {
/* Compute array of active shadow paths for specific kernel. */
compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
}
@ -452,7 +484,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
{
/* TODO: this could be smaller for terminated paths based on amount of work we want
* to schedule. */
const int work_size = max_active_path_index_;
const int work_size = kernel_max_active_path_index(queued_kernel);
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
@ -481,7 +513,7 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
const int work_size = max_active_path_index_;
const int work_size = kernel_max_active_path_index(queued_kernel);
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {
@ -981,4 +1013,29 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
return num_queued_paths_.data()[0];
}
bool PathTraceWorkGPU::kernel_uses_sorting(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
}
bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
}
int PathTraceWorkGPU::kernel_max_active_path_index(DeviceKernel kernel)
{
return (kernel_is_shadow_path(kernel)) ? integrator_next_shadow_path_index_.data()[0] :
max_active_path_index_;
}
CCL_NAMESPACE_END

View File

@ -113,6 +113,12 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Count how many currently scheduled paths can still split. */
int shadow_catcher_count_possible_splits();
/* Kernel properties. */
bool kernel_uses_sorting(DeviceKernel kernel);
bool kernel_creates_shadow_paths(DeviceKernel kernel);
bool kernel_is_shadow_path(DeviceKernel kernel);
int kernel_max_active_path_index(DeviceKernel kernel);
/* Integrator queue. */
unique_ptr<DeviceQueue> queue_;
@ -131,6 +137,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_sort_counter_;
device_vector<int> integrator_shader_raytrace_sort_counter_;
/* Path split. */
device_vector<int> integrator_next_shadow_path_index_;
device_vector<int> integrator_next_shadow_catcher_path_index_;
/* Temporary buffer to get an array of queued path for a particular kernel. */

View File

@ -236,6 +236,7 @@ set(SRC_INTEGRATOR_HEADERS
integrator/integrator_shade_shadow.h
integrator/integrator_shade_surface.h
integrator/integrator_shade_volume.h
integrator/integrator_shadow_state_template.h
integrator/integrator_state.h
integrator/integrator_state_flow.h
integrator/integrator_state_template.h

View File

@ -69,6 +69,18 @@ CCL_NAMESPACE_BEGIN
# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
#endif
/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
* that it does not contain unused fields. */
#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
IntegratorStateCPU *state, \
KernelWorkTile *tile, \
ccl_global float *render_buffer) \
{ \
return KERNEL_INVOKE( \
name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
}
#define DEFINE_INTEGRATOR_KERNEL(name) \
void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
IntegratorStateCPU *state) \
@ -83,30 +95,32 @@ CCL_NAMESPACE_BEGIN
KERNEL_INVOKE(name, kg, state, render_buffer); \
}
/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
* that it does not contain unused fields. */
#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
IntegratorStateCPU *state, \
KernelWorkTile *tile, \
ccl_global float *render_buffer) \
#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \
void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
IntegratorStateCPU *state) \
{ \
return KERNEL_INVOKE( \
name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
KERNEL_INVOKE(name, kg, &state->shadow); \
}
#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \
void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
{ \
KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \
}
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
DEFINE_INTEGRATOR_KERNEL(intersect_closest)
DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow)
DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow)
/* --------------------------------------------------------------------
* Shader evaluation.

View File

@ -265,8 +265,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
{
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, [](const int state) {
return (INTEGRATOR_STATE(state, path, queued_kernel) != 0) ||
(INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
});
}
@ -278,8 +277,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
{
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices + indices_offset, num_indices, [](const int state) {
return (INTEGRATOR_STATE(state, path, queued_kernel) == 0) &&
(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
});
}
@ -303,9 +301,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
{
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, [num_active_paths](const int state) {
return (state >= num_active_paths) &&
((INTEGRATOR_STATE(state, path, queued_kernel) != 0) ||
(INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0));
return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
});
}

View File

@ -136,13 +136,6 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
else {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
/* If the split happened after bounce through a transparent object it's possible to have shadow
* patch. Make sure it is properly re-scheduled on the split path. */
const int shadow_kernel = INTEGRATOR_STATE(state, shadow_path, queued_kernel);
if (shadow_kernel != 0) {
INTEGRATOR_SHADOW_PATH_INIT(shadow_kernel);
}
}
#endif
}

View File

@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
/* Visibility for the shadow ray. */
ccl_device_forceinline uint integrate_intersect_shadow_visibility(KernelGlobals kg,
ConstIntegratorState state)
ConstIntegratorShadowState state)
{
uint visibility = PATH_RAY_SHADOW;
@ -33,7 +33,7 @@ ccl_device_forceinline uint integrate_intersect_shadow_visibility(KernelGlobals
}
ccl_device bool integrate_intersect_shadow_opaque(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
ccl_private const Ray *ray,
const uint visibility)
{
@ -55,7 +55,7 @@ ccl_device bool integrate_intersect_shadow_opaque(KernelGlobals kg,
}
ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals kg,
ConstIntegratorState state)
ConstIntegratorShadowState state)
{
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
const int transparent_bounce = INTEGRATOR_STATE(state, shadow_path, transparent_bounce);
@ -65,7 +65,7 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
#ifdef __TRANSPARENT_SHADOWS__
ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
ccl_private const Ray *ray,
const uint visibility)
{
@ -106,7 +106,7 @@ ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
}
#endif
ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowState state)
{
PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW);

View File

@ -39,14 +39,17 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
* TODO: investigate if we can use device side enqueue for GPUs to avoid
* having to compile this big kernel. */
while (true) {
if (INTEGRATOR_STATE(state, shadow_path, queued_kernel)) {
const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
&state->shadow, shadow_path, queued_kernel);
if (shadow_queued_kernel) {
/* First handle any shadow paths before we potentially create more shadow paths. */
switch (INTEGRATOR_STATE(state, shadow_path, queued_kernel)) {
switch (shadow_queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
integrator_intersect_shadow(kg, state);
integrator_intersect_shadow(kg, &state->shadow);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
integrator_shade_shadow(kg, state, render_buffer);
integrator_shade_shadow(kg, &state->shadow, render_buffer);
break;
default:
kernel_assert(0);

View File

@ -30,7 +30,7 @@ ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits)
#ifdef __TRANSPARENT_SHADOWS__
ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
const int hit)
{
PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
@ -69,7 +69,7 @@ ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
# ifdef __VOLUME__
ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
const int hit,
const int num_recorded_hits,
ccl_private float3 *ccl_restrict
@ -97,14 +97,14 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
shader_setup_from_volume(kg, shadow_sd, &ray);
const float step_size = volume_stack_step_size(
kg, state, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size);
}
# endif
ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
const int num_hits)
{
/* Accumulate shadow for transparent surfaces. */
@ -158,7 +158,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
#endif /* __TRANSPARENT_SHADOWS__ */
ccl_device void integrator_shade_shadow(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
ccl_global float *ccl_restrict render_buffer)
{
PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP);

View File

@ -167,17 +167,20 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray);
const bool is_light = light_sample_is_light(&ls);
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
/* Copy volume stack and enter/exit volume. */
integrator_state_copy_volume_stack_to_shadow(kg, state);
integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
if (is_transmission) {
# ifdef __VOLUME__
shadow_volume_stack_enter_exit(kg, state, sd);
shadow_volume_stack_enter_exit(kg, shadow_state, sd);
# endif
}
/* Write shadow ray and associated state to global memory. */
integrator_state_write_shadow_ray(kg, state, &ray);
integrator_state_write_shadow_ray(kg, shadow_state, &ray);
/* Copy state from main path to shadow path. */
const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
@ -191,20 +194,32 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
const float3 diffuse_glossy_ratio = (bounce == 0) ?
bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
INTEGRATOR_STATE_WRITE(state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
}
INTEGRATOR_STATE_WRITE(state, shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(state, shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(state, shadow_path, transparent_bounce) = transparent_bounce;
INTEGRATOR_STATE_WRITE(state, shadow_path, throughput) = throughput;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
state, path, render_pixel_index);
INTEGRATOR_STATE_WRITE(
shadow_state, shadow_path, rng_offset) = INTEGRATOR_STATE(state, path, rng_offset) -
PRNG_BOUNCE_NUM * transparent_bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
state, path, rng_hash);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
state, path, sample);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_bounce) = INTEGRATOR_STATE(
state, path, diffuse_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, glossy_bounce) = INTEGRATOR_STATE(
state, path, glossy_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = INTEGRATOR_STATE(
state, path, transmission_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
INTEGRATOR_STATE_WRITE(state, shadow_path, unshadowed_throughput) = throughput;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = throughput;
}
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
}
#endif

View File

@ -71,7 +71,7 @@ typedef struct VolumeShaderCoefficients {
/* Evaluate shader to get extinction coefficient at P. */
ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
ccl_private ShaderData *ccl_restrict sd,
ccl_private float3 *ccl_restrict extinction)
{
@ -187,7 +187,7 @@ ccl_device void volume_shadow_homogeneous(KernelGlobals kg, IntegratorState stat
/* heterogeneous volume: integrate stepping through the volume until we
* reach the end, get absorbed entirely, or run out of iterations */
ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
ccl_private Ray *ccl_restrict ray,
ccl_private ShaderData *ccl_restrict sd,
ccl_private float3 *ccl_restrict throughput,
@ -775,8 +775,11 @@ ccl_device_forceinline void integrate_volume_direct_light(
light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
const bool is_light = light_sample_is_light(ls);
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
/* Write shadow ray and associated state to global memory. */
integrator_state_write_shadow_ray(kg, state, &ray);
integrator_state_write_shadow_ray(kg, shadow_state, &ray);
/* Copy state from main path to shadow path. */
const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
@ -790,22 +793,34 @@ ccl_device_forceinline void integrate_volume_direct_light(
const float3 diffuse_glossy_ratio = (bounce == 0) ?
one_float3() :
INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
INTEGRATOR_STATE_WRITE(state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
}
INTEGRATOR_STATE_WRITE(state, shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(state, shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(state, shadow_path, transparent_bounce) = transparent_bounce;
INTEGRATOR_STATE_WRITE(state, shadow_path, throughput) = throughput_phase;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
state, path, render_pixel_index);
INTEGRATOR_STATE_WRITE(
shadow_state, shadow_path, rng_offset) = INTEGRATOR_STATE(state, path, rng_offset) -
PRNG_BOUNCE_NUM * transparent_bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
state, path, rng_hash);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
state, path, sample);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_bounce) = INTEGRATOR_STATE(
state, path, diffuse_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, glossy_bounce) = INTEGRATOR_STATE(
state, path, glossy_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = INTEGRATOR_STATE(
state, path, transmission_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput_phase;
if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
INTEGRATOR_STATE_WRITE(state, shadow_path, unshadowed_throughput) = throughput;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = throughput;
}
integrator_state_copy_volume_stack_to_shadow(kg, state);
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
}
# endif
@ -902,7 +917,7 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
/* Step through volume. */
const float step_size = volume_stack_step_size(
kg, state, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
/* TODO: expensive to zero closures? */
VolumeIntegrateResult result = {};

View File

@ -0,0 +1,83 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/********************************* Shadow Path State **************************/
KERNEL_STRUCT_BEGIN(shadow_path)
/* Index of a pixel within the device render buffer. */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
/* Current sample number. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
/* Random number generator seed. */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
/* Random number dimension offset. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
/* Current ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transparent ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current diffuse ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current glossy ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transmission ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING)
/* DeviceKernel bit indicating queued kernels. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
/* enum PathRayFlag */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
/* Throughput. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
/* Throughput for shadow pass. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
/* Ratio of throughput to distinguish diffuse and glossy render passes. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
/* Number of intersections found by ray-tracing. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_path)
/********************************** Shadow Ray *******************************/
KERNEL_STRUCT_BEGIN(shadow_ray)
KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_ray)
/*********************** Shadow Intersection result **************************/
/* Result from scene intersection. */
KERNEL_STRUCT_BEGIN(shadow_isect)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END_ARRAY(shadow_isect,
INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
INTEGRATOR_SHADOW_ISECT_SIZE_GPU)
/**************************** Shadow Volume Stack *****************************/
KERNEL_STRUCT_BEGIN(shadow_volume_stack)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
KERNEL_STRUCT_VOLUME_STACK_SIZE,
KERNEL_STRUCT_VOLUME_STACK_SIZE)

View File

@ -66,6 +66,25 @@ CCL_NAMESPACE_BEGIN
/* Integrator State
*
* CPU rendering path state with AoS layout. */
typedef struct IntegratorShadowStateCPU {
#define KERNEL_STRUCT_BEGIN(name) struct {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
#define KERNEL_STRUCT_END(name) \
} \
name;
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
} \
name[cpu_size];
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/integrator_shadow_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
} IntegratorShadowStateCPU;
typedef struct IntegratorStateCPU {
#define KERNEL_STRUCT_BEGIN(name) struct {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
@ -84,6 +103,8 @@ typedef struct IntegratorStateCPU {
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
IntegratorShadowStateCPU shadow;
} IntegratorStateCPU;
/* Path Queue
@ -108,7 +129,11 @@ typedef struct IntegratorStateGPU {
} \
name[gpu_size];
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/integrator_state_template.h"
#include "kernel/integrator/integrator_shadow_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
@ -122,7 +147,10 @@ typedef struct IntegratorStateGPU {
/* Count number of kernels queued for specific shaders. */
ccl_global int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
/* Index of path which will be used by a next shadow catcher split. */
/* Index of shadow path which will be used by a next shadow path. */
ccl_global int *next_shadow_path_index;
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_shadow_catcher_path_index;
} IntegratorStateGPU;
@ -140,6 +168,8 @@ typedef struct IntegratorStateGPU {
typedef IntegratorStateCPU *ccl_restrict IntegratorState;
typedef const IntegratorStateCPU *ccl_restrict ConstIntegratorState;
typedef IntegratorShadowStateCPU *ccl_restrict IntegratorShadowState;
typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
# define INTEGRATOR_STATE_NULL nullptr
@ -157,6 +187,8 @@ typedef const IntegratorStateCPU *ccl_restrict ConstIntegratorState;
typedef const int IntegratorState;
typedef const int ConstIntegratorState;
typedef const int IntegratorShadowState;
typedef const int ConstIntegratorShadowState;
# define INTEGRATOR_STATE_NULL -1

View File

@ -63,10 +63,12 @@ CCL_NAMESPACE_BEGIN
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
&kernel_integrator_state.next_shadow_path_index[0], 1); \
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
1); \
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
atomic_fetch_and_sub_uint32( \
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
@ -127,8 +129,9 @@ CCL_NAMESPACE_BEGIN
(void)current_kernel; \
}
# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
IntegratorShadowState shadow_state = &state->shadow; \
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
{ \
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \

View File

@ -28,6 +28,8 @@ KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRA
KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
/* Current ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transparent ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current diffuse ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current glossy ray bounce depth. */
@ -38,8 +40,6 @@ KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TR
KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current volume bounds ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transparent ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
/* DeviceKernel bit indicating queued kernels. */
KERNEL_STRUCT_MEMBER(path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
/* Random number generator seed. */
@ -107,57 +107,3 @@ KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_END_ARRAY(volume_stack,
KERNEL_STRUCT_VOLUME_STACK_SIZE,
KERNEL_STRUCT_VOLUME_STACK_SIZE)
/********************************* Shadow Path State **************************/
KERNEL_STRUCT_BEGIN(shadow_path)
/* Current ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transparent ray bounce depth. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
/* DeviceKernel bit indicating queued kernels. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
/* enum PathRayFlag */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
/* Throughput. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
/* Throughput for shadow pass. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
/* Ratio of throughput to distinguish diffuse and glossy render passes. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
/* Number of intersections found by ray-tracing. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_path)
/********************************** Shadow Ray *******************************/
KERNEL_STRUCT_BEGIN(shadow_ray)
KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_ray)
/*********************** Shadow Intersection result **************************/
/* Result from scene intersection. */
KERNEL_STRUCT_BEGIN(shadow_isect)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END_ARRAY(shadow_isect,
INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
INTEGRATOR_SHADOW_ISECT_SIZE_GPU)
/**************************** Shadow Volume Stack *****************************/
KERNEL_STRUCT_BEGIN(shadow_volume_stack)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
KERNEL_STRUCT_VOLUME_STACK_SIZE,
KERNEL_STRUCT_VOLUME_STACK_SIZE)

View File

@ -50,7 +50,7 @@ ccl_device_forceinline void integrator_state_read_ray(KernelGlobals kg,
/* Shadow Ray */
ccl_device_forceinline void integrator_state_write_shadow_ray(
KernelGlobals kg, IntegratorState state, ccl_private const Ray *ccl_restrict ray)
KernelGlobals kg, IntegratorShadowState state, ccl_private const Ray *ccl_restrict ray)
{
INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray->P;
INTEGRATOR_STATE_WRITE(state, shadow_ray, D) = ray->D;
@ -60,7 +60,7 @@ ccl_device_forceinline void integrator_state_write_shadow_ray(
}
ccl_device_forceinline void integrator_state_read_shadow_ray(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorShadowState state,
ccl_private Ray *ccl_restrict ray)
{
ray->P = INTEGRATOR_STATE(state, shadow_ray, P);
@ -122,7 +122,9 @@ ccl_device_forceinline bool integrator_state_volume_stack_is_empty(KernelGlobals
/* Shadow Intersection */
ccl_device_forceinline void integrator_state_write_shadow_isect(
IntegratorState state, ccl_private const Intersection *ccl_restrict isect, const int index)
IntegratorShadowState state,
ccl_private const Intersection *ccl_restrict isect,
const int index)
{
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, index, t) = isect->t;
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, index, u) = isect->u;
@ -133,7 +135,9 @@ ccl_device_forceinline void integrator_state_write_shadow_isect(
}
ccl_device_forceinline void integrator_state_read_shadow_isect(
ConstIntegratorState state, ccl_private Intersection *ccl_restrict isect, const int index)
ConstIntegratorShadowState state,
ccl_private Intersection *ccl_restrict isect,
const int index)
{
isect->prim = INTEGRATOR_STATE_ARRAY(state, shadow_isect, index, prim);
isect->object = INTEGRATOR_STATE_ARRAY(state, shadow_isect, index, object);
@ -143,8 +147,8 @@ ccl_device_forceinline void integrator_state_read_shadow_isect(
isect->t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, index, t);
}
ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(KernelGlobals kg,
IntegratorState state)
ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(
KernelGlobals kg, IntegratorShadowState shadow_state, ConstIntegratorState state)
{
if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
int index = 0;
@ -152,9 +156,9 @@ ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(KernelG
do {
shader = INTEGRATOR_STATE_ARRAY(state, volume_stack, index, shader);
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_volume_stack, index, object) =
INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_volume_stack, index, object) =
INTEGRATOR_STATE_ARRAY(state, volume_stack, index, object);
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_volume_stack, index, shader) = shader;
INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_volume_stack, index, shader) = shader;
++index;
} while (shader != OBJECT_NONE);
@ -181,7 +185,7 @@ ccl_device_forceinline void integrator_state_copy_volume_stack(KernelGlobals kg,
}
ccl_device_forceinline VolumeStack
integrator_state_read_shadow_volume_stack(ConstIntegratorState state, int i)
integrator_state_read_shadow_volume_stack(ConstIntegratorShadowState state, int i)
{
VolumeStack entry = {INTEGRATOR_STATE_ARRAY(state, shadow_volume_stack, i, object),
INTEGRATOR_STATE_ARRAY(state, shadow_volume_stack, i, shader)};
@ -189,14 +193,14 @@ integrator_state_read_shadow_volume_stack(ConstIntegratorState state, int i)
}
ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty(
KernelGlobals kg, ConstIntegratorState state)
KernelGlobals kg, ConstIntegratorShadowState state)
{
return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
INTEGRATOR_STATE_ARRAY(state, shadow_volume_stack, 0, shader) == SHADER_NONE :
true;
}
ccl_device_forceinline void integrator_state_write_shadow_volume_stack(IntegratorState state,
ccl_device_forceinline void integrator_state_write_shadow_volume_stack(IntegratorShadowState state,
int i,
VolumeStack entry)
{
@ -259,7 +263,6 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg,
integrator_state_copy_only(kg, to_state, state);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
}
#endif
@ -270,12 +273,11 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
IntegratorState state)
{
#if defined(__KERNEL_GPU__)
const IntegratorState to_state = atomic_fetch_and_add_uint32(
ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
&kernel_integrator_state.next_shadow_catcher_path_index[0], 1);
integrator_state_copy_only(kg, to_state, state);
#else
IntegratorStateCPU *ccl_restrict to_state = state + 1;
/* Only copy the required subset, since shadow intersections are big and irrelevant here. */
@ -283,10 +285,99 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
to_state->ray = state->ray;
to_state->isect = state->isect;
integrator_state_copy_volume_stack(kg, to_state, state);
to_state->shadow_path = state->shadow_path;
#endif
INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
}
#ifdef __KERNEL_CPU__
ccl_device_inline int integrator_state_bounce(ConstIntegratorState state, const int)
{
return INTEGRATOR_STATE(state, path, bounce);
}
ccl_device_inline int integrator_state_bounce(ConstIntegratorShadowState state, const int)
{
return INTEGRATOR_STATE(state, shadow_path, bounce);
}
ccl_device_inline int integrator_state_diffuse_bounce(ConstIntegratorState state, const int)
{
return INTEGRATOR_STATE(state, path, diffuse_bounce);
}
ccl_device_inline int integrator_state_diffuse_bounce(ConstIntegratorShadowState state, const int)
{
return INTEGRATOR_STATE(state, shadow_path, diffuse_bounce);
}
ccl_device_inline int integrator_state_glossy_bounce(ConstIntegratorState state, const int)
{
return INTEGRATOR_STATE(state, path, glossy_bounce);
}
ccl_device_inline int integrator_state_glossy_bounce(ConstIntegratorShadowState state, const int)
{
return INTEGRATOR_STATE(state, shadow_path, glossy_bounce);
}
ccl_device_inline int integrator_state_transmission_bounce(ConstIntegratorState state, const int)
{
return INTEGRATOR_STATE(state, path, transmission_bounce);
}
ccl_device_inline int integrator_state_transmission_bounce(ConstIntegratorShadowState state,
const int)
{
return INTEGRATOR_STATE(state, shadow_path, transmission_bounce);
}
ccl_device_inline int integrator_state_transparent_bounce(ConstIntegratorState state, const int)
{
return INTEGRATOR_STATE(state, path, transparent_bounce);
}
ccl_device_inline int integrator_state_transparent_bounce(ConstIntegratorShadowState state,
const int)
{
return INTEGRATOR_STATE(state, shadow_path, transparent_bounce);
}
#else
ccl_device_inline int integrator_state_bounce(ConstIntegratorShadowState state,
const uint32_t path_flag)
{
return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, bounce) :
INTEGRATOR_STATE(state, path, bounce);
}
ccl_device_inline int integrator_state_diffuse_bounce(ConstIntegratorShadowState state,
const uint32_t path_flag)
{
return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, diffuse_bounce) :
INTEGRATOR_STATE(state, path, diffuse_bounce);
}
ccl_device_inline int integrator_state_glossy_bounce(ConstIntegratorShadowState state,
const uint32_t path_flag)
{
return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, glossy_bounce) :
INTEGRATOR_STATE(state, path, glossy_bounce);
}
ccl_device_inline int integrator_state_transmission_bounce(ConstIntegratorShadowState state,
const uint32_t path_flag)
{
return (path_flag & PATH_RAY_SHADOW) ?
INTEGRATOR_STATE(state, shadow_path, transmission_bounce) :
INTEGRATOR_STATE(state, path, transmission_bounce);
}
ccl_device_inline int integrator_state_transparent_bounce(ConstIntegratorShadowState state,
const uint32_t path_flag)
{
return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, transparent_bounce) :
INTEGRATOR_STATE(state, path, transparent_bounce);
}
#endif
CCL_NAMESPACE_END

View File

@ -98,7 +98,7 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
}
ccl_device void shadow_volume_stack_enter_exit(KernelGlobals kg,
IntegratorState state,
IntegratorShadowState state,
ccl_private const ShaderData *sd)
{
volume_stack_enter_exit(
@ -136,9 +136,7 @@ ccl_device_inline void volume_stack_clean(KernelGlobals kg, IntegratorState stat
}
template<typename StackReadOp>
ccl_device float volume_stack_step_size(KernelGlobals kg,
IntegratorState state,
StackReadOp stack_read)
ccl_device float volume_stack_step_size(KernelGlobals kg, StackReadOp stack_read)
{
float step_size = FLT_MAX;

View File

@ -393,17 +393,20 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(KernelGlobals kg
/* Write light contribution to render buffer. */
ccl_device_inline void kernel_accum_light(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorShadowState state,
ccl_global float *ccl_restrict render_buffer)
{
/* The throughput for shadow paths already contains the light shader evaluation. */
float3 contribution = INTEGRATOR_STATE(state, shadow_path, throughput);
kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, shadow_path, bounce));
ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
const uint32_t render_pixel_index = INTEGRATOR_STATE(state, shadow_path, render_pixel_index);
const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
kernel_data.film.pass_stride;
ccl_global float *buffer = render_buffer + render_buffer_offset;
const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
const int sample = INTEGRATOR_STATE(state, path, sample);
const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);

View File

@ -26,7 +26,9 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void path_state_init_queues(IntegratorState state)
{
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
#ifdef __KERNEL_CPU__
INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
#endif
}
/* Minimalistic initialization of the path state, which is needed for early outputs in the
@ -293,16 +295,15 @@ ccl_device_inline void path_state_rng_load(ConstIntegratorState state,
rng_state->sample = INTEGRATOR_STATE(state, path, sample);
}
ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorState state,
ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorShadowState state,
ccl_private RNGState *rng_state)
{
const uint shadow_bounces = INTEGRATOR_STATE(state, shadow_path, transparent_bounce) -
INTEGRATOR_STATE(state, path, transparent_bounce);
const uint shadow_bounces = INTEGRATOR_STATE(state, shadow_path, transparent_bounce);
rng_state->rng_hash = INTEGRATOR_STATE(state, path, rng_hash);
rng_state->rng_offset = INTEGRATOR_STATE(state, path, rng_offset) +
rng_state->rng_hash = INTEGRATOR_STATE(state, shadow_path, rng_hash);
rng_state->rng_offset = INTEGRATOR_STATE(state, shadow_path, rng_offset) +
PRNG_BOUNCE_NUM * shadow_bounces;
rng_state->sample = INTEGRATOR_STATE(state, path, sample);
rng_state->sample = INTEGRATOR_STATE(state, shadow_path, sample);
}
ccl_device_inline float path_state_rng_1D(KernelGlobals kg,

View File

@ -575,9 +575,9 @@ ccl_device float3 shader_holdout_apply(KernelGlobals kg, ccl_private ShaderData
/* Surface Evaluation */
template<uint node_feature_mask>
template<uint node_feature_mask, typename ConstIntegratorGenericState>
ccl_device void shader_eval_surface(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *ccl_restrict sd,
ccl_global float *ccl_restrict buffer,
uint32_t path_flag)
@ -753,9 +753,9 @@ ccl_device int shader_phase_sample_closure(KernelGlobals kg,
/* Volume Evaluation */
template<const bool shadow, typename StackReadOp>
template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
ccl_device_inline void shader_eval_volume(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *ccl_restrict sd,
const uint32_t path_flag,
StackReadOp stack_read)
@ -831,8 +831,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals kg,
/* Displacement Evaluation */
template<typename ConstIntegratorGenericState>
ccl_device void shader_eval_displacement(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *sd)
{
sd->num_closure = 0;

View File

@ -62,7 +62,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
ConstIntegratorState state)
{
if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) {
if (INTEGRATOR_PATH_IS_TERMINATED) {
return false;
}

View File

@ -812,6 +812,7 @@ typedef struct ccl_align(16) ShaderData
#ifdef __OSL__
const struct KernelGlobalsCPU *osl_globals;
const struct IntegratorStateCPU *osl_path_state;
const struct IntegratorShadowStateCPU *osl_shadow_path_state;
#endif
/* LCG state for closures that require additional random numbers. */

View File

@ -1015,31 +1015,44 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
else if (name == u_path_ray_depth) {
/* Ray Depth */
const IntegratorStateCPU *state = sd->osl_path_state;
int f = state->path.bounce;
const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
int f = (state) ? state->path.bounce : (shadow_state) ? shadow_state->shadow_path.bounce : 0;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_diffuse_depth) {
/* Diffuse Ray Depth */
const IntegratorStateCPU *state = sd->osl_path_state;
int f = state->path.diffuse_bounce;
const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
int f = (state) ? state->path.diffuse_bounce :
(shadow_state) ? shadow_state->shadow_path.diffuse_bounce :
0;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_glossy_depth) {
/* Glossy Ray Depth */
const IntegratorStateCPU *state = sd->osl_path_state;
int f = state->path.glossy_bounce;
const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
int f = (state) ? state->path.glossy_bounce :
(shadow_state) ? shadow_state->shadow_path.glossy_bounce :
0;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transmission_depth) {
/* Transmission Ray Depth */
const IntegratorStateCPU *state = sd->osl_path_state;
int f = state->path.transmission_bounce;
const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
int f = (state) ? state->path.transmission_bounce :
(shadow_state) ? shadow_state->shadow_path.transmission_bounce :
0;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transparent_depth) {
/* Transparent Ray Depth */
const IntegratorStateCPU *state = sd->osl_path_state;
int f = state->path.transparent_bounce;
const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
int f = (state) ? state->path.transparent_bounce :
(shadow_state) ? shadow_state->shadow_path.transparent_bounce :
0;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_ndc) {
@ -1228,34 +1241,38 @@ bool OSLRenderServices::texture(ustring filename,
/* Bevel shader hack. */
if (nchannels >= 3) {
const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
status = true;
if (state) {
int num_samples = (int)s;
float radius = t;
float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
status = true;
}
}
break;
}
case OSLTextureHandle::AO: {
/* AO shader hack. */
const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
float3 N = make_float3(dsdx, dtdx, dsdy);
int flags = 0;
if ((int)dtdy) {
flags |= NODE_AO_INSIDE;
if (state) {
int num_samples = (int)s;
float radius = t;
float3 N = make_float3(dsdx, dtdx, dsdy);
int flags = 0;
if ((int)dtdy) {
flags |= NODE_AO_INSIDE;
}
if ((int)options.sblur) {
flags |= NODE_AO_ONLY_LOCAL;
}
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
status = true;
}
if ((int)options.sblur) {
flags |= NODE_AO_ONLY_LOCAL;
}
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
status = true;
break;
}
case OSLTextureHandle::SVM: {

View File

@ -89,7 +89,7 @@ void OSLShader::thread_free(KernelGlobalsCPU *kg)
static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
ShaderData *sd,
const IntegratorStateCPU *state,
const void *state,
uint32_t path_flag,
OSLThreadData *tdata)
{
@ -134,7 +134,12 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
/* Used by render-services. */
sd->osl_globals = kg;
sd->osl_path_state = state;
if (path_flag & PATH_RAY_SHADOW) {
sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
}
else {
sd->osl_path_state = (const IntegratorStateCPU *)state;
}
}
/* Surface */
@ -175,7 +180,7 @@ static void flatten_surface_closure_tree(ShaderData *sd,
}
void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
@ -283,7 +288,7 @@ static void flatten_background_closure_tree(ShaderData *sd,
}
void OSLShader::eval_background(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
@ -341,7 +346,7 @@ static void flatten_volume_closure_tree(ShaderData *sd,
}
void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
@ -366,9 +371,7 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
/* Displacement */
void OSLShader::eval_displacement(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
ShaderData *sd)
void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;

View File

@ -55,20 +55,18 @@ class OSLShader {
/* eval */
static void eval_surface(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
const void *state,
ShaderData *sd,
uint32_t path_flag);
static void eval_background(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
const void *state,
ShaderData *sd,
uint32_t path_flag);
static void eval_volume(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
const void *state,
ShaderData *sd,
uint32_t path_flag);
static void eval_displacement(const KernelGlobalsCPU *kg,
const IntegratorStateCPU *state,
ShaderData *sd);
static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
/* attributes */
static int find_attribute(const KernelGlobalsCPU *kg,

View File

@ -225,9 +225,9 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
/* Main Interpreter Loop */
template<uint node_feature_mask, ShaderType type>
template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
ccl_device void svm_eval_nodes(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ShaderData *sd,
ccl_global float *render_buffer,
uint32_t path_flag)

View File

@ -21,17 +21,17 @@ CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
# ifdef __KERNEL_OPTIX__
extern "C" __device__ float __direct_callable__svm_node_ao(KernelGlobals kg,
ConstIntegratorState state,
extern "C" __device__ float __direct_callable__svm_node_ao(
# else
ccl_device float svm_ao(KernelGlobals kg,
ConstIntegratorState state,
ccl_device float svm_ao(
# endif
ccl_private ShaderData *sd,
float3 N,
float max_dist,
int num_samples,
int flags)
KernelGlobals kg,
ConstIntegratorState state,
ccl_private ShaderData *sd,
float3 N,
float max_dist,
int num_samples,
int flags)
{
if (flags & NODE_AO_GLOBAL_RADIUS) {
max_dist = kernel_data.integrator.ao_bounces_distance;
@ -91,7 +91,7 @@ ccl_device float svm_ao(KernelGlobals kg,
return ((float)unoccluded) / num_samples;
}
template<uint node_feature_mask>
template<uint node_feature_mask, typename ConstIntegratorGenericState>
# if defined(__KERNEL_OPTIX__)
ccl_device_inline
# else
@ -99,7 +99,7 @@ ccl_device_noinline
# endif
void
svm_node_ao(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *sd,
ccl_private float *stack,
uint4 node)

View File

@ -26,9 +26,9 @@ ccl_device_inline bool svm_node_aov_check(const uint32_t path_flag,
return ((render_buffer != NULL) && is_primary);
}
template<uint node_feature_mask>
template<uint node_feature_mask, typename ConstIntegratorGenericState>
ccl_device void svm_node_aov_color(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *sd,
ccl_private float *stack,
uint4 node,
@ -46,9 +46,9 @@ ccl_device void svm_node_aov_color(KernelGlobals kg,
}
}
template<uint node_feature_mask>
template<uint node_feature_mask, typename ConstIntegratorGenericState>
ccl_device void svm_node_aov_value(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *sd,
ccl_private float *stack,
uint4 node,

View File

@ -99,15 +99,15 @@ ccl_device void svm_bevel_cubic_sample(const float radius,
*/
# ifdef __KERNEL_OPTIX__
extern "C" __device__ float3 __direct_callable__svm_node_bevel(KernelGlobals kg,
ConstIntegratorState state,
extern "C" __device__ float3 __direct_callable__svm_node_bevel(
# else
ccl_device float3 svm_bevel(KernelGlobals kg,
ConstIntegratorState state,
ccl_device float3 svm_bevel(
# endif
ccl_private ShaderData *sd,
float radius,
int num_samples)
KernelGlobals kg,
ConstIntegratorState state,
ccl_private ShaderData *sd,
float radius,
int num_samples)
{
/* Early out if no sampling needed. */
if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
@ -282,7 +282,7 @@ ccl_device float3 svm_bevel(KernelGlobals kg,
return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;
}
template<uint node_feature_mask>
template<uint node_feature_mask, typename ConstIntegratorGenericState>
# if defined(__KERNEL_OPTIX__)
ccl_device_inline
# else
@ -290,7 +290,7 @@ ccl_device_noinline
# endif
void
svm_node_bevel(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private ShaderData *sd,
ccl_private float *stack,
uint4 node)

View File

@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN
/* Light Path Node */
template<uint node_feature_mask>
template<uint node_feature_mask, typename ConstIntegratorGenericState>
ccl_device_noinline void svm_node_light_path(KernelGlobals kg,
ConstIntegratorState state,
ConstIntegratorGenericState state,
ccl_private const ShaderData *sd,
ccl_private float *stack,
uint type,
@ -64,48 +64,43 @@ ccl_device_noinline void svm_node_light_path(KernelGlobals kg,
/* Read bounce from difference location depending if this is a shadow
* path. It's a bit dubious to have integrate state details leak into
* this function but hard to avoid currently. */
int bounce = 0;
IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
{
bounce = (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, bounce) :
INTEGRATOR_STATE(state, path, bounce);
info = (float)integrator_state_bounce(state, path_flag);
}
/* For background, light emission and shadow evaluation we from a
* surface or volume we are effective one bounce further. */
if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
bounce++;
info += 1.0f;
}
info = (float)bounce;
break;
}
/* TODO */
case NODE_LP_ray_transparent: {
int bounce = 0;
IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
{
bounce = (path_flag & PATH_RAY_SHADOW) ?
INTEGRATOR_STATE(state, shadow_path, transparent_bounce) :
INTEGRATOR_STATE(state, path, transparent_bounce);
info = (float)integrator_state_transparent_bounce(state, path_flag);
}
info = (float)bounce;
break;
}
#if 0
case NODE_LP_ray_diffuse:
info = (float)state->diffuse_bounce;
IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
{
info = (float)integrator_state_diffuse_bounce(state, path_flag);
}
break;
case NODE_LP_ray_glossy:
info = (float)state->glossy_bounce;
IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
{
info = (float)integrator_state_glossy_bounce(state, path_flag);
}
break;
#endif
#if 0
case NODE_LP_ray_transmission:
info = (float)state->transmission_bounce;
IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
{
info = (float)integrator_state_transmission_bounce(state, path_flag);
}
break;
#endif
}
stack_store_float(stack, out_offset, info);