Cycles: make the ambient occlusion pass take transparency into account again

This takes advantage of the new decoupled main and shadow paths. On the CPU we
store two nested structs in the integrator state, one for direct-light shadows
and one for AO. On the GPU we restrict the number of shade surface states
scheduled for execution based on the space available in the shadow path queue.
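
A placeholder sketch of the CPU layout (invented field names; the real members appear in the IntegratorStateCPU hunk further down):

    #include <cstdint>

    /* Sketch only; the real structs are defined elsewhere in the integrator
     * state headers. */
    struct ShadowPathStateCPU {
      uint32_t queued_kernel; /* next shadow kernel to run, 0 when idle */
      /* ray, throughput, flags, ... */
    };

    struct IntegratorStateCPU {
      /* ... main path state ... */
      ShadowPathStateCPU shadow; /* direct-light shadow rays */
      ShadowPathStateCPU ao;     /* ambient occlusion rays */
    };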

This also improves performance in benchmark scenes with an AO pass, since the
shader raytracing kernel, which is slower, is no longer needed there.
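
The GPU-side restriction works roughly as follows in PathTraceWorkGPU::enqueue_path_iteration() (see the hunks below): if the shadow path queue cannot hold the paths the queued states could add, shadow kernels are flushed first; otherwise, kernels that can also branch off AO paths are limited to half of the remaining slots, since each state may add two shadow paths. A condensed, hypothetical sketch of that decision (simplified names, not the real members):

    #include <climits>

    /* Hypothetical, condensed view of the decision made in
     * enqueue_path_iteration(). */
    struct ScheduleDecision {
      bool flush_shadow_kernels_first;
      int num_paths_limit;
    };

    ScheduleDecision decide(int max_num_paths, int next_shadow_path_index,
                            int num_queued, bool creates_shadow, bool creates_ao)
    {
      ScheduleDecision d = {false, INT_MAX};
      if (creates_shadow) {
        const int available = max_num_paths - next_shadow_path_index;
        if (available < num_queued) {
          /* Not enough room even for one shadow path per queued state:
           * run the shadow kernels first to drain the queue. */
          d.flush_shadow_kernels_first = true;
        }
        else if (creates_ao) {
          /* Each state may add both a light shadow path and an AO path. */
          d.num_paths_limit = available / 2;
        }
      }
      return d;
    }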

Differential Revision: https://developer.blender.org/D12900
Brecht Van Lommel, 2021-10-17 18:08:00 +02:00
parent 001f548227
commit cccfa597ba
18 changed files with 178 additions and 97 deletions

View File

@ -78,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
integrator_next_shadow_catcher_path_index_(
@ -200,6 +202,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_shader_sort_prefix_sum_.alloc(max_shaders);
integrator_shader_sort_prefix_sum_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@ -374,9 +379,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
/* For kernels that add shadow paths, check if there is enough space available.
* If not, schedule shadow kernels first to clear out the shadow paths. */
int num_paths_limit = INT_MAX;
if (kernel_creates_shadow_paths(kernel)) {
if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
queue_counter->num_queued[kernel]) {
const int available_shadow_paths = max_num_paths_ -
integrator_next_shadow_path_index_.data()[0];
if (available_shadow_paths < queue_counter->num_queued[kernel]) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
@ -386,10 +394,14 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
}
else if (kernel_creates_ao_paths(kernel)) {
/* AO kernel creates two shadow paths, so limit number of states to schedule. */
num_paths_limit = available_shadow_paths / 2;
}
}
/* Schedule kernel with maximum number of queued items. */
enqueue_path_iteration(kernel);
enqueue_path_iteration(kernel, num_paths_limit);
/* Update next shadow path index for kernels that can add shadow paths. */
if (kernel_creates_shadow_paths(kernel)) {
@ -399,7 +411,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
{
void *d_path_index = (void *)NULL;
@ -414,7 +426,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@ -430,6 +443,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
work_size = min(work_size, num_paths_limit);
DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
@ -464,17 +479,20 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
assert(d_counter != nullptr);
void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != nullptr && d_prefix_sum != nullptr);
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
int max_shaders = device_scene_->data.max_shaders;
void *args[] = {&d_counter, &max_shaders};
void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
@ -483,29 +501,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
/* Launch kernel to fill the active paths arrays. */
{
/* TODO: this could be smaller for terminated paths based on amount of work we want
* to schedule. */
* to schedule, and also based on num_paths_limit.
*
* Also, when the number of paths is limited it may be better to prefer paths from the
* end of the array since compaction would need to do less work. */
const int work_size = kernel_max_active_path_index(queued_kernel);
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {const_cast<int *>(&work_size),
const_cast<int *>(&num_paths_limit),
&d_queued_paths,
&d_num_queued_paths,
&d_counter,
&d_prefix_sum,
&d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
queue_->zero_to_device(integrator_shader_sort_counter_);
}
else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
}
else {
assert(0);
}
}
void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@ -1026,6 +1039,13 @@ bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
{
return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
(kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
}
bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||

View File

@ -79,10 +79,12 @@ class PathTraceWorkGPU : public PathTraceWork {
const int num_predicted_splits);
bool enqueue_path_iteration();
void enqueue_path_iteration(DeviceKernel kernel);
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compact_states(const int num_active_paths);
@ -116,6 +118,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Kernel properties. */
bool kernel_uses_sorting(DeviceKernel kernel);
bool kernel_creates_shadow_paths(DeviceKernel kernel);
bool kernel_creates_ao_paths(DeviceKernel kernel);
bool kernel_is_shadow_path(DeviceKernel kernel);
int kernel_max_active_path_index(DeviceKernel kernel);
@ -136,6 +139,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Shader sorting. */
device_vector<int> integrator_shader_sort_counter_;
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
/* Path split. */
device_vector<int> integrator_next_shadow_path_index_;
device_vector<int> integrator_next_shadow_catcher_path_index_;

View File

@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
}
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
kernel_gpu_integrator_sorted_paths_array(
int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
kernel_gpu_integrator_sorted_paths_array(int num_states,
int num_states_limit,
int *indices,
int *num_indices,
int *key_counter,
int *key_prefix_sum,
int kernel)
{
gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
num_states,
num_states_limit,
indices,
num_indices,
key_counter,
key_prefix_sum,
[kernel](const int state) {
return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
INTEGRATOR_STATE(state, path, shader_sort_key) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
}
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
kernel_gpu_prefix_sum(int *values, int num_values)
kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
{
gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------

View File

@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
template<uint blocksize>
__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
{
if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
return;
@ -41,8 +42,9 @@ template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, co
int offset = 0;
for (int i = 0; i < num_values; i++) {
const int new_offset = offset + values[i];
values[i] = offset;
const int new_offset = offset + counter[i];
prefix_sum[i] = offset;
counter[i] = 0;
offset = new_offset;
}
}
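
The prefix sum kernel above now writes its result to a separate prefix_sum array and resets the per-shader counters in the same pass, which is what allows compute_sorted_queued_paths() to drop the zero_to_device() calls it previously issued after every sorted launch. A plain host-side C++ sketch of the serial core:

    #include <cstddef>
    #include <vector>

    /* Sketch of the device kernel's serial loop: exclusive prefix sum over the
     * per-shader counters, clearing them so the next batch of shade kernels can
     * start counting again without a separate host-side zeroing step. */
    void prefix_sum_and_reset(std::vector<int> &counter, std::vector<int> &prefix_sum)
    {
      int offset = 0;
      for (std::size_t i = 0; i < counter.size(); i++) {
        const int new_offset = offset + counter[i];
        prefix_sum[i] = offset;
        counter[i] = 0;
        offset = new_offset;
      }
    }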

View File

@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
template<uint blocksize, typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint num_states,
const int num_states_limit,
int *indices,
int *num_indices,
int *key_counter,
int *key_prefix_sum,
GetKeyOp get_key_op)
{
@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states,
if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
indices[index] = state_index;
if (index < num_states_limit) {
/* Assign state index. */
indices[index] = state_index;
}
else {
/* Can't process this state now, increase the counter again so that
* it will be handled in another iteration. */
atomic_fetch_and_add_uint32(&key_counter[key], 1);
}
}
}
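
Each active state above claims a slot by atomically bumping the prefix sum entry for its sort key; when the claimed index falls at or beyond num_states_limit, the per-key counter is incremented again so the state is counted for, and picked up by, a later scheduling round. A single-threaded C++ sketch of that claim-or-defer pattern (illustrative names):

    #include <atomic>

    /* Sketch of the claim-or-defer pattern used above. Returns true if the
     * state got a slot in `indices`, false if it was handed back for a later
     * iteration. */
    bool claim_slot(int state_index, int key, int num_states_limit,
                    std::atomic<int> *key_prefix_sum, std::atomic<int> *key_counter,
                    int *indices)
    {
      const int index = key_prefix_sum[key].fetch_add(1);
      if (index < num_states_limit) {
        indices[index] = state_index; /* state gets processed this round */
        return true;
      }
      /* Over the limit: count the state again so the next round schedules it. */
      key_counter[key].fetch_add(1);
      return false;
    }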

View File

@ -185,7 +185,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
/* Setup next kernel to execute. */
const int shader_index = shader & SHADER_MASK;
const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
}
else {

View File

@ -111,8 +111,7 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
* Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
* the matte path. */
const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
(kernel_data.film.pass_ao != PASS_UNUSED));
const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(

View File

@ -34,16 +34,12 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
ccl_global float *ccl_restrict render_buffer)
{
/* Each kernel indicates the next kernel to execute, so here we simply
* have to check what that kernel is and execute it.
*
* TODO: investigate if we can use device side enqueue for GPUs to avoid
* having to compile this big kernel. */
* have to check what that kernel is and execute it. */
while (true) {
/* Handle any shadow paths before we potentially create more shadow paths. */
const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
&state->shadow, shadow_path, queued_kernel);
if (shadow_queued_kernel) {
/* First handle any shadow paths before we potentially create more shadow paths. */
switch (shadow_queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
integrator_intersect_shadow(kg, &state->shadow);
@ -55,10 +51,30 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
kernel_assert(0);
break;
}
continue;
}
else if (INTEGRATOR_STATE(state, path, queued_kernel)) {
/* Then handle regular path kernels. */
switch (INTEGRATOR_STATE(state, path, queued_kernel)) {
/* Handle any AO paths before we potentially create more AO paths. */
const uint32_t ao_queued_kernel = INTEGRATOR_STATE(&state->ao, shadow_path, queued_kernel);
if (ao_queued_kernel) {
switch (ao_queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
integrator_intersect_shadow(kg, &state->ao);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
integrator_shade_shadow(kg, &state->ao, render_buffer);
break;
default:
kernel_assert(0);
break;
}
continue;
}
/* Then handle regular path kernels. */
const uint32_t queued_kernel = INTEGRATOR_STATE(state, path, queued_kernel);
if (queued_kernel) {
switch (queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
integrator_intersect_closest(kg, state);
break;
@ -87,10 +103,10 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
kernel_assert(0);
break;
}
continue;
}
else {
break;
}
break;
}
}
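
Because the removed and added lines are interleaved above, here is the resulting shape of the CPU megakernel loop spelled out: drain queued direct-light shadow work first, then queued AO shadow work, then run the next regular path kernel, and exit once nothing is queued. A self-contained outline with stub kernels standing in for the real integrator functions:

    #include <cstdint>

    /* Illustrative stand-ins for the integrator state and kernels. */
    struct ShadowState { uint32_t queued_kernel = 0; };
    struct PathState { uint32_t queued_kernel = 0; ShadowState shadow, ao; };

    void run_shadow_kernel(ShadowState &s) { s.queued_kernel = 0; /* ... */ }
    void run_path_kernel(PathState &s) { s.queued_kernel = 0; /* ... */ }

    /* Outline of the new megakernel control flow: shadow work first, AO shadow
     * work second, regular path kernels last; exit when nothing is queued. */
    void megakernel(PathState &state)
    {
      while (true) {
        if (state.shadow.queued_kernel) { run_shadow_kernel(state.shadow); continue; }
        if (state.ao.queued_kernel)     { run_shadow_kernel(state.ao);     continue; }
        if (state.queued_kernel)        { run_path_kernel(state);          continue; }
        break;
      }
    }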

View File

@ -198,7 +198,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);

View File

@ -168,7 +168,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
const bool is_light = light_sample_is_light(&ls);
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
INTEGRATOR_SHADOW_PATH_INIT(
shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
/* Copy volume stack and enter/exit volume. */
integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@ -324,26 +325,14 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
}
#endif
#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
#if defined(__AO__)
ccl_device_forceinline void integrate_surface_ao_pass(
KernelGlobals kg,
ConstIntegratorState state,
IntegratorState state,
ccl_private const ShaderData *ccl_restrict sd,
ccl_private const RNGState *ccl_restrict rng_state,
ccl_global float *ccl_restrict render_buffer)
{
# ifdef __KERNEL_OPTIX__
optixDirectCall<void>(2, kg, state, sd, rng_state, render_buffer);
}
extern "C" __device__ void __direct_callable__ao_pass(
KernelGlobals kg,
ConstIntegratorState state,
ccl_private const ShaderData *ccl_restrict sd,
ccl_private const RNGState *ccl_restrict rng_state,
ccl_global float *ccl_restrict render_buffer)
{
# endif /* __KERNEL_OPTIX__ */
float bsdf_u, bsdf_v;
path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
@ -352,24 +341,48 @@ extern "C" __device__ void __direct_callable__ao_pass(
float ao_pdf;
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray ray ccl_optional_struct_init;
ray.P = ray_offset(sd->P, sd->Ng);
ray.D = ao_D;
ray.t = kernel_data.integrator.ao_bounces_distance;
ray.time = sd->time;
ray.dP = differential_zero_compact();
ray.dD = differential_zero_compact();
Intersection isect ccl_optional_struct_init;
if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
}
if (!(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f)) {
return;
}
Ray ray ccl_optional_struct_init;
ray.P = ray_offset(sd->P, sd->Ng);
ray.D = ao_D;
ray.t = kernel_data.integrator.ao_bounces_distance;
ray.time = sd->time;
ray.dP = differential_zero_compact();
ray.dD = differential_zero_compact();
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
/* Copy volume stack and enter/exit volume. */
integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
/* Write shadow ray and associated state to global memory. */
integrator_state_write_shadow_ray(kg, shadow_state, &ray);
/* Copy state from main path to shadow path. */
const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
state, path, render_pixel_index);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_offset) =
INTEGRATOR_STATE(state, path, rng_offset) -
PRNG_BOUNCE_NUM * INTEGRATOR_STATE(state, path, transparent_bounce);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
state, path, rng_hash);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
state, path, sample);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
}
#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
#endif /* defined(__AO__) */
template<uint node_feature_mask>
ccl_device bool integrate_surface(KernelGlobals kg,
@ -474,14 +487,12 @@ ccl_device bool integrate_surface(KernelGlobals kg,
PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
integrate_surface_direct_light(kg, state, &sd, &rng_state);
#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
#if defined(__AO__)
/* Ambient occlusion pass. */
if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
}
if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
}
#endif
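
One key line above is the throughput setup: the AO ray starts with the path throughput scaled by shader_bsdf_alpha(), and because it now goes through the regular shadow kernels, transparent surfaces along the ray attenuate it before kernel_accum_light() writes the result to the AO pass (see the PATH_RAY_SHADOW_FOR_AO branch further down). A toy numeric illustration, with invented attenuation values:

    /* Toy example (invented numbers): an AO shadow ray leaves a surface with
     * the path throughput scaled by the BSDF alpha, then passes through one
     * partly transparent surface before reaching the AO distance unoccluded. */
    float ao_pass_contribution()
    {
      const float path_throughput = 1.0f;
      const float bsdf_alpha = 0.8f;         /* alpha at the shading point */
      const float transparent_shadow = 0.5f; /* attenuation along the shadow ray */

      float throughput = path_throughput * bsdf_alpha;
      throughput *= transparent_shadow;
      return throughput; /* 0.4, accumulated into the AO pass */
    }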

View File

@ -776,7 +776,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
const bool is_light = light_sample_is_light(ls);
/* Branch off shadow kernel. */
INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
INTEGRATOR_SHADOW_PATH_INIT(
shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
/* Write shadow ray and associated state to global memory. */
integrator_state_write_shadow_ray(kg, shadow_state, &ray);

View File

@ -92,6 +92,7 @@ typedef struct IntegratorStateCPU {
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
IntegratorShadowStateCPU shadow;
IntegratorShadowStateCPU ao;
} IntegratorStateCPU;
/* Path Queue

View File

@ -63,7 +63,7 @@ CCL_NAMESPACE_BEGIN
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
&kernel_integrator_state.next_shadow_path_index[0], 1); \
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
@ -129,8 +129,8 @@ CCL_NAMESPACE_BEGIN
(void)current_kernel; \
}
# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
IntegratorShadowState shadow_state = &state->shadow; \
# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
IntegratorShadowState shadow_state = &state->shadow_type; \
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
{ \

View File

@ -182,7 +182,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);

View File

@ -408,6 +408,13 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
/* Ambient occlusion. */
if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
return;
}
/* Direct light shadow. */
kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
#ifdef __PASSES__

View File

@ -28,6 +28,7 @@ ccl_device_inline void path_state_init_queues(IntegratorState state)
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
#ifdef __KERNEL_CPU__
INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
#endif
}

View File

@ -285,21 +285,22 @@ enum PathRayFlag {
PATH_RAY_VOLUME_PASS = (1U << 26U),
PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
/* Shadow ray is for a light or surface. */
/* Shadow ray is for a light or surface, or AO. */
PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
PATH_RAY_SHADOW_FOR_AO = (1U << 28U),
/* A shadow catcher object was hit and the path was split into two. */
PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),
PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),
/* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
* their dedicated pass for later division.
*
* NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
* which is separate from the light passes. */
PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),
PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),
/* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
};
/* Configure ray visibility bits for rays and objects respectively,

View File

@ -677,10 +677,6 @@ uint Film::get_kernel_features(const Scene *scene) const
kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
}
}
if (pass_type == PASS_AO) {
kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
}
}
return kernel_features;