Merge branch 'blender-v3.0-release'

Hans Goudey 2021-11-05 16:33:08 -05:00
commit 9e611c5616
22 changed files with 457 additions and 281 deletions

View File

@@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
{
/* Pick any kernel, all of them are supposed to have the same level of microarchitecture
* optimization. */
VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
<< " CPU kernels.";
if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::num_threads();
@@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
Device::build_bvh(bvh, progress, refit);
}
const CPUKernels *CPUDevice::get_cpu_kernels() const
{
return &kernels;
}
void CPUDevice::get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{

View File

@@ -57,8 +57,6 @@ class CPUDevice : public Device {
RTCDevice embree_device;
#endif
CPUKernels kernels;
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
~CPUDevice();
@@ -90,7 +88,6 @@ class CPUDevice : public Device {
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
virtual const CPUKernels *get_cpu_kernels() const override;
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
virtual void *get_cpu_osl_memory() override;

View File

@@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \
film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
CPUKernels::CPUKernels()
: /* Integrator. */
@@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
REGISTER_KERNEL(adaptive_sampling_filter_x),
REGISTER_KERNEL(adaptive_sampling_filter_y),
/* Cryptomatte. */
REGISTER_KERNEL(cryptomatte_postprocess)
REGISTER_KERNEL(cryptomatte_postprocess),
/* Film Convert. */
REGISTER_KERNEL_FILM_CONVERT(depth),
REGISTER_KERNEL_FILM_CONVERT(mist),
REGISTER_KERNEL_FILM_CONVERT(sample_count),
REGISTER_KERNEL_FILM_CONVERT(float),
REGISTER_KERNEL_FILM_CONVERT(light_path),
REGISTER_KERNEL_FILM_CONVERT(float3),
REGISTER_KERNEL_FILM_CONVERT(motion),
REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
REGISTER_KERNEL_FILM_CONVERT(combined),
REGISTER_KERNEL_FILM_CONVERT(float4)
{
}
#undef REGISTER_KERNEL
#undef REGISTER_KERNEL_FILM_CONVERT
#undef KERNEL_FUNCTIONS
CCL_NAMESPACE_END
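The registration macros above follow a compact token-pasting pattern: each kernel member is initialized from one entry point per instruction set. A minimal, self-contained sketch of the same pattern, reduced to two hypothetical ISAs and a hypothetical "depth" kernel:

#include <cstdio>

using KernelFn = void (*)();

static void depth_sse2() { std::puts("sse2 kernel"); }
static void depth_avx2() { std::puts("avx2 kernel"); }

struct KernelVariants {
  KernelFn sse2;
  KernelFn avx2;
};

/* Same shape as KERNEL_FUNCTIONS/REGISTER_KERNEL above, cut down to two ISAs. */
#define KERNEL_FUNCTIONS(name) {name##_sse2, name##_avx2}
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))

struct Kernels {
  KernelVariants depth;
  Kernels() : REGISTER_KERNEL(depth) {}
};

#undef REGISTER_KERNEL
#undef KERNEL_FUNCTIONS

int main()
{
  Kernels kernels;
  kernels.depth.avx2(); /* Prints "avx2 kernel". */
  return 0;
}

The point of the macro is purely mechanical: adding a pass name in one place wires up every per-ISA entry point for it.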

View File

@@ -17,11 +17,13 @@
#pragma once
#include "device/cpu/kernel_function.h"
#include "util/half.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
struct KernelGlobalsCPU;
struct KernelFilmConvert;
struct IntegratorStateCPU;
struct TileInfo;
@@ -102,6 +104,41 @@ class CPUKernels {
CryptomattePostprocessFunction cryptomatte_postprocess;
/* Film Convert. */
using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
const float *buffer,
float *pixel,
const int width,
const int buffer_stride,
const int pixel_stride)>;
using FilmConvertHalfRGBAFunction =
CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
const float *buffer,
half4 *pixel,
const int width,
const int buffer_stride)>;
#define KERNEL_FILM_CONVERT_FUNCTION(name) \
FilmConvertFunction film_convert_##name; \
FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
KERNEL_FILM_CONVERT_FUNCTION(depth)
KERNEL_FILM_CONVERT_FUNCTION(mist)
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
KERNEL_FILM_CONVERT_FUNCTION(float)
KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(float3)
KERNEL_FILM_CONVERT_FUNCTION(motion)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
KERNEL_FILM_CONVERT_FUNCTION(combined)
KERNEL_FILM_CONVERT_FUNCTION(float4)
#undef KERNEL_FILM_CONVERT_FUNCTION
CPUKernels();
};
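CPUKernelFunction (declared in device/cpu/kernel_function.h) wraps one such pointer per micro-architecture and binds the best supported variant once. A rough sketch of that dispatch idea, with a hard-coded feature flag standing in for the real CPU-feature test:

#include <cstdio>

template<typename FunctionType> class KernelFunction {
 public:
  KernelFunction(FunctionType fn_sse2, FunctionType fn_avx2)
  {
    /* Assumption: the real class queries CPU features at startup. */
    const bool has_avx2 = true;
    fn_ = has_avx2 ? fn_avx2 : fn_sse2;
  }

  template<typename... Args> void operator()(Args... args) const
  {
    fn_(args...);
  }

 private:
  FunctionType fn_;
};

static void convert_row_sse2(const int width) { std::printf("sse2 row, width %d\n", width); }
static void convert_row_avx2(const int width) { std::printf("avx2 row, width %d\n", width); }

int main()
{
  const KernelFunction<void (*)(int)> film_convert(convert_row_sse2, convert_row_avx2);
  film_convert(1920); /* Dispatches to the avx2 variant in this sketch. */
  return 0;
}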

View File

@@ -23,6 +23,7 @@
#include "device/queue.h"
#include "device/cpu/device.h"
#include "device/cpu/kernel.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
@@ -361,10 +362,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
return nullptr;
}
const CPUKernels *Device::get_cpu_kernels() const
const CPUKernels &Device::get_cpu_kernels()
{
LOG(FATAL) << "Device does not support CPU kernels.";
return nullptr;
/* Initialize CPU kernels once and reuse. */
static CPUKernels kernels;
return kernels;
}
void Device::get_cpu_kernel_thread_globals(
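Returning a reference to a function-local static is the standard C++11 way to get lazily created, thread-safe shared state: every caller sees the same CPUKernels instance, and initialization runs exactly once, on first use. A minimal demonstration of the semantics, using a hypothetical Kernels type:

#include <cstdio>

struct Kernels {
  Kernels() { std::puts("initialized once"); }
};

static const Kernels &get_kernels()
{
  /* Since C++11, initialization of a function-local static is guaranteed to
   * be thread-safe and to happen exactly once, on the first call. */
  static Kernels kernels;
  return kernels;
}

int main()
{
  const Kernels &a = get_kernels();
  const Kernels &b = get_kernels();
  std::printf("same instance: %d\n", &a == &b); /* Prints 1. */
  return 0;
}

This is also why the method can become static: the kernels no longer belong to a particular device instance.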

View File

@@ -178,7 +178,7 @@ class Device {
* These may not be used on GPU or multi-devices. */
/* Get CPU kernel functions for native instruction set. */
virtual const CPUKernels *get_cpu_kernels() const;
static const CPUKernels &get_cpu_kernels();
/* Get kernel globals to pass to kernels. */
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);

View File

@@ -14,9 +14,12 @@
* limitations under the License.
*/
#include "device/device.h"
#include "integrator/pass_accessor_cpu.h"
#include "session/buffers.h"
#include "util/log.h"
#include "util/tbb.h"
@@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
* Kernel processing.
*/
template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const
{
KernelFilmConvert kfilm_convert;
init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
if (destination.pixels) {
/* NOTE: No overlays are applied since they are not used for final renders.
* Can be supported via some sort of specialization to avoid code duplication. */
run_get_pass_kernel_processor_float(
&kfilm_convert, render_buffers, buffer_params, destination, processor);
}
if (destination.pixels_half_rgba) {
/* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
if (destination.num_components == 1) {
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
render_buffers,
buffer_params,
destination,
[&processor](const KernelFilmConvert *kfilm_convert,
ccl_global const float *buffer,
float *pixel_rgba) {
float pixel;
processor(kfilm_convert, buffer, &pixel);
pixel_rgba[0] = pixel;
pixel_rgba[1] = pixel;
pixel_rgba[2] = pixel;
pixel_rgba[3] = 1.0f;
});
}
else if (destination.num_components == 3) {
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
render_buffers,
buffer_params,
destination,
[&processor](const KernelFilmConvert *kfilm_convert,
ccl_global const float *buffer,
float *pixel_rgba) {
processor(kfilm_convert, buffer, pixel_rgba);
pixel_rgba[3] = 1.0f;
});
}
else if (destination.num_components == 4) {
run_get_pass_kernel_processor_half_rgba(
&kfilm_convert, render_buffers, buffer_params, destination, processor);
}
}
}
template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const
const CPUKernels::FilmConvertFunction func) const
{
/* NOTE: No overlays are applied since they are not used for final renders.
* Can be supported via some sort of specialization to avoid code duplication. */
DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
const int64_t pass_stride = buffer_params.pass_stride;
@@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
(y * buffer_params.width + destination.offset) * pixel_stride;
for (int64_t x = 0; x < buffer_params.window_width;
++x, buffer += pass_stride, pixel += pixel_stride) {
processor(kfilm_convert, buffer, pixel);
}
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
});
}
template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const
const CPUKernels::FilmConvertHalfRGBAFunction func) const
{
const int64_t pass_stride = buffer_params.pass_stride;
const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
@@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
const float *buffer = window_data + y * buffer_row_stride;
half4 *pixel = dst_start + y * destination_stride;
for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {
float pixel_rgba[4];
processor(kfilm_convert, buffer, pixel_rgba);
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
*pixel = float4_to_half4_display(
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
}
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
});
}
@@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const BufferParams &buffer_params, \
const Destination &destination) const \
{ \
run_get_pass_kernel_processor( \
render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
const CPUKernels &kernels = Device::get_cpu_kernels(); \
KernelFilmConvert kfilm_convert; \
init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
\
if (destination.pixels) { \
run_get_pass_kernel_processor_float(&kfilm_convert, \
render_buffers, \
buffer_params, \
destination, \
kernels.film_convert_##pass); \
} \
\
if (destination.pixels_half_rgba) { \
run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
render_buffers, \
buffer_params, \
destination, \
kernels.film_convert_half_rgba_##pass); \
} \
}
/* Float (scalar) passes. */
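The net effect of this refactor is that the per-pixel loop moves out of a header template and into the precompiled kernel: the accessor now makes one kernel call per row. A simplified model of the new call shape, with a hypothetical kernel body in place of the real film conversion:

#include <vector>

using FilmConvertFunction = void (*)(const float *buffer, float *pixel,
                                     int width, int buffer_stride, int pixel_stride);

/* Hypothetical row kernel: the x-loop that previously lived in
 * run_get_pass_kernel_processor_float now lives here. */
static void film_convert_depth_row(const float *buffer, float *pixel,
                                   const int width, const int buffer_stride,
                                   const int pixel_stride)
{
  for (int x = 0; x < width; x++, buffer += buffer_stride, pixel += pixel_stride) {
    *pixel = buffer[0];
  }
}

int main()
{
  const int width = 4, pass_stride = 8;
  std::vector<float> buffer(width * pass_stride, 0.5f), pixels(width);
  const FilmConvertFunction func = film_convert_depth_row;
  /* One call converts a whole row; the caller no longer loops per pixel. */
  func(buffer.data(), pixels.data(), width, pass_stride, 1);
  return 0;
}

Besides removing template bloat from the header, this lets each row run entirely inside the ISA-optimized kernel.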

View File

@@ -16,6 +16,8 @@
#pragma once
#include "device/cpu/kernel.h"
#include "integrator/pass_accessor.h"
CCL_NAMESPACE_BEGIN
@@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor {
using PassAccessor::PassAccessor;
protected:
template<typename Processor>
inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;
inline void run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertFunction func) const;
template<typename Processor>
inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;
template<typename Processor>
inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;
inline void run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertHalfRGBAFunction func) const;
#define DECLARE_PASS_ACCESSOR(pass) \
virtual void get_pass_##pass(const RenderBuffers *render_buffers, \

View File

@@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
DeviceScene *device_scene,
bool *cancel_requested_flag)
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
kernels_(*(device->get_cpu_kernels()))
kernels_(Device::get_cpu_kernels())
{
DCHECK_EQ(device->info.type, DEVICE_CPU);
}

View File

@@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
device->get_cpu_kernel_thread_globals(kernel_thread_globals);
/* Find required kernel function. */
const CPUKernels &kernels = *(device->get_cpu_kernels());
const CPUKernels &kernels = Device::get_cpu_kernels();
/* Simple parallel_for over all work items. */
KernelShaderEvalInput *input_data = input.data();

View File

@@ -18,6 +18,7 @@
/* CPU Kernel Interface */
#include "util/half.h"
#include "util/types.h"
#include "kernel/types.h"

View File

@@ -52,6 +52,37 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
#define KERNEL_FILM_CONVERT_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride); \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride);
KERNEL_FILM_CONVERT_FUNCTION(depth)
KERNEL_FILM_CONVERT_FUNCTION(mist)
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
KERNEL_FILM_CONVERT_FUNCTION(float)
KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(float3)
KERNEL_FILM_CONVERT_FUNCTION(motion)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
KERNEL_FILM_CONVERT_FUNCTION(combined)
KERNEL_FILM_CONVERT_FUNCTION(float4)
#undef KERNEL_FILM_CONVERT_FUNCTION
/* --------------------------------------------------------------------
* Shader evaluation.
*/

View File

@@ -47,8 +47,8 @@
# include "kernel/integrator/megakernel.h"
# include "kernel/film/adaptive_sampling.h"
# include "kernel/film/read.h"
# include "kernel/film/id_passes.h"
# include "kernel/film/read.h"
# include "kernel/bake/bake.h"
@@ -232,6 +232,85 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
#endif
}
/* --------------------------------------------------------------------
* Film Convert.
*/
#ifdef KERNEL_STUB
# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride) \
{ \
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
} \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride) \
{ \
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
}
#else
# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride) \
{ \
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
} \
} \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride) \
{ \
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
if (is_float) { \
pixel_rgba[1] = pixel_rgba[0]; \
pixel_rgba[2] = pixel_rgba[0]; \
} \
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
*pixel = float4_to_half4_display( \
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
} \
}
#endif
KERNEL_FILM_CONVERT_FUNCTION(depth, true)
KERNEL_FILM_CONVERT_FUNCTION(mist, true)
KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
KERNEL_FILM_CONVERT_FUNCTION(float, true)
KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
KERNEL_FILM_CONVERT_FUNCTION(float3, false)
KERNEL_FILM_CONVERT_FUNCTION(motion, false)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
KERNEL_FILM_CONVERT_FUNCTION(combined, false)
KERNEL_FILM_CONVERT_FUNCTION(float4, false)
#undef KERNEL_FILM_CONVERT_FUNCTION
#undef KERNEL_INVOKE
#undef DEFINE_INTEGRATOR_KERNEL
#undef DEFINE_INTEGRATOR_SHADE_KERNEL
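The is_float flag in the half-RGBA variant above handles scalar passes: the pass kernel writes a single float into the first channel, and the macro replicates it to green and blue so the preview shows grey. The broadcast step in isolation, simplified and without the half conversion:

#include <cstdio>

static void scalar_to_rgba(const float value, float pixel_rgba[4])
{
  pixel_rgba[0] = value;
  pixel_rgba[1] = value; /* Broadcast R to G and B, as the is_float branch does. */
  pixel_rgba[2] = value;
  pixel_rgba[3] = 1.0f;
}

int main()
{
  float rgba[4];
  scalar_to_rgba(0.25f, rgba);
  std::printf("%g %g %g %g\n", rgba[0], rgba[1], rgba[2], rgba[3]); /* 0.25 0.25 0.25 1 */
  return 0;
}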

View File

@@ -31,7 +31,6 @@
CCL_NAMESPACE_BEGIN
template<uint32_t current_kernel>
ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
IntegratorState state,
const int shader_flags)
@@ -86,36 +85,75 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
return false;
}
/* Note that current_kernel is a template value since making this a variable
* leads to poor performance with CUDA atomics. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_shader_next_kernel(
KernelGlobals kg,
IntegratorState state,
ccl_private const Intersection *ccl_restrict isect,
const int shader,
const int shader_flags)
#ifdef __SHADOW_CATCHER__
/* Split path if a shadow catcher was hit. */
ccl_device_forceinline void integrator_split_shadow_catcher(
KernelGlobals kg, IntegratorState state, ccl_private const Intersection *ccl_restrict isect)
{
/* Note on scheduling.
*
* When there is no shadow catcher split the scheduling is simple: schedule surface shading with
* or without raytrace support, depending on the shader used.
*
* When there is a shadow catcher split the general idea is to have the following configuration:
*
* - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
* will trace shadow catcher object.
*
* - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
* the matte ray.
*
* - Otherwise schedule background shading kernel, so that we have a background to alpha-over
* on. The background kernel will then schedule surface shading for the matte ray. */
/* Test if we hit a shadow catcher object, and potentially split the path to continue tracing two
* paths from here. */
const int object_flags = intersection_get_object_flags(kg, isect);
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
return;
}
/* Mark state as having done a shadow catcher split so that it stops contributing to
* the shadow catcher matte pass, but keeps contributing to the combined pass. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
/* Copy current state to new state. */
state = integrator_state_shadow_catcher_split(kg, state);
/* Initialize new state.
*
* Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
* the matte path. */
const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
/* Mark current state so that it will only track contribution of shadow catcher objects ignoring
* non-catcher objects. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
/* If using background pass, schedule background shading kernel so that we have a background
* to alpha-over on. The background kernel will then continue the path afterwards. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
if (!integrator_state_volume_stack_is_empty(kg, state)) {
/* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
* objects from it, and then continue shading volume and shadow catcher surface after. */
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
return;
}
/* Continue with shading shadow catcher surface. */
const int shader = intersection_get_shader(kg, isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
}
/* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
KernelGlobals kg, IntegratorState state)
{
/* Continue with shading shadow catcher surface. Same as integrator_split_shadow_catcher, but
* using NEXT instead of INIT. */
Intersection isect ccl_optional_struct_init;
integrator_state_read_isect(kg, state, &isect);
const int shader = intersection_get_shader(kg, &isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
@@ -124,23 +162,132 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
else {
INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
}
#ifdef __SHADOW_CATCHER__
const int object_flags = intersection_get_object_flags(kg, isect);
if (kernel_shadow_catcher_split(kg, state, object_flags)) {
if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
/* Schedule next kernel to be executed after executing background shader for shadow catcher. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
KernelGlobals kg, IntegratorState state)
{
/* Same logic as integrator_split_shadow_catcher, but using NEXT instead of INIT. */
if (!integrator_state_volume_stack_is_empty(kg, state)) {
/* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
* objects from it, and then continue shading volume and shadow catcher surface after. */
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
return;
}
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
}
else if (use_raytrace_kernel) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
/* Continue with shading shadow catcher surface. */
integrator_intersect_next_kernel_after_shadow_catcher_volume<current_kernel>(kg, state);
}
#endif
/* Schedule next kernel to be executed after intersect closest.
*
* Note that current_kernel is a template value since making this a variable
* leads to poor performance with CUDA atomics. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel(
KernelGlobals kg,
IntegratorState state,
ccl_private const Intersection *ccl_restrict isect,
const bool hit)
{
/* Continue with volume kernel if we are inside a volume, regardless if we hit anything. */
#ifdef __VOLUME__
if (!integrator_state_volume_stack_is_empty(kg, state)) {
const bool hit_surface = hit && !(isect->type & PRIMITIVE_LAMP);
const int shader = (hit_surface) ? intersection_get_shader(kg, isect) : SHADER_NONE;
const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
if (!integrator_intersect_terminate(kg, state, flags)) {
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
else {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
INTEGRATOR_PATH_TERMINATE(current_kernel);
}
return;
}
#endif
if (hit) {
/* Hit a surface, continue with light or surface kernel. */
if (isect->type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
if (!integrator_intersect_terminate(kg, state, flags)) {
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
#ifdef __SHADOW_CATCHER__
/* Handle shadow catcher. */
integrator_split_shadow_catcher(kg, state, isect);
#endif
}
else {
INTEGRATOR_PATH_TERMINATE(current_kernel);
}
}
}
else {
/* Nothing hit, continue with background kernel. */
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
}
}
/* Schedule next kernel to be executed after shade volume.
*
* The logic here matches integrator_intersect_next_kernel, except that
* volume shading and termination testing have already been done. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
KernelGlobals kg, IntegratorState state, ccl_private const Intersection *ccl_restrict isect)
{
if (isect->prim != PRIM_NONE) {
/* Hit a surface, continue with light or surface kernel. */
if (isect->type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
return;
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
#ifdef __SHADOW_CATCHER__
/* Handle shadow catcher. */
integrator_split_shadow_catcher(kg, state, isect);
#endif
return;
}
}
else {
/* Nothing hit, continue with background kernel. */
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
}
ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state)
@@ -192,56 +339,9 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState s
/* Write intersection result into global integrator state memory. */
integrator_state_write_isect(kg, state, &isect);
#ifdef __VOLUME__
if (!integrator_state_volume_stack_is_empty(kg, state)) {
const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, flags)) {
/* Continue with volume kernel if we are inside a volume, regardless
* if we hit anything. */
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
else {
INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
}
return;
}
#endif
if (hit) {
/* Hit a surface, continue with light or surface kernel. */
if (isect.type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
return;
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, &isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, flags)) {
integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, &isect, shader, flags);
return;
}
else {
INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
return;
}
}
}
else {
/* Nothing hit, continue with background kernel. */
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
/* Setup up next kernel to be executed. */
integrator_intersect_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, &isect, hit);
}
CCL_NAMESPACE_END
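Condensed, the scheduling decision that integrator_intersect_next_kernel implements is a small decision tree. A sketch with simplified stand-in names, leaving out kernel sorting and the shadow-catcher split:

#include <cstdio>

enum NextKernel {
  SHADE_VOLUME,
  SHADE_LIGHT,
  SHADE_SURFACE,
  SHADE_SURFACE_RAYTRACE,
  SHADE_BACKGROUND,
  TERMINATE,
};

static NextKernel next_kernel(const bool in_volume, const bool hit, const bool is_lamp,
                              const bool terminated, const bool has_raytrace)
{
  /* Volume takes precedence: shade the volume segment regardless of the hit. */
  if (in_volume)
    return terminated ? TERMINATE : SHADE_VOLUME;
  if (!hit)
    return SHADE_BACKGROUND;
  if (is_lamp)
    return SHADE_LIGHT;
  if (terminated)
    return TERMINATE;
  return has_raytrace ? SHADE_SURFACE_RAYTRACE : SHADE_SURFACE;
}

int main()
{
  /* A surface hit by a live path whose shader needs raytracing. */
  std::printf("%d\n", next_kernel(false, true, false, false, true));
  return 0;
}

Factoring this tree into one function is what lets shade_volume and the shadow-catcher paths reuse the same logic with NEXT instead of INIT semantics.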

View File

@@ -42,10 +42,13 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
/* Store to avoid global fetches on every intersection step. */
const uint volume_stack_size = kernel_data.volume_stack_size;
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_ALL_VISIBILITY);
#ifdef __VOLUME_RECORD_ALL__
Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
uint num_hits = scene_intersect_volume_all(
kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
if (num_hits > 0) {
Intersection *isect = hits;
@@ -60,7 +63,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
Intersection isect;
int step = 0;
while (step < 2 * volume_stack_size &&
scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
volume_stack_enter_exit(kg, state, stack_sd);
@@ -74,7 +77,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
#endif
}
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState state)
{
PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
@@ -89,14 +92,20 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
volume_ray.t = FLT_MAX;
const uint visibility = (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_ALL_VISIBILITY);
int stack_index = 0, enclosed_index = 0;
/* Write background shader. */
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_CAMERA);
/* Initialize volume stack with background volume. For the shadow catcher the
* background volume is always assumed to be CG. */
if (kernel_data.background.volume_shader != SHADER_NONE) {
const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
integrator_state_write_volume_stack(state, stack_index, new_entry);
stack_index++;
if (!(path_flag & PATH_RAY_SHADOW_CATCHER_PASS)) {
INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, stack_index, object) = OBJECT_NONE;
INTEGRATOR_STATE_ARRAY_WRITE(
state, volume_stack, stack_index, shader) = kernel_data.background.volume_shader;
stack_index++;
}
}
/* Store to avoid global fetches on every intersection step. */
@@ -202,9 +211,22 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
/* Write terminator. */
const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
integrator_state_write_volume_stack(state, stack_index, new_entry);
}
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
{
integrator_volume_stack_init(kg, state);
if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_PASS) {
/* Volume stack re-init for shadow catcher, continue with shading of hit. */
integrator_intersect_next_kernel_after_shadow_catcher_volume<
DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK>(kg, state);
}
else {
/* Volume stack init for camera rays, continue with intersection of camera ray. */
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
}
}
CCL_NAMESPACE_END
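SHADOW_CATCHER_PATH_VISIBILITY narrows the ray visibility mask when the path is a shadow-catcher pass, so volume-stack updates only consider objects relevant to that pass. A hedged sketch of the idea; the actual macro and flag values live in kernel/types.h and may differ:

#include <cstdint>
#include <cstdio>

/* Hypothetical flag values, for illustration only. */
enum : uint32_t {
  PATH_RAY_CAMERA = 1u << 0,
  PATH_RAY_ALL_VISIBILITY = 0xFFu,
  PATH_RAY_SHADOW_CATCHER_PASS = 1u << 20,
};

/* Sketch: shadow-catcher paths select an alternate set of visibility bits. */
static uint32_t shadow_catcher_path_visibility(const uint32_t path_flag,
                                               const uint32_t visibility)
{
  return (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) ? (visibility << 16) : visibility;
}

int main()
{
  std::printf("%x\n", shadow_catcher_path_visibility(0, PATH_RAY_CAMERA)); /* 1 */
  std::printf("%x\n", shadow_catcher_path_visibility(PATH_RAY_SHADOW_CATCHER_PASS,
                                                     PATH_RAY_CAMERA)); /* 10000 */
  return 0;
}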

View File

@@ -192,23 +192,11 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
#ifdef __SHADOW_CATCHER__
if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
/* Special case for shadow catcher where we want to fill the background pass
* behind the shadow catcher but also continue tracing the path. */
INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
const int isect_prim = INTEGRATOR_STATE(state, isect, prim);
const int isect_type = INTEGRATOR_STATE(state, isect, type);
const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);
}
else {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
shader);
}
integrator_intersect_next_kernel_after_shadow_catcher_background<
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND>(kg, state);
return;
}
#endif

View File

@@ -1023,25 +1023,9 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
}
else {
/* Continue to background, light or surface. */
if (isect.prim == PRIM_NONE) {
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
else if (isect.type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
return;
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, &isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
kg, state, &isect, shader, flags);
return;
}
integrator_intersect_next_kernel_after_volume<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
kg, state, &isect);
return;
}
#endif /* __VOLUME__ */
}

View File

@@ -76,33 +76,6 @@ ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
}
/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
* after this function. */
ccl_device_inline bool kernel_shadow_catcher_split(KernelGlobals kg,
IntegratorState state,
const int object_flags)
{
#ifdef __SHADOW_CATCHER__
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
return false;
}
/* The split is to be done. Mark the current state as such, so that it stops contributing to the
* shadow catcher matte pass, but keeps contributing to the combined pass. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
/* Split new state from the current one. This new state will only track contribution of shadow
* catcher objects ignoring non-catcher objects. */
integrator_state_shadow_catcher_split(kg, state);
return true;
#else
(void)object_flags;
return false;
#endif
}
#ifdef __SHADOW_CATCHER__
ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(const uint32_t path_flag)

View File

@@ -173,10 +173,10 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
/* Array access on GPU with Structure-of-Arrays. */
typedef const int IntegratorState;
typedef const int ConstIntegratorState;
typedef const int IntegratorShadowState;
typedef const int ConstIntegratorShadowState;
typedef int IntegratorState;
typedef int ConstIntegratorState;
typedef int IntegratorShadowState;
typedef int ConstIntegratorShadowState;
# define INTEGRATOR_STATE_NULL -1
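Dropping const from these typedefs is what allows the earlier `state = integrator_state_shadow_catcher_split(kg, state);` assignment: a const-qualified value typedef makes every variable of that type read-only. A minimal illustration:

typedef const int OldIntegratorState; /* Before: variables cannot be reassigned. */
typedef int NewIntegratorState;       /* After: reassignment is allowed. */

static int split_state() { return 1; }

int main()
{
  NewIntegratorState state = 0;
  state = split_state(); /* OK with the new typedef. */

  OldIntegratorState old_state = 0;
  /* old_state = split_state();  Error: assignment of read-only variable. */
  (void)old_state;
  return state;
}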

View File

@@ -326,8 +326,8 @@ ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
* after this function. */
ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
IntegratorState state)
ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGlobals kg,
IntegratorState state)
{
#if defined(__KERNEL_GPU__)
ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
@@ -337,14 +337,14 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
#else
IntegratorStateCPU *ccl_restrict to_state = state + 1;
/* Only copy the required subset, since shadow intersections are big and irrelevant here. */
/* Only copy the required subset for performance. */
to_state->path = state->path;
to_state->ray = state->ray;
to_state->isect = state->isect;
integrator_state_copy_volume_stack(kg, to_state, state);
#endif
INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
return to_state;
}
#ifdef __KERNEL_CPU__

View File

@@ -1503,7 +1503,8 @@ static void icon_draw_rect(float x,
int draw_w = w;
int draw_h = h;
int draw_x = x;
int draw_y = y;
/* We need to round y to avoid the icon jittering in some cases. */
int draw_y = round_fl_to_int(y);
/* sanity check */
if (w <= 0 || h <= 0 || w > 2000 || h > 2000) {

View File

@@ -1407,8 +1407,8 @@ static void widget_draw_icon(
/* Force positions to integers for zoom levels near 1; this draws icons crisp. */
if (aspect > 0.95f && aspect < 1.05f) {
xs = (int)(xs + 0.1f);
ys = (int)(ys + 0.1f);
xs = roundf(xs);
ys = roundf(ys);
}
/* Get theme color. */
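Both UI hunks swap truncation for rounding. Truncating a coordinate that hovers just below an integer flips the result back and forth by a whole pixel as the value jitters, while rounding stays stable. A quick demonstration:

#include <cmath>
#include <cstdio>

int main()
{
  /* A y position oscillating by less than half a pixel around 10. */
  const float ys[] = {9.999f, 10.001f, 9.998f, 10.002f};
  for (const float y : ys) {
    /* Truncation flips between 9 and 10 (visible jitter); rounding stays at 10. */
    std::printf("y=%.3f  trunc=%d  round=%d\n", y, (int)y, (int)roundf(y));
  }
  return 0;
}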