Cycles: Add basic support for using OSL with OptiX

This patch generalizes the OSL support in Cycles to include GPU
device types and adds an implementation of it in the OptiX
device. Some caveats remain, including simplified texturing due to
the lack of OIIO on the GPU and a few missing OSL intrinsics.

Note that this is incomplete and requires an update to the OSL
library before it can be enabled. The implementation is committed
now to simplify further development.

Maniphest Tasks: T101222

Differential Revision: https://developer.blender.org/D15902
Author: Patrick Mours, 2022-11-09 14:25:32 +01:00
parent efe073f57c
commit e6b38deb9d
Notes (blender-bot, 2023-02-14 05:28:01 +01:00): Referenced by issue #101222, Cycles OSL with OptiX
43 changed files with 3469 additions and 555 deletions
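
To illustrate how the generalized device interface in this patch is meant to be driven, here is a minimal host-side sketch. Only Device::load_kernels(), Device::load_osl_kernels() and Device::get_cpu_osl_memory() come from the patch; the caller name, the includes and the surrounding setup are hypothetical.

#include "device/device.h"      /* Device */
#include "kernel/osl/globals.h" /* OSLGlobals */

/* Hypothetical host-side flow; not part of this commit. */
static bool example_setup_osl(Device *device, uint kernel_features)
{
  /* Compile the regular kernels first. On the OptiX device this also
   * creates the base module that the per-shader OSL modules later link
   * against. */
  if (!device->load_kernels(kernel_features)) {
    return false;
  }

  /* The OSL shader manager fills the device's OSLGlobals with compiled
   * shader groups (surface_state, volume_state, displacement_state,
   * bump_state). On the OptiX device this returns &osl_globals. */
  OSLGlobals *og = static_cast<OSLGlobals *>(device->get_cpu_osl_memory());
  (void)og; /* ... populate the shader group vectors here ... */

  /* Finally JIT the shader groups into device code: PTX modules and
   * direct-callable program groups on OptiX, a no-op default elsewhere. */
  return device->load_osl_kernels();
}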

View File

@ -419,7 +419,7 @@ if(WITH_IMAGE_OPENEXR)
warn_hardcoded_paths(OpenEXR)
set(OPENEXR ${LIBDIR}/openexr)
set(OPENEXR_INCLUDE_DIR ${OPENEXR}/include)
set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR}/include/OpenEXR)
set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR_INCLUDE_DIR}/OpenEXR)
set(OPENEXR_LIBPATH ${OPENEXR}/lib)
# Check if the 3.x library name exists
# if not assume this is a 2.x library folder
@ -568,7 +568,8 @@ if(WITH_OPENIMAGEIO)
if(NOT OpenImageIO_FOUND)
set(OPENIMAGEIO ${LIBDIR}/OpenImageIO)
set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib)
set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO}/include)
set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR})
set(OIIO_OPTIMIZED optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO.lib optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util.lib)
set(OIIO_DEBUG debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_d.lib debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util_d.lib)
set(OPENIMAGEIO_LIBRARIES ${OIIO_OPTIMIZED} ${OIIO_DEBUG})
@ -785,6 +786,14 @@ if(WITH_CYCLES AND WITH_CYCLES_OSL)
endif()
find_path(OSL_INCLUDE_DIR OSL/oslclosure.h PATHS ${CYCLES_OSL}/include)
find_program(OSL_COMPILER NAMES oslc PATHS ${CYCLES_OSL}/bin)
file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MAJOR
REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+[0-9]+.*$")
file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MINOR
REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+[0-9]+.*$")
string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+([.0-9]+).*"
"\\1" OSL_LIBRARY_VERSION_MAJOR ${OSL_LIBRARY_VERSION_MAJOR})
string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+([.0-9]+).*"
"\\1" OSL_LIBRARY_VERSION_MINOR ${OSL_LIBRARY_VERSION_MINOR})
endif()
if(WITH_CYCLES AND WITH_CYCLES_EMBREE)

View File

@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine):
if not self.session:
if self.is_preview:
cscene = bpy.context.scene.cycles
use_osl = cscene.shading_system and cscene.device == 'CPU'
use_osl = cscene.shading_system
engine.create(self, data, preview_osl=use_osl)
else:

View File

@ -155,6 +155,10 @@ def with_osl():
import _cycles
return _cycles.with_osl
def osl_version():
import _cycles
return _cycles.osl_version
def with_path_guiding():
import _cycles

View File

@ -290,7 +290,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
)
shading_system: BoolProperty(
name="Open Shading Language",
description="Use Open Shading Language (CPU rendering only)",
description="Use Open Shading Language",
)
preview_pause: BoolProperty(

View File

@ -2305,7 +2305,7 @@ def draw_device(self, context):
col.prop(cscene, "device")
from . import engine
if engine.with_osl() and use_cpu(context):
if engine.with_osl() and (use_cpu(context) or (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1))):
col.prop(cscene, "shading_system")

View File

@ -160,6 +160,11 @@ class Device {
return true;
}
virtual bool load_osl_kernels()
{
return true;
}
/* GPU device only functions.
* These may not be used on CPU or multi-devices. */

View File

@ -7,6 +7,30 @@
CCL_NAMESPACE_BEGIN
bool device_kernel_has_shading(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW ||
kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY);
}
bool device_kernel_has_intersection(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
}
const char *device_kernel_as_string(DeviceKernel kernel)
{
switch (kernel) {

View File

@ -11,6 +11,9 @@
CCL_NAMESPACE_BEGIN
bool device_kernel_has_shading(DeviceKernel kernel);
bool device_kernel_has_intersection(DeviceKernel kernel);
const char *device_kernel_as_string(DeviceKernel kernel);
std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);

View File

@ -138,6 +138,15 @@ class MultiDevice : public Device {
return true;
}
bool load_osl_kernels() override
{
foreach (SubDevice &sub, devices)
if (!sub.device->load_osl_kernels())
return false;
return true;
}
void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
/* Try to build and share a single acceleration structure, if possible */
@ -204,10 +213,12 @@ class MultiDevice : public Device {
virtual void *get_cpu_osl_memory() override
{
if (devices.size() > 1) {
/* Always return the OSL memory of the CPU device (this works since the constructor above
* guarantees that CPU devices are always added to the back). */
if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
return NULL;
}
return devices.front().device->get_cpu_osl_memory();
return devices.back().device->get_cpu_osl_memory();
}
bool is_resident(device_ptr key, Device *sub_device) override

View File

@ -9,6 +9,10 @@
#include "util/log.h"
#ifdef WITH_OSL
# include <OSL/oslversion.h>
#endif
#ifdef WITH_OPTIX
# include <optix_function_table_definition.h>
#endif
@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
info.type = DEVICE_OPTIX;
info.id += "_OptiX";
# if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
info.has_osl = true;
# endif
info.denoisers |= DENOISER_OPTIX;
devices.push_back(info);

View File

@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice()
if (optix_module != NULL) {
optixModuleDestroy(optix_module);
}
for (unsigned int i = 0; i < 2; ++i) {
for (int i = 0; i < 2; ++i) {
if (builtin_modules[i] != NULL) {
optixModuleDestroy(builtin_modules[i]);
}
}
for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
for (int i = 0; i < NUM_PIPELINES; ++i) {
if (pipelines[i] != NULL) {
optixPipelineDestroy(pipelines[i]);
}
}
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
if (groups[i] != NULL) {
optixProgramGroupDestroy(groups[i]);
}
}
# ifdef WITH_OSL
for (const OptixModule &module : osl_modules) {
if (module != NULL) {
optixModuleDestroy(module);
}
}
for (const OptixProgramGroup &group : osl_groups) {
if (group != NULL) {
optixProgramGroupDestroy(group);
}
}
# endif
/* Make sure denoiser is destroyed before device context! */
if (denoiser_.optix_denoiser != nullptr) {
@ -381,6 +399,12 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
return false;
}
# ifdef WITH_OSL
const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
# else
const bool use_osl = false;
# endif
/* Skip creating OptiX module if only doing denoising. */
const bool need_optix_kernels = (kernel_features &
(KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING));
@ -388,12 +412,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
/* Detect existence of OptiX kernel and SDK here early. So we can error out
* before compiling the CUDA kernels, to avoid failing right after when
* compiling the OptiX kernel. */
string suffix = use_osl ? "_osl" :
(kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
"_shader_raytrace" :
"";
string ptx_filename;
if (need_optix_kernels) {
ptx_filename = path_get(
(kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
"lib/kernel_optix_shader_raytrace.ptx" :
"lib/kernel_optix.ptx");
ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
std::string optix_include_dir = get_optix_include_dir();
if (optix_include_dir.empty()) {
@ -429,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
optixModuleDestroy(optix_module);
optix_module = NULL;
}
for (unsigned int i = 0; i < 2; ++i) {
for (int i = 0; i < 2; ++i) {
if (builtin_modules[i] != NULL) {
optixModuleDestroy(builtin_modules[i]);
builtin_modules[i] = NULL;
}
}
for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
for (int i = 0; i < NUM_PIPELINES; ++i) {
if (pipelines[i] != NULL) {
optixPipelineDestroy(pipelines[i]);
pipelines[i] = NULL;
}
}
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
if (groups[i] != NULL) {
optixProgramGroupDestroy(groups[i]);
groups[i] = NULL;
}
}
# ifdef WITH_OSL
/* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
for (const OptixModule &module : osl_modules) {
if (module != NULL) {
optixModuleDestroy(module);
}
}
osl_modules.clear();
for (const OptixProgramGroup &group : osl_groups) {
if (group != NULL) {
optixProgramGroupDestroy(group);
}
}
osl_groups.clear();
# endif
OptixModuleCompileOptions module_options = {};
module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
@ -461,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
module_options.numPayloadTypes = 0;
# endif
OptixPipelineCompileOptions pipeline_options = {};
/* Default to no motion blur and two-level graph, since it is the fastest option. */
pipeline_options.usesMotionBlur = false;
pipeline_options.traversableGraphFlags =
@ -490,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
/* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
* This is necessary since objects may be reported to have motion if the Vector pass is
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
if (motion_blur) {
if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
pipeline_options.usesMotionBlur = true;
/* Motion blur can insert motion transforms into the traversal graph.
* It is no longer a two-level graph then, so need to set flags to allow any configuration. */
@ -503,13 +548,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
string ptx_data;
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
string cflags = compile_kernel_get_common_cflags(kernel_features);
ptx_filename = compile_kernel(
cflags,
(kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
"kernel_shader_raytrace" :
"kernel",
"optix",
true);
ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
}
if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
@ -551,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
}
/* Create program groups. */
OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
@ -609,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
if (motion_blur) {
if (pipeline_options.usesMotionBlur) {
builtin_options.usesMotionBlur = true;
optix_assert(optixBuiltinISModuleGet(
@ -630,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
}
}
/* Pointclouds */
if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@ -642,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
}
/* Add hit group for local intersections. */
if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
/* Add hit group for local intersections. */
group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
@ -655,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_surface_raytrace";
group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
"__direct_callable__svm_node_bevel";
/* Kernels with OSL support are built without SVM, so can skip those direct callables there. */
if (!use_osl) {
group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
"__direct_callable__svm_node_bevel";
}
}
/* MNEE. */
if (kernel_features & KERNEL_FEATURE_MNEE) {
group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
@ -672,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
"__raygen__kernel_optix_integrator_shade_surface_mnee";
}
/* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
if (use_osl) {
group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_background";
group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_light";
group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_surface";
group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_volume";
group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_shadow";
group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
"__raygen__kernel_optix_shader_eval_displace";
group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
"__raygen__kernel_optix_shader_eval_background";
group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
"__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
}
optix_assert(optixProgramGroupCreate(
context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
@ -680,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
/* Set up SBT, which in this case is used only to select between different programs. */
sbt_data.alloc(NUM_PROGRAM_GROUPS);
memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
@ -704,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
OptixPipelineLinkOptions link_options = {};
link_options.maxTraceDepth = 1;
link_options.debugLevel = module_options.debugLevel;
if (DebugFlags().optix.use_debug) {
link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
}
else {
link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
}
if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
/* Create shader raytracing pipeline. */
if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) {
/* Create shader raytracing and MNEE pipeline. */
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
}
if (kernel_features & KERNEL_FEATURE_MNEE) {
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
}
pipeline_groups.push_back(groups[PG_MISS]);
pipeline_groups.push_back(groups[PG_HITD]);
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
pipeline_groups.push_back(groups[PG_HITV]);
if (motion_blur) {
if (pipeline_options.usesMotionBlur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
@ -730,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
}
pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
optix_assert(optixPipelineCreate(context,
&pipeline_options,
@ -740,57 +815,18 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_SHADE_RAYTRACE]));
&pipelines[PIP_SHADE]));
/* Combine ray generation and trace continuation stack size. */
const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
link_options.maxTraceDepth * trace_css;
const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
stack_size[PG_CALL_SVM_BEVEL].dssDC);
/* Set stack size depending on pipeline options. */
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
}
if (kernel_features & KERNEL_FEATURE_MNEE) {
/* Create MNEE pipeline. */
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
pipeline_groups.push_back(groups[PG_MISS]);
pipeline_groups.push_back(groups[PG_HITD]);
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
pipeline_groups.push_back(groups[PG_HITV]);
if (motion_blur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
}
pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
optix_assert(optixPipelineCreate(context,
&pipeline_options,
&link_options,
pipeline_groups.data(),
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_SHADE_MNEE]));
/* Combine ray generation and trace continuation stack size. */
const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG +
link_options.maxTraceDepth * trace_css;
const unsigned int dss = 0;
/* Set stack size depending on pipeline options. */
optix_assert(
optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
{ /* Create intersection-only pipeline. */
@ -805,7 +841,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
pipeline_groups.push_back(groups[PG_HITV]);
if (motion_blur) {
if (pipeline_options.usesMotionBlur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
@ -831,16 +867,257 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
link_options.maxTraceDepth * trace_css;
optix_assert(
optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
/* Clean up program group objects. */
for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optixProgramGroupDestroy(groups[i]);
return !have_error();
}
bool OptiXDevice::load_osl_kernels()
{
# ifdef WITH_OSL
if (have_error()) {
return false;
}
return true;
struct OSLKernel {
string ptx;
string init_entry;
string exec_entry;
};
/* This has to be in the same order as the ShaderType enum, so that the index calculation in
* osl_eval_nodes checks out */
vector<OSLKernel> osl_kernels;
for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP;
type = static_cast<ShaderType>(type + 1)) {
const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
osl_globals.surface_state :
type == SHADER_TYPE_VOLUME ?
osl_globals.volume_state :
type == SHADER_TYPE_DISPLACEMENT ?
osl_globals.displacement_state :
osl_globals.bump_state);
for (const OSL::ShaderGroupRef &group : groups) {
if (group) {
string osl_ptx, init_name, entry_name;
osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
osl_globals.ss->getattribute(
group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
int groupdata_size = 0;
osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
set_error(
string_printf("Requested OSL group data size (%d) is greater than the maximum "
"supported with OptiX (2048)",
groupdata_size));
return false;
}
osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
}
else {
/* Add empty entry for non-existent shader groups, so that the index stays stable. */
osl_kernels.emplace_back();
}
}
}
const CUDAContextScope scope(this);
if (pipelines[PIP_SHADE]) {
optixPipelineDestroy(pipelines[PIP_SHADE]);
}
for (OptixModule &module : osl_modules) {
if (module != NULL) {
optixModuleDestroy(module);
module = NULL;
}
}
for (OptixProgramGroup &group : osl_groups) {
if (group != NULL) {
optixProgramGroupDestroy(group);
group = NULL;
}
}
OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
OptixModuleCompileOptions module_options = {};
module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
osl_groups.resize(osl_kernels.size() * 2 + 1);
osl_modules.resize(osl_kernels.size() + 1);
{ /* Load and compile PTX module with OSL services. */
string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
if (!path_read_text(ptx_filename, ptx_data)) {
set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
ptx_filename.c_str()));
return false;
}
const OptixResult result = optixModuleCreateFromPTX(context,
&module_options,
&pipeline_options,
ptx_data.data(),
ptx_data.size(),
nullptr,
0,
&osl_modules.back());
if (result != OPTIX_SUCCESS) {
set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
ptx_filename.c_str(),
optixGetErrorName(result)));
return false;
}
OptixProgramGroupDesc group_desc = {};
group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
group_desc.callables.moduleDC = osl_modules.back();
optix_assert(optixProgramGroupCreate(
context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
}
TaskPool pool;
vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
for (size_t i = 0; i < osl_kernels.size(); ++i) {
if (osl_kernels[i].ptx.empty()) {
continue;
}
# if OPTIX_ABI_VERSION >= 55
OptixTask task = nullptr;
results[i] = optixModuleCreateFromPTXWithTasks(context,
&module_options,
&pipeline_options,
osl_kernels[i].ptx.data(),
osl_kernels[i].ptx.size(),
nullptr,
nullptr,
&osl_modules[i],
&task);
if (results[i] == OPTIX_SUCCESS) {
execute_optix_task(pool, task, results[i]);
}
# else
pool.push([this, &results, i, &module_options, &osl_kernels]() {
results[i] = optixModuleCreateFromPTX(context,
&module_options,
&pipeline_options,
osl_kernels[i].ptx.data(),
osl_kernels[i].ptx.size(),
nullptr,
0,
&osl_modules[i]);
});
# endif
}
pool.wait_work();
for (size_t i = 0; i < osl_kernels.size(); ++i) {
if (osl_kernels[i].ptx.empty()) {
continue;
}
if (results[i] != OPTIX_SUCCESS) {
set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
osl_kernels[i].init_entry.c_str(),
optixGetErrorName(results[i])));
return false;
}
OptixProgramGroupDesc group_descs[2] = {};
group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
group_descs[0].callables.moduleDC = osl_modules[i];
group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
group_descs[1].callables.moduleDC = osl_modules[i];
optix_assert(optixProgramGroupCreate(
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}
vector<OptixStackSizes> osl_stack_size(osl_groups.size());
/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
}
}
sbt_data.copy_to_device(); /* Upload updated SBT to device. */
OptixPipelineLinkOptions link_options = {};
link_options.maxTraceDepth = 0;
link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
{
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
for (const OptixProgramGroup &group : osl_groups) {
if (group != NULL) {
pipeline_groups.push_back(group);
}
}
optix_assert(optixPipelineCreate(context,
&pipeline_options,
&link_options,
pipeline_groups.data(),
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_SHADE]));
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
}
return !have_error();
# else
return false;
# endif
}
void *OptiXDevice::get_cpu_osl_memory()
{
# ifdef WITH_OSL
return &osl_globals;
# else
return NULL;
# endif
}
/* --------------------------------------------------------------------
@ -1567,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
size_t num_motion_steps = 1;
Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
num_motion_steps = hair->get_motion_steps();
}
@ -1721,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
size_t num_motion_steps = 1;
Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
num_motion_steps = mesh->get_motion_steps();
}
@ -1788,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
size_t num_motion_steps = 1;
Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
num_motion_steps = pointcloud->get_motion_steps();
}
@ -1885,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
/* Calculate total motion transform size and allocate memory for them. */
size_t motion_transform_offset = 0;
if (motion_blur) {
if (pipeline_options.usesMotionBlur) {
size_t total_motion_transform_size = 0;
for (Object *const ob : bvh->objects) {
if (ob->is_traceable() && ob->use_motion()) {
@ -1936,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
if (motion_blur && ob->get_geometry()->has_motion_blur()) {
if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
/* Select between motion blur and non-motion blur built-in intersection module. */
instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
}
@ -1964,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
/* Insert motion traversable if object has motion. */
if (motion_blur && ob->use_motion()) {
if (pipeline_options.usesMotionBlur && ob->use_motion()) {
size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
motion_keys * sizeof(OptixSRTData);
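
For reference, a sketch of the shader binding table layout that load_osl_kernels() above produces: the base program groups occupy the first NUM_PROGRAM_GROUPS records, followed by an init/exec direct-callable pair per flattened OSL shader group (surface, volume, displacement and bump groups in ShaderType order), with the services module's dummy callable at the very end. The helpers below are purely illustrative; the indices are inferred from sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size()) and osl_groups.resize(osl_kernels.size() * 2 + 1) and are not part of the patch.

/* Illustrative index arithmetic only, not from the commit. */
static inline int osl_init_sbt_index(int flattened_group_index)
{
  /* Group init callable for flattened OSL kernel i. */
  return NUM_PROGRAM_GROUPS + flattened_group_index * 2;
}

static inline int osl_exec_sbt_index(int flattened_group_index)
{
  /* The matching group entry (exec) callable sits right after it. */
  return NUM_PROGRAM_GROUPS + flattened_group_index * 2 + 1;
}

/* The OSL services module contributes a single dummy callable at the end,
 * i.e. at record index NUM_PROGRAM_GROUPS + osl_kernels.size() * 2. */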

View File

@ -9,6 +9,7 @@
# include "device/cuda/device_impl.h"
# include "device/optix/queue.h"
# include "device/optix/util.h"
# include "kernel/osl/globals.h"
# include "kernel/types.h"
# include "util/unique_ptr.h"
@ -23,8 +24,16 @@ enum {
PG_RGEN_INTERSECT_SHADOW,
PG_RGEN_INTERSECT_SUBSURFACE,
PG_RGEN_INTERSECT_VOLUME_STACK,
PG_RGEN_SHADE_BACKGROUND,
PG_RGEN_SHADE_LIGHT,
PG_RGEN_SHADE_SURFACE,
PG_RGEN_SHADE_SURFACE_RAYTRACE,
PG_RGEN_SHADE_SURFACE_MNEE,
PG_RGEN_SHADE_VOLUME,
PG_RGEN_SHADE_SHADOW,
PG_RGEN_EVAL_DISPLACE,
PG_RGEN_EVAL_BACKGROUND,
PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
PG_MISS,
PG_HITD, /* Default hit group. */
PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@ -40,14 +49,14 @@ enum {
};
static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
static const int NUM_MIS_PROGRAM_GROUPS = 1;
static const int NUM_MISS_PROGRAM_GROUPS = 1;
static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
static const int NUM_HIT_PROGRAM_GROUPS = 8;
static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;
/* List of OptiX pipelines. */
enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };
/* A single shader binding table entry. */
struct SbtRecord {
@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice {
OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
OptixModule builtin_modules[2] = {};
OptixPipeline pipelines[NUM_PIPELINES] = {};
OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
OptixPipelineCompileOptions pipeline_options = {};
bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
device_only_memory<KernelParamsOptiX> launch_params;
OptixTraversableHandle tlas_handle = 0;
# ifdef WITH_OSL
OSLGlobals osl_globals;
vector<OptixModule> osl_modules;
vector<OptixProgramGroup> osl_groups;
# endif
private:
OptixTraversableHandle tlas_handle = 0;
vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
thread_mutex delayed_free_bvh_mutex;
@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice {
OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
~OptiXDevice();
private:
BVHLayoutMask get_bvh_layout_mask() const override;
string compile_kernel_get_common_cflags(const uint kernel_features);
bool load_kernels(const uint kernel_features) override;
bool load_osl_kernels() override;
bool build_optix_bvh(BVHOptiX *bvh,
OptixBuildOperation operation,
const OptixBuildInput &build_input,
@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice {
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
void *get_cpu_osl_memory() override;
/* --------------------------------------------------------------------
* Denoising.
*/

View File

@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
CUDADeviceQueue::init_execution();
}
static bool is_optix_specific_kernel(DeviceKernel kernel)
static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
# ifdef WITH_OSL
/* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
if (use_osl && device_kernel_has_shading(kernel)) {
return true;
}
# else
(void)use_osl;
# endif
return device_kernel_has_intersection(kernel);
}
bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
const int work_size,
DeviceKernelArguments const &args)
{
if (!is_optix_specific_kernel(kernel)) {
OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
# ifdef WITH_OSL
const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
# else
const bool use_osl = false;
# endif
if (!is_optix_specific_kernel(kernel, use_osl)) {
return CUDADeviceQueue::enqueue(kernel, work_size, args);
}
@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
const CUDAContextScope scope(cuda_device_);
OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sizeof(device_ptr),
cuda_stream_));
if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
cuda_device_assert(
cuda_device_,
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sizeof(device_ptr),
cuda_stream_));
}
if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
cuda_device_assert(cuda_device_,
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
args.values[2], // &d_offset
sizeof(int32_t),
cuda_stream_));
}
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
OptixShaderBindingTable sbt_params = {};
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
pipeline = optix_device->pipelines[PIP_INTERSECT];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr +
PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
break;
default:
LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
<< " is attempted to be enqueued.";
@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
# ifdef WITH_OSL
if (use_osl) {
sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
}
# endif
/* Launch the ray generation program. */
optix_device_assert(optix_device,
optixLaunch(pipeline,

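The new offset upload above relies on a particular argument order for the shader-eval kernels (args.values[2] being the offset). A minimal sketch of what the calling side is assumed to pass; the buffer names are placeholders, and only the kernel enum and the argument order correspond to the code above.

/* Hypothetical caller; only the (input, output, offset, work size) order
 * is what the queue code above depends on. */
DeviceKernelArguments args(&d_input, &d_output, &d_offset, &work_size);
queue->enqueue(DEVICE_KERNEL_SHADER_EVAL_DISPLACE, work_size, args);
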
View File

@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX
device/optix/kernel_shader_raytrace.cu
)
if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
set(SRC_KERNEL_DEVICE_OPTIX
${SRC_KERNEL_DEVICE_OPTIX}
osl/services_optix.cu
device/optix/kernel_osl.cu
)
endif()
set(SRC_KERNEL_DEVICE_ONEAPI
device/oneapi/kernel.cpp
)
@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS
svm/vertex_color.h
)
if(WITH_CYCLES_OSL)
set(SRC_KERNEL_OSL_HEADERS
osl/osl.h
osl/closures_setup.h
osl/closures_template.h
osl/services_gpu.h
osl/types.h
)
endif()
set(SRC_KERNEL_GEOM_HEADERS
geom/geom.h
geom/attribute.h
@ -306,6 +324,7 @@ set(SRC_KERNEL_HEADERS
${SRC_KERNEL_GEOM_HEADERS}
${SRC_KERNEL_INTEGRATOR_HEADERS}
${SRC_KERNEL_LIGHT_HEADERS}
${SRC_KERNEL_OSL_HEADERS}
${SRC_KERNEL_SAMPLE_HEADERS}
${SRC_KERNEL_SVM_HEADERS}
${SRC_KERNEL_TYPES_HEADERS}
@ -708,6 +727,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
kernel_optix_shader_raytrace
"device/optix/kernel_shader_raytrace.cu"
"--keep-device-functions")
if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
CYCLES_OPTIX_KERNEL_ADD(
kernel_optix_osl
"device/optix/kernel_osl.cu"
"--relocatable-device-code=true")
CYCLES_OPTIX_KERNEL_ADD(
kernel_optix_osl_services
"osl/services_optix.cu"
"--relocatable-device-code=true")
endif()
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix)
@ -995,6 +1024,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS})
source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS})
source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS})
source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS})
source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS})
source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS})
@ -1031,6 +1061,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)

View File

@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg,
ccl_private float2 *roughness,
ccl_private float *eta)
{
#ifdef __SVM__
bool refractive = false;
float alpha = 1.0f;
#endif
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
*roughness = one_float2();

View File

@ -30,6 +30,7 @@ typedef unsigned long long uint64_t;
/* Qualifiers */
#define ccl_device __device__ __inline__
#define ccl_device_extern extern "C" __device__
#if __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
# define ccl_device_forceinline __device__ __forceinline__
@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
typedef unsigned short half;
__device__ half __float2half(const float f)
ccl_device_forceinline half __float2half(const float f)
{
half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
return val;
}
__device__ float __half2float(const half h)
ccl_device_forceinline float __half2float(const half h)
{
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));

View File

@ -28,6 +28,7 @@ typedef unsigned long long uint64_t;
/* Qualifiers */
#define ccl_device __device__ __inline__
#define ccl_device_extern extern "C" __device__
#define ccl_device_inline __device__ __inline__
#define ccl_device_forceinline __device__ __forceinline__
#define ccl_device_noinline __device__ __noinline__

View File

@ -38,6 +38,7 @@ using namespace metal::raytracing;
# define ccl_device_noinline ccl_device __attribute__((noinline))
#endif
#define ccl_device_extern extern "C"
#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_global device

View File

@ -28,6 +28,7 @@
/* Qualifier wrappers for different names on different devices */
#define ccl_device
#define ccl_device_extern extern "C"
#define ccl_global
#define ccl_always_inline __attribute__((always_inline))
#define ccl_device_inline inline

View File

@ -33,14 +33,16 @@ typedef unsigned long long uint64_t;
#endif
#define ccl_device \
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
static __device__ \
__forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_extern extern "C" __device__
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_inline_method __device__ __forceinline__
#define ccl_device_noinline static __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_inline_constant __constant__
#define ccl_inline_constant static __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
@ -57,23 +59,6 @@ typedef unsigned long long uint64_t;
#define kernel_assert(cond)
/* GPU thread, block, grid size and index */
#define ccl_gpu_thread_idx_x (threadIdx.x)
#define ccl_gpu_block_dim_x (blockDim.x)
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
/* GPU warp synchronization. */
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
/* GPU texture objects */
typedef unsigned long long CUtexObject;
@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
typedef unsigned short half;
__device__ half __float2half(const float f)
ccl_device_forceinline half __float2half(const float f)
{
half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
return val;
}
__device__ float __half2float(const half h)
ccl_device_forceinline float __half2float(const half h)
{
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));

View File

@ -25,6 +25,7 @@ struct KernelParamsOptiX {
/* Kernel arguments */
const int *path_index_array;
float *render_buffer;
int offset;
/* Global scene data and textures */
KernelData data;
@ -36,7 +37,11 @@ struct KernelParamsOptiX {
};
#ifdef __NVCC__
extern "C" static __constant__ KernelParamsOptiX kernel_params;
extern "C"
# ifndef __CUDACC_RDC__
static
# endif
__constant__ KernelParamsOptiX kernel_params;
#endif
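
The conditional static above matters because the OSL kernels are compiled with --relocatable-device-code=true (see the CMake changes in this commit): presumably the separately compiled OSL callable modules need to refer to the same launch parameter block as the main module, so under RDC the symbol keeps external linkage instead of becoming an internal copy per translation unit. An illustration of the two resulting declarations, written out for clarity:

/* Illustration only.
 * Non-RDC build (__CUDACC_RDC__ undefined), self-contained module:
 *
 *   extern "C" static __constant__ KernelParamsOptiX kernel_params;
 *
 * RDC build (__CUDACC_RDC__ defined, used for the OSL kernels), one
 * externally visible symbol shared across linked modules:
 *
 *   extern "C" __constant__ KernelParamsOptiX kernel_params;
 */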
/* Abstraction macros */

View File

@ -0,0 +1,83 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#define WITH_OSL
/* Copy of the regular OptiX kernels with additional OSL support. */
#include "kernel/device/optix/kernel_shader_raytrace.cu"
#include "kernel/bake/bake.h"
#include "kernel/integrator/shade_background.h"
#include "kernel/integrator/shade_light.h"
#include "kernel/integrator/shade_shadow.h"
#include "kernel/integrator/shade_volume.h"
extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background()
{
const int global_index = optixGetLaunchIndex().x;
const int path_index = (kernel_params.path_index_array) ?
kernel_params.path_index_array[global_index] :
global_index;
integrator_shade_background(nullptr, path_index, kernel_params.render_buffer);
}
extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light()
{
const int global_index = optixGetLaunchIndex().x;
const int path_index = (kernel_params.path_index_array) ?
kernel_params.path_index_array[global_index] :
global_index;
integrator_shade_light(nullptr, path_index, kernel_params.render_buffer);
}
extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface()
{
const int global_index = optixGetLaunchIndex().x;
const int path_index = (kernel_params.path_index_array) ?
kernel_params.path_index_array[global_index] :
global_index;
integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer);
}
extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume()
{
const int global_index = optixGetLaunchIndex().x;
const int path_index = (kernel_params.path_index_array) ?
kernel_params.path_index_array[global_index] :
global_index;
integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer);
}
extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow()
{
const int global_index = optixGetLaunchIndex().x;
const int path_index = (kernel_params.path_index_array) ?
kernel_params.path_index_array[global_index] :
global_index;
integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer);
}
extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace()
{
KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
float *const output = kernel_params.render_buffer;
const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
kernel_displace_evaluate(nullptr, input, output, global_index);
}
extern "C" __global__ void __raygen__kernel_optix_shader_eval_background()
{
KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
float *const output = kernel_params.render_buffer;
const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
kernel_background_evaluate(nullptr, input, output, global_index);
}
extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency()
{
KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
float *const output = kernel_params.render_buffer;
const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index);
}

View File

@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg,
/* this will modify sd->P */
#ifdef __OSL__
if (kg->osl) {
OSLShader::eval_displacement(kg, state, sd);
if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0);
}
else
#endif

View File

@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg,
sd->num_closure_left = max_closures;
#ifdef __OSL__
if (kg->osl) {
if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
OSLShader::eval_background(kg, state, sd, path_flag);
}
else {
OSLShader::eval_surface(kg, state, sd, path_flag);
}
if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag);
}
else
#endif

View File

@ -493,8 +493,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg,
/* evaluate shader */
# ifdef __OSL__
if (kg->osl) {
OSLShader::eval_volume(kg, state, sd, path_flag);
if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag);
}
else
# endif
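
The displacement, surface and volume call sites in this commit all funnel through the same templated entry point. A sketch of its shape, matching the CPU specialization shown later in this diff; the GPU variant lives in kernel/osl/osl.h (not part of this excerpt) and presumably dispatches through the OptiX direct callables registered in load_osl_kernels().

/* Hedged sketch of the shared dispatch interface (CPU-side types shown,
 * as in the specializations below); not copied from the patch. */
template<ShaderType type>
void osl_eval_nodes(const KernelGlobalsCPU *kg,
                    const void *state,
                    ShaderData *sd,
                    uint32_t path_flag);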

View File

@ -25,13 +25,18 @@
#include "kernel/osl/osl.h"
#include "kernel/osl/closures_setup.h"
#define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z)
#define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
CCL_NAMESPACE_BEGIN
static_assert(sizeof(OSLClosure) == sizeof(OSL::ClosureColor) &&
sizeof(OSLClosureAdd) == sizeof(OSL::ClosureAdd) &&
sizeof(OSLClosureMul) == sizeof(OSL::ClosureMul) &&
sizeof(OSLClosureComponent) == sizeof(OSL::ClosureComponent));
static_assert(sizeof(ShaderGlobals) == sizeof(OSL::ShaderGlobals) &&
offsetof(ShaderGlobals, Ci) == offsetof(OSL::ShaderGlobals, Ci));
/* Registration */
#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
@ -60,53 +65,18 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
#include "closures_template.h"
}
/* Globals */
/* Surface & Background */
static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
ShaderData *sd,
const void *state,
uint32_t path_flag,
OSLThreadData *tdata)
template<>
void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
OSL::ShaderGlobals *globals = &tdata->globals;
const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
const differential3 dI = differential_from_compact(sd->I, sd->dI);
/* copy from shader data to shader globals */
globals->P = TO_VEC3(sd->P);
globals->dPdx = TO_VEC3(dP.dx);
globals->dPdy = TO_VEC3(dP.dy);
globals->I = TO_VEC3(sd->I);
globals->dIdx = TO_VEC3(dI.dx);
globals->dIdy = TO_VEC3(dI.dy);
globals->N = TO_VEC3(sd->N);
globals->Ng = TO_VEC3(sd->Ng);
globals->u = sd->u;
globals->dudx = sd->du.dx;
globals->dudy = sd->du.dy;
globals->v = sd->v;
globals->dvdx = sd->dv.dx;
globals->dvdy = sd->dv.dy;
globals->dPdu = TO_VEC3(sd->dPdu);
globals->dPdv = TO_VEC3(sd->dPdv);
globals->surfacearea = 1.0f;
globals->time = sd->time;
/* booleans */
globals->raytype = path_flag;
globals->flipHandedness = 0;
globals->backfacing = (sd->flag & SD_BACKFACING);
/* shader data to be used in services callbacks */
globals->renderstate = sd;
/* hacky, we leave it to services to fetch actual object matrix */
globals->shader2common = sd;
globals->object2common = sd;
/* must be set to NULL before execute */
globals->Ci = NULL;
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
shaderdata_to_shaderglobals(
kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
/* clear trace data */
tdata->tracedata.init = false;
@ -121,53 +91,6 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
sd->osl_path_state = (const IntegratorStateCPU *)state;
sd->osl_shadow_path_state = nullptr;
}
}
static void flatten_closure_tree(const KernelGlobalsCPU *kg,
ShaderData *sd,
uint32_t path_flag,
const OSL::ClosureColor *closure,
float3 weight = make_float3(1.0f, 1.0f, 1.0f))
{
/* OSL gives us a closure tree, we flatten it into arrays per
* closure type, for evaluation, sampling, etc later on. */
switch (closure->id) {
case OSL::ClosureColor::MUL: {
OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
break;
}
case OSL::ClosureColor::ADD: {
OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
flatten_closure_tree(kg, sd, path_flag, add->closureA, weight);
flatten_closure_tree(kg, sd, path_flag, add->closureB, weight);
break;
}
#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
case OSL_CLOSURE_##Upper##_ID: { \
const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \
weight *= TO_FLOAT3(comp->w); \
osl_closure_##lower##_setup( \
kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \
break; \
}
#include "closures_template.h"
default:
break;
}
}
/* Surface */
void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
/* execute shader for this point */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@ -175,101 +98,99 @@ void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
OSL::ShadingContext *octx = tdata->context;
int shader = sd->shader & SHADER_MASK;
/* automatic bump shader */
if (kg->osl->bump_state[shader]) {
/* save state */
const float3 P = sd->P;
const float dP = sd->dP;
const OSL::Vec3 dPdx = globals->dPdx;
const OSL::Vec3 dPdy = globals->dPdy;
if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
/* background */
if (kg->osl->background_state) {
ss->execute(octx, *(kg->osl->background_state), *globals);
}
}
else {
/* automatic bump shader */
if (kg->osl->bump_state[shader]) {
/* save state */
const float3 P = sd->P;
const float dP = sd->dP;
const OSL::Vec3 dPdx = globals->dPdx;
const OSL::Vec3 dPdy = globals->dPdy;
/* set state as if undisplaced */
if (sd->flag & SD_HAS_DISPLACEMENT) {
float data[9];
bool found = kg->osl->services->get_attribute(sd,
true,
OSLRenderServices::u_empty,
TypeDesc::TypeVector,
OSLRenderServices::u_geom_undisplaced,
data);
(void)found;
assert(found);
/* set state as if undisplaced */
if (sd->flag & SD_HAS_DISPLACEMENT) {
float data[9];
bool found = kg->osl->services->get_attribute(sd,
true,
OSLRenderServices::u_empty,
TypeDesc::TypeVector,
OSLRenderServices::u_geom_undisplaced,
data);
(void)found;
assert(found);
differential3 tmp_dP;
memcpy(&sd->P, data, sizeof(float) * 3);
memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
differential3 tmp_dP;
memcpy(&sd->P, data, sizeof(float) * 3);
memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
object_position_transform(kg, sd, &sd->P);
object_dir_transform(kg, sd, &tmp_dP.dx);
object_dir_transform(kg, sd, &tmp_dP.dy);
object_position_transform(kg, sd, &sd->P);
object_dir_transform(kg, sd, &tmp_dP.dx);
object_dir_transform(kg, sd, &tmp_dP.dy);
sd->dP = differential_make_compact(tmp_dP);
sd->dP = differential_make_compact(tmp_dP);
globals->P = TO_VEC3(sd->P);
globals->dPdx = TO_VEC3(tmp_dP.dx);
globals->dPdy = TO_VEC3(tmp_dP.dy);
globals->P = TO_VEC3(sd->P);
globals->dPdx = TO_VEC3(tmp_dP.dx);
globals->dPdy = TO_VEC3(tmp_dP.dy);
}
/* execute bump shader */
ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
/* reset state */
sd->P = P;
sd->dP = dP;
globals->P = TO_VEC3(P);
globals->dPdx = TO_VEC3(dPdx);
globals->dPdy = TO_VEC3(dPdy);
}
/* execute bump shader */
ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
/* reset state */
sd->P = P;
sd->dP = dP;
globals->P = TO_VEC3(P);
globals->dPdx = TO_VEC3(dPdx);
globals->dPdy = TO_VEC3(dPdy);
}
/* surface shader */
if (kg->osl->surface_state[shader]) {
ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
/* surface shader */
if (kg->osl->surface_state[shader]) {
ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
}
}
/* flatten closure tree */
if (globals->Ci) {
flatten_closure_tree(kg, sd, path_flag, globals->Ci);
}
}
/* Background */
void OSLShader::eval_background(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
/* execute shader for this point */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
OSL::ShaderGlobals *globals = &tdata->globals;
OSL::ShadingContext *octx = tdata->context;
if (kg->osl->background_state) {
ss->execute(octx, *(kg->osl->background_state), *globals);
}
/* return background color immediately */
if (globals->Ci) {
flatten_closure_tree(kg, sd, path_flag, globals->Ci);
flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
}
}
/* Volume */
void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag)
template<>
void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
shaderdata_to_shaderglobals(
kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
/* clear trace data */
tdata->tracedata.init = false;
/* Used by render-services. */
sd->osl_globals = kg;
if (path_flag & PATH_RAY_SHADOW) {
sd->osl_path_state = nullptr;
sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
}
else {
sd->osl_path_state = (const IntegratorStateCPU *)state;
sd->osl_shadow_path_state = nullptr;
}
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@ -283,17 +204,30 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
/* flatten closure tree */
if (globals->Ci) {
flatten_closure_tree(kg, sd, path_flag, globals->Ci);
flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
}
}
/* Displacement */
void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
template<>
void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
shaderdata_to_shaderglobals(
kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
/* clear trace data */
tdata->tracedata.init = false;
/* Used by render-services. */
sd->osl_globals = kg;
sd->osl_path_state = (const IntegratorStateCPU *)state;
sd->osl_shadow_path_state = nullptr;
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;

View File

@ -40,12 +40,7 @@ CCL_NAMESPACE_BEGIN
const char *label;
#define OSL_CLOSURE_STRUCT_END(Upper, lower) \
} \
; \
ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \
ccl_private ShaderData *sd, \
uint32_t path_flag, \
float3 weight, \
ccl_private Upper##Closure *closure);
;
#define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name;
#define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size];
@ -210,11 +205,9 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
bsdf->ior = closure->ior;
bsdf->T = closure->T;
static OSL::ustring u_ggx("ggx");
static OSL::ustring u_default("default");
/* GGX */
if (closure->distribution == u_ggx || closure->distribution == u_default) {
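  /* The integer literals are the precomputed ustring hashes of the strings; on GPU
   * devices DeviceString only stores the hash (see kernel/osl/types.h). */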
if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
@ -1000,18 +993,14 @@ ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg,
float3 weight,
ccl_private const BSSRDFClosure *closure)
{
static ustring u_burley("burley");
static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
static ustring u_random_walk("random_walk");
ClosureType type;
if (closure->method == u_burley) {
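  /* As above, compare against precomputed ustring hashes via make_string(). */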
if (closure->method == make_string("burley", 186330084368958868ull)) {
type = CLOSURE_BSSRDF_BURLEY_ID;
}
else if (closure->method == u_random_walk_fixed_radius) {
else if (closure->method == make_string("random_walk_fixed_radius", 5695810351010063150ull)) {
type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
}
else if (closure->method == u_random_walk) {
else if (closure->method == make_string("random_walk", 11360609267673527222ull)) {
type = CLOSURE_BSSRDF_RANDOM_WALK_ID;
}
else {

View File

@ -40,7 +40,7 @@ OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent)
OSL_CLOSURE_STRUCT_END(Transparent, transparent)
OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, DeviceString, distribution, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL)
@ -210,7 +210,7 @@ OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp)
OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp)
OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, DeviceString, method, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL)

View File

@ -1,38 +1,171 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
/* SPDX-License-Identifier: BSD-3-Clause
*
* Adapted from Open Shading Language
* Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
* All Rights Reserved.
*
* Modifications Copyright 2011-2022 Blender Foundation. */
#pragma once
/* OSL Shader Engine
*
* Holds all variables to execute and use OSL shaders from the kernel. These
* are initialized externally by OSLShaderManager before rendering starts.
*
* Before/after a thread starts rendering, thread_init/thread_free must be
* called, which will store any per thread OSL state in thread local storage.
 * This means no thread state needs to be passed along in the kernel itself.
* Holds all variables to execute and use OSL shaders from the kernel.
*/
#include "kernel/osl/types.h"
#include "kernel/osl/closures_setup.h"
CCL_NAMESPACE_BEGIN
class OSLShader {
public:
/* eval */
static void eval_surface(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag);
static void eval_background(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag);
static void eval_volume(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag);
static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
};
ccl_device_inline void shaderdata_to_shaderglobals(KernelGlobals kg,
ccl_private ShaderData *sd,
uint32_t path_flag,
ccl_private ShaderGlobals *globals)
{
const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
const differential3 dI = differential_from_compact(sd->I, sd->dI);
/* copy from shader data to shader globals */
globals->P = sd->P;
globals->dPdx = dP.dx;
globals->dPdy = dP.dy;
globals->I = sd->I;
globals->dIdx = dI.dx;
globals->dIdy = dI.dy;
globals->N = sd->N;
globals->Ng = sd->Ng;
globals->u = sd->u;
globals->dudx = sd->du.dx;
globals->dudy = sd->du.dy;
globals->v = sd->v;
globals->dvdx = sd->dv.dx;
globals->dvdy = sd->dv.dy;
globals->dPdu = sd->dPdu;
globals->dPdv = sd->dPdv;
globals->time = sd->time;
globals->dtime = 1.0f;
globals->surfacearea = 1.0f;
globals->raytype = path_flag;
globals->flipHandedness = 0;
globals->backfacing = (sd->flag & SD_BACKFACING);
/* shader data to be used in services callbacks */
globals->renderstate = sd;
/* hacky, we leave it to services to fetch actual object matrix */
globals->shader2common = sd;
globals->object2common = sd;
/* must be set to NULL before execute */
globals->Ci = nullptr;
}
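/* Flatten the OSL closure tree into Cycles' closure array by walking it iteratively
 * with a small explicit stack (GPU friendly, no recursion): MUL nodes scale the running
 * weight, ADD nodes push one branch and continue with the other, and leaf components
 * are handed to the per-closure setup functions generated from closures_template.h. */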
ccl_device void flatten_closure_tree(KernelGlobals kg,
ccl_private ShaderData *sd,
uint32_t path_flag,
ccl_private const OSLClosure *closure)
{
int stack_size = 0;
float3 weight = one_float3();
float3 weight_stack[16];
ccl_private const OSLClosure *closure_stack[16];
while (closure) {
switch (closure->id) {
case OSL_CLOSURE_MUL_ID: {
ccl_private const OSLClosureMul *mul = static_cast<ccl_private const OSLClosureMul *>(
closure);
weight *= mul->weight;
closure = mul->closure;
continue;
}
case OSL_CLOSURE_ADD_ID: {
if (stack_size >= 16) {
kernel_assert(!"Exhausted OSL closure stack");
break;
}
ccl_private const OSLClosureAdd *add = static_cast<ccl_private const OSLClosureAdd *>(
closure);
closure = add->closureA;
weight_stack[stack_size] = weight;
closure_stack[stack_size++] = add->closureB;
continue;
}
#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
case OSL_CLOSURE_##Upper##_ID: { \
ccl_private const OSLClosureComponent *comp = \
static_cast<ccl_private const OSLClosureComponent *>(closure); \
osl_closure_##lower##_setup(kg, \
sd, \
path_flag, \
weight * comp->weight, \
reinterpret_cast<ccl_private const Upper##Closure *>(comp + 1)); \
break; \
}
#include "closures_template.h"
default:
break;
}
if (stack_size > 0) {
weight = weight_stack[--stack_size];
closure = closure_stack[stack_size];
}
else {
closure = nullptr;
}
}
}
#ifndef __KERNEL_GPU__
template<ShaderType type>
void osl_eval_nodes(const KernelGlobalsCPU *kg,
const void *state,
ShaderData *sd,
uint32_t path_flag);
#else
template<ShaderType type, typename ConstIntegratorGenericState>
ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
ConstIntegratorGenericState state,
ccl_private ShaderData *sd,
uint32_t path_flag)
{
ShaderGlobals globals;
shaderdata_to_shaderglobals(kg, sd, path_flag, &globals);
const int shader = sd->shader & SHADER_MASK;
# ifdef __KERNEL_OPTIX__
uint8_t group_data[2048];
uint8_t closure_pool[1024];
sd->osl_closure_pool = closure_pool;
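  /* Per-thread scratch: group data for the OSL shader group, plus a fixed pool from
   * which closure nodes are allocated while the shader runs (via sd->osl_closure_pool). */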
unsigned int optix_dc_index = 2 /* NUM_CALLABLE_PROGRAM_GROUPS */ +
(shader + type * kernel_data.max_shaders) * 2;
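  /* Each compiled shader group provides two consecutive direct callables (presumably
   * the group init function followed by the group entry), hence the stride of two per
   * shader and the two calls below; the first two callable slots are reserved for
   * Cycles' own program groups (NUM_CALLABLE_PROGRAM_GROUPS). */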
optixDirectCall<void>(optix_dc_index + 0,
/* shaderglobals_ptr = */ &globals,
/* groupdata_ptr = */ (void *)group_data,
/* userdata_base_ptr = */ (void *)nullptr,
/* output_base_ptr = */ (void *)nullptr,
/* shadeindex = */ 0);
optixDirectCall<void>(optix_dc_index + 1,
/* shaderglobals_ptr = */ &globals,
/* groupdata_ptr = */ (void *)group_data,
/* userdata_base_ptr = */ (void *)nullptr,
/* output_base_ptr = */ (void *)nullptr,
/* shadeindex = */ 0);
# endif
if (globals.Ci) {
flatten_closure_tree(kg, sd, path_flag, globals.Ci);
}
}
#endif
CCL_NAMESPACE_END

View File

@ -119,8 +119,8 @@ ustring OSLRenderServices::u_u("u");
ustring OSLRenderServices::u_v("v");
ustring OSLRenderServices::u_empty;
OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
: OSL::RendererServices(texture_system)
OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system, int device_type)
: OSL::RendererServices(texture_system), device_type_(device_type)
{
}
@ -131,6 +131,17 @@ OSLRenderServices::~OSLRenderServices()
}
}
int OSLRenderServices::supports(string_view feature) const
{
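  /* OSL queries this to decide whether its OptiX back-end may be used with this renderer. */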
#ifdef WITH_OPTIX
if (feature == "OptiX") {
return device_type_ == DEVICE_OPTIX;
}
#endif
return false;
}
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
OSL::Matrix44 &result,
OSL::TransformationPtr xform,
@ -1139,29 +1150,39 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file
{
OSLTextureHandleMap::iterator it = textures.find(filename);
/* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
if (it != textures.end()) {
if (it->second->type != OSLTextureHandle::OIIO) {
return (TextureSystem::TextureHandle *)it->second.get();
if (device_type_ == DEVICE_CPU) {
/* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
if (it != textures.end()) {
if (it->second->type != OSLTextureHandle::OIIO) {
return (TextureSystem::TextureHandle *)it->second.get();
}
}
/* Get handle from OpenImageIO. */
OSL::TextureSystem *ts = m_texturesys;
TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
if (handle == NULL) {
return NULL;
}
/* Insert new OSLTextureHandle if needed. */
if (it == textures.end()) {
textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
it = textures.find(filename);
}
/* Assign OIIO texture handle and return. */
it->second->oiio_handle = handle;
return (TextureSystem::TextureHandle *)it->second.get();
}
else {
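    /* GPU devices cannot use OIIO for texturing, so for single-slot SVM textures encode
     * the slot index (offset by one so a null handle stays distinguishable) directly into
     * the returned handle. */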
    if (it != textures.end() && it->second->type == OSLTextureHandle::SVM &&
        it->second->svm_slots[0].w == -1) {
return reinterpret_cast<TextureSystem::TextureHandle *>(
static_cast<uintptr_t>(it->second->svm_slots[0].y + 1));
}
}
/* Get handle from OpenImageIO. */
OSL::TextureSystem *ts = m_texturesys;
TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
if (handle == NULL) {
return NULL;
}
/* Insert new OSLTextureHandle if needed. */
if (it == textures.end()) {
textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
it = textures.find(filename);
}
/* Assign OIIO texture handle and return. */
it->second->oiio_handle = handle;
return (TextureSystem::TextureHandle *)it->second.get();
}
bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)

View File

@ -22,11 +22,8 @@ class PtexCache;
CCL_NAMESPACE_BEGIN
class Object;
class Scene;
class Shader;
struct ShaderData;
struct float3;
struct KernelGlobalsCPU;
/* OSL Texture Handle
@ -73,11 +70,13 @@ typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash
class OSLRenderServices : public OSL::RendererServices {
public:
OSLRenderServices(OSL::TextureSystem *texture_system);
OSLRenderServices(OSL::TextureSystem *texture_system, int device_type);
~OSLRenderServices();
static void register_closures(OSL::ShadingSystem *ss);
int supports(string_view feature) const override;
bool get_matrix(OSL::ShaderGlobals *sg,
OSL::Matrix44 &result,
OSL::TransformationPtr xform,
@ -324,6 +323,9 @@ class OSLRenderServices : public OSL::RendererServices {
* and is required because texture handles are cached as part of the shared
* shading system. */
OSLTextureHandleMap textures;
private:
int device_type_;
};
CCL_NAMESPACE_END

File diff suppressed because it is too large

View File

@ -0,0 +1,17 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#define WITH_OSL
// clang-format off
#include "kernel/device/optix/compat.h"
#include "kernel/device/optix/globals.h"
#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
#include "kernel/osl/services_gpu.h"
// clang-format on
extern "C" __device__ void __direct_callable__dummy_services()
{
}

View File

@ -5,9 +5,53 @@
CCL_NAMESPACE_BEGIN
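/* Device-friendly replacement for OSL's ustring: on the GPU only the 64-bit hash is kept,
 * on the CPU it wraps an actual ustring when the OIIO headers are available and falls back
 * to a plain C string pointer otherwise. */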
struct DeviceString {
#if defined(__KERNEL_GPU__)
/* Strings are represented by their hashes in CUDA and OptiX. */
size_t str_;
ccl_device_inline_method uint64_t hash() const
{
return str_;
}
#elif defined(OPENIMAGEIO_USTRING_H)
ustring str_;
ccl_device_inline_method uint64_t hash() const
{
return str_.hash();
}
#else
const char *str_;
#endif
ccl_device_inline_method bool operator==(DeviceString b) const
{
return str_ == b.str_;
}
ccl_device_inline_method bool operator!=(DeviceString b) const
{
return str_ != b.str_;
}
};
ccl_device_inline DeviceString make_string(const char *str, size_t hash)
{
#if defined(__KERNEL_GPU__)
(void)str;
return {hash};
#elif defined(OPENIMAGEIO_USTRING_H)
(void)hash;
return {ustring(str)};
#else
(void)hash;
return {str};
#endif
}
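/* Usage sketch: call sites pass both the literal and its precomputed ustring hash so the
 * same comparison compiles for CPU and GPU, e.g.
 *   closure->distribution == make_string("ggx", 11253504724482777663ull). */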
/* Closure */
enum ClosureTypeOSL {
enum OSLClosureType {
OSL_CLOSURE_MUL_ID = -1,
OSL_CLOSURE_ADD_ID = -2,
@ -17,4 +61,60 @@ enum ClosureTypeOSL {
#include "closures_template.h"
};
struct OSLClosure {
OSLClosureType id;
};
struct ccl_align(8) OSLClosureMul : public OSLClosure
{
packed_float3 weight;
ccl_private const OSLClosure *closure;
};
struct ccl_align(8) OSLClosureAdd : public OSLClosure
{
ccl_private const OSLClosure *closureA;
ccl_private const OSLClosure *closureB;
};
struct ccl_align(8) OSLClosureComponent : public OSLClosure
{
packed_float3 weight;
};
/* Globals */
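/* Mirror of OSL::ShaderGlobals; osl.cpp contains static_asserts checking that the size
 * and the offset of Ci stay in sync. */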
struct ShaderGlobals {
packed_float3 P, dPdx, dPdy;
packed_float3 dPdz;
packed_float3 I, dIdx, dIdy;
packed_float3 N;
packed_float3 Ng;
float u, dudx, dudy;
float v, dvdx, dvdy;
packed_float3 dPdu, dPdv;
float time;
float dtime;
packed_float3 dPdtime;
packed_float3 Ps, dPsdx, dPsdy;
ccl_private void *renderstate;
ccl_private void *tracedata;
ccl_private void *objdata;
void *context;
void *renderer;
ccl_private void *object2common;
ccl_private void *shader2common;
ccl_private OSLClosure *Ci;
float surfacearea;
int raytype;
int flipHandedness;
int backfacing;
};
struct OSLNoiseOptions {
};
struct OSLTextureOptions {
};
CCL_NAMESPACE_END

View File

@ -75,10 +75,14 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__
/* Device specific features */
#ifndef __KERNEL_GPU__
# ifdef WITH_OSL
# define __OSL__
#ifdef WITH_OSL
# define __OSL__
# ifdef __KERNEL_OPTIX__
/* Kernels with OSL support are built separately in OptiX and don't need SVM. */
# undef __SVM__
# endif
#endif
#ifndef __KERNEL_GPU__
# ifdef WITH_PATH_GUIDING
# define __PATH_GUIDING__
# endif
@ -917,9 +921,13 @@ typedef struct ccl_align(16) ShaderData
float ray_dP;
#ifdef __OSL__
# ifdef __KERNEL_GPU__
ccl_private uint8_t *osl_closure_pool;
# else
const struct KernelGlobalsCPU *osl_globals;
const struct IntegratorStateCPU *osl_path_state;
const struct IntegratorShadowStateCPU *osl_shadow_path_state;
# endif
#endif
/* LCG state for closures that require additional random numbers. */
@ -1529,6 +1537,9 @@ enum KernelFeatureFlag : uint32_t {
/* Path guiding. */
KERNEL_FEATURE_PATH_GUIDING = (1U << 26U),
/* OSL. */
KERNEL_FEATURE_OSL = (1U << 27U),
};
/* Shader node feature mask, to specialize shader evaluation for kernels. */

View File

@ -38,16 +38,17 @@ OSL::TextureSystem *OSLShaderManager::ts_shared = NULL;
int OSLShaderManager::ts_shared_users = 0;
thread_mutex OSLShaderManager::ts_shared_mutex;
OSL::ShadingSystem *OSLShaderManager::ss_shared = NULL;
OSLRenderServices *OSLShaderManager::services_shared = NULL;
OSL::ErrorHandler OSLShaderManager::errhandler;
map<int, OSL::ShadingSystem *> OSLShaderManager::ss_shared;
int OSLShaderManager::ss_shared_users = 0;
thread_mutex OSLShaderManager::ss_shared_mutex;
thread_mutex OSLShaderManager::ss_mutex;
int OSLCompiler::texture_shared_unique_id = 0;
/* Shader Manager */
OSLShaderManager::OSLShaderManager()
OSLShaderManager::OSLShaderManager(Device *device) : device_(device)
{
texture_system_init();
shading_system_init();
@ -107,11 +108,12 @@ void OSLShaderManager::device_update_specific(Device *device,
device_free(device, dscene, scene);
/* set texture system */
scene->image_manager->set_osl_texture_system((void *)ts);
/* set texture system (only on CPU devices, since GPU devices cannot use OIIO) */
if (device->info.type == DEVICE_CPU) {
scene->image_manager->set_osl_texture_system((void *)ts_shared);
}
/* create shaders */
OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
Shader *background_shader = scene->background->get_shader(scene);
foreach (Shader *shader, scene->shaders) {
@ -125,22 +127,34 @@ void OSLShaderManager::device_update_specific(Device *device,
* compile shaders alternating */
thread_scoped_lock lock(ss_mutex);
OSLCompiler compiler(this, services, ss, scene);
compiler.background = (shader == background_shader);
compiler.compile(og, shader);
device->foreach_device(
[this, scene, shader, background = (shader == background_shader)](Device *sub_device) {
OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
OSLCompiler compiler(this, ss, scene);
compiler.background = background;
compiler.compile(og, shader);
});
if (shader->get_use_mis() && shader->has_surface_emission)
scene->light_manager->tag_update(scene, LightManager::SHADER_COMPILED);
}
/* setup shader engine */
og->ss = ss;
og->ts = ts;
og->services = services;
int background_id = scene->shader_manager->get_shader_id(background_shader);
og->background_state = og->surface_state[background_id & SHADER_MASK];
og->use = true;
device->foreach_device([background_id](Device *sub_device) {
OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
og->ss = ss;
og->ts = ts_shared;
og->services = static_cast<OSLRenderServices *>(ss->renderer());
og->background_state = og->surface_state[background_id & SHADER_MASK];
og->use = true;
});
foreach (Shader *shader, scene->shaders)
shader->clear_modified();
@ -148,8 +162,12 @@ void OSLShaderManager::device_update_specific(Device *device,
update_flags = UPDATE_NONE;
/* add special builtin texture types */
services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
for (const auto &[device_type, ss] : ss_shared) {
OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
}
device_update_common(device, dscene, scene, progress);
@ -166,26 +184,35 @@ void OSLShaderManager::device_update_specific(Device *device,
* is being freed after the Session is freed.
*/
thread_scoped_lock lock(ss_shared_mutex);
ss->optimize_all_groups();
for (const auto &[device_type, ss] : ss_shared) {
ss->optimize_all_groups();
}
}
/* load kernels */
if (!device->load_osl_kernels()) {
progress.set_error(device->error_message());
}
}
void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
{
OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
device_free_common(device, dscene, scene);
/* clear shader engine */
og->use = false;
og->ss = NULL;
og->ts = NULL;
device->foreach_device([](Device *sub_device) {
OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
og->surface_state.clear();
og->volume_state.clear();
og->displacement_state.clear();
og->bump_state.clear();
og->background_state.reset();
og->use = false;
og->ss = NULL;
og->ts = NULL;
og->surface_state.clear();
og->volume_state.clear();
og->displacement_state.clear();
og->bump_state.clear();
og->background_state.reset();
});
}
void OSLShaderManager::texture_system_init()
@ -193,7 +220,7 @@ void OSLShaderManager::texture_system_init()
/* create texture system, shared between different renders to reduce memory usage */
thread_scoped_lock lock(ts_shared_mutex);
if (ts_shared_users == 0) {
if (ts_shared_users++ == 0) {
ts_shared = TextureSystem::create(true);
ts_shared->attribute("automip", 1);
@ -203,24 +230,18 @@ void OSLShaderManager::texture_system_init()
/* effectively unlimited for now, until we support proper mipmap lookups */
ts_shared->attribute("max_memory_MB", 16384);
}
ts = ts_shared;
ts_shared_users++;
}
void OSLShaderManager::texture_system_free()
{
/* shared texture system decrease users and destroy if no longer used */
thread_scoped_lock lock(ts_shared_mutex);
ts_shared_users--;
if (ts_shared_users == 0) {
if (--ts_shared_users == 0) {
ts_shared->invalidate_all(true);
OSL::TextureSystem::destroy(ts_shared);
ts_shared = NULL;
}
ts = NULL;
}
void OSLShaderManager::shading_system_init()
@ -228,101 +249,105 @@ void OSLShaderManager::shading_system_init()
/* create shading system, shared between different renders to reduce memory usage */
thread_scoped_lock lock(ss_shared_mutex);
if (ss_shared_users == 0) {
/* Must use aligned new due to concurrent hash map. */
services_shared = util_aligned_new<OSLRenderServices>(ts_shared);
device_->foreach_device([](Device *sub_device) {
const DeviceType device_type = sub_device->info.type;
string shader_path = path_get("shader");
if (ss_shared_users++ == 0 || ss_shared.find(device_type) == ss_shared.end()) {
/* Must use aligned new due to concurrent hash map. */
OSLRenderServices *services = util_aligned_new<OSLRenderServices>(ts_shared, device_type);
string shader_path = path_get("shader");
# ifdef _WIN32
/* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
* operate with file paths with any character. This requires to use wide
* char functions, but OSL uses old fashioned ANSI functions which means:
*
* - We have to convert our paths to ANSI before passing to OSL
* - OSL can't be used when there's a multi-byte character in the path
* to the shaders folder.
*/
shader_path = string_to_ansi(shader_path);
    /* Annoying thing: Cycles stores paths in the UTF-8 codepage so it can
     * operate on file paths containing any character. This requires using wide-
     * char functions, but OSL uses old-fashioned ANSI functions, which means:
*
* - We have to convert our paths to ANSI before passing to OSL
* - OSL can't be used when there's a multi-byte character in the path
* to the shaders folder.
*/
shader_path = string_to_ansi(shader_path);
# endif
ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler);
ss_shared->attribute("lockgeom", 1);
ss_shared->attribute("commonspace", "world");
ss_shared->attribute("searchpath:shader", shader_path);
ss_shared->attribute("greedyjit", 1);
OSL::ShadingSystem *ss = new OSL::ShadingSystem(services, ts_shared, &errhandler);
ss->attribute("lockgeom", 1);
ss->attribute("commonspace", "world");
ss->attribute("searchpath:shader", shader_path);
ss->attribute("greedyjit", 1);
VLOG_INFO << "Using shader search path: " << shader_path;
VLOG_INFO << "Using shader search path: " << shader_path;
/* our own ray types */
static const char *raytypes[] = {
"camera", /* PATH_RAY_CAMERA */
"reflection", /* PATH_RAY_REFLECT */
"refraction", /* PATH_RAY_TRANSMIT */
"diffuse", /* PATH_RAY_DIFFUSE */
"glossy", /* PATH_RAY_GLOSSY */
"singular", /* PATH_RAY_SINGULAR */
"transparent", /* PATH_RAY_TRANSPARENT */
"volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
/* our own ray types */
static const char *raytypes[] = {
"camera", /* PATH_RAY_CAMERA */
"reflection", /* PATH_RAY_REFLECT */
"refraction", /* PATH_RAY_TRANSMIT */
"diffuse", /* PATH_RAY_DIFFUSE */
"glossy", /* PATH_RAY_GLOSSY */
"singular", /* PATH_RAY_SINGULAR */
"transparent", /* PATH_RAY_TRANSPARENT */
"volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
"shadow", /* PATH_RAY_SHADOW_OPAQUE */
"shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
"shadow", /* PATH_RAY_SHADOW_OPAQUE */
"shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
"__unused__", /* PATH_RAY_NODE_UNALIGNED */
"__unused__", /* PATH_RAY_MIS_SKIP */
"__unused__", /* PATH_RAY_NODE_UNALIGNED */
"__unused__", /* PATH_RAY_MIS_SKIP */
"diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
"diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
/* Remaining irrelevant bits up to 32. */
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
};
/* Remaining irrelevant bits up to 32. */
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
"__unused__",
};
const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
ss_shared->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
ss->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
OSLRenderServices::register_closures(ss_shared);
OSLRenderServices::register_closures(ss);
loaded_shaders.clear();
}
ss_shared[device_type] = ss;
}
});
ss = ss_shared;
services = services_shared;
ss_shared_users++;
loaded_shaders.clear();
}
void OSLShaderManager::shading_system_free()
{
/* shared shading system decrease users and destroy if no longer used */
thread_scoped_lock lock(ss_shared_mutex);
ss_shared_users--;
if (ss_shared_users == 0) {
delete ss_shared;
ss_shared = NULL;
device_->foreach_device([](Device * /*sub_device*/) {
if (--ss_shared_users == 0) {
for (const auto &[device_type, ss] : ss_shared) {
OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
util_aligned_delete(services_shared);
services_shared = NULL;
}
delete ss;
ss = NULL;
services = NULL;
util_aligned_delete(services);
}
ss_shared.clear();
}
});
}
bool OSLShaderManager::osl_compile(const string &inputfile, const string &outputfile)
@ -447,7 +472,9 @@ const char *OSLShaderManager::shader_load_filepath(string filepath)
const char *OSLShaderManager::shader_load_bytecode(const string &hash, const string &bytecode)
{
ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
for (const auto &[device_type, ss] : ss_shared) {
ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
}
OSLShaderInfo info;
@ -599,11 +626,11 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
/* Graph Compiler */
OSLCompiler::OSLCompiler(OSLShaderManager *manager,
OSLRenderServices *services,
OSL::ShadingSystem *ss,
Scene *scene)
: scene(scene), manager(manager), services(services), ss(ss)
OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
: scene(scene),
manager(manager),
services(static_cast<OSLRenderServices *>(ss->renderer())),
ss(ss)
{
current_type = SHADER_TYPE_SURFACE;
current_shader = NULL;
@ -1105,7 +1132,12 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph
{
current_type = type;
OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str());
string name = shader->name.string();
/* Replace invalid characters. */
for (size_t i; (i = name.find_first_of(" .,:;+-*/#")) != string::npos;)
name.replace(i, 1, "_");
OSL::ShaderGroupRef group = ss->ShaderGroupBegin(name);
ShaderNode *output = graph->output();
ShaderNodeSet dependencies;

View File

@ -54,7 +54,7 @@ struct OSLShaderInfo {
class OSLShaderManager : public ShaderManager {
public:
OSLShaderManager();
OSLShaderManager(Device *device);
~OSLShaderManager();
static void free_memory();
@ -92,25 +92,22 @@ class OSLShaderManager : public ShaderManager {
const std::string &bytecode_hash = "",
const std::string &bytecode = "");
protected:
private:
void texture_system_init();
void texture_system_free();
void shading_system_init();
void shading_system_free();
OSL::ShadingSystem *ss;
OSL::TextureSystem *ts;
OSLRenderServices *services;
OSL::ErrorHandler errhandler;
Device *device_;
map<string, OSLShaderInfo> loaded_shaders;
static OSL::TextureSystem *ts_shared;
static thread_mutex ts_shared_mutex;
static int ts_shared_users;
static OSL::ShadingSystem *ss_shared;
static OSLRenderServices *services_shared;
static OSL::ErrorHandler errhandler;
static map<int, OSL::ShadingSystem *> ss_shared;
static thread_mutex ss_shared_mutex;
static thread_mutex ss_mutex;
static int ss_shared_users;
@ -123,10 +120,7 @@ class OSLShaderManager : public ShaderManager {
class OSLCompiler {
public:
#ifdef WITH_OSL
OSLCompiler(OSLShaderManager *manager,
OSLRenderServices *services,
OSL::ShadingSystem *shadingsys,
Scene *scene);
OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *shadingsys, Scene *scene);
#endif
void compile(OSLGlobals *og, Shader *shader);

View File

@ -99,11 +99,8 @@ Scene::Scene(const SceneParams &params_, Device *device)
{
memset((void *)&dscene.data, 0, sizeof(dscene.data));
/* OSL only works on the CPU */
if (device->info.has_osl)
shader_manager = ShaderManager::create(params.shadingsystem);
else
shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
shader_manager = ShaderManager::create(
device->info.has_osl ? params.shadingsystem : SHADINGSYSTEM_SVM, device);
light_manager = new LightManager();
geometry_manager = new GeometryManager();

View File

@ -395,15 +395,16 @@ ShaderManager::~ShaderManager()
{
}
ShaderManager *ShaderManager::create(int shadingsystem)
ShaderManager *ShaderManager::create(int shadingsystem, Device *device)
{
ShaderManager *manager;
(void)shadingsystem; /* Ignored when built without OSL. */
(void)device;
#ifdef WITH_OSL
if (shadingsystem == SHADINGSYSTEM_OSL) {
manager = new OSLShaderManager();
manager = new OSLShaderManager(device);
}
else
#endif
@ -722,6 +723,10 @@ uint ShaderManager::get_kernel_features(Scene *scene)
}
}
if (use_osl()) {
kernel_features |= KERNEL_FEATURE_OSL;
}
return kernel_features;
}

View File

@ -170,7 +170,7 @@ class ShaderManager {
UPDATE_NONE = 0u,
};
static ShaderManager *create(int shadingsystem);
static ShaderManager *create(int shadingsystem, Device *device);
virtual ~ShaderManager();
virtual void reset(Scene *scene) = 0;

View File

@ -1542,6 +1542,10 @@ class OSLNode final : public ShaderNode {
{
return true;
}
virtual int get_feature()
{
return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_RAYTRACE;
}
virtual bool equals(const ShaderNode & /*other*/)
{

View File

@ -23,6 +23,7 @@
/* Leave inlining decisions to compiler for these, the inline keyword here
* is not about performance but including function definitions in headers. */
# define ccl_device static inline
# define ccl_device_extern extern "C"
# define ccl_device_noinline static inline
# define ccl_device_noinline_cpu ccl_device_noinline

View File

@ -196,14 +196,7 @@ ccl_device_inline Transform make_transform_frame(float3 N)
return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, N.x, N.y, N.z, 0.0f);
}
#ifndef __KERNEL_GPU__
ccl_device_inline Transform transform_zero()
{
Transform zero = {zero_float4(), zero_float4(), zero_float4()};
return zero;
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline Transform operator*(const Transform a, const Transform b)
{
float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f);
@ -218,6 +211,15 @@ ccl_device_inline Transform operator*(const Transform a, const Transform b)
return t;
}
#endif
#ifndef __KERNEL_GPU__
ccl_device_inline Transform transform_zero()
{
Transform zero = {zero_float4(), zero_float4(), zero_float4()};
return zero;
}
ccl_device_inline void print_transform(const char *label, const Transform &t)
{