Cycles: Add support for native OptiX curve primitive

This patch adds support for the curve primitive from OptiX to Cycles. It's currently hidden behind a debug option, since there can be some slight rendering differences still (because no backface culling is performed and something seems off with endcaps). The curve primitive was added with the OptiX 7.1 SDK and requires a r450 driver or newer, so this also updates the codebase to be able to build with the new SDK. Reviewed By: brecht Differential Revision: https://developer.blender.org/D8223
2020-07-06 12:25:54 +02:00 · 2020-07-06 12:25:54 +02:00 · 737bd549b6
parent 95f0f31279
commit 737bd549b6
9 changed files with 303 additions and 84 deletions
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@ -830,6 +830,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
    debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)

    debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
+    debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)

    debug_opencl_kernel_type: EnumProperty(
        name="OpenCL Kernel Type",
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@ -2031,6 +2031,7 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
        col = layout.column()
        col.label(text="OptiX Flags:")
        col.prop(cscene, "debug_optix_cuda_streams")
+        col.prop(cscene, "debug_optix_curves_api")

        col.separator()

--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@ -92,6 +92,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
  flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
  /* Synchronize OptiX flags. */
  flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
+  flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
  /* Synchronize OpenCL device type. */
  switch (get_enum(cscene, "debug_opencl_device_type")) {
    case 0:
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@ -131,8 +131,12 @@ class OptiXDevice : public CUDADevice {
    PG_RGEN,
    PG_MISS,
    PG_HITD,  // Default hit group
-    PG_HITL,  // __BVH_LOCAL__ hit group
    PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
+    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
+#  if OPTIX_ABI_VERSION >= 36
+    PG_HITD_MOTION,
+    PG_HITS_MOTION,
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
    PG_EXCP,
 #  endif
@ -177,6 +181,7 @@ class OptiXDevice : public CUDADevice {
  OptixDeviceContext context = NULL;

  OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
+  OptixModule builtin_modules[2] = {};
  OptixPipeline pipelines[NUM_PIPELINES] = {};

  bool motion_blur = false;
@ -264,6 +269,9 @@ class OptiXDevice : public CUDADevice {
    // Unload modules
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
+    for (unsigned int i = 0; i < 2; ++i)
+      if (builtin_modules[i] != NULL)
+        optixModuleDestroy(builtin_modules[i]);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);
@ -338,6 +346,12 @@ class OptiXDevice : public CUDADevice {
      optixModuleDestroy(optix_module);
      optix_module = NULL;
    }
+    for (unsigned int i = 0; i < 2; ++i) {
+      if (builtin_modules[i] != NULL) {
+        optixModuleDestroy(builtin_modules[i]);
+        builtin_modules[i] = NULL;
+      }
+    }
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
      if (pipelines[i] != NULL) {
        optixPipelineDestroy(pipelines[i]);
@ -369,6 +383,18 @@ class OptiXDevice : public CUDADevice {
 #  endif
    pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h

+#  if OPTIX_ABI_VERSION >= 36
+    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+    if (requested_features.use_hair) {
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+      }
+      else {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+      }
+    }
+#  endif
+
    // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    // This is necessary since objects may be reported to have motion if the Vector pass is
    // active, but may still need to be rendered without motion blur if that isn't active as well
@ -442,6 +468,34 @@ class OptiXDevice : public CUDADevice {
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
      }
+
+#  if OPTIX_ABI_VERSION >= 36
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        OptixBuiltinISOptions builtin_options;
+        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+        builtin_options.usesMotionBlur = false;
+
+        check_result_optix_ret(optixBuiltinISModuleGet(
+            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+        if (motion_blur) {
+          builtin_options.usesMotionBlur = true;
+
+          check_result_optix_ret(optixBuiltinISModuleGet(
+              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+        }
+      }
+#  endif
    }

    if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
@ -493,8 +547,14 @@ class OptiXDevice : public CUDADevice {
    unsigned int trace_css = stack_size[PG_HITD].cssCH;
    // This is based on the maximum of closest-hit and any-hit/intersection programs
    trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
-    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+#  if OPTIX_ABI_VERSION >= 36
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+#  endif

    OptixPipelineLinkOptions link_options;
    link_options.maxTraceDepth = 1;
@ -503,17 +563,23 @@ class OptiXDevice : public CUDADevice {
 #  else
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
 #  endif
-    link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;
+#  if OPTIX_ABI_VERSION < 24
+    link_options.overrideUsesMotionBlur = motion_blur;
+#  endif

    {  // Create path tracing pipeline
      OptixProgramGroup pipeline_groups[] = {
-          groups[PG_RGEN],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_RGEN],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
      };
      check_result_optix_ret(
@ -530,8 +596,8 @@ class OptiXDevice : public CUDADevice {
      const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;

      // Set stack size depending on pipeline options
-      check_result_optix_ret(optixPipelineSetStackSize(
-          pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+      check_result_optix_ret(
+          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2)));
    }

    // Only need to create shader evaluation pipeline if one of these features is used:
@ -541,15 +607,19 @@ class OptiXDevice : public CUDADevice {

    if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
      OptixProgramGroup pipeline_groups[] = {
-          groups[PG_BAKE],
-          groups[PG_DISP],
-          groups[PG_BACK],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_BAKE],
+        groups[PG_DISP],
+        groups[PG_BACK],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
      };
      check_result_optix_ret(
@ -672,7 +742,11 @@ class OptiXDevice : public CUDADevice {
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif

      // Launch the ray generation program
      check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
@ -836,7 +910,9 @@ class OptiXDevice : public CUDADevice {
        assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
        denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
            OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1));
+#  if OPTIX_ABI_VERSION < 28
        denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
+#  endif
        check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
        check_result_optix_ret(
            optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
@ -849,7 +925,11 @@ class OptiXDevice : public CUDADevice {
      check_result_optix_ret(
          optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));

+#  if OPTIX_ABI_VERSION < 28
      const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
+#  else
+      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
+#  endif
      const size_t scratch_offset = sizes.stateSizeInBytes;

      // Allocate denoiser state if tile size has changed since last setup
@ -993,7 +1073,11 @@ class OptiXDevice : public CUDADevice {
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif

      check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                     cuda_stream[thread_index],
@ -1070,7 +1154,7 @@ class OptiXDevice : public CUDADevice {
                                           &build_input,
                                           1,
                                           temp_mem.device_pointer,
-                                           temp_mem.device_size,
+                                           sizes.tempSizeInBytes,
                                           out_data,
                                           sizes.outputSizeInBytes,
                                           &out_handle,
@ -1142,7 +1226,6 @@ class OptiXDevice : public CUDADevice {
          continue;
        }

-        const size_t num_curves = hair->num_curves();
        const size_t num_segments = hair->num_segments();

        size_t num_motion_steps = 1;
@ -1152,7 +1235,18 @@ class OptiXDevice : public CUDADevice {
        }

        device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
-        aabb_data.alloc(num_segments * num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+        device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+        // Four control points for each curve segment
+        const size_t num_vertices = num_segments * 4;
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          index_data.alloc(num_segments);
+          vertex_data.alloc(num_vertices * num_motion_steps);
+        }
+        else
+#  endif
+          aabb_data.alloc(num_segments * num_motion_steps);

        // Get AABBs for each motion step
        for (size_t step = 0; step < num_motion_steps; ++step) {
@ -1165,44 +1259,127 @@ class OptiXDevice : public CUDADevice {
            keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size();
          }

-          size_t i = step * num_segments;
-          for (size_t j = 0; j < num_curves; ++j) {
-            const Hair::Curve c = hair->get_curve(j);
+          for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+            const Hair::Curve curve = hair->get_curve(j);

-            for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
-              BoundBox bounds = BoundBox::empty;
-              c.bounds_grow(k, keys, hair->curve_radius.data(), bounds);
+            for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+#  if OPTIX_ABI_VERSION >= 36
+              if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+                int k0 = curve.first_key + segment;
+                int k1 = k0 + 1;
+                int ka = max(k0 - 1, curve.first_key);
+                int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);

-              aabb_data[i].minX = bounds.min.x;
-              aabb_data[i].minY = bounds.min.y;
-              aabb_data[i].minZ = bounds.min.z;
-              aabb_data[i].maxX = bounds.max.x;
-              aabb_data[i].maxY = bounds.max.y;
-              aabb_data[i].maxZ = bounds.max.z;
+                const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+                const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+                const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+                const float4 pw = make_float4(hair->curve_radius[ka],
+                                              hair->curve_radius[k0],
+                                              hair->curve_radius[k1],
+                                              hair->curve_radius[kb]);
+
+                // Convert Catmull-Rom data to Bezier spline
+                static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+                static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+                static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+                static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+                index_data[i] = i * 4;
+                float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+                v[0] = make_float4(
+                    dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+                v[1] = make_float4(
+                    dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+                v[2] = make_float4(
+                    dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+                v[3] = make_float4(
+                    dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+              }
+              else
+#  endif
+              {
+                BoundBox bounds = BoundBox::empty;
+                curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds);
+
+                const size_t index = step * num_segments + i;
+                aabb_data[index].minX = bounds.min.x;
+                aabb_data[index].minY = bounds.min.y;
+                aabb_data[index].minZ = bounds.min.z;
+                aabb_data[index].maxX = bounds.max.x;
+                aabb_data[index].maxY = bounds.max.y;
+                aabb_data[index].maxZ = bounds.max.z;
+              }
            }
          }
        }

        // Upload AABB data to GPU
        aabb_data.copy_to_device();
+#  if OPTIX_ABI_VERSION >= 36
+        index_data.copy_to_device();
+        vertex_data.copy_to_device();
+#  endif

        vector<device_ptr> aabb_ptrs;
        aabb_ptrs.reserve(num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        vector<device_ptr> width_ptrs;
+        vector<device_ptr> vertex_ptrs;
+        width_ptrs.reserve(num_motion_steps);
+        vertex_ptrs.reserve(num_motion_steps);
+#  endif
        for (size_t step = 0; step < num_motion_steps; ++step) {
          aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+#  if OPTIX_ABI_VERSION >= 36
+          const device_ptr base_ptr = vertex_data.device_pointer +
+                                      step * num_vertices * sizeof(float4);
+          width_ptrs.push_back(base_ptr + 3 * sizeof(float));  // Offset by vertex size
+          vertex_ptrs.push_back(base_ptr);
+#  endif
        }

-        // Disable visibility test anyhit program, since it is already checked during intersection
-        // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
        OptixBuildInput build_input = {};
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-        build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
-        build_input.aabbArray.numPrimitives = num_segments;
-        build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
-        build_input.aabbArray.flags = &build_flags;
-        build_input.aabbArray.numSbtRecords = 1;
-        build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  if OPTIX_ABI_VERSION >= 36
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+          build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+          build_input.curveArray.numPrimitives = num_segments;
+          build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+          build_input.curveArray.numVertices = num_vertices;
+          build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+          build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+          build_input.curveArray.widthStrideInBytes = sizeof(float4);
+          build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+          build_input.curveArray.indexStrideInBytes = sizeof(int);
+          build_input.curveArray.flag = build_flags;
+          build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+        }
+        else
+#  endif
+        {
+          // Disable visibility test any-hit program, since it is already checked during
+          // intersection. Those trace calls that require anyhit can force it with a ray flag.
+          build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+#  if OPTIX_ABI_VERSION < 23
+          build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.aabbArray.numPrimitives = num_segments;
+          build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+          build_input.aabbArray.flags = &build_flags;
+          build_input.aabbArray.numSbtRecords = 1;
+          build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  else
+          build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.customPrimitiveArray.numPrimitives = num_segments;
+          build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+          build_input.customPrimitiveArray.flags = &build_flags;
+          build_input.customPrimitiveArray.numSbtRecords = 1;
+          build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  endif
+        }

        // Allocate memory for new BLAS and build it
        OptixTraversableHandle handle;
@ -1257,8 +1434,8 @@ class OptiXDevice : public CUDADevice {
          vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
        }

-        // No special build flags for triangle primitives
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
        OptixBuildInput build_input = {};
        build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
        build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
@ -1324,9 +1501,26 @@ class OptiXDevice : public CUDADevice {
      // Set user instance ID to object index
      instance.instanceId = ob->get_device_index();

-      // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
-      // See 'scene_intersect_volume' in bvh.h
-      instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1);
+      // Have to have at least one bit in the mask, or else instance would always be culled
+      instance.visibilityMask = 1;
+
+      if (ob->geometry->has_volume) {
+        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+        instance.visibilityMask |= 2;
+      }
+
+      if (ob->geometry->type == Geometry::HAIR) {
+        // Same applies to curves (so they can be skipped in local trace calls)
+        instance.visibilityMask |= 4;
+
+#  if OPTIX_ABI_VERSION >= 36
+        if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api &&
+            static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) {
+          // Select between motion blur and non-motion blur built-in intersection module
+          instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+        }
+#  endif
+      }

      // Insert motion traversable if object has motion
      if (motion_blur && ob->use_motion()) {
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@ -172,11 +172,11 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
             0.0f,
             ray->t,
             ray->time,
-             0xFF,
+             0xF,
             OPTIX_RAY_FLAG_NONE,
-             0,
-             0,
             0,  // SBT offset for PG_HITD
+             0,
+             0,
             p0,
             p1,
             p2,
@ -264,12 +264,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
             0.0f,
             ray->t,
             ray->time,
+             // Skip curves
+             0x3,
             // Need to always call into __anyhit__kernel_optix_local_hit
-             0xFF,
             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1,
+             2,  // SBT offset for PG_HITL
+             0,
             0,
-             0,  // SBT offset for PG_HITL
             p0,
             p1,
             p2,
@ -374,12 +375,12 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
             0.0f,
             ray->t,
             ray->time,
+             0xF,
             // Need to always call into __anyhit__kernel_optix_shadow_all_hit
-             0xFF,
             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2,
+             1,  // SBT offset for PG_HITS
+             0,
             0,
-             0,  // SBT offset for PG_HITS
             p0,
             p1,
             *num_hits,
@ -458,12 +459,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
             0.0f,
             ray->t,
             ray->time,
-             // Visibility mask set to only intersect objects with volumes
-             0x02,
+             // Skip everything but volumes
+             0x2,
             OPTIX_RAY_FLAG_NONE,
-             0,
-             0,
             0,  // SBT offset for PG_HITD
+             0,
+             0,
             p0,
             p1,
             p2,
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@ -734,7 +734,6 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
  }

  sd->u = isect->u;
-  sd->v = isect->v;

  P = P + D * t;

@ -750,6 +749,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,

    sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
    sd->Ng = -D;
+    sd->v = isect->v;

 #  if 0
    /* This approximates the position and geometric normal of a thick curve too,
@ -764,8 +764,11 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
     * This could be optimized by recording the normal in the intersection,
     * however for Optix this would go beyond the size of the payload. */
    const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
-    sd->Ng = normalize(P - P_inside);
-    sd->N = sd->Ng;
+    const float3 Ng = normalize(P - P_inside);
+
+    sd->N = Ng;
+    sd->Ng = Ng;
+    sd->v = 0.0f;
  }

 #  ifdef __DPDU__
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
@ -15,6 +15,7 @@
 * limitations under the License.
 */

+// clang-format off
 #include "kernel/kernel_compat_optix.h"
 #include "util/util_atomic.h"
 #include "kernel/kernel_types.h"
@ -23,6 +24,7 @@

 #include "kernel/kernel_path.h"
 #include "kernel/kernel_bake.h"
+// clang-format on

 template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
 {
@ -139,8 +141,8 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
  }
  else {
    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      // Record closest intersection only (do not terminate ray here, since there is no guarantee
-      // about distance ordering in anyhit)
+      // Record closest intersection only
+      // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
      return optixIgnoreIntersection();
    }

@ -153,15 +155,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
  isect->object = get_object_id();
  isect->type = kernel_tex_fetch(__prim_type, isect->prim);

-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    isect->u = 1.0f - barycentrics.y - barycentrics.x;
-    isect->v = barycentrics.x;
-  }
-  else {
-    isect->u = __uint_as_float(optixGetAttribute_0());
-    isect->v = __uint_as_float(optixGetAttribute_1());
-  }
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = 1.0f - barycentrics.y - barycentrics.x;
+  isect->v = barycentrics.x;

  // Record geometric normal
  const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
@ -198,10 +194,18 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
    isect->u = 1.0f - barycentrics.y - barycentrics.x;
    isect->v = barycentrics.x;
  }
+#  ifdef __HAIR__
  else {
-    isect->u = __uint_as_float(optixGetAttribute_0());
+    const float u = __uint_as_float(optixGetAttribute_0());
+    isect->u = u;
    isect->v = __uint_as_float(optixGetAttribute_1());
+
+    // Filter out curve endcaps
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
  }
+#  endif

 #  ifdef __TRANSPARENT_SHADOWS__
  // Detect if this surface has a shader with transparent shadows
@ -213,7 +217,6 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 #  ifdef __TRANSPARENT_SHADOWS__
  }

-  // TODO(pmours): Do we need REQUIRE_UNIQUE_ANYHIT for this to work?
  optixSetPayload_2(optixGetPayload_2() + 1);  // num_hits++

  // Continue tracing
@ -227,13 +230,25 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
  uint visibility = optixGetPayload_4();
 #ifdef __VISIBILITY_FLAG__
  const uint prim = optixGetPrimitiveIndex();
-  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0)
+  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
    return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    // Filter out curve endcaps
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
 #endif

  // Shadow ray early termination
-  if (visibility & PATH_RAY_SHADOW_OPAQUE)
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
    return optixTerminateRay();
+  }
 }

 extern "C" __global__ void __closesthit__kernel_optix_hit()
@ -250,7 +265,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
    optixSetPayload_2(__float_as_uint(barycentrics.x));
  }
  else {
-    optixSetPayload_1(optixGetAttribute_0());
+    optixSetPayload_1(optixGetAttribute_0());  // Same as 'optixGetCurveParameter()'
    optixSetPayload_2(optixGetAttribute_1());
  }
 }
@ -286,7 +301,6 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
                            __float_as_int(isect.u),   // Attribute_0
                            __float_as_int(isect.v));  // Attribute_1
  }
-
 }

 extern "C" __global__ void __intersection__curve_ribbon()
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@ -83,6 +83,7 @@ DebugFlags::OptiX::OptiX()
 void DebugFlags::OptiX::reset()
 {
  cuda_streams = 1;
+  curves_api = false;
 }

 DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@ -108,6 +108,9 @@ class DebugFlags {

    /* Number of CUDA streams to launch kernels concurrently from. */
    int cuda_streams;
+
+    /* Use OptiX curves API for hair instead of custom implementation. */
+    bool curves_api;
  };

  /* Descriptor of OpenCL feature-set to be used. */