Cycles: avoid intermediate stack array for writing shadow intersections
Helps save one OptiX payload and is a bit more efficient. Differential Revision: https://developer.blender.org/D12909
This commit is contained in:
parent
943e73b07e
commit
d06828f0b8
Notes:
blender-bot
2023-02-14 07:36:17 +01:00
Referenced by issue #92363, OptiX kernels fail to load when Ambient Occlusion node is used
|
@ -34,6 +34,8 @@
|
|||
#include "kernel/bvh/bvh_types.h"
|
||||
#include "kernel/bvh/bvh_util.h"
|
||||
|
||||
#include "kernel/integrator/integrator_state_util.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_OPTIX__
|
||||
|
@ -361,15 +363,15 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
|
|||
|
||||
#ifdef __SHADOW_RECORD_ALL__
|
||||
ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
|
||||
IntegratorShadowState state,
|
||||
ccl_private const Ray *ray,
|
||||
ccl_private Intersection *isect,
|
||||
uint visibility,
|
||||
uint max_hits,
|
||||
ccl_private uint *num_hits)
|
||||
{
|
||||
# ifdef __KERNEL_OPTIX__
|
||||
uint p0 = pointer_pack_to_uint_0(isect);
|
||||
uint p1 = pointer_pack_to_uint_1(isect);
|
||||
uint p0 = state;
|
||||
uint p1 = 0; /* Unused */
|
||||
uint p2 = 0; /* Number of hits. */
|
||||
uint p3 = max_hits;
|
||||
uint p4 = visibility;
|
||||
|
@ -412,7 +414,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
|
|||
# ifdef __EMBREE__
|
||||
if (kernel_data.bvh.scene) {
|
||||
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
|
||||
ctx.isect_s = isect;
|
||||
Intersection *isect_array = (Intersection *)state->shadow_isect;
|
||||
ctx.isect_s = isect_array;
|
||||
ctx.max_hits = max_hits;
|
||||
IntersectContext rtc_ctx(&ctx);
|
||||
RTCRay rtc_ray;
|
||||
|
@ -428,21 +431,21 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
|
|||
if (kernel_data.bvh.have_motion) {
|
||||
# ifdef __HAIR__
|
||||
if (kernel_data.bvh.have_curves) {
|
||||
return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, visibility, max_hits, num_hits);
|
||||
return bvh_intersect_shadow_all_hair_motion(kg, ray, state, visibility, max_hits, num_hits);
|
||||
}
|
||||
# endif /* __HAIR__ */
|
||||
|
||||
return bvh_intersect_shadow_all_motion(kg, ray, isect, visibility, max_hits, num_hits);
|
||||
return bvh_intersect_shadow_all_motion(kg, ray, state, visibility, max_hits, num_hits);
|
||||
}
|
||||
# endif /* __OBJECT_MOTION__ */
|
||||
|
||||
# ifdef __HAIR__
|
||||
if (kernel_data.bvh.have_curves) {
|
||||
return bvh_intersect_shadow_all_hair(kg, ray, isect, visibility, max_hits, num_hits);
|
||||
return bvh_intersect_shadow_all_hair(kg, ray, state, visibility, max_hits, num_hits);
|
||||
}
|
||||
# endif /* __HAIR__ */
|
||||
|
||||
return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
|
||||
return bvh_intersect_shadow_all(kg, ray, state, visibility, max_hits, num_hits);
|
||||
# endif /* __KERNEL_OPTIX__ */
|
||||
}
|
||||
#endif /* __SHADOW_RECORD_ALL__ */
|
||||
|
|
|
@ -38,7 +38,7 @@ ccl_device_inline
|
|||
#endif
|
||||
bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
|
||||
ccl_private const Ray *ray,
|
||||
ccl_private Intersection *isect_array,
|
||||
IntegratorShadowState state,
|
||||
const uint visibility,
|
||||
const uint max_hits,
|
||||
ccl_private uint *num_hits)
|
||||
|
@ -227,12 +227,13 @@ ccl_device_inline
|
|||
* the largest distance to potentially replace when another hit
|
||||
* is found. */
|
||||
const int num_recorded_hits = min(max_hits, record_index);
|
||||
float max_recorded_t = isect_array[0].t;
|
||||
float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
|
||||
int max_recorded_hit = 0;
|
||||
|
||||
for (int i = 1; i < num_recorded_hits; i++) {
|
||||
if (isect_array[i].t > max_recorded_t) {
|
||||
max_recorded_t = isect_array[i].t;
|
||||
const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
|
||||
if (isect_t > max_recorded_t) {
|
||||
max_recorded_t = isect_t;
|
||||
max_recorded_hit = i;
|
||||
}
|
||||
}
|
||||
|
@ -246,7 +247,7 @@ ccl_device_inline
|
|||
t_max_current = t_max_world * t_world_to_instance;
|
||||
}
|
||||
|
||||
isect_array[record_index] = isect;
|
||||
integrator_state_write_shadow_isect(state, &isect, record_index);
|
||||
}
|
||||
|
||||
prim_addr++;
|
||||
|
@ -300,12 +301,12 @@ ccl_device_inline
|
|||
|
||||
ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals kg,
|
||||
ccl_private const Ray *ray,
|
||||
ccl_private Intersection *isect_array,
|
||||
IntegratorShadowState state,
|
||||
const uint visibility,
|
||||
const uint max_hits,
|
||||
ccl_private uint *num_hits)
|
||||
{
|
||||
return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
|
||||
return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, state, visibility, max_hits, num_hits);
|
||||
}
|
||||
|
||||
#undef BVH_FUNCTION_NAME
|
||||
|
|
|
@ -71,8 +71,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
|
||||
/* TODO: Move to another file? */
|
||||
#if defined(__KERNEL_CPU__)
|
||||
ccl_device int intersections_compare(const void *a, const void *b)
|
||||
{
|
||||
const Intersection *isect_a = (const Intersection *)a;
|
||||
|
@ -87,32 +86,6 @@ ccl_device int intersections_compare(const void *a, const void *b)
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SHADOW_RECORD_ALL__)
|
||||
ccl_device_inline void sort_intersections(ccl_private Intersection *hits, uint num_hits)
|
||||
{
|
||||
kernel_assert(num_hits > 0);
|
||||
|
||||
# ifdef __KERNEL_GPU__
|
||||
/* Use bubble sort which has more friendly memory pattern on GPU. */
|
||||
bool swapped;
|
||||
do {
|
||||
swapped = false;
|
||||
for (int j = 0; j < num_hits - 1; ++j) {
|
||||
if (hits[j].t > hits[j + 1].t) {
|
||||
struct Intersection tmp = hits[j];
|
||||
hits[j] = hits[j + 1];
|
||||
hits[j + 1] = tmp;
|
||||
swapped = true;
|
||||
}
|
||||
}
|
||||
--num_hits;
|
||||
} while (swapped);
|
||||
# else
|
||||
qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
|
||||
# endif
|
||||
}
|
||||
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
|
||||
|
||||
/* For subsurface scattering, only sorting a small amount of intersections
|
||||
* so bubble sort is fine for CPU and GPU. */
|
||||
ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *hits,
|
||||
|
|
|
@ -225,16 +225,17 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
|
|||
|
||||
optixSetPayload_2(num_hits + 1);
|
||||
|
||||
Intersection *const isect_array = get_payload_ptr_0<Intersection>();
|
||||
const IntegratorShadowState state = optixGetPayload_0();
|
||||
|
||||
if (record_index >= max_hits) {
|
||||
/* If maximum number of hits reached, find a hit to replace. */
|
||||
float max_recorded_t = isect_array[0].t;
|
||||
float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
|
||||
int max_recorded_hit = 0;
|
||||
|
||||
for (int i = 1; i < max_hits; i++) {
|
||||
if (isect_array[i].t > max_recorded_t) {
|
||||
max_recorded_t = isect_array[i].t;
|
||||
const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
|
||||
if (isect_t > max_recorded_t) {
|
||||
max_recorded_t = isect_t;
|
||||
max_recorded_hit = i;
|
||||
}
|
||||
}
|
||||
|
@ -248,13 +249,12 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
|
|||
record_index = max_recorded_hit;
|
||||
}
|
||||
|
||||
Intersection *const isect = isect_array + record_index;
|
||||
isect->u = u;
|
||||
isect->v = v;
|
||||
isect->t = optixGetRayTmax();
|
||||
isect->prim = prim;
|
||||
isect->object = object;
|
||||
isect->type = type;
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
|
||||
|
||||
optixIgnoreIntersection();
|
||||
# endif /* __TRANSPARENT_SHADOWS__ */
|
||||
|
|
|
@ -64,19 +64,61 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
|
|||
}
|
||||
|
||||
#ifdef __TRANSPARENT_SHADOWS__
|
||||
# if defined(__KERNEL_CPU__)
|
||||
ccl_device int shadow_intersections_compare(const void *a, const void *b)
|
||||
{
|
||||
const Intersection *isect_a = (const Intersection *)a;
|
||||
const Intersection *isect_b = (const Intersection *)b;
|
||||
|
||||
if (isect_a->t < isect_b->t)
|
||||
return -1;
|
||||
else if (isect_a->t > isect_b->t)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
# endif
|
||||
|
||||
ccl_device_inline void sort_shadow_intersections(IntegratorShadowState state, uint num_hits)
|
||||
{
|
||||
kernel_assert(num_hits > 0);
|
||||
|
||||
# ifdef __KERNEL_GPU__
|
||||
/* Use bubble sort which has more friendly memory pattern on GPU. */
|
||||
bool swapped;
|
||||
do {
|
||||
swapped = false;
|
||||
for (int j = 0; j < num_hits - 1; ++j) {
|
||||
if (INTEGRATOR_STATE_ARRAY(state, shadow_isect, j, t) >
|
||||
INTEGRATOR_STATE_ARRAY(state, shadow_isect, j + 1, t)) {
|
||||
struct Intersection tmp_j ccl_optional_struct_init;
|
||||
struct Intersection tmp_j_1 ccl_optional_struct_init;
|
||||
integrator_state_read_shadow_isect(state, &tmp_j, j);
|
||||
integrator_state_read_shadow_isect(state, &tmp_j_1, j + 1);
|
||||
integrator_state_write_shadow_isect(state, &tmp_j_1, j);
|
||||
integrator_state_write_shadow_isect(state, &tmp_j, j + 1);
|
||||
swapped = true;
|
||||
}
|
||||
}
|
||||
--num_hits;
|
||||
} while (swapped);
|
||||
# else
|
||||
Intersection *isect_array = (Intersection *)state->shadow_isect;
|
||||
qsort(isect_array, num_hits, sizeof(Intersection), shadow_intersections_compare);
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
|
||||
IntegratorShadowState state,
|
||||
ccl_private const Ray *ray,
|
||||
const uint visibility)
|
||||
{
|
||||
Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE];
|
||||
|
||||
/* Limit the number hits to the max transparent bounces allowed and the size that we
|
||||
* have available in the integrator state. */
|
||||
const uint max_transparent_hits = integrate_shadow_max_transparent_hits(kg, state);
|
||||
const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE);
|
||||
uint num_hits = 0;
|
||||
bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits);
|
||||
bool opaque_hit = scene_intersect_shadow_all(kg, state, ray, visibility, max_hits, &num_hits);
|
||||
|
||||
/* If number of hits exceed the transparent bounces limit, make opaque. */
|
||||
if (num_hits > max_transparent_hits) {
|
||||
|
@ -87,13 +129,7 @@ ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
|
|||
uint num_recorded_hits = min(num_hits, max_hits);
|
||||
|
||||
if (num_recorded_hits > 0) {
|
||||
sort_intersections(isect, num_recorded_hits);
|
||||
|
||||
/* Write intersection result into global integrator state memory.
|
||||
* More efficient may be to do this directly from the intersection kernel. */
|
||||
for (int hit = 0; hit < num_recorded_hits; hit++) {
|
||||
integrator_state_write_shadow_isect(state, &isect[hit], hit);
|
||||
}
|
||||
sort_shadow_intersections(state, num_recorded_hits);
|
||||
}
|
||||
|
||||
INTEGRATOR_STATE_WRITE(state, shadow_path, num_hits) = num_hits;
|
||||
|
|
Loading…
Reference in New Issue