Cycles: shadow function optimization for transparent shadows (CPU only).

Old algorithm:

Raytrace from one transparent surface to the next step by step. To minimize
overhead in cases where we don't need transparent shadows, we first trace a
regular shadow ray. We check if the hit primitive was potentially transparent,
and only in that case start marching. this gives extra ray cast for the cases
were we do want transparency.

New algorithm:

We trace a single ray. If it hits any opaque surface, or more than a given
number of transparent surfaces is hit, then we consider the geometry to be
entirely blocked. If not, all transparent surfaces will be recorded and we
will shade them one by one to determine how much light is blocked. This all
happens in one scene intersection function.

Recording all hits works well in some cases but may be slower in others. If
we have many semi-transparent hairs, one intersection may be faster because
you'd be reinteresecting the same hairs a lot with each step otherwise. If
however there is mostly binary transparency then we may be recording many
unnecessary intersections when one of the first surfaces blocks all light.

We found that this helps quite nicely in some scenes, on koro.blend this can
give a 50% reduction in render time, on the pabellon barcelona scene and a
forest scene with transparent leaves it was 30%. Some other files rendered
maybe 1% or 2% slower, but this seems a reasonable tradeoff.

Differential Revision: https://developer.blender.org/D473
This commit is contained in:
Brecht Van Lommel 2014-04-19 17:02:30 +02:00
parent f6abc96b6b
commit 9ab259f55b
Notes: blender-bot 2023-02-14 10:28:25 +01:00
Referenced by issue #40688, Blender crashes while rendering
7 changed files with 674 additions and 48 deletions

View File

@ -93,6 +93,36 @@ CCL_NAMESPACE_BEGIN
#include "geom_bvh_subsurface.h"
#endif
#if defined(__SHADOW_RECORD_ALL__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all
#define BVH_FUNCTION_FEATURES 0
#include "geom_bvh_shadow.h"
#endif
#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
#define BVH_FUNCTION_FEATURES BVH_INSTANCING
#include "geom_bvh_shadow.h"
#endif
#if defined(__SUBSURFACE__) && defined(__HAIR__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
#include "geom_bvh_shadow.h"
#endif
#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
#include "geom_bvh_shadow.h"
#endif
#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
#include "geom_bvh_shadow.h"
#endif
/* to work around titan bug when using arrays instead of textures */
#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
ccl_device_inline
@ -185,6 +215,52 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
}
#endif
/* to work around titan bug when using arrays instead of textures */
#ifdef __SHADOW_RECORD_ALL__
#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
ccl_device_inline
#else
ccl_device_noinline
#endif
uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
{
#ifdef __OBJECT_MOTION__
if(kernel_data.bvh.have_motion) {
#ifdef __HAIR__
if(kernel_data.bvh.have_curves)
return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
#endif /* __HAIR__ */
return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
}
#endif /* __OBJECT_MOTION__ */
#ifdef __HAIR__
if(kernel_data.bvh.have_curves)
return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
#endif /* __HAIR__ */
#ifdef __KERNEL_CPU__
#ifdef __INSTANCING__
if(kernel_data.bvh.have_instancing)
return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
#endif /* __INSTANCING__ */
return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
#else /* __KERNEL_CPU__ */
#ifdef __INSTANCING__
return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
#else
return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
#endif /* __INSTANCING__ */
#endif /* __KERNEL_CPU__ */
}
#endif
/* Ray offset to avoid self intersection.
*
* This function should be used to compute a modified ray start position for

View File

@ -0,0 +1,374 @@
/*
* Adapted from code Copyright 2009-2010 NVIDIA Corporation,
* and code copyright 2009-2012 Intel Corporation
*
* Modifications Copyright 2011-2013, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
*
* BVH_INSTANCING: object instancing
* BVH_HAIR: hair curve rendering
* BVH_MOTION: motion blur rendering
*
*/
#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
ccl_device bool BVH_FUNCTION_NAME
(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, uint *num_hits)
{
/* todo:
* - likely and unlikely for if() statements
* - test restrict attribute for pointers
*/
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
/* traversal variables in registers */
int stackPtr = 0;
int nodeAddr = kernel_data.bvh.root;
/* ray parameters in registers */
const float tmax = ray->t;
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
float isect_t = tmax;
#if FEATURE(BVH_MOTION)
Transform ob_tfm;
#endif
#if FEATURE(BVH_INSTANCING)
int num_hits_in_instance = 0;
#endif
*num_hits = 0;
isect_array->t = tmax;
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
__m128 Psplat[3], idirsplat[3];
shuffle_swap_t shufflexyz[3];
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
/* traversal loop */
do {
do
{
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
{
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
#if !defined(__KERNEL_SSE2__)
/* Intersect two child bounding boxes, non-SSE version */
float t = isect_t;
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
/* intersect ray against child nodes */
NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
#else
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
#endif
#else // __KERNEL_SSE2__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
/* calculate { c0min, c1min, -c0max, -c1max} */
__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
const __m128 tminmax = _mm_xor_ps(minmax, pn);
const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
#else
traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
#endif
#endif // __KERNEL_SSE2__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
union { __m128 m128; float v[4]; } uminmax;
uminmax.m128 = tminmax;
bool closestChild1 = uminmax.v[1] < uminmax.v[0];
#endif
if(closestChild1) {
int tmp = nodeAddr;
nodeAddr = nodeAddrChild1;
nodeAddrChild1 = tmp;
}
++stackPtr;
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
nodeAddr = nodeAddrChild1;
}
else if(!traverseChild0) {
/* neither child was intersected */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
}
}
/* if node is leaf, fetch triangle list */
if(nodeAddr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
int primAddr = __float_as_int(leaf.x);
#if FEATURE(BVH_INSTANCING)
if(primAddr >= 0) {
#endif
int primAddr2 = __float_as_int(leaf.y);
/* pop */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
/* primitive intersection */
while(primAddr < primAddr2) {
bool hit;
uint type = kernel_tex_fetch(__prim_type, primAddr);
/* todo: specialized intersect functions which don't fill in
* isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
* might give a few % performance improvement */
switch(type & PRIMITIVE_ALL) {
case PRIMITIVE_TRIANGLE: {
hit = triangle_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
break;
}
case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
break;
}
#if FEATURE(BVH_HAIR)
case PRIMITIVE_CURVE:
case PRIMITIVE_MOTION_CURVE: {
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
else
hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
break;
}
#endif
default: {
hit = false;
break;
}
}
/* shadow ray early termination */
if(hit) {
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
int shader = 0;
#ifdef __HAIR__
if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) {
#endif
float4 Ns = kernel_tex_fetch(__tri_normal, prim);
shader = __float_as_int(Ns.w);
#ifdef __HAIR__
}
else {
float4 str = kernel_tex_fetch(__curves, prim);
shader = __float_as_int(str.z);
}
#endif
int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
/* if no transparent shadows, all light is blocked */
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
return true;
}
/* if maximum number of hits reached, block all light */
else if(*num_hits == max_hits) {
return true;
}
/* move on to next entry in intersections array */
isect_array++;
(*num_hits)++;
#if FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
#endif
isect_array->t = isect_t;
}
primAddr++;
}
}
#if FEATURE(BVH_INSTANCING)
else {
/* instance push */
object = kernel_tex_fetch(__prim_object, -primAddr-1);
#if FEATURE(BVH_MOTION)
bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
#else
bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
#endif
num_hits_in_instance = 0;
#if defined(__KERNEL_SSE2__)
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
isect_array->t = isect_t;
tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
++stackPtr;
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
nodeAddr = kernel_tex_fetch(__object_node, object);
}
}
#endif
} while(nodeAddr != ENTRYPOINT_SENTINEL);
#if FEATURE(BVH_INSTANCING)
if(stackPtr >= 0) {
kernel_assert(object != OBJECT_NONE);
if(num_hits_in_instance) {
float t_fac;
#if FEATURE(BVH_MOTION)
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
#else
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
#endif
/* scale isect->t to adjust for instancing */
for(int i = 0; i < num_hits_in_instance; i++)
(isect_array-i-1)->t *= t_fac;
}
else {
float ignore_t = FLT_MAX;
#if FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
#else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
#endif
}
#if defined(__KERNEL_SSE2__)
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
isect_t = tmax;
isect_array->t = isect_t;
tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
object = OBJECT_NONE;
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
#endif
} while(nodeAddr != ENTRYPOINT_SENTINEL);
return false;
}
#undef FEATURE
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES

View File

@ -48,12 +48,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
int nodeAddr = kernel_data.bvh.root;
/* ray parameters in registers */
const float tmax = ray->t;
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
float isect_t = tmax;
float isect_t = ray->t;
const uint visibility = PATH_RAY_ALL_VISIBILITY;
uint num_hits = 0;
@ -236,9 +235,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
object = subsurface_object;
#if FEATURE(BVH_MOTION)
bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax);
bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
#else
bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t, tmax);
bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
#endif
#if defined(__KERNEL_SSE2__)
@ -272,9 +271,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
/* instance pop */
#if FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax);
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
#else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t, tmax);
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
#endif
#if defined(__KERNEL_SSE2__)

View File

@ -53,7 +53,6 @@ ccl_device bool BVH_FUNCTION_NAME
int nodeAddr = kernel_data.bvh.root;
/* ray parameters in registers */
const float tmax = ray->t;
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
@ -63,7 +62,7 @@ ccl_device bool BVH_FUNCTION_NAME
Transform ob_tfm;
#endif
isect->t = tmax;
isect->t = ray->t;
isect->object = OBJECT_NONE;
isect->prim = PRIM_NONE;
isect->u = 0.0f;
@ -264,18 +263,10 @@ ccl_device bool BVH_FUNCTION_NAME
#if FEATURE(BVH_HAIR)
case PRIMITIVE_CURVE:
case PRIMITIVE_MOTION_CURVE: {
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
else
hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
#else
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type);
else
hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type);
#endif
break;
}
#endif
@ -307,9 +298,9 @@ ccl_device bool BVH_FUNCTION_NAME
object = kernel_tex_fetch(__prim_object, -primAddr-1);
#if FEATURE(BVH_MOTION)
bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax);
bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
#else
bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, tmax);
bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
#endif
#if defined(__KERNEL_SSE2__)
@ -337,9 +328,9 @@ ccl_device bool BVH_FUNCTION_NAME
/* instance pop */
#if FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax);
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
#else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t, tmax);
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
#endif
#if defined(__KERNEL_SSE2__)

View File

@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
/* Transform ray into object space to enter static object in BVH */
ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax)
ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@ -393,7 +393,7 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
/* Transorm ray to exit static object in BVH */
ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax)
ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
{
if(*t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@ -405,10 +405,23 @@ ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray
*idir = bvh_inverse_direction(*dir);
}
/* Same as above, but returns scale factor to apply to multiple intersection distances */
ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t_fac)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
*t_fac = len(transform_direction(&tfm, 1.0f/(*idir)));
*P = ray->P;
*dir = bvh_clamp_direction(ray->D);
*idir = bvh_inverse_direction(*dir);
}
#ifdef __OBJECT_MOTION__
/* Transform ray into object space to enter motion blurred object in BVH */
ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax)
ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
{
Transform itfm;
*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
@ -425,7 +438,7 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c
/* Transorm ray to exit motion blurred object in BVH */
ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax)
ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
{
if(*t != FLT_MAX)
*t *= len(transform_direction(tfm, 1.0f/(*idir)));
@ -434,6 +447,18 @@ ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, co
*dir = bvh_clamp_direction(ray->D);
*idir = bvh_inverse_direction(*dir);
}
/* Same as above, but returns scale factor to apply to multiple intersection distances */
ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t_fac, Transform *tfm)
{
*t_fac = len(transform_direction(tfm, 1.0f/(*idir)));
*P = ray->P;
*dir = bvh_clamp_direction(ray->D);
*idir = bvh_inverse_direction(*dir);
}
#endif
CCL_NAMESPACE_END

View File

@ -16,27 +16,194 @@
CCL_NAMESPACE_BEGIN
#ifdef __SHADOW_RECORD_ALL__
/* Shadow function to compute how much light is blocked, CPU variation.
*
* We trace a single ray. If it hits any opaque surface, or more than a given
* number of transparent surfaces is hit, then we consider the geometry to be
* entirely blocked. If not, all transparent surfaces will be recorded and we
* will shade them one by one to determine how much light is blocked. This all
* happens in one scene intersection function.
*
* Recording all hits works well in some cases but may be slower in others. If
* we have many semi-transparent hairs, one intersection may be faster because
* you'd be reinteresecting the same hairs a lot with each step otherwise. If
* however there is mostly binary transparency then we may be recording many
* unnecessary intersections when one of the first surfaces blocks all light.
*
* From tests in real scenes it seems the performance loss is either minimal,
* or there is a performance increase anyway due to avoiding the need to send
* two rays with transparent shadows.
*
* This is CPU only because of qsort, and malloc or high stack space usage to
* record all these intersections. */
ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b)
{
const Intersection *isect_a = (const Intersection*)a;
const Intersection *isect_b = (const Intersection*)b;
if(isect_a->t < isect_b->t)
return -1;
else if(isect_a->t > isect_b->t)
return 1;
else
return 0;
}
#define STACK_MAX_HITS 64
ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
{
*shadow = make_float3(1.0f, 1.0f, 1.0f);
if(ray->t == 0.0f)
return false;
bool blocked;
if(kernel_data.integrator.transparent_shadows) {
/* intersect to find an opaque surface, or record all transparent surface hits */
Intersection hits_stack[STACK_MAX_HITS];
Intersection *hits;
uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
/* prefer to use stack but use dynamic allocation if too deep max hits
* we need max_hits + 1 storage space due to the logic in
* scene_intersect_shadow_all which will first store and then check if
* the limit is exceeded */
if(max_hits + 1 <= STACK_MAX_HITS)
hits = hits_stack;
else
hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
uint num_hits;
blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
/* if no opaque surface found but we did find transparent hits, shade them */
if(!blocked && num_hits > 0) {
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float3 Pend = ray->P + ray->D*ray->t;
float last_t = 0.0f;
int bounce = state->transparent_bounce;
Intersection *isect = hits;
#ifdef __VOLUME__
PathState ps = *state;
#endif
qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare);
for(int hit = 0; hit < num_hits; hit++, isect++) {
/* adjust intersection distance for moving ray forward */
float new_t = isect->t;
isect->t -= last_t;
/* skip hit if we did not move forward, step by step raytracing
* would have skipped it as well then */
if(last_t == new_t)
continue;
last_t = new_t;
#ifdef __VOLUME__
/* attenuation between last surface and next surface */
if(ps.volume_stack[0].shader != SHADER_NONE) {
Ray segment_ray = *ray;
segment_ray.t = isect->t;
kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
}
#endif
/* setup shader data at surface */
ShaderData sd;
shader_setup_from_ray(kg, &sd, isect, ray, state->bounce+1, bounce);
/* attenuation from transparent surface */
if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
throughput *= shader_bsdf_transparency(kg, &sd);
}
/* stop if all light is blocked */
if(is_zero(throughput)) {
/* free dynamic storage */
if(hits != hits_stack)
free(hits);
return true;
}
/* move ray forward */
ray->P = sd.P;
if(ray->t != FLT_MAX)
ray->D = normalize_len(Pend - ray->P, &ray->t);
#ifdef __VOLUME__
/* exit/enter volume */
kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
#endif
bounce++;
}
/* free dynamic storage */
if(hits != hits_stack)
free(hits);
#ifdef __VOLUME__
/* attenuation for last line segment towards light */
if(ps.volume_stack[0].shader != SHADER_NONE)
kernel_volume_shadow(kg, &ps, ray, &throughput);
#endif
*shadow *= throughput;
}
}
else {
Intersection isect;
#ifdef __HAIR__
blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
#else
blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
#endif
}
#ifdef __VOLUME__
if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
/* apply attenuation from current volume shader */
kernel_volume_shadow(kg, state, ray, shadow);
}
#endif
return blocked;
}
#else
/* Shadow function to compute how much light is blocked, GPU variation.
*
* Here we raytrace from one transparent surface to the next step by step.
* To minimize overhead in cases where we don't need transparent shadows, we
* first trace a regular shadow ray. We check if the hit primitive was
* potentially transparent, and only in that case start marching. this gives
* one extra ray cast for the cases were we do want transparency. */
ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
{
*shadow = make_float3(1.0f, 1.0f, 1.0f);
if(ray->t == 0.0f)
return false;
Intersection isect;
#ifdef __HAIR__
bool result = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
#else
bool result = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
#endif
#ifdef __TRANSPARENT_SHADOWS__
if(result && kernel_data.integrator.transparent_shadows) {
/* transparent shadows work in such a way to try to minimize overhead
* in cases where we don't need them. after a regular shadow ray is
* cast we check if the hit primitive was potentially transparent, and
* only in that case start marching. this gives on extra ray cast for
* the cases were we do want transparency. */
if(blocked && kernel_data.integrator.transparent_shadows) {
if(shader_transparent_shadow(kg, &isect)) {
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float3 Pend = ray->P + ray->D*ray->t;
@ -46,21 +213,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
#endif
for(;;) {
if(bounce >= kernel_data.integrator.transparent_max_bounce) {
if(bounce >= kernel_data.integrator.transparent_max_bounce)
return true;
}
else if(bounce >= kernel_data.integrator.transparent_min_bounce) {
/* todo: get random number somewhere for probabilistic terminate */
#if 0
float probability = average(throughput);
float terminate = 0.0f;
if(terminate >= probability)
return true;
throughput /= probability;
#endif
}
#ifdef __HAIR__
if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) {
@ -75,6 +229,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
#endif
*shadow *= throughput;
return false;
}
@ -100,6 +255,9 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
throughput *= shader_bsdf_transparency(kg, &sd);
}
if(is_zero(throughput))
return true;
/* move ray forward */
ray->P = ray_offset(sd.P, -sd.Ng);
if(ray->t != FLT_MAX)
@ -115,15 +273,17 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
}
}
#ifdef __VOLUME__
else if(!result && state->volume_stack[0].shader != SHADER_NONE) {
else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
/* apply attenuation from current volume shader */
kernel_volume_shadow(kg, state, ray, shadow);
}
#endif
#endif
return result;
return blocked;
}
#endif
CCL_NAMESPACE_END

View File

@ -64,6 +64,7 @@ CCL_NAMESPACE_BEGIN
#define __SUBSURFACE__
#define __CMJ__
#define __VOLUME__
#define __SHADOW_RECORD_ALL__
#endif
#ifdef __KERNEL_CUDA__