Cycles: shadow function optimization for transparent shadows (CPU only).

Old algorithm: Raytrace from one transparent surface to the next step by step. To minimize overhead in cases where we don't need transparent shadows, we first trace a regular shadow ray. We check if the hit primitive was potentially transparent, and only in that case start marching. This gives one extra ray cast for the cases where we do want transparency. New algorithm: We trace a single ray. If it hits any opaque surface, or more than a given number of transparent surfaces are hit, then we consider the geometry to be entirely blocked. If not, all transparent surfaces will be recorded and we will shade them one by one to determine how much light is blocked. This all happens in one scene intersection function. Recording all hits works well in some cases but may be slower in others. If we have many semi-transparent hairs, one intersection may be faster because you'd be reintersecting the same hairs a lot with each step otherwise. If however there is mostly binary transparency then we may be recording many unnecessary intersections when one of the first surfaces blocks all light. We found that this helps quite nicely in some scenes: on koro.blend this can give a 50% reduction in render time, and on the pabellon barcelona scene and a forest scene with transparent leaves it was 30%. Some other files rendered maybe 1% or 2% slower, but this seems a reasonable tradeoff. Differential Revision: https://developer.blender.org/D473
Referenced by issue #40688, Blender crashes while rendering
2014-04-19 17:02:30 +02:00 · 2014-04-19 17:02:30 +02:00 · 9ab259f55b · 2023-02-14 10:28:25 +01:00
parent f6abc96b6b
commit 9ab259f55b
7 changed files with 674 additions and 48 deletions
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@ -93,6 +93,36 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_subsurface.h"
 #endif

+/* Instantiate the shadow BVH traversal template once per feature combination.
+ * These variants are called from scene_intersect_shadow_all, so all of them
+ * are guarded by __SHADOW_RECORD_ALL__ rather than by an unrelated feature. */
+#if defined(__SHADOW_RECORD_ALL__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+#include "geom_bvh_shadow.h"
+#endif
+
+
 /* to work around titan bug when using arrays instead of textures */
 #if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
 ccl_device_inline
@ -185,6 +215,52 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
 }
 #endif

+#ifdef __SHADOW_RECORD_ALL__
+/* Shadow ray intersection that records transparent hits.
+ *
+ * Forwards to the BVH traversal variant compiled for the features the scene
+ * uses: motion takes priority, then curves, then instancing. Arguments and
+ * return value are passed through to and from the selected variant unchanged. */
+
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+{
+#ifdef __OBJECT_MOTION__
+	/* scenes with object motion, with or without curves */
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves) {
+			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+		}
+#endif /* __HAIR__ */
+
+		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	/* static scenes with curves */
+	if(kernel_data.bvh.have_curves) {
+		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+	}
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+	/* on the CPU the instancing variant is only used when the scene needs it */
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing) {
+		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	}
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+#else /* __KERNEL_CPU__ */
+
+	/* other devices choose the variant at compile time only */
+#ifdef __INSTANCING__
+	return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+#else
+	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+#endif /* __SHADOW_RECORD_ALL__ */
+
+
 /* Ray offset to avoid self intersection.
 *
 * This function should be used to compute a modified ray start position for
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@ -0,0 +1,374 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+/* Trace a shadow ray through the BVH, recording hits on surfaces whose shader
+ * has transparent shadows.
+ *
+ * isect_array: caller provided storage for the recorded hits. A hit is written
+ *              into the current entry before the limit is checked, so the
+ *              array needs room for max_hits + 1 entries.
+ * max_hits:    maximum number of transparent hits to record.
+ * num_hits:    receives the number of recorded hits.
+ *
+ * Returns true when all light is considered blocked: a surface without
+ * transparent shadows was hit, or a transparent hit was found while max_hits
+ * were already recorded. Returns false otherwise, with the hits stored in
+ * isect_array in the order they were found during traversal. */
+ccl_device bool BVH_FUNCTION_NAME
+(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, uint *num_hits)
+{
+	/* todo:
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#if FEATURE(BVH_INSTANCING)
+	/* hits recorded since the last instance push, their t is still in the
+	 * space of the instance and gets rescaled on instance pop */
+	int num_hits_in_instance = 0;
+#endif
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
+	__m128 Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = _mm_set_ps1(P.x);
+	Psplat[1] = _mm_set_ps1(P.y);
+	Psplat[2] = _mm_set_ps1(P.z);
+
+	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	/* traversal loop */
+	do {
+		do
+		{
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
+			{
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect_t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
+				const __m128 tminmax = _mm_xor_ps(minmax, pn);
+				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+#else
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					union { __m128 m128; float v[4]; } uminmax;
+					uminmax.m128 = tminmax;
+					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					while(primAddr < primAddr2) {
+						bool hit;
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* shadow ray early termination */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) {
+#endif
+								float4 Ns = kernel_tex_fetch(__tri_normal, prim);
+								shader = __float_as_int(Ns.w);
+#ifdef __HAIR__
+							}
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							/* the new entry starts out with the full ray length again,
+							 * hits are not limited to be closer than earlier ones */
+							isect_array->t = isect_t;
+						}
+
+						primAddr++;
+					}
+				}
+#if FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+					num_hits_in_instance = 0;
+
+					/* keep the current entry in sync with the ray length that the
+					 * push just updated; this is needed in every build, not only
+					 * when SSE2 is enabled */
+					isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+					Psplat[0] = _mm_set_ps1(P.x);
+					Psplat[1] = _mm_set_ps1(P.y);
+					Psplat[2] = _mm_set_ps1(P.z);
+
+					tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+					++stackPtr;
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+				}
+			}
+#endif
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#if FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+
+				/* scale isect->t to adjust for instancing */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#if FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+			}
+
+			/* the ray was restored by the pop, so restore the full ray length
+			 * for traversal and for the current entry as well; this is needed
+			 * in every build, not only when SSE2 is enabled */
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = _mm_set_ps1(P.x);
+			Psplat[1] = _mm_set_ps1(P.y);
+			Psplat[2] = _mm_set_ps1(P.z);
+
+			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
+
+#undef FEATURE
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@ -48,12 +48,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	int nodeAddr = kernel_data.bvh.root;

 	/* ray parameters in registers */
-	const float tmax = ray->t;
 	float3 P = ray->P;
 	float3 dir = bvh_clamp_direction(ray->D);
 	float3 idir = bvh_inverse_direction(dir);
 	int object = OBJECT_NONE;
-	float isect_t = tmax;
+	float isect_t = ray->t;

 	const uint visibility = PATH_RAY_ALL_VISIBILITY;
 	uint num_hits = 0;
@ -236,9 +235,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 						object = subsurface_object;

 #if FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax);
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t, tmax);
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif

 #if defined(__KERNEL_SSE2__)
@ -272,9 +271,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio

 			/* instance pop */
 #if FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t, tmax);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif

 #if defined(__KERNEL_SSE2__)
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@ -53,7 +53,6 @@ ccl_device bool BVH_FUNCTION_NAME
 	int nodeAddr = kernel_data.bvh.root;

 	/* ray parameters in registers */
-	const float tmax = ray->t;
 	float3 P = ray->P;
 	float3 dir = bvh_clamp_direction(ray->D);
 	float3 idir = bvh_inverse_direction(dir);
@ -63,7 +62,7 @@ ccl_device bool BVH_FUNCTION_NAME
 	Transform ob_tfm;
 #endif

-	isect->t = tmax;
+	isect->t = ray->t;
 	isect->object = OBJECT_NONE;
 	isect->prim = PRIM_NONE;
 	isect->u = 0.0f;
@ -264,18 +263,10 @@ ccl_device bool BVH_FUNCTION_NAME
 #if FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
 									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
 								else
 									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-#else
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
-									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type);
-								else
-									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type);
-#endif
-
 								break;
 							}
 #endif
@ -307,9 +298,9 @@ ccl_device bool BVH_FUNCTION_NAME
 					object = kernel_tex_fetch(__prim_object, -primAddr-1);

 #if FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax);
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, tmax);
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif

 #if defined(__KERNEL_SSE2__)
@ -337,9 +328,9 @@ ccl_device bool BVH_FUNCTION_NAME

 			/* instance pop */
 #if FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t, tmax);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif

 #if defined(__KERNEL_SSE2__)
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)

 /* Transform ray into object space to enter static object in BVH */

-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);

@ -393,7 +393,7 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra

 /* Transform ray to exit static object in BVH */

-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
 {
 	if(*t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@ -405,10 +405,23 @@ ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray
 	*idir = bvh_inverse_direction(*dir);
 }

+/* Exit a static object in the BVH like bvh_instance_pop, but instead of
+ * rescaling a single distance, output the scale factor in t_fac so the caller
+ * can apply it to multiple intersection distances. */
+
+ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t_fac)
+{
+	/* the factor is derived from the direction the ray had inside the object,
+	 * so read idir before it is overwritten below */
+	const float3 dir_in_object = 1.0f/(*idir);
+	Transform ob_tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+	*t_fac = len(transform_direction(&ob_tfm, dir_in_object));
+
+	/* reset to the original ray */
+	const float3 ray_dir = bvh_clamp_direction(ray->D);
+	*P = ray->P;
+	*dir = ray_dir;
+	*idir = bvh_inverse_direction(ray_dir);
+}
+
+
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */

-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax)
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
 {
 	Transform itfm;
 	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
@ -425,7 +438,7 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c

 /* Transform ray to exit motion blurred object in BVH */

-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax)
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
 {
 	if(*t != FLT_MAX)
 		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
@ -434,6 +447,18 @@ ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, co
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
 }
+
+/* Exit a motion blurred object in the BVH like bvh_instance_motion_pop, but
+ * instead of rescaling a single distance, output the scale factor in t_fac so
+ * the caller can apply it to multiple intersection distances. */
+
+ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t_fac, Transform *tfm)
+{
+	/* the factor is derived from the direction the ray had inside the object,
+	 * so read idir before it is overwritten below */
+	const float3 dir_in_object = 1.0f/(*idir);
+	*t_fac = len(transform_direction(tfm, dir_in_object));
+
+	/* reset to the original ray */
+	const float3 ray_dir = bvh_clamp_direction(ray->D);
+	*P = ray->P;
+	*dir = ray_dir;
+	*idir = bvh_inverse_direction(ray_dir);
+}
+
 #endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@ -16,27 +16,194 @@

 CCL_NAMESPACE_BEGIN

+#ifdef __SHADOW_RECORD_ALL__
+
+/* Shadow function to compute how much light is blocked, CPU variation.
+ *
+ * We trace a single ray. If it hits any opaque surface, or more than a given
+ * number of transparent surfaces is hit, then we consider the geometry to be
+ * entirely blocked. If not, all transparent surfaces will be recorded and we
+ * will shade them one by one to determine how much light is blocked. This all
+ * happens in one scene intersection function.
+ *
+ * Recording all hits works well in some cases but may be slower in others. If
+ * we have many semi-transparent hairs, one intersection may be faster because
+ * you'd be reintersecting the same hairs a lot with each step otherwise. If
+ * however there is mostly binary transparency then we may be recording many
+ * unnecessary intersections when one of the first surfaces blocks all light.
+ *
+ * From tests in real scenes it seems the performance loss is either minimal,
+ * or there is a performance increase anyway due to avoiding the need to send
+ * two rays with transparent shadows.
+ *
+ * This is CPU only because of qsort, and malloc or high stack space usage to
+ * record all these intersections. */
+
+/* qsort() comparison callback ordering Intersection entries by increasing
+ * distance t. Yields -1, 1 or 0; entries that compare neither smaller nor
+ * greater are treated as equal. */
+ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b)
+{
+	const float t_a = ((const Intersection*)a)->t;
+	const float t_b = ((const Intersection*)b)->t;
+
+	/* each comparison evaluates to 0 or 1 */
+	return (t_a > t_b) - (t_a < t_b);
+}
+
+#define STACK_MAX_HITS 64
+
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);

+	if(ray->t == 0.0f)
+		return false;
+	
+	bool blocked;
+
+	if(kernel_data.integrator.transparent_shadows) {
+		/* intersect to find an opaque surface, or record all transparent surface hits */
+		Intersection hits_stack[STACK_MAX_HITS];
+		Intersection *hits;
+		uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
+
+		/* prefer to use stack but use dynamic allocation if too deep max hits
+		 * we need max_hits + 1 storage space due to the logic in
+		 * scene_intersect_shadow_all which will first store and then check if
+		 * the limit is exceeded */
+		if(max_hits + 1 <= STACK_MAX_HITS)
+			hits = hits_stack;
+		else
+			hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
+
+		uint num_hits;
+		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
+
+		/* if no opaque surface found but we did find transparent hits, shade them */
+		if(!blocked && num_hits > 0) {
+			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+			float3 Pend = ray->P + ray->D*ray->t;
+			float last_t = 0.0f;
+			int bounce = state->transparent_bounce;
+			Intersection *isect = hits;
+#ifdef __VOLUME__
+			PathState ps = *state;
+#endif
+
+			qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare);
+
+			for(int hit = 0; hit < num_hits; hit++, isect++) {
+				/* adjust intersection distance for moving ray forward */
+				float new_t = isect->t;
+				isect->t -= last_t;
+
+				/* skip hit if we did not move forward, step by step raytracing
+				 * would have skipped it as well then */
+				if(last_t == new_t)
+					continue;
+
+				last_t = new_t;
+
+#ifdef __VOLUME__
+				/* attenuation between last surface and next surface */
+				if(ps.volume_stack[0].shader != SHADER_NONE) {
+					Ray segment_ray = *ray;
+					segment_ray.t = isect->t;
+					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
+				}
+#endif
+
+				/* setup shader data at surface */
+				ShaderData sd;
+				shader_setup_from_ray(kg, &sd, isect, ray, state->bounce+1, bounce);
+
+				/* attenuation from transparent surface */
+				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+					shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+					throughput *= shader_bsdf_transparency(kg, &sd);
+				}
+
+				/* stop if all light is blocked */
+				if(is_zero(throughput)) {
+					/* free dynamic storage */
+					if(hits != hits_stack)
+						free(hits);
+					return true;
+				}
+
+				/* move ray forward */
+				ray->P = sd.P;
+				if(ray->t != FLT_MAX)
+					ray->D = normalize_len(Pend - ray->P, &ray->t);
+
+#ifdef __VOLUME__
+				/* exit/enter volume */
+				kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+#endif
+
+				bounce++;
+			}
+
+			/* free dynamic storage */
+			if(hits != hits_stack)
+				free(hits);
+
+#ifdef __VOLUME__
+			/* attenuation for last line segment towards light */
+			if(ps.volume_stack[0].shader != SHADER_NONE)
+				kernel_volume_shadow(kg, &ps, ray, &throughput);
+#endif
+
+			*shadow *= throughput;
+		}
+	}
+	else {
+		Intersection isect;
+#ifdef __HAIR__
+		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#else
+		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
+#endif
+	}
+
+#ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* apply attenuation from current volume shader */
+		kernel_volume_shadow(kg, state, ray, shadow);
+	}
+#endif
+
+	return blocked;
+}
+
+#else
+
+/* Shadow function to compute how much light is blocked, GPU variation.
+ *
+ * Here we raytrace from one transparent surface to the next step by step.
+ * To minimize overhead in cases where we don't need transparent shadows, we
+ * first trace a regular shadow ray. We check if the hit primitive was
+ * potentially transparent, and only in that case start marching. This gives
+ * one extra ray cast for the cases where we do want transparency. */
+
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+{
+*shadow = make_float3(1.0f, 1.0f, 1.0f);
+
 	if(ray->t == 0.0f)
 		return false;

 	Intersection isect;
 #ifdef __HAIR__
-	bool result = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
 #else
-	bool result = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
 #endif

 #ifdef __TRANSPARENT_SHADOWS__
-	if(result && kernel_data.integrator.transparent_shadows) {
-		/* transparent shadows work in such a way to try to minimize overhead
-		 * in cases where we don't need them. after a regular shadow ray is
-		 * cast we check if the hit primitive was potentially transparent, and
-		 * only in that case start marching. this gives on extra ray cast for
-		 * the cases were we do want transparency. */
+	if(blocked && kernel_data.integrator.transparent_shadows) {
 		if(shader_transparent_shadow(kg, &isect)) {
 			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 			float3 Pend = ray->P + ray->D*ray->t;
@ -46,21 +213,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 #endif

 			for(;;) {
-				if(bounce >= kernel_data.integrator.transparent_max_bounce) {
+				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;
-				}
-				else if(bounce >= kernel_data.integrator.transparent_min_bounce) {
-					/* todo: get random number somewhere for probabilistic terminate */
-#if 0
-					float probability = average(throughput);
-					float terminate = 0.0f;
-
-					if(terminate >= probability)
-						return true;
-
-					throughput /= probability;
-#endif
-				}

 #ifdef __HAIR__
 				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) {
@ -75,6 +229,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 #endif

 					*shadow *= throughput;
+
 					return false;
 				}

@ -100,6 +255,9 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 					throughput *= shader_bsdf_transparency(kg, &sd);
 				}

+				if(is_zero(throughput))
+					return true;
+
 				/* move ray forward */
 				ray->P = ray_offset(sd.P, -sd.Ng);
 				if(ray->t != FLT_MAX)
@ -115,15 +273,17 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 		}
 	}
 #ifdef __VOLUME__
-	else if(!result && state->volume_stack[0].shader != SHADER_NONE) {
+	else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
 		/* apply attenuation from current volume shader */
 		kernel_volume_shadow(kg, state, ray, shadow);
 	}
 #endif
 #endif

-	return result;
+	return blocked;
 }

+#endif
+
 CCL_NAMESPACE_END

--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -64,6 +64,7 @@ CCL_NAMESPACE_BEGIN
 #define __SUBSURFACE__
 #define __CMJ__
 #define __VOLUME__
+#define __SHADOW_RECORD_ALL__
 #endif

 #ifdef __KERNEL_CUDA__