Cycles: Speedup transparent shadows in split kernel

This commit enables record-all transparent shadows rays. Perfromance results: R9 290 render time (without synchronization), seconds Before After Change BMW 261.5 262.5 +0.4 % Classroom 869.6 867.3 -0.3 % Fishy Cat 657.4 639.8 -2.7 % Koro 1909.8 692.8 -63.7 % Pabellon Barcelona 1633.3 1238.0 -24.2 % Pabellon Barcelona(*) 1158.1 903.8 -22.0 % (*) without glossy connected to volume
2017-03-08 16:26:39 +01:00 · 2017-03-08 16:26:39 +01:00 · e8b5a5bf5b
parent 57e26627c4
commit e8b5a5bf5b
2 changed files with 16 additions and 6 deletions
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@ -152,7 +152,13 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 		int bounce = state->transparent_bounce;
 		Intersection *isect = hits;
 #    ifdef __VOLUME__
-		PathState ps = *state;
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
 #    endif
 		sort_intersections(hits, num_hits);
 		for(int hit = 0; hit < num_hits; hit++, isect++) {
@ -171,7 +177,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 			                                   shadow_sd,
 			                                   state,
 #ifdef __VOLUME__
-			                                   &ps,
+			                                   ps,
 #endif
 			                                   isect,
 			                                   ray,
@ -188,8 +194,8 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 		}
 #    ifdef __VOLUME__
 		/* Attenuation for last line segment towards light. */
-		if(ps.volume_stack[0].shader != SHADER_NONE) {
-			kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
 		}
 #    endif
 		*shadow = throughput;
@ -214,7 +220,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
                                               uint max_hits,
                                               float3 *shadow)
 {
-#    ifdef __KERNEL_CUDA__
+#    ifdef __SPLIT_KERNEL__
+	Intersection hits_[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = &hits_[0];
+#    elif defined(__KERNEL_CUDA__)
 	Intersection *hits = kg->hits_stack;
 #    else
 	Intersection hits_stack[SHADOW_STACK_MAX_HITS];
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -80,9 +80,9 @@ CCL_NAMESPACE_BEGIN
 #  define __CMJ__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
+#  define __SHADOW_RECORD_ALL__
 #  ifndef __SPLIT_KERNEL__
 #    define __VOLUME_DECOUPLED__
-#    define __SHADOW_RECORD_ALL__
 #    define __VOLUME_RECORD_ALL__
 #  endif
 #endif  /* __KERNEL_CPU__ */
@ -131,6 +131,7 @@ CCL_NAMESPACE_BEGIN
 #    define __SUBSURFACE__
 #    define __VOLUME__
 #    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
 #  endif  /* __KERNEL_OPENCL_AMD__ */

 #  ifdef __KERNEL_OPENCL_INTEL_CPU__