Fix Cycles CUDA performance on CUDA 8.0.

Mostly this is making inlining match CUDA 7.5 in a few performance critical places. The end result is that performance is now better than before, possibly due to less register spilling or other CUDA 8.0 compiler improvements. On benchmarks scenes, there are 3% to 35% render time reductions. Stack memory usage is reduced a little too. Reviewed By: sergey Differential Revision: https://developer.blender.org/D2269
2016-10-02 14:48:39 +02:00 · 2016-10-02 14:48:39 +02:00 · fd0dea585c
parent 4d14bd10c0
commit fd0dea585c
20 changed files with 82 additions and 94 deletions
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@ -157,8 +157,9 @@ CCL_NAMESPACE_BEGIN
 #undef BVH_NAME_EVAL
 #undef BVH_FUNCTION_FULL_NAME

+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
-                                          const Ray *ray,
+                                          const Ray ray,
                                          const uint visibility,
                                          Intersection *isect,
                                          uint *lcg_state,
@ -169,32 +170,32 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 	if(kernel_data.bvh.have_motion) {
 #  ifdef __HAIR__
 		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+			return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
 #  endif /* __HAIR__ */

-		return bvh_intersect_motion(kg, ray, isect, visibility);
+		return bvh_intersect_motion(kg, &ray, isect, visibility);
 	}
 #endif /* __OBJECT_MOTION__ */

 #ifdef __HAIR__
 	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+		return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
 #endif /* __HAIR__ */

 #ifdef __KERNEL_CPU__

 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_instancing(kg, ray, isect, visibility);
+		return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  endif /* __INSTANCING__ */

-	return bvh_intersect(kg, ray, isect, visibility);
+	return bvh_intersect(kg, &ray, isect, visibility);
 #else /* __KERNEL_CPU__ */

 #  ifdef __INSTANCING__
-	return bvh_intersect_instancing(kg, ray, isect, visibility);
+	return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  else
-	return bvh_intersect(kg, ray, isect, visibility);
+	return bvh_intersect(kg, &ray, isect, visibility);
 #  endif /* __INSTANCING__ */

 #endif /* __KERNEL_CPU__ */
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@ -16,7 +16,7 @@

 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
-ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
                                                           int node_addr,
                                                           int child)
 {
@ -30,7 +30,7 @@ ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
 }

 #if !defined(__KERNEL_SSE2__)
-ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                 const float3 P,
                                                 const float3 idir,
                                                 const float t,
@ -77,7 +77,7 @@ ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
 #endif
 }

-ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
                                                        const float3 P,
                                                        const float3 idir,
                                                        const float t,
@ -139,7 +139,7 @@ ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 #endif
 }

-ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
@ -166,7 +166,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child(
 	return tnear <= tfar;
 }

-ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
@ -202,7 +202,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
 	}
 }

-ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const float3 idir,
@ -232,7 +232,7 @@ ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 	return mask;
 }

-ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const float3 idir,
@ -264,7 +264,7 @@ ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
 	return mask;
 }

-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3 P,
                                         const float3 dir,
                                         const float3 idir,
@ -295,7 +295,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
 	}
 }

-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3 P,
                                                const float3 dir,
                                                const float3 idir,
@ -333,7 +333,7 @@ ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
 }
 #else  /* !defined(__KERNEL_SSE2__) */

-int ccl_device_inline bvh_aligned_node_intersect(
+int ccl_device_forceinline bvh_aligned_node_intersect(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
@ -377,7 +377,7 @@ int ccl_device_inline bvh_aligned_node_intersect(
 #  endif
 }

-int ccl_device_inline bvh_aligned_node_intersect_robust(
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
@ -441,7 +441,7 @@ int ccl_device_inline bvh_aligned_node_intersect_robust(
 #  endif
 }

-int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const ssef& isect_near,
@ -502,7 +502,7 @@ int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
 #  endif
 }

-int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const ssef& isect_near,
@ -573,7 +573,7 @@ int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
 #  endif
 }

-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3& P,
                                         const float3& dir,
                                         const ssef& isect_near,
@ -611,7 +611,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
 	}
 }

-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3& P,
                                                const float3& dir,
                                                const ssef& isect_near,
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@ -40,21 +40,16 @@
 *
 */

-#ifndef __KERNEL_GPU__
-ccl_device
-#else
-ccl_device_inline
-#endif
-bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                 const Ray *ray,
-                                 Intersection *isect,
-                                 const uint visibility
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                 , uint *lcg_state,
-                                 float difl,
-                                 float extmax
+                                                     , uint *lcg_state,
+                                                     float difl,
+                                                     float extmax
 #endif
-                                 )
+                                                     )
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN

 /* Don't inline intersect functions on GPU, this is faster */
 #ifdef __KERNEL_GPU__
-#  define ccl_device_intersect ccl_device_noinline
+#  define ccl_device_intersect ccl_device_forceinline
 #else
 #  define ccl_device_intersect ccl_device_inline
 #endif
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@ -36,7 +36,7 @@

 CCL_NAMESPACE_BEGIN

-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
                                  ShaderData *sd,
                                  const ShaderClosure *sc,
                                  float randu,
@ -147,7 +147,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #ifndef __KERNEL_CUDA__
 ccl_device
 #else
-ccl_device_inline
+ccl_device_forceinline
 #endif
 float3 bsdf_eval(KernelGlobals *kg,
                 ShaderData *sd,
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@ -62,7 +62,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
 	return 2.0f / (roughness*roughness) - 2.0f;
 }

-ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(
        const ShaderClosure *sc,
        const float3 I,
        const float3 omega_in,
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@ -183,7 +183,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(
 	*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
 }

-ccl_device_inline float3 microfacet_sample_stretched(
+ccl_device_forceinline float3 microfacet_sample_stretched(
 	KernelGlobals *kg, const float3 omega_i,
 	const float alpha_x, const float alpha_y,
 	const float randu, const float randv,
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 /* === GGX Microfacet distribution functions === */

 /* Isotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx(float3 wm, float alpha)
+ccl_device_forceinline float D_ggx(float3 wm, float alpha)
 {
 	wm.z *= wm.z;
 	alpha *= alpha;
@ -30,7 +30,7 @@ ccl_device_inline float D_ggx(float3 wm, float alpha)
 }

 /* Anisotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
+ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 {
 	float slope_x = -wm.x/alpha.x;
 	float slope_y = -wm.y/alpha.y;
@ -40,7 +40,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }

 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
 	if(cosI > 0.9999f || cosI < 1e-6f) {
 		const float r = sqrtf(randU.x / (1.0f - randU.x));
@ -78,7 +78,7 @@ ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 }

 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
 {
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
 	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
@ -94,7 +94,7 @@ ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, con
 /* === Phase functions: Glossy, Diffuse and Glass === */

 /* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
 {
 	if(n && k)
 		*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
@ -102,7 +102,7 @@ ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, floa
 	return -wi + 2.0f * wm * dot(wi, wm);
 }

-ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@ -132,7 +132,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda
 }

 /* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
+ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
 {
 	float3 tm, bm;
 	make_orthonormals(wm, &tm, &bm);
@ -141,14 +141,14 @@ ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float ra
 	return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
 }

-ccl_device_inline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
+ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
 {
 	const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
 	return make_float3(v, v, v);
 }

 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
+ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
 	float cosI = dot(wi, wm);
 	float f = fresnel_dielectric_cos(cosI, eta);
@ -162,7 +162,7 @@ ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta,
 	return normalize(wm*(cosI*inv_eta + cosT) - wi*inv_eta);
 }

-ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
+ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@ -195,7 +195,7 @@ ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda,
 /* === Utility functions for the random walks === */

 /* Smith Lambda function for GGX (based on page 12 of the supplemental implementation). */
-ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
+ccl_device_forceinline float mf_lambda(const float3 w, const float2 alpha)
 {
 	if(w.z > 0.9999f)
 		return 0.0f;
@ -212,18 +212,18 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
 }

 /* Height distribution CDF (based on page 4 of the supplemental implementation). */
-ccl_device_inline float mf_invC1(const float h)
+ccl_device_forceinline float mf_invC1(const float h)
 {
 	return 2.0f * saturate(h) - 1.0f;
 }

-ccl_device_inline float mf_C1(const float h)
+ccl_device_forceinline float mf_C1(const float h)
 {
 	return saturate(0.5f * (h + 1.0f));
 }

 /* Masking function (based on page 16 of the supplemental implementation). */
-ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda)
+ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float lambda)
 {
 	if(w.z > 0.9999f)
 		return 1.0f;
@ -233,7 +233,7 @@ ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda
 }

 /* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */
-ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
+ccl_device_forceinline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
 {
 	if(w.z > 0.9999f)
 		return false;
@ -262,14 +262,14 @@ ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, flo

 /* Approximation for the albedo of the single-scattering GGX distribution,
 * the missing energy is then approximated as a diffuse reflection for the PDF. */
-ccl_device_inline float mf_ggx_albedo(float r)
+ccl_device_forceinline float mf_ggx_albedo(float r)
 {
 	float albedo = 0.806495f*expf(-1.98712f*r*r) + 0.199531f;
 	albedo -= ((((((1.76741f*r - 8.43891f)*r + 15.784f)*r - 14.398f)*r + 6.45221f)*r - 1.19722f)*r + 0.027803f)*r + 0.00568739f;
 	return saturate(albedo);
 }

-ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
+ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
 	float D = D_ggx(normalize(wi+wo), alpha);
 	float lambda = mf_lambda(wi, make_float2(alpha, alpha));
@ -277,17 +277,17 @@ ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float
 	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
 }

-ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
+ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
 	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
 }

-ccl_device_inline float mf_diffuse_pdf(const float3 wo)
+ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
 {
 	return M_1_PI_F * wo.z;
 }

-ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
+ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
 	float3 wh;
 	float fresnel;
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@ -25,7 +25,7 @@
 * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
 * the balance heuristic isn't necessarily optimal anymore.
 */
-ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
        float3 wi,
        float3 wo,
        const bool wo_outside,
@ -168,7 +168,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 * escaped the surface in wo. The function returns the throughput between wi and wo.
 * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
 */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
 	, const float eta
 #elif defined(MF_MULTI_GLOSSY)
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 }

 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
 {
 	/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
 	 * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
 * Returns scaled radius, meaning the result is to be scaled up by d.
 * Since there's no closed form solution we do Newton-Raphson method to find it.
 */
-ccl_device_inline float bssrdf_burley_root_find(float xi)
+ccl_device_forceinline float bssrdf_burley_root_find(float xi)
 {
 	const float tolerance = 1e-6f;
 	const int max_iteration_count = 10;
@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
 		bssrdf_burley_sample(sc, xi, r, h);
 }

-ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
+ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		return bssrdf_cubic_pdf(sc, r);
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@ -222,10 +222,10 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)

 #ifdef __KERNEL_SSE2__
 /* Pass P and dir by reference to aligned vector */
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 #else
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
@ -621,7 +621,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 	return hit;
 }

-ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 {
 	/* define few macros to minimize code duplication for SSE */
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@ -54,13 +54,7 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 #endif
 }

-/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
-#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@ -37,6 +37,7 @@
 /* Qualifier wrappers for different names on different devices */

 #define ccl_device  __device__ __inline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
 #  define ccl_device_inline  __device__ __forceinline__
 #else
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@ -33,6 +33,7 @@
 /* in opencl all functions are device functions, so leave this empty */
 #define ccl_device
 #define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device ccl_noinline
 #define ccl_may_alias
 #define ccl_constant __constant
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@ -69,7 +69,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		Intersection isect;
 		uint visibility = path_state_ray_visibility(kg, state);
 		bool hit = scene_intersect(kg,
-		                           ray,
+		                           *ray,
 		                           visibility,
 		                           &isect,
 		                           NULL,
@ -655,9 +655,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
 		}

-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif

 #ifdef __KERNEL_DEBUG__
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@ -282,9 +282,9 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
 		}

-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif

 #ifdef __KERNEL_DEBUG__
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }

-ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@ -132,13 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng,
 #endif
 }

-/* Temporary workaround for Pascal cards, otherwise AA does not work properly. */
-#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ >= 600
-__device__ __forceinline__
-#else
-ccl_device_inline
-#endif
-void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@ -199,7 +193,7 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG

 /* Linear Congruential Generator */

-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
 {
 	/* implicit mod 2^32 */
 	rng = (1103515245*(rng) + 12345);
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@ -155,7 +155,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 	}
 	else {
 		Intersection isect;
-		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+		blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
 	}

 #ifdef __VOLUME__
@ -205,7 +205,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 	Intersection *isect = &isect_object;
 #endif

-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
+	bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);

 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
@ -221,7 +221,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;

-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
+				if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
 				{
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@ -1153,7 +1153,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->sd.osl_globals = sd->osl_globals;

 	/* raytrace */
-	return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+	return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
 }


--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@ -42,6 +42,7 @@

 #if defined(_WIN32) && !defined(FREE_WINDOWS)
 #define ccl_device_inline static __forceinline
+#define ccl_device_forceinline static __forceinline
 #define ccl_align(...) __declspec(align(__VA_ARGS__))
 #ifdef __KERNEL_64_BIT__
 #define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@ -56,6 +57,7 @@
 #else

 #define ccl_device_inline static inline __attribute__((always_inline))
+#define ccl_device_forceinline static inline __attribute__((always_inline))
 #define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
 #ifndef FREE_WINDOWS64
 #define __forceinline inline __attribute__((always_inline))