Cycles: Use more SSE intrinsics for float3 type

This gives about a 5% speedup on AVX2 kernels (other kernels still
have SSE disabled for math operations) and solves the slowdown of the
koro scene mentioned in the previous commit.

The title says it all actually. This commit also contains
changes to pass float3 as const reference in affected functions.

This should make MSVC happier without breaking OpenCL because it's
only done in areas which are ifdef-ed for non-OpenCL.

Another patch based on inspiration from Maxym Dmytrychenko, thanks!
This commit is contained in:
Sergey Sharybin 2016-10-12 14:23:29 +02:00
parent 42aeb608e7
commit e588106d45
2 changed files with 75 additions and 18 deletions

View File

@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
#ifndef __KERNEL_OPENCL__
/* Unary negation of a float3.
 * Takes the argument by const reference so MSVC does not copy the
 * 16-byte SSE-backed struct (this header section is ifdef-ed out for
 * OpenCL, so references are safe here). */
ccl_device_inline float3 operator-(const float3& a)
{
#ifdef __KERNEL_SSE__
	/* XOR with 0x80000000 flips the IEEE-754 sign bit of every lane. */
	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#else
	return make_float3(-a.x, -a.y, -a.z);
#endif
}
/* Component-wise product of two float3 values. */
ccl_device_inline float3 operator*(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_mul_ps(a.m128, b.m128));
#else
	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
#endif
}
/* Scale a float3 by a scalar (vector on the left). */
ccl_device_inline float3 operator*(const float3& a, const float f)
{
#ifdef __KERNEL_SSE__
	/* Broadcast f into all lanes, then one SIMD multiply. */
	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
#else
	return make_float3(a.x*f, a.y*f, a.z*f);
#endif
}
/* Scale a float3 by a scalar (scalar on the left); commutative twin of
 * operator*(const float3&, const float). */
ccl_device_inline float3 operator*(const float f, const float3& a)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
#else
	return make_float3(a.x*f, a.y*f, a.z*f);
#endif
}
/* Divide a scalar by each component of a float3.
 * NOTE: uses _mm_div_ps rather than _mm_rcp_ps + multiply: _mm_rcp_ps is
 * only an ~12-bit approximation, which would make the SSE path disagree
 * noticeably with the exact scalar path below. */
ccl_device_inline float3 operator/(const float f, const float3& a)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
#else
	return make_float3(f / a.x, f / a.y, f / a.z);
#endif
}
/* Divide a float3 by a scalar.
 * Computes the reciprocal once so only a single division is needed,
 * then reuses the SIMD-aware operator*(float3, float). */
ccl_device_inline float3 operator/(const float3& a, const float f)
{
	float invf = 1.0f/f;
	return a * invf;
}
/* Component-wise division of two float3 values.
 * Uses the explicit .m128 member (not the implicit __m128 conversion) for
 * consistency with the sibling operators, and _mm_div_ps instead of the
 * low-precision _mm_rcp_ps approximation so the SSE and scalar paths
 * produce matching results. */
ccl_device_inline float3 operator/(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_div_ps(a.m128, b.m128));
#else
	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
#endif
}
/* Component-wise sum of two float3 values. */
ccl_device_inline float3 operator+(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_add_ps(a.m128, b.m128));
#else
	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
#endif
}
/* Component-wise difference of two float3 values. */
ccl_device_inline float3 operator-(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_sub_ps(a.m128, b.m128));
#else
	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
#endif
}
/* In-place addition; delegates to the SIMD-aware binary operator+ and
 * returns the updated value, matching built-in compound-assignment
 * semantics. */
ccl_device_inline float3 operator+=(float3& a, const float3& b)
{
	return a = a + b;
}
@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
#endif
}
/* 2D dot product of the x/y components only: a.x*b.x + a.y*b.y. */
ccl_device_inline float dot_xy(const float3& a, const float3& b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
/* Lane 0 of hadd(a*b, b) is (a*b).x + (a*b).y; the second operand only
 * fills the upper lanes, which are discarded by _mm_cvtss_f32. */
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
#else
return a.x*b.x + a.y*b.y;
#endif
}
ccl_device_inline float dot(const float4 a, const float4 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
/* Normalize a vector and return its original length through t.
 * Multiplies by the reciprocal so only one division is performed instead
 * of three. NOTE(review): assumes a is non-zero; len(a) == 0 yields
 * inf/nan components, same as the previous a/(*t) form. */
ccl_device_inline float3 normalize_len(const float3 a, float *t)
{
	*t = len(a);
	float x = 1.0f / *t;
	return a*x;
}
/* Normalize a vector, returning it unchanged when its length is zero
 * (avoids division by zero). Uses multiply-by-reciprocal so only a
 * single scalar division is needed. */
ccl_device_inline float3 safe_normalize(const float3 a)
{
	float t = len(a);
	return (t != 0.0f)? a * (1.0f/t) : a;
}
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)

View File

@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
__forceinline int3(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
int3(const int3& a) { m128 = a.m128; }
int3& operator =(const int3& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
__forceinline int4(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
int4(const int4& a) : m128(a.m128) {}
int4& operator=(const int4& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
};
__forceinline float3() {}
__forceinline float3(const __m128 a) : m128(a) {}
__forceinline float3(const __m128& a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
__forceinline float3(const float3& a) : m128(a.m128) {}
__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif
@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
__forceinline float4(const __m128 a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
__forceinline float4(const float4& a) : m128(a.m128) {}
__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif