Cycles: Use more SSE intrinsics for float3 type

This gives about a 5% speedup on AVX2 kernels (other kernels still
have SSE disabled for math operations) and solves the slowdown of the
koro scene mentioned in the previous commit.

The title says it all actually. This commit also contains
changes to pass float3 as const reference in affected functions.

This should make MSVC happier without breaking OpenCL because it's
only done in areas which are ifdef-ed for non-OpenCL.

Another patch based on inspiration from Maxym Dmytrychenko, thanks!
This commit is contained in:
Sergey Sharybin 2016-10-12 14:23:29 +02:00
parent 42aeb608e7
commit e588106d45
2 changed files with 75 additions and 18 deletions

View File

@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
#ifndef __KERNEL_OPENCL__
/* Unary negation of a float3.
 * Takes the argument by const reference so MSVC does not copy the
 * 16-byte SSE-backed struct (this header section is ifdef-ed out for
 * OpenCL, so references are safe here). */
ccl_device_inline float3 operator-(const float3& a)
{
#ifdef __KERNEL_SSE__
	/* XOR with 0x80000000 flips the IEEE-754 sign bit of every lane. */
	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#else
	return make_float3(-a.x, -a.y, -a.z);
#endif
}
/* Component-wise product of two float3 values. */
ccl_device_inline float3 operator*(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_mul_ps(a.m128, b.m128));
#else
	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
#endif
}
/* Scale a float3 by a scalar (vector on the left). */
ccl_device_inline float3 operator*(const float3& a, const float f)
{
#ifdef __KERNEL_SSE__
	/* Broadcast f into all lanes, then one SIMD multiply. */
	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
#else
	return make_float3(a.x*f, a.y*f, a.z*f);
#endif
}
/* Scale a float3 by a scalar (scalar on the left); commutative twin of
 * operator*(const float3&, const float). */
ccl_device_inline float3 operator*(const float f, const float3& a)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
#else
	return make_float3(a.x*f, a.y*f, a.z*f);
#endif
}
/* Divide a scalar by each component of a float3.
 * NOTE: uses _mm_div_ps rather than _mm_rcp_ps + multiply: _mm_rcp_ps is
 * only an ~12-bit approximation, which would make the SSE path disagree
 * noticeably with the exact scalar path below. */
ccl_device_inline float3 operator/(const float f, const float3& a)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
#else
	return make_float3(f / a.x, f / a.y, f / a.z);
#endif
}
/* Divide a float3 by a scalar.
 * Computes the reciprocal once so only a single division is needed,
 * then reuses the SIMD-aware operator*(float3, float). */
ccl_device_inline float3 operator/(const float3& a, const float f)
{
	float invf = 1.0f/f;
	return a * invf;
}
/* Component-wise division of two float3 values.
 * Uses the explicit .m128 member (not the implicit __m128 conversion) for
 * consistency with the sibling operators, and _mm_div_ps instead of the
 * low-precision _mm_rcp_ps approximation so the SSE and scalar paths
 * produce matching results. */
ccl_device_inline float3 operator/(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_div_ps(a.m128, b.m128));
#else
	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
#endif
}
/* Component-wise sum of two float3 values. */
ccl_device_inline float3 operator+(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_add_ps(a.m128, b.m128));
#else
	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
#endif
}
/* Component-wise difference of two float3 values. */
ccl_device_inline float3 operator-(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
	return float3(_mm_sub_ps(a.m128, b.m128));
#else
	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
#endif
}
/* In-place addition; delegates to the SIMD-aware binary operator+ and
 * returns the updated value, matching built-in compound-assignment
 * semantics. */
ccl_device_inline float3 operator+=(float3& a, const float3& b)
{
	return a = a + b;
}
@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
#endif
}
/* 2D dot product of the x/y components only: a.x*b.x + a.y*b.y. */
ccl_device_inline float dot_xy(const float3& a, const float3& b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
/* Lane 0 of hadd(a*b, b) is (a*b).x + (a*b).y; the second operand only
 * fills the upper lanes, which are discarded by _mm_cvtss_f32. */
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
#else
return a.x*b.x + a.y*b.y;
#endif
}
ccl_device_inline float dot(const float4 a, const float4 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
/* Normalize a vector and return its original length through t.
 * Multiplies by the reciprocal so only one division is performed instead
 * of three. NOTE(review): assumes a is non-zero; len(a) == 0 yields
 * inf/nan components, same as the previous a/(*t) form. */
ccl_device_inline float3 normalize_len(const float3 a, float *t)
{
	*t = len(a);
	float x = 1.0f / *t;
	return a*x;
}
/* Normalize a vector, returning it unchanged when its length is zero
 * (avoids division by zero). Uses multiply-by-reciprocal so only a
 * single scalar division is needed. */
ccl_device_inline float3 safe_normalize(const float3 a)
{
	float t = len(a);
	return (t != 0.0f)? a * (1.0f/t) : a;
}
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)

View File

@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
__forceinline int3(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
int3(const int3& a) { m128 = a.m128; }
int3& operator =(const int3& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
__forceinline int4(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
int4(const int4& a) : m128(a.m128) {}
int4& operator=(const int4& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
};
__forceinline float3() {}
__forceinline float3(const __m128 a) : m128(a) {}
__forceinline float3(const __m128& a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
__forceinline float3(const float3& a) : m128(a.m128) {}
__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif
@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
__forceinline float4(const __m128 a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
__forceinline float4(const float4& a) : m128(a.m128) {}
__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif