Cycles: add more math functions for float4

Add more math functions for float4, bringing them on par with the float3 ones.
This makes it possible to change the type of float3 variables to float4 without
additional work.

Differential Revision: https://developer.blender.org/D15318
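As a minimal illustration of the parity claim above (a hypothetical snippet, not part of the patch; it assumes compilation inside the Cycles source tree so that util/math_float4.h is available), a helper written against the generic math API keeps compiling when its parameters change from float3 to float4, because safe_divide() and average() now have float4 overloads:

#include "util/math_float4.h"

CCL_NAMESPACE_BEGIN

/* Hypothetical helper: originally declared with float3 parameters, switched to
 * float4 with no other edits thanks to the overloads added in this commit. */
ccl_device_inline float average_exposure(const float4 color, const float4 scale)
{
  /* safe_divide() returns 0 for zero denominators, per the patch below. */
  const float4 scaled = safe_divide(color, scale);
  /* average() computes reduce_add(scaled) * 0.25f. */
  return average(scaled);
}

CCL_NAMESPACE_END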
Andrii Symkin 2022-06-30 16:22:43 +02:00 committed by Brecht Van Lommel
parent 6bb703a9ee
commit f00d9e80ae
1 changed file with 90 additions and 50 deletions


@@ -55,6 +55,7 @@ ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
#endif /* !__KERNEL_METAL__*/
ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
ccl_device_inline float4 safe_divide(const float4 a, const float b);
#ifdef __KERNEL_SSE__
@@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
# endif
#endif /* __KERNEL_SSE__ */
ccl_device_inline float reduce_min(const float4 a);
ccl_device_inline float reduce_max(const float4 a);
ccl_device_inline float reduce_add(const float4 a);
ccl_device_inline bool isequal(const float4 a, const float4 b);
#ifndef __KERNEL_GPU__
ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
ccl_device_inline float4 reduce_min(const float4 &a);
ccl_device_inline float4 reduce_max(const float4 &a);
ccl_device_inline float4 reduce_add(const float4 &a);
#endif /* !__KERNEL_GPU__ */
/*******************************************************************************
@@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a)
# endif
}
ccl_device_inline float4 reduce_add(const float4 &a)
{
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
# else
float4 h(shuffle<1, 0, 3, 2>(a) + a);
return shuffle<2, 3, 0, 1>(h) + h;
# endif
# else
float sum = (a.x + a.y) + (a.z + a.w);
return make_float4(sum, sum, sum, sum);
# endif
}
ccl_device_inline float average(const float4 &a)
{
return reduce_add(a).x * 0.25f;
return reduce_add(a) * 0.25f;
}
ccl_device_inline float len(const float4 &a)
@@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
ccl_device_inline float4 saturate(const float4 &a)
{
return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
}
ccl_device_inline float4 exp(float4 v)
{
return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
}
ccl_device_inline float4 log(float4 v)
{
return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
}
#endif /* !__KERNEL_METAL__*/
ccl_device_inline float reduce_add(const float4 a)
{
#if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return vaddvq_f32(a);
# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
# else
float4 h(shuffle<1, 0, 3, 2>(a) + a);
return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
# endif
#else
return a.x + a.y + a.z + a.w;
#endif
}
ccl_device_inline float reduce_min(const float4 a)
{
#if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return vminvq_f32(a);
# else
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
# endif
#else
return min(min(a.x, a.y), min(a.z, a.w));
#endif
}
ccl_device_inline float reduce_max(const float4 a)
{
#if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return vmaxvq_f32(a);
# else
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
# endif
#else
return max(max(a.x, a.y), max(a.z, a.w));
#endif
}
ccl_device_inline bool isequal(const float4 a, const float4 b)
{
#if defined(__KERNEL_METAL__)
return all(a == b);
#else
return a == b;
#endif
}
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
@@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
return select(mask, a, zero_float4());
}
ccl_device_inline float4 reduce_min(const float4 &a)
{
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vminvq_f32(a)));
# else
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
return min(shuffle<2, 3, 0, 1>(h), h);
# endif
# else
return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
# endif
}
ccl_device_inline float4 reduce_max(const float4 &a)
{
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vmaxvq_f32(a)));
# else
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
return max(shuffle<2, 3, 0, 1>(h), h);
# endif
# else
return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
# endif
}
ccl_device_inline float4 load_float4(ccl_private const float *v)
{
# ifdef __KERNEL_SSE__
@@ -505,6 +532,14 @@ ccl_device_inline float4 safe_divide(const float4 a, const float b)
return (b != 0.0f) ? a / b : zero_float4();
}
ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
{
return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
(b.y != 0.0f) ? a.y / b.y : 0.0f,
(b.z != 0.0f) ? a.z / b.z : 0.0f,
(b.w != 0.0f) ? a.w / b.w : 0.0f);
}
ccl_device_inline bool isfinite_safe(float4 v)
{
return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
@@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite(float4 v)
return v;
}
ccl_device_inline float4 pow(float4 v, float e)
{
return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.w, e));
}
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT4_H__ */