Cycles: add support for Arm Neon instructions using sse2neon
Based on patch contributed by Apple and Stefan Werner. Ref D8237, T78710
This commit is contained in:
parent
68dd7617d7
commit
0e9497e886
|
@ -102,7 +102,7 @@ size_t SocketType::max_size()
|
|||
|
||||
void *SocketType::zero_default_value()
|
||||
{
|
||||
static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
|
||||
static Transform zero_transform = transform_zero();
|
||||
return &zero_transform;
|
||||
}
|
||||
|
||||
|
|
|
@ -741,7 +741,8 @@ float Camera::world_to_raster_size(float3 P)
|
|||
float3 D = transform_point(&worldtocamera, P);
|
||||
float dist = len(D);
|
||||
|
||||
Ray ray = {{0}};
|
||||
Ray ray;
|
||||
memset(&ray, 0, sizeof(ray));
|
||||
|
||||
/* Distortion can become so great that the results become meaningless, there
|
||||
* may be a better way to do this, but calculating differentials from the
|
||||
|
|
|
@ -2081,6 +2081,16 @@ ConvertNode::ConvertNode(SocketType::Type from_, SocketType::Type to_, bool auto
|
|||
special_type = SHADER_SPECIAL_TYPE_AUTOCONVERT;
|
||||
}
|
||||
|
||||
/* Union usage requires a manual copy constructor. */
|
||||
ConvertNode::ConvertNode(const ConvertNode &other)
|
||||
: ShaderNode(other),
|
||||
from(other.from),
|
||||
to(other.to),
|
||||
value_color(other.value_color),
|
||||
value_string(other.value_string)
|
||||
{
|
||||
}
|
||||
|
||||
void ConvertNode::constant_fold(const ConstantFolder &folder)
|
||||
{
|
||||
/* proxy nodes should have been removed at this point */
|
||||
|
|
|
@ -501,6 +501,7 @@ class RGBToBWNode : public ShaderNode {
|
|||
class ConvertNode : public ShaderNode {
|
||||
public:
|
||||
ConvertNode(SocketType::Type from, SocketType::Type to, bool autoconvert = false);
|
||||
ConvertNode(const ConvertNode &other);
|
||||
SHADER_NODE_BASE_CLASS(ConvertNode)
|
||||
|
||||
void constant_fold(const ConstantFolder &folder);
|
||||
|
|
|
@ -304,8 +304,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
|
|||
ccl_device_inline float3 fabs(const float3 &a)
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_NEON__
|
||||
return float3(vabsq_f32(a.m128));
|
||||
# else
|
||||
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
|
||||
return float3(_mm_and_ps(a.m128, mask));
|
||||
# endif
|
||||
# else
|
||||
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
|
||||
# endif
|
||||
|
@ -447,7 +451,13 @@ ccl_device_inline bool is_zero(const float3 a)
|
|||
|
||||
ccl_device_inline float reduce_add(const float3 a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
|
||||
__m128 t = a.m128;
|
||||
t[3] = 0.0f;
|
||||
return vaddvq_f32(t);
|
||||
#else
|
||||
return (a.x + a.y + a.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float average(const float3 a)
|
||||
|
|
|
@ -257,7 +257,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
|
|||
ccl_device_inline float dot(const float4 &a, const float4 &b)
|
||||
{
|
||||
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
__m128 t = vmulq_f32(a, b);
|
||||
return vaddvq_f32(t);
|
||||
# else
|
||||
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
|
||||
# endif
|
||||
# else
|
||||
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
|
||||
# endif
|
||||
|
@ -313,8 +318,10 @@ ccl_device_inline bool is_zero(const float4 &a)
|
|||
|
||||
ccl_device_inline float4 reduce_add(const float4 &a)
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE3__
|
||||
# if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(vdupq_n_f32(vaddvq_f32(a)));
|
||||
# elif defined(__KERNEL_SSE3__)
|
||||
float4 h(_mm_hadd_ps(a.m128, a.m128));
|
||||
return float4(_mm_hadd_ps(h.m128, h.m128));
|
||||
# else
|
||||
|
@ -373,8 +380,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
|
|||
|
||||
ccl_device_inline float4 fabs(const float4 &a)
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
# if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(vabsq_f32(a));
|
||||
# else
|
||||
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
|
||||
# endif
|
||||
# else
|
||||
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
|
||||
# endif
|
||||
|
@ -400,14 +411,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
|
|||
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
|
||||
__forceinline const float4 shuffle(const float4 &b)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
|
||||
# else
|
||||
return float4(_mm_castsi128_ps(
|
||||
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
|
||||
# endif
|
||||
}
|
||||
|
||||
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
|
||||
__forceinline const float4 shuffle(const float4 &a, const float4 &b)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
|
||||
# else
|
||||
return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
|
||||
# endif
|
||||
}
|
||||
|
||||
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
|
||||
|
@ -457,9 +476,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
|
|||
|
||||
ccl_device_inline float4 reduce_min(const float4 &a)
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
# if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(vdupq_n_f32(vminvq_f32(a)));
|
||||
# else
|
||||
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
|
||||
return min(shuffle<2, 3, 0, 1>(h), h);
|
||||
# endif
|
||||
# else
|
||||
return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
|
||||
# endif
|
||||
|
@ -467,9 +490,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
|
|||
|
||||
ccl_device_inline float4 reduce_max(const float4 &a)
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
# if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(vdupq_n_f32(vmaxvq_f32(a)));
|
||||
# else
|
||||
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
|
||||
return max(shuffle<2, 3, 0, 1>(h), h);
|
||||
# endif
|
||||
# else
|
||||
return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
|
||||
# endif
|
||||
|
|
|
@ -27,44 +27,50 @@
|
|||
|
||||
/* We require minimum SSE2 support on x86, so auto enable. */
|
||||
# define __KERNEL_SSE2__
|
||||
|
||||
# ifdef WITH_KERNEL_SSE2
|
||||
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# endif
|
||||
|
||||
# ifdef WITH_KERNEL_SSE3
|
||||
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# endif
|
||||
|
||||
# endif /* defined(i386) || defined(_M_IX86) */
|
||||
|
||||
/* x86-64
|
||||
*
|
||||
* Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
|
||||
|
||||
# if defined(__x86_64__) || defined(_M_X64)
|
||||
# elif defined(__x86_64__) || defined(_M_X64)
|
||||
|
||||
/* SSE2 is always available on x86-64 CPUs, so auto enable */
|
||||
# define __KERNEL_SSE2__
|
||||
|
||||
/* no SSE2 kernel on x86-64, part of regular kernel */
|
||||
# ifdef WITH_KERNEL_SSE3
|
||||
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# endif
|
||||
|
||||
# ifdef WITH_KERNEL_SSE41
|
||||
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# endif
|
||||
|
||||
# ifdef WITH_KERNEL_AVX
|
||||
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# endif
|
||||
|
||||
# ifdef WITH_KERNEL_AVX2
|
||||
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# endif
|
||||
|
||||
# endif /* defined(__x86_64__) || defined(_M_X64) */
|
||||
/* Arm Neon
|
||||
*
|
||||
* Compile a SSE4 kernel emulated with Neon. Most code is shared with
|
||||
* SSE, some specializations for performance and compatibility are made
|
||||
* made testing for __KERNEL_NEON__. */
|
||||
|
||||
# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
|
||||
|
||||
# define __KERNEL_NEON__
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
|
||||
# endif
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -35,6 +35,9 @@
|
|||
# include <intrin.h>
|
||||
#elif (defined(__x86_64__) || defined(__i386__))
|
||||
# include <x86intrin.h>
|
||||
#elif defined(__KERNEL_NEON__)
|
||||
# define SSE2NEON_PRECISE_MINMAX 1
|
||||
# include <sse2neon.h>
|
||||
#endif
|
||||
|
||||
/* Floating Point Control, for Embree. */
|
||||
|
@ -116,6 +119,80 @@ static struct StepTy {
|
|||
|
||||
#endif
|
||||
|
||||
/* Utilities used by Neon */
|
||||
#if defined(__KERNEL_NEON__)
|
||||
template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
|
||||
{
|
||||
if (i0 == i1 && i0 == i2 && i0 == i3) {
|
||||
return vdupq_laneq_s32(a, i0);
|
||||
}
|
||||
static const uint8_t tbl[16] = {(i0 * 4) + 0,
|
||||
(i0 * 4) + 1,
|
||||
(i0 * 4) + 2,
|
||||
(i0 * 4) + 3,
|
||||
(i1 * 4) + 0,
|
||||
(i1 * 4) + 1,
|
||||
(i1 * 4) + 2,
|
||||
(i1 * 4) + 3,
|
||||
(i2 * 4) + 0,
|
||||
(i2 * 4) + 1,
|
||||
(i2 * 4) + 2,
|
||||
(i2 * 4) + 3,
|
||||
(i3 * 4) + 0,
|
||||
(i3 * 4) + 1,
|
||||
(i3 * 4) + 2,
|
||||
(i3 * 4) + 3};
|
||||
|
||||
return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
|
||||
}
|
||||
|
||||
template<class type, int i0, int i1, int i2, int i3>
|
||||
type shuffle_neon(const type &a, const type &b)
|
||||
{
|
||||
if (&a == &b) {
|
||||
static const uint8_t tbl[16] = {(i0 * 4) + 0,
|
||||
(i0 * 4) + 1,
|
||||
(i0 * 4) + 2,
|
||||
(i0 * 4) + 3,
|
||||
(i1 * 4) + 0,
|
||||
(i1 * 4) + 1,
|
||||
(i1 * 4) + 2,
|
||||
(i1 * 4) + 3,
|
||||
(i2 * 4) + 0,
|
||||
(i2 * 4) + 1,
|
||||
(i2 * 4) + 2,
|
||||
(i2 * 4) + 3,
|
||||
(i3 * 4) + 0,
|
||||
(i3 * 4) + 1,
|
||||
(i3 * 4) + 2,
|
||||
(i3 * 4) + 3};
|
||||
|
||||
return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
|
||||
}
|
||||
else {
|
||||
|
||||
static const uint8_t tbl[16] = {(i0 * 4) + 0,
|
||||
(i0 * 4) + 1,
|
||||
(i0 * 4) + 2,
|
||||
(i0 * 4) + 3,
|
||||
(i1 * 4) + 0,
|
||||
(i1 * 4) + 1,
|
||||
(i1 * 4) + 2,
|
||||
(i1 * 4) + 3,
|
||||
(i2 * 4) + 0 + 16,
|
||||
(i2 * 4) + 1 + 16,
|
||||
(i2 * 4) + 2 + 16,
|
||||
(i2 * 4) + 3 + 16,
|
||||
(i3 * 4) + 0 + 16,
|
||||
(i3 * 4) + 1 + 16,
|
||||
(i3 * 4) + 2 + 16,
|
||||
(i3 * 4) + 3 + 16};
|
||||
|
||||
return vqtbl2q_s8((int8x16x2_t){a, b}, *(int8x16_t *)tbl);
|
||||
}
|
||||
}
|
||||
#endif /* __KERNEL_NEON */
|
||||
|
||||
/* Intrinsics Functions
|
||||
*
|
||||
* For fast bit operations. */
|
||||
|
@ -428,8 +505,9 @@ __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
|
|||
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
|
||||
}
|
||||
|
||||
# undef _mm_extract_epi32
|
||||
# define _mm_extract_epi32 _mm_extract_epi32_emu
|
||||
# ifndef __KERNEL_NEON__
|
||||
# undef _mm_extract_epi32
|
||||
# define _mm_extract_epi32 _mm_extract_epi32_emu
|
||||
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
|
||||
{
|
||||
switch (index) {
|
||||
|
@ -446,6 +524,7 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
|
|||
return 0;
|
||||
}
|
||||
}
|
||||
# endif
|
||||
|
||||
# undef _mm_insert_epi32
|
||||
# define _mm_insert_epi32 _mm_insert_epi32_emu
|
||||
|
|
|
@ -197,9 +197,14 @@ __forceinline const sseb unpackhi(const sseb &a, const sseb &b)
|
|||
template<size_t i0, size_t i1, size_t i2, size_t i3>
|
||||
__forceinline const sseb shuffle(const sseb &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
|
||||
# else
|
||||
return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
# endif
|
||||
}
|
||||
|
||||
# ifndef __KERNEL_NEON__
|
||||
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
|
||||
{
|
||||
return _mm_movelh_ps(a, a);
|
||||
|
@ -209,13 +214,19 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
|
|||
{
|
||||
return _mm_movehl_ps(a, a);
|
||||
}
|
||||
# endif
|
||||
|
||||
template<size_t i0, size_t i1, size_t i2, size_t i3>
|
||||
__forceinline const sseb shuffle(const sseb &a, const sseb &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
|
||||
# else
|
||||
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
# endif
|
||||
}
|
||||
|
||||
# ifndef __KERNEL_NEON__
|
||||
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
|
||||
{
|
||||
return _mm_movelh_ps(a, b);
|
||||
|
@ -225,8 +236,9 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sse
|
|||
{
|
||||
return _mm_movehl_ps(b, a);
|
||||
}
|
||||
# endif
|
||||
|
||||
# if defined(__KERNEL_SSE3__)
|
||||
# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
|
||||
template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
|
||||
{
|
||||
return _mm_moveldup_ps(a);
|
||||
|
@ -241,7 +253,16 @@ template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
|
|||
template<size_t dst, size_t src, size_t clr>
|
||||
__forceinline const sseb insert(const sseb &a, const sseb &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
sseb res = a;
|
||||
if (clr)
|
||||
res[dst] = 0;
|
||||
else
|
||||
res[dst] = b[src];
|
||||
return res;
|
||||
# else
|
||||
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
|
||||
# endif
|
||||
}
|
||||
template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
|
||||
{
|
||||
|
@ -260,7 +281,13 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b
|
|||
# if defined(__KERNEL_SSE41__)
|
||||
__forceinline uint32_t popcnt(const sseb &a)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
const int32x4_t mask = {1, 1, 1, 1};
|
||||
int32x4_t t = vandq_s32(a.m128, mask);
|
||||
return vaddvq_s32(t);
|
||||
# else
|
||||
return _mm_popcnt_u32(_mm_movemask_ps(a));
|
||||
# endif
|
||||
}
|
||||
# else
|
||||
__forceinline uint32_t popcnt(const sseb &a)
|
||||
|
@ -271,23 +298,43 @@ __forceinline uint32_t popcnt(const sseb &a)
|
|||
|
||||
__forceinline bool reduce_and(const sseb &a)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vaddvq_s32(a.m128) == -4;
|
||||
# else
|
||||
return _mm_movemask_ps(a) == 0xf;
|
||||
# endif
|
||||
}
|
||||
__forceinline bool reduce_or(const sseb &a)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vaddvq_s32(a.m128) != 0x0;
|
||||
# else
|
||||
return _mm_movemask_ps(a) != 0x0;
|
||||
# endif
|
||||
}
|
||||
__forceinline bool all(const sseb &b)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vaddvq_s32(b.m128) == -4;
|
||||
# else
|
||||
return _mm_movemask_ps(b) == 0xf;
|
||||
# endif
|
||||
}
|
||||
__forceinline bool any(const sseb &b)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vaddvq_s32(b.m128) != 0x0;
|
||||
# else
|
||||
return _mm_movemask_ps(b) != 0x0;
|
||||
# endif
|
||||
}
|
||||
__forceinline bool none(const sseb &b)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vaddvq_s32(b.m128) == 0x0;
|
||||
# else
|
||||
return _mm_movemask_ps(b) == 0x0;
|
||||
# endif
|
||||
}
|
||||
|
||||
__forceinline uint32_t movemask(const sseb &a)
|
||||
|
|
|
@ -303,41 +303,46 @@ __forceinline ssef maxi(const ssef &a, const ssef &b)
|
|||
/// Ternary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
# if defined(__KERNEL_AVX2__)
|
||||
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vfmaq_f32(c, a, b);
|
||||
# elif defined(__KERNEL_AVX2__)
|
||||
return _mm_fmadd_ps(a, b, c);
|
||||
}
|
||||
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
return _mm_fmsub_ps(a, b, c);
|
||||
}
|
||||
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
return _mm_fnmadd_ps(a, b, c);
|
||||
}
|
||||
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
return _mm_fnmsub_ps(a, b, c);
|
||||
}
|
||||
# else
|
||||
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
return a * b + c;
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vfmaq_f32(vnegq_f32(c), a, b);
|
||||
# elif defined(__KERNEL_AVX2__)
|
||||
return _mm_fmsub_ps(a, b, c);
|
||||
# else
|
||||
return a * b - c;
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vfmsq_f32(c, a, b);
|
||||
# elif defined(__KERNEL_AVX2__)
|
||||
return _mm_fnmadd_ps(a, b, c);
|
||||
# else
|
||||
return c - a * b;
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
|
||||
{
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return vfmsq_f32(vnegq_f32(c), a, b);
|
||||
# elif defined(__KERNEL_AVX2__)
|
||||
return _mm_fnmsub_ps(a, b, c);
|
||||
# else
|
||||
return -a * b - c;
|
||||
}
|
||||
# endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Assignment Operators
|
||||
|
@ -496,27 +501,51 @@ __forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
|
|||
# if defined(__KERNEL_SSE41__)
|
||||
__forceinline const ssef round_even(const ssef &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vrndnq_f32(a);
|
||||
# else
|
||||
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef round_down(const ssef &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vrndmq_f32(a);
|
||||
# else
|
||||
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef round_up(const ssef &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vrndpq_f32(a);
|
||||
# else
|
||||
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef round_zero(const ssef &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vrndq_f32(a);
|
||||
# else
|
||||
return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef floor(const ssef &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vrndnq_f32(a);
|
||||
# else
|
||||
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef ceil(const ssef &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vrndpq_f32(a);
|
||||
# else
|
||||
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
|
||||
# endif
|
||||
}
|
||||
# endif
|
||||
|
||||
|
@ -566,7 +595,11 @@ __forceinline ssef unpackhi(const ssef &a, const ssef &b)
|
|||
template<size_t i0, size_t i1, size_t i2, size_t i3>
|
||||
__forceinline const ssef shuffle(const ssef &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128);
|
||||
# else
|
||||
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
# endif
|
||||
}
|
||||
|
||||
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
|
||||
|
@ -582,14 +615,23 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
|
|||
template<size_t i0, size_t i1, size_t i2, size_t i3>
|
||||
__forceinline const ssef shuffle(const ssef &a, const ssef &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
|
||||
# else
|
||||
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
# endif
|
||||
}
|
||||
|
||||
template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle<float32x4_t, i0, i0, i0, i0>(a, b);
|
||||
# else
|
||||
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
|
||||
# endif
|
||||
}
|
||||
|
||||
# ifndef __KERNEL_NEON__
|
||||
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
|
||||
{
|
||||
return _mm_movelh_ps(a, b);
|
||||
|
@ -599,6 +641,7 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const sse
|
|||
{
|
||||
return _mm_movehl_ps(b, a);
|
||||
}
|
||||
# endif
|
||||
|
||||
# if defined(__KERNEL_SSSE3__)
|
||||
__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
|
||||
|
@ -643,7 +686,16 @@ template<> __forceinline float extract<0>(const ssef &a)
|
|||
template<size_t dst, size_t src, size_t clr>
|
||||
__forceinline const ssef insert(const ssef &a, const ssef &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
ssef res = a;
|
||||
if (clr)
|
||||
res[dst] = 0;
|
||||
else
|
||||
res[dst] = b[src];
|
||||
return res;
|
||||
# else
|
||||
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
|
||||
# endif
|
||||
}
|
||||
template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
|
||||
{
|
||||
|
@ -703,31 +755,55 @@ __forceinline void transpose(
|
|||
|
||||
__forceinline const ssef vreduce_min(const ssef &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vdupq_n_f32(vminvq_f32(v));
|
||||
# else
|
||||
ssef h = min(shuffle<1, 0, 3, 2>(v), v);
|
||||
return min(shuffle<2, 3, 0, 1>(h), h);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef vreduce_max(const ssef &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vdupq_n_f32(vmaxvq_f32(v));
|
||||
# else
|
||||
ssef h = max(shuffle<1, 0, 3, 2>(v), v);
|
||||
return max(shuffle<2, 3, 0, 1>(h), h);
|
||||
# endif
|
||||
}
|
||||
__forceinline const ssef vreduce_add(const ssef &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vdupq_n_f32(vaddvq_f32(v));
|
||||
# else
|
||||
ssef h = shuffle<1, 0, 3, 2>(v) + v;
|
||||
return shuffle<2, 3, 0, 1>(h) + h;
|
||||
# endif
|
||||
}
|
||||
|
||||
__forceinline float reduce_min(const ssef &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vminvq_f32(v);
|
||||
# else
|
||||
return _mm_cvtss_f32(vreduce_min(v));
|
||||
# endif
|
||||
}
|
||||
__forceinline float reduce_max(const ssef &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vmaxvq_f32(v);
|
||||
# else
|
||||
return _mm_cvtss_f32(vreduce_max(v));
|
||||
# endif
|
||||
}
|
||||
__forceinline float reduce_add(const ssef &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vaddvq_f32(v);
|
||||
# else
|
||||
return _mm_cvtss_f32(vreduce_add(v));
|
||||
# endif
|
||||
}
|
||||
|
||||
__forceinline uint32_t select_min(const ssef &v)
|
||||
|
@ -942,14 +1018,14 @@ ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
|
|||
{
|
||||
/* shuffle value must be a constant, so we need to branch */
|
||||
if (shuf)
|
||||
return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
return shuffle<1, 0, 3, 2>(a);
|
||||
else
|
||||
return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
|
||||
return shuffle<3, 2, 1, 0>(a);
|
||||
}
|
||||
|
||||
# endif
|
||||
|
||||
# ifdef __KERNEL_SSE41__
|
||||
# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
|
||||
|
||||
ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
|
||||
const shuffle_swap_t &shuf_identity,
|
||||
|
|
|
@ -445,14 +445,22 @@ __forceinline ssei unpackhi(const ssei &a, const ssei &b)
|
|||
template<size_t i0, size_t i1, size_t i2, size_t i3>
|
||||
__forceinline const ssei shuffle(const ssei &a)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle_neon<ssei, i0, i1, i2, i3>(a);
|
||||
# else
|
||||
return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
# endif
|
||||
}
|
||||
|
||||
template<size_t i0, size_t i1, size_t i2, size_t i3>
|
||||
__forceinline const ssei shuffle(const ssei &a, const ssei &b)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return shuffle_neon<ssei, i0, i1, i2, i3>(a, b);
|
||||
# else
|
||||
return _mm_castps_si128(
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
# endif
|
||||
}
|
||||
|
||||
template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
|
||||
|
@ -505,15 +513,27 @@ __forceinline const ssei vreduce_add(const ssei &v)
|
|||
|
||||
__forceinline int reduce_min(const ssei &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vminvq_s32(v);
|
||||
# else
|
||||
return extract<0>(vreduce_min(v));
|
||||
# endif
|
||||
}
|
||||
__forceinline int reduce_max(const ssei &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vmaxvq_s32(v);
|
||||
# else
|
||||
return extract<0>(vreduce_max(v));
|
||||
# endif
|
||||
}
|
||||
__forceinline int reduce_add(const ssei &v)
|
||||
{
|
||||
# ifdef __KERNEL_NEON__
|
||||
return vaddvq_s32(v);
|
||||
# else
|
||||
return extract<0>(vreduce_add(v));
|
||||
# endif
|
||||
}
|
||||
|
||||
__forceinline uint32_t select_min(const ssei &v)
|
||||
|
|
Loading…
Reference in New Issue