Cycles: add support for Arm Neon instructions using sse2neon

Based on a patch contributed by Apple and Stefan Werner.

Ref D8237, T78710
Brecht Van Lommel 2021-02-14 15:01:26 +01:00
parent 68dd7617d7
commit 0e9497e886
11 changed files with 317 additions and 40 deletions
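
The change follows one pattern throughout: keep the existing SSE code paths, let sse2neon translate the x86 intrinsics to Neon on Arm, and add Neon-specific specializations only where they help performance or compatibility. As a rough illustration of what sse2neon provides (a simplified sketch, not the actual sse2neon implementation and not code from this commit):

/* Simplified sketch of the sse2neon approach: each SSE intrinsic is an inline
 * wrapper around the equivalent Neon instruction, so SSE code compiles on Arm. */
#include <arm_neon.h>

typedef float32x4_t __m128;

static inline __m128 _mm_add_ps(__m128 a, __m128 b)
{
  return vaddq_f32(a, b); /* four-lane single-precision add */
}

static inline __m128 _mm_set1_ps(float w)
{
  return vdupq_n_f32(w); /* broadcast one float to all four lanes */
}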

View File

@@ -102,7 +102,7 @@ size_t SocketType::max_size()
void *SocketType::zero_default_value()
{
static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
static Transform zero_transform = transform_zero();
return &zero_transform;
}

View File

@@ -741,7 +741,8 @@ float Camera::world_to_raster_size(float3 P)
float3 D = transform_point(&worldtocamera, P);
float dist = len(D);
Ray ray = {{0}};
Ray ray;
memset(&ray, 0, sizeof(ray));
/* Distortion can become so great that the results become meaningless, there
* may be a better way to do this, but calculating differentials from the

View File

@@ -2081,6 +2081,16 @@ ConvertNode::ConvertNode(SocketType::Type from_, SocketType::Type to_, bool auto
special_type = SHADER_SPECIAL_TYPE_AUTOCONVERT;
}
/* Union usage requires a manual copy constructor. */
ConvertNode::ConvertNode(const ConvertNode &other)
: ShaderNode(other),
from(other.from),
to(other.to),
value_color(other.value_color),
value_string(other.value_string)
{
}
void ConvertNode::constant_fold(const ConstantFolder &folder)
{
/* proxy nodes should have been removed at this point */

View File

@@ -501,6 +501,7 @@ class RGBToBWNode : public ShaderNode {
class ConvertNode : public ShaderNode {
public:
ConvertNode(SocketType::Type from, SocketType::Type to, bool autoconvert = false);
ConvertNode(const ConvertNode &other);
SHADER_NODE_BASE_CLASS(ConvertNode)
void constant_fold(const ConstantFolder &folder);

View File

@@ -304,8 +304,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
return float3(vabsq_f32(a.m128));
# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
# endif
# else
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
@@ -447,7 +451,13 @@ ccl_device_inline bool is_zero(const float3 a)
ccl_device_inline float reduce_add(const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
__m128 t = a.m128;
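/* float3 occupies a 4-lane register; clear the unused w lane so the
 * horizontal add below only sums x, y and z. */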
t[3] = 0.0f;
return vaddvq_f32(t);
#else
return (a.x + a.y + a.z);
#endif
}
ccl_device_inline float average(const float3 a)

View File

@@ -257,7 +257,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
@@ -313,8 +318,10 @@ ccl_device_inline bool is_zero(const float4 &a)
ccl_device_inline float4 reduce_add(const float4 &a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_SSE3__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
# else
@@ -373,8 +380,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
@@ -400,14 +411,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
{
# if defined(__KERNEL_NEON__)
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
# else
return float4(_mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
# endif
}
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_NEON__)
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
# else
return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
@@ -457,9 +476,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
ccl_device_inline float4 reduce_min(const float4 &a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vminvq_f32(a)));
# else
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
return min(shuffle<2, 3, 0, 1>(h), h);
# endif
# else
return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
# endif
@@ -467,9 +490,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
ccl_device_inline float4 reduce_max(const float4 &a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vmaxvq_f32(a)));
# else
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
return max(shuffle<2, 3, 0, 1>(h), h);
# endif
# else
return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
# endif

View File

@@ -27,44 +27,50 @@
/* We require minimum SSE2 support on x86, so auto enable. */
# define __KERNEL_SSE2__
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# endif
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
# endif /* defined(i386) || defined(_M_IX86) */
/* x86-64
*
* Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
# if defined(__x86_64__) || defined(_M_X64)
# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
# ifdef WITH_KERNEL_SSE41
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# endif
# ifdef WITH_KERNEL_AVX
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# endif
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif
# endif /* defined(__x86_64__) || defined(_M_X64) */
/* Arm Neon
*
* Compile an SSE4 kernel emulated with Neon. Most code is shared with
* SSE; some specializations for performance and compatibility are made
* by testing for __KERNEL_NEON__. */
# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
# define __KERNEL_NEON__
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSE41__
# endif
#endif
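
With these feature macros defined together, the existing SSE code paths in the kernel compile on Arm through sse2neon. A minimal standalone example of that layering (hypothetical, not part of this commit; WITH_SSE2NEON is the build-time define tested above):

/* Hypothetical standalone example: the same SSE intrinsics build natively on
 * x86 and through sse2neon on Arm. */
#include <stdio.h>
#if defined(__ARM_NEON) && defined(WITH_SSE2NEON)
#  include <sse2neon.h>
#else
#  include <x86intrin.h>
#endif

int main(void)
{
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes: 1, 2, 3, 4 */
  __m128 b = _mm_mul_ps(a, a);                   /* square each lane */
  float out[4];
  _mm_storeu_ps(out, b);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 4 9 16 */
  return 0;
}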

View File

@@ -35,6 +35,9 @@
# include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
# include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
# define SSE2NEON_PRECISE_MINMAX 1
# include <sse2neon.h>
#endif
/* Floating Point Control, for Embree. */
@@ -116,6 +119,80 @@ static struct StepTy {
#endif
/* Utilities used by Neon */
#if defined(__KERNEL_NEON__)
template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
{
if (i0 == i1 && i0 == i2 && i0 == i3) {
return vdupq_laneq_s32(a, i0);
}
static const uint8_t tbl[16] = {(i0 * 4) + 0,
(i0 * 4) + 1,
(i0 * 4) + 2,
(i0 * 4) + 3,
(i1 * 4) + 0,
(i1 * 4) + 1,
(i1 * 4) + 2,
(i1 * 4) + 3,
(i2 * 4) + 0,
(i2 * 4) + 1,
(i2 * 4) + 2,
(i2 * 4) + 3,
(i3 * 4) + 0,
(i3 * 4) + 1,
(i3 * 4) + 2,
(i3 * 4) + 3};
return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
}
template<class type, int i0, int i1, int i2, int i3>
type shuffle_neon(const type &a, const type &b)
{
if (&a == &b) {
static const uint8_t tbl[16] = {(i0 * 4) + 0,
(i0 * 4) + 1,
(i0 * 4) + 2,
(i0 * 4) + 3,
(i1 * 4) + 0,
(i1 * 4) + 1,
(i1 * 4) + 2,
(i1 * 4) + 3,
(i2 * 4) + 0,
(i2 * 4) + 1,
(i2 * 4) + 2,
(i2 * 4) + 3,
(i3 * 4) + 0,
(i3 * 4) + 1,
(i3 * 4) + 2,
(i3 * 4) + 3};
return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
}
else {
static const uint8_t tbl[16] = {(i0 * 4) + 0,
(i0 * 4) + 1,
(i0 * 4) + 2,
(i0 * 4) + 3,
(i1 * 4) + 0,
(i1 * 4) + 1,
(i1 * 4) + 2,
(i1 * 4) + 3,
(i2 * 4) + 0 + 16,
(i2 * 4) + 1 + 16,
(i2 * 4) + 2 + 16,
(i2 * 4) + 3 + 16,
(i3 * 4) + 0 + 16,
(i3 * 4) + 1 + 16,
(i3 * 4) + 2 + 16,
(i3 * 4) + 3 + 16};
return vqtbl2q_s8((int8x16x2_t){a, b}, *(int8x16_t *)tbl);
}
}
#endif /* __KERNEL_NEON */
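A brief usage sketch for the helpers above (not from this commit; it assumes the shuffle_neon template and <arm_neon.h> from this hunk, with result lane k taking the source lane given by the k-th template index):
/* Example: reverse the lanes of an integer vector. */
int32x4_t v = {1, 2, 3, 4};
int32x4_t r = shuffle_neon<int32x4_t, 3, 2, 1, 0>(v); /* r = {4, 3, 2, 1} */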
/* Intrinsics Functions
*
* For fast bit operations. */
@@ -428,8 +505,9 @@ __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
# undef _mm_extract_epi32
# define _mm_extract_epi32 _mm_extract_epi32_emu
# ifndef __KERNEL_NEON__
# undef _mm_extract_epi32
# define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
switch (index) {
@@ -446,6 +524,7 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
return 0;
}
}
# endif
# undef _mm_insert_epi32
# define _mm_insert_epi32 _mm_insert_epi32_emu

View File

@@ -197,9 +197,14 @@ __forceinline const sseb unpackhi(const sseb &a, const sseb &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
# else
return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
{
return _mm_movelh_ps(a, a);
@@ -209,13 +214,19 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
{
return _mm_movehl_ps(a, a);
}
# endif
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a, const sseb &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
{
return _mm_movelh_ps(a, b);
@@ -225,8 +236,9 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sse
{
return _mm_movehl_ps(b, a);
}
# endif
# if defined(__KERNEL_SSE3__)
# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
{
return _mm_moveldup_ps(a);
@@ -241,7 +253,16 @@ template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const sseb insert(const sseb &a, const sseb &b)
{
# ifdef __KERNEL_NEON__
sseb res = a;
if (clr)
res[dst] = 0;
else
res[dst] = b[src];
return res;
# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
# endif
}
template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
{
@@ -260,7 +281,13 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b
# if defined(__KERNEL_SSE41__)
__forceinline uint32_t popcnt(const sseb &a)
{
# if defined(__KERNEL_NEON__)
const int32x4_t mask = {1, 1, 1, 1};
int32x4_t t = vandq_s32(a.m128, mask);
return vaddvq_s32(t);
# else
return _mm_popcnt_u32(_mm_movemask_ps(a));
# endif
}
# else
__forceinline uint32_t popcnt(const sseb &a)
@@ -271,23 +298,43 @@ __forceinline uint32_t popcnt(const sseb &a)
__forceinline bool reduce_and(const sseb &a)
{
# if defined(__KERNEL_NEON__)
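/* A true sseb lane is all ones (-1 as a signed 32-bit integer), so four true lanes sum to -4. */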
return vaddvq_s32(a.m128) == -4;
# else
return _mm_movemask_ps(a) == 0xf;
# endif
}
__forceinline bool reduce_or(const sseb &a)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(a.m128) != 0x0;
# else
return _mm_movemask_ps(a) != 0x0;
# endif
}
__forceinline bool all(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(b.m128) == -4;
# else
return _mm_movemask_ps(b) == 0xf;
# endif
}
__forceinline bool any(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(b.m128) != 0x0;
# else
return _mm_movemask_ps(b) != 0x0;
# endif
}
__forceinline bool none(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(b.m128) == 0x0;
# else
return _mm_movemask_ps(b) == 0x0;
# endif
}
__forceinline uint32_t movemask(const sseb &a)

View File

@@ -303,41 +303,46 @@ __forceinline ssef maxi(const ssef &a, const ssef &b)
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
# if defined(__KERNEL_AVX2__)
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmaq_f32(c, a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fmadd_ps(a, b, c);
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
return _mm_fmsub_ps(a, b, c);
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
return _mm_fnmadd_ps(a, b, c);
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
return _mm_fnmsub_ps(a, b, c);
}
# else
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
return a * b + c;
# endif
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmaq_f32(vnegq_f32(c), a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fmsub_ps(a, b, c);
# else
return a * b - c;
# endif
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmsq_f32(c, a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fnmadd_ps(a, b, c);
# else
return c - a * b;
# endif
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmsq_f32(vnegq_f32(c), a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fnmsub_ps(a, b, c);
# else
return -a * b - c;
}
# endif
}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
@@ -496,27 +501,51 @@ __forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
# if defined(__KERNEL_SSE41__)
__forceinline const ssef round_even(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndnq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
# endif
}
__forceinline const ssef round_down(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndmq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
# endif
}
__forceinline const ssef round_up(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndpq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
# endif
}
__forceinline const ssef round_zero(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
# endif
}
__forceinline const ssef floor(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndnq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
# endif
}
__forceinline const ssef ceil(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndpq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
# endif
}
# endif
@@ -566,7 +595,11 @@ __forceinline ssef unpackhi(const ssef &a, const ssef &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128);
# else
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
@@ -582,14 +615,23 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_NEON__
return shuffle<float32x4_t, i0, i0, i0, i0>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
{
return _mm_movelh_ps(a, b);
@@ -599,6 +641,7 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const sse
{
return _mm_movehl_ps(b, a);
}
# endif
# if defined(__KERNEL_SSSE3__)
__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
@@ -643,7 +686,16 @@ template<> __forceinline float extract<0>(const ssef &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const ssef insert(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_NEON__
ssef res = a;
if (clr)
res[dst] = 0;
else
res[dst] = b[src];
return res;
# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
# endif
}
template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
{
@@ -703,31 +755,55 @@ __forceinline void transpose(
__forceinline const ssef vreduce_min(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vdupq_n_f32(vminvq_f32(v));
# else
ssef h = min(shuffle<1, 0, 3, 2>(v), v);
return min(shuffle<2, 3, 0, 1>(h), h);
# endif
}
__forceinline const ssef vreduce_max(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vdupq_n_f32(vmaxvq_f32(v));
# else
ssef h = max(shuffle<1, 0, 3, 2>(v), v);
return max(shuffle<2, 3, 0, 1>(h), h);
# endif
}
__forceinline const ssef vreduce_add(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vdupq_n_f32(vaddvq_f32(v));
# else
ssef h = shuffle<1, 0, 3, 2>(v) + v;
return shuffle<2, 3, 0, 1>(h) + h;
# endif
}
__forceinline float reduce_min(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vminvq_f32(v);
# else
return _mm_cvtss_f32(vreduce_min(v));
# endif
}
__forceinline float reduce_max(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vmaxvq_f32(v);
# else
return _mm_cvtss_f32(vreduce_max(v));
# endif
}
__forceinline float reduce_add(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vaddvq_f32(v);
# else
return _mm_cvtss_f32(vreduce_add(v));
# endif
}
__forceinline uint32_t select_min(const ssef &v)
@@ -942,14 +1018,14 @@ ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
{
/* shuffle value must be a constant, so we need to branch */
if (shuf)
return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
return shuffle<1, 0, 3, 2>(a);
else
return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
return shuffle<3, 2, 1, 0>(a);
}
# endif
# ifdef __KERNEL_SSE41__
# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
const shuffle_swap_t &shuf_identity,

View File

@@ -445,14 +445,22 @@ __forceinline ssei unpackhi(const ssei &a, const ssei &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<ssei, i0, i1, i2, i3>(a);
# else
return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a, const ssei &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<ssei, i0, i1, i2, i3>(a, b);
# else
return _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
@@ -505,15 +513,27 @@ __forceinline const ssei vreduce_add(const ssei &v)
__forceinline int reduce_min(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vminvq_s32(v);
# else
return extract<0>(vreduce_min(v));
# endif
}
__forceinline int reduce_max(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vmaxvq_s32(v);
# else
return extract<0>(vreduce_max(v));
# endif
}
__forceinline int reduce_add(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vaddvq_s32(v);
# else
return extract<0>(vreduce_add(v));
# endif
}
__forceinline uint32_t select_min(const ssei &v)