Cycles: add support for Arm Neon instructions using sse2neon

Based on a patch contributed by Apple and Stefan Werner.

Ref D8237, T78710
Brecht Van Lommel 2021-02-14 15:01:26 +01:00
parent 68dd7617d7
commit 0e9497e886
11 changed files with 317 additions and 40 deletions
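
The change follows one pattern throughout: keep the existing SSE code paths, let sse2neon translate the x86 intrinsics to Neon on Arm, and add Neon-specific specializations only where they help performance or compatibility. As a rough illustration of what sse2neon provides (a simplified sketch, not the actual sse2neon implementation and not code from this commit):

/* Simplified sketch of the sse2neon approach: each SSE intrinsic is an inline
 * wrapper around the equivalent Neon instruction, so SSE code compiles on Arm. */
#include <arm_neon.h>

typedef float32x4_t __m128;

static inline __m128 _mm_add_ps(__m128 a, __m128 b)
{
  return vaddq_f32(a, b); /* four-lane single-precision add */
}

static inline __m128 _mm_set1_ps(float w)
{
  return vdupq_n_f32(w); /* broadcast one float to all four lanes */
}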

View File

@@ -102,7 +102,7 @@ size_t SocketType::max_size()
void *SocketType::zero_default_value()
{
static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
static Transform zero_transform = transform_zero();
return &zero_transform;
}

View File

@@ -741,7 +741,8 @@ float Camera::world_to_raster_size(float3 P)
float3 D = transform_point(&worldtocamera, P);
float dist = len(D);
Ray ray = {{0}};
Ray ray;
memset(&ray, 0, sizeof(ray));
/* Distortion can become so great that the results become meaningless, there
* may be a better way to do this, but calculating differentials from the

View File

@@ -2081,6 +2081,16 @@ ConvertNode::ConvertNode(SocketType::Type from_, SocketType::Type to_, bool auto
special_type = SHADER_SPECIAL_TYPE_AUTOCONVERT;
}
/* Union usage requires a manual copy constructor. */
ConvertNode::ConvertNode(const ConvertNode &other)
: ShaderNode(other),
from(other.from),
to(other.to),
value_color(other.value_color),
value_string(other.value_string)
{
}
void ConvertNode::constant_fold(const ConstantFolder &folder)
{
/* proxy nodes should have been removed at this point */

View File

@@ -501,6 +501,7 @@ class RGBToBWNode : public ShaderNode {
class ConvertNode : public ShaderNode {
public:
ConvertNode(SocketType::Type from, SocketType::Type to, bool autoconvert = false);
ConvertNode(const ConvertNode &other);
SHADER_NODE_BASE_CLASS(ConvertNode)
void constant_fold(const ConstantFolder &folder);

View File

@@ -304,8 +304,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
return float3(vabsq_f32(a.m128));
# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
# endif
# else
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
@@ -447,7 +451,13 @@ ccl_device_inline bool is_zero(const float3 a)
ccl_device_inline float reduce_add(const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
__m128 t = a.m128;
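/* float3 occupies a 4-lane register; clear the unused w lane so the
 * horizontal add below only sums x, y and z. */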
t[3] = 0.0f;
return vaddvq_f32(t);
#else
return (a.x + a.y + a.z);
#endif
}
ccl_device_inline float average(const float3 a)

View File

@@ -257,7 +257,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
@@ -313,8 +318,10 @@ ccl_device_inline bool is_zero(const float4 &a)
ccl_device_inline float4 reduce_add(const float4 &a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_SSE3__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
# else
@@ -373,8 +380,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
@@ -400,14 +411,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
{
# if defined(__KERNEL_NEON__)
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
# else
return float4(_mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
# endif
}
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_NEON__)
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
# else
return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
@@ -457,9 +476,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
ccl_device_inline float4 reduce_min(const float4 &a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vminvq_f32(a)));
# else
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
return min(shuffle<2, 3, 0, 1>(h), h);
# endif
# else
return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
# endif
@@ -467,9 +490,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
ccl_device_inline float4 reduce_max(const float4 &a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vmaxvq_f32(a)));
# else
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
return max(shuffle<2, 3, 0, 1>(h), h);
# endif
# else
return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
# endif

View File

@@ -27,44 +27,50 @@
/* We require minimum SSE2 support on x86, so auto enable. */
# define __KERNEL_SSE2__
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# endif
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
# endif /* defined(i386) || defined(_M_IX86) */
/* x86-64
*
* Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
# if defined(__x86_64__) || defined(_M_X64)
# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
# ifdef WITH_KERNEL_SSE41
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# endif
# ifdef WITH_KERNEL_AVX
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# endif
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif
# endif /* defined(__x86_64__) || defined(_M_X64) */
/* Arm Neon
*
* Compile an SSE4 kernel emulated with Neon. Most code is shared with
* SSE; some specializations for performance and compatibility are made
* by testing for __KERNEL_NEON__. */
# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
# define __KERNEL_NEON__
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSE41__
# endif
#endif
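
With these feature macros defined together, the existing SSE code paths in the kernel compile on Arm through sse2neon. A minimal standalone example of that layering (hypothetical, not part of this commit; WITH_SSE2NEON is the build-time define tested above):

/* Hypothetical standalone example: the same SSE intrinsics build natively on
 * x86 and through sse2neon on Arm. */
#include <stdio.h>
#if defined(__ARM_NEON) && defined(WITH_SSE2NEON)
#  include <sse2neon.h>
#else
#  include <x86intrin.h>
#endif

int main(void)
{
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes: 1, 2, 3, 4 */
  __m128 b = _mm_mul_ps(a, a);                   /* square each lane */
  float out[4];
  _mm_storeu_ps(out, b);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 4 9 16 */
  return 0;
}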

View File

@@ -35,6 +35,9 @@
# include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
# include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
# define SSE2NEON_PRECISE_MINMAX 1
# include <sse2neon.h>
#endif
/* Floating Point Control, for Embree. */
@@ -116,6 +119,80 @@ static struct StepTy {
#endif
/* Utilities used by Neon */
#if defined(__KERNEL_NEON__)
template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
{
if (i0 == i1 && i0 == i2 && i0 == i3) {
return vdupq_laneq_s32(a, i0);
}
static const uint8_t tbl[16] = {(i0 * 4) + 0,
(i0 * 4) + 1,
(i0 * 4) + 2,
(i0 * 4) + 3,
(i1 * 4) + 0,
(i1 * 4) + 1,
(i1 * 4) + 2,
(i1 * 4) + 3,
(i2 * 4) + 0,
(i2 * 4) + 1,
(i2 * 4) + 2,
(i2 * 4) + 3,
(i3 * 4) + 0,
(i3 * 4) + 1,
(i3 * 4) + 2,
(i3 * 4) + 3};
return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
}
template<class type, int i0, int i1, int i2, int i3>
type shuffle_neon(const type &a, const type &b)
{
if (&a == &b) {
static const uint8_t tbl[16] = {(i0 * 4) + 0,
(i0 * 4) + 1,
(i0 * 4) + 2,
(i0 * 4) + 3,
(i1 * 4) + 0,
(i1 * 4) + 1,
(i1 * 4) + 2,
(i1 * 4) + 3,
(i2 * 4) + 0,
(i2 * 4) + 1,
(i2 * 4) + 2,
(i2 * 4) + 3,
(i3 * 4) + 0,
(i3 * 4) + 1,
(i3 * 4) + 2,
(i3 * 4) + 3};
return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
}
else {
static const uint8_t tbl[16] = {(i0 * 4) + 0,
(i0 * 4) + 1,
(i0 * 4) + 2,
(i0 * 4) + 3,
(i1 * 4) + 0,
(i1 * 4) + 1,
(i1 * 4) + 2,
(i1 * 4) + 3,
(i2 * 4) + 0 + 16,
(i2 * 4) + 1 + 16,
(i2 * 4) + 2 + 16,
(i2 * 4) + 3 + 16,
(i3 * 4) + 0 + 16,
(i3 * 4) + 1 + 16,
(i3 * 4) + 2 + 16,
(i3 * 4) + 3 + 16};
return vqtbl2q_s8((int8x16x2_t){a, b}, *(int8x16_t *)tbl);
}
}
#endif /* __KERNEL_NEON */
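A brief usage sketch for the helpers above (not from this commit; it assumes the shuffle_neon template and <arm_neon.h> from this hunk, with result lane k taking the source lane given by the k-th template index):
/* Example: reverse the lanes of an integer vector. */
int32x4_t v = {1, 2, 3, 4};
int32x4_t r = shuffle_neon<int32x4_t, 3, 2, 1, 0>(v); /* r = {4, 3, 2, 1} */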
/* Intrinsics Functions
*
* For fast bit operations. */
@@ -428,8 +505,9 @@ __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
# undef _mm_extract_epi32
# define _mm_extract_epi32 _mm_extract_epi32_emu
# ifndef __KERNEL_NEON__
# undef _mm_extract_epi32
# define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
switch (index) {
@@ -446,6 +524,7 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
return 0;
}
}
# endif
# undef _mm_insert_epi32
# define _mm_insert_epi32 _mm_insert_epi32_emu

View File

@@ -197,9 +197,14 @@ __forceinline const sseb unpackhi(const sseb &a, const sseb &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
# else
return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
{
return _mm_movelh_ps(a, a);
@@ -209,13 +214,19 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
{
return _mm_movehl_ps(a, a);
}
# endif
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a, const sseb &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
{
return _mm_movelh_ps(a, b);
@@ -225,8 +236,9 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sse
{
return _mm_movehl_ps(b, a);
}
# endif
# if defined(__KERNEL_SSE3__)
# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
{
return _mm_moveldup_ps(a);
@@ -241,7 +253,16 @@ template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const sseb insert(const sseb &a, const sseb &b)
{
# ifdef __KERNEL_NEON__
sseb res = a;
if (clr)
res[dst] = 0;
else
res[dst] = b[src];
return res;
# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
# endif
}
template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
{
@@ -260,7 +281,13 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b
# if defined(__KERNEL_SSE41__)
__forceinline uint32_t popcnt(const sseb &a)
{
# if defined(__KERNEL_NEON__)
const int32x4_t mask = {1, 1, 1, 1};
int32x4_t t = vandq_s32(a.m128, mask);
return vaddvq_s32(t);
# else
return _mm_popcnt_u32(_mm_movemask_ps(a));
# endif
}
# else
__forceinline uint32_t popcnt(const sseb &a)
@@ -271,23 +298,43 @@ __forceinline uint32_t popcnt(const sseb &a)
__forceinline bool reduce_and(const sseb &a)
{
# if defined(__KERNEL_NEON__)
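/* A true sseb lane is all ones (-1 as a signed 32-bit integer), so four true lanes sum to -4. */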
return vaddvq_s32(a.m128) == -4;
# else
return _mm_movemask_ps(a) == 0xf;
# endif
}
__forceinline bool reduce_or(const sseb &a)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(a.m128) != 0x0;
# else
return _mm_movemask_ps(a) != 0x0;
# endif
}
__forceinline bool all(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(b.m128) == -4;
# else
return _mm_movemask_ps(b) == 0xf;
# endif
}
__forceinline bool any(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(b.m128) != 0x0;
# else
return _mm_movemask_ps(b) != 0x0;
# endif
}
__forceinline bool none(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(b.m128) == 0x0;
# else
return _mm_movemask_ps(b) == 0x0;
# endif
}
__forceinline uint32_t movemask(const sseb &a)

View File

@@ -303,41 +303,46 @@ __forceinline ssef maxi(const ssef &a, const ssef &b)
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
# if defined(__KERNEL_AVX2__)
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmaq_f32(c, a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fmadd_ps(a, b, c);
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
return _mm_fmsub_ps(a, b, c);
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
return _mm_fnmadd_ps(a, b, c);
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
return _mm_fnmsub_ps(a, b, c);
}
# else
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
return a * b + c;
# endif
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmaq_f32(vnegq_f32(c), a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fmsub_ps(a, b, c);
# else
return a * b - c;
# endif
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmsq_f32(c, a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fnmadd_ps(a, b, c);
# else
return c - a * b;
# endif
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
# if defined(__KERNEL_NEON__)
return vfmsq_f32(vnegq_f32(c), a, b);
# elif defined(__KERNEL_AVX2__)
return _mm_fnmsub_ps(a, b, c);
# else
return -a * b - c;
}
# endif
}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
@@ -496,27 +501,51 @@ __forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
# if defined(__KERNEL_SSE41__)
__forceinline const ssef round_even(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndnq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
# endif
}
__forceinline const ssef round_down(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndmq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
# endif
}
__forceinline const ssef round_up(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndpq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
# endif
}
__forceinline const ssef round_zero(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
# endif
}
__forceinline const ssef floor(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndnq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
# endif
}
__forceinline const ssef ceil(const ssef &a)
{
# ifdef __KERNEL_NEON__
return vrndpq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
# endif
}
# endif
@@ -566,7 +595,11 @@ __forceinline ssef unpackhi(const ssef &a, const ssef &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128);
# else
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
@@ -582,14 +615,23 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_NEON__
return shuffle<float32x4_t, i0, i0, i0, i0>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
{
return _mm_movelh_ps(a, b);
@@ -599,6 +641,7 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const sse
{
return _mm_movehl_ps(b, a);
}
# endif
# if defined(__KERNEL_SSSE3__)
__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
@@ -643,7 +686,16 @@ template<> __forceinline float extract<0>(const ssef &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const ssef insert(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_NEON__
ssef res = a;
if (clr)
res[dst] = 0;
else
res[dst] = b[src];
return res;
# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
# endif
}
template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
{
@@ -703,31 +755,55 @@ __forceinline void transpose(
__forceinline const ssef vreduce_min(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vdupq_n_f32(vminvq_f32(v));
# else
ssef h = min(shuffle<1, 0, 3, 2>(v), v);
return min(shuffle<2, 3, 0, 1>(h), h);
# endif
}
__forceinline const ssef vreduce_max(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vdupq_n_f32(vmaxvq_f32(v));
# else
ssef h = max(shuffle<1, 0, 3, 2>(v), v);
return max(shuffle<2, 3, 0, 1>(h), h);
# endif
}
__forceinline const ssef vreduce_add(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vdupq_n_f32(vaddvq_f32(v));
# else
ssef h = shuffle<1, 0, 3, 2>(v) + v;
return shuffle<2, 3, 0, 1>(h) + h;
# endif
}
__forceinline float reduce_min(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vminvq_f32(v);
# else
return _mm_cvtss_f32(vreduce_min(v));
# endif
}
__forceinline float reduce_max(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vmaxvq_f32(v);
# else
return _mm_cvtss_f32(vreduce_max(v));
# endif
}
__forceinline float reduce_add(const ssef &v)
{
# ifdef __KERNEL_NEON__
return vaddvq_f32(v);
# else
return _mm_cvtss_f32(vreduce_add(v));
# endif
}
__forceinline uint32_t select_min(const ssef &v)
@@ -942,14 +1018,14 @@ ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
{
/* shuffle value must be a constant, so we need to branch */
if (shuf)
return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
return shuffle<1, 0, 3, 2>(a);
else
return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
return shuffle<3, 2, 1, 0>(a);
}
# endif
# ifdef __KERNEL_SSE41__
# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
const shuffle_swap_t &shuf_identity,

View File

@@ -445,14 +445,22 @@ __forceinline ssei unpackhi(const ssei &a, const ssei &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<ssei, i0, i1, i2, i3>(a);
# else
return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a, const ssei &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<ssei, i0, i1, i2, i3>(a, b);
# else
return _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
@@ -505,15 +513,27 @@ __forceinline const ssei vreduce_add(const ssei &v)
__forceinline int reduce_min(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vminvq_s32(v);
# else
return extract<0>(vreduce_min(v));
# endif
}
__forceinline int reduce_max(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vmaxvq_s32(v);
# else
return extract<0>(vreduce_max(v));
# endif
}
__forceinline int reduce_add(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vaddvq_s32(v);
# else
return extract<0>(vreduce_add(v));
# endif
}
__forceinline uint32_t select_min(const ssei &v)