Refactor: replace Cycles sse/avx types by vectorized float4/int4/float8/int8

The distinction existed for legacy reasons, to make it easy to port Embree
intersection code without affecting the main vector types. However, we now use
SIMD for these types as well, so there is no good reason to keep the distinction.

Also pass these vector types by value in inline functions more consistently.
Previously this was only done for functions used by Metal, to avoid having to
add address space qualifiers; it is simpler to do it everywhere.

Also remove forward function declarations from the vector math headers, as they
serve no real purpose.

Differential Revision: https://developer.blender.org/D16146
Brecht Van Lommel 2022-11-01 15:16:55 +01:00 committed by Brecht Van Lommel
parent 32ec0521c5
commit e1b3d91127
37 changed files with 1844 additions and 4689 deletions

View File

@ -328,6 +328,7 @@ set(SRC_UTIL_HEADERS
../util/math_int2.h
../util/math_int3.h
../util/math_int4.h
../util/math_int8.h
../util/math_matrix.h
../util/projection.h
../util/rect.h
@ -350,6 +351,8 @@ set(SRC_UTIL_HEADERS
../util/types_int3_impl.h
../util/types_int4.h
../util/types_int4_impl.h
../util/types_int8.h
../util/types_int8_impl.h
../util/types_spectrum.h
../util/types_uchar2.h
../util/types_uchar2_impl.h

View File

@ -7,6 +7,7 @@
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
#endif
@ -29,11 +30,15 @@
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
# define __KERNEL_SSE__
# ifndef __KERNEL_SSE__
# define __KERNEL_SSE__
# endif
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
# define __KERNEL_SSE__
# ifndef __KERNEL_SSE__
# define __KERNEL_SSE__
# endif
# define __KERNEL_AVX2__
# endif
#endif

View File

@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x)
}
/* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
* SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
* SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not
* supported, we do a standard implementation, but if it is supported, we
* do an implementation using SSE intrinsics.
*/
#if !defined(__KERNEL_SSE2__)
#if !defined(__KERNEL_SSE__)
/* ** Standard Implementation ** */
@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
/* SSE Bilinear Interpolation:
*
* The function takes two ssef inputs:
* The function takes two float4 inputs:
* - p : Contains the values at the points (v0, v1, v2, v3).
* - f : Contains the values (x, y, _, _). The third and fourth values are unused.
*
* The interpolation is done in two steps:
* 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
* (v2, v3) is generated by moving v2 and v3 to the first and second
* places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
* places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
* fourth values are unused.
* 2. Interpolate g0 and g1 along the y axis to get the final value.
* g1 is generated by populating an ssef with the second value of g.
* Only the first value is important in the final ssef.
* g1 is generated by populating a float4 with the second value of g.
* Only the first value is important in the final float4.
*
* v1 v3 g1
* @ + + + + @ @ y
@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
* v0 v2 g0
*
*/
ccl_device_inline ssef bi_mix(ssef p, ssef f)
ccl_device_inline float4 bi_mix(float4 p, float4 f)
{
ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
return mix(g, shuffle<1>(g), shuffle<1>(f));
}
ccl_device_inline ssef fade(const ssef &t)
ccl_device_inline float4 fade(const float4 t)
{
ssef a = madd(t, 6.0f, -15.0f);
ssef b = madd(t, a, 10.0f);
float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
float4 b = madd(t, a, make_float4(10.0f));
return (t * t) * (t * b);
}
/* Negate val if the nth bit of h is 1. */
# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y)
{
ssei h = hash & 7;
ssef u = select(h < 4, x, y);
ssef v = 2.0f * select(h < 4, y, x);
int4 h = hash & 7;
float4 u = select(h < 4, x, y);
float4 v = 2.0f * select(h < 4, y, x);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
*/
ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
ssei XY;
ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
ssef uv = fade(fxy);
int4 XY;
float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY);
float4 uv = fade(fxy);
ssei XY1 = XY + 1;
ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
int4 XY1 = XY + make_int4(1);
int4 X = shuffle<0, 0, 0, 0>(XY, XY1);
int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
ssei h = hash_ssei2(X, Y);
int4 h = hash_int4_2(X, Y);
ssef fxy1 = fxy - 1.0f;
ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
float4 fxy1 = fxy - make_float4(1.0f);
float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
ssef g = grad(h, fx, fy);
float4 g = grad(h, fx, fy);
return extract<0>(bi_mix(g, uv));
}
/* SSE Trilinear Interpolation:
*
* The function takes three ssef inputs:
* The function takes three float4 inputs:
* - p : Contains the values at the points (v0, v1, v2, v3).
* - q : Contains the values at the points (v4, v5, v6, v7).
* - f : Contains the values (x, y, z, _). The fourth value is unused.
@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
* 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
* 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
* (s2, s3) is generated by moving v2 and v3 to the first and second
* places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
* places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
* fourth values are unused.
* 3. Interpolate g0 and g1 along the z axis to get the final value.
* g1 is generated by populating an ssef with the second value of g.
* Only the first value is important in the final ssef.
* g1 is generated by populating a float4 with the second value of g.
* Only the first value is important in the final float4.
*
* v3 v7
* @ + + + + + + @ s3 @
@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
* @ + + + + + + @ @
* v0 v4 s0
*/
ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f)
{
ssef s = mix(p, q, shuffle<0>(f));
ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
float4 s = mix(p, q, shuffle<0>(f));
float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
return mix(g, shuffle<1>(g), shuffle<2>(f));
}
@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
* supported, we do an SSE implementation, but if it is supported,
* we do an implementation using AVX intrinsics.
*/
# if !defined(__KERNEL_AVX__)
# if !defined(__KERNEL_AVX2__)
ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z)
{
ssei h = hash & 15;
ssef u = select(h < 8, x, y);
ssef vt = select((h == 12) | (h == 14), x, z);
ssef v = select(h < 4, y, vt);
int4 h = hash & 15;
float4 u = select(h < 8, x, y);
float4 vt = select((h == 12) | (h == 14), x, z);
float4 v = select(h < 4, y, vt);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
ccl_device_inline ssef
grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
ccl_device_inline float4
grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w)
{
ssei h = hash & 31;
ssef u = select(h < 24, x, y);
ssef v = select(h < 16, y, z);
ssef s = select(h < 8, z, w);
int4 h = hash & 31;
float4 u = select(h < 24, x, y);
float4 v = select(h < 16, y, z);
float4 s = select(h < 8, z, w);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
* between two trilinear interpolations.
*
*/
ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f)
{
return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
}
@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
*/
ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
ssef uvw = fade(fxyz);
int4 XYZ;
float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
float4 uvw = fade(fxyz);
ssei XYZ1 = XYZ + 1;
ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
int4 XYZ1 = XYZ + make_int4(1);
int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z);
int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z);
ssef fxyz1 = fxyz - 1.0f;
ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
float4 fxyz1 = fxyz - make_float4(1.0f);
float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
return extract<0>(tri_mix(g1, g2, uvw));
}
@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
*/
ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
ssef uvws = fade(fxyzw);
int4 XYZW;
float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
float4 uvws = fade(fxyzw);
ssei XYZW1 = XYZW + 1;
ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
int4 XYZW1 = XYZW + make_int4(1);
int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
ssef fxyzw1 = fxyzw - 1.0f;
ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
float4 fxyzw1 = fxyzw - make_float4(1.0f);
float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
}
@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
/* AVX Implementation */
ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z)
{
avxi h = hash & 15;
avxf u = select(h < 8, x, y);
avxf vt = select((h == 12) | (h == 14), x, z);
avxf v = select(h < 4, y, vt);
vint8 h = hash & 15;
vfloat8 u = select(h < 8, x, y);
vfloat8 vt = select((h == 12) | (h == 14), x, z);
vfloat8 v = select(h < 4, y, vt);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
ccl_device_inline avxf
grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
ccl_device_inline vfloat8
grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w)
{
avxi h = hash & 31;
avxf u = select(h < 24, x, y);
avxf v = select(h < 16, y, z);
avxf s = select(h < 8, z, w);
vint8 h = hash & 31;
vfloat8 u = select(h < 24, x, y);
vfloat8 v = select(h < 16, y, z);
vfloat8 s = select(h < 8, z, w);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &
* 1. Interpolate p and q along the w axis to get s.
* 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
* value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
* low and high ssef from s.
* low and high float4 from s.
*
*/
ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f)
{
ssef fv = shuffle<3>(f);
avxf s = mix(p, q, avxf(fv, fv));
float4 fv = shuffle<3>(f);
vfloat8 s = mix(p, q, make_vfloat8(fv, fv));
return tri_mix(low(s), high(s), f);
}
@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
*/
ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
ssef uvw = fade(fxyz);
int4 XYZ;
float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
float4 uvw = fade(fxyz);
ssei XYZ1 = XYZ + 1;
ssei X = shuffle<0>(XYZ);
ssei X1 = shuffle<0>(XYZ1);
ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
int4 XYZ1 = XYZ + make_int4(1);
int4 X = shuffle<0>(XYZ);
int4 X1 = shuffle<0>(XYZ1);
int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z));
ssef fxyz1 = fxyz - 1.0f;
ssef fx = shuffle<0>(fxyz);
ssef fx1 = shuffle<0>(fxyz1);
ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
float4 fxyz1 = fxyz - make_float4(1.0f);
float4 fx = shuffle<0>(fxyz);
float4 fx1 = shuffle<0>(fxyz1);
float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz));
return extract<0>(tri_mix(low(g), high(g), uvw));
}
@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
*/
ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
ssef uvws = fade(fxyzw);
int4 XYZW;
float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
float4 uvws = fade(fxyzw);
ssei XYZW1 = XYZW + 1;
ssei X = shuffle<0>(XYZW);
ssei X1 = shuffle<0>(XYZW1);
ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
ssei W = shuffle<3>(XYZW);
ssei W1 = shuffle<3>(XYZW1);
int4 XYZW1 = XYZW + make_int4(1);
int4 X = shuffle<0>(XYZW);
int4 X1 = shuffle<0>(XYZW1);
int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
int4 W = shuffle<3>(XYZW);
int4 W1 = shuffle<3>(XYZW1);
avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W));
vint8 h2 = hash_int8_4(
make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1));
ssef fxyzw1 = fxyzw - 1.0f;
ssef fx = shuffle<0>(fxyzw);
ssef fx1 = shuffle<0>(fxyzw1);
ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
ssef fw = shuffle<3>(fxyzw);
ssef fw1 = shuffle<3>(fxyzw1);
float4 fxyzw1 = fxyzw - make_float4(1.0f);
float4 fx = shuffle<0>(fxyzw);
float4 fx1 = shuffle<0>(fxyzw1);
float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
float4 fw = shuffle<3>(fxyzw);
float4 fw1 = shuffle<3>(fxyzw1);
avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
vfloat8 g1 = grad(
h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw));
vfloat8 g2 = grad(h2,
make_vfloat8(fx, fx1),
make_vfloat8(fy, fy),
make_vfloat8(fz, fz),
make_vfloat8(fw1, fw1));
return extract<0>(quad_mix(g1, g2, uvws));
}
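
As the hunks above show, the AVX path evaluates both halves of the lattice in
one pass by concatenating two float4 values into a vfloat8 and splitting the
result again afterwards. A minimal sketch of that pattern, using only names
that appear in this diff (make_vfloat8(a, b) is assumed to place a in the low
lanes and b in the high lanes, mirroring the old avxf(__m128, __m128)
constructor):

/* Gradients for lattice corners 0-3 and 4-7 computed in a single 8-wide call. */
vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz));
/* Split back into 4-wide halves and interpolate down to a single scalar result. */
float noise = extract<0>(tri_mix(low(g), high(g), uvw));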

View File

@ -45,17 +45,24 @@ set(SRC
# Disable AVX tests on macOS. Rosetta has problems running them, and other
# platforms should be enough to verify AVX operations are implemented correctly.
if(NOT APPLE)
if(CXX_HAS_SSE)
list(APPEND SRC
util_float8_sse2_test.cpp
)
set_source_files_properties(util_float8_sse2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
util_avxf_avx_test.cpp
util_float8_avx_test.cpp
)
set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
util_avxf_avx2_test.cpp
util_float8_avx2_test.cpp
)
set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(util_float8_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
endif()

View File

@ -1,211 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#include "testing/testing.h"
#include "util/system.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
static bool validate_cpu_capabilities()
{
#ifdef __KERNEL_AVX2__
return system_cpu_support_avx2();
#else
# ifdef __KERNEL_AVX__
return system_cpu_support_avx();
# endif
#endif
}
#define INIT_AVX_TEST \
if (!validate_cpu_capabilities()) \
return; \
\
const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
#define compare_vector_scalar(a, b) \
for (size_t index = 0; index < a.size; index++) \
EXPECT_FLOAT_EQ(a[index], b);
#define compare_vector_vector(a, b) \
for (size_t index = 0; index < a.size; index++) \
EXPECT_FLOAT_EQ(a[index], b[index]);
#define compare_vector_vector_near(a, b, abserror) \
for (size_t index = 0; index < a.size; index++) \
EXPECT_NEAR(a[index], b[index], abserror);
#define basic_test_vv(a, b, op) \
INIT_AVX_TEST \
avxf c = a op b; \
for (size_t i = 0; i < a.size; i++) \
EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
/* vector op float tests */
#define basic_test_vf(a, b, op) \
INIT_AVX_TEST \
avxf c = a op b; \
for (size_t i = 0; i < a.size; i++) \
EXPECT_FLOAT_EQ(c[i], a[i] op b);
static const float float_b = 1.5f;
TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME,
avxf_sub_vv){
basic_test_vv(avxf_a, avxf_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vv){
basic_test_vv(avxf_a, avxf_b, *)} TEST(TEST_CATEGORY_NAME, avxf_div_vv){
basic_test_vv(avxf_a, avxf_b, /)} TEST(TEST_CATEGORY_NAME, avxf_add_vf){
basic_test_vf(avxf_a, float_b, +)} TEST(TEST_CATEGORY_NAME, avxf_sub_vf){
basic_test_vf(avxf_a, float_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vf){
basic_test_vf(avxf_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
avxf_div_vf){basic_test_vf(avxf_a, float_b, /)}
TEST(TEST_CATEGORY_NAME, avxf_ctor)
{
INIT_AVX_TEST
compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
static_cast<float>(index));
compare_vector_scalar(avxf(1.0f), 1.0f);
compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f));
compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f),
avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f));
compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)),
avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f));
}
TEST(TEST_CATEGORY_NAME, avxf_sqrt)
{
INIT_AVX_TEST
compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
}
TEST(TEST_CATEGORY_NAME, avxf_min_max)
{
INIT_AVX_TEST
compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
}
TEST(TEST_CATEGORY_NAME, avxf_set_sign)
{
INIT_AVX_TEST
avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
}
TEST(TEST_CATEGORY_NAME, avxf_msub)
{
INIT_AVX_TEST
avxf res = msub(avxf_a, avxf_b, avxf_c);
avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
(avxf_a[6] * avxf_b[6]) - avxf_c[6],
(avxf_a[5] * avxf_b[5]) - avxf_c[5],
(avxf_a[4] * avxf_b[4]) - avxf_c[4],
(avxf_a[3] * avxf_b[3]) - avxf_c[3],
(avxf_a[2] * avxf_b[2]) - avxf_c[2],
(avxf_a[1] * avxf_b[1]) - avxf_c[1],
(avxf_a[0] * avxf_b[0]) - avxf_c[0]);
compare_vector_vector(res, exp);
}
TEST(TEST_CATEGORY_NAME, avxf_madd)
{
INIT_AVX_TEST
avxf res = madd(avxf_a, avxf_b, avxf_c);
avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
(avxf_a[6] * avxf_b[6]) + avxf_c[6],
(avxf_a[5] * avxf_b[5]) + avxf_c[5],
(avxf_a[4] * avxf_b[4]) + avxf_c[4],
(avxf_a[3] * avxf_b[3]) + avxf_c[3],
(avxf_a[2] * avxf_b[2]) + avxf_c[2],
(avxf_a[1] * avxf_b[1]) + avxf_c[1],
(avxf_a[0] * avxf_b[0]) + avxf_c[0]);
compare_vector_vector(res, exp);
}
TEST(TEST_CATEGORY_NAME, avxf_nmadd)
{
INIT_AVX_TEST
avxf res = nmadd(avxf_a, avxf_b, avxf_c);
avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
avxf_c[6] - (avxf_a[6] * avxf_b[6]),
avxf_c[5] - (avxf_a[5] * avxf_b[5]),
avxf_c[4] - (avxf_a[4] * avxf_b[4]),
avxf_c[3] - (avxf_a[3] * avxf_b[3]),
avxf_c[2] - (avxf_a[2] * avxf_b[2]),
avxf_c[1] - (avxf_a[1] * avxf_b[1]),
avxf_c[0] - (avxf_a[0] * avxf_b[0]));
compare_vector_vector(res, exp);
}
TEST(TEST_CATEGORY_NAME, avxf_compare)
{
INIT_AVX_TEST
avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
avxb res = a <= b;
int exp[8] = {
a[0] <= b[0] ? -1 : 0,
a[1] <= b[1] ? -1 : 0,
a[2] <= b[2] ? -1 : 0,
a[3] <= b[3] ? -1 : 0,
a[4] <= b[4] ? -1 : 0,
a[5] <= b[5] ? -1 : 0,
a[6] <= b[6] ? -1 : 0,
a[7] <= b[7] ? -1 : 0,
};
compare_vector_vector(res, exp);
}
TEST(TEST_CATEGORY_NAME, avxf_permute)
{
INIT_AVX_TEST
avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
}
TEST(TEST_CATEGORY_NAME, avxf_blend)
{
INIT_AVX_TEST
avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
}
TEST(TEST_CATEGORY_NAME, avxf_shuffle)
{
INIT_AVX_TEST
avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
}
TEST(TEST_CATEGORY_NAME, avxf_cross)
{
INIT_AVX_TEST
avxf res = cross(avxf_b, avxf_c);
compare_vector_vector_near(res,
avxf(0.0f,
-9.5367432e-07f,
0.0f,
4.7683716e-07f,
0.0f,
-3.8146973e-06f,
3.8146973e-06f,
3.8146973e-06f),
0.000002000f);
}
TEST(TEST_CATEGORY_NAME, avxf_dot3)
{
INIT_AVX_TEST
float den, den2;
dot3(avxf_a, avxf_b, den, den2);
EXPECT_FLOAT_EQ(den, 14.9f);
EXPECT_FLOAT_EQ(den2, 2.9f);
}
CCL_NAMESPACE_END

View File

@ -1,11 +1,13 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#define __KERNEL_SSE__
#define __KERNEL_AVX__
#define __KERNEL_AVX2__
#define TEST_CATEGORY_NAME util_avx2
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__AVX2__)
# include "util_avxf_test.h"
# include "util_float8_test.h"
#endif

View File

@ -1,11 +1,12 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#define __KERNEL_SSE__
#define __KERNEL_AVX__
#define TEST_CATEGORY_NAME util_avx
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__AVX__)
# include "util_avxf_test.h"
# include "util_float8_test.h"
#endif

View File

@ -0,0 +1,12 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#define __KERNEL_SSE__
#define __KERNEL_SSE2__
#define TEST_CATEGORY_NAME util_sse2
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__SSE2__)
# include "util_float8_test.h"
#endif

View File

@ -0,0 +1,103 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#include "testing/testing.h"
#include "util/math.h"
#include "util/system.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
static bool validate_cpu_capabilities()
{
#if defined(__KERNEL_AVX2__)
return system_cpu_support_avx2();
#elif defined(__KERNEL_AVX__)
return system_cpu_support_avx();
#elif defined(__KERNEL_SSE2__)
return system_cpu_support_sse2();
#else
return false;
#endif
}
#define INIT_FLOAT8_TEST \
if (!validate_cpu_capabilities()) \
return; \
\
const vfloat8 float8_a = make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
const vfloat8 float8_b = make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
const vfloat8 float8_c = make_vfloat8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
#define compare_vector_scalar(a, b) \
for (size_t index = 0; index < 8; index++) \
EXPECT_FLOAT_EQ(a[index], b);
#define compare_vector_vector(a, b) \
for (size_t index = 0; index < 8; index++) \
EXPECT_FLOAT_EQ(a[index], b[index]);
#define compare_vector_vector_near(a, b, abserror) \
for (size_t index = 0; index < 8; index++) \
EXPECT_NEAR(a[index], b[index], abserror);
#define basic_test_vv(a, b, op) \
INIT_FLOAT8_TEST \
vfloat8 c = a op b; \
for (size_t i = 0; i < 8; i++) \
EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
/* vector op float tests */
#define basic_test_vf(a, b, op) \
INIT_FLOAT8_TEST \
vfloat8 c = a op b; \
for (size_t i = 0; i < 8; i++) \
EXPECT_FLOAT_EQ(c[i], a[i] op b);
static const float float_b = 1.5f;
TEST(TEST_CATEGORY_NAME,
float8_add_vv){basic_test_vv(float8_a, float8_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vv){
basic_test_vv(float8_a, float8_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vv){
basic_test_vv(float8_a, float8_b, *)} TEST(TEST_CATEGORY_NAME, float8_div_vv){
basic_test_vv(float8_a, float8_b, /)} TEST(TEST_CATEGORY_NAME, float8_add_vf){
basic_test_vf(float8_a, float_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vf){
basic_test_vf(float8_a, float_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vf){
basic_test_vf(float8_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
float8_div_vf){basic_test_vf(float8_a, float_b, /)}
TEST(TEST_CATEGORY_NAME, float8_ctor)
{
INIT_FLOAT8_TEST
compare_vector_scalar(make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
static_cast<float>(index));
compare_vector_scalar(make_vfloat8(1.0f), 1.0f);
}
TEST(TEST_CATEGORY_NAME, float8_sqrt)
{
INIT_FLOAT8_TEST
compare_vector_vector(sqrt(make_vfloat8(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
}
TEST(TEST_CATEGORY_NAME, float8_min_max)
{
INIT_FLOAT8_TEST
compare_vector_vector(min(float8_a, float8_b), float8_a);
compare_vector_vector(max(float8_a, float8_b), float8_b);
}
TEST(TEST_CATEGORY_NAME, float8_shuffle)
{
INIT_FLOAT8_TEST
vfloat8 res0 = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(float8_a);
compare_vector_vector(res0, make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.6f, 0.8f, 0.7f, 0.5f));
vfloat8 res1 = shuffle<3>(float8_a);
compare_vector_vector(res1, make_vfloat8(0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f));
vfloat8 res2 = shuffle<3, 2, 1, 0>(float8_a, float8_b);
compare_vector_vector(res2, make_vfloat8(0.4f, 0.3f, 2.0f, 1.0f, 0.8f, 0.7f, 6.0f, 5.0f));
}
CCL_NAMESPACE_END

View File

@ -69,6 +69,7 @@ set(SRC_HEADERS
math_int2.h
math_int3.h
math_int4.h
math_int8.h
math_matrix.h
md5.h
murmurhash.h
@ -85,13 +86,7 @@ set(SRC_HEADERS
rect.h
set.h
simd.h
avxf.h
avxb.h
avxi.h
semaphore.h
sseb.h
ssef.h
ssei.h
stack_allocator.h
static_assert.h
stats.h
@ -118,6 +113,8 @@ set(SRC_HEADERS
types_int3_impl.h
types_int4.h
types_int4_impl.h
types_int8.h
types_int8_impl.h
types_spectrum.h
types_uchar2.h
types_uchar2_impl.h

View File

@ -1,230 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Modifications Copyright 2014-2022 Blender Foundation. */
#ifndef __UTIL_AVXB_H__
#define __UTIL_AVXB_H__
CCL_NAMESPACE_BEGIN
struct avxf;
/*! 4-wide SSE bool type. */
struct avxb {
typedef avxb Mask; // mask type
typedef avxf Float; // float type
enum { size = 8 }; // number of SIMD elements
union {
__m256 m256;
int32_t v[8];
}; // data
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline avxb()
{
}
__forceinline avxb(const avxb &other)
{
m256 = other.m256;
}
__forceinline avxb &operator=(const avxb &other)
{
m256 = other.m256;
return *this;
}
__forceinline avxb(const __m256 input) : m256(input)
{
}
__forceinline avxb(const __m128 &a, const __m128 &b)
: m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
{
}
__forceinline operator const __m256 &(void) const
{
return m256;
}
__forceinline operator const __m256i(void) const
{
return _mm256_castps_si256(m256);
}
__forceinline operator const __m256d(void) const
{
return _mm256_castps_pd(m256);
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline avxb(FalseTy) : m256(_mm256_setzero_ps())
{
}
__forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1)))
{
}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline bool operator[](const size_t i) const
{
assert(i < 8);
return (_mm256_movemask_ps(m256) >> i) & 1;
}
__forceinline int32_t &operator[](const size_t i)
{
assert(i < 8);
return v[i];
}
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator!(const avxb &a)
{
return _mm256_xor_ps(a, avxb(True));
}
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator&(const avxb &a, const avxb &b)
{
return _mm256_and_ps(a, b);
}
__forceinline const avxb operator|(const avxb &a, const avxb &b)
{
return _mm256_or_ps(a, b);
}
__forceinline const avxb operator^(const avxb &a, const avxb &b)
{
return _mm256_xor_ps(a, b);
}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator&=(avxb &a, const avxb &b)
{
return a = a & b;
}
__forceinline const avxb operator|=(avxb &a, const avxb &b)
{
return a = a | b;
}
__forceinline const avxb operator^=(avxb &a, const avxb &b)
{
return a = a ^ b;
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator!=(const avxb &a, const avxb &b)
{
return _mm256_xor_ps(a, b);
}
__forceinline const avxb operator==(const avxb &a, const avxb &b)
{
#ifdef __KERNEL_AVX2__
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
#else
__m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
__m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
__m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
__m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
__m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
__m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
__m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
return _mm256_castsi256_ps(result);
#endif
}
__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
{
#if defined(__KERNEL_SSE41__)
return _mm256_blendv_ps(f, t, m);
#else
return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
#endif
}
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb unpacklo(const avxb &a, const avxb &b)
{
return _mm256_unpacklo_ps(a, b);
}
__forceinline const avxb unpackhi(const avxb &a, const avxb &b)
{
return _mm256_unpackhi_ps(a, b);
}
////////////////////////////////////////////////////////////////////////////////
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_SSE41__)
__forceinline uint32_t popcnt(const avxb &a)
{
return _mm_popcnt_u32(_mm256_movemask_ps(a));
}
#else
__forceinline uint32_t popcnt(const avxb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
bool(a[7]);
}
#endif
__forceinline bool reduce_and(const avxb &a)
{
return _mm256_movemask_ps(a) == 0xf;
}
__forceinline bool reduce_or(const avxb &a)
{
return _mm256_movemask_ps(a) != 0x0;
}
__forceinline bool all(const avxb &b)
{
return _mm256_movemask_ps(b) == 0xf;
}
__forceinline bool any(const avxb &b)
{
return _mm256_movemask_ps(b) != 0x0;
}
__forceinline bool none(const avxb &b)
{
return _mm256_movemask_ps(b) == 0x0;
}
__forceinline uint32_t movemask(const avxb &a)
{
return _mm256_movemask_ps(a);
}
////////////////////////////////////////////////////////////////////////////////
/// Debug Functions
////////////////////////////////////////////////////////////////////////////////
ccl_device_inline void print_avxb(const char *label, const avxb &a)
{
printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}
CCL_NAMESPACE_END
#endif

View File

@ -1,379 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2016 Intel Corporation */
#ifndef __UTIL_AVXF_H__
#define __UTIL_AVXF_H__
CCL_NAMESPACE_BEGIN
struct avxb;
struct avxf {
typedef avxf Float;
enum { size = 8 }; /* Number of SIMD elements. */
union {
__m256 m256;
float f[8];
int i[8];
};
__forceinline avxf()
{
}
__forceinline avxf(const avxf &other)
{
m256 = other.m256;
}
__forceinline avxf &operator=(const avxf &other)
{
m256 = other.m256;
return *this;
}
__forceinline avxf(const __m256 a) : m256(a)
{
}
__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a))
{
}
__forceinline operator const __m256 &() const
{
return m256;
}
__forceinline operator __m256 &()
{
return m256;
}
__forceinline avxf(float a) : m256(_mm256_set1_ps(a))
{
}
__forceinline avxf(float high32x4, float low32x4)
: m256(_mm256_set_ps(
high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4))
{
}
__forceinline avxf(float a3, float a2, float a1, float a0)
: m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0))
{
}
__forceinline avxf(
float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
: m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0))
{
}
__forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x))
{
}
__forceinline avxf(int a3, int a2, int a1, int a0)
{
const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
m256 = _mm256_castsi256_ps(foo);
}
__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
{
const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
m256 = _mm256_castsi256_ps(foo);
}
__forceinline avxf(__m128 a, __m128 b)
{
const __m256 foo = _mm256_castps128_ps256(a);
m256 = _mm256_insertf128_ps(foo, b, 1);
}
__forceinline const float &operator[](const size_t i) const
{
assert(i < 8);
return f[i];
}
__forceinline float &operator[](const size_t i)
{
assert(i < 8);
return f[i];
}
};
__forceinline avxf cross(const avxf &a, const avxf &b)
{
avxf r(0.0,
a[4] * b[5] - a[5] * b[4],
a[6] * b[4] - a[4] * b[6],
a[5] * b[6] - a[6] * b[5],
0.0,
a[0] * b[1] - a[1] * b[0],
a[2] * b[0] - a[0] * b[2],
a[1] * b[2] - a[2] * b[1]);
return r;
}
__forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
{
const avxf t = _mm256_mul_ps(a.m256, b.m256);
den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6];
}
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf cast(const __m256i &a)
{
return _mm256_castsi256_ps(a);
}
__forceinline const avxf mm256_sqrt(const avxf &a)
{
return _mm256_sqrt_ps(a.m256);
}
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf operator+(const avxf &a, const avxf &b)
{
return _mm256_add_ps(a.m256, b.m256);
}
__forceinline const avxf operator+(const avxf &a, const float &b)
{
return a + avxf(b);
}
__forceinline const avxf operator+(const float &a, const avxf &b)
{
return avxf(a) + b;
}
__forceinline const avxf operator-(const avxf &a, const avxf &b)
{
return _mm256_sub_ps(a.m256, b.m256);
}
__forceinline const avxf operator-(const avxf &a, const float &b)
{
return a - avxf(b);
}
__forceinline const avxf operator-(const float &a, const avxf &b)
{
return avxf(a) - b;
}
__forceinline const avxf operator*(const avxf &a, const avxf &b)
{
return _mm256_mul_ps(a.m256, b.m256);
}
__forceinline const avxf operator*(const avxf &a, const float &b)
{
return a * avxf(b);
}
__forceinline const avxf operator*(const float &a, const avxf &b)
{
return avxf(a) * b;
}
__forceinline const avxf operator/(const avxf &a, const avxf &b)
{
return _mm256_div_ps(a.m256, b.m256);
}
__forceinline const avxf operator/(const avxf &a, const float &b)
{
return a / avxf(b);
}
__forceinline const avxf operator/(const float &a, const avxf &b)
{
return avxf(a) / b;
}
__forceinline const avxf operator|(const avxf &a, const avxf &b)
{
return _mm256_or_ps(a.m256, b.m256);
}
__forceinline const avxf operator^(const avxf &a, const avxf &b)
{
return _mm256_xor_ps(a.m256, b.m256);
}
__forceinline const avxf operator&(const avxf &a, const avxf &b)
{
return _mm256_and_ps(a.m256, b.m256);
}
__forceinline const avxf max(const avxf &a, const avxf &b)
{
return _mm256_max_ps(a.m256, b.m256);
}
__forceinline const avxf min(const avxf &a, const avxf &b)
{
return _mm256_min_ps(a.m256, b.m256);
}
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf)
{
return _mm256_permutevar_ps(a, shuf);
}
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf shuffle(const avxf &a)
{
return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const avxf shuffle(const avxf &a, const avxf &b)
{
return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const avxf shuffle(const avxf &a)
{
return shuffle<i0, i1, i2, i3>(a, a);
}
template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b)
{
return shuffle<i0, i0, i0, i0>(a, b);
}
template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
{
return shuffle<i0>(a, a);
}
template<size_t i> __forceinline float extract(const avxf &a)
{
__m256 b = shuffle<i, i, i, i>(a).m256;
return _mm256_cvtss_f32(b);
}
template<> __forceinline float extract<0>(const avxf &a)
{
return _mm256_cvtss_f32(a.m256);
}
__forceinline ssef low(const avxf &a)
{
return _mm256_extractf128_ps(a.m256, 0);
}
__forceinline ssef high(const avxf &a)
{
return _mm256_extractf128_ps(a.m256, 1);
}
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf permute(const avxf &a)
{
#ifdef __KERNEL_AVX2__
return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
#else
float temp[8];
_mm256_storeu_ps((float *)&temp, a);
return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
#endif
}
template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
ccl_device_inline const avxf set_sign_bit(const avxf &a)
{
return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
}
template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
{
return _mm256_blend_ps(
a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
}
template<size_t S0, size_t S1, size_t S2, size_t S3>
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
{
return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
}
//#if defined(__KERNEL_SSE41__)
__forceinline avxf maxi(const avxf &a, const avxf &b)
{
const avxf ci = _mm256_max_ps(a, b);
return ci;
}
__forceinline avxf mini(const avxf &a, const avxf &b)
{
const avxf ci = _mm256_min_ps(a, b);
return ci;
}
//#endif
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
{
#ifdef __KERNEL_AVX2__
return _mm256_fmadd_ps(a, b, c);
#else
return c + (a * b);
#endif
}
__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
{
#ifdef __KERNEL_AVX2__
return _mm256_fnmadd_ps(a, b, c);
#else
return c - (a * b);
#endif
}
__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
{
#ifdef __KERNEL_AVX2__
return _mm256_fmsub_ps(a, b, c);
#else
return (a * b) - c;
#endif
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator<=(const avxf &a, const avxf &b)
{
return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
{
return _mm256_blendv_ps(f, t, m);
}
////////////////////////////////////////////////////////////////////////////////
/// Common Functions
////////////////////////////////////////////////////////////////////////////////
__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
{
return madd(t, b, (avxf(1.0f) - t) * a);
}
#ifndef _mm256_set_m128
# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
_mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
#endif
#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
_mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
CCL_NAMESPACE_END
#endif

View File

@ -1,732 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2009-2013 Intel Corporation */
#ifndef __UTIL_AVXI_H__
#define __UTIL_AVXI_H__
CCL_NAMESPACE_BEGIN
struct avxb;
struct avxi {
typedef avxb Mask; // mask type for us
enum { size = 8 }; // number of SIMD elements
union { // data
__m256i m256;
#if !defined(__KERNEL_AVX2__)
struct {
__m128i l, h;
};
#endif
int32_t v[8];
};
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline avxi()
{
}
__forceinline avxi(const avxi &a)
{
m256 = a.m256;
}
__forceinline avxi &operator=(const avxi &a)
{
m256 = a.m256;
return *this;
}
__forceinline avxi(const __m256i a) : m256(a)
{
}
__forceinline operator const __m256i &(void) const
{
return m256;
}
__forceinline operator __m256i &(void)
{
return m256;
}
__forceinline explicit avxi(const ssei &a)
: m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
{
}
__forceinline avxi(const ssei &a, const ssei &b)
: m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
{
}
#if defined(__KERNEL_AVX2__)
__forceinline avxi(const __m128i &a, const __m128i &b)
: m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
{
}
#else
__forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
{
}
#endif
__forceinline explicit avxi(const int32_t *const a)
: m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
{
}
__forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
{
}
__forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
{
}
__forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
: m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
{
}
__forceinline avxi(
int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
: m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
{
}
__forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
{
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
{
}
#if defined(__KERNEL_AVX2__)
__forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
{
}
__forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
{
}
__forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
{
}
#else
__forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
{
}
__forceinline avxi(PosInfTy)
: m256(_mm256_set_epi32(
pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
{
}
__forceinline avxi(NegInfTy)
: m256(_mm256_set_epi32(
neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
{
}
#endif
__forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
{
}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline const int32_t &operator[](const size_t i) const
{
assert(i < 8);
return v[i];
}
__forceinline int32_t &operator[](const size_t i)
{
assert(i < 8);
return v[i];
}
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxi cast(const __m256 &a)
{
return _mm256_castps_si256(a);
}
__forceinline const avxi operator+(const avxi &a)
{
return a;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator-(const avxi &a)
{
return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
}
__forceinline const avxi abs(const avxi &a)
{
return _mm256_abs_epi32(a.m256);
}
#else
__forceinline const avxi operator-(const avxi &a)
{
return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
}
__forceinline const avxi abs(const avxi &a)
{
return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
}
#endif
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator+(const avxi &a, const avxi &b)
{
return _mm256_add_epi32(a.m256, b.m256);
}
#else
__forceinline const avxi operator+(const avxi &a, const avxi &b)
{
return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
}
#endif
__forceinline const avxi operator+(const avxi &a, const int32_t b)
{
return a + avxi(b);
}
__forceinline const avxi operator+(const int32_t a, const avxi &b)
{
return avxi(a) + b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator-(const avxi &a, const avxi &b)
{
return _mm256_sub_epi32(a.m256, b.m256);
}
#else
__forceinline const avxi operator-(const avxi &a, const avxi &b)
{
return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
}
#endif
__forceinline const avxi operator-(const avxi &a, const int32_t b)
{
return a - avxi(b);
}
__forceinline const avxi operator-(const int32_t a, const avxi &b)
{
return avxi(a) - b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator*(const avxi &a, const avxi &b)
{
return _mm256_mullo_epi32(a.m256, b.m256);
}
#else
__forceinline const avxi operator*(const avxi &a, const avxi &b)
{
return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
}
#endif
__forceinline const avxi operator*(const avxi &a, const int32_t b)
{
return a * avxi(b);
}
__forceinline const avxi operator*(const int32_t a, const avxi &b)
{
return avxi(a) * b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator&(const avxi &a, const avxi &b)
{
return _mm256_and_si256(a.m256, b.m256);
}
#else
__forceinline const avxi operator&(const avxi &a, const avxi &b)
{
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
#endif
__forceinline const avxi operator&(const avxi &a, const int32_t b)
{
return a & avxi(b);
}
__forceinline const avxi operator&(const int32_t a, const avxi &b)
{
return avxi(a) & b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator|(const avxi &a, const avxi &b)
{
return _mm256_or_si256(a.m256, b.m256);
}
#else
__forceinline const avxi operator|(const avxi &a, const avxi &b)
{
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
#endif
__forceinline const avxi operator|(const avxi &a, const int32_t b)
{
return a | avxi(b);
}
__forceinline const avxi operator|(const int32_t a, const avxi &b)
{
return avxi(a) | b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator^(const avxi &a, const avxi &b)
{
return _mm256_xor_si256(a.m256, b.m256);
}
#else
__forceinline const avxi operator^(const avxi &a, const avxi &b)
{
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
#endif
__forceinline const avxi operator^(const avxi &a, const int32_t b)
{
return a ^ avxi(b);
}
__forceinline const avxi operator^(const int32_t a, const avxi &b)
{
return avxi(a) ^ b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi operator<<(const avxi &a, const int32_t n)
{
return _mm256_slli_epi32(a.m256, n);
}
__forceinline const avxi operator>>(const avxi &a, const int32_t n)
{
return _mm256_srai_epi32(a.m256, n);
}
__forceinline const avxi sra(const avxi &a, const int32_t b)
{
return _mm256_srai_epi32(a.m256, b);
}
__forceinline const avxi srl(const avxi &a, const int32_t b)
{
return _mm256_srli_epi32(a.m256, b);
}
#else
__forceinline const avxi operator<<(const avxi &a, const int32_t n)
{
return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
}
__forceinline const avxi operator>>(const avxi &a, const int32_t n)
{
return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
}
__forceinline const avxi sra(const avxi &a, const int32_t b)
{
return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
}
__forceinline const avxi srl(const avxi &a, const int32_t b)
{
return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
}
#endif
#if defined(__KERNEL_AVX2__)
__forceinline const avxi min(const avxi &a, const avxi &b)
{
return _mm256_min_epi32(a.m256, b.m256);
}
#else
__forceinline const avxi min(const avxi &a, const avxi &b)
{
return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
}
#endif
__forceinline const avxi min(const avxi &a, const int32_t b)
{
return min(a, avxi(b));
}
__forceinline const avxi min(const int32_t a, const avxi &b)
{
return min(avxi(a), b);
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxi max(const avxi &a, const avxi &b)
{
return _mm256_max_epi32(a.m256, b.m256);
}
#else
__forceinline const avxi max(const avxi &a, const avxi &b)
{
return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
}
#endif
__forceinline const avxi max(const avxi &a, const int32_t b)
{
return max(a, avxi(b));
}
__forceinline const avxi max(const int32_t a, const avxi &b)
{
return max(avxi(a), b);
}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline avxi &operator+=(avxi &a, const avxi &b)
{
return a = a + b;
}
__forceinline avxi &operator+=(avxi &a, const int32_t b)
{
return a = a + b;
}
__forceinline avxi &operator-=(avxi &a, const avxi &b)
{
return a = a - b;
}
__forceinline avxi &operator-=(avxi &a, const int32_t b)
{
return a = a - b;
}
__forceinline avxi &operator*=(avxi &a, const avxi &b)
{
return a = a * b;
}
__forceinline avxi &operator*=(avxi &a, const int32_t b)
{
return a = a * b;
}
__forceinline avxi &operator&=(avxi &a, const avxi &b)
{
return a = a & b;
}
__forceinline avxi &operator&=(avxi &a, const int32_t b)
{
return a = a & b;
}
__forceinline avxi &operator|=(avxi &a, const avxi &b)
{
return a = a | b;
}
__forceinline avxi &operator|=(avxi &a, const int32_t b)
{
return a = a | b;
}
__forceinline avxi &operator^=(avxi &a, const avxi &b)
{
return a = a ^ b;
}
__forceinline avxi &operator^=(avxi &a, const int32_t b)
{
return a = a ^ b;
}
__forceinline avxi &operator<<=(avxi &a, const int32_t b)
{
return a = a << b;
}
__forceinline avxi &operator>>=(avxi &a, const int32_t b)
{
return a = a >> b;
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_AVX2__)
__forceinline const avxb operator==(const avxi &a, const avxi &b)
{
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
}
#else
__forceinline const avxb operator==(const avxi &a, const avxi &b)
{
return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
_mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
}
#endif
__forceinline const avxb operator==(const avxi &a, const int32_t b)
{
return a == avxi(b);
}
__forceinline const avxb operator==(const int32_t a, const avxi &b)
{
return avxi(a) == b;
}
__forceinline const avxb operator!=(const avxi &a, const avxi &b)
{
return !(a == b);
}
__forceinline const avxb operator!=(const avxi &a, const int32_t b)
{
return a != avxi(b);
}
__forceinline const avxb operator!=(const int32_t a, const avxi &b)
{
return avxi(a) != b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxb operator<(const avxi &a, const avxi &b)
{
return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
}
#else
__forceinline const avxb operator<(const avxi &a, const avxi &b)
{
return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
_mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
}
#endif
__forceinline const avxb operator<(const avxi &a, const int32_t b)
{
return a < avxi(b);
}
__forceinline const avxb operator<(const int32_t a, const avxi &b)
{
return avxi(a) < b;
}
__forceinline const avxb operator>=(const avxi &a, const avxi &b)
{
return !(a < b);
}
__forceinline const avxb operator>=(const avxi &a, const int32_t b)
{
return a >= avxi(b);
}
__forceinline const avxb operator>=(const int32_t a, const avxi &b)
{
return avxi(a) >= b;
}
#if defined(__KERNEL_AVX2__)
__forceinline const avxb operator>(const avxi &a, const avxi &b)
{
return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
}
#else
__forceinline const avxb operator>(const avxi &a, const avxi &b)
{
return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
_mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
}
#endif
__forceinline const avxb operator>(const avxi &a, const int32_t b)
{
return a > avxi(b);
}
__forceinline const avxb operator>(const int32_t a, const avxi &b)
{
return avxi(a) > b;
}
__forceinline const avxb operator<=(const avxi &a, const avxi &b)
{
return !(a > b);
}
__forceinline const avxb operator<=(const avxi &a, const int32_t b)
{
return a <= avxi(b);
}
__forceinline const avxb operator<=(const int32_t a, const avxi &b)
{
return avxi(a) <= b;
}
__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
{
return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
}
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_AVX2__)
__forceinline avxi unpacklo(const avxi &a, const avxi &b)
{
return _mm256_unpacklo_epi32(a.m256, b.m256);
}
__forceinline avxi unpackhi(const avxi &a, const avxi &b)
{
return _mm256_unpackhi_epi32(a.m256, b.m256);
}
#else
__forceinline avxi unpacklo(const avxi &a, const avxi &b)
{
return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
__forceinline avxi unpackhi(const avxi &a, const avxi &b)
{
return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
#endif
template<size_t i> __forceinline const avxi shuffle(const avxi &a)
{
return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
}
template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
{
return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
}
template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
{
return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const avxi shuffle(const avxi &a)
{
return _mm256_castps_si256(
_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const avxi shuffle(const avxi &a, const avxi &b)
{
return _mm256_castps_si256(_mm256_shuffle_ps(
_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
}
template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
{
return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
}
template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
{
return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
}
template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
{
return _mm256_castps_si256(
_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
}
__forceinline const avxi broadcast(const int *ptr)
{
return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
}
template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
{
return _mm256_insertf128_si256(a, b, i);
}
template<size_t i> __forceinline const ssei extract(const avxi &a)
{
return _mm256_extractf128_si256(a, i);
}
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxi vreduce_min2(const avxi &v)
{
return min(v, shuffle<1, 0, 3, 2>(v));
}
__forceinline const avxi vreduce_min4(const avxi &v)
{
avxi v1 = vreduce_min2(v);
return min(v1, shuffle<2, 3, 0, 1>(v1));
}
__forceinline const avxi vreduce_min(const avxi &v)
{
avxi v1 = vreduce_min4(v);
return min(v1, shuffle<1, 0>(v1));
}
__forceinline const avxi vreduce_max2(const avxi &v)
{
return max(v, shuffle<1, 0, 3, 2>(v));
}
__forceinline const avxi vreduce_max4(const avxi &v)
{
avxi v1 = vreduce_max2(v);
return max(v1, shuffle<2, 3, 0, 1>(v1));
}
__forceinline const avxi vreduce_max(const avxi &v)
{
avxi v1 = vreduce_max4(v);
return max(v1, shuffle<1, 0>(v1));
}
__forceinline const avxi vreduce_add2(const avxi &v)
{
return v + shuffle<1, 0, 3, 2>(v);
}
__forceinline const avxi vreduce_add4(const avxi &v)
{
avxi v1 = vreduce_add2(v);
return v1 + shuffle<2, 3, 0, 1>(v1);
}
__forceinline const avxi vreduce_add(const avxi &v)
{
avxi v1 = vreduce_add4(v);
return v1 + shuffle<1, 0>(v1);
}
__forceinline int reduce_min(const avxi &v)
{
return extract<0>(extract<0>(vreduce_min(v)));
}
__forceinline int reduce_max(const avxi &v)
{
return extract<0>(extract<0>(vreduce_max(v)));
}
__forceinline int reduce_add(const avxi &v)
{
return extract<0>(extract<0>(vreduce_add(v)));
}
__forceinline uint32_t select_min(const avxi &v)
{
return __bsf(movemask(v == vreduce_min(v)));
}
__forceinline uint32_t select_max(const avxi &v)
{
return __bsf(movemask(v == vreduce_max(v)));
}
__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
{
const avxi a = select(valid, v, avxi(pos_inf));
return __bsf(movemask(valid & (a == vreduce_min(a))));
}
__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
{
const avxi a = select(valid, v, avxi(neg_inf));
return __bsf(movemask(valid & (a == vreduce_max(a))));
}
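For illustration (not part of the patch), the reductions above work in three halving steps, each combining lanes that are one shuffle apart; the result holds the reduced value in every lane, so reduce_min() simply extracts lane 0. A worked 8-lane trace of reduce_min for v = (5, 2, 7, 1, 9, 0, 4, 3):
  vreduce_min2(v):  min with the pair-swapped vector        -> (2, 2, 1, 1, 0, 0, 3, 3)
  vreduce_min4(..): min with the half-lane-swapped vector   -> (1, 1, 1, 1, 0, 0, 0, 0)
  vreduce_min(..):  min across the two 128-bit halves       -> (0, 0, 0, 0, 0, 0, 0, 0)
select_min(v) then compares v against this broadcast minimum and returns the index of the first matching lane via movemask + __bsf (lane 5 in this example).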
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
ccl_device_inline void print_avxi(const char *label, const avxi &a)
{
printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}
CCL_NAMESPACE_END
#endif

View File

@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
* exp = exponent, encoded as uint32_t
* e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
*/
template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg)
template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg)
{
ssef ret;
ret = arg * cast(ssei(e2coeff));
ret = ssef(cast(ret));
ret = ret * cast(ssei(exp));
ret = cast(ssei(ret));
float4 ret = arg * cast(make_int4(e2coeff));
ret = make_float4(cast(ret));
ret = ret * cast(make_int4(exp));
ret = cast(make_int4(ret));
return ret;
}
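Background note (not part of the patch): fastpow() is the classic bit-pattern power approximation. For a positive float x the integer representation is roughly bits(x) ~= 2^23 * (log2(x) + 127 - sigma) for a small constant sigma, i.e. an affine function of log2(x). Converting bits(x) to a float, scaling it by the exponent and converting the result back into a bit pattern therefore approximates x^exp; the e2coeff pre-multiply folds the 127-bias correction (and an accuracy tweak such as the 0.994 coefficient used below) into a single multiplication.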
/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x)
{
ssef approx2 = old_result * old_result;
ssef approx4 = approx2 * approx2;
ssef t = x / approx4;
ssef summ = madd(ssef(4.0f), old_result, t);
return summ * ssef(1.0f / 5.0f);
float4 approx2 = old_result * old_result;
float4 approx4 = approx2 * approx2;
float4 t = x / approx4;
float4 summ = madd(make_float4(4.0f), old_result, t);
return summ * make_float4(1.0f / 5.0f);
}
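The refinement above is Newton-Raphson applied to f(y) = y^5 - x with y ~= x^(1/5):

  y' = y - (y^5 - x) / (5 * y^4) = (4*y + x / y^4) / 5

which is exactly madd(4, old_result, x / approx4) scaled by 1/5.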
/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
ccl_device_inline ssef fastpow24(const ssef &arg)
ccl_device_inline float4 fastpow24(const float4 &arg)
{
/* max, avg and |avg| errors were calculated in gcc without FMA instructions
* The final precision should be better than powf in glibc */
@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
/* 0x3F4CCCCD = 4/5 */
/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
ssef arg2 = arg * arg;
ssef arg4 = arg2 * arg2;
float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(
arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
float4 arg2 = arg * arg;
float4 arg4 = arg2 * arg2;
/* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
x = improve_5throot_solution(x, arg4);
@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
return x * (x * x);
}
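Note (not part of the patch): fastpow24() relies on the decomposition 2.4 = 3 * (4/5). It first computes x ~= arg^(4/5) with fastpow() plus Newton-Raphson refinement, then returns x * (x * x) = arg^(12/5) = arg^2.4.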
ccl_device ssef color_srgb_to_linear(const ssef &c)
ccl_device float4 color_srgb_to_linear(const float4 &c)
{
sseb cmp = c < ssef(0.04045f);
ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f));
ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */
ssef gte = fastpow24(gtebase);
int4 cmp = c < make_float4(0.04045f);
float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f));
float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */
float4 gte = fastpow24(gtebase);
return select(cmp, lt, gte);
}
#endif /* __KERNEL_SSE2__ */
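This matches the scalar sRGB-to-linear transfer function evaluated on all four lanes at once:

  linear = c / 12.92                      if c < 0.04045
  linear = ((c + 0.055) / 1.055)^2.4      otherwise

with the power term computed by fastpow24() and the branch replaced by select() on the lane-wise comparison mask.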
@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c)
ccl_device float4 color_srgb_to_linear_v4(float4 c)
{
#ifdef __KERNEL_SSE2__
ssef r_ssef;
float4 &r = (float4 &)r_ssef;
r = c;
r_ssef = color_srgb_to_linear(r_ssef);
float4 r = c;
r = color_srgb_to_linear(r);
r.w = c.w;
return r;
#else

View File

@ -154,17 +154,17 @@ ccl_device_inline half float_to_half_display(const float f)
ccl_device_inline half4 float4_to_half4_display(const float4 f)
{
#ifdef __KERNEL_SSE2__
#ifdef __KERNEL_SSE__
/* CPU: SSE and AVX. */
ssef x = min(max(load4f(f), 0.0f), 65504.0f);
float4 x = min(max(f, make_float4(0.0f)), make_float4(65504.0f));
# ifdef __KERNEL_AVX2__
ssei rpack = _mm_cvtps_ph(x, 0);
int4 rpack = int4(_mm_cvtps_ph(x, 0));
# else
ssei absolute = cast(x) & 0x7FFFFFFF;
ssei Z = absolute + 0xC8000000;
ssei result = andnot(absolute < 0x38800000, Z);
ssei rshift = (result >> 13) & 0x7FFF;
ssei rpack = _mm_packs_epi32(rshift, rshift);
int4 absolute = cast(x) & make_int4(0x7FFFFFFF);
int4 Z = absolute + make_int4(0xC8000000);
int4 result = andnot(absolute < make_int4(0x38800000), Z);
int4 rshift = (result >> 13) & make_int4(0x7FFF);
int4 rpack = int4(_mm_packs_epi32(rshift, rshift));
# endif
half4 h;
_mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
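Rough sketch of the non-F16C path above (not part of the patch): the input is clamped to [0, 65504] (the largest finite half), so the sign bit is already zero. Adding 0xC8000000 is, modulo 2^32, the same as subtracting 112 << 23, which rebiases the float exponent (bias 127) to the half exponent (bias 15). Inputs below the smallest normal half 2^-14 (float bits 0x38800000) are flushed to zero by the andnot(), the shift by 13 drops the 23 - 10 excess mantissa bits, and the 0x7FFF mask keeps the 15 value bits before packing down to 16-bit lanes.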

View File

@ -222,7 +222,7 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
/* SSE Versions Of Jenkins Lookup3 Hash Functions */
#ifdef __KERNEL_SSE2__
#ifdef __KERNEL_SSE__
# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
# define mix(a, b, c) \
@ -265,10 +265,10 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
c -= rot(b, 24); \
}
ccl_device_inline ssei hash_ssei(ssei kx)
ccl_device_inline int4 hash_int4(int4 kx)
{
ssei a, b, c;
a = b = c = ssei(0xdeadbeef + (1 << 2) + 13);
int4 a, b, c;
a = b = c = make_int4(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@ -276,10 +276,10 @@ ccl_device_inline ssei hash_ssei(ssei kx)
return c;
}
ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
ccl_device_inline int4 hash_int4_2(int4 kx, int4 ky)
{
ssei a, b, c;
a = b = c = ssei(0xdeadbeef + (2 << 2) + 13);
int4 a, b, c;
a = b = c = make_int4(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@ -288,10 +288,10 @@ ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
return c;
}
ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
ccl_device_inline int4 hash_int4_3(int4 kx, int4 ky, int4 kz)
{
ssei a, b, c;
a = b = c = ssei(0xdeadbeef + (3 << 2) + 13);
int4 a, b, c;
a = b = c = make_int4(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@ -301,10 +301,10 @@ ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
return c;
}
ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
ccl_device_inline int4 hash_int4_4(int4 kx, int4 ky, int4 kz, int4 kw)
{
ssei a, b, c;
a = b = c = ssei(0xdeadbeef + (4 << 2) + 13);
int4 a, b, c;
a = b = c = make_int4(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
@ -317,11 +317,11 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
return c;
}
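A minimal illustrative sketch of using the 4-wide hash (hypothetical variables, not part of the patch); each lane is an independent Jenkins lookup3 hash of the corresponding input lane:

  int4 coords = make_int4(x0, x1, x2, x3); /* four lattice coordinates to hash */
  int4 h = hash_int4(coords);              /* four hashes computed in one call */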
# if defined(__KERNEL_AVX__)
ccl_device_inline avxi hash_avxi(avxi kx)
# if defined(__KERNEL_AVX2__)
ccl_device_inline vint8 hash_int8(vint8 kx)
{
avxi a, b, c;
a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
vint8 a, b, c;
a = b = c = make_vint8(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@ -329,10 +329,10 @@ ccl_device_inline avxi hash_avxi(avxi kx)
return c;
}
ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
ccl_device_inline vint8 hash_int8_2(vint8 kx, vint8 ky)
{
avxi a, b, c;
a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
vint8 a, b, c;
a = b = c = make_vint8(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@ -341,10 +341,10 @@ ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
return c;
}
ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
ccl_device_inline vint8 hash_int8_3(vint8 kx, vint8 ky, vint8 kz)
{
avxi a, b, c;
a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
vint8 a, b, c;
a = b = c = make_vint8(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@ -354,10 +354,10 @@ ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
return c;
}
ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
ccl_device_inline vint8 hash_int8_4(vint8 kx, vint8 ky, vint8 kz, vint8 kw)
{
avxi a, b, c;
a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
vint8 a, b, c;
a = b = c = make_vint8(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;

View File

@ -532,12 +532,14 @@ CCL_NAMESPACE_END
#include "util/math_int2.h"
#include "util/math_int3.h"
#include "util/math_int4.h"
#include "util/math_int8.h"
#include "util/math_float2.h"
#include "util/math_float3.h"
#include "util/math_float4.h"
#include "util/math_float8.h"
#include "util/math_float3.h"
#include "util/rect.h"
CCL_NAMESPACE_BEGIN

View File

@ -10,55 +10,6 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline float2 operator-(const float2 &a);
ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
ccl_device_inline float2 operator*(const float2 &a, float f);
ccl_device_inline float2 operator*(float f, const float2 &a);
ccl_device_inline float2 operator/(float f, const float2 &a);
ccl_device_inline float2 operator/(const float2 &a, float f);
ccl_device_inline float2 operator/(const float2 &a, const float2 &b);
ccl_device_inline float2 operator+(const float2 &a, const float f);
ccl_device_inline float2 operator+(const float2 &a, const float2 &b);
ccl_device_inline float2 operator-(const float2 &a, const float f);
ccl_device_inline float2 operator-(const float2 &a, const float2 &b);
ccl_device_inline float2 operator+=(float2 &a, const float2 &b);
ccl_device_inline float2 operator*=(float2 &a, const float2 &b);
ccl_device_inline float2 operator*=(float2 &a, float f);
ccl_device_inline float2 operator/=(float2 &a, const float2 &b);
ccl_device_inline float2 operator/=(float2 &a, float f);
ccl_device_inline bool operator==(const float2 &a, const float2 &b);
ccl_device_inline bool operator!=(const float2 &a, const float2 &b);
ccl_device_inline bool is_zero(const float2 &a);
ccl_device_inline float average(const float2 &a);
ccl_device_inline float distance(const float2 &a, const float2 &b);
ccl_device_inline float dot(const float2 &a, const float2 &b);
ccl_device_inline float cross(const float2 &a, const float2 &b);
ccl_device_inline float len(const float2 a);
ccl_device_inline float2 normalize(const float2 &a);
ccl_device_inline float2 normalize_len(const float2 &a, float *t);
ccl_device_inline float2 safe_normalize(const float2 &a);
ccl_device_inline float2 min(const float2 &a, const float2 &b);
ccl_device_inline float2 max(const float2 &a, const float2 &b);
ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx);
ccl_device_inline float2 fabs(const float2 &a);
ccl_device_inline float2 as_float2(const float4 &a);
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
ccl_device_inline float2 floor(const float2 &a);
#endif /* !__KERNEL_METAL__ */
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
/*******************************************************************************
* Definition.
*/
ccl_device_inline float2 zero_float2()
{
return make_float2(0.0f, 0.0f);
@ -75,63 +26,63 @@ ccl_device_inline float2 operator-(const float2 &a)
return make_float2(-a.x, -a.y);
}
ccl_device_inline float2 operator*(const float2 &a, const float2 &b)
ccl_device_inline float2 operator*(const float2 a, const float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
ccl_device_inline float2 operator*(const float2 &a, float f)
ccl_device_inline float2 operator*(const float2 a, float f)
{
return make_float2(a.x * f, a.y * f);
}
ccl_device_inline float2 operator*(float f, const float2 &a)
ccl_device_inline float2 operator*(float f, const float2 a)
{
return make_float2(a.x * f, a.y * f);
}
ccl_device_inline float2 operator/(float f, const float2 &a)
ccl_device_inline float2 operator/(float f, const float2 a)
{
return make_float2(f / a.x, f / a.y);
}
ccl_device_inline float2 operator/(const float2 &a, float f)
ccl_device_inline float2 operator/(const float2 a, float f)
{
float invf = 1.0f / f;
return make_float2(a.x * invf, a.y * invf);
}
ccl_device_inline float2 operator/(const float2 &a, const float2 &b)
ccl_device_inline float2 operator/(const float2 a, const float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
ccl_device_inline float2 operator+(const float2 &a, const float f)
{
return a + make_float2(f, f);
}
ccl_device_inline float2 operator+(const float2 &a, const float2 &b)
ccl_device_inline float2 operator+(const float2 a, const float2 b)
{
return make_float2(a.x + b.x, a.y + b.y);
}
ccl_device_inline float2 operator-(const float2 &a, const float f)
ccl_device_inline float2 operator+(const float2 a, const float f)
{
return a - make_float2(f, f);
return a + make_float2(f, f);
}
ccl_device_inline float2 operator-(const float2 &a, const float2 &b)
ccl_device_inline float2 operator-(const float2 a, const float2 b)
{
return make_float2(a.x - b.x, a.y - b.y);
}
ccl_device_inline float2 operator+=(float2 &a, const float2 &b)
ccl_device_inline float2 operator-(const float2 a, const float f)
{
return a - make_float2(f, f);
}
ccl_device_inline float2 operator+=(float2 &a, const float2 b)
{
return a = a + b;
}
ccl_device_inline float2 operator*=(float2 &a, const float2 &b)
ccl_device_inline float2 operator*=(float2 &a, const float2 b)
{
return a = a * b;
}
@ -141,7 +92,7 @@ ccl_device_inline float2 operator*=(float2 &a, float f)
return a = a * f;
}
ccl_device_inline float2 operator/=(float2 &a, const float2 &b)
ccl_device_inline float2 operator/=(float2 &a, const float2 b)
{
return a = a / b;
}
@ -152,74 +103,81 @@ ccl_device_inline float2 operator/=(float2 &a, float f)
return a = a * invf;
}
ccl_device_inline bool operator==(const float2 &a, const float2 &b)
ccl_device_inline bool operator==(const float2 a, const float2 b)
{
return (a.x == b.x && a.y == b.y);
}
ccl_device_inline bool operator!=(const float2 &a, const float2 &b)
ccl_device_inline bool operator!=(const float2 a, const float2 b)
{
return !(a == b);
}
ccl_device_inline bool is_zero(const float2 &a)
ccl_device_inline bool is_zero(const float2 a)
{
return (a.x == 0.0f && a.y == 0.0f);
}
ccl_device_inline float average(const float2 &a)
ccl_device_inline float average(const float2 a)
{
return (a.x + a.y) * (1.0f / 2.0f);
}
ccl_device_inline float distance(const float2 &a, const float2 &b)
ccl_device_inline float dot(const float2 a, const float2 b)
{
return a.x * b.x + a.y * b.y;
}
#endif
ccl_device_inline float len(const float2 a)
{
return sqrtf(dot(a, a));
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline float distance(const float2 a, const float2 b)
{
return len(a - b);
}
ccl_device_inline float dot(const float2 &a, const float2 &b)
{
return a.x * b.x + a.y * b.y;
}
ccl_device_inline float cross(const float2 &a, const float2 &b)
ccl_device_inline float cross(const float2 a, const float2 b)
{
return (a.x * b.y - a.y * b.x);
}
ccl_device_inline float2 normalize(const float2 &a)
ccl_device_inline float2 normalize(const float2 a)
{
return a / len(a);
}
ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t)
ccl_device_inline float2 normalize_len(const float2 a, ccl_private float *t)
{
*t = len(a);
return a / (*t);
}
ccl_device_inline float2 safe_normalize(const float2 &a)
ccl_device_inline float2 safe_normalize(const float2 a)
{
float t = len(a);
return (t != 0.0f) ? a / t : a;
}
ccl_device_inline float2 min(const float2 &a, const float2 &b)
ccl_device_inline float2 min(const float2 a, const float2 b)
{
return make_float2(min(a.x, b.x), min(a.y, b.y));
}
ccl_device_inline float2 max(const float2 &a, const float2 &b)
ccl_device_inline float2 max(const float2 a, const float2 b)
{
return make_float2(max(a.x, b.x), max(a.y, b.y));
}
ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx)
ccl_device_inline float2 clamp(const float2 a, const float2 mn, const float2 mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float2 fabs(const float2 &a)
ccl_device_inline float2 fabs(const float2 a)
{
return make_float2(fabsf(a.x), fabsf(a.y));
}
@ -229,28 +187,23 @@ ccl_device_inline float2 as_float2(const float4 &a)
return make_float2(a.x, a.y);
}
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t)
ccl_device_inline float2 interp(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float2 mix(const float2 &a, const float2 &b, float t)
ccl_device_inline float2 mix(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float2 floor(const float2 &a)
ccl_device_inline float2 floor(const float2 a)
{
return make_float2(floorf(a.x), floorf(a.y));
}
#endif /* !__KERNEL_METAL__ */
ccl_device_inline float len(const float2 a)
{
return sqrtf(dot(a, a));
}
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();

View File

@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT3_H__
@ -10,73 +11,6 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 operator-(const float3 &a);
ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
ccl_device_inline float3 operator*(const float3 &a, const float f);
ccl_device_inline float3 operator*(const float f, const float3 &a);
ccl_device_inline float3 operator/(const float f, const float3 &a);
ccl_device_inline float3 operator/(const float3 &a, const float f);
ccl_device_inline float3 operator/(const float3 &a, const float3 &b);
ccl_device_inline float3 operator+(const float3 &a, const float f);
ccl_device_inline float3 operator+(const float3 &a, const float3 &b);
ccl_device_inline float3 operator-(const float3 &a, const float f);
ccl_device_inline float3 operator-(const float3 &a, const float3 &b);
ccl_device_inline float3 operator+=(float3 &a, const float3 &b);
ccl_device_inline float3 operator-=(float3 &a, const float3 &b);
ccl_device_inline float3 operator*=(float3 &a, const float3 &b);
ccl_device_inline float3 operator*=(float3 &a, float f);
ccl_device_inline float3 operator/=(float3 &a, const float3 &b);
ccl_device_inline float3 operator/=(float3 &a, float f);
ccl_device_inline bool operator==(const float3 &a, const float3 &b);
ccl_device_inline bool operator!=(const float3 &a, const float3 &b);
ccl_device_inline float distance(const float3 &a, const float3 &b);
ccl_device_inline float dot(const float3 &a, const float3 &b);
ccl_device_inline float dot_xy(const float3 &a, const float3 &b);
ccl_device_inline float3 cross(const float3 &a, const float3 &b);
ccl_device_inline float3 normalize(const float3 &a);
ccl_device_inline float3 min(const float3 &a, const float3 &b);
ccl_device_inline float3 max(const float3 &a, const float3 &b);
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx);
ccl_device_inline float3 fabs(const float3 &a);
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t);
ccl_device_inline float3 rcp(const float3 &a);
ccl_device_inline float3 sqrt(const float3 &a);
ccl_device_inline float3 floor(const float3 &a);
ccl_device_inline float3 ceil(const float3 &a);
ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
#endif /* !defined(__KERNEL_METAL__) */
ccl_device_inline float reduce_min(float3 a);
ccl_device_inline float reduce_max(float3 a);
ccl_device_inline float len(const float3 a);
ccl_device_inline float len_squared(const float3 a);
ccl_device_inline float3 project(const float3 v, const float3 v_proj);
ccl_device_inline float3 safe_normalize(const float3 a);
ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
ccl_device_inline float3 safe_divide(const float3 a, const float b);
ccl_device_inline float3 interp(float3 a, float3 b, float t);
ccl_device_inline float3 sqr(float3 a);
ccl_device_inline bool is_zero(const float3 a);
ccl_device_inline float reduce_add(const float3 a);
ccl_device_inline float average(const float3 a);
ccl_device_inline bool isequal(const float3 a, const float3 b);
/*******************************************************************************
* Definition.
*/
ccl_device_inline float3 zero_float3()
{
#ifdef __KERNEL_SSE__
@ -109,7 +43,7 @@ ccl_device_inline float3 operator-(const float3 &a)
# endif
}
ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
ccl_device_inline float3 operator*(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
@ -118,7 +52,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float3 operator*(const float3 &a, const float f)
ccl_device_inline float3 operator*(const float3 a, const float f)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
@ -127,7 +61,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float f)
# endif
}
ccl_device_inline float3 operator*(const float f, const float3 &a)
ccl_device_inline float3 operator*(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
@ -136,7 +70,7 @@ ccl_device_inline float3 operator*(const float f, const float3 &a)
# endif
}
ccl_device_inline float3 operator/(const float f, const float3 &a)
ccl_device_inline float3 operator/(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
@ -145,7 +79,7 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
# endif
}
ccl_device_inline float3 operator/(const float3 &a, const float f)
ccl_device_inline float3 operator/(const float3 a, const float f)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
@ -154,7 +88,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
# endif
}
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
ccl_device_inline float3 operator/(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
@ -163,12 +97,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float3 operator+(const float3 &a, const float f)
{
return a + make_float3(f, f, f);
}
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
ccl_device_inline float3 operator+(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
@ -177,12 +106,12 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float3 operator-(const float3 &a, const float f)
ccl_device_inline float3 operator+(const float3 a, const float f)
{
return a - make_float3(f, f, f);
return a + make_float3(f, f, f);
}
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
ccl_device_inline float3 operator-(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
@ -191,17 +120,22 @@ ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
ccl_device_inline float3 operator-(const float3 a, const float f)
{
return a - make_float3(f, f, f);
}
ccl_device_inline float3 operator+=(float3 &a, const float3 b)
{
return a = a + b;
}
ccl_device_inline float3 operator-=(float3 &a, const float3 &b)
ccl_device_inline float3 operator-=(float3 &a, const float3 b)
{
return a = a - b;
}
ccl_device_inline float3 operator*=(float3 &a, const float3 &b)
ccl_device_inline float3 operator*=(float3 &a, const float3 b)
{
return a = a * b;
}
@ -211,7 +145,7 @@ ccl_device_inline float3 operator*=(float3 &a, float f)
return a = a * f;
}
ccl_device_inline float3 operator/=(float3 &a, const float3 &b)
ccl_device_inline float3 operator/=(float3 &a, const float3 b)
{
return a = a / b;
}
@ -223,7 +157,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
}
# if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 b)
{
a = float3(a) * b;
return a;
@ -235,7 +169,7 @@ ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
return a;
}
ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 b)
{
a = float3(a) / b;
return a;
@ -248,7 +182,7 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
}
# endif
ccl_device_inline bool operator==(const float3 &a, const float3 &b)
ccl_device_inline bool operator==(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@ -257,17 +191,12 @@ ccl_device_inline bool operator==(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
ccl_device_inline bool operator!=(const float3 a, const float3 b)
{
return !(a == b);
}
ccl_device_inline float distance(const float3 &a, const float3 &b)
{
return len(a - b);
}
ccl_device_inline float dot(const float3 &a, const float3 &b)
ccl_device_inline float dot(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@ -276,26 +205,62 @@ ccl_device_inline float dot(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
#endif
ccl_device_inline float dot_xy(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
# else
#else
return a.x * b.x + a.y * b.y;
# endif
#endif
}
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
ccl_device_inline float len(const float3 a)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
#else
return sqrtf(dot(a, a));
#endif
}
ccl_device_inline float reduce_min(float3 a)
{
return min(min(a.x, a.y), a.z);
}
ccl_device_inline float reduce_max(float3 a)
{
return max(max(a.x, a.y), a.z);
}
ccl_device_inline float len_squared(const float3 a)
{
return dot(a, a);
}
#ifndef __KERNEL_METAL__
ccl_device_inline float distance(const float3 a, const float3 b)
{
return len(a - b);
}
ccl_device_inline float3 cross(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(shuffle<1, 2, 0, 3>(
msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
const float4 x = float4(a.m128);
const float4 y = shuffle<1, 2, 0, 3>(float4(b.m128));
const float4 z = float4(_mm_mul_ps(shuffle<1, 2, 0, 3>(float4(a.m128)), float4(b.m128)));
return float3(shuffle<1, 2, 0, 3>(msub(x, y, z)).m128);
# else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
# endif
}
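The SSE path uses the standard shuffle identity for the cross product (not spelled out in the patch): writing s(v) = (v.y, v.z, v.x),

  cross(a, b) = s(a * s(b) - s(a) * b)

since lane i of a * s(b) - s(a) * b holds component (i + 2) mod 3 of the cross product, and the final shuffle<1, 2, 0, 3> rotates it back into place.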
ccl_device_inline float3 normalize(const float3 &a)
ccl_device_inline float3 normalize(const float3 a)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@ -305,7 +270,7 @@ ccl_device_inline float3 normalize(const float3 &a)
# endif
}
ccl_device_inline float3 min(const float3 &a, const float3 &b)
ccl_device_inline float3 min(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
@ -314,7 +279,7 @@ ccl_device_inline float3 min(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float3 max(const float3 &a, const float3 &b)
ccl_device_inline float3 max(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
@ -323,12 +288,12 @@ ccl_device_inline float3 max(const float3 &a, const float3 &b)
# endif
}
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
ccl_device_inline float3 clamp(const float3 a, const float3 mn, const float3 mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float3 fabs(const float3 &a)
ccl_device_inline float3 fabs(const float3 a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
@ -342,7 +307,7 @@ ccl_device_inline float3 fabs(const float3 &a)
# endif
}
ccl_device_inline float3 sqrt(const float3 &a)
ccl_device_inline float3 sqrt(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
@ -351,7 +316,7 @@ ccl_device_inline float3 sqrt(const float3 &a)
# endif
}
ccl_device_inline float3 floor(const float3 &a)
ccl_device_inline float3 floor(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
@ -360,7 +325,7 @@ ccl_device_inline float3 floor(const float3 &a)
# endif
}
ccl_device_inline float3 ceil(const float3 &a)
ccl_device_inline float3 ceil(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
@ -369,12 +334,12 @@ ccl_device_inline float3 ceil(const float3 &a)
# endif
}
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
ccl_device_inline float3 mix(const float3 a, const float3 b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float3 rcp(const float3 &a)
ccl_device_inline float3 rcp(const float3 a)
{
# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
@ -399,33 +364,6 @@ ccl_device_inline float3 log(float3 v)
return make_float3(logf(v.x), logf(v.y), logf(v.z));
}
#endif /* !__KERNEL_METAL__ */
ccl_device_inline float reduce_min(float3 a)
{
return min(min(a.x, a.y), a.z);
}
ccl_device_inline float reduce_max(float3 a)
{
return max(max(a.x, a.y), a.z);
}
ccl_device_inline float len(const float3 a)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
#else
return sqrtf(dot(a, a));
#endif
}
ccl_device_inline float len_squared(const float3 a)
{
return dot(a, a);
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
{
float3 unit_normal = normalize(normal);

View File

@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT4_H__
@ -10,85 +11,6 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a);
ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
ccl_device_inline float4 operator*(const float4 &a, float f);
ccl_device_inline float4 operator*(float f, const float4 &a);
ccl_device_inline float4 operator/(const float4 &a, float f);
ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
ccl_device_inline float4 operator+(const float4 &a, const float f);
ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
ccl_device_inline float4 operator-(const float4 &a, const float f);
ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
ccl_device_inline float4 operator*=(float4 &a, float f);
ccl_device_inline float4 operator/=(float4 &a, float f);
ccl_device_inline int4 operator<(const float4 &a, const float4 &b);
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
ccl_device_inline bool operator==(const float4 &a, const float4 &b);
ccl_device_inline float distance(const float4 &a, const float4 &b);
ccl_device_inline float dot(const float4 &a, const float4 &b);
ccl_device_inline float len_squared(const float4 &a);
ccl_device_inline float4 rcp(const float4 &a);
ccl_device_inline float4 sqrt(const float4 &a);
ccl_device_inline float4 sqr(const float4 &a);
ccl_device_inline float4 cross(const float4 &a, const float4 &b);
ccl_device_inline bool is_zero(const float4 &a);
ccl_device_inline float average(const float4 &a);
ccl_device_inline float len(const float4 &a);
ccl_device_inline float4 normalize(const float4 &a);
ccl_device_inline float4 safe_normalize(const float4 &a);
ccl_device_inline float4 min(const float4 &a, const float4 &b);
ccl_device_inline float4 max(const float4 &a, const float4 &b);
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
ccl_device_inline float4 fabs(const float4 &a);
ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
#endif /* !__KERNEL_METAL__*/
ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
ccl_device_inline float4 safe_divide(const float4 a, const float b);
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b);
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &a, const float4 &b);
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b);
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b);
template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b);
# ifdef __KERNEL_SSE3__
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b);
template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
# endif
#endif /* __KERNEL_SSE__ */
ccl_device_inline float reduce_min(const float4 a);
ccl_device_inline float reduce_max(const float4 a);
ccl_device_inline float reduce_add(const float4 a);
ccl_device_inline bool isequal(const float4 a, const float4 b);
#ifndef __KERNEL_GPU__
ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
#endif /* !__KERNEL_GPU__ */
/*******************************************************************************
* Definition.
*/
ccl_device_inline float4 zero_float4()
{
#ifdef __KERNEL_SSE__
@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
ccl_device_inline int4 cast(const float4 a)
{
#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(a));
#else
return make_int4(
__float_as_int(a.x), __float_as_int(a.y), __float_as_int(a.z), __float_as_int(a.w));
#endif
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a)
{
@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a)
# endif
}
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
ccl_device_inline float4 operator*(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float4 operator*(const float4 &a, float f)
ccl_device_inline float4 operator*(const float4 a, float f)
{
# if defined(__KERNEL_SSE__)
return a * make_float4(f);
@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f)
# endif
}
ccl_device_inline float4 operator*(float f, const float4 &a)
ccl_device_inline float4 operator*(float f, const float4 a)
{
return a * f;
}
ccl_device_inline float4 operator/(const float4 &a, float f)
ccl_device_inline float4 operator/(const float4 a, float f)
{
return a * (1.0f / f);
}
ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
ccl_device_inline float4 operator/(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float4 operator+(const float4 &a, const float f)
{
return a + make_float4(f, f, f, f);
}
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
ccl_device_inline float4 operator+(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float4 operator-(const float4 &a, const float f)
ccl_device_inline float4 operator+(const float4 a, const float f)
{
return a - make_float4(f, f, f, f);
return a + make_float4(f);
}
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
ccl_device_inline float4 operator-(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
ccl_device_inline float4 operator-(const float4 a, const float f)
{
return a - make_float4(f);
}
ccl_device_inline float4 operator+=(float4 &a, const float4 b)
{
return a = a + b;
}
ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
ccl_device_inline float4 operator-=(float4 &a, const float4 b)
{
return a = a - b;
}
ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
ccl_device_inline float4 operator*=(float4 &a, const float4 b)
{
return a = a * b;
}
@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
return a = a / f;
}
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
ccl_device_inline int4 operator<(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
ccl_device_inline int4 operator>=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
ccl_device_inline int4 operator<=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
ccl_device_inline bool operator==(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@ -240,95 +172,19 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float distance(const float4 &a, const float4 &b)
{
return len(a - b);
}
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
}
ccl_device_inline float len_squared(const float4 &a)
{
return dot(a, a);
}
ccl_device_inline float4 rcp(const float4 &a)
ccl_device_inline const float4 operator^(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
return float4(_mm_xor_ps(a.m128, b.m128));
# else
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)),
__uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)),
__uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)),
__uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w)));
# endif
}
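Illustrative use (not from the patch): XOR on the float bit patterns is typically used for cheap sign manipulation, e.g. a ^ make_float4(-0.0f) flips the sign of every lane, since -0.0f has only the sign bit set.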
ccl_device_inline float4 sqrt(const float4 &a)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sqrt_ps(a.m128));
# else
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
# endif
}
ccl_device_inline float4 sqr(const float4 &a)
{
return a * a;
}
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
{
# ifdef __KERNEL_SSE__
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
# else
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
# endif
}
ccl_device_inline bool is_zero(const float4 &a)
{
# ifdef __KERNEL_SSE__
return a == zero_float4();
# else
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
# endif
}
ccl_device_inline float average(const float4 &a)
{
return reduce_add(a) * 0.25f;
}
ccl_device_inline float len(const float4 &a)
{
return sqrtf(dot(a, a));
}
ccl_device_inline float4 normalize(const float4 &a)
{
return a / len(a);
}
ccl_device_inline float4 safe_normalize(const float4 &a)
{
float t = len(a);
return (t != 0.0f) ? a / t : a;
}
ccl_device_inline float4 min(const float4 &a, const float4 &b)
ccl_device_inline float4 min(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_min_ps(a.m128, b.m128));
@ -337,7 +193,7 @@ ccl_device_inline float4 min(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float4 max(const float4 &a, const float4 &b)
ccl_device_inline float4 max(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_max_ps(a.m128, b.m128));
@ -346,55 +202,119 @@ ccl_device_inline float4 max(const float4 &a, const float4 &b)
# endif
}
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float4 fabs(const float4 &a)
{
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
}
ccl_device_inline float4 floor(const float4 &a)
{
# ifdef __KERNEL_SSE__
return float4(_mm_floor_ps(a));
# else
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
# endif
}
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float4 saturate(const float4 &a)
{
return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
}
ccl_device_inline float4 exp(float4 v)
{
return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
}
ccl_device_inline float4 log(float4 v)
{
return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
}
#endif /* !__KERNEL_METAL__*/
ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
{
#ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
return float4(vfmaq_f32(c, a, b));
# elif defined(__KERNEL_AVX2__)
return float4(_mm_fmadd_ps(a, b, c));
# else
return a * b + c;
# endif
#else
return a * b + c;
#endif
}
ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
{
#ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
return float4(vfmaq_f32(vnegq_f32(c), a, b));
# elif defined(__KERNEL_AVX2__)
return float4(_mm_fmsub_ps(a, b, c));
# else
return a * b - c;
# endif
#else
return a * b - c;
#endif
}
#ifdef __KERNEL_SSE__
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const float4 shuffle(const float4 b)
{
# ifdef __KERNEL_NEON__
return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
# else
return float4(
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
{
return float4(_mm_movelh_ps(a, a));
}
template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
{
return float4(_mm_movehl_ps(a, a));
}
# ifdef __KERNEL_SSE3__
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
{
return float4(_mm_moveldup_ps(b));
}
template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
{
return float4(_mm_movehdup_ps(b));
}
# endif /* __KERNEL_SSE3__ */
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const float4 shuffle(const float4 a, const float4 b)
{
# ifdef __KERNEL_NEON__
return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
# else
return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<size_t i0> __forceinline const float4 shuffle(const float4 b)
{
return shuffle<i0, i0, i0, i0>(b);
}
template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
{
# ifdef __KERNEL_NEON__
return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
# else
return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
return float4(_mm_movelh_ps(a, b));
}
template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
return float4(_mm_movehl_ps(b, a));
}
template<size_t i> __forceinline float extract(const float4 a)
{
return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
template<> __forceinline float extract<0>(const float4 a)
{
return _mm_cvtss_f32(a);
}
#endif
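Quick reference (not part of the patch): the template arguments follow the lane order of the result, e.g. for a = (1, 2, 3, 4), shuffle<0, 1, 0, 1>(a) yields (1, 2, 1, 2) and extract<2>(a) returns 3.0f; the two-argument form takes its low two lanes from a and its high two lanes from b.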
ccl_device_inline float reduce_add(const float4 a)
{
#if defined(__KERNEL_SSE__)
@ -440,6 +360,166 @@ ccl_device_inline float reduce_max(const float4 a)
#endif
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline float dot(const float4 a, const float4 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
}
#endif /* !defined(__KERNEL_METAL__) */
ccl_device_inline float len(const float4 a)
{
return sqrtf(dot(a, a));
}
ccl_device_inline float len_squared(const float4 a)
{
return dot(a, a);
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline float distance(const float4 a, const float4 b)
{
return len(a - b);
}
ccl_device_inline float4 rcp(const float4 a)
{
# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
# else
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
# endif
}
ccl_device_inline float4 sqrt(const float4 a)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sqrt_ps(a.m128));
# else
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
# endif
}
ccl_device_inline float4 sqr(const float4 a)
{
return a * a;
}
ccl_device_inline float4 cross(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
# else
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
# endif
}
ccl_device_inline bool is_zero(const float4 a)
{
# ifdef __KERNEL_SSE__
return a == zero_float4();
# else
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
# endif
}
ccl_device_inline float average(const float4 a)
{
return reduce_add(a) * 0.25f;
}
ccl_device_inline float4 normalize(const float4 a)
{
return a / len(a);
}
ccl_device_inline float4 safe_normalize(const float4 a)
{
float t = len(a);
return (t != 0.0f) ? a / t : a;
}
ccl_device_inline float4 fabs(const float4 a)
{
# if defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
}
ccl_device_inline float4 floor(const float4 a)
{
# ifdef __KERNEL_SSE__
# if defined(__KERNEL_NEON__)
return float4(vrndmq_f32(a));
# else
return float4(_mm_floor_ps(a));
# endif
# else
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
# endif
}
ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
{
# ifdef __KERNEL_SSE__
const float4 f = floor(x);
*i = int4(_mm_cvttps_epi32(f.m128));
return x - f;
# else
float4 r;
r.x = floorfrac(x.x, &i->x);
r.y = floorfrac(x.y, &i->y);
r.z = floorfrac(x.z, &i->z);
r.w = floorfrac(x.w, &i->w);
return r;
# endif
}
ccl_device_inline float4 mix(const float4 a, const float4 b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t)
{
return a + t * (b - a);
}
ccl_device_inline float4 saturate(const float4 a)
{
return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
}
ccl_device_inline float4 exp(float4 v)
{
return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
}
ccl_device_inline float4 log(float4 v)
{
return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
}
#endif /* !__KERNEL_METAL__*/
ccl_device_inline bool isequal(const float4 a, const float4 b)
{
#if defined(__KERNEL_METAL__)
@ -449,68 +529,23 @@ ccl_device_inline bool isequal(const float4 a, const float4 b)
#endif
}
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
{
# if defined(__KERNEL_NEON__)
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
# else
return float4(_mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
# endif
}
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_NEON__)
return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
# else
return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
{
return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b)
{
return float4(_mm_movelh_ps(a.m128, b.m128));
}
template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b)
{
return float4(_mm_movehl_ps(b.m128, a.m128));
}
# ifdef __KERNEL_SSE3__
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
{
return float4(_mm_moveldup_ps(b));
}
template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
{
return float4(_mm_movehdup_ps(b));
}
# endif /* __KERNEL_SSE3__ */
#endif /* __KERNEL_SSE__ */
#ifndef __KERNEL_GPU__
ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_SSE41__
return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
# else
return float4(
_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
# endif
# else
return make_float4(
(mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
# endif
}
ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
ccl_device_inline float4 mask(const int4 mask, const float4 a)
{
/* Replace elements of x with zero where mask isn't set. */
return select(mask, a, zero_float4());

View File

@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Copyright 2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT8_H__
@ -10,193 +11,138 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
ccl_device_inline float8_t operator+(const float8_t a, const float f);
ccl_device_inline float8_t operator+(const float f, const float8_t a);
ccl_device_inline float8_t operator-(const float8_t a);
ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
ccl_device_inline float8_t operator-(const float8_t a, const float f);
ccl_device_inline float8_t operator-(const float f, const float8_t a);
ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
ccl_device_inline float8_t operator*(const float8_t a, const float f);
ccl_device_inline float8_t operator*(const float f, const float8_t a);
ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
ccl_device_inline float8_t operator/(const float8_t a, float f);
ccl_device_inline float8_t operator/(const float f, const float8_t a);
ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
ccl_device_inline float8_t operator*=(float8_t a, float f);
ccl_device_inline float8_t operator/=(float8_t a, float f);
ccl_device_inline bool operator==(const float8_t a, const float8_t b);
ccl_device_inline float8_t rcp(const float8_t a);
ccl_device_inline float8_t sqrt(const float8_t a);
ccl_device_inline float8_t sqr(const float8_t a);
ccl_device_inline bool is_zero(const float8_t a);
ccl_device_inline float average(const float8_t a);
ccl_device_inline float8_t min(const float8_t a, const float8_t b);
ccl_device_inline float8_t max(const float8_t a, const float8_t b);
ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
ccl_device_inline float8_t fabs(const float8_t a);
ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
ccl_device_inline float8_t saturate(const float8_t a);
ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
ccl_device_inline float reduce_min(const float8_t a);
ccl_device_inline float reduce_max(const float8_t a);
ccl_device_inline float reduce_add(const float8_t a);
ccl_device_inline bool isequal(const float8_t a, const float8_t b);
/*******************************************************************************
* Definition.
*/
ccl_device_inline float8_t zero_float8_t()
ccl_device_inline vfloat8 zero_vfloat8()
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_setzero_ps());
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_setzero_ps());
#else
return make_float8_t(0.0f);
return make_vfloat8(0.0f);
#endif
}
ccl_device_inline float8_t one_float8_t()
ccl_device_inline vfloat8 one_vfloat8()
{
return make_float8_t(1.0f);
return make_vfloat8(1.0f);
}
ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_add_ps(a.m256, b.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_add_ps(a.m256, b.m256));
#else
return make_float8_t(
return make_vfloat8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
#endif
}
ccl_device_inline float8_t operator+(const float8_t a, const float f)
ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f)
{
return a + make_float8_t(f);
return a + make_vfloat8(f);
}
ccl_device_inline float8_t operator+(const float f, const float8_t a)
ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a)
{
return make_float8_t(f) + a;
return make_vfloat8(f) + a;
}
ccl_device_inline float8_t operator-(const float8_t a)
ccl_device_inline vfloat8 operator-(const vfloat8 a)
{
#ifdef __KERNEL_AVX2__
#ifdef __KERNEL_AVX__
__m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
return float8_t(_mm256_xor_ps(a.m256, mask));
return vfloat8(_mm256_xor_ps(a.m256, mask));
#else
return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
#endif
}
ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_sub_ps(a.m256, b.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_sub_ps(a.m256, b.m256));
#else
return make_float8_t(
return make_vfloat8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
#endif
}
ccl_device_inline float8_t operator-(const float8_t a, const float f)
ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f)
{
return a - make_float8_t(f);
return a - make_vfloat8(f);
}
ccl_device_inline float8_t operator-(const float f, const float8_t a)
ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a)
{
return make_float8_t(f) - a;
return make_vfloat8(f) - a;
}
ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_mul_ps(a.m256, b.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_mul_ps(a.m256, b.m256));
#else
return make_float8_t(
return make_vfloat8(
a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
#endif
}
ccl_device_inline float8_t operator*(const float8_t a, const float f)
ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f)
{
return a * make_float8_t(f);
return a * make_vfloat8(f);
}
ccl_device_inline float8_t operator*(const float f, const float8_t a)
ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a)
{
return make_float8_t(f) * a;
return make_vfloat8(f) * a;
}
ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_div_ps(a.m256, b.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_div_ps(a.m256, b.m256));
#else
return make_float8_t(
return make_vfloat8(
a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
#endif
}
ccl_device_inline float8_t operator/(const float8_t a, const float f)
ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f)
{
return a / make_float8_t(f);
return a / make_vfloat8(f);
}
ccl_device_inline float8_t operator/(const float f, const float8_t a)
ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a)
{
return make_float8_t(f) / a;
return make_vfloat8(f) / a;
}
ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator+=(vfloat8 a, const vfloat8 b)
{
return a = a + b;
}
ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator-=(vfloat8 a, const vfloat8 b)
{
return a = a - b;
}
ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
ccl_device_inline vfloat8 operator*=(vfloat8 a, const vfloat8 b)
{
return a = a * b;
}
ccl_device_inline float8_t operator*=(float8_t a, float f)
ccl_device_inline vfloat8 operator*=(vfloat8 a, float f)
{
return a = a * f;
}
ccl_device_inline float8_t operator/=(float8_t a, float f)
ccl_device_inline vfloat8 operator/=(vfloat8 a, float f)
{
return a = a / f;
}
ccl_device_inline bool operator==(const float8_t a, const float8_t b)
ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
#ifdef __KERNEL_AVX__
return (_mm256_movemask_ps(_mm256_castsi256_ps(
_mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
0b11111111) == 0b11111111;
@ -206,132 +152,180 @@ ccl_device_inline bool operator==(const float8_t a, const float8_t b)
#endif
}
ccl_device_inline float8_t rcp(const float8_t a)
ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_rcp_ps(a.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_xor_ps(a.m256, b.m256));
#else
return make_float8_t(1.0f / a.a,
1.0f / a.b,
1.0f / a.c,
1.0f / a.d,
1.0f / a.e,
1.0f / a.f,
1.0f / a.g,
1.0f / a.h);
return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)),
__uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)),
__uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)),
__uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)),
__uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)),
__uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)),
__uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)),
__uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h)));
#endif
}
ccl_device_inline float8_t sqrt(const float8_t a)
ccl_device_inline vfloat8 rcp(const vfloat8 a)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_sqrt_ps(a.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_rcp_ps(a.m256));
#else
return make_float8_t(sqrtf(a.a),
sqrtf(a.b),
sqrtf(a.c),
sqrtf(a.d),
sqrtf(a.e),
sqrtf(a.f),
sqrtf(a.g),
sqrtf(a.h));
return make_vfloat8(1.0f / a.a,
1.0f / a.b,
1.0f / a.c,
1.0f / a.d,
1.0f / a.e,
1.0f / a.f,
1.0f / a.g,
1.0f / a.h);
#endif
}
ccl_device_inline float8_t sqr(const float8_t a)
ccl_device_inline vfloat8 sqrt(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_sqrt_ps(a.m256));
#else
return make_vfloat8(sqrtf(a.a),
sqrtf(a.b),
sqrtf(a.c),
sqrtf(a.d),
sqrtf(a.e),
sqrtf(a.f),
sqrtf(a.g),
sqrtf(a.h));
#endif
}
ccl_device_inline vfloat8 sqr(const vfloat8 a)
{
return a * a;
}
ccl_device_inline bool is_zero(const float8_t a)
ccl_device_inline bool is_zero(const vfloat8 a)
{
return a == make_float8_t(0.0f);
return a == make_vfloat8(0.0f);
}
ccl_device_inline float average(const float8_t a)
ccl_device_inline float reduce_add(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
vfloat8 b(_mm256_hadd_ps(a.m256, a.m256));
vfloat8 h(_mm256_hadd_ps(b.m256, b.m256));
return h[0] + h[4];
#else
return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
#endif
}
ccl_device_inline float average(const vfloat8 a)
{
return reduce_add(a) / 8.0f;
}
ccl_device_inline float8_t min(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_min_ps(a.m256, b.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_min_ps(a.m256, b.m256));
#else
return make_float8_t(min(a.a, b.a),
min(a.b, b.b),
min(a.c, b.c),
min(a.d, b.d),
min(a.e, b.e),
min(a.f, b.f),
min(a.g, b.g),
min(a.h, b.h));
return make_vfloat8(min(a.a, b.a),
min(a.b, b.b),
min(a.c, b.c),
min(a.d, b.d),
min(a.e, b.e),
min(a.f, b.f),
min(a.g, b.g),
min(a.h, b.h));
#endif
}
ccl_device_inline float8_t max(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_max_ps(a.m256, b.m256));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_max_ps(a.m256, b.m256));
#else
return make_float8_t(max(a.a, b.a),
max(a.b, b.b),
max(a.c, b.c),
max(a.d, b.d),
max(a.e, b.e),
max(a.f, b.f),
max(a.g, b.g),
max(a.h, b.h));
return make_vfloat8(max(a.a, b.a),
max(a.b, b.b),
max(a.c, b.c),
max(a.d, b.d),
max(a.e, b.e),
max(a.f, b.f),
max(a.g, b.g),
max(a.h, b.h));
#endif
}
ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float8_t fabs(const float8_t a)
ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask)));
#else
return make_float8_t(fabsf(a.a),
fabsf(a.b),
fabsf(a.c),
fabsf(a.d),
fabsf(a.e),
fabsf(a.f),
fabsf(a.g),
fabsf(a.h));
return make_vfloat8((mask.a) ? a.a : b.a,
(mask.b) ? a.b : b.b,
(mask.c) ? a.c : b.c,
(mask.d) ? a.d : b.d,
(mask.e) ? a.e : b.e,
(mask.f) ? a.f : b.f,
(mask.g) ? a.g : b.g,
(mask.h) ? a.h : b.h);
#endif
}
ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
ccl_device_inline vfloat8 fabs(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
#else
return make_vfloat8(fabsf(a.a),
fabsf(a.b),
fabsf(a.c),
fabsf(a.d),
fabsf(a.e),
fabsf(a.f),
fabsf(a.g),
fabsf(a.h));
#endif
}
ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float8_t saturate(const float8_t a)
ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t)
{
return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
return a + t * (b - a);
}
ccl_device_inline float8_t exp(float8_t v)
ccl_device_inline vfloat8 saturate(const vfloat8 a)
{
return make_float8_t(
return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f));
}
ccl_device_inline vfloat8 exp(vfloat8 v)
{
return make_vfloat8(
expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
}
ccl_device_inline float8_t log(float8_t v)
ccl_device_inline vfloat8 log(vfloat8 v)
{
return make_float8_t(
return make_vfloat8(
logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
}
ccl_device_inline float dot(const float8_t a, const float8_t b)
ccl_device_inline float dot(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX2__
float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
#ifdef __KERNEL_AVX__
vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
return t[0] + t[4];
#else
return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
@ -339,62 +333,51 @@ ccl_device_inline float dot(const float8_t a, const float8_t b)
#endif
}
ccl_device_inline float8_t pow(float8_t v, float e)
ccl_device_inline vfloat8 pow(vfloat8 v, float e)
{
return make_float8_t(powf(v.a, e),
powf(v.b, e),
powf(v.c, e),
powf(v.d, e),
powf(v.e, e),
powf(v.f, e),
powf(v.g, e),
powf(v.h, e));
return make_vfloat8(powf(v.a, e),
powf(v.b, e),
powf(v.c, e),
powf(v.d, e),
powf(v.e, e),
powf(v.f, e),
powf(v.g, e),
powf(v.h, e));
}
ccl_device_inline float reduce_min(const float8_t a)
ccl_device_inline float reduce_min(const vfloat8 a)
{
return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
}
ccl_device_inline float reduce_max(const float8_t a)
ccl_device_inline float reduce_max(const vfloat8 a)
{
return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
}
ccl_device_inline float reduce_add(const float8_t a)
{
#ifdef __KERNEL_AVX2__
float8_t b(_mm256_hadd_ps(a.m256, a.m256));
float8_t h(_mm256_hadd_ps(b.m256, b.m256));
return h[0] + h[4];
#else
return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
#endif
}
ccl_device_inline bool isequal(const float8_t a, const float8_t b)
ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b)
{
return a == b;
}
ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b)
{
return (b != 0.0f) ? a / b : make_float8_t(0.0f);
return (b != 0.0f) ? a / b : make_vfloat8(0.0f);
}
ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b)
{
return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
(b.b != 0.0f) ? a.b / b.b : 0.0f,
(b.c != 0.0f) ? a.c / b.c : 0.0f,
(b.d != 0.0f) ? a.d / b.d : 0.0f,
(b.e != 0.0f) ? a.e / b.e : 0.0f,
(b.f != 0.0f) ? a.f / b.f : 0.0f,
(b.g != 0.0f) ? a.g / b.g : 0.0f,
(b.h != 0.0f) ? a.h / b.h : 0.0f);
return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f,
(b.b != 0.0f) ? a.b / b.b : 0.0f,
(b.c != 0.0f) ? a.c / b.c : 0.0f,
(b.d != 0.0f) ? a.d / b.d : 0.0f,
(b.e != 0.0f) ? a.e / b.e : 0.0f,
(b.f != 0.0f) ? a.f / b.f : 0.0f,
(b.g != 0.0f) ? a.g / b.g : 0.0f,
(b.h != 0.0f) ? a.h / b.h : 0.0f);
}
ccl_device_inline float8_t ensure_finite(float8_t v)
ccl_device_inline vfloat8 ensure_finite(vfloat8 v)
{
v.a = ensure_finite(v.a);
v.b = ensure_finite(v.b);
@ -408,12 +391,92 @@ ccl_device_inline float8_t ensure_finite(float8_t v)
return v;
}
ccl_device_inline bool isfinite_safe(float8_t v)
ccl_device_inline bool isfinite_safe(vfloat8 v)
{
return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
}
ccl_device_inline vint8 cast(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_castps_si256(a));
#else
return make_vint8(__float_as_int(a.a),
__float_as_int(a.b),
__float_as_int(a.c),
__float_as_int(a.d),
__float_as_int(a.e),
__float_as_int(a.f),
__float_as_int(a.g),
__float_as_int(a.h));
#endif
}
#ifdef __KERNEL_SSE__
ccl_device_forceinline float4 low(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
return float4(_mm256_extractf128_ps(a.m256, 0));
# else
return make_float4(a.e, a.f, a.g, a.h);
# endif
}
ccl_device_forceinline float4 high(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
return float4(_mm256_extractf128_ps(a.m256, 1));
# else
return make_float4(a.a, a.b, a.c, a.d);
# endif
}
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)));
# else
return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]);
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
{
# ifdef __KERNEL_AVX__
return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# else
return make_vfloat8(shuffle<i0, i1, i2, i3>(high(a), high(b)),
shuffle<i0, i1, i2, i3>(low(a), low(b)));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
{
return shuffle<i0, i1, i2, i3>(a, a);
}
template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
{
return shuffle<i0, i0, i0, i0>(a, b);
}
template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
{
return shuffle<i0>(a, a);
}
template<size_t i> ccl_device_forceinline float extract(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
__m256 b = shuffle<i, i, i, i>(a).m256;
return _mm256_cvtss_f32(b);
# else
return a[i];
# endif
}
#endif
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT8_H__ */
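Note (illustrative, not part of the change): with the renaming done, vfloat8 mirrors the float4 header. A small usage sketch of the 8-wide arithmetic and reductions defined above:
/* Illustrative only: per-lane lerp of two 8-wide values, then horizontal reductions. */
const vfloat8 a = make_vfloat8(1.0f);
const vfloat8 b = make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
const vfloat8 m = mix(a, b, 0.5f);  /* (0.5, 1.0, 1.5, ..., 4.0) */
const float sum = reduce_add(m);    /* adds all eight lanes */
const float avg = average(m);       /* reduce_add(m) / 8 */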


@ -10,23 +10,6 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b);
ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
#endif /* !__KERNEL_METAL__ */
/*******************************************************************************
* Definition.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b)
{


@ -10,21 +10,6 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b);
ccl_device_inline int3 max(int3 a, int3 b);
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
#endif /* !defined(__KERNEL_METAL__) */
/*******************************************************************************
* Definition.
*/
#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b)
{
@ -44,7 +29,7 @@ ccl_device_inline int3 max(int3 a, int3 b)
# endif
}
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
ccl_device_inline int3 clamp(const int3 a, int mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
@ -53,7 +38,7 @@ ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
# endif
}
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
ccl_device_inline int3 clamp(const int3 a, int3 &mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
@ -62,22 +47,22 @@ ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
# endif
}
ccl_device_inline bool operator==(const int3 &a, const int3 &b)
ccl_device_inline bool operator==(const int3 a, const int3 b)
{
return a.x == b.x && a.y == b.y && a.z == b.z;
}
ccl_device_inline bool operator!=(const int3 &a, const int3 &b)
ccl_device_inline bool operator!=(const int3 a, const int3 b)
{
return !(a == b);
}
ccl_device_inline bool operator<(const int3 &a, const int3 &b)
ccl_device_inline bool operator<(const int3 a, const int3 b)
{
return a.x < b.x && a.y < b.y && a.z < b.z;
}
ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
ccl_device_inline int3 operator+(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
@ -86,7 +71,7 @@ ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
# endif
}
ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
ccl_device_inline int3 operator-(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));


@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_INT4_H__
@ -10,30 +11,8 @@
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
#ifndef __KERNEL_GPU__
ccl_device_inline int4 operator+(const int4 &a, const int4 &b);
ccl_device_inline int4 operator+=(int4 &a, const int4 &b);
ccl_device_inline int4 operator>>(const int4 &a, int i);
ccl_device_inline int4 operator<<(const int4 &a, int i);
ccl_device_inline int4 operator<(const int4 &a, const int4 &b);
ccl_device_inline int4 operator>=(const int4 &a, const int4 &b);
ccl_device_inline int4 operator&(const int4 &a, const int4 &b);
ccl_device_inline int4 min(int4 a, int4 b);
ccl_device_inline int4 max(int4 a, int4 b);
ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx);
ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b);
#endif /* __KERNEL_GPU__ */
/*******************************************************************************
* Definition.
*/
#ifndef __KERNEL_GPU__
ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
ccl_device_inline int4 operator+(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_add_epi32(a.m128, b.m128));
@ -42,12 +21,26 @@ ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
# endif
}
ccl_device_inline int4 operator+=(int4 &a, const int4 &b)
ccl_device_inline int4 operator+=(int4 &a, const int4 b)
{
return a = a + b;
}
ccl_device_inline int4 operator>>(const int4 &a, int i)
ccl_device_inline int4 operator-(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_sub_epi32(a.m128, b.m128));
# else
return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
# endif
}
ccl_device_inline int4 operator-=(int4 &a, const int4 b)
{
return a = a - b;
}
ccl_device_inline int4 operator>>(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_srai_epi32(a.m128, i));
@ -56,7 +49,7 @@ ccl_device_inline int4 operator>>(const int4 &a, int i)
# endif
}
ccl_device_inline int4 operator<<(const int4 &a, int i)
ccl_device_inline int4 operator<<(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_slli_epi32(a.m128, i));
@ -65,7 +58,7 @@ ccl_device_inline int4 operator<<(const int4 &a, int i)
# endif
}
ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
ccl_device_inline int4 operator<(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_cmplt_epi32(a.m128, b.m128));
@ -74,7 +67,26 @@ ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
# endif
}
ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
ccl_device_inline int4 operator<(const int4 a, const int b)
{
return a < make_int4(b);
}
ccl_device_inline int4 operator==(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_cmpeq_epi32(a.m128, b.m128));
# else
return make_int4(a.x == b.x, a.y == b.y, a.z == b.z, a.w == b.w);
# endif
}
ccl_device_inline int4 operator==(const int4 a, const int b)
{
return a == make_int4(b);
}
ccl_device_inline int4 operator>=(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
@ -83,7 +95,12 @@ ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
# endif
}
ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
ccl_device_inline int4 operator>=(const int4 a, const int b)
{
return a >= make_int4(b);
}
ccl_device_inline int4 operator&(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_and_si128(a.m128, b.m128));
@ -92,6 +109,97 @@ ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
# endif
}
ccl_device_inline int4 operator|(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_or_si128(a.m128, b.m128));
# else
return make_int4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
# endif
}
ccl_device_inline int4 operator^(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_xor_si128(a.m128, b.m128));
# else
return make_int4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
# endif
}
ccl_device_inline int4 operator&(const int32_t a, const int4 b)
{
return make_int4(a) & b;
}
ccl_device_inline int4 operator&(const int4 a, const int32_t b)
{
return a & make_int4(b);
}
ccl_device_inline int4 operator|(const int32_t a, const int4 b)
{
return make_int4(a) | b;
}
ccl_device_inline int4 operator|(const int4 a, const int32_t b)
{
return a | make_int4(b);
}
ccl_device_inline int4 operator^(const int32_t a, const int4 b)
{
return make_int4(a) ^ b;
}
ccl_device_inline int4 operator^(const int4 a, const int32_t b)
{
return a ^ make_int4(b);
}
ccl_device_inline int4 &operator&=(int4 &a, const int4 b)
{
return a = a & b;
}
ccl_device_inline int4 &operator&=(int4 &a, const int32_t b)
{
return a = a & b;
}
ccl_device_inline int4 &operator|=(int4 &a, const int4 b)
{
return a = a | b;
}
ccl_device_inline int4 &operator|=(int4 &a, const int32_t b)
{
return a = a | b;
}
ccl_device_inline int4 &operator^=(int4 &a, const int4 b)
{
return a = a ^ b;
}
ccl_device_inline int4 &operator^=(int4 &a, const int32_t b)
{
return a = a ^ b;
}
ccl_device_inline int4 &operator<<=(int4 &a, const int32_t b)
{
return a = a << b;
}
ccl_device_inline int4 &operator>>=(int4 &a, const int32_t b)
{
return a = a >> b;
}
# ifdef __KERNEL_SSE__
ccl_device_forceinline const int4 srl(const int4 a, const int32_t b)
{
return int4(_mm_srli_epi32(a.m128, b));
}
# endif
ccl_device_inline int4 min(int4 a, int4 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
@ -110,12 +218,12 @@ ccl_device_inline int4 max(int4 a, int4 b)
# endif
}
ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx)
ccl_device_inline int4 clamp(const int4 a, const int4 mn, const int4 mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b)
ccl_device_inline int4 select(const int4 mask, const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)));
@ -135,6 +243,52 @@ ccl_device_inline int4 load_int4(const int *v)
}
#endif /* __KERNEL_GPU__ */
ccl_device_inline float4 cast(const int4 a)
{
#ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(a));
#else
return make_float4(
__int_as_float(a.x), __int_as_float(a.y), __int_as_float(a.z), __int_as_float(a.w));
#endif
}
#ifdef __KERNEL_SSE__
ccl_device_forceinline int4 andnot(const int4 a, const int4 b)
{
return int4(_mm_andnot_si128(a.m128, b.m128));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline int4 shuffle(const int4 a)
{
# ifdef __KERNEL_NEON__
int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
return int4(vreinterpretq_m128i_s32(result));
# else
return int4(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline int4 shuffle(const int4 a, const int4 b)
{
# ifdef __KERNEL_NEON__
int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
vreinterpretq_s32_m128i(b));
return int4(vreinterpretq_m128i_s32(result));
# else
return int4(_mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
# endif
}
template<size_t i0> ccl_device_forceinline int4 shuffle(const int4 b)
{
return shuffle<i0, i0, i0, i0>(b);
}
#endif
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_INT4_H__ */
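Note (illustrative, not part of the change): the int4 comparison operators added above return per-lane all-ones/all-zero masks, which compose with the bitwise operators and select. A small sketch:
/* Illustrative only: branchless per-lane range check. */
const int4 v = make_int4(3, 7, -1, 12);
const int4 in_range = (v >= 0) & (v < 10);         /* lanes 0 and 1 pass */
const int4 r = select(in_range, v, make_int4(0));  /* (3, 7, 0, 0) */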


@ -0,0 +1,355 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_INT8_H__
#define __UTIL_MATH_INT8_H__
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_GPU__
ccl_device_inline vint8 operator+(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_add_epi32(a.m256, b.m256));
# else
return make_vint8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
# endif
}
ccl_device_inline vint8 operator+=(vint8 &a, const vint8 b)
{
return a = a + b;
}
ccl_device_inline vint8 operator-(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_sub_epi32(a.m256, b.m256));
# else
return make_vint8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
# endif
}
ccl_device_inline vint8 operator-=(vint8 &a, const vint8 b)
{
return a = a - b;
}
ccl_device_inline vint8 operator>>(const vint8 a, int i)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_srai_epi32(a.m256, i));
# else
return make_vint8(
a.a >> i, a.b >> i, a.c >> i, a.d >> i, a.e >> i, a.f >> i, a.g >> i, a.h >> i);
# endif
}
ccl_device_inline vint8 operator<<(const vint8 a, int i)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_slli_epi32(a.m256, i));
# else
return make_vint8(
a.a << i, a.b << i, a.c << i, a.d << i, a.e << i, a.f << i, a.g << i, a.h << i);
# endif
}
ccl_device_inline vint8 operator<(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_cmpgt_epi32(b.m256, a.m256));
# else
return make_vint8(
a.a < b.a, a.b < b.b, a.c < b.c, a.d < b.d, a.e < b.e, a.f < b.f, a.g < b.g, a.h < b.h);
# endif
}
ccl_device_inline vint8 operator<(const vint8 a, const int b)
{
return a < make_vint8(b);
}
ccl_device_inline vint8 operator==(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_cmpeq_epi32(a.m256, b.m256));
# else
return make_vint8(a.a == b.a,
a.b == b.b,
a.c == b.c,
a.d == b.d,
a.e == b.e,
a.f == b.f,
a.g == b.g,
a.h == b.h);
# endif
}
ccl_device_inline vint8 operator==(const vint8 a, const int b)
{
return a == make_vint8(b);
}
ccl_device_inline vint8 operator>=(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(
_mm256_xor_si256(_mm256_set1_epi32(0xffffffff), _mm256_cmpgt_epi32(b.m256, a.m256)));
# else
return make_vint8(a.a >= b.a,
a.b >= b.b,
a.c >= b.c,
a.d >= b.d,
a.e >= b.e,
a.f >= b.f,
a.g >= b.g,
a.h >= b.h);
# endif
}
ccl_device_inline vint8 operator>=(const vint8 a, const int b)
{
return a >= make_vint8(b);
}
ccl_device_inline vint8 operator&(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_and_si256(a.m256, b.m256));
# else
return make_vint8(
a.a & b.a, a.b & b.b, a.c & b.c, a.d & b.d, a.e & b.e, a.f & b.f, a.g & b.g, a.h & b.h);
# endif
}
ccl_device_inline vint8 operator|(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_or_si256(a.m256, b.m256));
# else
return make_vint8(
a.a | b.a, a.b | b.b, a.c | b.c, a.d | b.d, a.e | b.e, a.f | b.f, a.g | b.g, a.h | b.h);
# endif
}
ccl_device_inline vint8 operator^(const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_xor_si256(a.m256, b.m256));
# else
return make_vint8(
a.a ^ b.a, a.b ^ b.b, a.c ^ b.c, a.d ^ b.d, a.e ^ b.e, a.f ^ b.f, a.g ^ b.g, a.h ^ b.h);
# endif
}
ccl_device_inline vint8 operator&(const int32_t a, const vint8 b)
{
return make_vint8(a) & b;
}
ccl_device_inline vint8 operator&(const vint8 a, const int32_t b)
{
return a & make_vint8(b);
}
ccl_device_inline vint8 operator|(const int32_t a, const vint8 b)
{
return make_vint8(a) | b;
}
ccl_device_inline vint8 operator|(const vint8 a, const int32_t b)
{
return a | make_vint8(b);
}
ccl_device_inline vint8 operator^(const int32_t a, const vint8 b)
{
return make_vint8(a) ^ b;
}
ccl_device_inline vint8 operator^(const vint8 a, const int32_t b)
{
return a ^ make_vint8(b);
}
ccl_device_inline vint8 &operator&=(vint8 &a, const vint8 b)
{
return a = a & b;
}
ccl_device_inline vint8 &operator&=(vint8 &a, const int32_t b)
{
return a = a & b;
}
ccl_device_inline vint8 &operator|=(vint8 &a, const vint8 b)
{
return a = a | b;
}
ccl_device_inline vint8 &operator|=(vint8 &a, const int32_t b)
{
return a = a | b;
}
ccl_device_inline vint8 &operator^=(vint8 &a, const vint8 b)
{
return a = a ^ b;
}
ccl_device_inline vint8 &operator^=(vint8 &a, const int32_t b)
{
return a = a ^ b;
}
ccl_device_inline vint8 &operator<<=(vint8 &a, const int32_t b)
{
return a = a << b;
}
ccl_device_inline vint8 &operator>>=(vint8 &a, const int32_t b)
{
return a = a >> b;
}
# ifdef __KERNEL_AVX__
ccl_device_forceinline const vint8 srl(const vint8 a, const int32_t b)
{
return vint8(_mm256_srli_epi32(a.m256, b));
}
# endif
ccl_device_inline vint8 min(vint8 a, vint8 b)
{
#  ifdef __KERNEL_AVX__
return vint8(_mm256_min_epi32(a.m256, b.m256));
# else
return make_vint8(min(a.a, b.a),
min(a.b, b.b),
min(a.c, b.c),
min(a.d, b.d),
min(a.e, b.e),
min(a.f, b.f),
min(a.g, b.g),
min(a.h, b.h));
# endif
}
ccl_device_inline vint8 max(vint8 a, vint8 b)
{
#  ifdef __KERNEL_AVX__
return vint8(_mm256_max_epi32(a.m256, b.m256));
# else
return make_vint8(max(a.a, b.a),
max(a.b, b.b),
max(a.c, b.c),
max(a.d, b.d),
max(a.e, b.e),
max(a.f, b.f),
max(a.g, b.g),
max(a.h, b.h));
# endif
}
ccl_device_inline vint8 clamp(const vint8 a, const vint8 mn, const vint8 mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline vint8 select(const vint8 mask, const vint8 a, const vint8 b)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_castps_si256(_mm256_blendv_ps(
_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))));
# else
return make_vint8((mask.a) ? a.a : b.a,
(mask.b) ? a.b : b.b,
(mask.c) ? a.c : b.c,
(mask.d) ? a.d : b.d,
(mask.e) ? a.e : b.e,
(mask.f) ? a.f : b.f,
(mask.g) ? a.g : b.g,
(mask.h) ? a.h : b.h);
# endif
}
ccl_device_inline vint8 load_vint8(const int *v)
{
# ifdef __KERNEL_AVX__
return vint8(_mm256_loadu_si256((__m256i *)v));
# else
return make_vint8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
# endif
}
#endif /* __KERNEL_GPU__ */
ccl_device_inline vfloat8 cast(const vint8 a)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_castsi256_ps(a));
#else
return make_vfloat8(__int_as_float(a.a),
__int_as_float(a.b),
__int_as_float(a.c),
__int_as_float(a.d),
__int_as_float(a.e),
__int_as_float(a.f),
__int_as_float(a.g),
__int_as_float(a.h));
#endif
}
#ifdef __KERNEL_AVX__
template<size_t i> ccl_device_forceinline const vint8 shuffle(const vint8 a)
{
return vint8(
_mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))));
}
template<size_t i0, size_t i1> ccl_device_forceinline const vint8 shuffle(const vint8 a)
{
return vint8(_mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)));
}
template<size_t i0, size_t i1>
ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
{
return vint8(_mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline const vint8 shuffle(const vint8 a)
{
return vint8(
_mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))));
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
{
return vint8(_mm256_castps_si256(_mm256_shuffle_ps(
_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
}
template<> __forceinline const vint8 shuffle<0, 0, 2, 2>(const vint8 b)
{
return vint8(_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))));
}
template<> __forceinline const vint8 shuffle<1, 1, 3, 3>(const vint8 b)
{
return vint8(_mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))));
}
template<> __forceinline const vint8 shuffle<0, 1, 0, 1>(const vint8 b)
{
return vint8(_mm256_castps_si256(
_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))));
}
#endif
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_INT8_H__ */
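Note (illustrative, not part of the change): vint8 is the 8-wide counterpart, and the cast()/select() pair gives the usual mask-and-blend idiom:
/* Illustrative only: build an 8-wide lane mask and blend two vfloat8 values. */
const vint8 idx = make_vint8(0, 1, 2, 3, 4, 5, 6, 7);
const vint8 m = idx < make_vint8(4);               /* all-ones in lanes 0..3 */
const vfloat8 blended = select(m, one_vfloat8(), zero_vfloat8());
const vfloat8 mask_bits = cast(m);                 /* reinterpret mask bits as floats */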


@ -133,7 +133,9 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
return madd(make_float4(a.x),
make_float4(b.x),
madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0];
#else
return a.x * b.x + a.y * b.y + a.z * b.z;
#endif
@ -142,9 +144,10 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
return make_float3(
msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0],
msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0],
msub(make_float4(a.x), make_float4(b.y), make_float4(a.y) * make_float4(b.x))[0]);
#else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
#endif
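Note (illustrative, not part of the change): make_float4(x) broadcasts a scalar to all four lanes, so each madd/msub above evaluates the same value in every lane and only lane 0 is read back, which lets the compiler keep a fused multiply-add. A minimal sketch with hypothetical scalars ax, bx and c:
/* Illustrative only: broadcast, fuse, then read lane 0. */
const float r = madd(make_float4(ax), make_float4(bx), make_float4(c))[0]; /* ax * bx + c */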


@ -1,345 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Modifications Copyright 2014-2022 Blender Foundation. */
#ifndef __UTIL_SSEB_H__
#define __UTIL_SSEB_H__
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
struct ssei;
struct ssef;
/*! 4-wide SSE bool type. */
struct sseb {
typedef sseb Mask; // mask type
typedef ssei Int; // int type
typedef ssef Float; // float type
enum { size = 4 }; // number of SIMD elements
union {
__m128 m128;
int32_t v[4];
}; // data
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline sseb()
{
}
__forceinline sseb(const sseb &other)
{
m128 = other.m128;
}
__forceinline sseb &operator=(const sseb &other)
{
m128 = other.m128;
return *this;
}
__forceinline sseb(const __m128 input) : m128(input)
{
}
__forceinline operator const __m128 &(void) const
{
return m128;
}
__forceinline operator const __m128i(void) const
{
return _mm_castps_si128(m128);
}
__forceinline operator const __m128d(void) const
{
return _mm_castps_pd(m128);
}
__forceinline sseb(bool a)
: m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)])
{
}
__forceinline sseb(bool a, bool b)
: m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)])
{
}
__forceinline sseb(bool a, bool b, bool c, bool d)
: m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)])
{
}
__forceinline sseb(int mask)
{
assert(mask >= 0 && mask < 16);
m128 = _mm_lookupmask_ps[mask];
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline sseb(FalseTy) : m128(_mm_setzero_ps())
{
}
__forceinline sseb(TrueTy)
: m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())))
{
}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline bool operator[](const size_t i) const
{
assert(i < 4);
return (_mm_movemask_ps(m128) >> i) & 1;
}
__forceinline int32_t &operator[](const size_t i)
{
assert(i < 4);
return v[i];
}
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const sseb operator!(const sseb &a)
{
return _mm_xor_ps(a, sseb(True));
}
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const sseb operator&(const sseb &a, const sseb &b)
{
return _mm_and_ps(a, b);
}
__forceinline const sseb operator|(const sseb &a, const sseb &b)
{
return _mm_or_ps(a, b);
}
__forceinline const sseb operator^(const sseb &a, const sseb &b)
{
return _mm_xor_ps(a, b);
}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const sseb operator&=(sseb &a, const sseb &b)
{
return a = a & b;
}
__forceinline const sseb operator|=(sseb &a, const sseb &b)
{
return a = a | b;
}
__forceinline const sseb operator^=(sseb &a, const sseb &b)
{
return a = a ^ b;
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const sseb operator!=(const sseb &a, const sseb &b)
{
return _mm_xor_ps(a, b);
}
__forceinline const sseb operator==(const sseb &a, const sseb &b)
{
return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b));
}
__forceinline const sseb select(const sseb &m, const sseb &t, const sseb &f)
{
# if defined(__KERNEL_SSE41__)
return _mm_blendv_ps(f, t, m);
# else
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
# endif
}
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
__forceinline const sseb unpacklo(const sseb &a, const sseb &b)
{
return _mm_unpacklo_ps(a, b);
}
__forceinline const sseb unpackhi(const sseb &a, const sseb &b)
{
return _mm_unpackhi_ps(a, b);
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
# else
return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
{
return _mm_movelh_ps(a, a);
}
template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
{
return _mm_movehl_ps(a, a);
}
# endif
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a, const sseb &b)
{
# ifdef __KERNEL_NEON__
return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
{
return _mm_movelh_ps(a, b);
}
template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sseb &b)
{
return _mm_movehl_ps(b, a);
}
# endif
# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
{
return _mm_moveldup_ps(a);
}
template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
{
return _mm_movehdup_ps(a);
}
# endif
# if defined(__KERNEL_SSE41__)
template<size_t dst, size_t src, size_t clr>
__forceinline const sseb insert(const sseb &a, const sseb &b)
{
# ifdef __KERNEL_NEON__
sseb res = a;
if (clr)
res[dst] = 0;
else
res[dst] = b[src];
return res;
# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
# endif
}
template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
{
return insert<dst, src, 0>(a, b);
}
template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b)
{
return insert<dst, 0>(a, sseb(b));
}
# endif
////////////////////////////////////////////////////////////////////////////////
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////
# if defined(__KERNEL_SSE41__)
__forceinline uint32_t popcnt(const sseb &a)
{
# if defined(__KERNEL_NEON__)
const int32x4_t mask = {1, 1, 1, 1};
int32x4_t t = vandq_s32(vreinterpretq_s32_m128(a.m128), mask);
return vaddvq_s32(t);
# else
return _mm_popcnt_u32(_mm_movemask_ps(a));
# endif
}
# else
__forceinline uint32_t popcnt(const sseb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
}
# endif
__forceinline bool reduce_and(const sseb &a)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) == -4;
# else
return _mm_movemask_ps(a) == 0xf;
# endif
}
__forceinline bool reduce_or(const sseb &a)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) != 0x0;
# else
return _mm_movemask_ps(a) != 0x0;
# endif
}
__forceinline bool all(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == -4;
# else
return _mm_movemask_ps(b) == 0xf;
# endif
}
__forceinline bool any(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) != 0x0;
# else
return _mm_movemask_ps(b) != 0x0;
# endif
}
__forceinline bool none(const sseb &b)
{
# if defined(__KERNEL_NEON__)
return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == 0x0;
# else
return _mm_movemask_ps(b) == 0x0;
# endif
}
__forceinline uint32_t movemask(const sseb &a)
{
return _mm_movemask_ps(a);
}
////////////////////////////////////////////////////////////////////////////////
/// Debug Functions
////////////////////////////////////////////////////////////////////////////////
ccl_device_inline void print_sseb(const char *label, const sseb &a)
{
printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]);
}
#endif
CCL_NAMESPACE_END
#endif

File diff suppressed because it is too large.


@ -1,633 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Modifications Copyright 2014-2022 Blender Foundation. */
#ifndef __UTIL_SSEI_H__
#define __UTIL_SSEI_H__
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
struct sseb;
struct ssef;
/*! 4-wide SSE integer type. */
struct ssei {
typedef sseb Mask; // mask type
typedef ssei Int; // int type
typedef ssef Float; // float type
enum { size = 4 }; // number of SIMD elements
union {
__m128i m128;
int32_t i[4];
}; // data
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline ssei()
{
}
__forceinline ssei(const ssei &a)
{
m128 = a.m128;
}
__forceinline ssei &operator=(const ssei &a)
{
m128 = a.m128;
return *this;
}
__forceinline ssei(const __m128i a) : m128(a)
{
}
__forceinline operator const __m128i &(void) const
{
return m128;
}
__forceinline operator __m128i &(void)
{
return m128;
}
__forceinline ssei(const int a) : m128(_mm_set1_epi32(a))
{
}
__forceinline ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d))
{
}
__forceinline explicit ssei(const __m128 a) : m128(_mm_cvtps_epi32(a))
{
}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline const int32_t &operator[](const size_t index) const
{
assert(index < 4);
return i[index];
}
__forceinline int32_t &operator[](const size_t index)
{
assert(index < 4);
return i[index];
}
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const ssei cast(const __m128 &a)
{
return _mm_castps_si128(a);
}
__forceinline const ssei operator+(const ssei &a)
{
return a;
}
__forceinline const ssei operator-(const ssei &a)
{
return _mm_sub_epi32(_mm_setzero_si128(), a.m128);
}
# if defined(__KERNEL_SSSE3__)
__forceinline const ssei abs(const ssei &a)
{
return _mm_abs_epi32(a.m128);
}
# endif
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const ssei operator+(const ssei &a, const ssei &b)
{
return _mm_add_epi32(a.m128, b.m128);
}
__forceinline const ssei operator+(const ssei &a, const int32_t &b)
{
return a + ssei(b);
}
__forceinline const ssei operator+(const int32_t &a, const ssei &b)
{
return ssei(a) + b;
}
__forceinline const ssei operator-(const ssei &a, const ssei &b)
{
return _mm_sub_epi32(a.m128, b.m128);
}
__forceinline const ssei operator-(const ssei &a, const int32_t &b)
{
return a - ssei(b);
}
__forceinline const ssei operator-(const int32_t &a, const ssei &b)
{
return ssei(a) - b;
}
# if defined(__KERNEL_SSE41__)
__forceinline const ssei operator*(const ssei &a, const ssei &b)
{
return _mm_mullo_epi32(a.m128, b.m128);
}
__forceinline const ssei operator*(const ssei &a, const int32_t &b)
{
return a * ssei(b);
}
__forceinline const ssei operator*(const int32_t &a, const ssei &b)
{
return ssei(a) * b;
}
# endif
__forceinline const ssei operator&(const ssei &a, const ssei &b)
{
return _mm_and_si128(a.m128, b.m128);
}
__forceinline const ssei operator&(const ssei &a, const int32_t &b)
{
return a & ssei(b);
}
__forceinline const ssei operator&(const int32_t &a, const ssei &b)
{
return ssei(a) & b;
}
__forceinline const ssei operator|(const ssei &a, const ssei &b)
{
return _mm_or_si128(a.m128, b.m128);
}
__forceinline const ssei operator|(const ssei &a, const int32_t &b)
{
return a | ssei(b);
}
__forceinline const ssei operator|(const int32_t &a, const ssei &b)
{
return ssei(a) | b;
}
__forceinline const ssei operator^(const ssei &a, const ssei &b)
{
return _mm_xor_si128(a.m128, b.m128);
}
__forceinline const ssei operator^(const ssei &a, const int32_t &b)
{
return a ^ ssei(b);
}
__forceinline const ssei operator^(const int32_t &a, const ssei &b)
{
return ssei(a) ^ b;
}
__forceinline const ssei operator<<(const ssei &a, const int32_t &n)
{
return _mm_slli_epi32(a.m128, n);
}
__forceinline const ssei operator>>(const ssei &a, const int32_t &n)
{
return _mm_srai_epi32(a.m128, n);
}
__forceinline const ssei andnot(const ssei &a, const ssei &b)
{
return _mm_andnot_si128(a.m128, b.m128);
}
__forceinline const ssei andnot(const sseb &a, const ssei &b)
{
return _mm_andnot_si128(cast(a.m128), b.m128);
}
__forceinline const ssei andnot(const ssei &a, const sseb &b)
{
return _mm_andnot_si128(a.m128, cast(b.m128));
}
__forceinline const ssei sra(const ssei &a, const int32_t &b)
{
return _mm_srai_epi32(a.m128, b);
}
__forceinline const ssei srl(const ssei &a, const int32_t &b)
{
return _mm_srli_epi32(a.m128, b);
}
# if defined(__KERNEL_SSE41__)
__forceinline const ssei min(const ssei &a, const ssei &b)
{
return _mm_min_epi32(a.m128, b.m128);
}
__forceinline const ssei min(const ssei &a, const int32_t &b)
{
return min(a, ssei(b));
}
__forceinline const ssei min(const int32_t &a, const ssei &b)
{
return min(ssei(a), b);
}
__forceinline const ssei max(const ssei &a, const ssei &b)
{
return _mm_max_epi32(a.m128, b.m128);
}
__forceinline const ssei max(const ssei &a, const int32_t &b)
{
return max(a, ssei(b));
}
__forceinline const ssei max(const int32_t &a, const ssei &b)
{
return max(ssei(a), b);
}
# endif
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline ssei &operator+=(ssei &a, const ssei &b)
{
return a = a + b;
}
__forceinline ssei &operator+=(ssei &a, const int32_t &b)
{
return a = a + b;
}
__forceinline ssei &operator-=(ssei &a, const ssei &b)
{
return a = a - b;
}
__forceinline ssei &operator-=(ssei &a, const int32_t &b)
{
return a = a - b;
}
# if defined(__KERNEL_SSE41__)
__forceinline ssei &operator*=(ssei &a, const ssei &b)
{
return a = a * b;
}
__forceinline ssei &operator*=(ssei &a, const int32_t &b)
{
return a = a * b;
}
# endif
__forceinline ssei &operator&=(ssei &a, const ssei &b)
{
return a = a & b;
}
__forceinline ssei &operator&=(ssei &a, const int32_t &b)
{
return a = a & b;
}
__forceinline ssei &operator|=(ssei &a, const ssei &b)
{
return a = a | b;
}
__forceinline ssei &operator|=(ssei &a, const int32_t &b)
{
return a = a | b;
}
__forceinline ssei &operator^=(ssei &a, const ssei &b)
{
return a = a ^ b;
}
__forceinline ssei &operator^=(ssei &a, const int32_t &b)
{
return a = a ^ b;
}
__forceinline ssei &operator<<=(ssei &a, const int32_t &b)
{
return a = a << b;
}
__forceinline ssei &operator>>=(ssei &a, const int32_t &b)
{
return a = a >> b;
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const sseb operator==(const ssei &a, const ssei &b)
{
return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128));
}
__forceinline const sseb operator==(const ssei &a, const int32_t &b)
{
return a == ssei(b);
}
__forceinline const sseb operator==(const int32_t &a, const ssei &b)
{
return ssei(a) == b;
}
__forceinline const sseb operator!=(const ssei &a, const ssei &b)
{
return !(a == b);
}
__forceinline const sseb operator!=(const ssei &a, const int32_t &b)
{
return a != ssei(b);
}
__forceinline const sseb operator!=(const int32_t &a, const ssei &b)
{
return ssei(a) != b;
}
__forceinline const sseb operator<(const ssei &a, const ssei &b)
{
return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128));
}
__forceinline const sseb operator<(const ssei &a, const int32_t &b)
{
return a < ssei(b);
}
__forceinline const sseb operator<(const int32_t &a, const ssei &b)
{
return ssei(a) < b;
}
__forceinline const sseb operator>=(const ssei &a, const ssei &b)
{
return !(a < b);
}
__forceinline const sseb operator>=(const ssei &a, const int32_t &b)
{
return a >= ssei(b);
}
__forceinline const sseb operator>=(const int32_t &a, const ssei &b)
{
return ssei(a) >= b;
}
__forceinline const sseb operator>(const ssei &a, const ssei &b)
{
return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128));
}
__forceinline const sseb operator>(const ssei &a, const int32_t &b)
{
return a > ssei(b);
}
__forceinline const sseb operator>(const int32_t &a, const ssei &b)
{
return ssei(a) > b;
}
__forceinline const sseb operator<=(const ssei &a, const ssei &b)
{
return !(a > b);
}
__forceinline const sseb operator<=(const ssei &a, const int32_t &b)
{
return a <= ssei(b);
}
__forceinline const sseb operator<=(const int32_t &a, const ssei &b)
{
return ssei(a) <= b;
}
__forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f)
{
# ifdef __KERNEL_SSE41__
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
# else
return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
# endif
}
__forceinline const ssei select(const int mask, const ssei &t, const ssei &f)
{
# if defined(__KERNEL_SSE41__) && \
((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
# else
return select(sseb(mask), t, f);
# endif
}
////////////////////////////////////////////////////////////////////////////////
// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
__forceinline ssei unpacklo(const ssei &a, const ssei &b)
{
return _mm_unpacklo_epi32(a, b);
}
__forceinline ssei unpackhi(const ssei &a, const ssei &b)
{
return _mm_unpackhi_epi32(a, b);
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a)
{
# ifdef __KERNEL_NEON__
int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
return vreinterpretq_m128i_s32(result);
# else
return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a, const ssei &b)
{
# ifdef __KERNEL_NEON__
int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
vreinterpretq_s32_m128i(b));
return vreinterpretq_m128i_s32(result);
# else
return _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
{
return shuffle<i0, i0, i0, i0>(b);
}
# if defined(__KERNEL_SSE41__)
template<size_t src> __forceinline int extract(const ssei &b)
{
return _mm_extract_epi32(b, src);
}
template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
{
return _mm_insert_epi32(a, b, dst);
}
# else
template<size_t src> __forceinline int extract(const ssei &b)
{
return b[src];
}
template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
{
ssei c = a;
c[dst] = b;
return c;
}
# endif
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
# if defined(__KERNEL_SSE41__)
__forceinline const ssei vreduce_min(const ssei &v)
{
ssei h = min(shuffle<1, 0, 3, 2>(v), v);
return min(shuffle<2, 3, 0, 1>(h), h);
}
__forceinline const ssei vreduce_max(const ssei &v)
{
ssei h = max(shuffle<1, 0, 3, 2>(v), v);
return max(shuffle<2, 3, 0, 1>(h), h);
}
__forceinline const ssei vreduce_add(const ssei &v)
{
ssei h = shuffle<1, 0, 3, 2>(v) + v;
return shuffle<2, 3, 0, 1>(h) + h;
}
__forceinline int reduce_min(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vminvq_s32(vreinterpretq_s32_m128i(v));
# else
return extract<0>(vreduce_min(v));
# endif
}
__forceinline int reduce_max(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vmaxvq_s32(vreinterpretq_s32_m128i(v));
# else
return extract<0>(vreduce_max(v));
# endif
}
__forceinline int reduce_add(const ssei &v)
{
# ifdef __KERNEL_NEON__
return vaddvq_s32(vreinterpretq_s32_m128i(v));
# else
return extract<0>(vreduce_add(v));
# endif
}
__forceinline uint32_t select_min(const ssei &v)
{
return __bsf(movemask(v == vreduce_min(v)));
}
__forceinline uint32_t select_max(const ssei &v)
{
return __bsf(movemask(v == vreduce_max(v)));
}
__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
{
const ssei a = select(valid, v, ssei((int)pos_inf));
return __bsf(movemask(valid & (a == vreduce_min(a))));
}
__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
{
const ssei a = select(valid, v, ssei((int)neg_inf));
return __bsf(movemask(valid & (a == vreduce_max(a))));
}
# else
__forceinline int ssei_min(int a, int b)
{
return (a < b) ? a : b;
}
__forceinline int ssei_max(int a, int b)
{
return (a > b) ? a : b;
}
__forceinline int reduce_min(const ssei &v)
{
return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3]));
}
__forceinline int reduce_max(const ssei &v)
{
return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3]));
}
__forceinline int reduce_add(const ssei &v)
{
return v[0] + v[1] + v[2] + v[3];
}
# endif
////////////////////////////////////////////////////////////////////////////////
/// Memory load and store operations
////////////////////////////////////////////////////////////////////////////////
__forceinline ssei load4i(const void *const a)
{
return _mm_load_si128((__m128i *)a);
}
__forceinline void store4i(void *ptr, const ssei &v)
{
_mm_store_si128((__m128i *)ptr, v);
}
__forceinline void storeu4i(void *ptr, const ssei &v)
{
_mm_storeu_si128((__m128i *)ptr, v);
}
__forceinline void store4i(const sseb &mask, void *ptr, const ssei &i)
{
# if defined(__KERNEL_AVX__)
_mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i));
# else
*(ssei *)ptr = select(mask, i, *(ssei *)ptr);
# endif
}
__forceinline ssei load4i_nt(void *ptr)
{
# if defined(__KERNEL_SSE41__)
return _mm_stream_load_si128((__m128i *)ptr);
# else
return _mm_load_si128((__m128i *)ptr);
# endif
}
__forceinline void store4i_nt(void *ptr, const ssei &v)
{
# if defined(__KERNEL_SSE41__)
_mm_stream_ps((float *)ptr, _mm_castsi128_ps(v));
# else
_mm_store_si128((__m128i *)ptr, v);
# endif
}
////////////////////////////////////////////////////////////////////////////////
/// Debug Functions
////////////////////////////////////////////////////////////////////////////////
ccl_device_inline void print_ssei(const char *label, const ssei &a)
{
printf("%s: %df %df %df %d\n", label, a[0], a[1], a[2], a[3]);
}
#endif
CCL_NAMESPACE_END
#endif
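For reference, a minimal usage sketch of the reduction/select helpers above (hypothetical example, not part of the patch; it assumes the lane-wise ssei(int, int, int, int) constructor from earlier in this header and an SSE4.1 build for select_min). After this refactor the equivalent operations live on int4.
ccl_device_inline int ssei_reduction_example()
{
  const ssei v(7, 3, 9, 1);             /* four integer lanes */
  const int lowest = reduce_min(v);     /* horizontal minimum: 1 */
  const uint32_t lane = select_min(v);  /* index of the lane holding it: 3 */
  return lowest + (int)lane;
}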

View File

@ -102,7 +102,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
return projection_identity();
}
memcpy(&tfmR, R, sizeof(R));
memcpy(&tfmR.x[0], R, sizeof(R));
return tfmR;
}

View File

@ -63,17 +63,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
{
/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
ssef x, y, z, w, aa;
aa = a.m128;
const float4 aa(a.m128);
x = _mm_loadu_ps(&t->x.x);
y = _mm_loadu_ps(&t->y.x);
z = _mm_loadu_ps(&t->z.x);
w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
float4 x(_mm_loadu_ps(&t->x.x));
float4 y(_mm_loadu_ps(&t->y.x));
float4 z(_mm_loadu_ps(&t->z.x));
float4 w(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f));
_MM_TRANSPOSE4_PS(x, y, z, w);
_MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
ssef tmp = w;
float4 tmp = w;
tmp = madd(shuffle<2>(aa), z, tmp);
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
@ -94,16 +93,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
ccl_device_inline float3 transform_direction(ccl_private const Transform *t, const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
ssef x, y, z, w, aa;
aa = a.m128;
x = _mm_loadu_ps(&t->x.x);
y = _mm_loadu_ps(&t->y.x);
z = _mm_loadu_ps(&t->z.x);
w = _mm_setzero_ps();
const float4 aa(a.m128);
_MM_TRANSPOSE4_PS(x, y, z, w);
float4 x(_mm_loadu_ps(&t->x.x));
float4 y(_mm_loadu_ps(&t->y.x));
float4 z(_mm_loadu_ps(&t->z.x));
float4 w(_mm_setzero_ps());
ssef tmp = shuffle<2>(aa) * z;
_MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
float4 tmp = shuffle<2>(aa) * z;
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
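As a reading aid (not part of the patch), a scalar sketch of what the SSE path above computes, assuming the usual Cycles Transform layout where t->x, t->y and t->z hold the rows of the 3x4 affine matrix: shuffle<k>(aa) broadcasts one component of the input vector and madd accumulates row by row. For transform_point the t->*.w translation terms are added as well; for directions they are ignored:
ccl_device_inline float3 transform_direction_scalar(ccl_private const Transform *t, const float3 a)
{
  /* Rows dotted with (a.x, a.y, a.z, 0): the translation column is skipped for directions. */
  return make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
                     a.x * t->y.x + a.y * t->y.y + a.z * t->y.z,
                     a.x * t->z.x + a.y * t->z.y + a.z * t->z.z);
}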

View File

@ -9,26 +9,33 @@ CCL_NAMESPACE_BEGIN
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
* important to match exactly for ray tracing precision. */
ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
{
#if defined(__AVX2__) && defined(__KERNEL_SSE2__)
const ssef sse_a = (const __m128 &)a;
const ssef sse_b = (const __m128 &)b;
const ssef r = shuffle<1, 2, 0, 3>(
ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
const __m128 a = (const __m128 &)a_;
const __m128 b = (const __m128 &)b_;
const __m128 a_shuffle = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1)));
const __m128 b_shuffle = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1)));
const __m128 r = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))),
_MM_SHUFFLE(3, 0, 2, 1)));
return (const float3 &)r;
#endif
return cross(a, b);
return cross(a_, b_);
}
ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
#ifdef __SSE4_1__
return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
const __m128 a = (const __m128 &)a_;
const __m128 b = (const __m128 &)b_;
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
#endif
return dot(a, b);
return dot(a_, b_);
}
ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)
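As a reading aid for the intrinsics above (not part of the patch): the <1, 2, 0, 3> / _MM_SHUFFLE(3, 0, 2, 1) pattern is the usual yzx rotation used to express a cross product with one fused multiply-subtract, and the 0x7F mask of _mm_dp_ps restricts the dot product to the xyz lanes. In scalar form:
ccl_device_forceinline float3 cross_reference(const float3 a, const float3 b)
{
  return make_float3(a.y * b.z - a.z * b.y,
                     a.z * b.x - a.x * b.z,
                     a.x * b.y - a.y * b.x);
}
ccl_device_forceinline float dot_reference(const float3 a, const float3 b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z;
}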

View File

@ -97,6 +97,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2.h"
#include "util/types_int3.h"
#include "util/types_int4.h"
#include "util/types_int8.h"
#include "util/types_uint2.h"
#include "util/types_uint3.h"
@ -119,6 +120,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2_impl.h"
#include "util/types_int3_impl.h"
#include "util/types_int4_impl.h"
#include "util/types_int8_impl.h"
#include "util/types_uint2_impl.h"
#include "util/types_uint3_impl.h"
@ -129,16 +131,4 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_float4_impl.h"
#include "util/types_float8_impl.h"
/* SSE types. */
#ifndef __KERNEL_GPU__
# include "util/sseb.h"
# include "util/ssef.h"
# include "util/ssei.h"
# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# include "util/avxb.h"
# include "util/avxf.h"
# include "util/avxi.h"
# endif
#endif
#endif /* __UTIL_TYPES_H__ */

View File

@ -11,15 +11,15 @@
CCL_NAMESPACE_BEGIN
/* float8 is a reserved type in Metal that has not been implemented. For
* that reason this is named float8_t and not using native vector types. */
* that reason this is named vfloat8 and not using native vector types. */
#ifdef __KERNEL_GPU__
struct float8_t
struct vfloat8
#else
struct ccl_try_align(32) float8_t
struct ccl_try_align(32) vfloat8
#endif
{
#ifdef __KERNEL_AVX2__
#ifdef __KERNEL_AVX__
union {
__m256 m256;
struct {
@ -27,18 +27,18 @@ struct ccl_try_align(32) float8_t
};
};
__forceinline float8_t();
__forceinline float8_t(const float8_t &a);
__forceinline explicit float8_t(const __m256 &a);
__forceinline vfloat8();
__forceinline vfloat8(const vfloat8 &a);
__forceinline explicit vfloat8(const __m256 &a);
__forceinline operator const __m256 &() const;
__forceinline operator __m256 &();
__forceinline float8_t &operator=(const float8_t &a);
__forceinline vfloat8 &operator=(const vfloat8 &a);
#else /* __KERNEL_AVX2__ */
#else /* __KERNEL_AVX__ */
float a, b, c, d, e, f, g, h;
#endif /* __KERNEL_AVX2__ */
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
@ -46,8 +46,11 @@ struct ccl_try_align(32) float8_t
#endif
};
ccl_device_inline float8_t make_float8_t(float f);
ccl_device_inline float8_t
make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);
ccl_device_inline vfloat8 make_vfloat8(float f);
ccl_device_inline vfloat8
make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h);
ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b);
ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a);
CCL_NAMESPACE_END
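A small usage sketch of the interface declared above (hypothetical values; the implementations follow in types_float8_impl.h below). make_vfloat8 either broadcasts or initializes per lane, and operator[] is only available in host code:
ccl_device_inline float vfloat8_header_example()
{
  const vfloat8 ones = make_vfloat8(1.0f);  /* broadcast to all eight lanes */
  const vfloat8 ramp = make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
  (void)ones;
  return ramp[5];  /* 5.0f, host-side indexing */
}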

View File

@ -10,45 +10,45 @@
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_AVX2__
__forceinline float8_t::float8_t()
#ifdef __KERNEL_AVX__
__forceinline vfloat8::vfloat8()
{
}
__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
__forceinline vfloat8::vfloat8(const vfloat8 &f) : m256(f.m256)
{
}
__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f)
{
}
__forceinline float8_t::operator const __m256 &() const
__forceinline vfloat8::operator const __m256 &() const
{
return m256;
}
__forceinline float8_t::operator __m256 &()
__forceinline vfloat8::operator __m256 &()
{
return m256;
}
__forceinline float8_t &float8_t::operator=(const float8_t &f)
__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f)
{
m256 = f.m256;
return *this;
}
#endif /* __KERNEL_AVX2__ */
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float float8_t::operator[](int i) const
__forceinline float vfloat8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline float &float8_t::operator[](int i)
__forceinline float &vfloat8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
@ -56,25 +56,50 @@ __forceinline float &float8_t::operator[](int i)
}
#endif
ccl_device_inline float8_t make_float8_t(float f)
ccl_device_inline vfloat8 make_vfloat8(float f)
{
#ifdef __KERNEL_AVX2__
float8_t r(_mm256_set1_ps(f));
#ifdef __KERNEL_AVX__
vfloat8 r(_mm256_set1_ps(f));
#else
float8_t r = {f, f, f, f, f, f, f, f};
vfloat8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
ccl_device_inline float8_t
make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
ccl_device_inline vfloat8
make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
{
#ifdef __KERNEL_AVX2__
float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#ifdef __KERNEL_AVX__
vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#else
float8_t r = {a, b, c, d, e, f, g, h};
vfloat8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
#else
return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
#endif
}
ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
label,
(double)a.a,
(double)a.b,
(double)a.c,
(double)a.d,
(double)a.e,
(double)a.f,
(double)a.g,
(double)a.h);
#endif
}
CCL_NAMESPACE_END
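A usage sketch for the new float4-pair constructor (hypothetical example; make_float4 is the existing per-component float4 helper). On AVX builds the two halves are packed with a single _mm256_insertf128_ps, otherwise element by element:
ccl_device_inline void vfloat8_halves_example()
{
  const float4 lo = make_float4(0.0f, 1.0f, 2.0f, 3.0f);
  const float4 hi = make_float4(4.0f, 5.0f, 6.0f, 7.0f);
  const vfloat8 v = make_vfloat8(lo, hi);
  print_vfloat8("halves", v);  /* prints only when __KERNEL_PRINTF__ is defined */
}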

View File

@ -0,0 +1,51 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
struct vfloat8;
#ifdef __KERNEL_GPU__
struct vint8
#else
struct ccl_try_align(32) vint8
#endif
{
#ifdef __KERNEL_AVX__
union {
__m256i m256;
struct {
int a, b, c, d, e, f, g, h;
};
};
__forceinline vint8();
__forceinline vint8(const vint8 &a);
__forceinline explicit vint8(const __m256i &a);
__forceinline operator const __m256i &() const;
__forceinline operator __m256i &();
__forceinline vint8 &operator=(const vint8 &a);
#else /* __KERNEL_AVX__ */
int a, b, c, d, e, f, g, h;
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
#endif
};
ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h);
ccl_device_inline vint8 make_vint8(int i);
ccl_device_inline vint8 make_vint8(const vfloat8 f);
ccl_device_inline vint8 make_vint8(const int4 a, const int4 b);
CCL_NAMESPACE_END

View File

@ -0,0 +1,95 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_AVX__
__forceinline vint8::vint8()
{
}
__forceinline vint8::vint8(const vint8 &a) : m256(a.m256)
{
}
__forceinline vint8::vint8(const __m256i &a) : m256(a)
{
}
__forceinline vint8::operator const __m256i &() const
{
return m256;
}
__forceinline vint8::operator __m256i &()
{
return m256;
}
__forceinline vint8 &vint8::operator=(const vint8 &a)
{
m256 = a.m256;
return *this;
}
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline int vint8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline int &vint8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
#endif
ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
#else
return {a, b, c, d, e, f, g, h};
#endif
}
ccl_device_inline vint8 make_vint8(int i)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_set1_epi32(i));
#else
return make_vint8(i, i, i, i, i, i, i, i);
#endif
}
ccl_device_inline vint8 make_vint8(const vfloat8 f)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_cvtps_epi32(f.m256));
#else
return make_vint8(
(int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
#endif
}
ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
#else
return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
#endif
}
CCL_NAMESPACE_END
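A usage sketch of the vint8 constructors above (hypothetical example; make_int4 is the existing per-component int4 helper). Note that the AVX path of make_vint8(vfloat8) converts with _mm256_cvtps_epi32, which rounds to nearest, while the scalar fallback truncates via (int) casts:
ccl_device_inline vint8 vint8_example(const vfloat8 f)
{
  const int4 lo = make_int4(0, 1, 2, 3);
  const int4 hi = make_int4(4, 5, 6, 7);
  const vint8 packed = make_vint8(lo, hi);  /* pack two int4 halves */
  const vint8 converted = make_vint8(f);    /* float -> int conversion */
  (void)packed;
  return converted;
}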