Paste P209

avx_intersector.patch

Authored by Sv. Lockal (lockal) on Mar 7 2015, 7:04 PM.
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 06917dc..496d950 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -42,11 +42,17 @@ CCL_NAMESPACE_BEGIN
/* Precalculated data for the ray->tri intersection. */
typedef struct IsectPrecalc {
+#ifndef __KERNEL_AVX__
/* Maximal dimension kz, and orthogonal dimensions. */
int kx, ky, kz;
/* Shear constants. */
float Sx, Sy, Sz;
+#else
+ /* Same for vectorized intersector */
+ ssei k;
+ ssef S;
+#endif
} IsectPrecalc;
/* Workaround for CUDA toolkit 6.5.16. */
@@ -78,14 +84,23 @@ void triangle_intersect_precalc(float3 dir,
/* Calculate the shear constants. */
float inf_dir_z = 1.0f / IDX(dir, kz);
- isect_precalc->Sx = IDX(dir, kx) * inf_dir_z;
- isect_precalc->Sy = IDX(dir, ky) * inf_dir_z;
- isect_precalc->Sz = inf_dir_z;
+ float Sx = IDX(dir, kx) * inf_dir_z;
+ float Sy = IDX(dir, ky) * inf_dir_z;
+ float Sz = inf_dir_z;
+
+ /* Store the dimensions and the shear constants. */
+#ifndef __KERNEL_AVX__
+ isect_precalc->Sx = Sx;
+ isect_precalc->Sy = Sy;
+ isect_precalc->Sz = Sz;
- /* Store the dimensions. */
isect_precalc->kx = kx;
isect_precalc->ky = ky;
isect_precalc->kz = kz;
+#else
+ isect_precalc->k = ssei(kx, ky, kz, 0);
+ isect_precalc->S = ssef(Sx, Sy, Sz, 0);
+#endif
}
/* TODO(sergey): Make it general utility function. */
@@ -103,6 +118,65 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
int object,
int triAddr)
{
+#ifdef __KERNEL_AVX__
+ /* Calculate vertices relative to ray origin. */
+ const sse3f tri(kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 0),
+ kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 1),
+ kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 2));
+
+ const ssef vP = load4f(P);
+
+ const ssef A = tri.x - vP;
+ const ssef B = tri.y - vP;
+ const ssef C = tri.z - vP;
+
+ const ssei vk = isect_precalc->k;
+ const ssef A_k = shuffle(A, vk);
+ const ssef B_k = shuffle(B, vk);
+ const ssef C_k = shuffle(C, vk);
+
+ /* Perform shear and scale of vertices. */
+ const avxf ABC_kxy(shuffle<0, 1, 0, 1>(A_k, B_k), C_k); /* Pack A_kx, A_ky, B_kx, B_ky, C_kx, C_ky, _, _ */
+ const avxf Sxy(shuffle<0, 1, 0, 1>(isect_precalc->S)); /* Pack Sx, Sy, Sx, Sy, Sx, Sy, _, _ */
+ const avxf ABC_kz(shuffle<2>(A_k, B_k), shuffle<2>(C_k)); /* Pack A_kz, A_kz, B_kz, B_kz, C_kz, C_kz, _, _ */
+ const avxf ABC_xy = nmadd(Sxy, ABC_kz, ABC_kxy); /* Pack Ax, Ay, Bx, By, Cx, Cy */
+
+ /* Calculate scaled barycentric coordinates. */
+ /* Pack cy, cx, ay, ax, by, bx, _, _ */
+#ifdef __KERNEL_AVX2__
+ const avxf CAB_yx = shuffle<5, 4, 1, 0, 3, 2, 3, 2>(ABC_xy);
+#else
+ const avxf CAB_yx = shuffle<0, 2>(shuffle<1, 0, 1, 0>(shuffle<1, 0>(ABC_xy), ABC_xy), shuffle<3, 2, 3, 2>(ABC_xy));
+#endif
+
+ /* Get packed result in V, W, _, _, U, _, _, _ */
+ const avxf VWU = hsub(ABC_xy * CAB_yx);
+ const ssef VWU1 = extract<0>(VWU), VWU2 = extract<1>(VWU);
+ const ssef vU = shuffle<0>(VWU2);
+ const ssef vV = shuffle<0>(VWU1);
+ const ssef vW = shuffle<1>(VWU1);
+
+ if (movemask((vU ^ vV) | (vU ^ vW))) {
+ return false;
+ }
+
+ /* Calculate determinant. */
+ float det = extract<0>(vU + vV + vW);
+ if(UNLIKELY(det == 0.0f)) {
+ return false;
+ }
+
+ const ssef UVW = shuffle<0, 2, 0, 2>(shuffle<0, 1, 0, 1>(vU, vV), vW);
+ const float U = extract<0>(vU), V = extract<0>(vV);
+
+ /* Calculate scaled z-coordinates of vertices and use them to calculate
+ * the hit distance.
+ */
+ const ssef ABC_k_z = shuffle<0, 2, 2, 2>(shuffle<2>(A_k, B_k), C_k);
+ const float T = extract<2>(isect_precalc->S) * dot3(UVW, ABC_k_z);
+
+ int sign_mask = (__float_as_int(U) & 0x80000000);
+#else
const int kx = isect_precalc->kx;
const int ky = isect_precalc->ky;
const int kz = isect_precalc->kz;
@@ -154,6 +228,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
* the hit distance.
*/
const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+#endif
/* Perform "near clipping". */
const float abs_T = xor_signmast(T, sign_mask);
@@ -190,9 +265,15 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
* it's quite tricky.
*/
if(UNLIKELY(abs_det > 100000.0f && t < 1e-3f)) {
+#ifdef __KERNEL_AVX__
+ const ssef Ng = cross(A - B, C - A);
+ const float pleucker_den = dot(Ng, load4f(dir));
+ const float pleucker_T = dot(A, Ng);
+#else
const float3 Ng = cross(A - B, C - A);
const float pleucker_den = dot(Ng, dir);
const float pleucker_T = dot(A, Ng);
+#endif
if(UNLIKELY(pleucker_T * pleucker_den < 0.0f)) {
return false;
}
@@ -229,6 +310,65 @@ ccl_device_inline void triangle_intersect_subsurface(
uint *lcg_state,
int max_hits)
{
+#ifdef __KERNEL_AVX__
+ /* Calculate vertices relative to ray origin. */
+ const sse3f tri(kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 0),
+ kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 1),
+ kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 2));
+
+ const ssef vP = load4f(P);
+
+ const ssef A = tri.x - vP;
+ const ssef B = tri.y - vP;
+ const ssef C = tri.z - vP;
+
+ const ssei vk = isect_precalc->k;
+ const ssef A_k = shuffle(A, vk);
+ const ssef B_k = shuffle(B, vk);
+ const ssef C_k = shuffle(C, vk);
+
+ /* Perform shear and scale of vertices. */
+ const avxf ABC_kxy(shuffle<0, 1, 0, 1>(A_k, B_k), C_k); /* Pack A_kx, A_ky, B_kx, B_ky, C_kx, C_ky, _, _ */
+ const avxf Sxy(shuffle<0, 1, 0, 1>(isect_precalc->S)); /* Pack Sx, Sy, Sx, Sy, Sx, Sy, _, _ */
+ const avxf ABC_kz(shuffle<2>(A_k, B_k), shuffle<2>(C_k)); /* Pack A_kz, A_kz, B_kz, B_kz, C_kz, C_kz, _, _ */
+ const avxf ABC_xy = nmadd(Sxy, ABC_kz, ABC_kxy); /* Pack Ax, Ay, Bx, By, Cx, Cy */
+
+ /* Calculate scaled barycentric coordinates. */
+ /* Pack cy, cx, ay, ax, by, bx, _, _ */
+#ifdef __KERNEL_AVX2__
+ const avxf CAB_yx = shuffle<5, 4, 1, 0, 3, 2, 3, 2>(ABC_xy);
+#else
+ const avxf CAB_yx = shuffle<0, 2>(shuffle<1, 0, 1, 0>(shuffle<1, 0>(ABC_xy), ABC_xy), shuffle<3, 2, 3, 2>(ABC_xy));
+#endif
+
+ /* Get packed result in V, W, _, _, U, _, _, _ */
+ const avxf VWU = hsub(ABC_xy * CAB_yx);
+ const ssef VWU1 = extract<0>(VWU), VWU2 = extract<1>(VWU);
+ const ssef vU = shuffle<0>(VWU2);
+ const ssef vV = shuffle<0>(VWU1);
+ const ssef vW = shuffle<1>(VWU1);
+
+ if (movemask((vU ^ vV) | (vU ^ vW))) {
+ return;
+ }
+
+ /* Calculate determinant. */
+ float det = extract<0>(vU + vV + vW);
+ if(UNLIKELY(det == 0.0f)) {
+ return;
+ }
+
+ const ssef UVW = shuffle<0, 2, 0, 2>(shuffle<0, 1, 0, 1>(vU, vV), vW);
+ const float U = extract<0>(vU), V = extract<0>(vV);
+
+ /* Calculate scaled z-coordinates of vertices and use them to calculate
+ * the hit distance.
+ */
+ const ssef ABC_k_z = shuffle<0, 2, 2, 2>(shuffle<2>(A_k, B_k), C_k);
+ const float T = extract<2>(isect_precalc->S) * dot3(UVW, ABC_k_z);
+
+ int sign_mask = (__float_as_int(U) & 0x80000000);
+#else
const int kx = isect_precalc->kx;
const int ky = isect_precalc->ky;
const int kz = isect_precalc->kz;
@@ -283,6 +423,7 @@ ccl_device_inline void triangle_intersect_subsurface(
const float Bz = Sz * B_kz;
const float Cz = Sz * C_kz;
const float T = U * Az + V * Bz + W * Cz;
+#endif
if ((xor_signmast(T, sign_mask) < 0.0f) ||
(xor_signmast(T, sign_mask) > tmax * xor_signmast(det, sign_mask)))
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 0acb9e9..e0a3a63 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -37,6 +37,8 @@ set(SRC_HEADERS
util_aligned_malloc.h
util_args.h
util_atomic.h
+ util_avxb.h
+ util_avxf.h
util_boundbox.h
util_cache.h
util_debug.h
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
new file mode 100644
index 0000000..eb80e08
--- /dev/null
+++ b/intern/cycles/util/util_avxb.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ * Modifications Copyright 2015, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+
+/*! 8-wide AVX bool type. */
+struct avxb
+{
+ typedef avxb Mask; // mask type for us
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256 m256;
+ struct { __m128 l,h; };
+ int32_t v[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb () {}
+ __forceinline avxb ( const avxb& a ) { m256 = a.m256; }
+ __forceinline avxb& operator=( const avxb& a ) { m256 = a.m256; return *this; }
+
+ __forceinline avxb( const __m256 a ) : m256(a) {}
+ __forceinline operator const __m256&( void ) const { return m256; }
+ __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
+ __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
+
+ __forceinline avxb ( const sseb& a ) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+ __forceinline avxb ( const sseb& a, const sseb& b) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+ __forceinline avxb ( const __m128 a, const __m128 b) : l(a), h(b) {}
+
+ __forceinline avxb ( bool a ) : m256(avxb(sseb(a), sseb(a))) {}
+ __forceinline avxb ( bool a, bool b) : m256(avxb(sseb(a), sseb(b))) {}
+ __forceinline avxb ( bool a, bool b, bool c, bool d) : m256(avxb(sseb(a,b), sseb(c,d))) {}
+ __forceinline avxb ( bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h ) : m256(avxb(sseb(a,b,c,d), sseb(e,f,g,h))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
+ __forceinline avxb( TrueTy ) : m256(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
+ __forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
+__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
+__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+
+__forceinline avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
+__forceinline avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
+__forceinline avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_xor_ps(_mm256_xor_ps(a,b),avxb(True)); }
+
+__forceinline const avxb select( const avxb& mask, const avxb& t, const avxb& f ) {
+ return _mm256_blendv_ps(f, t, mask);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a.m256, b.m256); }
+__forceinline avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a.m256, b.m256); }
+
+template<size_t i> __forceinline const avxb shuffle( const avxb& a ) {
+ return _mm256_permute_ps(a, _MM_SHUFFLE(i, i, i, i));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxb shuffle( const avxb& a ) {
+ return _mm256_permute2f128_ps(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxb shuffle( const avxb& a, const avxb& b) {
+ return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxb shuffle( const avxb& a ) {
+ return _mm256_permute_ps(a, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxb shuffle( const avxb& a, const avxb& b ) {
+ return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const avxb shuffle<0, 0, 2, 2>( const avxb& b ) { return _mm256_moveldup_ps(b); }
+template<> __forceinline const avxb shuffle<1, 1, 3, 3>( const avxb& b ) { return _mm256_movehdup_ps(b); }
+template<> __forceinline const avxb shuffle<0, 1, 0, 1>( const avxb& b ) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(b))); }
+
+template<size_t i> __forceinline const avxb insert (const avxb& a, const sseb& b) { return _mm256_insertf128_ps (a,b,i); }
+template<size_t i> __forceinline const sseb extract(const avxb& a ) { return _mm256_extractf128_ps(a ,i); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return !_mm256_testz_ps(a,a); }
+__forceinline bool all ( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool none ( const avxb& a ) { return _mm256_testz_ps(a,a) != 0; }
+__forceinline bool any ( const avxb& a ) { return !_mm256_testz_ps(a,a); }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+ printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
new file mode 100644
index 0000000..94e5eca
--- /dev/null
+++ b/intern/cycles/util/util_avxf.h
@@ -0,0 +1,314 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ * Modifications Copyright 2015, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+
+/*! 8-wide AVX float type. */
+struct avxf
+{
+ typedef avxb Mask; // mask type for us
+ enum { size = 8 }; // number of SIMD elements
+ union { __m256 m256; float v[8]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxf ( ) {}
+ __forceinline avxf ( const avxf& other ) { m256 = other.m256; }
+ __forceinline avxf& operator=( const avxf& other ) { m256 = other.m256; return *this; }
+
+ __forceinline avxf( const __m256 a ) : m256(a) {}
+ __forceinline operator const __m256&( void ) const { return m256; }
+ __forceinline operator __m256&( void ) { return m256; }
+
+ __forceinline explicit avxf( const ssef& a ) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+ __forceinline avxf( const ssef& a, const ssef& b ) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+
+ static __forceinline avxf load( const void* const ptr ) { return *(__m256*)ptr; }
+
+ __forceinline explicit avxf( const char* const a ) : m256(_mm256_loadu_ps((const float*)a)) {}
+ __forceinline avxf( const float& a ) : m256(_mm256_broadcast_ss(&a)) {}
+ __forceinline avxf( float a, float b) : m256(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
+ __forceinline avxf( float a, float b, float c, float d ) : m256(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
+ __forceinline avxf( float a, float b, float c, float d, float e, float f, float g, float h ) : m256(_mm256_set_ps(h, g, f, e, d, c, b, a)) {}
+
+ __forceinline explicit avxf( const __m256i a ) : m256(_mm256_cvtepi32_ps(a)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const float& operator []( const size_t i ) const { assert(i < 8); return v[i]; }
+ __forceinline float& operator []( const size_t i ) { assert(i < 8); return v[i]; }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +( const avxf& a ) { return a; }
+__forceinline const avxf operator -( const avxf& a ) {
+ const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+ return _mm256_xor_ps(a.m256, mask);
+}
+__forceinline const avxf abs ( const avxf& a ) {
+ const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+ return _mm256_and_ps(a.m256, mask);
+}
+__forceinline const avxf sign ( const avxf& a ) { return _mm256_blendv_ps(avxf(1.0f), -avxf(1.0f), _mm256_cmp_ps(a, avxf(0.0f), _CMP_NGE_UQ )); }
+__forceinline const avxf signmsk ( const avxf& a ) { return _mm256_and_ps(a.m256,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
+
+__forceinline const avxf rcp ( const avxf& a ) {
+ const avxf r = _mm256_rcp_ps(a.m256);
+ return _mm256_sub_ps(_mm256_add_ps(r, r), _mm256_mul_ps(_mm256_mul_ps(r, r), a));
+}
+__forceinline const avxf sqr ( const avxf& a ) { return _mm256_mul_ps(a,a); }
+__forceinline const avxf sqrt ( const avxf& a ) { return _mm256_sqrt_ps(a.m256); }
+__forceinline const avxf rsqrt( const avxf& a ) {
+ const avxf r = _mm256_rsqrt_ps(a.m256);
+ return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +( const avxf& a, const avxf& b ) { return _mm256_add_ps(a.m256, b.m256); }
+__forceinline const avxf operator +( const avxf& a, const float b ) { return a + avxf(b); }
+__forceinline const avxf operator +( const float a, const avxf& b ) { return avxf(a) + b; }
+
+__forceinline const avxf operator -( const avxf& a, const avxf& b ) { return _mm256_sub_ps(a.m256, b.m256); }
+__forceinline const avxf operator -( const avxf& a, const float b ) { return a - avxf(b); }
+__forceinline const avxf operator -( const float a, const avxf& b ) { return avxf(a) - b; }
+
+__forceinline const avxf operator *( const avxf& a, const avxf& b ) { return _mm256_mul_ps(a.m256, b.m256); }
+__forceinline const avxf operator *( const avxf& a, const float b ) { return a * avxf(b); }
+__forceinline const avxf operator *( const float a, const avxf& b ) { return avxf(a) * b; }
+
+__forceinline const avxf operator /( const avxf& a, const avxf& b ) { return a * rcp(b); }
+//__forceinline const avxf operator /( const avxf& a, const float b ) { return a * rcp(b); }
+__forceinline const avxf operator /( const float a, const avxf& b ) { return a * rcp(b); }
+
+__forceinline const avxf operator^( const avxf& a, const avxf& b ) { return _mm256_xor_ps(a.m256,b.m256); }
+
+__forceinline const avxf min( const avxf& a, const avxf& b ) { return _mm256_min_ps(a.m256, b.m256); }
+__forceinline const avxf min( const avxf& a, const float b ) { return _mm256_min_ps(a.m256, avxf(b)); }
+__forceinline const avxf min( const float a, const avxf& b ) { return _mm256_min_ps(avxf(a), b.m256); }
+
+__forceinline const avxf max( const avxf& a, const avxf& b ) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf max( const avxf& a, const float b ) { return _mm256_max_ps(a.m256, avxf(b)); }
+__forceinline const avxf max( const float a, const avxf& b ) { return _mm256_max_ps(avxf(a), b.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxf madd ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fmadd_ps(a,b,c); }
+__forceinline const avxf msub ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fmsub_ps(a,b,c); }
+__forceinline const avxf nmadd ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fnmadd_ps(a,b,c); }
+__forceinline const avxf nmsub ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fnmsub_ps(a,b,c); }
+#else
+__forceinline const avxf madd ( const avxf& a, const avxf& b, const avxf& c) { return a*b+c; }
+__forceinline const avxf msub ( const avxf& a, const avxf& b, const avxf& c) { return a*b-c; }
+__forceinline const avxf nmadd ( const avxf& a, const avxf& b, const avxf& c) { return c-a*b;}
+__forceinline const avxf nmsub ( const avxf& a, const avxf& b, const avxf& c) { return -a*b-c; }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf& operator +=( avxf& a, const avxf& b ) { return a = a + b; }
+__forceinline avxf& operator +=( avxf& a, const float b ) { return a = a + b; }
+
+__forceinline avxf& operator -=( avxf& a, const avxf& b ) { return a = a - b; }
+__forceinline avxf& operator -=( avxf& a, const float b ) { return a = a - b; }
+
+__forceinline avxf& operator *=( avxf& a, const avxf& b ) { return a = a * b; }
+__forceinline avxf& operator *=( avxf& a, const float b ) { return a = a * b; }
+
+__forceinline avxf& operator /=( avxf& a, const avxf& b ) { return a = a / b; }
+__forceinline avxf& operator /=( avxf& a, const float b ) { return a = a / b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator ==( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_EQ_UQ ); }
+__forceinline const avxb operator ==( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_EQ_UQ ); }
+__forceinline const avxb operator ==( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_EQ_UQ ); }
+
+__forceinline const avxb operator !=( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NEQ_UQ); }
+__forceinline const avxb operator !=( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NEQ_UQ); }
+__forceinline const avxb operator !=( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NEQ_UQ); }
+
+__forceinline const avxb operator < ( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NGE_UQ ); }
+__forceinline const avxb operator < ( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NGE_UQ ); }
+__forceinline const avxb operator < ( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NGE_UQ ); }
+
+__forceinline const avxb operator >=( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NLT_UQ); }
+__forceinline const avxb operator >=( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NLT_UQ); }
+__forceinline const avxb operator >=( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NLT_UQ); }
+
+__forceinline const avxb operator > ( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NLE_UQ); }
+__forceinline const avxb operator > ( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NLE_UQ); }
+__forceinline const avxb operator > ( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NLE_UQ); }
+
+__forceinline const avxb operator <=( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NGT_UQ ); }
+__forceinline const avxb operator <=( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NGT_UQ ); }
+__forceinline const avxb operator <=( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NGT_UQ ); }
+
+__forceinline const avxf select( const avxb& mask, const avxf& t, const avxf& f ) {
+ return _mm256_blendv_ps(f, t, mask);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Rounding Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf round_even( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+__forceinline const avxf round_down( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+__forceinline const avxf round_up ( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+__forceinline const avxf round_zero( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); }
+__forceinline const avxf floor ( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+__forceinline const avxf ceil ( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf unpacklo( const avxf& a, const avxf& b ) { return _mm256_unpacklo_ps(a.m256, b.m256); }
+__forceinline avxf unpackhi( const avxf& a, const avxf& b ) { return _mm256_unpackhi_ps(a.m256, b.m256); }
+
+template<size_t i> __forceinline const avxf shuffle( const avxf& a ) {
+ return _mm256_permute_ps(a, _MM_SHUFFLE(i, i, i, i));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxf shuffle( const avxf& a ) {
+ return _mm256_permute2f128_ps(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxf shuffle( const avxf& a, const avxf& b) {
+ return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle( const avxf& a ) {
+ return _mm256_permute_ps(a, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle( const avxf& a, const avxf& b ) {
+ return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const avxf shuffle<0, 0, 2, 2>( const avxf& a ) { return _mm256_moveldup_ps(a); }
+template<> __forceinline const avxf shuffle<1, 1, 3, 3>( const avxf& a ) { return _mm256_movehdup_ps(a); }
+
+#ifdef __KERNEL_AVX2__
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+__forceinline const avxf shuffle( const avxf& a ) {
+ return _mm256_permutevar8x32_ps(a, _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7));
+}
+#endif
+
+__forceinline const avxf broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
+template<size_t i> __forceinline const avxf insert (const avxf& a, const ssef& b) { return _mm256_insertf128_ps (a,b,i); }
+template<size_t i> __forceinline const ssef extract (const avxf& a ) { return _mm256_extractf128_ps(a ,i); }
+template<> __forceinline const ssef extract<0>(const avxf& a ) { return _mm256_castps256_ps128(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Transpose
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline void transpose4(const avxf& r0, const avxf& r1, const avxf& r2, const avxf& r3, avxf& c0, avxf& c1, avxf& c2, avxf& c3)
+{
+ avxf l02 = unpacklo(r0,r2);
+ avxf h02 = unpackhi(r0,r2);
+ avxf l13 = unpacklo(r1,r3);
+ avxf h13 = unpackhi(r1,r3);
+ c0 = unpacklo(l02,l13);
+ c1 = unpackhi(l02,l13);
+ c2 = unpacklo(h02,h13);
+ c3 = unpackhi(h02,h13);
+}
+
+__forceinline void transpose(const avxf& r0, const avxf& r1, const avxf& r2, const avxf& r3, const avxf& r4, const avxf& r5, const avxf& r6, const avxf& r7,
+ avxf& c0, avxf& c1, avxf& c2, avxf& c3, avxf& c4, avxf& c5, avxf& c6, avxf& c7)
+{
+ avxf h0,h1,h2,h3; transpose4(r0,r1,r2,r3,h0,h1,h2,h3);
+ avxf h4,h5,h6,h7; transpose4(r4,r5,r6,r7,h4,h5,h6,h7);
+ c0 = shuffle<0,2>(h0,h4);
+ c1 = shuffle<0,2>(h1,h5);
+ c2 = shuffle<0,2>(h2,h6);
+ c3 = shuffle<0,2>(h3,h7);
+ c4 = shuffle<1,3>(h0,h4);
+ c5 = shuffle<1,3>(h1,h5);
+ c6 = shuffle<1,3>(h2,h6);
+ c7 = shuffle<1,3>(h3,h7);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf vreduce_min2(const avxf& v) { return min(v,shuffle<1,0,3,2>(v)); }
+__forceinline const avxf vreduce_min4(const avxf& v) { avxf v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+__forceinline const avxf vreduce_min (const avxf& v) { avxf v1 = vreduce_min4(v); return min(v1,shuffle<1,0>(v1)); }
+
+__forceinline const avxf vreduce_max2(const avxf& v) { return max(v,shuffle<1,0,3,2>(v)); }
+__forceinline const avxf vreduce_max4(const avxf& v) { avxf v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+__forceinline const avxf vreduce_max (const avxf& v) { avxf v1 = vreduce_max4(v); return max(v1,shuffle<1,0>(v1)); }
+
+__forceinline const avxf vreduce_add2(const avxf& v) { return v + shuffle<1,0,3,2>(v); }
+__forceinline const avxf vreduce_add4(const avxf& v) { avxf v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+__forceinline const avxf vreduce_add (const avxf& v) { avxf v1 = vreduce_add4(v); return v1 + shuffle<1,0>(v1); }
+
+__forceinline const avxf hsub(const avxf& v) { return _mm256_hsub_ps(v, v); }
+
+__forceinline float reduce_min(const avxf& v) { return _mm_cvtss_f32(extract<0>(vreduce_min(v))); }
+__forceinline float reduce_max(const avxf& v) { return _mm_cvtss_f32(extract<0>(vreduce_max(v))); }
+__forceinline float reduce_add(const avxf& v) { return _mm_cvtss_f32(extract<0>(vreduce_add(v))); }
+
+__forceinline size_t select_min(const avxf& v) { return __bsf(movemask(v == vreduce_min(v))); }
+__forceinline size_t select_max(const avxf& v) { return __bsf(movemask(v == vreduce_max(v))); }
+
+__forceinline size_t select_min(const avxb& valid, const avxf& v) { const avxf a = select(valid,v,avxf(pos_inf)); return __bsf(movemask(valid & (a == vreduce_min(a)))); }
+__forceinline size_t select_max(const avxb& valid, const avxf& v) { const avxf a = select(valid,v,avxf(neg_inf)); return __bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxf(const char *label, const avxf &a)
+{
+ printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n", label,
+ (double)a[0], (double)a[1], (double)a[2], (double)a[3],
+ (double)a[4], (double)a[5], (double)a[6], (double)a[7]);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 625f26c..665c55f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -434,6 +434,8 @@ CCL_NAMESPACE_END
#include "util_sseb.h"
#include "util_ssei.h"
#include "util_ssef.h"
+#include "util_avxb.h"
+#include "util_avxf.h"
#endif /* __UTIL_SIMD_TYPES_H__ */
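Note (illustration only, not part of the patch): the packed AVX arithmetic in triangle_intersect() / triangle_intersect_subsurface() mirrors the existing scalar watertight path lane by lane. A scalar sketch of what those lanes compute, using the names from the comments in the diff (A_kx, Sx, etc.):

  /* Shear and scale: nmadd(Sxy, ABC_kz, ABC_kxy) yields, per lane,
   * Ax = A_kx - Sx * A_kz, Ay = A_ky - Sy * A_kz, and likewise for B and C. */
  const float Ax = A_kx - Sx * A_kz;
  const float Ay = A_ky - Sy * A_kz;
  const float Bx = B_kx - Sx * B_kz;
  const float By = B_ky - Sy * B_kz;
  const float Cx = C_kx - Sx * C_kz;
  const float Cy = C_ky - Sy * C_kz;

  /* hsub(ABC_xy * CAB_yx) packs the scaled barycentric terms as V, W, _, _, U, _, _, _ : */
  const float V = Ax * Cy - Ay * Cx;
  const float W = Bx * Ay - By * Ax;
  const float U = Cx * By - Cy * Bx;

  /* movemask((vU ^ vV) | (vU ^ vW)) rejects the ray when U, V, W do not share a sign;
   * the determinant and hit distance then match the scalar path kept under #else: */
  const float det = U + V + W;
  const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;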

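A minimal usage sketch of the new 8-wide helpers from util_avxf.h and util_avxb.h (illustrative values, not part of the patch; assumes __KERNEL_AVX__ is defined and util_simd.h is included):

  const avxf a(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
  const avxb m = a > avxf(4.0f);    /* lanes 4..7 compare true */
  const size_t bits = movemask(m);  /* 0xF0, one bit per lane */
  const bool any_set = any(m);      /* true */
  const bool all_set = all(m);      /* false */
  const float lo = reduce_min(a);   /* 1.0f */
  const float hi = reduce_max(a);   /* 8.0f */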