Cycles: SSE optimization for line segments/ribbons hair

Gives ~11% speedup for hair.blend, ~10% for koro_final.blend

Also extract a few common subexpressions in the hair calculation.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D318
This commit is contained in:
Sv. Lockal 2014-03-23 00:45:48 +04:00
parent 0ef416722e
commit c45c472e1b
Notes: blender-bot 2023-02-14 07:49:06 +01:00
Referenced by issue #48684, Circular/radial bands in this hair scene (most noticeable in CPU)
2 changed files with 139 additions and 28 deletions

View File

@ -596,6 +596,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
/* define few macros to minimize code duplication for SSE */
#ifndef __KERNEL_SSE2__
#define len3_squared(x) len_squared(x)
#define len3(x) len(x)
#define dot3(x, y) dot(x, y)
#endif
/* curve Intersection check */
int flags = kernel_data.curve.curveflags;
@ -606,6 +613,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
int k0 = cnum + segment;
int k1 = k0 + 1;
#ifndef __KERNEL_SSE2__
float4 P1 = kernel_tex_fetch(__curve_keys, k0);
float4 P2 = kernel_tex_fetch(__curve_keys, k1);
@ -617,36 +625,72 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
/* minimum width extension */
float r1 = or1;
float r2 = or2;
float3 dif = P - p1;
float3 dif_second = P - p2;
if(difl != 0.0f) {
float pixelsize = min(len(p1 - P) * difl, extmax);
float pixelsize = min(len3(dif) * difl, extmax);
r1 = or1 < pixelsize ? pixelsize : or1;
pixelsize = min(len(p2 - P) * difl, extmax);
pixelsize = min(len3(dif_second) * difl, extmax);
r2 = or2 < pixelsize ? pixelsize : or2;
}
/* --- */
float mr = max(r1,r2);
float3 dif = P - p1;
float3 dir = 1.0f/idir;
float l = len(p2 - p1);
float3 dir = 1.0f / idir;
float3 p21_diff = p2 - p1;
float3 sphere_dif1 = (dif + dif_second) * 0.5f;
float sphere_b_tmp = dot3(dir, sphere_dif1);
float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
#else
const __m128 p1 = _mm_load_ps(&kg->__curve_keys.data[k0].x);
const __m128 p2 = _mm_load_ps(&kg->__curve_keys.data[k1].x);
const __m128 or12 = shuffle<3, 3, 3, 3>(p1, p2);
__m128 r12 = or12;
const __m128 vP = load_m128(P);
const __m128 dif = _mm_sub_ps(vP, p1);
const __m128 dif_second = _mm_sub_ps(vP, p2);
if(difl != 0.0f) {
const __m128 len1_sq = len3_squared_splat(dif);
const __m128 len2_sq = len3_squared_splat(dif_second);
const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax));
r12 = _mm_max_ps(or12, pixelsize12);
}
float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
const __m128 p21_diff = _mm_sub_ps(p2, p1);
const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
#endif
float mr = max(r1, r2);
float l = len3(p21_diff);
float invl = 1.0f / l;
float sp_r = mr + 0.5f * l;
float3 sphere_dif = P - ((p1 + p2) * 0.5f);
float sphere_b = dot(dir,sphere_dif);
sphere_dif = sphere_dif - sphere_b * dir;
sphere_b = dot(dir,sphere_dif);
float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
float sphere_b = dot3(dir, sphere_dif2);
float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
if(sdisc < 0.0f)
return false;
/* obtain parameters and test midpoint distance for suitable modes */
float3 tg = (p2 - p1) / l;
float gd = (r2 - r1) / l;
float dirz = dot(dir,tg);
float difz = dot(dif,tg);
#ifndef __KERNEL_SSE2__
float3 tg = p21_diff * invl;
#else
const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl));
#endif
float gd = (r2 - r1) * invl;
float dirz = dot3(dir, tg);
float difz = dot3(dif, tg);
float a = 1.0f - (dirz*dirz*(1 + gd*gd));
float halfb = dot(dir,dif) - dirz*(difz + gd*(difz*gd + r1));
float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
float tcentre = -halfb/a;
float zcentre = difz + (dirz * tcentre);
@ -657,11 +701,15 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
return false;
/* test minimum separation */
#ifndef __KERNEL_SSE2__
float3 cprod = cross(tg, dir);
float3 cprod2 = cross(tg, dif);
float cprodsq = len_squared(cprod);
float cprod2sq = len_squared(cprod2);
float distscaled = dot(cprod,dif);
float cprod2sq = len3_squared(cross(tg, dif));
#else
const __m128 cprod = cross(tg, dir);
float cprod2sq = len3_squared(cross_zxy(tg, dif));
#endif
float cprodsq = len3_squared(cprod);
float distscaled = dot3(cprod, dif);
if(cprodsq == 0)
distscaled = cprod2sq;
@ -672,10 +720,15 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
return false;
/* calculate true intersection */
float3 tdif = P - p1 + tcentre * dir;
float tdifz = dot(tdif,tg);
float tb = 2*(dot(dir,tdif) - dirz*(tdifz + gd*(tdifz*gd + r1)));
float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - r1*r1 - 2*r1*tdifz*gd;
#ifndef __KERNEL_SSE2__
float3 tdif = dif + tcentre * dir;
#else
const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif);
#endif
float tdifz = dot3(tdif, tg);
float tdifma = tdifz*gd + r1;
float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
float td = tb*tb - 4*a*tc;
if (td < 0.0f)
@ -709,7 +762,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
}
/* stochastic fade from minimum width */
float adjradius = or1 + z * (or2 - or1) / l;
float adjradius = or1 + z * (or2 - or1) * invl;
adjradius = adjradius / (r1 + z * gd);
if(lcg_state && adjradius != 1.0f) {
if(lcg_step_float(lcg_state) > adjradius)
@ -721,9 +774,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
if (flags & CURVE_KN_ENCLOSEFILTER) {
float enc_ratio = 1.01f;
if((dot(P - p1, tg) > -r1 * enc_ratio) && (dot(P - p2, tg) < r2 * enc_ratio)) {
if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
if(a2*c2 < 0.0f)
return false;
}
@ -739,7 +792,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
isect->prim = curveAddr;
isect->segment = segment;
isect->object = object;
isect->u = z/l;
isect->u = z*invl;
isect->v = td/(4*a*a);
/*isect->v = 1.0f - adjradius;*/
isect->t = t;
@ -753,6 +806,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
}
return false;
#ifndef __KERNEL_SSE2__
#undef len3_squared
#undef len3
#undef dot3
#endif
}
#endif

View File

@ -154,6 +154,12 @@ ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m12
return _mm_sub_ps(_mm_mul_ps(a, b), c);
}
/* Per-lane c - a*b (i.e. -a*b + c). Stand-in for a fused negated
 * multiply-add on plain SSE CPUs: built from a separate multiply
 * followed by a subtract. */
ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c)
{
	const __m128 product = _mm_mul_ps(a, b);
	return _mm_sub_ps(c, product);
}
template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a)
{
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N)));
@ -204,6 +210,52 @@ ccl_device_inline const __m128 load_m128(const float3 &vec)
}
#endif /* __KERNEL_WITH_SSE_ALIGN__ */
/* Dot product of the first three lanes of a and b, with the scalar result
 * replicated into every lane of the returned vector. The fourth lane of the
 * inputs does not contribute to the sum. */
ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b)
{
#ifndef __KERNEL_SSE41__
	/* no dot product instruction available: spill the per-lane products
	 * and add the first three up as scalars */
	float lanes[4];
	_mm_storeu_ps(lanes, _mm_mul_ps(a, b));
	return _mm_set1_ps(lanes[0] + lanes[1] + lanes[2]);
#else
	/* mask 0x7f: high nibble 0x7 sums the products of lanes 0-2,
	 * low nibble 0xf writes that sum to all four lanes */
	return _mm_dp_ps(a, b, 0x7f);
#endif
}
/* Dot product of the first three lanes of a and b, returned as a scalar.
 * The fourth lane of the inputs does not contribute to the sum. */
ccl_device_inline float dot3(const __m128& a, const __m128& b)
{
#ifndef __KERNEL_SSE41__
	/* no dot product instruction available: spill the per-lane products
	 * and add the first three up as scalars */
	float lanes[4];
	_mm_storeu_ps(lanes, _mm_mul_ps(a, b));
	return lanes[0] + lanes[1] + lanes[2];
#else
	/* mask 0x7f sums the products of lanes 0-2; the lowest lane of the
	 * result is then extracted */
	const __m128 sum = _mm_dp_ps(a, b, 0x7f);
	return _mm_cvtss_f32(sum);
#endif
}
/* Squared length of the three-component vector held in the first lanes
 * of a (its dot3 product with itself), replicated across all lanes. */
ccl_device_inline const __m128 len3_squared_splat(const __m128& a)
{
	const __m128 len_sq = dot3_splat(a, a);
	return len_sq;
}
/* Squared length of the three-component vector held in the first lanes
 * of a (its dot3 product with itself), returned as a scalar. */
ccl_device_inline float len3_squared(const __m128& a)
{
	const float len_sq = dot3(a, a);
	return len_sq;
}
/* Length of the three-component vector held in the first lanes of a,
 * returned as a scalar. */
ccl_device_inline float len3(const __m128& a)
{
	const __m128 len_sq = dot3_splat(a, a);
	/* only the lowest lane is read back, so a scalar square root suffices */
	const __m128 root = _mm_sqrt_ss(len_sq);
	return _mm_cvtss_f32(root);
}
/* Cross product of a and b with its components left in shuffled order.
 * Skips the final shuffle that cross() performs, so it is useful when the
 * order of components does not matter (e.g. when only the squared length
 * of the result is taken).
 * NOTE(review): per the name the lanes presumably hold (z, x, y) of the
 * cross product; this depends on the lane convention of shuffle<> - confirm. */
ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b)
{
	const __m128 a_rot = shuffle<1, 2, 0, 3>(a);
	const __m128 b_rot = shuffle<1, 2, 0, 3>(b);
	/* a * b_rot - b * a_rot */
	return fms(a, b_rot, _mm_mul_ps(b, a_rot));
}
/* Cross product of a and b: takes the shuffled result of cross_zxy() and
 * applies one more shuffle<1, 2, 0, 3> to it.
 * NOTE(review): presumably this restores the regular x, y, z lane order;
 * depends on the lane convention of shuffle<> - confirm. */
ccl_device_inline const __m128 cross(const __m128& a, const __m128& b)
{
	const __m128 rotated = cross_zxy(a, b);
	return shuffle<1, 2, 0, 3>(rotated);
}
#endif /* __KERNEL_SSE2__ */
CCL_NAMESPACE_END