Page MenuHome

exp2f
ActivePublic

Authored by Sv. Lockal (lockal) on Mar 25 2014, 11:57 PM.
#include <xmmintrin.h>
#include <immintrin.h>
#include <math.h>
/*
 * Fractional powers of two: exp_coeffs[i] == 2^(i / 64.0) for i = 0..63.
 *
 * exp2f() indexes this table with the low six bits of round(64 * x) to get
 * the 2^(fraction) factor of the result.  The table is only ever read, so it
 * is declared const.
 */
const double exp_coeffs[64] = {
    /*  0 */ 1.0,                1.0108892860517005, 1.0218971486541166, 1.0330248790212284,
    /*  4 */ 1.0442737824274138, 1.0556451783605572, 1.0671404006768237, 1.0787607977571199,
    /*  8 */ 1.0905077326652577, 1.1023825833078409, 1.1143867425958924, 1.1265216186082418,
    /* 12 */ 1.1387886347566916, 1.1511892299529827, 1.1637248587775775, 1.1763969916502812,
    /* 16 */ 1.189207115002721,  1.2021567314527031, 1.215247359980469,  1.22848053610687,
    /* 20 */ 1.241857812073484,  1.2553807570246911, 1.2690509571917332, 1.2828700160787783,
    /* 24 */ 1.2968395546510096, 1.3109612115247644, 1.3252366431597413, 1.3396675240533029,
    /* 28 */ 1.3542555469368927, 1.3690024229745905, 1.383909881963832,  1.3989796725383112,
    /* 32 */ 1.4142135623730951, 1.42961333839197,   1.4451808069770467, 1.460917794180647,
    /* 36 */ 1.4768261459394993, 1.4929077282912648, 1.5091644275934228, 1.5255981507445384,
    /* 40 */ 1.5422108254079407, 1.5590044002378369, 1.5759808451078865, 1.593142151342267,
    /* 44 */ 1.6104903319492543, 1.6280274218573478, 1.6457554781539649, 1.6636765803267364,
    /* 48 */ 1.681792830507429,  1.7001063537185235, 1.7186192981224779, 1.7373338352737062,
    /* 52 */ 1.7562521603732995, 1.7753764925265212, 1.7947090750031072, 1.8142521755003989,
    /* 56 */ 1.8340080864093424, 1.8539791250833855, 1.8741676341103,    1.8945759815869656,
    /* 60 */ 1.9152065613971474, 1.9360617934922943, 1.9571441241754002, 1.9784560263879509
};
/**
 * Vectorised base-2 exponential: returns 2^x for each of the four floats in x.
 *
 * Method: with k = round(64 * x),
 *     2^x = 2^(k >> 6) * 2^((k & 63) / 64) * 2^rem,   rem = x - k / 64.
 * The middle factor is read from exp_coeffs, 2^rem is evaluated as a cubic
 * Taylor polynomial of e^(rem * ln 2), and the integer power of two is applied
 * by adding (k >> 6) to the exponent field of the double-precision product.
 * All intermediate arithmetic is done in double and narrowed at the end.
 *
 * Special cases (default rounding mode):
 *   - x >= 128, including +inf: +inf (input is clamped to 128 and the final
 *     narrowing conversion overflows).
 *   - x <= -150, including -inf: +0.
 *   - -150 < x < -126: subnormal result produced by the narrowing conversion.
 *   - NaN: quiet NaN (x + x).
 *
 * Uses AVX, FMA and SSE4.1 intrinsics.
 *
 * NOTE(review): the name is the same as the C99 library function that
 * <math.h> declares as float exp2f(float); confirm this translation unit is
 * built in a mode where the two declarations do not clash.
 */
__attribute__((noinline)) __m128 exp2f(__m128 x)
{
    /* ln(2), written out because M_LN2 is an extension and not ISO C. */
    const double ln2 = 0.69314718055994530942;

    /* Clamp from above so that 64 * x stays within int32 for large inputs. */
    __m256d clxd = _mm256_cvtps_pd(_mm_min_ps(_mm_set1_ps(128.0f), x));

    /* Split the exponent into 64 regions: k = round(64 * x), rem = x - k / 64. */
    __m256d scaled = _mm256_mul_pd(_mm256_set1_pd(64.0), clxd);
    __m128i k = _mm256_cvtpd_epi32(scaled);
    __m256d kd = _mm256_cvtepi32_pd(k);
    __m256d rem = _mm256_fmadd_pd(kd, _mm256_set1_pd(-1.0 / 64.0), clxd);

    /* e^t - 1 ~= t + t^2 * (1/2 + t/6), with t = rem * ln 2. */
    __m256d t = _mm256_mul_pd(rem, _mm256_set1_pd(ln2));
    __m256d half_plus = _mm256_fmadd_pd(t, _mm256_set1_pd(1.0 / 6.0), _mm256_set1_pd(0.5));
    __m256d t_sq = _mm256_mul_pd(t, t);
    __m256d expm1 = _mm256_fmadd_pd(t_sq, half_plus, t);

    /* Low six bits of k select the table entry; this also holds for negative k. */
    __m128i idx = _mm_and_si128(k, _mm_set1_epi32(63));
    int i0 = _mm_cvtsi128_si32(idx);
    int i1 = _mm_cvtsi128_si32(_mm_srli_si128(idx, 4));
    int i2 = _mm_cvtsi128_si32(_mm_srli_si128(idx, 8));
    int i3 = _mm_cvtsi128_si32(_mm_srli_si128(idx, 12));
    __m128d coef_lo = _mm_loadh_pd(_mm_load_sd(exp_coeffs + i0), exp_coeffs + i1);
    __m128d coef_hi = _mm_loadh_pd(_mm_load_sd(exp_coeffs + i2), exp_coeffs + i3);
    __m256d coef = _mm256_insertf128_pd(_mm256_castpd128_pd256(coef_lo), coef_hi, 1);

    /* mant = coef * e^t = 2^((k & 63) / 64) * 2^rem */
    __m256d mant = _mm256_fmadd_pd(coef, expm1, coef);

    /* Arithmetic shift gives floor(k / 64), matching the masked table index. */
    __m128i pow_lo = _mm_srai_epi32(k, 6);
    __m128i pow_hi = _mm_srli_si128(pow_lo, 8);

    /* construct base part from exponent: move it to the double exponent field */
    __m128i base_lo = _mm_slli_epi64(_mm_cvtepi32_epi64(pow_lo), 52);
    __m128i base_hi = _mm_slli_epi64(_mm_cvtepi32_epi64(pow_hi), 52);

    /* construct correction part and scale it by the power of two */
    __m128i mant_lo = _mm256_extractf128_si256(_mm256_castpd_si256(mant), 0);
    __m128i mant_hi = _mm256_extractf128_si256(_mm256_castpd_si256(mant), 1);
    __m128d res_lo = _mm_castsi128_pd(_mm_add_epi64(base_lo, mant_lo));
    __m128d res_hi = _mm_castsi128_pd(_mm_add_epi64(base_hi, mant_hi));
    __m256d resultd = _mm256_insertf128_pd(_mm256_castpd128_pd256(res_lo), res_hi, 1);
    __m128 result = _mm256_cvtpd_ps(resultd);

    /*
     * handle underflow: 2^-149 is the smallest float subnormal, and every x in
     * (-150, -149] still rounds to it, so only x <= -150 is forced to zero.
     * The mask also discards lanes whose intermediate integer/exponent
     * arithmetic is meaningless for very negative x.
     */
    __m128 above_underflow = _mm_cmplt_ps(_mm_set1_ps(-150.0f), x);
    __m128 finite_part = _mm_and_ps(result, above_underflow);

    /* handle nans: x + x yields a quiet NaN in lanes where x is NaN */
    __m128 notnan = _mm_cmpeq_ps(x, x);
    __m128 nancase = _mm_add_ps(x, x);
    return _mm_or_ps(_mm_and_ps(notnan, finite_part), _mm_andnot_ps(notnan, nancase));
}

Event Timeline

Sv. Lockal (lockal) changed the title of this paste from untitled to exp2f.
Sv. Lockal (lockal) updated the paste's language from autodetect to c.