Cycles: Optimization for CMJ in CUDA kernels

Two things:
- Use intrinsics for clz/ctz (ctz is implemented via ffs()).
- Use a faster sqrt() function whose precision is sufficient for
  integer values.
This commit is contained in:
Sergey Sharybin 2015-03-13 12:14:43 +05:00
parent aa4cb95a5c
commit 61eab743f1
Notes: blender-bot 2023-02-14 09:24:53 +01:00
Referenced by issue #43911, Cloth Physics Bending
1 changed file with 8 additions and 0 deletions

View File

@@ -47,6 +47,8 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b)
# else
return a >> __builtin_ctz(b);
# endif
#elif defined(__KERNEL_CUDA__)
return a >> (__ffs(b) - 1);
#else
return a/b;
#endif
@@ -63,6 +65,8 @@ ccl_device_inline uint cmj_w_mask(uint w)
# else
return ((1 << (32 - __builtin_clz(w))) - 1);
# endif
#elif defined(__KERNEL_CUDA__)
return ((1 << (32 - __clz(w))) - 1);
#else
w |= w >> 1;
w |= w >> 2;
@@ -167,7 +171,11 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
{
kernel_assert(s < N);
#if defined(__KERNEL_CUDA__)
int m = float_to_int(__fsqrt_ru(N));
#else
int m = float_to_int(sqrtf(N));
#endif
int n = (N + m - 1)/m;
float invN = 1.0f/N;
float invm = 1.0f/m;