Eevee: Attempt to optimize GTAO shader.

Unroll horizon search loop. Use fast version of acos.
On NVIDIA (Linux), unrolling the second loop results in very high compilation times.
This commit is contained in:
Clément Foucault 2017-06-23 02:52:15 +02:00
parent fbffd6d364
commit 3a243ad83f
2 changed files with 145 additions and 68 deletions

View File

@ -5,28 +5,21 @@
#define MAX_PHI_STEP 32
/* NOTICE : this is multiplied by 2 */
#define MAX_THETA_STEP 6.0
#define MAX_THETA_STEP 12
uniform sampler2D minMaxDepthTex;
uniform float aoDistance;
uniform float aoSamples;
uniform float aoFactor;
float sample_depth(vec2 co, int level)
float get_max_horizon(vec2 co, vec3 x, float h, float lod)
{
return textureLod(minMaxDepthTex, co, float(level)).g;
}
float get_max_horizon(vec2 co, vec3 x, float h, float step)
{
if (co.x > 1.0 || co.x < 0.0 || co.y > 1.0 || co.y < 0.0)
return h;
float depth = sample_depth(co, int(step));
float depth = textureLod(minMaxDepthTex, co, floor(lod)).g;
/* Background case */
if (depth == 1.0)
return h;
/* this is really slow and is only a problem
* if the far clip plane is near enough to notice */
// depth += step(1.0, depth) * 1e20;
vec3 s = get_view_space_from_depth(co, depth); /* s View coordinate */
vec3 omega_s = s - x;
@ -39,6 +32,124 @@ float get_max_horizon(vec2 co, vec3 x, float h, float step)
return mix(h, max_h, blend);
}
/**
 * Perform a single step of the horizon search along one slice direction.
 *
 * The marching offset is doubled, one sample is taken on each side of the
 * search origin, and both horizon values are updated through
 * get_max_horizon(). The lod counter is then advanced by half a level.
 *
 * t_phi:       screen space direction of the slice.
 * x:           position forwarded to get_max_horizon().
 * x_:          screen coordinate the search is centered on.
 * rand:        scalar factor applied to the offset.
 * pixel_ratio: per-axis factor applied to the offset.
 * j:           lod forwarded to get_max_horizon(); incremented by 0.5 on exit.
 * ofs:         marching offset; doubled on entry.
 * h1, h2:      horizon values for the positive / negative direction.
 */
void search_step(
        vec2 t_phi, vec3 x, vec2 x_, float rand, vec2 pixel_ratio,
        inout float j, inout float ofs, inout float h1, inout float h2)
{
	/* Step size is doubled each iteration. */
	ofs *= 2.0;

	/* s^ Screen coordinate */
	vec2 delta = t_phi * ofs * rand * pixel_ratio;

	/* Sample both sides of the slice with the same lod. */
	h1 = get_max_horizon(x_ + delta, x, h1, j);
	h2 = get_max_horizon(x_ - delta, x, h2, j);

	j += 0.5;
}
/**
 * Search the horizon values h1 and h2 along one slice direction.
 *
 * Runs up to twelve search_step() calls, starting from an offset of
 * 1.5 * pixel_len (doubled by every step) and a lod of 0.0. The search
 * stops early as soon as the offset exceeds max_dist.
 *
 * The loop is written out by hand; the disabled block below shows the
 * equivalent loop form and the two must be kept in sync.
 */
void search_horizon(
        vec2 t_phi, vec3 x, vec2 x_, float rand,
        float max_dist, vec2 pixel_ratio, float pixel_len,
        inout float h1, inout float h2)
{
	float lod = 0.0;
	float offset = 1.5 * pixel_len;

#if 0 /* Reference loop, manually unrolled below. */
	for (int i = 0; i < MAX_THETA_STEP; i++) {
		search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
		if (offset > max_dist) {
			return;
		}
	}
#endif

	/* Step 1 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 2 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 3 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 4 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 5 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 6 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 7 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 8 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 9 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 10 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 11 */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
	if (offset > max_dist) {
		return;
	}
	/* Step 12 (last one, nothing left to skip). */
	search_step(t_phi, x, x_, rand, pixel_ratio, lod, offset, h1, h2);
}
/**
 * Integrate the visibility of one slice and accumulate it.
 *
 * The slice direction is derived from the slice index `iter`, the
 * `aoSamples` uniform and `noise.r`. Horizons are searched on both sides
 * of the slice, converted to angles, clamped around the projected normal
 * and fed to the inner integral. The result is added to `visibility`, and,
 * when USE_BENT_NORMAL is defined, a direction weighted by that result is
 * added to `bent_normal`.
 *
 * iter:        index of the slice being integrated.
 * x, x_:       position / screen coordinate forwarded to search_horizon().
 * normal:      normal that gets projected onto the integration plane.
 * noise:       .r jitters the slice angle, .g is forwarded to the search.
 * max_dist, pixel_ratio, pixel_len: forwarded to search_horizon().
 * visibility:  accumulator, incremented by this slice's contribution.
 * bent_normal: accumulator, only written when USE_BENT_NORMAL is defined.
 */
void integrate_slice(
        float iter, vec3 x, vec3 normal, vec2 x_, vec2 noise,
        float max_dist, vec2 pixel_ratio, float pixel_len,
        inout float visibility, inout vec3 bent_normal)
{
	/* Rotate with random direction to get jittered result. */
	float slice_angle = M_PI * ((noise.r + iter) / aoSamples);
	/* Screen space direction */
	vec2 t_phi = vec2(cos(slice_angle), sin(slice_angle));

	/* Search maximum horizon angles h1 and h2, both initialized at cos(pi). */
	float h1 = -1.0;
	float h2 = -1.0;
	search_horizon(t_phi, x, x_, noise.g, max_dist, pixel_ratio, pixel_len, h1, h2);

	/* (Slide 54) */
	h1 = -fast_acos(h1);
	h2 = fast_acos(h2);

	/* Projecting Normal to Plane P defined by t_phi and omega_o */
	vec3 plane_nor = vec3(t_phi.y, -t_phi.x, 0.0); /* Normal vector to Integration plane */
	vec3 tangent = vec3(-t_phi, 0.0);
	vec3 n_proj = normal - plane_nor * dot(plane_nor, normal);
	/* Lower bound keeps the division below away from zero. */
	float n_proj_len = max(1e-16, length(n_proj));

	/* Clamping thetas (slide 58) */
	float cos_n = clamp(n_proj.z / n_proj_len, -1.0, 1.0);
	/* Angle between view vec and normal */
	float n = sign(dot(n_proj, tangent)) * fast_acos(cos_n);
	h1 = n + max(h1 - n, -M_PI_2);
	h2 = n + min(h2 - n, M_PI_2);

	/* Solving inner integral */
	float sin_n = sin(n);
	float h1_2 = 2.0 * h1;
	float h2_2 = 2.0 * h2;
	float arc1 = -cos(h1_2 - n) + cos_n + h1_2 * sin_n;
	float arc2 = -cos(h2_2 - n) + cos_n + h2_2 * sin_n;
	float vd = arc1 + arc2;
	vd *= 0.25 * n_proj_len;

	visibility += vd;

#ifdef USE_BENT_NORMAL
	/* Finding Bent normal */
	float b_angle = (h1 + h2) / 2.0;
	/* The 0.5 factor below is here to balance the accumulated vectors.
	 * (sin(b_angle) * -t_phi) will accumulate to (phi_step * result_nor.xy * 0.5).
	 * (cos(b_angle) * 0.5) will accumulate to (phi_step * result_nor.z * 0.5). */
	/* Weight sample by vd */
	bent_normal += vec3(sin(b_angle) * -t_phi, cos(b_angle) * 0.5) * vd;
#endif
}
void gtao(vec3 normal, vec3 position, vec2 noise, out float visibility
#ifdef USE_BENT_NORMAL
, out vec3 bent_normal
@ -66,62 +177,12 @@ void gtao(vec3 normal, vec3 position, vec2 noise, out float visibility
visibility = 0.0;
#ifdef USE_BENT_NORMAL
bent_normal = vec3(0.0);
#else
vec3 bent_normal = vec3(0.0);
#endif
for (float i = 0.0; i < aoSamples && i < MAX_PHI_STEP; i++) {
float phi = M_PI * ((noise.r + i) / aoSamples);
/* Rotate with random direction to get jittered result. */
vec2 t_phi = vec2(cos(phi), sin(phi)); /* Screen space direction */
/* Search maximum horizon angles h1 and h2 */
float h1 = -1.0, h2 = -1.0; /* init at cos(pi) */
float ofs = 1.5 * pixel_len;
for (float j = 0.0; ofs < max_dist && j < MAX_THETA_STEP; j += 0.5) {
ofs += ofs; /* Step size is doubled each iteration */
vec2 s_ = t_phi * ofs * noise.g * pixel_ratio; /* s^ Screen coordinate */
vec2 co;
co = x_ + s_;
h1 = get_max_horizon(co, x, h1, j);
co = x_ - s_;
h2 = get_max_horizon(co, x, h2, j);
}
/* (Slide 54) */
h1 = -acos(h1);
h2 = acos(h2);
/* Projecting Normal to Plane P defined by t_phi and omega_o */
vec3 h = vec3(t_phi.y, -t_phi.x, 0.0); /* Normal vector to Integration plane */
vec3 t = vec3(-t_phi, 0.0);
vec3 n_proj = normal - h * dot(h, normal);
float n_proj_len = max(1e-16, length(n_proj));
/* Clamping thetas (slide 58) */
float cos_n = clamp(n_proj.z / n_proj_len, -1.0, 1.0);
float n = sign(dot(n_proj, t)) * acos(cos_n); /* Angle between view vec and normal */
h1 = n + max(h1 - n, -M_PI_2);
h2 = n + min(h2 - n, M_PI_2);
/* Solving inner integral */
float sin_n = sin(n);
float h1_2 = 2.0 * h1;
float h2_2 = 2.0 * h2;
float vd = (-cos(h1_2 - n) + cos_n + h1_2 * sin_n) + (-cos(h2_2 - n) + cos_n + h2_2 * sin_n);
vd *= 0.25 * n_proj_len;
visibility += vd;
#ifdef USE_BENT_NORMAL
/* Finding Bent normal */
float b_angle = (h1 + h2) / 2.0;
/* The 0.5 factor below is here to equilibrate the accumulated vectors.
* (sin(b_angle) * -t_phi) will accumulate to (phi_step * result_nor.xy * 0.5).
* (cos(b_angle) * 0.5) will accumulate to (phi_step * result_nor.z * 0.5). */
/* Weight sample by vd */
bent_normal += vec3(sin(b_angle) * -t_phi, cos(b_angle) * 0.5) * vd;
#endif
for (float i = 0.0; i < MAX_PHI_STEP; i++) {
if (i >= aoSamples) break;
integrate_slice(i, x, normal, x_, noise, max_dist, pixel_ratio, pixel_len, visibility, bent_normal);
}
visibility = clamp(visibility / aoSamples, 1e-8, 1.0);

View File

@ -146,6 +146,22 @@ float distance_squared(vec3 a, vec3 b) { a -= b; return dot(a, a); }
/* Reciprocal of the length of V, never lower than 1e-8. */
float inverse_distance(vec3 V)
{
	float inv_len = 1.0 / length(V);
	return max(inv_len, 1e-8);
}
/* ------- Fast Math ------- */
/* [Drobot2014a] Low Level Optimizations for GCN */
/* Square root approximation working on the float's bit pattern:
 * the bits are shifted right by one and a constant is added.
 * [Drobot2014a] Low Level Optimizations for GCN */
float fast_sqrt(float x)
{
	int bits = floatBitsToInt(x);
	return intBitsToFloat((bits >> 1) + 0x1fbd1df5);
}
/* [Eberly2014] GPGPU Programming for Games and Science */
/* Arc cosine approximation: a linear term in |x| multiplied by
 * fast_sqrt(1 - |x|), mirrored around M_PI for negative inputs.
 * [Eberly2014] GPGPU Programming for Games and Science */
float fast_acos(float x)
{
	float ax = abs(x);
	float res = -0.156583 * ax + M_PI_2;
	res *= fast_sqrt(1.0 - ax);
	if (x >= 0.0) {
		return res;
	}
	return M_PI - res;
}
float line_plane_intersect_dist(vec3 lineorigin, vec3 linedirection, vec3 planeorigin, vec3 planenormal)
{
return dot(planenormal, planeorigin - lineorigin) / dot(planenormal, linedirection);