DrawManager: Use Compute Shader to Update Hair.

This patch uses compute shaders to create the VBO for hair.
The previous implementation used transform feedback.

Timings before: between 0.000069s and 0.000362s.
Timings after:  between 0.000032s and 0.000092s.

The speedup isn't noticeable to end users. The patch is used to test
the new compute shader pipeline and integrate it with the draw
manager, allowing EEVEE, Workbench and other draw engines to
use compute shaders through the newly introduced `DRW_shgroup_call_compute`
and `DRW_shgroup_vertex_buffer`.

Future improvements are possible by generating the index buffer
of hair directly on the GPU.

NOTE: Compute shaders aren't supported on Apple platforms, which still use
the transform feedback workaround.

Reviewed By: fclem

Differential Revision: https://developer.blender.org/D11057
This commit is contained in:
Jeroen Bakker 2021-05-28 08:16:26 +02:00
parent 4a1ba155d5
commit 6b03621c01
14 changed files with 347 additions and 98 deletions

View File

@ -322,6 +322,7 @@ data_to_c_simple(intern/shaders/common_globals_lib.glsl SRC)
data_to_c_simple(intern/shaders/common_pointcloud_lib.glsl SRC)
data_to_c_simple(intern/shaders/common_hair_lib.glsl SRC)
data_to_c_simple(intern/shaders/common_hair_refine_vert.glsl SRC)
data_to_c_simple(intern/shaders/common_hair_refine_comp.glsl SRC)
data_to_c_simple(intern/shaders/common_math_lib.glsl SRC)
data_to_c_simple(intern/shaders/common_math_geom_lib.glsl SRC)
data_to_c_simple(intern/shaders/common_view_lib.glsl SRC)

View File

@ -438,6 +438,10 @@ void DRW_shgroup_call_range(
void DRW_shgroup_call_instance_range(
DRWShadingGroup *shgroup, Object *ob, struct GPUBatch *geom, uint i_sta, uint i_ct);
/**
 * Record a compute dispatch on \a shgroup.
 *
 * \param groups_x_len, groups_y_len, groups_z_len: Number of work groups to dispatch along each
 * axis. The implementation asserts that all three are greater than zero and that compute
 * shaders are supported.
 */
void DRW_shgroup_call_compute(DRWShadingGroup *shgroup,
int groups_x_len,
int groups_y_len,
int groups_z_len);
void DRW_shgroup_call_procedural_points(DRWShadingGroup *sh, Object *ob, uint point_count);
void DRW_shgroup_call_procedural_lines(DRWShadingGroup *sh, Object *ob, uint line_count);
void DRW_shgroup_call_procedural_triangles(DRWShadingGroup *sh, Object *ob, uint tri_count);
@ -575,6 +579,9 @@ void DRW_shgroup_uniform_vec4_array_copy(DRWShadingGroup *shgroup,
const char *name,
const float (*value)[4],
int arraysize);
/**
 * Attach \a vertex_buffer to \a shgroup so it gets bound as a shader storage buffer.
 *
 * \param name: Name of the shader storage buffer in the shading group's shader. When the name
 * cannot be located the implementation asserts and attaches nothing.
 */
void DRW_shgroup_vertex_buffer(DRWShadingGroup *shgroup,
const char *name,
struct GPUVertBuf *vertex_buffer);
bool DRW_shgroup_is_empty(DRWShadingGroup *shgroup);

View File

@ -243,7 +243,8 @@ static void hair_batch_cache_ensure_procedural_final_points(ParticleHairCache *c
GPUVertFormat format = {0};
GPU_vertformat_attr_add(&format, "pos", GPU_COMP_F32, 4, GPU_FETCH_FLOAT);
cache->final[subdiv].proc_buf = GPU_vertbuf_create_with_format(&format);
cache->final[subdiv].proc_buf = GPU_vertbuf_create_with_format_ex(&format,
GPU_USAGE_DEVICE_ONLY);
/* Create a destination buffer for the transform feedback. Sized appropriately */
/* Those are points! not line segments. */

View File

@ -36,15 +36,28 @@
#include "BKE_duplilist.h"
#include "GPU_batch.h"
#include "GPU_capabilities.h"
#include "GPU_compute.h"
#include "GPU_shader.h"
#include "GPU_texture.h"
#include "GPU_vertex_buffer.h"
#include "draw_hair_private.h"
#ifndef __APPLE__
# define USE_TRANSFORM_FEEDBACK
# define USE_COMPUTE_SHADERS
#endif
/**
 * Whether hair refinement should go through the compute shader path.
 *
 * Always false when `USE_COMPUTE_SHADERS` is not defined at build time; otherwise the answer
 * comes from the GPU capabilities.
 */
BLI_INLINE bool drw_hair_use_compute_shaders(void)
{
  bool use_compute = false;
#ifdef USE_COMPUTE_SHADERS
  use_compute = GPU_compute_shader_support();
#endif
  return use_compute;
}
typedef enum ParticleRefineShader {
PART_REFINE_CATMULL_ROM = 0,
PART_REFINE_MAX_SHADER,
@ -71,38 +84,89 @@ static DRWPass *g_tf_pass; /* XXX can be a problem with multiple DRWManager in t
extern char datatoc_common_hair_lib_glsl[];
extern char datatoc_common_hair_refine_vert_glsl[];
extern char datatoc_common_hair_refine_comp_glsl[];
extern char datatoc_gpu_shader_3D_smooth_color_frag_glsl[];
static GPUShader *hair_refine_shader_get(ParticleRefineShader sh)
/* TODO(jbakker): move shader creation to `draw_shaders` and add test cases. */
/* TODO(jbakker): replace defines with `constexpr` to check compilation on all OSs. Currently the
* APPLE codepath does not compile on other platforms and vice versa. */
#ifdef USE_COMPUTE_SHADERS
static GPUShader *hair_refine_shader_compute_create(ParticleRefineShader UNUSED(refinement))
{
if (g_refine_shaders[sh]) {
return g_refine_shaders[sh];
}
char *vert_with_lib = BLI_string_joinN(datatoc_common_hair_lib_glsl,
datatoc_common_hair_refine_vert_glsl);
#ifdef USE_TRANSFORM_FEEDBACK
const char *var_names[1] = {"finalColor"};
g_refine_shaders[sh] = DRW_shader_create_with_transform_feedback(
vert_with_lib, NULL, "#define HAIR_PHASE_SUBDIV\n", GPU_SHADER_TFB_POINTS, var_names, 1);
#else
g_refine_shaders[sh] = DRW_shader_create(vert_with_lib,
NULL,
datatoc_gpu_shader_3D_smooth_color_frag_glsl,
"#define blender_srgb_to_framebuffer_space(a) a\n"
"#define HAIR_PHASE_SUBDIV\n"
"#define TF_WORKAROUND\n");
GPUShader *sh = NULL;
sh = GPU_shader_create_compute(datatoc_common_hair_refine_comp_glsl,
datatoc_common_hair_lib_glsl,
"#define HAIR_PHASE_SUBDIV\n",
__func__);
return sh;
}
#endif
MEM_freeN(vert_with_lib);
#ifdef USE_TRANSFORM_FEEDBACK
static GPUShader *hair_refine_shader_transform_feedback_create(
ParticleRefineShader UNUSED(refinement))
{
GPUShader *sh = NULL;
return g_refine_shaders[sh];
char *shader_src = BLI_string_joinN(datatoc_common_hair_lib_glsl,
datatoc_common_hair_refine_vert_glsl);
const char *var_names[1] = {"finalColor"};
sh = DRW_shader_create_with_transform_feedback(
shader_src, NULL, "#define HAIR_PHASE_SUBDIV\n", GPU_SHADER_TFB_POINTS, var_names, 1);
MEM_freeN(shader_src);
return sh;
}
#endif
/**
 * Create the refine shader used by the transform feedback workaround.
 *
 * The hair library and the refine vertex source are joined into one vertex stage, which is
 * compiled together with the smooth color fragment stage and the `TF_WORKAROUND` define.
 *
 * \return the created shader, as returned by #DRW_shader_create.
 */
static GPUShader *hair_refine_shader_transform_feedback_workaround_create(
    ParticleRefineShader UNUSED(refinement))
{
  char *vert_with_lib = BLI_string_joinN(datatoc_common_hair_lib_glsl,
                                         datatoc_common_hair_refine_vert_glsl);

  GPUShader *shader = DRW_shader_create(vert_with_lib,
                                        NULL,
                                        datatoc_gpu_shader_3D_smooth_color_frag_glsl,
                                        "#define blender_srgb_to_framebuffer_space(a) a\n"
                                        "#define HAIR_PHASE_SUBDIV\n"
                                        "#define TF_WORKAROUND\n");

  /* The joined source is only needed while creating the shader. */
  MEM_freeN(vert_with_lib);
  return shader;
}
/**
 * Return the refine shader for \a refinement, creating and caching it on first use.
 *
 * Creation is attempted in this order, stopping at the first variant that yields a shader:
 * compute shader (when compiled in and supported), transform feedback (when compiled in),
 * and finally the transform feedback workaround.
 *
 * NOTE(review): when the compute variant fails to create, a transform feedback shader ends up
 * cached while callers still pick their code path from #drw_hair_use_compute_shaders alone;
 * confirm that these two cannot disagree in practice.
 */
static GPUShader *hair_refine_shader_get(ParticleRefineShader refinement)
{
  GPUShader **slot = &g_refine_shaders[refinement];

  if (*slot == NULL) {
#ifdef USE_COMPUTE_SHADERS
    if (drw_hair_use_compute_shaders()) {
      *slot = hair_refine_shader_compute_create(refinement);
    }
#endif

#ifdef USE_TRANSFORM_FEEDBACK
    if (*slot == NULL) {
      *slot = hair_refine_shader_transform_feedback_create(refinement);
    }
#endif

    if (*slot == NULL) {
      *slot = hair_refine_shader_transform_feedback_workaround_create(refinement);
    }
  }

  return *slot;
}
void DRW_hair_init(void)
{
#ifdef USE_TRANSFORM_FEEDBACK
#if defined(USE_TRANSFORM_FEEDBACK) || defined(USE_COMPUTE_SHADERS)
g_tf_pass = DRW_pass_create("Update Hair Pass", 0);
#else
g_tf_pass = DRW_pass_create("Update Hair Pass", DRW_STATE_WRITE_COLOR);
@ -125,6 +189,67 @@ void DRW_hair_init(void)
}
}
/**
 * Attach the inputs shared by every refine path to \a shgrp: the point, strand and strand
 * segment textures of \a cache, plus the strand resolution of subdivision level \a subdiv.
 */
static void drw_hair_particle_cache_shgrp_attach_resources(DRWShadingGroup *shgrp,
                                                           ParticleHairCache *cache,
                                                           const int subdiv)
{
  int *strands_res = &cache->final[subdiv].strands_res;

  DRW_shgroup_uniform_texture(shgrp, "hairPointBuffer", cache->point_tex);
  DRW_shgroup_uniform_texture(shgrp, "hairStrandBuffer", cache->strand_tex);
  DRW_shgroup_uniform_texture(shgrp, "hairStrandSegBuffer", cache->strand_seg_tex);
  /* Passed by reference (not copied), exactly as the other callers of this uniform expect. */
  DRW_shgroup_uniform_int(shgrp, "hairStrandsRes", strands_res, 1);
}
/**
 * Queue the compute dispatches that refine the strands of \a cache into
 * `cache->final[subdiv].proc_buf`.
 *
 * Each dispatch asks for `batch_strands_len` work groups along X, `strands_res` along Y and a
 * single one along Z. Strands are split over several dispatches so that no call asks for more
 * X work groups than #GPU_max_work_group_count reports for axis 0; every dispatch gets its own
 * sub-group carrying the `hairStrandOffset` of its first strand.
 *
 * Nothing is queued when there are no points to generate.
 */
static void drw_hair_particle_cache_update_compute(ParticleHairCache *cache, const int subdiv)
{
  const int strands_len = cache->strands_len;
  const int strands_res = cache->final[subdiv].strands_res;
  const int final_points_len = strands_res * strands_len;
  if (final_points_len <= 0) {
    return;
  }

  GPUShader *shader = hair_refine_shader_get(PART_REFINE_CATMULL_ROM);
  DRWShadingGroup *shgrp = DRW_shgroup_create(shader, g_tf_pass);
  drw_hair_particle_cache_shgrp_attach_resources(shgrp, cache, subdiv);
  DRW_shgroup_vertex_buffer(shgrp, "hairPointOutputBuffer", cache->final[subdiv].proc_buf);

  /* A reported limit of zero (the capability default) or less would make every batch empty, so
   * `strands_start` would never advance and the loop below would never end. Fall back to one
   * strand per dispatch to guarantee forward progress. */
  const int reported_max = GPU_max_work_group_count(0);
  const int max_strands_per_call = (reported_max > 0) ? reported_max : 1;

  int strands_start = 0;
  while (strands_start < strands_len) {
    const int batch_strands_len = MIN2(strands_len - strands_start, max_strands_per_call);
    DRWShadingGroup *subgroup = DRW_shgroup_create_sub(shgrp);
    DRW_shgroup_uniform_int_copy(subgroup, "hairStrandOffset", strands_start);
    DRW_shgroup_call_compute(subgroup, batch_strands_len, strands_res, 1);
    strands_start += batch_strands_len;
  }
}
/**
 * Queue the refinement of \a cache through the transform feedback path (or its workaround when
 * `USE_TRANSFORM_FEEDBACK` is not defined).
 *
 * In the workaround build a #ParticleRefineCall describing the target buffer is pushed onto the
 * front of `g_tf_calls`, and the target size / id offset uniforms are attached by reference.
 *
 * Nothing is queued when there are no points to generate.
 */
static void drw_hair_particle_cache_update_transform_feedback(ParticleHairCache *cache,
                                                              const int subdiv)
{
  const int points_len = cache->final[subdiv].strands_res * cache->strands_len;
  if (points_len <= 0) {
    return;
  }

  GPUShader *refine_shader = hair_refine_shader_get(PART_REFINE_CATMULL_ROM);
  DRWShadingGroup *shgrp;

#ifdef USE_TRANSFORM_FEEDBACK
  shgrp = DRW_shgroup_transform_feedback_create(
      refine_shader, g_tf_pass, cache->final[subdiv].proc_buf);
#else
  shgrp = DRW_shgroup_create(refine_shader, g_tf_pass);

  /* Prepend a record of this refinement to the list of pending calls. */
  ParticleRefineCall *call = MEM_mallocN(sizeof(*call), __func__);
  call->next = g_tf_calls;
  call->vbo = cache->final[subdiv].proc_buf;
  call->shgrp = shgrp;
  call->vert_len = points_len;
  g_tf_calls = call;

  DRW_shgroup_uniform_int(shgrp, "targetHeight", &g_tf_target_height, 1);
  DRW_shgroup_uniform_int(shgrp, "targetWidth", &g_tf_target_width, 1);
  DRW_shgroup_uniform_int(shgrp, "idOffset", &g_tf_id_offset, 1);
#endif

  drw_hair_particle_cache_shgrp_attach_resources(shgrp, cache, subdiv);
  DRW_shgroup_call_procedural_points(shgrp, NULL, points_len);
}
static ParticleHairCache *drw_hair_particle_cache_get(
Object *object, ParticleSystem *psys, ModifierData *md, int subdiv, int thickness_res)
{
@ -140,32 +265,11 @@ static ParticleHairCache *drw_hair_particle_cache_get(
}
if (update) {
int final_points_len = cache->final[subdiv].strands_res * cache->strands_len;
if (final_points_len > 0) {
GPUShader *tf_shader = hair_refine_shader_get(PART_REFINE_CATMULL_ROM);
#ifdef USE_TRANSFORM_FEEDBACK
DRWShadingGroup *tf_shgrp = DRW_shgroup_transform_feedback_create(
tf_shader, g_tf_pass, cache->final[subdiv].proc_buf);
#else
DRWShadingGroup *tf_shgrp = DRW_shgroup_create(tf_shader, g_tf_pass);
ParticleRefineCall *pr_call = MEM_mallocN(sizeof(*pr_call), __func__);
pr_call->next = g_tf_calls;
pr_call->vbo = cache->final[subdiv].proc_buf;
pr_call->shgrp = tf_shgrp;
pr_call->vert_len = final_points_len;
g_tf_calls = pr_call;
DRW_shgroup_uniform_int(tf_shgrp, "targetHeight", &g_tf_target_height, 1);
DRW_shgroup_uniform_int(tf_shgrp, "targetWidth", &g_tf_target_width, 1);
DRW_shgroup_uniform_int(tf_shgrp, "idOffset", &g_tf_id_offset, 1);
#endif
DRW_shgroup_uniform_texture(tf_shgrp, "hairPointBuffer", cache->point_tex);
DRW_shgroup_uniform_texture(tf_shgrp, "hairStrandBuffer", cache->strand_tex);
DRW_shgroup_uniform_texture(tf_shgrp, "hairStrandSegBuffer", cache->strand_seg_tex);
DRW_shgroup_uniform_int(tf_shgrp, "hairStrandsRes", &cache->final[subdiv].strands_res, 1);
DRW_shgroup_call_procedural_points(tf_shgrp, NULL, final_points_len);
if (drw_hair_use_compute_shaders()) {
drw_hair_particle_cache_update_compute(cache, subdiv);
}
else {
drw_hair_particle_cache_update_transform_feedback(cache, subdiv);
}
}
return cache;
@ -367,9 +471,11 @@ void DRW_hair_update(void)
MEM_freeN(data);
GPU_framebuffer_free(fb);
#else
/* TODO(fclem): replace by compute shader. */
/* Just render using transform feedback. */
/* Just render the pass when using compute shaders or transform feedback. */
DRW_draw_pass(g_tf_pass);
if (drw_hair_use_compute_shaders()) {
GPU_memory_barrier(GPU_BARRIER_SHADER_STORAGE);
}
#endif
}

View File

@ -187,6 +187,10 @@ typedef enum {
DRW_CMD_DRAW_INSTANCE = 2,
DRW_CMD_DRAW_INSTANCE_RANGE = 3,
DRW_CMD_DRAW_PROCEDURAL = 4,
/* Compute Commands. */
DRW_CMD_COMPUTE = 8,
/* Other Commands */
DRW_CMD_CLEAR = 12,
DRW_CMD_DRWSTATE = 13,
@ -224,6 +228,12 @@ typedef struct DRWCommandDrawInstanceRange {
uint inst_count;
} DRWCommandDrawInstanceRange;
/** Payload of a #DRW_CMD_COMPUTE command: the work group counts passed to the dispatch. */
typedef struct DRWCommandCompute {
/** Number of work groups along X. */
int groups_x_len;
/** Number of work groups along Y. */
int groups_y_len;
/** Number of work groups along Z. */
int groups_z_len;
} DRWCommandCompute;
typedef struct DRWCommandDrawProcedural {
GPUBatch *batch;
DRWResourceHandle handle;
@ -260,6 +270,7 @@ typedef union DRWCommand {
DRWCommandDrawInstance instance;
DRWCommandDrawInstanceRange instance_range;
DRWCommandDrawProcedural procedural;
DRWCommandCompute compute;
DRWCommandSetMutableState state;
DRWCommandSetStencil stencil;
DRWCommandSetSelectID select_id;
@ -274,6 +285,7 @@ struct DRWCallBuffer {
};
/** Used by #DRWUniform.type */
/* TODO(jbakker): rename to DRW_RESOURCE/DRWResourceType. */
typedef enum {
DRW_UNIFORM_INT = 0,
DRW_UNIFORM_INT_COPY,
@ -286,6 +298,7 @@ typedef enum {
DRW_UNIFORM_BLOCK,
DRW_UNIFORM_BLOCK_REF,
DRW_UNIFORM_TFEEDBACK_TARGET,
DRW_UNIFORM_VERTEX_BUFFER_AS_STORAGE,
/** Per drawcall uniforms/UBO */
DRW_UNIFORM_BLOCK_OBMATS,
DRW_UNIFORM_BLOCK_OBINFOS,

View File

@ -47,6 +47,7 @@
#endif
#include "GPU_buffers.h"
#include "GPU_capabilities.h"
#include "GPU_material.h"
#include "GPU_uniform_buffer.h"
@ -446,6 +447,19 @@ void DRW_shgroup_uniform_vec4_array_copy(DRWShadingGroup *shgroup,
}
}
/**
 * Register \a vertex_buffer on \a shgroup so that it is bound as a shader storage buffer.
 *
 * The binding is looked up by \a name in the shading group's shader. When the lookup yields -1
 * the function asserts and registers nothing.
 */
void DRW_shgroup_vertex_buffer(DRWShadingGroup *shgroup,
                               const char *name,
                               GPUVertBuf *vertex_buffer)
{
  const int ssbo_binding = GPU_shader_get_ssbo(shgroup->shader, name);

  if (ssbo_binding != -1) {
    drw_shgroup_uniform_create_ex(
        shgroup, ssbo_binding, DRW_UNIFORM_VERTEX_BUFFER_AS_STORAGE, vertex_buffer, 0, 0, 1);
  }
  else {
    BLI_assert(false && "Unable to locate binding of shader storage buffer objects.");
  }
}
/** \} */
/* -------------------------------------------------------------------- */
@ -700,6 +714,17 @@ static void drw_command_draw_intance_range(
cmd->inst_count = count;
}
/**
 * Append a #DRW_CMD_COMPUTE command to \a shgroup and fill in its work group counts.
 */
static void drw_command_compute(DRWShadingGroup *shgroup,
                                int groups_x_len,
                                int groups_y_len,
                                int groups_z_len)
{
  DRWCommandCompute *compute_cmd = drw_command_create(shgroup, DRW_CMD_COMPUTE);
  *compute_cmd = (DRWCommandCompute){
      .groups_x_len = groups_x_len,
      .groups_y_len = groups_y_len,
      .groups_z_len = groups_z_len,
  };
}
static void drw_command_draw_procedural(DRWShadingGroup *shgroup,
GPUBatch *batch,
DRWResourceHandle handle,
@ -815,6 +840,17 @@ void DRW_shgroup_call_instance_range(
drw_command_draw_intance_range(shgroup, geom, handle, i_sta, i_ct);
}
/**
 * Add a compute dispatch to \a shgroup.
 *
 * \param groups_x_len, groups_y_len, groups_z_len: Number of work groups along each axis,
 * stored unchanged in the recorded command.
 */
void DRW_shgroup_call_compute(DRWShadingGroup *shgroup,
int groups_x_len,
int groups_y_len,
int groups_z_len)
{
/* Sanity checks: every axis needs at least one work group and the GPU must support compute. */
BLI_assert(groups_x_len > 0 && groups_y_len > 0 && groups_z_len > 0);
BLI_assert(GPU_compute_shader_support());
drw_command_compute(shgroup, groups_x_len, groups_y_len, groups_z_len);
}
static void drw_shgroup_call_procedural_add_ex(DRWShadingGroup *shgroup,
GPUBatch *geom,
Object *ob,

View File

@ -29,6 +29,7 @@
#include "BKE_global.h"
#include "GPU_compute.h"
#include "GPU_platform.h"
#include "GPU_shader.h"
#include "GPU_state.h"
@ -672,6 +673,9 @@ static void draw_update_uniforms(DRWShadingGroup *shgroup,
*use_tfeedback = GPU_shader_transform_feedback_enable(shgroup->shader,
((GPUVertBuf *)uni->pvalue));
break;
case DRW_UNIFORM_VERTEX_BUFFER_AS_STORAGE:
GPU_vertbuf_bind_as_ssbo((GPUVertBuf *)uni->pvalue, uni->location);
break;
/* Legacy/Fallback support. */
case DRW_UNIFORM_BASE_INSTANCE:
state->baseinst_loc = uni->location;
@ -1050,6 +1054,12 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
cmd->instance_range.inst_count,
false);
break;
case DRW_CMD_COMPUTE:
GPU_compute_dispatch(shgroup->shader,
cmd->compute.groups_x_len,
cmd->compute.groups_y_len,
cmd->compute.groups_z_len);
break;
}
}

View File

@ -28,6 +28,9 @@ uniform bool hairCloseTip = true;
uniform vec4 hairDupliMatrix[4];
/* Strand batch offset when used in compute shaders. */
uniform int hairStrandOffset = 0;
/* -- Per control points -- */
uniform samplerBuffer hairPointBuffer; /* RGBA32F */
#define point_position xyz
@ -43,13 +46,37 @@ uniform usamplerBuffer hairStrandSegBuffer; /* R16UI */
/* -- Subdivision stage -- */
/**
* We use a transform feedback to preprocess the strands and add more subdivision to it.
* For the moment these are simple smooth interpolation but one could hope to see the full
* We use a transform feedback or compute shader to preprocess the strands and add more subdivision
* to it. For the moment these are simple smooth interpolation but one could hope to see the full
* children particle modifiers being evaluated at this stage.
*
* If no more subdivision is needed, we can skip this step.
*/
#ifdef GPU_VERTEX_SHADER
/* Vertex stage: position of the current point along its strand, derived from the vertex index.
 * The first point of a strand maps to 0 and the last one (`hairStrandsRes - 1`) maps to 1. */
float hair_get_local_time()
{
  int point_in_strand = gl_VertexID % hairStrandsRes;
  int last_point = hairStrandsRes - 1;
  return float(point_in_strand) / float(last_point);
}
/* Vertex stage: index of the strand the current vertex belongs to, with `hairStrandsRes`
 * consecutive vertices per strand. */
int hair_get_id()
{
  int strand_index = gl_VertexID / hairStrandsRes;
  return strand_index;
}
#endif
#ifdef GPU_COMPUTE_SHADER
/* Compute stage: position of the current point along its strand. The Y invocation index selects
 * the point; index 0 maps to 0 and index `hairStrandsRes - 1` maps to 1. */
float hair_get_local_time()
{
  uint point_in_strand = gl_GlobalInvocationID.y;
  int last_point = hairStrandsRes - 1;
  return float(point_in_strand) / float(last_point);
}
/* Compute stage: index of the strand handled by this invocation. The X invocation index is
 * relative to the current dispatch, so `hairStrandOffset` is added to it. */
int hair_get_id()
{
  int strand_in_batch = int(gl_GlobalInvocationID.x);
  return strand_in_batch + hairStrandOffset;
}
#endif
#ifdef HAIR_PHASE_SUBDIV
int hair_get_base_id(float local_time, int strand_segments, out float interp_time)
{
@ -64,9 +91,9 @@ int hair_get_base_id(float local_time, int strand_segments, out float interp_tim
void hair_get_interp_attrs(
out vec4 data0, out vec4 data1, out vec4 data2, out vec4 data3, out float interp_time)
{
float local_time = float(gl_VertexID % hairStrandsRes) / float(hairStrandsRes - 1);
float local_time = hair_get_local_time();
int hair_id = gl_VertexID / hairStrandsRes;
int hair_id = hair_get_id();
int strand_offset = int(texelFetch(hairStrandBuffer, hair_id).x);
int strand_segments = int(texelFetch(hairStrandSegBuffer, hair_id).x);
@ -96,6 +123,7 @@ void hair_get_interp_attrs(
*/
#if !defined(HAIR_PHASE_SUBDIV) && defined(GPU_VERTEX_SHADER)
int hair_get_strand_id(void)
{
return gl_VertexID / (hairStrandsRes * hairThicknessRes);
@ -227,3 +255,45 @@ vec2 hair_resolve_barycentric(vec2 vert_barycentric)
return vec2(1.0 - vert_barycentric.x, 0.0);
}
}
/* Hair interpolation functions. */
/* Return the four interpolation weights (x, y, z, w: one per control value) for parameter `t`.
 * The factor `fc` is 0.71 when CARDINAL is defined and 0.5 otherwise.
 * Expanding the polynomials shows the four weights sum to 1 for any `t` and `fc`. */
vec4 hair_get_weights_cardinal(float t)
{
float t2 = t * t;
float t3 = t2 * t;
#if defined(CARDINAL)
float fc = 0.71;
#else /* defined(CATMULL_ROM) */
float fc = 0.5;
#endif
vec4 weights;
/* GLSL Optimized version of key_curve_position_weights() */
/* Powers of `t` pre-multiplied by `fc`, shared by the four weight polynomials below. */
float fct = t * fc;
float fct2 = t2 * fc;
float fct3 = t3 * fc;
weights.x = (fct2 * 2.0 - fct3) - fct;
weights.y = (t3 * 2.0 - fct3) + (-t2 * 3.0 + fct2) + 1.0;
weights.z = (-t3 * 2.0 + fct3) + (t2 * 3.0 - (2.0 * fct2)) + fct;
weights.w = fct3 - fct2;
return weights;
}
/* Return the four interpolation weights (x, y, z, w: one per control value) for parameter `t`.
 * NOTE(review): expanded, the polynomials below equal the uniform cubic B-spline basis
 * functions and sum to 1, so the bug mentioned in the TODO is presumably not in the weights
 * themselves -- confirm against the callers. */
/* TODO(fclem): This one is buggy, find why. (it's not the optimization!!) */
vec4 hair_get_weights_bspline(float t)
{
float t2 = t * t;
float t3 = t2 * t;
vec4 weights;
/* GLSL Optimized version of key_curve_position_weights() */
/* x and z share the same t^2 and constant terms, so they are computed together as a vec2. */
weights.xz = vec2(-0.16666666, -0.5) * t3 + (0.5 * t2 + 0.5 * vec2(-t, t) + 0.16666666);
weights.y = (0.5 * t3 - t2 + 0.66666666);
weights.w = (0.16666666 * t3);
return weights;
}
/* Weighted sum of the four control values: `v0..v3` scaled by `w.x..w.w` respectively. */
vec4 hair_interp_data(vec4 v0, vec4 v1, vec4 v2, vec4 v3, vec4 w)
{
  /* Accumulate left to right, in the same order as a single chained expression. */
  vec4 blended = v0 * w.x;
  blended += v1 * w.y;
  blended += v2 * w.z;
  blended += v3 * w.w;
  return blended;
}

View File

@ -0,0 +1,24 @@
/*
* To be compiled with common_hair_lib.glsl.
*/
layout(local_size_x = 1, local_size_y = 1) in;
layout(std430, binding = 0) writeonly buffer hairPointOutputBuffer
{
vec4 posTime[];
}
out_vertbuf;
void main(void)
{
float interp_time;
vec4 data0, data1, data2, data3;
hair_get_interp_attrs(data0, data1, data2, data3, interp_time);
vec4 weights = hair_get_weights_cardinal(interp_time);
vec4 result = hair_interp_data(data0, data1, data2, data3, weights);
uint index = uint(hair_get_id() * hairStrandsRes) + gl_GlobalInvocationID.y;
out_vertbuf.posTime[index] = result;
}

View File

@ -3,47 +3,6 @@
out vec4 finalColor;
/* Return the four interpolation weights (x, y, z, w: one per control value) for parameter `t`.
 * The factor `fc` is 0.71 when CARDINAL is defined and 0.5 otherwise.
 * Expanding the polynomials shows the four weights sum to 1 for any `t` and `fc`. */
vec4 get_weights_cardinal(float t)
{
float t2 = t * t;
float t3 = t2 * t;
#if defined(CARDINAL)
float fc = 0.71;
#else /* defined(CATMULL_ROM) */
float fc = 0.5;
#endif
vec4 weights;
/* GLSL Optimized version of key_curve_position_weights() */
/* Powers of `t` pre-multiplied by `fc`, shared by the four weight polynomials below. */
float fct = t * fc;
float fct2 = t2 * fc;
float fct3 = t3 * fc;
weights.x = (fct2 * 2.0 - fct3) - fct;
weights.y = (t3 * 2.0 - fct3) + (-t2 * 3.0 + fct2) + 1.0;
weights.z = (-t3 * 2.0 + fct3) + (t2 * 3.0 - (2.0 * fct2)) + fct;
weights.w = fct3 - fct2;
return weights;
}
/* Return the four interpolation weights (x, y, z, w: one per control value) for parameter `t`.
 * NOTE(review): expanded, the polynomials below equal the uniform cubic B-spline basis
 * functions and sum to 1, so the bug mentioned in the TODO is presumably not in the weights
 * themselves -- confirm against the callers. */
/* TODO(fclem): This one is buggy, find why. (it's not the optimization!!) */
vec4 get_weights_bspline(float t)
{
float t2 = t * t;
float t3 = t2 * t;
vec4 weights;
/* GLSL Optimized version of key_curve_position_weights() */
/* x and z share the same t^2 and constant terms, so they are computed together as a vec2. */
weights.xz = vec2(-0.16666666, -0.5) * t3 + (0.5 * t2 + 0.5 * vec2(-t, t) + 0.16666666);
weights.y = (0.5 * t3 - t2 + 0.66666666);
weights.w = (0.16666666 * t3);
return weights;
}
/* Weighted sum of the four control values: `v0..v3` scaled by `w.x..w.w` respectively. */
vec4 interp_data(vec4 v0, vec4 v1, vec4 v2, vec4 v3, vec4 w)
{
  /* Accumulate left to right, in the same order as a single chained expression. */
  vec4 blended = v0 * w.x;
  blended += v1 * w.y;
  blended += v2 * w.z;
  blended += v3 * w.w;
  return blended;
}
#ifdef TF_WORKAROUND
uniform int targetWidth;
uniform int targetHeight;
@ -56,8 +15,8 @@ void main(void)
vec4 data0, data1, data2, data3;
hair_get_interp_attrs(data0, data1, data2, data3, interp_time);
vec4 weights = get_weights_cardinal(interp_time);
finalColor = interp_data(data0, data1, data2, data3, weights);
vec4 weights = hair_get_weights_cardinal(interp_time);
finalColor = hair_interp_data(data0, data1, data2, data3, weights);
#ifdef TF_WORKAROUND
int id = gl_VertexID - idOffset;

View File

@ -37,6 +37,8 @@ int GPU_max_textures(void);
int GPU_max_textures_vert(void);
int GPU_max_textures_geom(void);
int GPU_max_textures_frag(void);
int GPU_max_work_group_count(int index);
int GPU_max_work_group_size(int index);
int GPU_max_uniforms_vert(void);
int GPU_max_uniforms_frag(void);
int GPU_max_batch_indices(void);

View File

@ -82,6 +82,16 @@ int GPU_max_textures(void)
return GCaps.max_textures;
}
/**
 * Maximum number of compute work groups that can be dispatched along one axis.
 *
 * \param index: Axis to query (0, 1 or 2).
 * \return the stored capability for that axis, or 0 when \a index is not a valid axis. The
 * stored values default to 0 and are only filled in when compute shaders are supported.
 */
int GPU_max_work_group_count(int index)
{
  /* Reject out-of-range axes instead of reading past the end of the capability array. */
  const int axis_len = int(sizeof(GCaps.max_work_group_count) /
                           sizeof(GCaps.max_work_group_count[0]));
  if (index < 0 || index >= axis_len) {
    return 0;
  }
  return GCaps.max_work_group_count[index];
}
/**
 * Maximum size of a compute work group along one axis.
 *
 * \param index: Axis to query (0, 1 or 2).
 * \return the stored capability for that axis, or 0 when \a index is not a valid axis. The
 * stored values default to 0 and are only filled in when compute shaders are supported.
 */
int GPU_max_work_group_size(int index)
{
  /* Reject out-of-range axes instead of reading past the end of the capability array. */
  const int axis_len = int(sizeof(GCaps.max_work_group_size) /
                           sizeof(GCaps.max_work_group_size[0]));
  if (index < 0 || index >= axis_len) {
    return 0;
  }
  return GCaps.max_work_group_size[index];
}
int GPU_max_uniforms_vert(void)
{
return GCaps.max_uniforms_vert;

View File

@ -41,6 +41,8 @@ struct GPUCapabilities {
int max_textures_vert = 0;
int max_textures_geom = 0;
int max_textures_frag = 0;
int max_work_group_count[3] = {0, 0, 0};
int max_work_group_size[3] = {0, 0, 0};
int max_uniforms_vert = 0;
int max_uniforms_frag = 0;
int max_batch_indices = 0;

View File

@ -438,6 +438,14 @@ void GLBackend::capabilities_init()
GCaps.mem_stats_support = GLEW_NVX_gpu_memory_info || GLEW_ATI_meminfo;
GCaps.shader_image_load_store_support = GLEW_ARB_shader_image_load_store;
GCaps.compute_shader_support = GLEW_ARB_compute_shader;
if (GCaps.compute_shader_support) {
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &GCaps.max_work_group_count[0]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &GCaps.max_work_group_count[1]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 2, &GCaps.max_work_group_count[2]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &GCaps.max_work_group_size[0]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &GCaps.max_work_group_size[1]);
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &GCaps.max_work_group_size[2]);
}
GCaps.shader_storage_buffer_objects_support = GLEW_ARB_shader_storage_buffer_object;
/* GL specific capabilities. */
glGetIntegerv(GL_MAX_3D_TEXTURE_SIZE, &GLContext::max_texture_3d_size);