Cycles: reduce closure memory usage for emission/shadow shader data.

With a Titan Xp, reduces path trace local memory from 1092MB to 840MB.
Benchmark performance was within 1% with both RX 480 and Titan Xp.

Original patch was implemented by Sergey.

Differential Revision: https://developer.blender.org/D2249
This commit is contained in:
Brecht Van Lommel 2017-11-01 21:02:28 +01:00
parent c571be4e05
commit 8a72be7697
Notes: blender-bot 2023-02-14 06:25:23 +01:00
Referenced by issue #53249, [regression] OpenCL performance becomes very random with big scenes.
20 changed files with 80 additions and 61 deletions

View File

@ -20,17 +20,16 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
{
kernel_assert(size <= sizeof(ShaderClosure));
int num_closure = sd->num_closure;
int num_closure_extra = sd->num_closure_extra;
if(num_closure + num_closure_extra >= MAX_CLOSURE)
if(sd->num_closure_left == 0)
return NULL;
ShaderClosure *sc = &sd->closure[num_closure];
ShaderClosure *sc = &sd->closure[sd->num_closure];
sc->type = type;
sc->weight = weight;
sd->num_closure++;
sd->num_closure_left--;
return sc;
}
@ -44,18 +43,16 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
* This lets us keep the same fast array iteration over closures, as we
* found linked list iteration and iteration with skipping to be slower. */
int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
int num_closure = sd->num_closure;
int num_closure_extra = sd->num_closure_extra + num_extra;
if(num_closure + num_closure_extra > MAX_CLOSURE) {
if(num_extra > sd->num_closure_left) {
/* Remove previous closure. */
sd->num_closure--;
sd->num_closure_extra++;
sd->num_closure_left++;
return NULL;
}
sd->num_closure_extra = num_closure_extra;
return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra);
sd->num_closure_left -= num_extra;
return (ccl_addr_space void*)(sd->closure + sd->num_closure + sd->num_closure_left);
}
ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)

View File

@ -51,7 +51,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
/* evaluate surface shader */
shader_eval_surface(kg, sd, &state, state.flag);
shader_eval_surface(kg, sd, &state, state.flag, MAX_CLOSURE);
/* TODO, disable more closures we don't need besides transparent */
shader_bsdf_disable_transparency(kg, sd);
@ -239,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
}
else {
/* surface color of the pass only */
shader_eval_surface(kg, sd, state, 0);
shader_eval_surface(kg, sd, state, 0, MAX_CLOSURE);
return kernel_bake_shader_bsdf(kg, sd, type);
}
}
else {
shader_eval_surface(kg, sd, state, 0);
shader_eval_surface(kg, sd, state, 0, MAX_CLOSURE);
color = kernel_bake_shader_bsdf(kg, sd, type);
}
@ -337,7 +337,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
{
float3 N = sd.N;
if((sd.flag & SD_HAS_BUMP)) {
shader_eval_surface(kg, &sd, &state, 0);
shader_eval_surface(kg, &sd, &state, 0, MAX_CLOSURE);
N = shader_bsdf_average_normal(kg, &sd);
}
@ -352,7 +352,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
case SHADER_EVAL_EMISSION:
{
shader_eval_surface(kg, &sd, &state, 0);
shader_eval_surface(kg, &sd, &state, 0, 0);
out = shader_emissive_eval(kg, &sd);
break;
}

View File

@ -70,14 +70,11 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
/* no path flag, we're evaluating this for all closures. that's weak but
* we'd have to do multiple evaluations otherwise */
path_state_modify_bounce(state, true);
shader_eval_surface(kg, emission_sd, state, 0);
shader_eval_surface(kg, emission_sd, state, 0, 0);
path_state_modify_bounce(state, false);
/* evaluate emissive closure */
if(emission_sd->flag & SD_EMISSION)
eval = shader_emissive_eval(kg, emission_sd);
else
eval = make_float3(0.0f, 0.0f, 0.0f);
eval = shader_emissive_eval(kg, emission_sd);
}
eval *= ls->eval_fac;

View File

@ -443,7 +443,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
sd,
&isect,
ray);
shader_eval_surface(kg, sd, state, state->flag);
shader_eval_surface(kg, sd, state, state->flag, MAX_CLOSURE);
shader_prepare_closures(sd, state);
/* Apply shadow catcher, holdout, emission. */
@ -561,7 +561,7 @@ ccl_device_forceinline void kernel_path_integrate(
bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
/* Find intersection with lamps and compute emission for MIS. */
kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L);
#ifdef __VOLUME__
/* Volume integration. */
@ -585,7 +585,7 @@ ccl_device_forceinline void kernel_path_integrate(
/* Shade background. */
if(!hit) {
kernel_path_background(kg, state, ray, throughput, emission_sd, L);
kernel_path_background(kg, state, ray, throughput, &sd, L);
break;
}
else if(path_state_ao_bounce(kg, state)) {
@ -594,7 +594,7 @@ ccl_device_forceinline void kernel_path_integrate(
/* Setup and evaluate shader. */
shader_setup_from_ray(kg, &sd, &isect, ray);
shader_eval_surface(kg, &sd, state, state->flag);
shader_eval_surface(kg, &sd, state, state->flag, MAX_CLOSURE);
shader_prepare_closures(&sd, state);
/* Apply shadow catcher, holdout, emission. */
@ -706,9 +706,11 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
PathRadiance L;
path_radiance_init(&L, kernel_data.film.use_light_pass);
ShaderData emission_sd;
ShaderDataTinyStorage emission_sd_storage;
ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
PathState state;
path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
/* Integrate. */
kernel_path_integrate(kg,
@ -717,7 +719,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
&ray,
&L,
buffer,
&emission_sd);
emission_sd);
kernel_write_result(kg, buffer, sample, &L);
}

View File

@ -436,10 +436,12 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
/* shader data used by emission, shadows, volume stacks, indirect path */
ShaderData emission_sd, indirect_sd;
ShaderDataTinyStorage emission_sd_storage;
ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
ShaderData indirect_sd;
PathState state;
path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
/* Main Loop
* Here we only handle transparency intersections from the camera ray.
@ -460,7 +462,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
&isect,
hit,
&indirect_sd,
&emission_sd,
emission_sd,
L);
#endif /* __VOLUME__ */
@ -472,7 +474,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* Setup and evaluate shader. */
shader_setup_from_ray(kg, &sd, &isect, &ray);
shader_eval_surface(kg, &sd, &state, state.flag);
shader_eval_surface(kg, &sd, &state, state.flag, MAX_CLOSURE);
shader_merge_closures(&sd);
/* Apply shadow catcher, holdout, emission. */
@ -481,7 +483,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
&state,
&ray,
throughput,
&emission_sd,
emission_sd,
L,
buffer))
{
@ -513,14 +515,14 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput);
kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput);
}
#endif /* __AO__ */
#ifdef __SUBSURFACE__
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, emission_sd,
L, &state, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@ -534,13 +536,13 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg,
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
&sd, emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg,
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
&sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);

View File

@ -955,10 +955,10 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
ccl_addr_space PathState *state, int path_flag)
ccl_addr_space PathState *state, int path_flag, int max_closure)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
sd->num_closure_left = max_closure;
#ifdef __OSL__
if(kg->osl)
@ -988,7 +988,7 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
ccl_addr_space PathState *state, int path_flag)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
sd->num_closure_left = 0;
#ifdef __SVM__
# ifdef __OSL__
@ -1129,12 +1129,13 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
ShaderData *sd,
ccl_addr_space PathState *state,
ccl_addr_space VolumeStack *stack,
int path_flag)
int path_flag,
int max_closure)
{
/* reset closures once at the start, we will be accumulating the closures
* for all volumes in the stack into a single array of closures */
sd->num_closure = 0;
sd->num_closure_extra = 0;
sd->num_closure_left = max_closure;
sd->flag = 0;
sd->object_flag = 0;
@ -1184,7 +1185,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
sd->num_closure_left = 0;
/* this will modify sd->P */
#ifdef __SVM__

View File

@ -86,7 +86,8 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
shader_eval_surface(kg,
shadow_sd,
state,
PATH_RAY_SHADOW);
PATH_RAY_SHADOW,
0);
path_state_modify_bounce(state, false);
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
}

View File

@ -80,7 +80,7 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const Shad
{
sd->flag &= ~SD_CLOSURE_FLAGS;
sd->num_closure = 0;
sd->num_closure_extra = 0;
sd->num_closure_left = MAX_CLOSURE;
if(hit) {
Bssrdf *bssrdf = (Bssrdf *)sc;
@ -154,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
if(bump || texture_blur > 0.0f) {
/* average color and normal at incoming point */
shader_eval_surface(kg, sd, state, state_flag);
shader_eval_surface(kg, sd, state, state_flag, MAX_CLOSURE);
float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
/* we simply divide out the average color and multiply with the average

View File

@ -984,7 +984,7 @@ typedef ccl_addr_space struct ShaderData {
/* Closure data, we store a fixed array of closures */
int num_closure;
int num_closure_extra;
int num_closure_left;
float randb_closure;
float3 svm_closure_weight;
@ -997,6 +997,11 @@ typedef ccl_addr_space struct ShaderData {
struct ShaderClosure closure[MAX_CLOSURE];
} ShaderData;
/* Reduced-size backing storage for a ShaderData whose closure array is never
 * used: it is sized as ShaderData minus the trailing closure[MAX_CLOSURE]
 * member. Code that accesses it through AS_SHADER_DATA() must not read or
 * write sd->closure[], since that memory is not part of this storage.
 * NOTE(review): the storage is declared as char, so it has no alignment
 * requirement of its own -- confirm it is suitably aligned for ShaderData
 * wherever it is instantiated. */
typedef ccl_addr_space struct ShaderDataTinyStorage {
	char pad[sizeof(ShaderData) - sizeof(ShaderClosure) * MAX_CLOSURE];
} ShaderDataTinyStorage;

/* Reinterpret a pointer to ShaderDataTinyStorage as a pointer to ShaderData.
 * The argument is parenthesized so the cast applies to the whole expression
 * that is passed in, not just its first operand. */
#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData*)(shader_data_tiny_storage))
/* Path State */
#ifdef __VOLUME__

View File

@ -43,7 +43,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
float3 *extinction)
{
sd->P = P;
shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, 0);
if(sd->flag & SD_EXTINCTION) {
*extinction = sd->closure_transparent_extinction;
@ -62,7 +62,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
VolumeShaderCoefficients *coeff)
{
sd->P = P;
shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, MAX_CLOSURE);
if(!(sd->flag & (SD_EXTINCTION|SD_SCATTER|SD_EMISSION)))
return false;

View File

@ -122,7 +122,12 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
*/
*throughput = make_float3(1.0f, 1.0f, 1.0f);
path_radiance_init(L, kernel_data.film.use_light_pass);
path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray);
path_state_init(kg,
AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
state,
rng_hash,
sample,
ray);
#ifdef __SUBSURFACE__
kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
#endif

View File

@ -98,7 +98,16 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
BsdfEval L_light;
bool is_lamp;
if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
if(direct_emission(kg,
sd,
AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
&ls,
state,
&light_ray,
&L_light,
&is_lamp,
terminate))
{
/* Write intermediate data to global memory to access from
* the next kernel.
*/

View File

@ -31,7 +31,7 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
ShaderData *sd = &kernel_split_state.sd[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
/* GPU: no decoupled ray marching, scatter probabilistically */
int num_samples = kernel_data.integrator.volume_samples;
@ -141,7 +141,7 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);

View File

@ -101,7 +101,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
throughput = kernel_split_state.throughput[ray_index];

View File

@ -64,7 +64,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
path_state_init(kg,
&kernel_split_state.sd_DL_shadow[ray_index],
AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
&kernel_split_state.path_state[ray_index],
rng_hash,
sample,

View File

@ -50,7 +50,7 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag, MAX_CLOSURE);
#ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
shader_merge_closures(&kernel_split_state.sd[ray_index]);

View File

@ -34,7 +34,7 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
}
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];

View File

@ -47,7 +47,7 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
float3 throughput = kernel_split_state.throughput[ray_index];
BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
bool is_lamp = kernel_split_state.is_lamp[ray_index];
# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)

View File

@ -111,7 +111,7 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
SPLIT_DATA_SUBSURFACE_ENTRIES \
SPLIT_DATA_VOLUME_ENTRIES \
SPLIT_DATA_BRANCHED_ENTRIES \
@ -127,7 +127,7 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
SPLIT_DATA_SUBSURFACE_ENTRIES \
SPLIT_DATA_VOLUME_ENTRIES \
SPLIT_DATA_BRANCHED_ENTRIES \

View File

@ -39,7 +39,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
ShaderData *sd = &branched_state->sd;
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
ShaderClosure *sc = &sd->closure[i];
@ -229,7 +229,7 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
if(sd->flag & SD_BSSRDF) {