Fix T40370: cycles CUDA baking timeout with high number of AA samples.

Now baking does one AA sample at a time, just like final render. There is
also some code for shader antialiasing that solves T40369 but it is disabled
for now because there may be unpredictable side effects.
This commit is contained in:
Brecht Van Lommel 2014-06-06 14:40:09 +02:00
parent 553264ff8e
commit e4e58d4612
Notes: blender-bot 2023-02-14 10:35:23 +01:00
Referenced by issue #40528, Graphics driver crash on baking
Referenced by issue #40370, Cycles baking: "CUDA error: Launch exceeded timeout in ..."
17 changed files with 249 additions and 169 deletions

View File

@ -492,26 +492,6 @@ static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const
}
}
static bool is_light_pass(ShaderEvalType type)
{
switch (type) {
case SHADER_EVAL_AO:
case SHADER_EVAL_COMBINED:
case SHADER_EVAL_SHADOW:
case SHADER_EVAL_DIFFUSE_DIRECT:
case SHADER_EVAL_GLOSSY_DIRECT:
case SHADER_EVAL_TRANSMISSION_DIRECT:
case SHADER_EVAL_SUBSURFACE_DIRECT:
case SHADER_EVAL_DIFFUSE_INDIRECT:
case SHADER_EVAL_GLOSSY_INDIRECT:
case SHADER_EVAL_TRANSMISSION_INDIRECT:
case SHADER_EVAL_SUBSURFACE_INDIRECT:
return true;
default:
return false;
}
}
void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[])
{
ShaderEvalType shader_type = get_shader_type(pass_type);
@ -529,7 +509,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
Pass::add(PASS_UV, scene->film->passes);
}
if(is_light_pass(shader_type)) {
if(BakeManager::is_light_pass(shader_type)) {
/* force use_light_pass to be true */
Pass::add(PASS_LIGHT, scene->film->passes);
}

View File

@ -393,7 +393,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
for(int sample = 0; sample < task.num_samples; sample++)
kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@ -404,7 +405,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
for(int sample = 0; sample < task.num_samples; sample++)
kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@ -415,7 +417,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
for(int sample = 0; sample < task.num_samples; sample++)
kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@ -426,7 +429,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
for(int sample = 0; sample < task.num_samples; sample++)
kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@ -436,7 +440,8 @@ public:
#endif
{
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
for(int sample = 0; sample < task.num_samples; sample++)
kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;

View File

@ -764,40 +764,45 @@ public:
int shader_w = min(shader_chunk_size, end - shader_x);
/* pass in parameters */
int offset = 0;
for(int sample = 0; sample < task.num_samples; sample++) {
/* pass in parameters */
int offset = 0;
cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
offset += sizeof(d_input);
cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
offset += sizeof(d_input);
cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
offset += sizeof(d_output);
cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
offset += sizeof(d_output);
int shader_eval_type = task.shader_eval_type;
offset = align_up(offset, __alignof(shader_eval_type));
int shader_eval_type = task.shader_eval_type;
offset = align_up(offset, __alignof(shader_eval_type));
cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
offset += sizeof(task.shader_eval_type);
cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
offset += sizeof(task.shader_eval_type);
cuda_assert(cuParamSeti(cuShader, offset, shader_x));
offset += sizeof(shader_x);
cuda_assert(cuParamSeti(cuShader, offset, shader_x));
offset += sizeof(shader_x);
cuda_assert(cuParamSeti(cuShader, offset, shader_w));
offset += sizeof(shader_w);
cuda_assert(cuParamSeti(cuShader, offset, shader_w));
offset += sizeof(shader_w);
cuda_assert(cuParamSetSize(cuShader, offset));
cuda_assert(cuParamSeti(cuShader, offset, sample));
offset += sizeof(sample);
/* launch kernel */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
cuda_assert(cuParamSetSize(cuShader, offset));
int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
/* launch kernel */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
cuda_assert(cuCtxSynchronize());
cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
cuda_assert(cuCtxSynchronize());
}
}
cuda_pop_context();

View File

@ -1067,19 +1067,24 @@ public:
else
kernel = ckShaderKernel;
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
for(int sample = 0; sample < task.num_samples; sample++) {
cl_int d_sample = task.sample;
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(kernel, &narg, #name);
set_kernel_arg_mem(kernel, &narg, #name);
#include "kernel_textures.h"
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
enqueue_kernel(kernel, task.shader_w, 1);
enqueue_kernel(kernel, task.shader_w, 1);
}
}
void thread_run(DeviceTask *task)

View File

@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
ccl_global type *name,
#include "kernel_textures.h"
int type, int sx, int sw)
int type, int sx, int sw, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@ -128,7 +128,7 @@ __kernel void kernel_ocl_shader(
int x = sx + get_global_id(0);
if(x < sx + sw)
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
}
__kernel void kernel_ocl_bake(
@ -140,7 +140,7 @@ __kernel void kernel_ocl_bake(
ccl_global type *name,
#include "kernel_textures.h"
int type, int sx, int sw)
int type, int sx, int sw, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@ -153,6 +153,6 @@ __kernel void kernel_ocl_bake(
int x = sx + get_global_id(0);
if(x < sx + sw)
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x);
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
}

View File

@ -120,12 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
/* Shader Evaluation */
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
{
if(type >= SHADER_EVAL_BAKE)
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
else
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END

View File

@ -146,22 +146,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw)
kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw)
kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw)
kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x);
kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
}
#endif

View File

@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
int type, int i, int sample);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
int type, int i, int sample);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
int type, int i, int sample);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
int type, int i, int sample);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@ -84,7 +84,7 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
int type, int i, int sample);
#endif
CCL_NAMESPACE_END

View File

@ -67,12 +67,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float
/* Shader Evaluate */
void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
{
if(type >= SHADER_EVAL_BAKE)
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
else
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END

View File

@ -17,108 +17,114 @@
CCL_NAMESPACE_BEGIN
ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
const bool is_combined, const bool is_ao, const bool is_sss)
const bool is_combined, const bool is_ao, const bool is_sss, int sample)
{
int samples = kernel_data.integrator.aa_samples;
/* initialize master radiance accumulator */
kernel_assert(kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* take multiple samples */
for(int sample = 0; sample < samples; sample++) {
PathRadiance L_sample;
PathState state;
Ray ray;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
bool is_sss_sample = is_sss;
PathRadiance L_sample;
PathState state;
Ray ray;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
bool is_sss_sample = is_sss;
/* init radiance */
path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
/* init radiance */
path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
/* init path state */
path_state_init(kg, &state, &rng, sample);
state.num_samples = samples;
/* init path state */
path_state_init(kg, &state, &rng, sample);
state.num_samples = kernel_data.integrator.aa_samples;
/* evaluate surface shader */
float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
/* evaluate surface shader */
float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
/* TODO, disable the closures we won't need */
/* TODO, disable the closures we won't need */
#ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched) {
/* regular path tracer */
if(!kernel_data.integrator.branched) {
/* regular path tracer */
#endif
/* sample ambient occlusion */
if(is_combined || is_ao) {
kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
}
/* sample ambient occlusion */
if(is_combined || is_ao) {
kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
}
#ifdef __SUBSURFACE__
/* sample subsurface scattering */
if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
is_sss_sample = true;
}
/* sample subsurface scattering */
if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
is_sss_sample = true;
}
#endif
/* sample light and BSDF */
if((!is_sss_sample) && (!is_ao)) {
/* sample light and BSDF */
if((!is_sss_sample) && (!is_ao)) {
if(sd->flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
}
if(sd->flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
}
if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
#ifdef __LAMP_MIS__
state.ray_t = 0.0f;
state.ray_t = 0.0f;
#endif
/* compute indirect light */
kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample);
/* compute indirect light */
kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample);
/* sum and reset indirect light pass variables for the next samples */
path_radiance_sum_indirect(&L_sample);
path_radiance_reset_indirect(&L_sample);
}
/* sum and reset indirect light pass variables for the next samples */
path_radiance_sum_indirect(&L_sample);
path_radiance_reset_indirect(&L_sample);
}
#ifdef __BRANCHED_PATH__
}
else {
/* branched path tracer */
#ifdef __BRANCHED_PATH__
}
else {
/* branched path tracer */
/* sample ambient occlusion */
if(is_combined || is_ao) {
kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
}
/* sample ambient occlusion */
if(is_combined || is_ao) {
kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
}
#ifdef __SUBSURFACE__
/* sample subsurface scattering */
if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, throughput);
}
#endif
/* sample light and BSDF */
if((!is_sss_sample) && (!is_ao)) {
if(sd->flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
}
kernel_branched_path_integrate_lighting(kg, &rng,
sd, throughput, 1.0f, &state, &L_sample);
}
/* sample subsurface scattering */
if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, throughput);
}
#endif
/* accumulate into master L */
path_radiance_accum_sample(L, &L_sample, samples);
/* sample light and BSDF */
if((!is_sss_sample) && (!is_ao)) {
if(sd->flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
}
kernel_branched_path_integrate_lighting(kg, &rng,
sd, throughput, 1.0f, &state, &L_sample);
}
}
#endif
/* accumulate into master L */
path_radiance_accum_sample(L, &L_sample, 1);
}
ccl_device bool is_aa_pass(ShaderEvalType type)
{
switch(type) {
case SHADER_EVAL_UV:
case SHADER_EVAL_NORMAL:
return false;
default:
return true;
}
}
@ -142,7 +148,20 @@ ccl_device bool is_light_pass(ShaderEvalType type)
}
}
ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
#if 0
ccl_device_inline float bake_clamp_mirror_repeat(float u)
{
/* use mirror repeat (like opengl texture) so that if the barycentric
* coordinate goes past the end of the triangle it is not always clamped
* to the same value, gives ugly patterns */
float fu = floorf(u);
u = u - fu;
return (((int)fu) & 1)? 1.0f - u: u;
}
#endif
ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
{
ShaderData sd;
uint4 in = input[i * 2];
@ -164,6 +183,24 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
float dvdx = __uint_as_float(diff.z);
float dvdy = __uint_as_float(diff.w);
int num_samples = kernel_data.integrator.aa_samples;
/* random number generator */
RNG rng = cmj_hash(i, 0);
#if 0
uint rng_state = cmj_hash(i, 0);
float filter_x, filter_y;
path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
/* subpixel u/v offset */
if(sample > 0) {
u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f));
v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f));
}
#endif
/* triangle */
int shader;
float3 P, Ng;
@ -190,12 +227,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
sd.dv.dx = dvdx;
sd.dv.dy = dvdy;
/* light passes */
if(is_light_pass(type)) {
RNG rng = cmj_hash(i, 0);
compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED),
(type == SHADER_EVAL_AO),
(type == SHADER_EVAL_SUBSURFACE_DIRECT ||
type == SHADER_EVAL_SUBSURFACE_INDIRECT));
compute_light_pass(kg, &sd, &L, rng,
(type == SHADER_EVAL_COMBINED),
(type == SHADER_EVAL_AO),
(type == SHADER_EVAL_SUBSURFACE_DIRECT ||
type == SHADER_EVAL_SUBSURFACE_INDIRECT),
sample);
}
switch (type) {
@ -350,11 +389,15 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
/* write output */
output[i] = make_float4(out.x, out.y, out.z, 1.0f);
return;
float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
if(sample == 0)
output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
else
output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
}
ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
{
ShaderData sd;
uint4 in = input[i];
@ -401,7 +444,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
}
/* write output */
output[i] = make_float4(out.x, out.y, out.z, 0.0f);
if(sample == 0)
output[i] = make_float4(out.x, out.y, out.z, 0.0f);
else
output[i] += make_float4(out.x, out.y, out.z, 0.0f);
}
CCL_NAMESPACE_END

View File

@ -64,12 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
/* Shader Evaluate */
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
{
if(type >= SHADER_EVAL_BAKE)
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
else
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END

View File

@ -66,12 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
/* Shader Evaluate */
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
{
if(type >= SHADER_EVAL_BAKE)
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
else
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END

View File

@ -67,12 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo
/* Shader Evaluate */
void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
{
if(type >= SHADER_EVAL_BAKE)
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
else
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "bake.h"
#include "integrator.h"
CCL_NAMESPACE_BEGIN
@ -152,6 +153,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
task.shader_eval_type = shader_type;
task.shader_x = 0;
task.shader_w = d_output.size();
task.num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples: 1;
task.get_cancel = function_bind(&Progress::get_cancel, &progress);
device->task_add(task);
@ -203,4 +205,35 @@ void BakeManager::device_free(Device *device, DeviceScene *dscene)
{
}
bool BakeManager::is_aa_pass(ShaderEvalType type)
{
switch(type) {
case SHADER_EVAL_UV:
case SHADER_EVAL_NORMAL:
return false;
default:
return true;
}
}
bool BakeManager::is_light_pass(ShaderEvalType type)
{
switch(type) {
case SHADER_EVAL_AO:
case SHADER_EVAL_COMBINED:
case SHADER_EVAL_SHADOW:
case SHADER_EVAL_DIFFUSE_DIRECT:
case SHADER_EVAL_GLOSSY_DIRECT:
case SHADER_EVAL_TRANSMISSION_DIRECT:
case SHADER_EVAL_SUBSURFACE_DIRECT:
case SHADER_EVAL_DIFFUSE_INDIRECT:
case SHADER_EVAL_GLOSSY_INDIRECT:
case SHADER_EVAL_TRANSMISSION_INDIRECT:
case SHADER_EVAL_SUBSURFACE_INDIRECT:
return true;
default:
return false;
}
}
CCL_NAMESPACE_END

View File

@ -17,10 +17,11 @@
#ifndef __BAKE_H__
#define __BAKE_H__
#include "util_vector.h"
#include "device.h"
#include "scene.h"
#include "session.h"
#include "util_progress.h"
#include "util_vector.h"
CCL_NAMESPACE_BEGIN
@ -64,6 +65,9 @@ public:
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
void device_free(Device *device, DeviceScene *dscene);
static bool is_light_pass(ShaderEvalType type);
static bool is_aa_pass(ShaderEvalType type);
bool need_update;
private:

View File

@ -66,6 +66,7 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
main_task.shader_x = 0;
main_task.shader_w = width*height;
main_task.num_samples = 1;
main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
/* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */

View File

@ -119,6 +119,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
task.shader_eval_type = SHADER_EVAL_DISPLACE;
task.shader_x = 0;
task.shader_w = d_output.size();
task.num_samples = 1;
task.get_cancel = function_bind(&Progress::get_cancel, &progress);
device->task_add(task);