Fix T40370: cycles CUDA baking timeout with high number of AA samples.

Now baking does one AA sample at a time, just like final render. There is also some code for shader antialiasing that solves T40369 but it is disabled for now because there may be unpredictable side effects.
Referenced by issue #40528, Graphics driver crash on baking Referenced by issue #40370, Cycles baking: "CUDA error: Launch exceeded timeout in ..."
2014-06-06 14:40:09 +02:00 · 2014-06-06 14:40:09 +02:00 · e4e58d4612 · 2023-02-14 10:35:23 +01:00
parent 553264ff8e
commit e4e58d4612
17 changed files with 249 additions and 169 deletions
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@ -492,26 +492,6 @@ static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const
 	}
 }

-static bool is_light_pass(ShaderEvalType type)
-{
-	switch (type) {
-		case SHADER_EVAL_AO:
-		case SHADER_EVAL_COMBINED:
-		case SHADER_EVAL_SHADOW:
-		case SHADER_EVAL_DIFFUSE_DIRECT:
-		case SHADER_EVAL_GLOSSY_DIRECT:
-		case SHADER_EVAL_TRANSMISSION_DIRECT:
-		case SHADER_EVAL_SUBSURFACE_DIRECT:
-		case SHADER_EVAL_DIFFUSE_INDIRECT:
-		case SHADER_EVAL_GLOSSY_INDIRECT:
-		case SHADER_EVAL_TRANSMISSION_INDIRECT:
-		case SHADER_EVAL_SUBSURFACE_INDIRECT:
-			return true;
-		default:
-			return false;
-	}
-}
-
 void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[])
 {
 	ShaderEvalType shader_type = get_shader_type(pass_type);
@ -529,7 +509,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
 		Pass::add(PASS_UV, scene->film->passes);
 	}

-	if(is_light_pass(shader_type)) {
+	if(BakeManager::is_light_pass(shader_type)) {
 		/* force use_light_pass to be true */
 		Pass::add(PASS_LIGHT, scene->film->passes);
 	}
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@ -393,7 +393,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 		if(system_cpu_support_avx()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);

 				if(task.get_cancel() || task_pool.canceled())
 					break;
@ -404,7 +405,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 		if(system_cpu_support_sse41()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);

 				if(task.get_cancel() || task_pool.canceled())
 					break;
@ -415,7 +417,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);

 				if(task.get_cancel() || task_pool.canceled())
 					break;
@ -426,7 +429,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 		if(system_cpu_support_sse2()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);

 				if(task.get_cancel() || task_pool.canceled())
 					break;
@ -436,7 +440,8 @@ public:
 #endif
 		{
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);

 				if(task.get_cancel() || task_pool.canceled())
 					break;
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@ -764,40 +764,45 @@ public:

 			int shader_w = min(shader_chunk_size, end - shader_x);

-			/* pass in parameters */
-			int offset = 0;
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				/* pass in parameters */
+				int offset = 0;

-			cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
-			offset += sizeof(d_input);
+				cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
+				offset += sizeof(d_input);

-			cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
-			offset += sizeof(d_output);
+				cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
+				offset += sizeof(d_output);

-			int shader_eval_type = task.shader_eval_type;
-			offset = align_up(offset, __alignof(shader_eval_type));
+				int shader_eval_type = task.shader_eval_type;
+				offset = align_up(offset, __alignof(shader_eval_type));

-			cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
-			offset += sizeof(task.shader_eval_type);
+				cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
+				offset += sizeof(task.shader_eval_type);

-			cuda_assert(cuParamSeti(cuShader, offset, shader_x));
-			offset += sizeof(shader_x);
+				cuda_assert(cuParamSeti(cuShader, offset, shader_x));
+				offset += sizeof(shader_x);

-			cuda_assert(cuParamSeti(cuShader, offset, shader_w));
-			offset += sizeof(shader_w);
+				cuda_assert(cuParamSeti(cuShader, offset, shader_w));
+				offset += sizeof(shader_w);

-			cuda_assert(cuParamSetSize(cuShader, offset));
+				cuda_assert(cuParamSeti(cuShader, offset, sample));
+				offset += sizeof(sample);

-			/* launch kernel */
-			int threads_per_block;
-			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+				cuda_assert(cuParamSetSize(cuShader, offset));

-			int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
+				/* launch kernel */
+				int threads_per_block;
+				cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));

-			cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
-			cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
-			cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+				int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;

-			cuda_assert(cuCtxSynchronize());
+				cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+				cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
+				cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+
+				cuda_assert(cuCtxSynchronize());
+			}
 		}

 		cuda_pop_context();
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@ -1067,19 +1067,24 @@ public:
 		else
 			kernel = ckShaderKernel;

-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+		for(int sample = 0; sample < task.num_samples; sample++) {
+			cl_int d_sample = task.sample;
+
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));

 #define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(kernel, &narg, #name);
+		set_kernel_arg_mem(kernel, &narg, #name);
 #include "kernel_textures.h"

-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));

-		enqueue_kernel(kernel, task.shader_w, 1);
+			enqueue_kernel(kernel, task.shader_w, 1);
+		}
 	}

 	void thread_run(DeviceTask *task)
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global type *name,
 #include "kernel_textures.h"

-	int type, int sx, int sw)
+	int type, int sx, int sw, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;

@ -128,7 +128,7 @@ __kernel void kernel_ocl_shader(
 	int x = sx + get_global_id(0);

 	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }

 __kernel void kernel_ocl_bake(
@ -140,7 +140,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global type *name,
 #include "kernel_textures.h"

-	int type, int sx, int sw)
+	int type, int sx, int sw, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;

@ -153,6 +153,6 @@ __kernel void kernel_ocl_bake(
 	int x = sx + get_global_id(0);

 	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }

--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@ -120,12 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu

 /* Shader Evaluation */

-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@ -146,22 +146,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal

 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

 	if(x < sx + sw)
-		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
+		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
 }

 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

 	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x);
+		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
 }

 #endif
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);

 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif

 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif

 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
 void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif

 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@ -84,7 +84,7 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
 void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernel_avx.cpp
@ -67,12 +67,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float

 /* Shader Evaluate */

-void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@ -17,108 +17,114 @@
 CCL_NAMESPACE_BEGIN

 ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
-                                   const bool is_combined, const bool is_ao, const bool is_sss)
+                                   const bool is_combined, const bool is_ao, const bool is_sss, int sample)
 {
-	int samples = kernel_data.integrator.aa_samples;
-
 	/* initialize master radiance accumulator */
 	kernel_assert(kernel_data.film.use_light_pass);
 	path_radiance_init(L, kernel_data.film.use_light_pass);

-	/* take multiple samples */
-	for(int sample = 0; sample < samples; sample++) {
-		PathRadiance L_sample;
-		PathState state;
-		Ray ray;
-		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-		bool is_sss_sample = is_sss;
+	PathRadiance L_sample;
+	PathState state;
+	Ray ray;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	bool is_sss_sample = is_sss;

-		/* init radiance */
-		path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
+	/* init radiance */
+	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);

-		/* init path state */
-		path_state_init(kg, &state, &rng, sample);
-		state.num_samples = samples;
+	/* init path state */
+	path_state_init(kg, &state, &rng, sample);
+	state.num_samples = kernel_data.integrator.aa_samples;

-		/* evaluate surface shader */
-		float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+	/* evaluate surface shader */
+	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
+	shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);

-		/* TODO, disable the closures we won't need */
+	/* TODO, disable the closures we won't need */

 #ifdef __BRANCHED_PATH__
-		if(!kernel_data.integrator.branched) {
-			/* regular path tracer */
+	if(!kernel_data.integrator.branched) {
+		/* regular path tracer */
 #endif

-			/* sample ambient occlusion */
-			if(is_combined || is_ao) {
-				kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
-			}
+		/* sample ambient occlusion */
+		if(is_combined || is_ao) {
+			kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+		}

 #ifdef __SUBSURFACE__
-			/* sample subsurface scattering */
-			if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
-				/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-				if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
-					is_sss_sample = true;
-			}
+		/* sample subsurface scattering */
+		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
+			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+				is_sss_sample = true;
+		}
 #endif

-			/* sample light and BSDF */
-			if((!is_sss_sample) && (!is_ao)) {
+		/* sample light and BSDF */
+		if((!is_sss_sample) && (!is_ao)) {

-				if(sd->flag & SD_EMISSION) {
-					float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-					path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
-				}
+			if(sd->flag & SD_EMISSION) {
+				float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+				path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			}

-				if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+			if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
 #ifdef __LAMP_MIS__
-					state.ray_t = 0.0f;
+				state.ray_t = 0.0f;
 #endif
-					/* compute indirect light */
-					kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample);
+				/* compute indirect light */
+				kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample);

-					/* sum and reset indirect light pass variables for the next samples */
-					path_radiance_sum_indirect(&L_sample);
-					path_radiance_reset_indirect(&L_sample);
-				}
+				/* sum and reset indirect light pass variables for the next samples */
+				path_radiance_sum_indirect(&L_sample);
+				path_radiance_reset_indirect(&L_sample);
 			}
-#ifdef __BRANCHED_PATH__
 		}
-		else {
-			/* branched path tracer */
+#ifdef __BRANCHED_PATH__
+	}
+	else {
+		/* branched path tracer */

-			/* sample ambient occlusion */
-			if(is_combined || is_ao) {
-				kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
-			}
+		/* sample ambient occlusion */
+		if(is_combined || is_ao) {
+			kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+		}

 #ifdef __SUBSURFACE__
-			/* sample subsurface scattering */
-			if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
-				/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-				kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, throughput);
-			}
-#endif
-
-			/* sample light and BSDF */
-			if((!is_sss_sample) && (!is_ao)) {
-
-				if(sd->flag & SD_EMISSION) {
-					float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-					path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
-				}
-
-				kernel_branched_path_integrate_lighting(kg, &rng,
-					sd, throughput, 1.0f, &state, &L_sample);
-			}
+		/* sample subsurface scattering */
+		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
+			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+			kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, throughput);
 		}
 #endif

-		/* accumulate into master L */
-		path_radiance_accum_sample(L, &L_sample, samples);
+		/* sample light and BSDF */
+		if((!is_sss_sample) && (!is_ao)) {
+
+			if(sd->flag & SD_EMISSION) {
+				float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+				path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			}
+
+			kernel_branched_path_integrate_lighting(kg, &rng,
+				sd, throughput, 1.0f, &state, &L_sample);
+		}
+	}
+#endif
+
+	/* accumulate into master L */
+	path_radiance_accum_sample(L, &L_sample, 1);
+}
+
+ccl_device bool is_aa_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_UV:
+		case SHADER_EVAL_NORMAL:
+			return false;
+		default:
+			return true;
 	}
 }

@ -142,7 +148,20 @@ ccl_device bool is_light_pass(ShaderEvalType type)
 	}
 }

-ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+#if 0
+ccl_device_inline float bake_clamp_mirror_repeat(float u)
+{
+	/* use mirror repeat (like opengl texture) so that if the barycentric
+	 * coordinate goes past the end of the triangle it is not always clamped
+	 * to the same value, gives ugly patterns */
+	float fu = floorf(u);
+	u = u - fu;
+
+	return (((int)fu) & 1)? 1.0f - u: u;
+}
+#endif
+
+ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i * 2];
@ -164,6 +183,24 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	float dvdx = __uint_as_float(diff.z);
 	float dvdy = __uint_as_float(diff.w);

+	int num_samples = kernel_data.integrator.aa_samples;
+
+	/* random number generator */
+	RNG rng = cmj_hash(i, 0);
+
+#if 0
+	uint rng_state = cmj_hash(i, 0);
+	float filter_x, filter_y;
+	path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
+
+	/* subpixel u/v offset */
+	if(sample > 0) {
+		u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f));
+		v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f));
+	}
+#endif
+
+	/* triangle */
 	int shader;
 	float3 P, Ng;

@ -190,12 +227,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	sd.dv.dx = dvdx;
 	sd.dv.dy = dvdy;

+	/* light passes */
 	if(is_light_pass(type)) {
-		RNG rng = cmj_hash(i, 0);
-		compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED),
-		                                     (type == SHADER_EVAL_AO),
-		                                     (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
-		                                      type == SHADER_EVAL_SUBSURFACE_INDIRECT));
+		compute_light_pass(kg, &sd, &L, rng,
+		                   (type == SHADER_EVAL_COMBINED),
+		                   (type == SHADER_EVAL_AO),
+		                   (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
+		                    type == SHADER_EVAL_SUBSURFACE_INDIRECT),
+		                   sample);
 	}

 	switch (type) {
@ -350,11 +389,15 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	}

 	/* write output */
-	output[i] = make_float4(out.x, out.y, out.z, 1.0f);
-	return;
+	float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+
+	if(sample == 0)
+		output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
+	else
+		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }

-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i];
@ -401,7 +444,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 	}
 	
 	/* write output */
-	output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+	if(sample == 0)
+		output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+	else
+		output[i] += make_float4(out.x, out.y, out.z, 0.0f);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@ -64,12 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa

 /* Shader Evaluate */

-void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@ -66,12 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa

 /* Shader Evaluate */

-void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@ -67,12 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo

 /* Shader Evaluate */

-void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@ -15,6 +15,7 @@
 */

 #include "bake.h"
+#include "integrator.h"

 CCL_NAMESPACE_BEGIN

@ -152,6 +153,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 	task.shader_eval_type = shader_type;
 	task.shader_x = 0;
 	task.shader_w = d_output.size();
+	task.num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples: 1;
 	task.get_cancel = function_bind(&Progress::get_cancel, &progress);

 	device->task_add(task);
@ -203,4 +205,35 @@ void BakeManager::device_free(Device *device, DeviceScene *dscene)
 {
 }

+bool BakeManager::is_aa_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_UV:
+		case SHADER_EVAL_NORMAL:
+			return false;
+		default:
+			return true;
+	}
+}
+
+bool BakeManager::is_light_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_AO:
+		case SHADER_EVAL_COMBINED:
+		case SHADER_EVAL_SHADOW:
+		case SHADER_EVAL_DIFFUSE_DIRECT:
+		case SHADER_EVAL_GLOSSY_DIRECT:
+		case SHADER_EVAL_TRANSMISSION_DIRECT:
+		case SHADER_EVAL_SUBSURFACE_DIRECT:
+		case SHADER_EVAL_DIFFUSE_INDIRECT:
+		case SHADER_EVAL_GLOSSY_INDIRECT:
+		case SHADER_EVAL_TRANSMISSION_INDIRECT:
+		case SHADER_EVAL_SUBSURFACE_INDIRECT:
+			return true;
+		default:
+			return false;
+	}
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@ -17,10 +17,11 @@
 #ifndef __BAKE_H__
 #define __BAKE_H__

-#include "util_vector.h"
 #include "device.h"
 #include "scene.h"
-#include "session.h"
+
+#include "util_progress.h"
+#include "util_vector.h"

 CCL_NAMESPACE_BEGIN

@ -64,6 +65,9 @@ public:
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_free(Device *device, DeviceScene *dscene);

+	static bool is_light_pass(ShaderEvalType type);
+	static bool is_aa_pass(ShaderEvalType type);
+
 	bool need_update;

 private:
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@ -66,6 +66,7 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
 	main_task.shader_x = 0;
 	main_task.shader_w = width*height;
+	main_task.num_samples = 1;
 	main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);

 	/* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@ -119,6 +119,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	task.shader_eval_type = SHADER_EVAL_DISPLACE;
 	task.shader_x = 0;
 	task.shader_w = d_output.size();
+	task.num_samples = 1;
 	task.get_cancel = function_bind(&Progress::get_cancel, &progress);

 	device->task_add(task);