Cycles: OpenCL kernel split

This commit contains all the work related to the AMD megakernel split, which was mainly done by Varun Sundar, George Kyriazis and Lenny Wang, plus some help from Sergey Sharybin, Martijn Berger, Thomas Dinges and likely someone else whom we're forgetting to mention. Currently only AMD cards are enabled for the new split kernel, but it is possible to force the split OpenCL kernel to be used by setting the following environment variable: CYCLES_OPENCL_SPLIT_KERNEL_TEST=1. Not all features are supported yet: there is no motion blur, camera blur, SSS or volumetrics for now. Also, transparent shadows are disabled on AMD devices because of a compiler bug. This kernel also only implements regular path tracing; supporting the branched one will take a bit. Branched path tracing is still exposed in the interface, which is a bit misleading, and it will be hidden there soon. More features will be enabled once they're ported to the split kernel and tested. Neither regular CPU nor CUDA is affected; they generate exactly the same code, which means no regressions/improvements there. Based on the research paper: https://research.nvidia.com/sites/default/files/publications/laine2013hpg_paper.pdf Here's the documentation: https://docs.google.com/document/d/1LuXW-CV-sVJkQaEGZlMJ86jZ8FmoPfecaMdR-oiWbUY/edit Design discussion of the patch: https://developer.blender.org/T44197 Differential Revision: https://developer.blender.org/D1200
Referenced by issue #73489, wrong assumptions about running OpenCL kernels on GPU device
2015-05-09 19:34:30 +05:00 · 2015-05-09 19:34:30 +05:00 · 7f4479da42 · 2023-02-13 23:36:22 +01:00
parent f680c1b54a
commit 7f4479da42
57 changed files with 5826 additions and 870 deletions
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@ -55,6 +55,7 @@ public:
 	bool advanced_shading;
 	bool pack_images;
 	bool extended_images; /* flag for GPU and Multi device */
+	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;

 	DeviceInfo()
@ -66,6 +67,7 @@ public:
 		advanced_shading = true;
 		pack_images = false;
 		extended_images = false;
+		use_split_kernel = false;
 	}
 };

--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -14,6 +14,17 @@ set(INC_SYS
 set(SRC
 	kernel.cpp
 	kernel.cl
+	kernel_data_init.cl
+	kernel_queue_enqueue.cl
+	kernel_scene_intersect.cl
+	kernel_lamp_emission.cl
+	kernel_background_buffer_update.cl
+	kernel_shader_eval.cl
+	kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernel_direct_lighting.cl
+	kernel_shadow_blocked.cl
+	kernel_next_iteration_setup.cl
+	kernel_sum_all_radiance.cl
 	kernel.cu
 )

@ -36,17 +47,22 @@ set(SRC_HEADERS
 	kernel_montecarlo.h
 	kernel_passes.h
 	kernel_path.h
+	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
 	kernel_path_volume.h
 	kernel_projection.h
+	kernel_queues.h
 	kernel_random.h
 	kernel_shader.h
+	kernel_shaderdata_vars.h
 	kernel_shadow.h
+	kernel_split.h
 	kernel_subsurface.h
 	kernel_textures.h
 	kernel_types.h
 	kernel_volume.h
+	kernel_work_stealing.h
 )

 set(SRC_CLOSURE_HEADERS
@ -68,6 +84,7 @@ set(SRC_CLOSURE_HEADERS
 	closure/emissive.h
 	closure/volume.h
 )
+
 set(SRC_SVM_HEADERS
 	svm/svm.h
 	svm/svm_attribute.h
@ -284,6 +301,17 @@ endif()
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)

 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@ -47,79 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;*/
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@ -139,67 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
 #endif

-	if(dot(sd->Ng, omega_in) >= 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;*/
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@ -211,57 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@ -29,13 +29,13 @@ CCL_NAMESPACE_BEGIN

 ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
 {
-	if(sd->object == PRIM_NONE)
+	if(ccl_fetch(sd, object) == PRIM_NONE)
 		return (int)ATTR_STD_NOT_FOUND;

 	/* for SVM, find attribute by unique id */
-	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-	attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+	attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@ -49,7 +49,7 @@ ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, ui

 	*elem = (AttributeElement)attr_map.y;
 	
-	if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+	if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
 		return ATTR_STD_NOT_FOUND;

 	/* return result */
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@ -447,6 +447,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }

+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@ -460,6 +461,7 @@ ccl_device int intersections_compare(const void *a, const void *b)
 	else
 		return 0;
 }
+#endif

 CCL_NAMESPACE_END

--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@ -236,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
 {
 	/* get shader */
-	sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));

 	/* get motion info */
 	int numsteps, numverts;
-	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);

 	/* figure out which steps we need to fetch and their interpolation factor */
 	int maxstep = numsteps*2;
-	int step = min((int)(sd->time*maxstep), maxstep-1);
-	float t = sd->time*maxstep - step;
+	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+	float t = ccl_fetch(sd, time)*maxstep - step;

 	/* find attribute */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);

 	/* fetch vertex coordinates */
 	float3 verts[3], next_verts[3];
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)));

 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
@ -268,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #ifdef __SUBSURFACE__
 	if(!subsurface)
 #endif
-		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
 #ifdef __SUBSURFACE__
 	else
-		sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
 #endif

 	/* compute face normal */
 	float3 Ng;
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	else
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));

-	sd->Ng = Ng;
-	sd->N = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, N) = Ng;

 	/* compute derivatives of P w.r.t. uv */
 #ifdef __DPDU__
-	sd->dPdu = (verts[0] - verts[2]);
-	sd->dPdv = (verts[1] - verts[2]);
+	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
 #endif

 	/* compute smooth normal */
-	if(sd->shader & SHADER_SMOOTH_NORMAL) {
+	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 		/* find attribute */
 		AttributeElement elem;
-		int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);

 		/* fetch vertex coordinates */
@ -308,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];

 		/* interpolate between vertices */
-		float u = sd->u;
-		float v = sd->v;
+		float u = ccl_fetch(sd, u);
+		float v = ccl_fetch(sd, v);
 		float w = 1.0f - u - v;
-		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }

--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_tfm, *P);
+	*P = transform_point(&ccl_fetch(sd, ob_tfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_itfm, *P);
+	*P = transform_point(&ccl_fetch(sd, ob_itfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
+	*N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_tfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
+	*N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_itfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_tfm, *D);
+	*D = transform_direction(&ccl_fetch(sd, ob_tfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_itfm, *D);
+	*D = transform_direction(&ccl_fetch(sd, ob_itfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha

 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(sd->object == OBJECT_NONE)
+	if(ccl_fetch(sd, object) == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);

 #ifdef __OBJECT_MOTION__
-	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu

 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
 }

 /* Particle data from which object was instanced */
@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)

 /* Transform ray into object space to enter static object in BVH */

-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);

@ -425,7 +425,7 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,

 /* Transorm ray to exit static object in BVH */

-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	if(*t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@ -520,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj

 #endif

+/* TODO(sergey): This is only for until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with
+ * generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
+                                                      const ShaderData *sd,
+                                                      ccl_addr_space float3 *D)
+{
+	float3 private_D = *D;  /* copy into a local so the plain-pointer helper can be called */
+	object_dir_transform(kg, sd, &private_D);
+	*D = private_D;  /* store the transformed direction back through the qualified pointer */
+}
+
+ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
+                                                         const ShaderData *sd,
+                                                         ccl_addr_space float3 *N)
+{
+	float3 private_N = *N;  /* copy into a local so the plain-pointer helper can be called */
+	object_normal_transform(kg, sd, &private_N);  /* must be the normal transform, not object_dir_transform */
+	*N = private_N;  /* store the transformed normal back through the qualified pointer */
+}
+#endif
+
+#ifndef __KERNEL_OPENCL__
+#  define object_dir_transform_auto object_dir_transform
+#  define object_normal_transform_auto object_normal_transform
+#else
+#  define object_dir_transform_auto object_dir_transform_addrspace
+#  define object_normal_transform_auto object_normal_transform_addrspace
+#endif
+
 CCL_NAMESPACE_END

--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN

 ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *

 ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE)
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(sd->N, normalize(cross(data, sd->N)));
+		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@ -144,16 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	float3 center;

 #ifdef __HAIR__
-	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
+	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
 	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);

-		if(!(sd->flag & SD_TRANSFORM_APPLIED))
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
 			object_position_transform(kg, sd, &center);
 	}
 	else
 #endif
-		center = sd->P;
+		center = ccl_fetch(sd, P);

 	float3 motion_pre = center, motion_post = center;

@ -164,16 +164,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	if(offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
+		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);

 		/* lookup attributes */
-		int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+		int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;

 		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
 		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);

 #ifdef __HAIR__
-		if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+		if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
 			object_position_transform(kg, sd, &motion_pre);
 			object_position_transform(kg, sd, &motion_post);
 		}
@ -184,10 +184,10 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);

-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);

 	float3 motion_center;
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@ -27,14 +27,14 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));

 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
 	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
 	
 	/* return normal */
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		return normalize(cross(v2 - v0, v1 - v0));
 	else
 		return normalize(cross(v1 - v0, v2 - v0));
@ -94,7 +94,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo

 /* Ray differentials on triangle */

-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv)
 {
 	/* fetch triangle vertex coordinates */
 	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@ -116,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;

-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));

 		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
 		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
 		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));

 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif

-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);

 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif

-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@ -159,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);

-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));

 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));

 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif

-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float3 f0, f1, f2;

 		if(elem == ATTR_ELEMENT_CORNER) {
@ -191,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}

 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif

-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@ -25,6 +25,8 @@
 #include "kernel_path.h"
 #include "kernel_bake.h"

+#ifdef __COMPILE_ONLY_MEGAKERNEL__
+
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
 	ccl_global float *buffer,
@ -52,6 +54,58 @@ __kernel void kernel_ocl_path_trace(
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }

+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+/* OpenCL entry point for shader evaluation.
+ *
+ * Each work-item handles the element at index sx + get_global_id(0) and
+ * forwards it to kernel_shader_evaluate(); work-items at or beyond sx + sw
+ * do nothing. The `offset` argument is accepted but not used by this kernel.
+ */
+__kernel void kernel_ocl_shader(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+/* Each KERNEL_TEX() entry in kernel_textures.h becomes one ccl_global pointer argument. */
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+/* Second expansion of the same list: store every texture argument in the local KernelGlobals. */
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+
+	/* Guard against work-items past the end of the requested range. */
+	if(x < sx + sw)
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+}
+
+/* OpenCL entry point for baking.
+ *
+ * Each work-item handles the element at index sx + get_global_id(0) and
+ * forwards it, together with `offset` and `sample`, to kernel_bake_evaluate();
+ * work-items at or beyond sx + sw do nothing.
+ */
+__kernel void kernel_ocl_bake(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+/* Each KERNEL_TEX() entry in kernel_textures.h becomes one ccl_global pointer argument. */
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+/* Second expansion of the same list: store every texture argument in the local KernelGlobals. */
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+
+	/* Guard against work-items past the end of the requested range. */
+	if(x < sx + sw)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+}
+
 __kernel void kernel_ocl_convert_to_byte(
 	ccl_constant KernelData *data,
 	ccl_global uchar4 *rgba,
@ -106,53 +160,4 @@ __kernel void kernel_ocl_convert_to_half_float(
 		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }

-__kernel void kernel_ocl_shader(
-	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	int type, int sx, int sw, int offset, int sample)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
-}
-
-__kernel void kernel_ocl_bake(
-	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	int type, int sx, int sw, int offset, int sample)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-
-	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
-}
-
+#endif // __COMPILE_ONLY_MEGAKERNEL__
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 #endif
 }

-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
 	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
--- a/intern/cycles/kernel/kernel_background_buffer_update.cl
+++ b/intern/cycles/kernel/kernel_background_buffer_update.cl
@ -0,0 +1,282 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_background_buffer_update kernel.
+ * This is the fourth kernel in the ray tracing logic, and the third
+ * of the path iteration kernels. This kernel takes care of rays that hit
+ * the background (sceneintersect kernel), and for the rays of
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been determined
+ * to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
+ * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------|                                                     |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------|                                                     |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------|                                                     |--- ray_state
+ * PathState_coop ---------------------------------------|                                                     |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------|                                                     |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------|                                                     |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                                     |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                                     |--- work_array
+ * parallel_samples -------------------------------------|                                                     |--- PathState_coop
+ * end_sample -------------------------------------------|                                                     |--- throughput_coop
+ * kg (globals + data) ----------------------------------|                                                     |--- rng_coop
+ * rng_state --------------------------------------------|                                                     |--- Ray
+ * PathRadiance_coop ------------------------------------|                                                     |
+ * sw ---------------------------------------------------|                                                     |
+ * sh ---------------------------------------------------|                                                     |
+ * sx ---------------------------------------------------|                                                     |
+ * sy ---------------------------------------------------|                                                     |
+ * stride -----------------------------------------------|                                                     |
+ * work_array -------------------------------------------|                                                     |--- work_array
+ * queuesize --------------------------------------------|                                                     |
+ * start_sample -----------------------------------------|                                                     |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------|                                                     |
+ * num_samples ------------------------------------------|                                                     |
+ *
+ * Note on shader_data: the shader_data argument is neither an input nor an output of this kernel; it is filled and consumed within the kernel itself.
+ * Note on Queues :
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,
+	ccl_global float *per_sample_output_buffers,
+	ccl_global uint *rng_state,
+	ccl_global uint *rng_coop,                   /* Required for buffer Update */
+	ccl_global float3 *throughput_coop,          /* Required for background hit processing */
+	/* NOTE(review): unlike the other buffer arguments this pointer carries no
+	 * ccl_global qualifier - confirm that this is intentional. */
+	PathRadiance *PathRadiance_coop,  /* Required for background hit processing and buffer Update */
+	ccl_global Ray *Ray_coop,                    /* Required for background hit processing */
+	ccl_global PathState *PathState_coop,        /* Required for background hit processing */
+	ccl_global float *L_transparent_coop,        /* Required for background hit processing and buffer Update */
+	ccl_global char *ray_state,                  /* Stores information on the current state of a ray */
+	int sw, int sh, int sx, int sy, int stride,
+	int rng_state_offset_x,
+	int rng_state_offset_y,
+	int rng_state_stride,
+	ccl_global unsigned int *work_array,         /* Denotes work of each ray */
+	ccl_global int *Queue_data,                  /* Queues memory */
+	ccl_global int *Queue_index,                 /* Tracks the number of elements in each queue */
+	int queuesize,                               /* Size (capacity) of each queue */
+	int end_sample,
+	int start_sample,
+#ifdef __WORK_STEALING__
+	ccl_global unsigned int *work_pool_wgs,
+	unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	/* NOTE(review): no ccl_global qualifier here either - confirm. */
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                         /* Number of samples to be processed in parallel */
+	)
+{
+	/* Per-work-group counter handed to enqueue_ray_index_local() at the end;
+	 * reset by the first work-item before anyone uses it. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	/* Set to 1 only for rays that end up in state RAY_REGENERATED. */
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not required.
+	 * If we are executing on a CPU device, then we need to keep all threads active
+	 * since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+		DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		/* Single initialization; the previous `L = L = ...` assigned the pointer twice. */
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+		ccl_global Ray *ray = &Ray_coop[ray_index];
+		ccl_global float3 *throughput = &throughput_coop[ray_index];
+		ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+		ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work;
+		ccl_global float *initial_per_sample_output_buffers;
+		ccl_global uint *initial_rng;
+#endif
+		unsigned int sample;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		my_work = work_array[ray_index];
+		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+		get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+		my_sample_tile = 0;
+		/* Keep the unshifted base pointers so they can be re-offset when new work is fetched. */
+		initial_per_sample_output_buffers = per_sample_output_buffers;
+		initial_rng = rng_state;
+#else // __WORK_STEALING__
+		sample = work_array[ray_index];
+		int tile_index = ray_index / parallel_samples;
+		/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+			/* eval background shader if nothing hit */
+			if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+				*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			}
+
+			/* Still RAY_HIT_BACKGROUND here means the transparent shortcut above was not taken. */
+			if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+			{
+#ifdef __BACKGROUND__
+				/* sample background shader */
+				float3 L_background = indirect_background(kg, state, ray, sd);
+				path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			}
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+			float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+			kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+			kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+			float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+			/* accumulate result in output buffer */
+			kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+			path_rng_end(kg, rng_state, *rng);
+
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+			/* We have completed current work; So get next work */
+			int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+			if(!valid_work) {
+				/* If work is invalid, this means no more work is available and the thread may exit */
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+			}
+#else
+			if((sample + parallel_samples) >= end_sample) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+			}
+#endif
+			if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+				work_array[ray_index] = my_work;
+				/* Get the sample associated with the current work */
+				sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+				/* Get pixel and tile position associated with current work */
+				get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+				my_sample_tile = 0;
+
+				/* Remap rng_state according to the current work */
+				rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+				/* Remap per_sample_output_buffers according to the current work */
+				per_sample_output_buffers = initial_per_sample_output_buffers
+											+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else
+				work_array[ray_index] = sample + parallel_samples;
+				sample = work_array[ray_index];
+
+				/* Get ray position from ray index */
+				pixel_x = sx + ((ray_index / parallel_samples) % sw);
+				pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif
+
+				/* initialize random numbers and ray */
+				kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+				if(ray->t != 0.0f) {
+					/* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration */
+					*throughput = make_float3(1.0f, 1.0f, 1.0f);
+					*L_transparent = 0.0f;
+					path_radiance_init(L, kernel_data.film.use_light_pass);
+					path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+					debug_data_init(debug_data);
+#endif
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+					enqueue_flag = 1;
+				} else {
+					/* These rays do not participate in path-iteration */
+					float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					/* accumulate result in output buffer */
+					kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+					path_rng_end(kg, rng_state, *rng);
+
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+				}
+			}
+		}
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays
+	 * will be made active during the next SceneIntersect kernel
+	 */
+	enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@ -39,7 +39,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 	return bokeh;
 }

-ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@ -108,8 +108,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 }

 /* Orthographic Camera */
-
-ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@ -175,7 +174,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl

 /* Panorama Camera */

-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@ -256,7 +255,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 /* Common */

 ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-	float lens_u, float lens_v, float time, Ray *ray)
+	float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
 {
 	/* pixel filter */
 	int filter_table_offset = kernel_data.film.filter_table_offset;
@ -319,7 +318,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);

 		Transform tfm = kernel_data.cam.worldtondc;
@ -329,7 +328,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;

-		if(sd->object != OBJECT_NONE)
+		if(ccl_fetch(sd, object) != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@ -40,6 +40,8 @@
 #include "util_half.h"
 #include "util_types.h"

+#define ccl_addr_space
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
 * much slower than the double version.  This was fixed in glibc 2.16.
 */
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@ -41,6 +41,7 @@
 #define ccl_global
 #define ccl_constant
 #define ccl_may_alias
+#define ccl_addr_space

 /* No assert supported for CUDA */

--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@ -40,6 +40,12 @@
 #define ccl_local __local
 #define ccl_private __private

+#ifdef __SPLIT_KERNEL__
+#define ccl_addr_space __global
+#else
+#define ccl_addr_space
+#endif
+
 /* Selective nodes compilation. */
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
--- a/intern/cycles/kernel/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernel_data_init.cl
@ -0,0 +1,384 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_data_initialization kernel
+ * This kernel Initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_ocl_path_trace_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------|                                                 |--- Initialized throughput
+ * Un-initialized L_transparent ----|                                                 |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----|                                                 |--- Initialized PathRadiance
+ * Un-initialized Ray --------------|                                                 |--- Initialized Ray
+ * Un-initialized PathState --------|                                                 |--- Initialized PathState
+ * Un-initialized QueueData --------|                                                 |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initialized QueueIndex -------|                                                 |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---|                                                 |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------|                                                 |--- Initialized ray_state
+ * parallel_samples --------------- |                                                 |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------|                                                 |--- Initialized work_array
+ * data ----------------------------|                                                 |--- Initialized work_pool_wgs
+ * start_sample --------------------|                                                 |
+ * sx ------------------------------|                                                 |
+ * sy ------------------------------|                                                 |
+ * sw ------------------------------|                                                 |
+ * sh ------------------------------|                                                 |
+ * stride --------------------------|                                                 |
+ * queuesize -----------------------|                                                 |
+ * num_samples ---------------------|                                                 |
+ *
+ * Note on Queues :
+ * All slots in queues are initialized to queue empty slot;
+ * The number of elements in the queues is initialized to 0;
+ */
+__kernel void kernel_ocl_path_trace_data_initialization(
+	ccl_global char *globals,
+	ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+	ccl_global char *shader_data_sd_DL_shadow,     /* Arguments related to ShaderData */
+
+	ccl_global float3 *P_sd,
+	ccl_global float3 *P_sd_DL_shadow,
+
+	ccl_global float3 *N_sd,
+	ccl_global float3 *N_sd_DL_shadow,
+
+	ccl_global float3 *Ng_sd,
+	ccl_global float3 *Ng_sd_DL_shadow,
+
+	ccl_global float3 *I_sd,
+	ccl_global float3 *I_sd_DL_shadow,
+
+	ccl_global int *shader_sd,
+	ccl_global int *shader_sd_DL_shadow,
+
+	ccl_global int *flag_sd,
+	ccl_global int *flag_sd_DL_shadow,
+
+	ccl_global int *prim_sd,
+	ccl_global int *prim_sd_DL_shadow,
+
+	ccl_global int *type_sd,
+	ccl_global int *type_sd_DL_shadow,
+
+	ccl_global float *u_sd,
+	ccl_global float *u_sd_DL_shadow,
+
+	ccl_global float *v_sd,
+	ccl_global float *v_sd_DL_shadow,
+
+	ccl_global int *object_sd,
+	ccl_global int *object_sd_DL_shadow,
+
+	ccl_global float *time_sd,
+	ccl_global float *time_sd_DL_shadow,
+
+	ccl_global float *ray_length_sd,
+	ccl_global float *ray_length_sd_DL_shadow,
+
+	ccl_global int *ray_depth_sd,
+	ccl_global int *ray_depth_sd_DL_shadow,
+
+	ccl_global int *transparent_depth_sd,
+	ccl_global int *transparent_depth_sd_DL_shadow,
+	#ifdef __RAY_DIFFERENTIALS__
+	ccl_global differential3 *dP_sd,
+	ccl_global differential3 *dP_sd_DL_shadow,
+
+	ccl_global differential3 *dI_sd,
+	ccl_global differential3 *dI_sd_DL_shadow,
+
+	ccl_global differential *du_sd,
+	ccl_global differential *du_sd_DL_shadow,
+
+	ccl_global differential *dv_sd,
+	ccl_global differential *dv_sd_DL_shadow,
+	#endif
+	#ifdef __DPDU__
+	ccl_global float3 *dPdu_sd,
+	ccl_global float3 *dPdu_sd_DL_shadow,
+
+	ccl_global float3 *dPdv_sd,
+	ccl_global float3 *dPdv_sd_DL_shadow,
+	#endif
+	ShaderClosure *closure_sd,
+	ShaderClosure *closure_sd_DL_shadow,
+
+	ccl_global int *num_closure_sd,
+	ccl_global int *num_closure_sd_DL_shadow,
+
+	ccl_global float *randb_closure_sd,
+	ccl_global float *randb_closure_sd_DL_shadow,
+
+	ccl_global float3 *ray_P_sd,
+	ccl_global float3 *ray_P_sd_DL_shadow,
+
+	ccl_global differential3 *ray_dP_sd,
+	ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+	ccl_constant KernelData *data,
+	ccl_global float *per_sample_output_buffers,
+	ccl_global uint *rng_state,
+	ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+	ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+	ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+	PathRadiance *PathRadiance_coop,  /* PathRadiance array to store PathRadiance values for all rays */
+	ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+	ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+	ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "kernel_textures.h"
+
+	int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+	int rng_state_offset_x,
+	int rng_state_offset_y,
+	int rng_state_stride,
+	ccl_global int *Queue_data,                  /* Memory for queues */
+	ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+	int queuesize,                               /* size (capacity) of the queue */
+	ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+	ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+	ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+	unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                         /* Number of samples to be processed in parallel */
+	)
+{
+
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "kernel_textures.h"
+
+	/* Load ShaderData structure; each member is pointed at the per-field
+	 * buffer that was passed in as a kernel argument.
+	 */
+	ShaderData *sd = (ShaderData *)shader_data_sd;
+	ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+	sd->P = P_sd;
+	sd_DL_shadow->P = P_sd_DL_shadow;
+
+	sd->N = N_sd;
+	sd_DL_shadow->N = N_sd_DL_shadow;
+
+	sd->Ng = Ng_sd;
+	sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+	sd->I = I_sd;
+	sd_DL_shadow->I = I_sd_DL_shadow;
+
+	sd->shader = shader_sd;
+	sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+	sd->flag = flag_sd;
+	sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+	sd->prim = prim_sd;
+	sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+	sd->type = type_sd;
+	sd_DL_shadow->type = type_sd_DL_shadow;
+
+	sd->u = u_sd;
+	sd_DL_shadow->u = u_sd_DL_shadow;
+
+	sd->v = v_sd;
+	sd_DL_shadow->v = v_sd_DL_shadow;
+
+	sd->object = object_sd;
+	sd_DL_shadow->object = object_sd_DL_shadow;
+
+	sd->time = time_sd;
+	sd_DL_shadow->time = time_sd_DL_shadow;
+
+	sd->ray_length = ray_length_sd;
+	sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+	sd->ray_depth = ray_depth_sd;
+	sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+	sd->transparent_depth = transparent_depth_sd;
+	sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+	sd->dP = dP_sd;
+	sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+	sd->dI = dI_sd;
+	sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+	sd->du = du_sd;
+	sd_DL_shadow->du = du_sd_DL_shadow;
+
+	sd->dv = dv_sd;
+	sd_DL_shadow->dv = dv_sd_DL_shadow;
+#endif
+	/* Guarded independently of __RAY_DIFFERENTIALS__, exactly like the
+	 * dPdu/dPdv kernel arguments above, so the pointers are always stored
+	 * whenever the arguments exist.
+	 */
+#ifdef __DPDU__
+	sd->dPdu = dPdu_sd;
+	sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+	sd->dPdv = dPdv_sd;
+	sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif
+
+	sd->closure = closure_sd;
+	sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+	sd->num_closure = num_closure_sd;
+	sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+	sd->randb_closure = randb_closure_sd;
+	sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+	sd->ray_P = ray_P_sd;
+	sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+	sd->ray_dP = ray_dP_sd;
+	sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+#ifdef __WORK_STEALING__
+	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	/* Initialize work_pool_wgs */
+	if(lid == 0) {
+		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+		work_pool_wgs[group_index] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+#endif // __WORK_STEALING__
+
+	/* Initialize queue data and queue index */
+	if(thread_index < queuesize) {
+		/* Initialize active ray queue */
+		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize background and buffer update queue */
+		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of AO queue */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of direct lighting queue */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+	}
+
+	if(thread_index == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+		/* The scene-intersect kernel should not use the queues the very first time,
+		 * since the queues would be empty.
+		 */
+		use_queues_flag[0] = 0;
+	}
+
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	if(x < (sw * parallel_samples) && y < sh) {
+
+		int ray_index = x + y * (sw * parallel_samples);
+
+		/* This is the first assignment to ray_state, so we don't use the ASSIGN_RAY_STATE macro */
+		ray_state[ray_index] = RAY_ACTIVE;
+
+		unsigned int my_sample;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work = 0;
+		/* get work */
+		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		/* Get the sample associated with the work */
+		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+		my_sample_tile = 0;
+
+		/* Get pixel and tile position associated with the work */
+		get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+		work_array[ray_index] = my_work;
+#else // __WORK_STEALING__
+
+		unsigned int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+		my_sample = my_sample_tile + start_sample;
+
+		/* Initialize work array */
+		work_array[ray_index] = my_sample;
+
+		/* Calculate pixel position of this ray */
+		pixel_x = sx + tile_x;
+		pixel_y = sy + tile_y;
+#endif // __WORK_STEALING__
+
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+		/* Initialise per_sample_output_buffers to all zeros */
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+		int per_sample_output_buffers_iterator = 0;
+		for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) {
+			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+		}
+
+		/* initialize random numbers and ray */
+		kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]);
+
+		if(Ray_coop[ray_index].t != 0.0f) {
+			/* Initialize throughput, L_transparent, Ray, PathState; these rays proceed with path-iteration */
+			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+			L_transparent_coop[ray_index] = 0.0f;
+			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+			path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+			debug_data_init(&debugdata_coop[ray_index]);
+#endif
+		} else {
+			/* These rays do not participate in path-iteration */
+
+			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			/* accumulate result in output buffer */
+			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+			path_rng_end(kg, rng_state, rng_coop[ray_index]);
+
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+	}
+
+	/* Mark rest of the ray-state indices as RAY_INACTIVE */
+	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+		/* First assignment, hence we don't use the ASSIGN_RAY_STATE macro */
+		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	}
+}
--- a/intern/cycles/kernel/kernel_debug.h
+++ b/intern/cycles/kernel/kernel_debug.h
@ -23,7 +23,7 @@ ccl_device_inline void debug_data_init(DebugData *debug_data)

 ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
                                                 ccl_global float *buffer,
-                                                 PathState *state,
+                                                 ccl_addr_space PathState *state,
                                                 DebugData *debug_data,
                                                 int sample)
 {
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

 /* See "Tracing Ray Differentials", Homan Igehy, 1999. */

-ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
 {
 	/* ray differential transfer through homogeneous medium, to
 	 * compute dPdx/dy at a shading point from the incoming ray */
@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP
 	dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
 }

-ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
 {
 	/* compute dIdx/dy at a shading point, we just need to negate the
 	 * differential of the ray direction */
@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
 	dI->dy = -dD.dy;
 }

-ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
 {
 	/* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
 	 * from the primitive, we can compute dudx/dy and dvdx/dy. these are
--- a/intern/cycles/kernel/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernel_direct_lighting.cl
@ -0,0 +1,137 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_ocl_path_trace_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------|                                            |--- ISLamp_coop
+ * shader_data --------------------------------------|                                            |--- LightRay_coop
+ * ray_state ----------------------------------------|                                            |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                            |
+ * kg (globals + data) ------------------------------|                                            |
+ * queuesize ----------------------------------------|                                            |
+ *
+ * note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself.
+ * Note on Queues :
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
+ * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
+ * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */
+__kernel void kernel_ocl_path_trace_direct_lighting(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,           /* Required for direct lighting */
+	ccl_global char *shader_DL,             /* Required for direct lighting */
+	ccl_global uint *rng_coop,              /* Required for direct lighting */
+	ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+	ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+	ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+	ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+	ccl_global char *ray_state,             /* Denotes the state of each ray */
+	ccl_global int *Queue_data,             /* Queue memory */
+	ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+	int queuesize                           /* Size (capacity) of each queue */
+	)
+{
+	/* Per work-group counter used by the local enqueue at the end of the kernel. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+	/* Work-items that fetched an empty queue slot must NOT return early, on any
+	 * device type. This kernel was documented as reaching barrier() calls after
+	 * this point (presumably inside the enqueue helper below), and OpenCL requires
+	 * a work-group barrier to be encountered by every work-item of the group.
+	 * Such work-items just skip the lighting work and enqueue nothing, since
+	 * enqueue_flag stays 0.
+	 */
+	if(ray_index != QUEUE_EMPTY_SLOT && IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		ShaderData *sd_DL  = (ShaderData *)shader_DL;
+
+		ccl_global PathState *state = &PathState_coop[ray_index];
+
+		/* direct lighting */
+#ifdef __EMISSION__
+		if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) {
+			/* sample illumination from lights to find path contribution */
+			ccl_global RNG* rng = &rng_coop[ray_index];
+			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+			float light_u, light_v;
+			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+			/* The ray has to be declared before its time can be set, and
+			 * ShaderData members are read through ccl_fetch() in this kernel.
+			 */
+			Ray light_ray;
+#ifdef __OBJECT_MOTION__
+			light_ray.time = ccl_fetch(sd, time);
+#endif
+			LightSample ls;
+			light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
+
+			BsdfEval L_light;
+			bool is_lamp;
+			if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) {
+				/* write intermediate data to global memory to access from the next kernel */
+				LightRay_coop[ray_index] = light_ray;
+				BSDFEval_coop[ray_index] = L_light;
+				ISLamp_coop[ray_index] = is_lamp;
+				/* mark ray state for next shadow kernel */
+				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+				enqueue_flag = 1;
+			}
+		}
+#endif
+	}
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays */
+	enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+#endif
+}
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@ -17,12 +17,20 @@
 CCL_NAMESPACE_BEGIN

 /* Direct Emission */
-
 ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
-	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
+	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	,ShaderData *sd_input
+#endif
+)
 {
 	/* setup shading at emitter */
-	ShaderData sd;
+#ifdef __SPLIT_KERNEL__
+	ShaderData *sd = sd_input;
+#else
+	ShaderData sd_object;
+	ShaderData *sd = &sd_object;
+#endif
 	float3 eval;

 #ifdef __BACKGROUND_MIS__
@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.dP = differential3_zero();
 		ray.dD = dI;

-		shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
-		eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
+		shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce);
+		eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION);
 	}
 	else
 #endif
 	{
-		shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
+		shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);

-		ls->Ng = sd.Ng;
+		ls->Ng = ccl_fetch(sd, Ng);

 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
-		shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);

 		/* evaluate emissive closure */
-		if(sd.flag & SD_EMISSION)
-			eval = shader_emissive_eval(kg, &sd);
+		if(ccl_fetch(sd, flag) & SD_EMISSION)
+			eval = shader_emissive_eval(kg, sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
@ -63,9 +71,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 	return eval;
 }

+/* The argument sd_DL is meaningful only for split kernel. Other uses can just pass NULL */
 ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
-	int bounce, int transparent_bounce)
+	int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	, ShaderData *sd_DL
+#endif
+	)
 {
 	if(ls->pdf == 0.0f)
 		return false;
@ -74,7 +87,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	differential3 dD = differential3_zero();

 	/* evaluate closure */
-	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
+
+	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time),
+	                                         bounce,
+	                                         transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	                                         ,sd_DL
+#endif
+	                                         );

 	if(is_zero(light_eval))
 		return false;
@ -83,7 +103,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	float bsdf_pdf;

 #ifdef __VOLUME__
-	if(sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 	else
 		shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
@ -118,8 +138,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,

 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
-		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
+		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
+		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));

 		if(ls->t == FLT_MAX) {
 			/* distant light */
@ -132,7 +152,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}

-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = differential3_zero();
 	}
 	else {
@ -154,14 +174,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);

 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);

 		return L*mis_weight;
@ -172,7 +192,12 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader

 /* Indirect Lamp Emission */

-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
+/* The argument sd is meaningful only for split kernel. Other uses can just pass NULL */
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission
+#ifdef __SPLIT_KERNEL__
+                                                ,ShaderData *sd
+#endif
+                                                )
 {
 	bool hit_lamp = false;

@ -196,7 +221,13 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		}
 #endif

-		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time,
+		                                state->bounce,
+		                                state->transparent_bounce
+#ifdef __SPLIT_KERNEL__
+		                                ,sd
+#endif
+		                                );

 #ifdef __VOLUME__
 		if(state->volume_stack[0].shader != SHADER_NONE) {
@ -225,7 +256,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st

 /* Indirect Background */

-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray
+#ifdef __SPLIT_KERNEL__
+                                               ,ShaderData *sd_global
+#endif
+                                               )
 {
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.surface_shader;
@ -241,11 +276,17 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}

+#ifdef __SPLIT_KERNEL__
 	/* evaluate background closure */
+	Ray priv_ray = *ray;
+	shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce);
+	float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION);
+#else
 	ShaderData sd;
 	shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);

 	float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
+#endif

 #ifdef __BACKGROUND_MIS__
 	/* check if background light exists or if we should skip pdf */
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals;

 #ifdef __KERNEL_OPENCL__

-typedef struct KernelGlobals {
+typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;

 #define KERNEL_TEX(type, ttype, name) \
--- a/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
@ -0,0 +1,283 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_AO and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                                          |--- PathState_coop
+ * PathRadiance_coop ------------------------------------|                                                                          |--- throughput_coop
+ * Intersection_coop ------------------------------------|                                                                          |--- L_transparent_coop
+ * PathState_coop ---------------------------------------|                                                                          |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------|                                                                          |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                                          |--- ShaderData
+ * ray_state --------------------------------------------|                                                                          |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                                          |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                                          |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------|                                                                          |--- AOBSDF_coop
+ * parallel_samples -------------------------------------|                                                                          |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------|                                                                          |
+ * sw ---------------------------------------------------|                                                                          |
+ * sh ---------------------------------------------------|                                                                          |
+ * sx ---------------------------------------------------|                                                                          |
+ * sy ---------------------------------------------------|                                                                          |
+ * stride -----------------------------------------------|                                                                          |
+ * work_array -------------------------------------------|                                                                          |
+ * queuesize --------------------------------------------|                                                                          |
+ * start_sample -----------------------------------------|                                                                          |
+ *
+ * Note on Queues :
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ */
+
+/* Kernel entry point. For each RAY_ACTIVE ray fetched from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS this handles holdout, writes the data
+ * passes, blurs the BSDF, accumulates indirect primitive emission, applies
+ * probabilistic path termination and sets up the AO shadow ray. Rays that
+ * terminate here become RAY_UPDATE_BUFFER and are enqueued at the end. */
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Required throughout the kernel except probabilistic path termination and AO */
+	ccl_global float *per_sample_output_buffers,
+	ccl_global uint *rng_coop,                  /* Required for "kernel_write_data_passes" and AO */
+	ccl_global float3 *throughput_coop,         /* Required for handling holdout material and AO */
+	ccl_global float *L_transparent_coop,       /* Required for handling holdout material */
+	PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
+	ccl_global PathState *PathState_coop,       /* Required throughout the kernel and AO */
+	Intersection *Intersection_coop, /* Required for indirect primitive emission */
+	ccl_global float3 *AOAlpha_coop,            /* Required for AO */
+	ccl_global float3 *AOBSDF_coop,             /* Required for AO */
+	ccl_global Ray *AOLightRay_coop,            /* Required for AO */
+	int sw, int sh, int sx, int sy, int stride,
+	ccl_global char *ray_state,                /* Denotes the state of each ray */
+	ccl_global unsigned int *work_array,       /* Denotes the work that each ray belongs to */
+	ccl_global int *Queue_data,                /* Queue memory */
+	ccl_global int *Queue_index,               /* Tracks the number of elements in each queue */
+	int queuesize,                             /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+	unsigned int start_sample,
+#endif
+	int parallel_samples                       /* Number of samples to be processed in parallel */
+	)
+{
+	ccl_local unsigned int local_queue_atomics_bg;
+	ccl_local unsigned int local_queue_atomics_ao;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+	/* Work-items that fetched an empty queue slot must stay alive on every
+	 * device type, GPU included. The enqueue step at the end of this kernel
+	 * relies on work-group barrier() calls (this is why the CPU path always
+	 * kept such work-items running), and OpenCL requires a barrier to be
+	 * reached by all work-items of a work-group or by none of them.
+	 * Returning early for only some work-items, as the former
+	 * __COMPUTE_DEVICE_GPU__ branch did, is therefore undefined behavior.
+	 * Idle work-items skip the per-ray work below and fall through to the
+	 * enqueue calls with both enqueue flags left at 0. */
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+#endif
+		unsigned int tile_x;
+		unsigned int tile_y;
+		int my_sample_tile;
+		unsigned int sample;
+
+		ccl_global RNG *rng = 0x0;
+		ccl_global PathState *state = 0x0;
+		float3 throughput;
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+			throughput = throughput_coop[ray_index];
+			state = &PathState_coop[ray_index];
+			rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+			my_work = work_array[ray_index];
+			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+			my_sample_tile = 0;
+#else // __WORK_STEALING__
+			sample = work_array[ray_index];
+			/* buffer's stride is "stride"; Find x and y using ray_index */
+			int tile_index = ray_index / parallel_samples;
+			tile_x = tile_index % sw;
+			tile_y = tile_index / sw;
+			my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif // __WORK_STEALING__
+			per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+			/* holdout */
+#ifdef __HOLDOUT__
+			if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
+				if(kernel_data.background.transparent) {
+					float3 holdout_weight;
+
+					if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+						holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+					else
+						holdout_weight = shader_holdout_eval(kg, sd);
+
+					/* any throughput is ok, should all be identical here */
+					L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+				}
+
+				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+					enqueue_flag = 1;
+				}
+			}
+#endif
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+			PathRadiance *L = &PathRadiance_coop[ray_index];
+			/* holdout mask objects do not write data passes */
+			kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
+
+			/* blurring of bsdf after bounces, for rays that have a small likelihood
+				* of following this particular path (diffuse, rough glossy) */
+			if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+				float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+				if(blur_pdf < 1.0f) {
+					float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+					shader_bsdf_blur(kg, sd, blur_roughness);
+				}
+			}
+
+#ifdef __EMISSION__
+			/* emission */
+			if(ccl_fetch(sd, flag) & SD_EMISSION) {
+				/* todo: is isect.t wrong here for transparent surfaces? */
+				float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
+				path_radiance_accum_emission(L, throughput, emission, state->bounce);
+			}
+#endif
+
+			/* path termination. this is a strange place to put the termination, it's
+			 * mainly due to the mixed in MIS that we use. gives too many unneeded
+			 * shader evaluations, only need emission if we are going to terminate */
+			float probability = path_state_terminate_probability(kg, state, throughput);
+
+			if(probability == 0.0f) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				enqueue_flag = 1;
+			}
+
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				if(probability != 1.0f) {
+					float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+
+					if(terminate >= probability) {
+						ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+						enqueue_flag = 1;
+					} else {
+						throughput_coop[ray_index] = throughput/probability;
+					}
+				}
+			}
+		}
+
+#ifdef __AO__
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			/* ambient occlusion */
+			if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
+				/* todo: solve correlation */
+				float bsdf_u, bsdf_v;
+				path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+				float ao_factor = kernel_data.background.ao_factor;
+				float3 ao_N;
+				AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+				AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+				float3 ao_D;
+				float ao_pdf;
+				sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+				if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+					Ray _ray;
+					_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+					_ray.D = ao_D;
+					_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+					_ray.time = ccl_fetch(sd, time);
+#endif
+					_ray.dP = ccl_fetch(sd, dP);
+					_ray.dD = differential3_zero();
+					AOLightRay_coop[ray_index] = _ray;
+
+					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+					enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+				}
+			}
+		}
+#endif
+	}
+
+	/* Enqueue RAY_UPDATE_BUFFER rays */
+	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays */
+	enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
+#endif
+}
--- a/intern/cycles/kernel/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernel_lamp_emission.cl
@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_lamp_emission
+ * This is the 3rd kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows,
+ * throughput_coop ------------------------------------|--- kernel_ocl_path_trace_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------|                                          |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------|                                          |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------|                                          |
+ * Intersection_coop ----------------------------------|                                          |
+ * ray_state ------------------------------------------|                                          |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                                          |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                                          |
+ * queuesize ------------------------------------------|                                          |
+ * use_queues_flag ------------------------------------|                                          |
+ * sw -------------------------------------------------|                                          |
+ * sh -------------------------------------------------|                                          |
+ * parallel_samples -----------------------------------|                                          |
+ *
+ * note : shader_data is neither input nor output. It is just filled and consumed within this same kernel, kernel_ocl_path_trace_lamp_emission.
+ */
+/* Kernel entry point. For each RAY_ACTIVE or RAY_HIT_BACKGROUND ray this
+ * accumulates indirect lamp emission (lamp MIS) into the ray's PathRadiance.
+ * The ray index is fetched from QUEUE_ACTIVE_AND_REGENERATED_RAYS when
+ * use_queues_flag[0] is set, otherwise it is derived from the global id. */
+__kernel void kernel_ocl_path_trace_lamp_emission(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Required for lamp emission */
+	ccl_global float3 *throughput_coop,         /* Required for lamp emission */
+	PathRadiance *PathRadiance_coop, /* Required for lamp emission */
+	ccl_global Ray *Ray_coop,                   /* Required for lamp emission */
+	ccl_global PathState *PathState_coop,       /* Required for lamp emission */
+	Intersection *Intersection_coop, /* Required for lamp emission */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	int sw, int sh,
+	ccl_global int *Queue_data,                 /* Memory for queues */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
+	int queuesize,                              /* Size (capacity) of queues */
+	ccl_global char *use_queues_flag,           /* used to decide if this kernel should use queues to fetch ray index */
+	int parallel_samples                        /* Number of samples to be processed in parallel */
+	)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* We will empty this queue in this kernel */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+
+	/* Fetch use_queues_flag */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+
+		float3 throughput = throughput_coop[ray_index];
+		Ray ray = Ray_coop[ray_index];
+		PathState state = PathState_coop[ray_index];
+
+#ifdef __LAMP_MIS__
+		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+			/* ray starting from previous non-transparent bounce */
+			Ray light_ray;
+
+			light_ray.P = ray.P - state.ray_t*ray.D;
+			state.ray_t += Intersection_coop[ray_index].t;
+			/* `state` is a private copy of the path state. Store the accumulated
+			 * distance back, otherwise the next path iteration would start from
+			 * the old ray_t again and the light ray would no longer begin at the
+			 * previous non-transparent bounce. */
+			PathState_coop[ray_index].ray_t = state.ray_t;
+			light_ray.D = ray.D;
+			light_ray.t = state.ray_t;
+			light_ray.time = ray.time;
+			light_ray.dD = ray.dD;
+			light_ray.dP = ray.dP;
+			/* intersect with lamp */
+			float3 emission;
+
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+				path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			}
+		}
+#endif
+		/* __VOLUME__ feature is disabled. The block below is compiled out with
+		 * "#if 0" and kept for reference only: it uses names that are not
+		 * defined in this kernel (hit, isect, rng) and continue/break outside
+		 * of a loop, so it cannot be re-enabled as-is. */
+#if 0
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else
+#endif
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif
+			}
+		}
+#endif
+#endif
+	}
+}
--- a/intern/cycles/kernel/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernel_next_iteration_setup.cl
@ -0,0 +1,176 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_setup_next_iteration kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up
+ * Ray for the next iteration of path-iteration and accumulating radiance
+ * corresponding to AO and direct-lighting
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_setup_next_iteration -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------|                                                |--- throughput_coop
+ * PathState_coop ---------------------------------------|                                                |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                |--- PathState_coop
+ * ray_state --------------------------------------------|                                                |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                |--- use_queues_flag
+ * Ray_coop ---------------------------------------------|                                                |
+ * kg (globals + data) ----------------------------------|                                                |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues,
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ */
+
+/* Kernel entry point. Folds the AO / direct-light shadow results (read back
+ * from the .P and .t fields of the light-ray slots; presumably stored there
+ * by the shadow-blocked kernel, TODO confirm) into the path radiance, then
+ * sets up the next bounce for RAY_ACTIVE rays. Rays whose bounce setup fails
+ * become RAY_UPDATE_BUFFER and are enqueued at the end. */
+__kernel void kernel_ocl_path_trace_setup_next_iteration(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Required for setting up ray for next iteration */
+	ccl_global uint *rng_coop,                  /* Required for setting up ray for next iteration */
+	ccl_global float3 *throughput_coop,         /* Required for setting up ray for next iteration */
+	PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
+	ccl_global Ray *Ray_coop,                   /* Required for setting up ray for next iteration */
+	ccl_global PathState *PathState_coop,       /* Required for setting up ray for next iteration */
+	ccl_global Ray *LightRay_dl_coop,           /* Required for radiance update - direct lighting */
+	ccl_global int *ISLamp_coop,                /* Required for radiance update - direct lighting */
+	ccl_global BsdfEval *BSDFEval_coop,         /* Required for radiance update - direct lighting */
+	ccl_global Ray *LightRay_ao_coop,           /* Required for radiance update - AO */
+	ccl_global float3 *AOBSDF_coop,             /* Required for radiance update - AO */
+	ccl_global float3 *AOAlpha_coop,            /* Required for radiance update - AO */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	ccl_global int *Queue_data,                 /* Queue memory */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
+	int queuesize,                              /* Size (capacity) of each queue */
+	ccl_global char *use_queues_flag            /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
+	)
+{
+
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		* has already been executed at least once. From the next time,
+		* scene-intersect kernel may operate on queues to fetch ray index
+		*/
+		use_queues_flag[0] = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS
+		 * queues that were made empty during the previous kernel
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+	/* Work-items that fetched an empty queue slot must stay alive on every
+	 * device type, GPU included. The enqueue step at the end of this kernel
+	 * relies on work-group barrier() calls (this is why the CPU path always
+	 * kept such work-items running), and OpenCL requires a barrier to be
+	 * reached by all work-items of a work-group or by none of them.
+	 * Returning early for only some work-items, as the former
+	 * __COMPUTE_DEVICE_GPU__ branch did, is therefore undefined behavior.
+	 * Idle work-items skip the per-ray work below and fall through to the
+	 * enqueue call with enqueue_flag left at 0. */
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = 0x0;
+		ccl_global PathState *state = 0x0;
+
+		/* Path radiance update for AO/Direct_lighting's shadow blocked */
+		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+			state = &PathState_coop[ray_index];
+			L = &PathRadiance_coop[ray_index];
+			float3 _throughput = throughput_coop[ray_index];
+
+			if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+				float3 shadow = LightRay_ao_coop[ray_index].P;
+				char update_path_radiance = LightRay_ao_coop[ray_index].t;
+				if(update_path_radiance) {
+					path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce);
+				}
+				REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+			}
+
+			if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+				float3 shadow = LightRay_dl_coop[ray_index].P;
+				char update_path_radiance = LightRay_dl_coop[ray_index].t;
+				if(update_path_radiance) {
+					BsdfEval L_light = BSDFEval_coop[ray_index];
+					path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]);
+				}
+				REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+			}
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+			ccl_global float3 *throughput = &throughput_coop[ray_index];
+			ccl_global Ray *ray = &Ray_coop[ray_index];
+			ccl_global RNG* rng = &rng_coop[ray_index];
+			state = &PathState_coop[ray_index];
+			L = &PathRadiance_coop[ray_index];
+
+			/* compute direct lighting and next bounce */
+			if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				enqueue_flag = 1;
+			}
+		}
+	}
+
+	/* Enqueue RAY_UPDATE_BUFFER rays */
+	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
 	ccl_global float *buf = buffer;
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	atomic_add_float(buf, value); /* Always accumulates: unlike the #else path, 'sample' is not consulted here, so nothing is reset at sample 0. NOTE(review): presumably the buffer is cleared before the first sample - confirm. */
+#else
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }

 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+
+	atomic_add_float(buf_x, value.x); /* Each component is added by its own atomic call, so the three floats are not updated as one unit; 'sample' is not consulted on this path */
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+#else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }

 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+	ccl_global float *buf_w = buffer + 3;
+
+	atomic_add_float(buf_x, value.x); /* Each component is added by its own atomic call, so the four floats are not updated as one unit; 'sample' is not consulted on this path */
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+	atomic_add_float(buf_w, value.w);
+#else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }

 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, PathState *state, float3 throughput)
+	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(sd->flag & SD_TRANSPARENT) ||
+		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{

 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, sd->P);
+					float depth = camera_distance(kg, ccl_fetch(sd, P));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, sd->object);
+					float id = object_pass_id(kg, ccl_fetch(sd, object));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}

 			if(flag & PASS_NORMAL) {
-				float3 normal = sd->N;
+				float3 normal = ccl_fetch(sd, N);
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@ -101,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;

-		float depth = camera_distance(kg, sd->P);
+		float depth = camera_distance(kg, ccl_fetch(sd, P));
 		float mist = saturate((depth - mist_start)*mist_inv_depth);

 		/* falloff */
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@ -42,6 +42,7 @@
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
 #include "kernel_emission.h"
+#include "kernel_path_common.h"
 #include "kernel_path_surface.h"
 #include "kernel_path_volume.h"

@ -305,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *

 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);

-	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;

-		light_ray.P = ray_offset(sd->P, sd->Ng);
+		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
+		light_ray.time = ccl_fetch(sd, time);
 #endif
-		light_ray.dP = sd->dP;
+		light_ray.dP = ccl_fetch(sd, dP);
 		light_ray.dD = differential3_zero();

 		if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@ -341,17 +342,17 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathR

 		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);

-		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
 			Ray light_ray;
 			float3 ao_shadow;

-			light_ray.P = ray_offset(sd->P, sd->Ng);
+			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-			light_ray.time = sd->time;
+			light_ray.time = ccl_fetch(sd, time);
 #endif
-			light_ray.dP = sd->dP;
+			light_ray.dP = ccl_fetch(sd, dP);
 			light_ray.dD = differential3_zero();

 			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@ -381,7 +382,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 #ifdef __VOLUME__
 		Ray volume_ray = *ray;
 		bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-		                                sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+		                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif

 		/* compute lighting with the BSDF closure */
@ -712,8 +713,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
 	PathState *state, PathRadiance *L)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
-		const ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];

 		if(!CLOSURE_IS_BSDF(sc->type))
 			continue;
@ -764,8 +765,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                        Ray *ray,
                                                        float3 throughput)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];

 		if(!CLOSURE_IS_BSSRDF(sc->type))
 			continue;
@ -786,7 +787,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __VOLUME__
 			Ray volume_ray = *ray;
 			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-			                                sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif

 			/* compute lighting with the BSDF closure */
@ -1143,32 +1144,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in

 #endif

-ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
-{
-	float filter_u;
-	float filter_v;
-
-	int num_samples = kernel_data.integrator.aa_samples;
-
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
-
-	/* sample camera ray */
-
-	float lens_u = 0.0f, lens_v = 0.0f;
-
-	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
-	float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
-#endif
-
-	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
 ccl_device void kernel_path_trace(KernelGlobals *kg,
 	ccl_global float *buffer, ccl_global uint *rng_state,
 	int sample, int x, int y, int offset, int stride)
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
+                                               ccl_global uint *rng_state, /* forwarded to path_rng_init() */
+                                               int sample, /* sample number, forwarded to the RNG helpers */
+                                               int x, int y, /* forwarded to path_rng_init() and camera_sample(); presumably the pixel position - confirm */
+                                               ccl_addr_space RNG *rng, /* handed to path_rng_init(), then used for the lens and time samples */
+                                               ccl_addr_space Ray *ray) /* handed to camera_sample() as its last argument */
+{
+	float filter_u;
+	float filter_v;
+
+	int num_samples = kernel_data.integrator.aa_samples;
+
+	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+
+	/* sample camera ray: lens and time samples keep their 0.0f defaults unless the checks below draw them */
+
+	float lens_u = 0.0f, lens_v = 0.0f;
+
+	if(kernel_data.cam.aperturesize > 0.0f) /* lens sample is only drawn for a non-zero aperture size */
+		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+
+	float time = 0.0f;
+
+#ifdef __CAMERA_MOTION__
+	if(kernel_data.cam.shuttertime != -1.0f) /* NOTE(review): -1.0f looks like the "no motion blur" marker - confirm */
+		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+#endif
+
+	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@ -16,7 +16,7 @@

 CCL_NAMESPACE_BEGIN

-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
+ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;

@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 #endif
 }

-ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
+ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
 {
 	/* ray through transparent keeps same flags from previous ray and is
 	 * not counted as a regular bounce, transparent has separate max */
@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }

-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
+ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
 		/* transparent rays treated separately */
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(sd->flag & SD_BSDF_HAS_EVAL))
+	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
 		return;

 	Ray light_ray;
@ -32,7 +32,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 	bool is_lamp;

 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif

 	if(sample_all_lights) {
@ -53,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);

 				LightSample ls;
-				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+				lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);

 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@ -85,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 					light_t = 0.5f*light_t;

 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);

 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@ -106,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);

 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+		light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);

 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@ -149,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	path_state_next(kg, state, label);

 	/* setup ray */
-	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 	ray->D = bsdf_omega_in;
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = sd->dP;
+	ray->dP = ccl_fetch(sd, dP);
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = sd->time;
+	ray->time = ccl_fetch(sd, time);
 #endif

 #ifdef __VOLUME__
@ -181,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,

 #endif

+#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
 		return;

 	/* sample illumination from lights to find path contribution */
@ -199,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	bool is_lamp;

 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif

 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+	light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);

 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
@ -216,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	}
 #endif
 }
+#endif

 /* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(sd->flag & SD_BSDF) {
+	if(ccl_fetch(sd, flag) & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
@ -254,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		path_state_next(kg, state, label);

 		/* setup ray */
-		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 		ray->D = bsdf_omega_in;

 		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;

 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = bsdf_domega_in;
 #endif

@ -275,21 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */

 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);

 		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;

 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(sd->P, -sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 #endif

 		/* enter/exit volume */
--- a/intern/cycles/kernel/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernel_queue_enqueue.cl
@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_queues.h"
+
+/*
+ * The kernel "kernel_ocl_path_trace_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_ocl_path_trace_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_ocl_path_trace_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                          |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                          |
+ * queuesize -------------------------------------------|                                          |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+	ccl_global int *Queue_data,   /* Queue memory; receives the enqueued ray indices */
+	ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+	ccl_global char *ray_state,   /* Denotes the state of each ray */
+	int queuesize                 /* Size (capacity) of each queue */
+	)
+{
+	/* One work-group counter per queue; we have only 2 cases (Hit/Not-Hit). NOTE(review): indexed by the queue constants below, which presumably are 0 and 1 - confirm. */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	if(lidx == 0) { /* a single work-item clears both counters, so this also holds for a work-group of one work-item */
+		local_queue_atomics[0] = local_queue_atomics[1] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* counters must be zeroed before anyone increments them */
+
+	int queue_number = -1; /* -1: this ray is written to neither queue */
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	} else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx; /* slot within this work-group's share of the queue; only set and used when queue_number != -1 */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* all local increments are done: the counters now hold per-queue totals for the work-group */
+
+	if(lidx == 0) { /* one work-item reserves the work-group's range in each global queue and replaces each total by the returned base offset */
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* base offsets must be visible before they are read below */
+
+	unsigned int my_gqidx; /* final position inside Queue_data */
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_QUEUE_H__
+#define __KERNEL_QUEUE_H__
+
+/*
+ * Queue utility functions for split kernel
+ */
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+/*
+ * Append a ray index to one queue; the slot is reserved with an atomic increment and is not checked against queue_size
+ */
+ccl_device void enqueue_ray_index (
+              int ray_index,               /* Value to store in the queue */
+              int queue_number,            /* Which queue receives the ray index */
+              ccl_global int *queues,      /* All queues, laid out back to back */
+              int queue_size,              /* Number of slots reserved per queue */
+              ccl_global int *queue_index  /* One fill counter per queue, bumped atomically */
+              )
+{
+	/* atomic_inc() hands back the counter value from before the increment: that is our slot */
+	int slot_in_queue = atomic_inc(&queue_index[queue_number]);
+	queues[queue_number * queue_size + slot_in_queue] = ray_index;
+}
+
+/*
+ * Fetch the queue entry that belongs to this thread
+ * Returns the ray index stored in slot 'thread_index' of the queue when
+ * there is work for the thread;
+ * Returns 'QUEUE_EMPTY_SLOT' when that slot holds no ray, i.e. the thread
+ * has nothing to do.
+ */
+ccl_device int get_ray_index (
+              int thread_index,       /* Slot of the queue read by this thread */
+              int queue_number,       /* Queue to read from */
+              ccl_global int *queues, /* All queues, laid out back to back */
+              int queuesize,          /* Number of slots per queue */
+              int empty_queue         /* Non-zero: reset the slot to QUEUE_EMPTY_SLOT after reading a ray from it */
+              )
+{
+	ccl_global int *slot = &queues[queue_number * queuesize + thread_index];
+	int ray_index = *slot;
+	/* Only a slot that actually held a ray needs to be cleared */
+	if(ray_index != QUEUE_EMPTY_SLOT && empty_queue) {
+		*slot = QUEUE_EMPTY_SLOT;
+	}
+
+	return ray_index;
+}
+
+/* The following functions are to realize Local memory variant of enqueue ray index function */
+
+/* All threads of a work-group should call this function, because it contains barriers; enqueue_flag selects which of them actually enqueue */
+ccl_device void enqueue_ray_index_local(
+              int ray_index,                               /* Ray index to enqueue*/
+              int queue_number,                            /* Queue in which to enqueue ray index */
+              char enqueue_flag,                           /* True for threads whose ray index has to be enqueued */
+              int queuesize,                               /* queue size */
+              ccl_local unsigned int *local_queue_atomics,   /* To do local queue atomics; NOTE(review): used as a counter, so presumably zeroed by the caller - confirm */
+              ccl_global int *Queue_data,                  /* Queues */
+              ccl_global int *Queue_index                  /* To do global queue atomics */
+              )
+{
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	/* Get local queue id: the counter value before the increment is this thread's slot within the work-group */
+	unsigned int lqidx;
+	if(enqueue_flag) {
+		lqidx = atomic_inc(local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue offset: one thread adds the work-group's count to the global counter and stores the previous value back as the base offset */
+	if(lidx == 0) {
+		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue index and enqueue ray: start of the queue + work-group base offset + local slot */
+	if(enqueue_flag) {
+		unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
+
+ccl_device unsigned int get_local_queue_index(
+               int queue_number, /* Queue whose local counter is bumped; used directly as an array index, so it must not be -1 */
+               ccl_local unsigned int *local_queue_atomics /* Work-group counters, one per queue */
+               )
+{
+	/* The counter value from before the increment is the caller's slot within the work-group */
+	return atomic_inc(&local_queue_atomics[queue_number]);
+}
+
+ccl_device unsigned int get_global_per_queue_offset(
+               int queue_number,                            /* Queue to reserve space in */
+               ccl_local unsigned int *local_queue_atomics, /* Per-queue element counts of the work-group */
+               ccl_global int* global_queue_atomics         /* Per-queue global fill counters */
+               )
+{
+	const unsigned int reserved = local_queue_atomics[queue_number];
+	return atomic_add(&global_queue_atomics[queue_number], reserved); /* previous counter value = base offset of the reserved range */
+}
+
+ccl_device unsigned int get_global_queue_index(
+               int queue_number,                                /* Queue being written to */
+               int queuesize,                                   /* Number of slots per queue */
+               unsigned int lqidx,                              /* Caller's slot within its work-group */
+               ccl_local unsigned int * global_per_queue_offset /* Per-queue base offset of the work-group */
+               )
+{
+	const unsigned int queue_start = queuesize * queue_number;
+	return queue_start + global_per_queue_offset[queue_number] + lqidx;
+}
+
+#endif // __KERNEL_QUEUE_H__
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }

-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int
 #endif
 }

-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int
 	}
 }

-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
 * For branches in the path we must be careful not to reuse the same number
 * in a sequence and offset accordingly. */

-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }

-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }

-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }

-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }

-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }

-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
--- a/intern/cycles/kernel/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernel_scene_intersect.cl
@ -0,0 +1,164 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_scene_intersect kernel.
+ * This is the second kernel in the ray tracing logic. This is the first
+ * of the path iteration kernels. This kernel takes care of scene_intersect function.
+ *
+ * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
+ * This kernel processes rays of ray state RAY_ACTIVE
+ * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
+ *
+ * The input and output are as follows,
+ *
+ * Ray_coop ---------------------------------------|--------- kernel_ocl_path_trace_scene_intersect----------|--- PathState
+ * PathState_coop ---------------------------------|                                                         |--- Intersection
+ * ray_state --------------------------------------|                                                         |--- ray_state
+ * use_queues_flag --------------------------------|                                                         |
+ * parallel_samples -------------------------------|                                                         |
+ * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                                         |
+ * kg (data + globals) ----------------------------|                                                         |
+ * rng_coop ---------------------------------------|                                                         |
+ * sw ---------------------------------------------|                                                         |
+ * sh ---------------------------------------------|                                                         |
+ * queuesize --------------------------------------|                                                         |
+ *
+ * Note on Queues :
+ * Ideally we would want kernel_ocl_path_trace_scene_intersect to work on queues.
+ * But during the very first time, the queues will be empty and hence we perform a direct mapping
+ * between ray-index and thread-index; From the next time onward, the queue will be filled and
+ * we may start operating on queues.
+ *
+ * State of queue during the first time this kernel is called :
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel.
+ *
+ * State of queues during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
+ * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
+ * are the rays that were in RAY_ACTIVE state during the initial enqueue but, on further processing
+ * by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernels, even after fetching from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
+ * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
+ * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
+ */
+
+__kernel void kernel_ocl_path_trace_scene_intersect(  /* Intersects every RAY_ACTIVE ray with the scene; rays that miss become RAY_HIT_BACKGROUND */
+	ccl_global char *globals,                   /* Opaque buffer, cast to KernelGlobals* in the body */
+	ccl_constant KernelData *data,              /* Not referenced by name in this body; presumably reached via kernel_data - TODO confirm */
+	ccl_global uint *rng_coop,                  /* Per-ray RNG value; only read when __HAIR__ is defined */
+	ccl_global Ray *Ray_coop,                   /* Required for scene_intersect */
+	ccl_global PathState *PathState_coop,       /* Required for scene_intersect */
+	Intersection *Intersection_coop,            /* Required for scene_intersect; the per-ray entry is passed to it as the output slot */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	int sw, int sh,                             /* Bounds of the direct mapping: x < sw * parallel_samples and y < sh */
+	ccl_global int *Queue_data,                 /* Memory for queues */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in queues (not read in this kernel body) */
+	int queuesize,                              /* Size (capacity) of queues */
+	ccl_global char *use_queues_flag,           /* used to decide if this kernel should use queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                        /* Number of samples to be processed in parallel */
+	)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Fetch use_queues_flag */
+	ccl_local char local_use_queues_flag;  /* one work-item writes it, every work-item reads it after the barrier */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);  /* must stay ahead of every early return below so all work-items reach it */
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* flatten the 2D global id */
+		ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {  /* no ray queued for this thread */
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){  /* direct thread-to-ray mapping (queues not in use) */
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
+	/* All regenerated rays become active here */
+	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))  /* only active rays are intersected */
+		return;
+
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+	Intersection *isect = &Intersection_coop[ray_index];  /* handed to scene_intersect as the result slot */
+	PathState state = PathState_coop[ray_index];  /* local copy; never written back by this kernel */
+	Ray ray = Ray_coop[ray_index];  /* local copy; never written back by this kernel */
+
+	/* intersect scene */
+	uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+	float difl = 0.0f, extmax = 0.0f;
+	uint lcg_state = 0;
+	RNG rng = rng_coop[ray_index];  /* local copy; not stored back to rng_coop here */
+
+	if(kernel_data.bvh.have_curves) {
+		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
+			float3 pixdiff = ray.dD.dx + ray.dD.dy;
+			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+	}
+
+	bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);  /* no curve parameters without __HAIR__ */
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	if(state.flag & PATH_RAY_CAMERA) {  /* traversal steps are accumulated for camera rays only */
+		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
+	}
+#endif
+
+	if(!hit) {
+		/* Change the state of rays that hit the background;
+		 * These rays undergo special processing in the
+		 * background_bufferUpdate kernel */
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+	}
+}
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce)
 {
 #ifdef __INSTANCING__
-	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif

-	sd->type = isect->type;
-	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, type) = isect->type;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));

 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif

-	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
-	sd->ray_length = isect->t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
+	ccl_fetch(sd, ray_length) = isect->t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;

 #ifdef __UV__
-	sd->u = isect->u;
-	sd->v = isect->v;
+	ccl_fetch(sd, u) = isect->u;
+	ccl_fetch(sd, v) = isect->v;
 #endif

 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));

-		sd->shader = __float_as_int(curvedata.z);
-		sd->P = bvh_curve_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
+		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));

 		/* vectors */
-		sd->P = triangle_refine(kg, sd, isect, ray);
-		sd->Ng = Ng;
-		sd->N = Ng;
+		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, Ng) = Ng;
+		ccl_fetch(sd, N) = Ng;
 		
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));

 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 #endif
 	}
 	else {
@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}

-	sd->I = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;

-	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);

 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
 #ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 #endif
 	}
 #endif

 	/* backfacing test */
-	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);

 	if(backfacing) {
-		sd->flag |= SD_BACKFACING;
-		sd->Ng = -sd->Ng;
-		sd->N = -sd->N;
+		ccl_fetch(sd, flag) |= SD_BACKFACING;
+		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-		sd->dPdu = -sd->dPdu;
-		sd->dPdv = -sd->dPdv;
+		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 	}

 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
-	differential_incoming(&sd->dI, ray->dD);
-	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
+	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
+	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
 #endif
 }

@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = P;
-	sd->N = Ng;
-	sd->Ng = Ng;
-	sd->I = I;
-	sd->shader = shader;
-	sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+	ccl_fetch(sd, P) = P;
+	ccl_fetch(sd, N) = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, I) = I;
+	ccl_fetch(sd, shader) = shader;
+	ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;

 	/* primitive */
 #ifdef __INSTANCING__
-	sd->object = object;
+	ccl_fetch(sd, object) = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	sd->prim = prim;
+	ccl_fetch(sd, prim) = prim;
 #ifdef __UV__
-	sd->u = u;
-	sd->v = v;
+	ccl_fetch(sd, u) = u;
+	ccl_fetch(sd, v) = v;
 #endif
-	sd->ray_length = t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;

 	/* detect instancing, for non-instanced the object index is -object-1 */
 #ifdef __INSTANCING__
 	bool instanced = false;

-	if(sd->prim != PRIM_NONE) {
-		if(sd->object >= 0)
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		if(ccl_fetch(sd, object) >= 0)
 			instanced = true;
 		else
 #endif
-			sd->object = ~sd->object;
+			ccl_fetch(sd, object) = ~ccl_fetch(sd, object);
 #ifdef __INSTANCING__
 	}
 #endif

-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
-	if(sd->object != OBJECT_NONE) {
-		sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+		ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));

 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
 	}

-	sd->time = time;
+	ccl_fetch(sd, time) = time;
 #else
 	}
 #endif

-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));

 #ifdef __INSTANCING__
 			if(instanced)
-				object_normal_transform(kg, sd, &sd->N);
+				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
 #endif
 		}

 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));

 #ifdef __INSTANCING__
 		if(instanced) {
-			object_dir_transform(kg, sd, &sd->dPdu);
-			object_dir_transform(kg, sd, &sd->dPdv);
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 		}
 #endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}

 	/* backfacing test */
-	if(sd->prim != PRIM_NONE) {
-		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);

 		if(backfacing) {
-			sd->flag |= SD_BACKFACING;
-			sd->Ng = -sd->Ng;
-			sd->N = -sd->N;
+			ccl_fetch(sd, flag) |= SD_BACKFACING;
+			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-			sd->dPdu = -sd->dPdu;
-			sd->dPdv = -sd->dPdv;
+			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 		}
 	}

 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	sd->dP = differential3_zero();
-	sd->dI = differential3_zero();
-	sd->du = differential_zero();
-	sd->dv = differential_zero();
+	ccl_fetch(sd, dP) = differential3_zero();
+	ccl_fetch(sd, dI) = differential3_zero();
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }

@ -355,45 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = ray->D;
-	sd->N = -ray->D;
-	sd->Ng = -ray->D;
-	sd->I = -ray->D;
-	sd->shader = kernel_data.background.surface_shader;
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, P) = ray->D;
+	ccl_fetch(sd, N) = -ray->D;
+	ccl_fetch(sd, Ng) = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
+	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
 #ifdef __OBJECT_MOTION__
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
-	sd->ray_length = 0.0f;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = 0.0f;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;

 #ifdef __INSTANCING__
-	sd->object = PRIM_NONE;
+	ccl_fetch(sd, object) = PRIM_NONE;
 #endif
-	sd->prim = PRIM_NONE;
+	ccl_fetch(sd, prim) = PRIM_NONE;
 #ifdef __UV__
-	sd->u = 0.0f;
-	sd->v = 0.0f;
+	ccl_fetch(sd, u) = 0.0f;
+	ccl_fetch(sd, v) = 0.0f;
 #endif

 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif

 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	sd->dP = ray->dD;
-	differential_incoming(&sd->dI, sd->dP);
-	sd->du = differential_zero();
-	sd->dv = differential_zero();
+	ccl_fetch(sd, dP) = ray->dD;
+	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }

 /* ShaderData setup from point inside volume */

+#ifdef __VOLUME__
 ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
@ -439,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->ray_P = ray->P;
 	sd->ray_dP = ray->dP;
 }
+#endif

 /* Merging */

@ -491,11 +493,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
 		if(i == skip_bsdf)
 			continue;

-		const ShaderClosure *sc = &sd->closure[i];
+		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
@ -513,7 +515,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 	*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
 }

-ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
+ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd,
 	const float3 omega_in, BsdfEval *eval, float *pdf)
 {
 	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
@ -527,22 +529,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 {
 	int sampled = 0;

-	if(sd->num_closure > 1) {
+	if(ccl_fetch(sd, num_closure) > 1) {
 		/* pick a BSDF closure based on sample weights */
 		float sum = 0.0f;

-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type))
 				sum += sc->sample_weight;
 		}

-		float r = sd->randb_closure*sum;
+		float r = ccl_fetch(sd, randb_closure)*sum;
 		sum = 0.0f;

-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
@ -552,13 +554,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 			}
 		}

-		if(sampled == sd->num_closure) {
+		if(sampled == ccl_fetch(sd, num_closure)) {
 			*pdf = 0.0f;
 			return LABEL_NONE;
 		}
 	}

-	const ShaderClosure *sc = &sd->closure[sampled];
+	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+
 	int label;
 	float3 eval;

@ -568,7 +571,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);

-		if(sd->num_closure > 1) {
+		if(ccl_fetch(sd, num_closure) > 1) {
 			float sweight = sc->sample_weight;
 			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
 		}
@ -595,8 +598,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s

 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
@ -605,13 +608,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn

 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-	if(sd->flag & SD_HAS_ONLY_VOLUME)
+	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);

 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@ -634,8 +637,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@ -648,8 +651,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@ -662,8 +665,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@ -676,8 +679,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
@ -691,8 +694,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			eval += sc->weight*ao_factor;
@ -700,12 +703,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += sd->N*average(sc->weight);
+			N += ccl_fetch(sd, N)*average(sc->weight);
 		}
 	}

 	if(is_zero(N))
-		N = sd->N;
+		N = ccl_fetch(sd, N);
 	else
 		N = normalize(N);

@ -719,8 +722,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;

-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			float avg_weight = fabsf(average(sc->weight));
@ -733,7 +736,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}

 	if(N_)
-		*N_ = (is_zero(N))? sd->N: normalize(N);
+		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);

 	if(texture_blur_)
 		*texture_blur_ = texture_blur/weight_sum;
@ -745,7 +748,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b

 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(sd->Ng, sd->I);
+	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
 }

 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@ -753,8 +756,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@ -769,8 +772,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);

-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@ -784,8 +787,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = randb;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = randb;

 #ifdef __OSL__
 	if(kg->osl)
@ -796,11 +799,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 #else
-		sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
-		sd->closure->N = sd->N;
-		sd->closure->data0 = 0.0f;
-		sd->closure->data1 = 0.0f;
-		sd->flag |= bsdf_diffuse_setup(&sd->closure);
+		ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f);
+		ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N);
+		ccl_fetch_array(sd, closure, 0)->data0 = 0.0f;
+		ccl_fetch_array(sd, closure, 0)->data1 = 0.0f;
+		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0));
 #endif
 	}
 }
@ -809,8 +812,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,

 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;

 #ifdef __OSL__
 	if(kg->osl) {
@ -825,8 +828,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int

 		float3 eval = make_float3(0.0f, 0.0f, 0.0f);

-		for(int i = 0; i< sd->num_closure; i++) {
-			const ShaderClosure *sc = &sd->closure[i];
+		for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);

 			if(CLOSURE_IS_BACKGROUND(sc->type))
 				eval += sc->weight;
@ -999,8 +1002,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,

 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;

 	/* this will modify sd->P */
 #ifdef __SVM__
--- a/intern/cycles/kernel/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernel_shader_eval.cl
@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_shader_evaluation kernel
+ * This kernel is the 5th kernel in the ray tracing logic. This is
+ * the 4th kernel in path iteration. This kernel sets up the ShaderData
+ * structure from the values computed by the previous kernels. It also identifies
+ * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * The input and output of the kernel is as follows,
+ * rng_coop -------------------------------------------|--- kernel_ocl_path_trace_shader_evaluation --|--- shader_data
+ * Ray_coop -------------------------------------------|                                              |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathState_coop -------------------------------------|                                              |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Intersection_coop ----------------------------------|                                              |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------|                                              |
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                                              |
+ * ray_state ------------------------------------------|                                              |
+ * kg (globals + data) --------------------------------|                                              |
+ * queuesize ------------------------------------------|                                              |
+ *
+ * Note on Queues :
+ * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE;
+ * State of queues when this kernel is called,
+ * at entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * at exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ */
+
+__kernel void kernel_ocl_path_trace_shader_evaluation(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Output ShaderData structure to be filled */
+	ccl_global uint *rng_coop,                  /* Required for rbsdf calculation */
+	ccl_global Ray *Ray_coop,                   /* Required for setting up shader from ray */
+	ccl_global PathState *PathState_coop,       /* Required for all functions in this kernel */
+	Intersection *Intersection_coop, /* Required for setting up shader from ray */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	ccl_global int *Queue_data,                 /* queue memory */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
+	int queuesize                               /* Size (capacity) of each queue */
+	)
+{
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); /* linearized 2D global work-item id */
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0; /* a single work-item of the group resets the shared counter */
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* the reset above must be visible to the whole work-group before enqueueing */
+
+	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+
+	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); /* from here on ray_index is whatever get_ray_index() returns for this work-item */
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	/* Continue on with shader evaluation */
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		Intersection *isect = &Intersection_coop[ray_index];
+		ccl_global uint *rng = &rng_coop[ray_index];
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		Ray ray = Ray_coop[ray_index]; /* copied by value; a pointer to this copy is passed below */
+
+		shader_setup_from_ray(kg, sd, isect, &ray, state->bounce, state->transparent_bounce);
+		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
+		shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+	}
+}
--- a/intern/cycles/kernel/kernel_shaderdata_vars.h
+++ b/intern/cycles/kernel/kernel_shaderdata_vars.h
@ -0,0 +1,99 @@
+/*
+* Copyright 2011-2015 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef SD_VAR
+#define SD_VAR(type, what) /* fallback: expands to nothing when the includer did not define SD_VAR */
+#endif
+#ifndef SD_CLOSURE_VAR
+#define SD_CLOSURE_VAR(type, what, max_closure) /* fallback: expands to nothing when the includer did not define SD_CLOSURE_VAR */
+#endif
+
+/* position */
+SD_VAR(float3, P)
+/* smooth normal for shading */
+SD_VAR(float3, N)
+/* true geometric normal */
+SD_VAR(float3, Ng)
+/* view/incoming direction */
+SD_VAR(float3, I)
+/* shader id */
+SD_VAR(int, shader)
+/* booleans describing shader, see ShaderDataFlag */
+SD_VAR(int, flag)
+
+/* primitive id if there is one, ~0 otherwise */
+SD_VAR(int, prim)
+
+/* combined type and curve segment for hair */
+SD_VAR(int, type)
+
+/* parametric coordinates
+* - barycentric weights for triangles */
+SD_VAR(float, u)
+SD_VAR(float, v)
+/* object id if there is one, ~0 otherwise */
+SD_VAR(int, object)
+
+/* motion blur sample time */
+SD_VAR(float, time)
+
+/* length of the ray being shaded */
+SD_VAR(float, ray_length)
+
+/* ray bounce depth */
+SD_VAR(int, ray_depth)
+
+/* ray transparent depth */
+SD_VAR(int, transparent_depth)
+
+#ifdef __RAY_DIFFERENTIALS__
+/* differential of P. these are orthogonal to Ng, not N */
+SD_VAR(differential3, dP)
+/* differential of I */
+SD_VAR(differential3, dI)
+/* differential of u, v */
+SD_VAR(differential, du)
+SD_VAR(differential, dv)
+#endif
+#ifdef __DPDU__
+/* differential of P w.r.t. parametric coordinates. note that dPdu is
+* not readily suitable as a tangent for shading on triangles. */
+SD_VAR(float3, dPdu)
+SD_VAR(float3, dPdv)
+#endif
+
+#ifdef __OBJECT_MOTION__
+/* object <-> world space transformations, cached to avoid
+* re-interpolating them constantly for shading */
+SD_VAR(Transform, ob_tfm)
+SD_VAR(Transform, ob_itfm)
+#endif
+
+/* Closure data, we store a fixed array of closures */
+SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE)
+SD_VAR(int, num_closure)
+SD_VAR(float, randb_closure)
+
+/* ray start position, only set for backgrounds */
+SD_VAR(float3, ray_P)
+SD_VAR(differential3, ray_dP)
+
+#ifdef __OSL__
+SD_VAR(struct KernelGlobals *, osl_globals)
+#endif
+
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@ -180,19 +180,37 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 * potentially transparent, and only in that case start marching. this gives
 * one extra ray cast for the cases were we do want transparency. */

-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+/* The arguments sd_mem and isect_mem exist only for the OpenCL split kernel (__SPLIT_KERNEL__), where they provide the ShaderData and Intersection storage used by this function. */
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow
+#ifdef __SPLIT_KERNEL__
+                                      , ShaderData *sd_mem, Intersection *isect_mem
+#endif
+                                      )
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);

-	if(ray->t == 0.0f)
+	if(ray_input->t == 0.0f)
 		return false;

-	Intersection isect;
-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#ifdef __SPLIT_KERNEL__
+	Ray private_ray = *ray_input;
+	Ray *ray = &private_ray;
+#else
+	Ray *ray = ray_input;
+#endif
+
+#ifdef __SPLIT_KERNEL__
+	Intersection *isect = isect_mem;
+#else
+	Intersection isect_object;
+	Intersection *isect = &isect_object;
+#endif
+
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);

 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, &isect)) {
+		if(shader_transparent_shadow(kg, isect)) {
 			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 			float3 Pend = ray->P + ray->D*ray->t;
 			int bounce = state->transparent_bounce;
@ -204,9 +222,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;

-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
+				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
 				{
-
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
 					if(ps.volume_stack[0].shader != SHADER_NONE)
@ -218,39 +235,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 					return false;
 				}

-				if(!shader_transparent_shadow(kg, &isect))
+				if(!shader_transparent_shadow(kg, isect))
 					return true;

 #ifdef __VOLUME__
 				/* attenuation between last surface and next surface */
 				if(ps.volume_stack[0].shader != SHADER_NONE) {
 					Ray segment_ray = *ray;
-					segment_ray.t = isect.t;
+					segment_ray.t = isect->t;
 					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
 				}
 #endif

 				/* setup shader data at surface */
-				ShaderData sd;
-				shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce);
+#ifdef __SPLIT_KERNEL__
+				ShaderData *sd = sd_mem;
+#else
+				ShaderData sd_object;
+				ShaderData *sd = &sd_object;
+#endif
+				shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce);

 				/* attenuation from transparent surface */
-				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-					shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					throughput *= shader_bsdf_transparency(kg, &sd);
+				if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) {
+					shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+					throughput *= shader_bsdf_transparency(kg, sd);
 				}

 				if(is_zero(throughput))
 					return true;

 				/* move ray forward */
-				ray->P = ray_offset(sd.P, -sd.Ng);
+				ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 				if(ray->t != FLT_MAX)
 					ray->D = normalize_len(Pend - ray->P, &ray->t);

 #ifdef __VOLUME__
 				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+				kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
 #endif

 				bounce++;
--- a/intern/cycles/kernel/kernel_shadow_blocked.cl
+++ b/intern/cycles/kernel/kernel_shadow_blocked.cl
@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_shadow_blocked kernel.
+ * This is the ninth kernel in the ray tracing logic. This is the eighth
+ * of the path iteration kernels. This kernel takes care of "shadow ray cast"
+ * logic of the direct lighting and AO  part of ray tracing.
+ *
+ * The input and output are as follows,
+ *
+ * PathState_coop ----------------------------------|--- kernel_ocl_path_trace_shadow_blocked --|
+ * LightRay_dl_coop --------------------------------|                                           |--- LightRay_dl_coop
+ * LightRay_ao_coop --------------------------------|                                           |--- LightRay_ao_coop
+ * ray_state ---------------------------------------|                                           |--- ray_state
+ * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                                           |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
+	      QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                                           |
+ * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
+	      QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                                           |
+ * kg (globals + data) -----------------------------|                                           |
+ * queuesize ---------------------------------------|                                           |
+ *
+ * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself.
+ * Note on queues :
+ * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
+ * these queues in this kernel.
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
+ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
+ */
+
+__kernel void kernel_ocl_path_trace_shadow_blocked_direct_lighting(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_shadow,             /* Required for shadow blocked */
+	ccl_global PathState *PathState_coop,       /* Required for shadow blocked */
+	ccl_global Ray *LightRay_dl_coop,           /* Required for direct lighting's shadow blocked */
+	ccl_global Ray *LightRay_ao_coop,           /* Required for AO's shadow blocked */
+	Intersection *Intersection_coop_AO,         /* Per-ray intersection storage for AO shadow rays */
+	Intersection *Intersection_coop_DL,         /* Per-ray intersection storage for direct lighting shadow rays */
+	ccl_global char *ray_state,                 /* State and flags of each ray */
+	ccl_global int *Queue_data,                 /* Queue memory */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
+	int queuesize,                              /* Size (capacity) of each queue */
+	int total_num_rays
+	)
+{
+#if 0
+	/* we will make the Queue_index entries '0' in the next kernel */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); /* row-major index inside the work-group: the row stride is the local size, not the local id */
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {
+		/* Exactly one work-item per group caches both queue lengths in local memory */
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+	/* flag determining if we need to update L */
+	char update_path_radiance = 0;
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	/* The first ao_queue_length work-items serve the AO queue, the next dl_queue_length serve the DL queue */
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+		/* Load kernel global structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd_shadow  = (ShaderData *)shader_shadow;
+
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
+		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
+		Intersection *isect_ao_global = &Intersection_coop_AO[ray_index];
+		Intersection *isect_dl_global = &Intersection_coop_DL[ray_index];
+
+		ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global;
+		Intersection *isect_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global; /* compare against the type: the bare enum value is non-zero and would always pick the AO slot */
+
+		float3 shadow;
+		update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global));
+
+		/* We use light_ray_global's P and t to store shadow and update_path_radiance */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
--- a/intern/cycles/kernel/kernel_split.h
+++ b/intern/cycles/kernel/kernel_split.h
@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  _KERNEL_SPLIT_H_
+#define  _KERNEL_SPLIT_H_
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+/* atomic_add_float function should be defined prior to its usage in kernel_passes.h */
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+/* Utility functions for float atomics: atomic_add_float() adds operand to *source using an integer compare-exchange retry loop */
+/* float atomics impl credits : http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */
+ccl_device_inline void atomic_add_float(volatile ccl_global float *source, const float operand) {
+	union {
+		unsigned int intVal; /* same bits viewed as an integer, used for the compare-exchange */
+		float floatVal;
+
+	} newVal;
+	union {
+		unsigned int intVal; /* same bits viewed as an integer, used for the compare-exchange */
+		float floatVal;
+
+	} prevVal;
+	do { /* repeat while atomic_cmpxchg reports a value different from the one read below */
+		prevVal.floatVal = *source;
+		newVal.floatVal = prevVal.floatVal + operand;
+
+	} while (atomic_cmpxchg((volatile ccl_global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
+}
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+
+#ifdef __OSL__
+#include "osl_shader.h"
+#endif
+
+#include "kernel_random.h"
+#include "kernel_projection.h"
+#include "kernel_montecarlo.h"
+#include "kernel_differential.h"
+#include "kernel_camera.h"
+
+#include "geom/geom.h"
+
+#include "kernel_accumulate.h"
+#include "kernel_shader.h"
+#include "kernel_light.h"
+#include "kernel_passes.h"
+
+#ifdef __SUBSURFACE__
+#include "kernel_subsurface.h"
+#endif
+
+#ifdef __VOLUME__
+#include "kernel_volume.h"
+#endif
+
+#include "kernel_path_state.h"
+#include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_common.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
+
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
+#include "kernel_queues.h"
+#include "kernel_work_stealing.h"
+
+#endif
--- a/intern/cycles/kernel/kernel_sum_all_radiance.cl
+++ b/intern/cycles/kernel/kernel_sum_all_radiance.cl
@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+/*
+* Since we process various samples in parallel; The output radiance of different samples
+* are stored in different locations; This kernel combines the output radiance contributed
+* by all different samples and stores them in the RenderTile's output buffer.
+*/
+
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+	ccl_constant KernelData *data,               /* Supplies film.pass_stride, used to offset into the buffers */
+	ccl_global float *buffer,                    /* Output buffer of RenderTile (accumulation target) */
+	ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+	int parallel_samples, int sw, int sh, int stride,
+	int buffer_offset_x,
+	int buffer_offset_y,
+	int buffer_stride,
+	int start_sample)
+{
+	const int px = get_global_id(0);
+	const int py = get_global_id(1);
+
+	/* Work-items outside the sw x sh region have nothing to do. */
+	if(px >= sw || py >= sh)
+		return;
+
+	const int floats_per_pixel = data->film.pass_stride;
+
+	/* Destination pixel in the tile buffer, and the first of this pixel's samples. */
+	ccl_global float *dst = buffer + ((buffer_offset_x + px) + (buffer_offset_y + py) * buffer_stride) * floats_per_pixel;
+	ccl_global float *src = per_sample_output_buffer + ((px + py * stride) * parallel_samples) * floats_per_pixel;
+
+	/* Consecutive samples of one pixel lie floats_per_pixel floats apart. */
+	for(int s = 0; s < parallel_samples; s++, src += floats_per_pixel) {
+		/* The very first sample overwrites the output; every later one is added to it. */
+		const int overwrite = (start_sample == 0 && s == 0);
+		for(int p = 0; p < floats_per_pixel; p++) {
+			dst[p] = overwrite ? src[p] : dst[p] + src[p];
+		}
+	}
+}
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -24,6 +24,13 @@
 #define __KERNEL_CPU__
 #endif

+/* TODO(sergey): This is only to make it possible to include this header
+ * from outside of the kernel. but this could be done somewhat cleaner?
+ */
+#ifndef ccl_addr_space
+#define ccl_addr_space
+#endif
+
 CCL_NAMESPACE_BEGIN

 /* constants */
@ -90,7 +97,19 @@ CCL_NAMESPACE_BEGIN

 #ifdef __KERNEL_OPENCL_NVIDIA__
 #define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+/* TODO(sergey): Advanced shading code still requires work
+ * for split kernel.
+ */
+#  ifndef __SPLIT_KERNEL__
+#    define __KERNEL_ADV_SHADING__
+#  else
+#    define __MULTI_CLOSURE__
+#    define __TRANSPARENT_SHADOWS__
+#    define __PASSES__
+#    define __BACKGROUND_MIS__
+#    define __LAMP_MIS__
+#    define __AO__
+#  endif
 #endif

 #ifdef __KERNEL_OPENCL_APPLE__
@ -103,7 +122,7 @@ CCL_NAMESPACE_BEGIN
 #define __KERNEL_SHADING__
 //__KERNEL_ADV_SHADING__
 #define __MULTI_CLOSURE__
-#define __TRANSPARENT_SHADOWS__
+//#define __TRANSPARENT_SHADOWS__
 #define __PASSES__
 #define __BACKGROUND_MIS__
 #define __LAMP_MIS__
@ -117,10 +136,22 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_OPENCL_INTEL_CPU__
 #define __CL_USE_NATIVE__
 #define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+/* TODO(sergey): Advanced shading code still requires work
+ * for split kernel.
+ */
+#  ifndef __SPLIT_KERNEL__
+#    define __KERNEL_ADV_SHADING__
+#  else
+#    define __MULTI_CLOSURE__
+#    define __TRANSPARENT_SHADOWS__
+#    define __PASSES__
+#    define __BACKGROUND_MIS__
+#    define __LAMP_MIS__
+#    define __AO__
+#  endif
 #endif

-#endif
+#endif // __KERNEL_OPENCL__

 /* kernel features */
 #define __SOBOL__
@ -322,7 +353,7 @@ typedef enum PassType {

 #ifdef __PASSES__

-typedef struct PathRadiance {
+typedef ccl_addr_space struct PathRadiance {
 	int use_light_pass;

 	float3 emission;
@ -374,7 +405,7 @@ typedef struct BsdfEval {

 #else

-typedef float3 PathRadiance;
+typedef ccl_addr_space float3 PathRadiance;
 typedef float3 BsdfEval;

 #endif
@ -441,9 +472,9 @@ typedef struct differential {
 typedef struct Ray {
 	float3 P;		/* origin */
 	float3 D;		/* direction */
+
 	float t;		/* length of the ray */
 	float time;		/* time (for motion blur) */
-
 #ifdef __RAY_DIFFERENTIALS__
 	differential3 dP;
 	differential3 dD;
@ -452,7 +483,7 @@ typedef struct Ray {

 /* Intersection */

-typedef struct Intersection {
+typedef ccl_addr_space struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
@ -537,7 +568,11 @@ typedef enum AttributeStandard {
 /* Closure data */

 #ifdef __MULTI_CLOSURE__
-#define MAX_CLOSURE 64
+#  ifndef __MAX_CLOSURE__
+#     define MAX_CLOSURE 64
+#  else
+#    define MAX_CLOSURE __MAX_CLOSURE__
+#  endif
 #else
 #define MAX_CLOSURE 1
 #endif
@ -547,7 +582,7 @@ typedef enum AttributeStandard {
 *   does not put own padding trying to align this members.
 * - We make sure OSL pointer is also 16 bytes aligned.
 */
-typedef struct ShaderClosure {
+typedef ccl_addr_space struct ShaderClosure {
 	float3 weight;
 	float3 N;
 	float3 T;
@ -632,78 +667,23 @@ enum ShaderDataFlag {

 struct KernelGlobals;

-typedef struct ShaderData {
-	/* position */
-	float3 P;
-	/* smooth normal for shading */
-	float3 N;
-	/* true geometric normal */
-	float3 Ng;
-	/* view/incoming direction */
-	float3 I;
-	/* shader id */
-	int shader;
-	/* booleans describing shader, see ShaderDataFlag */
-	int flag;
-
-	/* primitive id if there is one, ~0 otherwise */
-	int prim;
-
-	/* combined type and curve segment for hair */
-	int type;
-
-	/* parametric coordinates
-	 * - barycentric weights for triangles */
-	float u, v;
-	/* object id if there is one, ~0 otherwise */
-	int object;
-
-	/* motion blur sample time */
-	float time;
-	
-	/* length of the ray being shaded */
-	float ray_length;
-	
-	/* ray bounce depth */
-	int ray_depth;
-
-	/* ray transparent depth */
-	int transparent_depth;
-
-#ifdef __RAY_DIFFERENTIALS__
-	/* differential of P. these are orthogonal to Ng, not N */
-	differential3 dP;
-	/* differential of I */
-	differential3 dI;
-	/* differential of u, v */
-	differential du;
-	differential dv;
-#endif
-#ifdef __DPDU__
-	/* differential of P w.r.t. parametric coordinates. note that dPdu is
-	 * not readily suitable as a tangent for shading on triangles. */
-	float3 dPdu, dPdv;
+#ifdef __SPLIT_KERNEL__ /* split kernel: every ShaderData member is a pointer, indexed per work-item through TIDX */
+#define SD_VAR(type, what) ccl_global type *what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type *what;
+#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0))
+#define ccl_fetch(s, t) ((s)->t[TIDX]) /* lvalue of member t for the calling work-item */
+#define ccl_fetch_array(s, t, index) (&(s)->t[TIDX * MAX_CLOSURE + (index)]) /* pointer to element index of this work-item's MAX_CLOSURE-sized slice */
+#else /* otherwise: members are stored by value inside ShaderData */
+#define SD_VAR(type, what) type what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure];
+#define ccl_fetch(s, t) ((s)->t) /* arguments are parenthesized so expression arguments expand safely */
+#define ccl_fetch_array(s, t, index) (&(s)->t[(index)])
+#endif

-#ifdef __OBJECT_MOTION__
-	/* object <-> world space transformations, cached to avoid
-	 * re-interpolating them constantly for shading */
-	Transform ob_tfm;
-	Transform ob_itfm;
-#endif
+typedef ccl_addr_space struct ShaderData {

-	/* Closure data, we store a fixed array of closures */
-	ShaderClosure closure[MAX_CLOSURE];
-	int num_closure;
-	float randb_closure;
+#include "kernel_shaderdata_vars.h"

-	/* ray start position, only set for backgrounds */
-	float3 ray_P;
-	differential3 ray_dP;
-
-#ifdef __OSL__
-	struct KernelGlobals *osl_globals;
-#endif
 } ShaderData;

 /* Path State */
@ -996,13 +976,62 @@ typedef struct KernelData {
 } KernelData;

 #ifdef __KERNEL_DEBUG__
-typedef struct DebugData {
+typedef ccl_addr_space struct DebugData {
 	// Total number of BVH node traversal steps and primitives intersections
 	// for the camera rays.
 	int num_bvh_traversal_steps;
 } DebugData;
 #endif

+/* Declarations required for split kernel */
+
+/* Macro for queues */
+/* Value marking queue's empty slot */
+#define QUEUE_EMPTY_SLOT -1
+
+/*
+* Queue 1 - Active rays
+* Queue 2 - Background queue
+* Queue 3 - Shadow ray cast kernel - AO
+* Queue 4 - Shadow ray cast kernel - direct lighting
+*/
+#define NUM_QUEUES 4
+
+/* Queue names */
+enum QueueNumber {
+	QUEUE_ACTIVE_AND_REGENERATED_RAYS,         /* All active rays and regenerated rays are enqueued here */
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,      /* All
+	                                            * 1.Background-hit rays,
+	                                            * 2.Rays that have exited path-iteration but need to update output buffer
+	                                            * 3.Rays to be regenerated
+	                                            * are enqueued here */
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS,             /* All rays for which a shadow ray should be cast to determine radiance
+	                                              contribution for AO are enqueued here */
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS,             /* All rays for which a shadow ray should be cast to determine radiance
+	                                              contribution for direct lighting are enqueued here */
+};
+
+/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
+#define RAY_STATE_MASK 0x007
+#define RAY_FLAG_MASK 0x0F8
+enum RayState {
+	RAY_ACTIVE = 0,             // Denotes ray is actively involved in path-iteration
+	RAY_INACTIVE = 1,           // Denotes ray has completed processing all samples and is inactive
+	RAY_UPDATE_BUFFER = 2,      // Denotes ray has exited path-iteration and needs to update output buffer
+	RAY_HIT_BACKGROUND = 3,     // Denotes ray has hit background
+	RAY_TO_REGENERATE = 4,      // Denotes ray has to be regenerated
+	RAY_REGENERATED = 5,        // Denotes ray has been regenerated
+	RAY_SKIP_DL = 6,            // Denotes ray should skip direct lighting
+	RAY_SHADOW_RAY_CAST_AO = 16, // Flags that the ray has to execute shadow blocked function in AO part
+	RAY_SHADOW_RAY_CAST_DL = 32 // Flags that the ray has to execute shadow blocked function in direct lighting part
+};
+
+#define ASSIGN_RAY_STATE(ray_state, ray_index, state) ((ray_state)[ray_index] = (((ray_state)[ray_index] & RAY_FLAG_MASK) | (state))) /* replace the state bits, keep the flag bits */
+#define IS_STATE(ray_state, ray_index, state) (((ray_state)[ray_index] & RAY_STATE_MASK) == (state)) /* compare the state bits only */
+#define ADD_RAY_FLAG(ray_state, ray_index, flag) ((ray_state)[ray_index] = ((ray_state)[ray_index] | (flag))) /* set flag bit(s) */
+#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) ((ray_state)[ray_index] = ((ray_state)[ray_index] & (~(flag)))) /* clear flag bit(s); flag is parenthesized so a combined mask is negated as a whole */
+#define IS_FLAG(ray_state, ray_index, flag) ((ray_state)[ray_index] & (flag)) /* non-zero when any of the flag bit(s) is set */
+
 CCL_NAMESPACE_END

 #endif /*  __KERNEL_TYPES_H__ */
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_WORK_STEALING_H__
+#define __KERNEL_WORK_STEALING_H__
+
+/*
+ * Utility functions for work stealing
+ */
+
+#ifdef __WORK_STEALING__
+
+#ifdef __KERNEL_OPENCL__
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#endif
+
+/* Returns the work-group index along dimension `dim` (0 or 1) for the ray
+ * at `ray_index`.
+ *
+ * The ray index is split by (tile_dim_x * parallel_samples): the remainder
+ * is used for dimension 0 and the quotient for dimension 1, each divided by
+ * the local work size of that dimension.
+ */
+uint get_group_id_with_ray_index(uint ray_index,
+                                 uint tile_dim_x,
+                                 uint tile_dim_y,
+                                 uint parallel_samples,
+                                 int dim)
+{
+	uint rays_per_row = tile_dim_x * parallel_samples;
+
+	if(dim != 0) {
+		/* Only dimensions 0 and 1 are expected here. */
+		kernel_assert(dim == 1);
+		return (ray_index / rays_per_row) / get_local_size(1);
+	}
+
+	return (ray_index % rays_per_row) / get_local_size(0);
+}
+
+/* Returns the amount of work for the work-group (grp_idx, grp_idy): the
+ * number of threads of the group that lie within the tile border, multiplied
+ * by num_samples.
+ */
+uint get_total_work(uint tile_dim_x,
+                    uint tile_dim_y,
+                    uint grp_idx,
+                    uint grp_idy,
+                    uint num_samples)
+{
+	/* By default a group covers its full local size. */
+	uint active_x = get_local_size(0);
+	uint active_y = get_local_size(1);
+
+	/* The last group along a dimension only covers the remainder of the
+	 * tile, unless the tile size is an exact multiple of the local size. */
+	if(grp_idx == (get_num_groups(0) - 1)) {
+		uint remainder_x = tile_dim_x % get_local_size(0);
+		if(remainder_x != 0) {
+			active_x = remainder_x;
+		}
+	}
+	if(grp_idy == (get_num_groups(1) - 1)) {
+		uint remainder_y = tile_dim_y % get_local_size(1);
+		if(remainder_y != 0) {
+			active_y = remainder_y;
+		}
+	}
+
+	return active_x * active_y * num_samples;
+}
+
+/* Fetches the next work item from the pool entry of the work-group that
+ * `ray_index` maps to, and stores it in *my_work.
+ *
+ * Returns 1 when the fetched work is valid (below the total work of the
+ * group), 0 when there is no next work available.
+ */
+int get_next_work(ccl_global uint *work_pool,
+                  ccl_private uint *my_work,
+                  uint tile_dim_x,
+                  uint tile_dim_y,
+                  uint num_samples,
+                  uint parallel_samples,
+                  uint ray_index)
+{
+	uint group_x = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	uint group_y = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+	uint work_limit = get_total_work(
+		tile_dim_x, tile_dim_y, group_x, group_y, num_samples);
+
+	/* One counter per work-group, indexed row by row. */
+	uint pool_slot = group_y * get_num_groups(0) + group_x;
+
+	*my_work = atomic_inc(&work_pool[pool_slot]);
+	if(*my_work < work_limit) {
+		return 1;
+	}
+	return 0;
+}
+
+/* Decodes the sample number from the assigned `my_work`: my_work divided by
+ * the number of threads of the work-group that lie within the tile border.
+ *
+ * The passed my_work is assumed to be valid.
+ */
+uint get_my_sample(uint my_work,
+                   uint tile_dim_x,
+                   uint tile_dim_y,
+                   uint parallel_samples,
+                   uint ray_index)
+{
+	uint group_x = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	uint group_y = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	/* By default a group covers its full local size. */
+	uint active_x = get_local_size(0);
+	uint active_y = get_local_size(1);
+
+	/* The last group along a dimension only covers the remainder of the
+	 * tile, unless the tile size is an exact multiple of the local size. */
+	if(group_x == (get_num_groups(0) - 1)) {
+		uint remainder_x = tile_dim_x % get_local_size(0);
+		if(remainder_x != 0) {
+			active_x = remainder_x;
+		}
+	}
+	if(group_y == (get_num_groups(1) - 1)) {
+		uint remainder_y = tile_dim_y % get_local_size(1);
+		if(remainder_y != 0) {
+			active_y = remainder_y;
+		}
+	}
+
+	uint pixels_in_group = active_x * active_y;
+	return my_work / pixels_in_group;
+}
+
+/* Decodes the pixel and tile position from the assigned `my_work`.
+ *
+ * pixel_x, pixel_y receive the position including tile_offset_x/y.
+ * tile_x, tile_y receive the same position with the tile offset subtracted.
+ */
+void get_pixel_tile_position(ccl_private uint *pixel_x,
+                             ccl_private uint *pixel_y,
+                             ccl_private uint *tile_x,
+                             ccl_private uint *tile_y,
+                             uint my_work,
+                             uint tile_dim_x,
+                             uint tile_dim_y,
+                             uint tile_offset_x,
+                             uint tile_offset_y,
+                             uint parallel_samples,
+                             uint ray_index)
+{
+	uint group_x = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	uint group_y = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	/* By default a group covers its full local size. */
+	uint active_x = get_local_size(0);
+	uint active_y = get_local_size(1);
+
+	/* The last group along a dimension only covers the remainder of the
+	 * tile, unless the tile size is an exact multiple of the local size. */
+	if(group_x == (get_num_groups(0) - 1)) {
+		uint remainder_x = tile_dim_x % get_local_size(0);
+		if(remainder_x != 0) {
+			active_x = remainder_x;
+		}
+	}
+	if(group_y == (get_num_groups(1) - 1)) {
+		uint remainder_y = tile_dim_y % get_local_size(1);
+		if(remainder_y != 0) {
+			active_y = remainder_y;
+		}
+	}
+
+	/* Index of the pixel inside the group, then its column and row. */
+	uint pixel_in_group = my_work % (active_x * active_y);
+	uint column_in_group = pixel_in_group % active_x;
+	uint row_in_group = pixel_in_group / active_x;
+
+	*pixel_x = tile_offset_x + (group_x * get_local_size(0)) + column_in_group;
+	*pixel_y = tile_offset_y + (group_y * get_local_size(1)) + row_in_group;
+	*tile_x = *pixel_x - tile_offset_x;
+	*tile_y = *pixel_y - tile_offset_y;
+}
+
+#endif  /* __WORK_STEALING__ */
+
+#endif  /* __KERNEL_WORK_STEALING_H__ */
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@ -189,7 +189,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = sd->shader & SHADER_MASK;
+	int offset = ccl_fetch(sd, shader) & SHADER_MASK;

 	while(1) {
 		uint4 node = read_node(kg, &offset);
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
 	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
 {
-	if(sd->object != OBJECT_NONE) {
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
 		/* find attribute by unique id */
 		uint id = node.y;
-		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+		uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-		attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+		attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 		
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;

 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, sd->P);
+	vector = transform_point(&tfm, ccl_fetch(sd, P));
 	zdepth = vector.z;
 	distance = len(vector);

--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@ -25,12 +25,12 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 			sc->data0 = eta;
 			sc->data1 = 0.0f;
 			sc->data2 = 0.0f;
-			sd->flag |= bsdf_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
 		}
 		else {
 			sc->data0 = 0.0f;
 			sc->data1 = 0.0f;
-			sd->flag |= bsdf_reflection_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 		}
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@ -39,9 +39,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;

 		if(refract)
-			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 	}
 	else {
 		sc->data0 = roughness;
@ -49,23 +49,23 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;

 		if(refract)
-			sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_ggx_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 	}
 }

 ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));

-	if(sd->num_closure < MAX_CLOSURE) {
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight *= mix_weight;
 		sc->type = type;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 		return sc;
 	}

@ -74,14 +74,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C

 ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = sc->weight * mix_weight;
 	float sample_weight = fabsf(average(weight));

-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@ -93,14 +94,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float

 ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
 	float sample_weight = fabsf(average(weight));

-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@ -124,7 +126,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;

-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; 
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);

 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@ -142,13 +144,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data0 = 0.0f;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_diffuse_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc);
 				}
 				else {
 					sc->data0 = roughness;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_oren_nayar_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc);
 				}
 			}
 			break;
@ -161,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_translucent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc);
 			}
 			break;
 		}
@ -173,7 +175,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_transparent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 			}
 			break;
 		}
@ -195,13 +197,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *

 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					sd->flag |= bsdf_reflection_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					sd->flag |= bsdf_microfacet_ggx_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc);
 			}

 			break;
@ -219,7 +221,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;

 				float eta = fmaxf(param2, 1e-5f);
-				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;

 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@ -227,7 +229,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;

-					sd->flag |= bsdf_refraction_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
 				}
 				else {
 					sc->data0 = param1;
@ -235,9 +237,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data2 = eta;

 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 					else
-						sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 				}
 			}

@ -254,15 +256,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;

 			/* fresnel */
-			float cosNO = dot(N, sd->I);
+			float cosNO = dot(N, ccl_fetch(sd, I));
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;

 			/* reflection */
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight;
 			float sample_weight = sc->sample_weight;

@ -283,7 +285,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif

 			/* refraction */
-			sc = &sd->closure[sd->num_closure];
+			sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			sc->weight = weight;
 			sc->sample_weight = sample_weight;

@ -332,11 +334,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data2 = 0.0f;

 				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
-					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
-					sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc);
 			}
 			break;
 		}
@ -350,7 +352,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data0 = saturate(param1);
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
-				sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc);
 			}
 			break;
 		}
@ -366,9 +368,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data2 = 0.0f;
 				
 				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					sd->flag |= bsdf_diffuse_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc);
 				else
-					sd->flag |= bsdf_glossy_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc);
 			}
 			break;
 		}
@ -376,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
 			
-			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
+			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);

 				if(sc) {
@ -389,11 +391,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->N = N;
 					sc->data0 = 0.0f;
 					sc->data1 = 0.0f;
-					sd->flag |= bsdf_transparent_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 				}
 			}
 			else {
-				ShaderClosure *sc = &sd->closure[sd->num_closure];
+				ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 				sc = svm_node_closure_get_bsdf(sd, mix_weight);

 				if(sc) {
@ -402,18 +404,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = param2;
 					sc->data2 = -stack_load_float(stack, data_node.z);

-					if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
-						sc->T = normalize(sd->dPdv);
+					if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
+						sc->T = normalize(ccl_fetch(sd, dPdv));
 						sc->data2 = 0.0f;
 					}
 					else
-						sc->T = normalize(sd->dPdu);
+						sc->T = normalize(ccl_fetch(sd, dPdu));

 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						sd->flag |= bsdf_hair_reflection_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc);
 					}
 					else {
-						sd->flag |= bsdf_hair_transmission_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc);
 					}
 				}
 			}
@ -423,9 +425,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif

 #ifdef __SUBSURFACE__
+#ifndef __SPLIT_KERNEL__
+#  define sc_next(sc) sc++
+#  else
+#  define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure))
+#  endif
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID: {
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
@ -435,7 +442,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 				param1 = 0.0f;

-			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) {
 				/* radius * scale */
 				float3 radius = stack_load_float3(stack, data_node.z)*param1;
 				/* sharpness */
@ -455,10 +462,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);

-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}

 				if(fabsf(weight.y) > 0.0f) {
@ -472,10 +479,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);

-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}

 				if(fabsf(weight.z) > 0.0f) {
@ -489,15 +496,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);

-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 			}

 			break;
 		}
+#  undef sc_next
 #endif
 		default:
 			break;
@ -525,7 +533,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density);

 			if(sc) {
-				sd->flag |= volume_absorption_setup(sc);
+				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
 			}
 			break;
 		}
@ -535,7 +543,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			if(sc) {
 				sc->data0 = param2; /* g */
 				sc->data1 = 0.0f;
-				sd->flag |= volume_henyey_greenstein_setup(sc);
+				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc);
 			}
 			break;
 		}
@ -560,7 +568,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);

-	sd->flag |= SD_EMISSION;
+	ccl_fetch(sd, flag) |= SD_EMISSION;
 }

 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@ -594,7 +602,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);

-	sd->flag |= SD_HOLDOUT;
+	ccl_fetch(sd, flag) |= SD_HOLDOUT;
 }

 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@ -612,15 +620,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);

-	sd->flag |= SD_AO;
+	ccl_fetch(sd, flag) |= SD_AO;
 }

 /* Closure Nodes */

 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	if(sd->num_closure < MAX_CLOSURE)
-		sd->closure[sd->num_closure].weight = weight;
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+		sc->weight = weight;
+	}
 }

 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@ -670,7 +680,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	sd->N = normal;
+	ccl_fetch(sd, N) = normal;
 	stack_store_float3(stack, out_normal, normal);
 }

--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL);

-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);

 	/* get surface tangents from normal */
-	float3 Rx = cross(sd->dP.dy, normal_in);
-	float3 Ry = cross(normal_in, sd->dP.dx);
+	float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in);
+	float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx);

 	/* get bump values */
 	uint c_offset, x_offset, y_offset, strength_offset;
@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	float h_y = stack_load_float(stack, y_offset);

 	/* compute surface gradient and determinant */
-	float det = dot(sd->dP.dx, Rx);
+	float det = dot(ccl_fetch(sd, dP).dx, Rx);
 	float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry;

 	float absdet = fabsf(det);
@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
 {
 	float d = stack_load_float(stack, fac_offset);
-	sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */
+	ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;

-	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);

 	stack_store_float(stack, out_offset, f);
 }
@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);

 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);

 	float f;

 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;

-		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(sd->I, normal_in));
+		f = fabsf(dot(ccl_fetch(sd, I), normal_in));

 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac
 	float3 data;

 	switch(type) {
-		case NODE_GEOM_P: data = sd->P; break;
-		case NODE_GEOM_N: data = sd->N; break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
+		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = sd->I; break;
-		case NODE_GEOM_Ng: data = sd->Ng; break;
+		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
+		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
 #endif
 	}

@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;

 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}

@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;

 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}

@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
 		default: data = 0.0f; break;
 	}

@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta

 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = sd->curve_transparency;
+			data = ccl_fetch(sd, curve_transparency);
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@ -392,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = sd->N;
+	float3 N = ccl_fetch(sd, N);

-	N = sd->N;
-	if(sd->object != OBJECT_NONE)
+	N = ccl_fetch(sd, N);
+	if(ccl_fetch(sd, object) != OBJECT_NONE)
 		object_inverse_normal_transform(kg, sd, &N);

 	/* project from direction vector to barycentric coordinates in triangles */
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = sd->ray_length; break;
-		case NODE_LP_ray_depth: info = (float)sd->ray_depth; break;
-		case NODE_LP_ray_transparent: info = (float)sd->transparent_depth; break;
+		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break;
+		case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break;
 	}

 	stack_store_float(stack, out_offset, info);
@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)

 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
 	}

 	float smooth = stack_load_float(stack, smooth_offset);

 	if(smooth > 0.0f) {
-		float squared = sd->ray_length*sd->ray_length;
+		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
 		strength *= squared/(smooth + squared);
 	}

--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,

 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P;
+			data = ccl_fetch(sd, P);
 			if(node.w == 0) {
-				if(sd->object != OBJECT_NONE) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@ -48,48 +48,48 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;

-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P));
 			else
-				data = transform_point(&tfm, sd->P + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P;
+			data = ccl_fetch(sd, P);

 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@ -113,9 +113,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,

 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dx;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
 			if(node.w == 0) {
-				if(sd->object != OBJECT_NONE) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@ -130,48 +130,48 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;

-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dx);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dx;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;

 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@ -198,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,

 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dy;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
 			if(node.w == 0) {
-				if(sd->object != OBJECT_NONE) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@ -215,48 +215,48 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;

-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dy);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dy;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;

 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@ -281,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st

 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(sd->object == OBJECT_NONE) {
+		if(ccl_fetch(sd, object) == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@ -302,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL);
 		float3 normal;

-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL);
 		}
 		else {
-			normal = sd->Ng;
+			normal = ccl_fetch(sd, Ng);
 			object_inverse_normal_transform(kg, sd, &normal);
 		}

@ -337,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st

 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = normalize(sd->N + (N - sd->N)*strength);
+		N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
 	}

 	stack_store_float3(stack, normal_offset, N);
@ -367,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;

 		if(attr_offset == ATTR_STD_NOT_FOUND)
-			generated = sd->P;
+			generated = ccl_fetch(sd, P);
 		else
 			generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);

@ -380,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}

 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
+	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
 	stack_store_float3(stack, tangent_offset, tangent);
 }

--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (sd->object != OBJECT_NONE);
+	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@ -41,9 +41,9 @@ ccl_device float wireframe(KernelGlobals *kg,
                           float3 *P)
 {
 #ifdef __HAIR__
-	if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if(sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@ -52,12 +52,12 @@ ccl_device float wireframe(KernelGlobals *kg,
 		/* Triangles */
 		int np = 3;

-		if(sd->type & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, sd->prim, Co);
+		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
 		else
-			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
+			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);

-		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
@ -66,8 +66,8 @@ ccl_device float wireframe(KernelGlobals *kg,
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
-			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
+			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
@ -106,16 +106,27 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
 	int pixel_size = (int)use_pixel_size;

 	/* Calculate wireframe */
-	float f = wireframe(kg, sd, size, pixel_size, &sd->P);
+#ifdef __SPLIT_KERNEL__
+	/* TODO(sergey): This is because sd actually lives in the global address space,
+	 * which makes it difficult to re-use the same wireframe() function.
+	 *
+	 * With OpenCL 2.0 it's possible to avoid this change, but until
+	 * then we'll be living with such an exception.
+	 */
+	float3 P = ccl_fetch(sd, P);
+	float f = wireframe(kg, sd, size, pixel_size, &P);
+#else
+	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+#endif

 	/* TODO(sergey): Think of faster way to calculate derivatives. */
 	if(bump_offset == NODE_BUMP_OFFSET_DX) {
-		float3 Px = sd->P - sd->dP.dx;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
+		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
 	}
 	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
-		float3 Py = sd->P - sd->dP.dy;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
+		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
 	}

 	if(stack_valid(out_fac))
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@ -807,7 +807,10 @@ void Session::update_status_time(bool show_pause, bool show_done)

 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);

-		if((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) {
+		if(((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) && !device->info.use_split_kernel) {
+			/* When using split-kernel (OpenCL) each thread in a tile will be working on a different
+			 * sample, so the sample number can't be displayed when the device uses split-kernel.
+			 */
 			/* when rendering on GPU multithreading happens within single tile, as in
 			 * tiles are handling sequentially and in this case we could display
 			 * currently rendering sample number