Code cleanup: simplify kernel side work stealing code.

2017-09-21 03:37:22 +02:00 · 2017-09-21 03:37:22 +02:00 · 07ec0effb6
parent 01dfaac77b
commit 07ec0effb6
5 changed files with 65 additions and 124 deletions
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif

-ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
-{
-	return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
-}
-
-ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
-{
-	return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
-{
-	return ray_index / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
-{
-	uint total_work_size = kernel_total_work_size(kg);
-	uint num_pools = kernel_num_work_pools(kg);
-
-	if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
-		return 0;
-	}
-
-	uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
-
-	uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
-	if(work_pool < remainder / WORK_POOL_SIZE) {
-		work_size += WORK_POOL_SIZE;
-	}
-	else if(work_pool == remainder / WORK_POOL_SIZE) {
-		work_size += remainder % WORK_POOL_SIZE;
-	}
-
-	return work_size;
-}
-
-ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
-{
-	uint num_pools = kernel_num_work_pools(kg);
-	uint pool = work_pool_from_ray_index(kg, ray_index);
-
-	return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
-	       + (pool * WORK_POOL_SIZE)
-	       + (work_index % WORK_POOL_SIZE);
-}
-
 /* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              uint thread_index,
+                              ccl_private uint *global_work_index)
 {
-	uint work_pool = work_pool_from_ray_index(kg, ray_index);
-	uint pool_size = work_pool_work_size(kg, work_pool);
+	uint total_work_size = kernel_split_params.w
+	                     * kernel_split_params.h
+	                     * kernel_split_params.num_samples;

-	if(pool_size == 0) {
+	/* When there is little work, rounding up of the global size can leave more
+	 * threads than work items; such threads have nothing to do and stop here. */
+	if(thread_index >= total_work_size) {
 		return false;
 	}

-	*work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
-	return (*work_index < pool_size);
+	/* Atomically fetch and increment the work counter of this thread's pool. */
+	uint pool = thread_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
+
+	/* Map the per-pool index to a global one: pools interleave in chunks of WORK_POOL_SIZE. */
+	uint global_size = ccl_global_size(0) * ccl_global_size(1);
+	kernel_assert(global_size % WORK_POOL_SIZE == 0);
+	kernel_assert(thread_index < global_size);
+
+	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+	                   + (pool * WORK_POOL_SIZE)
+	                   + (work_index % WORK_POOL_SIZE);
+
+	/* There is work only while the mapped index is within the total work size. */
+	return (*global_work_index < total_work_size);
 }

-/* This function assumes that the passed `work` is valid. */
-/* Decode sample number w.r.t. assigned `work`. */
-ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
+/* Decode a global work index (sample-major, then row by row) into pixel x/y and sample, offset by kernel_split_params x, y and start_sample. */
+ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
 {
-	return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
-}
+	uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+	uint sample_offset = global_work_index / tile_pixels;
+	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+	uint y_offset = pixel_offset / kernel_split_params.w;
+	uint x_offset = pixel_offset - y_offset * kernel_split_params.w;

-/* Decode pixel and tile position w.r.t. assigned `work`. */
-ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
-                             ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint work_index,
-                             uint ray_index)
-{
-	uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
-
-	*tile_x = pixel_index % kernel_split_params.w;
-	*tile_y = pixel_index / kernel_split_params.w;
-
-	*pixel_x = *tile_x + kernel_split_params.x;
-	*pixel_y = *tile_y + kernel_split_params.y;
+	*x = kernel_split_params.x + x_offset;
+	*y = kernel_split_params.y + y_offset;
+	*sample = kernel_split_params.start_sample + sample_offset;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@ -84,14 +84,9 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];

 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		uint work_index = kernel_split_state.work_array[ray_index];
-		uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
-		uint tile_x, tile_y, pixel_x, pixel_y;
-		get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-
-		ccl_global float *buffer = kernel_split_params.buffer;
-		buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+		uint sample = state->sample;
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;

 		/* accumulate result in output buffer */
 		kernel_write_result(kg, buffer, sample, L);
@ -102,31 +97,26 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
 		uint work_index;
-		int valid_work = get_next_work(kg, &work_index, ray_index);
-		if(!valid_work) {
+		if(!get_next_work(kg, ray_index, &work_index)) {
 			/* If work is invalid, this means no more work is available and the thread may exit */
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
 		}

 		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-			kernel_split_state.work_array[ray_index] = work_index;
-			/* Get the sample associated with the current work */
-			uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-			/* Get pixel and tile position associated with current work */
-			uint tile_x, tile_y, pixel_x, pixel_y;
-			get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
+			uint x, y, sample;
+			get_work_pixel(kg, work_index, &x, &y, &sample);

-			/* Remap rng_state according to the current work */
+			/* Remap rng_state to current pixel. */
 			ccl_global uint *rng_state = kernel_split_params.rng_state;
-			rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+			rng_state += kernel_split_params.offset + x + y*stride;

-			/* Remap buffer according to the current work */
-			ccl_global float *buffer = kernel_split_params.buffer;
-			buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+			/* Store buffer offset for writing to passes. */
+			uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
+			kernel_split_state.buffer_offset[ray_index] = buffer_offset;

 			/* Initialize random numbers and ray. */
 			uint rng_hash;
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng_hash, ray);
+			kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray);

 			if(ray->t != 0.0f) {
 				/* Initialize throughput, path radiance, Ray, PathState;
@ -145,6 +135,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 				/* These rays do not participate in path-iteration. */
 				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 				/* Accumulate result in output buffer. */
+				ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
 				kernel_write_pass_float4(buffer, sample, L_rad);

 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@ -90,8 +90,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif

-	int stride = kernel_split_params.stride;
-
 	ccl_global PathState *state = 0x0;
 	float3 throughput;

@ -99,15 +97,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	ShaderData *sd = &kernel_split_state.sd[ray_index];

 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		uint work_index = kernel_split_state.work_array[ray_index];
-		uint pixel_x, pixel_y, tile_x, tile_y;
-		get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        work_index,
-		                        ray_index);
-
-		ccl_global float *buffer = kernel_split_params.buffer;
-		buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;

 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@ -29,38 +29,32 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	 */
 	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;

-	uint work_index = 0;
 	/* Get work. */
-	if(!get_next_work(kg, &work_index, ray_index)) {
+	uint work_index;
+	if(!get_next_work(kg, ray_index, &work_index)) {
 		/* No more work, mark ray as inactive */
 		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;

 		return;
 	}

-	/* Get the sample associated with the work. */
-	uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
-	/* Get pixel and tile position associated with the work. */
-	uint pixel_x, pixel_y, tile_x, tile_y;
-	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-	                             &tile_x, &tile_y,
-	                             work_index,
-	                             ray_index);
-	kernel_split_state.work_array[ray_index] = work_index;
+	uint x, y, sample;
+	get_work_pixel(kg, work_index, &x, &y, &sample);

+	/* Remap rng_state and buffer to current pixel. */
 	ccl_global uint *rng_state = kernel_split_params.rng_state;
-	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
+	rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride;

-	ccl_global float *buffer = kernel_split_params.buffer;
-	buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+	/* Store buffer offset for writing to passes. */
+	uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride;
+	kernel_split_state.buffer_offset[ray_index] = buffer_offset;

 	/* Initialize random numbers and ray. */
 	uint rng_hash;
 	kernel_path_trace_setup(kg,
 	                        rng_state,
 	                        sample,
-	                        pixel_x, pixel_y,
+	                        x, y,
 	                        &rng_hash,
 	                        &kernel_split_state.ray[ray_index]);

@ -84,6 +78,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 		/* These rays do not participate in path-iteration. */
 		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		/* Accumulate result in output buffer. */
+		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
 		kernel_write_pass_float4(buffer, sample, L_rad);
 		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
 	}
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@ -122,7 +122,7 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
-	SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+	SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
 	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
 	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
 	SPLIT_DATA_SUBSURFACE_ENTRIES \