Cycles: Fix Metal kernel compilation for AMD GPUs
Workaround for a compilation issue preventing kernels compiling for AMD GPUs: Avoid problematic use of templates on Metal by making `gpu_parallel_active_index_array` a wrapper macro, and moving `blocksize` to be a macro parameter. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14081
This commit is contained in:
parent
40fce61a6a
commit
27d3140b13
Notes:
blender-bot
2023-02-14 09:24:53 +01:00
Referenced by issue #95728, Previous render slots get overwritten with the latest render
|
@ -295,7 +295,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
int kernel_index);
|
||||
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
@ -310,7 +310,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
int kernel_index);
|
||||
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
{
|
||||
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
@ -335,7 +335,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
{
|
||||
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
@ -348,7 +348,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
{
|
||||
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
@ -391,7 +391,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
int num_active_paths);
|
||||
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
@ -424,7 +424,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
int num_active_paths);
|
||||
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
|
||||
|
||||
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
|
||||
gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
|
||||
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
|
||||
}
|
||||
|
||||
|
|
|
@ -31,44 +31,26 @@ CCL_NAMESPACE_BEGIN
|
|||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
#ifdef __KERNEL_METAL__
|
||||
struct ActiveIndexContext {
|
||||
ActiveIndexContext(int _thread_index,
|
||||
int _global_index,
|
||||
int _threadgroup_size,
|
||||
int _simdgroup_size,
|
||||
int _simd_lane_index,
|
||||
int _simd_group_index,
|
||||
int _num_simd_groups,
|
||||
threadgroup int *_simdgroup_offset)
|
||||
: thread_index(_thread_index),
|
||||
global_index(_global_index),
|
||||
blocksize(_threadgroup_size),
|
||||
ccl_gpu_warp_size(_simdgroup_size),
|
||||
thread_warp(_simd_lane_index),
|
||||
warp_index(_simd_group_index),
|
||||
num_warps(_num_simd_groups),
|
||||
warp_offset(_simdgroup_offset)
|
||||
{
|
||||
}
|
||||
|
||||
const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
|
||||
num_warps;
|
||||
threadgroup int *warp_offset;
|
||||
|
||||
template<uint blocksizeDummy, typename IsActiveOp>
|
||||
void active_index_array(const uint num_states,
|
||||
ccl_global int *indices,
|
||||
ccl_global int *num_indices,
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
const uint state_index = global_index;
|
||||
#else
|
||||
#ifndef __KERNEL_METAL__
|
||||
template<uint blocksize, typename IsActiveOp>
|
||||
__device__ void gpu_parallel_active_index_array(const uint num_states,
|
||||
ccl_global int *indices,
|
||||
ccl_global int *num_indices,
|
||||
IsActiveOp is_active_op)
|
||||
__device__
|
||||
#endif
|
||||
void gpu_parallel_active_index_array_impl(const uint num_states,
|
||||
ccl_global int *indices,
|
||||
ccl_global int *num_indices,
|
||||
#ifdef __KERNEL_METAL__
|
||||
const uint is_active,
|
||||
const uint blocksize,
|
||||
const int thread_index,
|
||||
const uint state_index,
|
||||
const int ccl_gpu_warp_size,
|
||||
const int thread_warp,
|
||||
const int warp_index,
|
||||
const int num_warps,
|
||||
threadgroup int *warp_offset)
|
||||
{
|
||||
#else
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
extern ccl_gpu_shared int warp_offset[];
|
||||
|
||||
|
@ -79,61 +61,59 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
|
|||
const uint num_warps = blocksize / ccl_gpu_warp_size;
|
||||
|
||||
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
|
||||
|
||||
/* Test if state corresponding to this thread is active. */
|
||||
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
|
||||
#endif
|
||||
|
||||
/* Test if state corresponding to this thread is active. */
|
||||
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
|
||||
/* For each thread within a warp compute how many other active states precede it. */
|
||||
const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
|
||||
ccl_gpu_thread_mask(thread_warp));
|
||||
|
||||
/* For each thread within a warp compute how many other active states precede it. */
|
||||
const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
|
||||
ccl_gpu_thread_mask(thread_warp));
|
||||
|
||||
/* Last thread in warp stores number of active states for each warp. */
|
||||
if (thread_warp == ccl_gpu_warp_size - 1) {
|
||||
warp_offset[warp_index] = thread_offset + is_active;
|
||||
}
|
||||
|
||||
ccl_gpu_syncthreads();
|
||||
|
||||
/* Last thread in block converts per-warp sizes to offsets, increments global size of
|
||||
* index array and gets offset to write to. */
|
||||
if (thread_index == blocksize - 1) {
|
||||
/* TODO: parallelize this. */
|
||||
int offset = 0;
|
||||
for (int i = 0; i < num_warps; i++) {
|
||||
int num_active = warp_offset[i];
|
||||
warp_offset[i] = offset;
|
||||
offset += num_active;
|
||||
}
|
||||
|
||||
const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
|
||||
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
|
||||
}
|
||||
|
||||
ccl_gpu_syncthreads();
|
||||
|
||||
/* Write to index array. */
|
||||
if (is_active) {
|
||||
const uint block_offset = warp_offset[num_warps];
|
||||
indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
|
||||
}
|
||||
/* Last thread in warp stores number of active states for each warp. */
|
||||
if (thread_warp == ccl_gpu_warp_size - 1) {
|
||||
warp_offset[warp_index] = thread_offset + is_active;
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_METAL__
|
||||
}; /* end class ActiveIndexContext */
|
||||
ccl_gpu_syncthreads();
|
||||
|
||||
/* Last thread in block converts per-warp sizes to offsets, increments global size of
|
||||
* index array and gets offset to write to. */
|
||||
if (thread_index == blocksize - 1) {
|
||||
/* TODO: parallelize this. */
|
||||
int offset = 0;
|
||||
for (int i = 0; i < num_warps; i++) {
|
||||
int num_active = warp_offset[i];
|
||||
warp_offset[i] = offset;
|
||||
offset += num_active;
|
||||
}
|
||||
|
||||
const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
|
||||
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
|
||||
}
|
||||
|
||||
ccl_gpu_syncthreads();
|
||||
|
||||
/* Write to index array. */
|
||||
if (is_active) {
|
||||
const uint block_offset = warp_offset[num_warps];
|
||||
indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_METAL__
|
||||
|
||||
# define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
|
||||
const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \
|
||||
gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \
|
||||
metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \
|
||||
simd_group_index, num_simd_groups, simdgroup_offset)
|
||||
|
||||
#else
|
||||
|
||||
# define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \
|
||||
gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
|
||||
|
||||
/* inject the required thread params into a struct, and redirect to its templated member function
|
||||
*/
|
||||
# define gpu_parallel_active_index_array \
|
||||
ActiveIndexContext(metal_local_id, \
|
||||
metal_global_id, \
|
||||
metal_local_size, \
|
||||
simdgroup_size, \
|
||||
simd_lane_index, \
|
||||
simd_group_index, \
|
||||
num_simd_groups, \
|
||||
simdgroup_offset) \
|
||||
.active_index_array
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
Loading…
Reference in New Issue