Cycles code refactor: use __launch_bounds__ instead of -maxrregcount for CUDA.

This makes it easier to have per kernel number of registers. Also, all the
tunable parameters for this are now in kernel.cu, rather than spread over cmake,
scons and device_cuda.cpp.
This commit is contained in:
Brecht Van Lommel 2014-04-16 19:04:58 +02:00
parent f2f3ef8692
commit 2851ed4a55
4 changed files with 114 additions and 56 deletions

View File

@ -253,7 +253,6 @@ public:
return false;
}
return true;
}
@ -315,17 +314,6 @@ public:
string kernel = path_join(kernel_path, "kernel.cu");
string include = kernel_path;
const int machine = system_cpu_bits();
string arch_flags;
/* CUDA 5.x build flags for different archs */
if(major == 2) {
/* sm_2x */
arch_flags = "--maxrregcount=40 --use_fast_math";
}
else if(major == 3) {
/* sm_3x */
arch_flags = "--maxrregcount=32 --use_fast_math";
}
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@ -333,8 +321,8 @@ public:
path_create_directories(cubin);
string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
printf("%s\n", command.c_str());
@ -665,9 +653,18 @@ public:
cuda_assert(cuParamSetSize(cuPathTrace, offset))
/* launch kernel: todo find optimal size, cache config for fermi */
int xthreads = 16;
int ythreads = 16;
/* launch kernel */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace))
/*int num_registers;
cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace))
printf("threads_per_block %d\n", threads_per_block);
printf("num_registers %d\n", num_registers);*/
int xthreads = (int)sqrt(threads_per_block);
int ythreads = (int)sqrt(threads_per_block);
int xblocks = (rtile.w + xthreads - 1)/xthreads;
int yblocks = (rtile.h + ythreads - 1)/ythreads;
@ -730,9 +727,12 @@ public:
cuda_assert(cuParamSetSize(cuFilmConvert, offset))
/* launch kernel: todo find optimal size, cache config for fermi */
int xthreads = 16;
int ythreads = 16;
/* launch kernel */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert))
int xthreads = (int)sqrt(threads_per_block);
int ythreads = (int)sqrt(threads_per_block);
int xblocks = (task.w + xthreads - 1)/xthreads;
int yblocks = (task.h + ythreads - 1)/ythreads;
@ -752,40 +752,42 @@ public:
cuda_push_context();
CUfunction cuDisplace;
CUfunction cuShader;
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
/* get kernel function */
cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"))
/* pass in parameters */
int offset = 0;
cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)))
offset += sizeof(d_input);
cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)))
offset += sizeof(d_output);
int shader_eval_type = task.shader_eval_type;
offset = align_up(offset, __alignof(shader_eval_type));
cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type))
offset += sizeof(task.shader_eval_type);
cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
cuda_assert(cuParamSeti(cuShader, offset, task.shader_x))
offset += sizeof(task.shader_x);
cuda_assert(cuParamSetSize(cuDisplace, offset))
cuda_assert(cuParamSetSize(cuShader, offset))
/* launch kernel: todo find optimal size, cache config for fermi */
int xthreads = 16;
int xblocks = (task.shader_w + xthreads - 1)/xthreads;
/* launch kernel */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader))
cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
int xblocks = (task.shader_w + threads_per_block - 1)/threads_per_block;
cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1))
cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1))
cuda_assert(cuLaunchGrid(cuShader, xblocks, 1))
cuda_pop_context();
}

View File

@ -161,16 +161,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_cubin kernel_${arch}.cubin)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
# CUDA 5.x build flags for different archs
if(${arch} MATCHES "sm_2[0-9]")
# sm_2x
set(cuda_arch_flags "--maxrregcount=40")
elseif(${arch} MATCHES "sm_3[0-9]")
# sm_3x
set(cuda_arch_flags "--maxrregcount=32")
endif()
set(cuda_math_flags "--use_fast_math")
if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")

View File

@ -87,14 +87,6 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
for arch in cuda_archs:
cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
# CUDA 5.x build flags for different archs
if arch.startswith("sm_2"):
# sm_2x
cuda_arch_flags = "--maxrregcount=40 --use_fast_math"
elif arch.startswith("sm_3"):
# sm_3x
cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
if env['BF_CYCLES_CUDA_ENV']:
MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
command = "\"%s\" & \"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)

View File

@ -24,7 +24,71 @@
#include "kernel_path.h"
#include "kernel_displace.h"
extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
/* device data taken from CUDA occupancy calculator */
#ifdef __CUDA_ARCH__
/* 2.0 and 2.1 */
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 32
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
/* 3.0 and 3.5 */
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 32
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
/* unknown architecture */
#else
#error "Unknown or unuspported CUDA architecture, can't determine launch bounds"
#endif
/* compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__( \
threads_block_width*threads_block_width, \
CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
)
/* sanity checks */
#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
#error "Maximum number of threads per block exceeded"
#endif
#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
#error "Maximum number of blocks per multiprocessor exceeded"
#endif
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
#error "Maximum number of registers per thread exceeded"
#endif
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
#error "Maximum number of registers per thread exceeded"
#endif
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@ -34,7 +98,9 @@ extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state
}
#ifdef __BRANCHED_PATH__
extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@ -44,7 +110,9 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *
}
#endif
extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@ -53,7 +121,9 @@ extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buff
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}
extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@ -62,10 +132,14 @@ extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}
extern "C" __global__ void kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
}
#endif