Cycles: Add an experimental CUDA kernel.

We now build two .cubin files per architecture (e.g. kernel_sm_21.cubin and kernel_experimental_sm_21.cubin).
The experimental kernel can be used by switching to the Experimental Feature Set: http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/Experimental_Features

This enables Subsurface Scattering and Correlated Multi-Jitter Sampling on the GPU, while keeping the stability and performance of the regular kernel.

Differential Revision: https://developer.blender.org/D762
Patch by Sergey and myself.

Developer / Builder Note:
CUDA Toolkit 6.5 is highly recommended for this. Also note that building the experimental kernel requires a lot of system memory (~7-8 GB).
This commit is contained in:
Thomas Dinges 2014-08-26 17:02:03 +02:00
parent f6e049cd5a
commit fb3f32760d
Notes: blender-bot 2023-02-14 10:09:24 +01:00
Referenced by issue #41639, Pie menu : pie menu calling another pie menu : need to double click (on second menu)
7 changed files with 88 additions and 55 deletions

View File

@ -984,8 +984,9 @@ if env['OURPLATFORM']!='darwin':
dir=os.path.join(env['BF_INSTALLDIR'], VERSION, 'scripts', 'addons','cycles', 'lib')
for arch in env['BF_CYCLES_CUDA_BINARIES_ARCH']:
kernel_build_dir = os.path.join(B.root_build_dir, 'intern/cycles/kernel')
cubin_file = os.path.join(kernel_build_dir, "kernel_%s.cubin" % arch)
cubininstall.append(env.Install(dir=dir,source=cubin_file))
for suffix in ('', '_experimental'):
cubin_file = os.path.join(kernel_build_dir, "kernel%s_%s.cubin" % (suffix, arch))
cubininstall.append(env.Install(dir=dir,source=cubin_file))
# osl shaders
if env['WITH_BF_CYCLES_OSL']:

View File

@ -154,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
sub.prop(cscene, "subsurface_samples", text="Subsurface")
sub.prop(cscene, "volume_samples", text="Volume")
if use_cpu(context):
if use_cpu(context) or cscene.feature_set == 'EXPERIMENTAL':
layout.row().prop(cscene, "sampling_pattern", text="Pattern")
for rl in scene.render.layers:

View File

@ -197,14 +197,18 @@ public:
return true;
}
string compile_kernel()
string compile_kernel(bool experimental)
{
/* compute cubin name */
int major, minor;
cuDeviceComputeCapability(&major, &minor, cuDevId);
/* attempt to use kernel provided with blender */
string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
string cubin;
if(experimental)
cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
else
cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
if(path_exists(cubin))
return cubin;
@ -212,7 +216,10 @@ public:
string kernel_path = path_get("kernel");
string md5 = path_files_md5_hash(kernel_path);
cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
if(experimental)
cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str());
else
cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
cubin = path_user_get(path_join("cache", cubin));
/* if exists already, use it */
@ -263,6 +270,9 @@ public:
string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
if(experimental)
command += " -D__KERNEL_CUDA_EXPERIMENTAL__";
printf("%s\n", command.c_str());
@ -293,7 +303,7 @@ public:
return false;
/* get kernel */
string cubin = compile_kernel();
string cubin = compile_kernel(experimental);
if(cubin == "")
return false;

View File

@ -160,37 +160,50 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
set(cuda_cubins)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
set(cuda_cubin kernel_${arch}.cubin)
macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
if(${experimental})
set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
set(cuda_cubin kernel_experimental_${arch}.cubin)
else()
set(cuda_extra_flags "")
set(cuda_cubin kernel_${arch}.cubin)
endif()
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
set(cuda_math_flags "--use_fast_math")
if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
else()
add_custom_command(
OUTPUT ${cuda_cubin}
COMMAND ${CUDA_NVCC_EXECUTABLE}
-arch=${arch}
-m${CUDA_BITS}
--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
--ptxas-options="-v"
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-I${CMAKE_CURRENT_SOURCE_DIR}/svm
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
add_custom_command(
OUTPUT ${cuda_cubin}
COMMAND ${CUDA_NVCC_EXECUTABLE}
-arch=${arch}
-m${CUDA_BITS}
--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
--ptxas-options="-v"
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
${cuda_extra_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-I${CMAKE_CURRENT_SOURCE_DIR}/svm
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
DEPENDS ${cuda_sources})
DEPENDS ${cuda_sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
endif()
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
unset(cuda_extra_flags)
endmacro()
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
# Compile experimental kernel
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE)
endforeach()
add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})

View File

@ -83,30 +83,35 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
last_cubin_file = None
configs = (("kernel_%s.cubin", ''),
("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
# add command for each cuda architecture
for arch in cuda_archs:
if cuda_version < 60 and arch == "sm_50":
print("Can't build kernel for CUDA sm_50 architecture, skipping")
continue
for config in configs:
# TODO(sergey): Use a dict instead of a tuple in order to increase readability?
name = config[0]
extra_flags = config[1]
cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
cubin_file = os.path.join(build_dir, name % arch)
current_flags = nvcc_flags + extra_flags
if env['BF_CYCLES_CUDA_ENV']:
MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
else:
command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
if env['BF_CYCLES_CUDA_ENV']:
MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file)
else:
command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
kernel.Command(cubin_file, 'kernel.cu', command)
kernel.Depends(cubin_file, dependencies)
kernel.Command(cubin_file, 'kernel.cu', command)
kernel.Depends(cubin_file, dependencies)
kernel_binaries.append(cubin_file)
if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
# trick to compile one kernel at a time to reduce memory usage
if last_cubin_file:
kernel.Depends(cubin_file, last_cubin_file)
last_cubin_file = cubin_file
kernel_binaries.append(cubin_file)
if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
# trick to compile one kernel at a time to reduce memory usage
if last_cubin_file:
kernel.Depends(cubin_file, last_cubin_file)
last_cubin_file = cubin_file
Return('kernel_binaries')

View File

@ -79,8 +79,11 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME_SCATTER__
/* Experimental on GPU */
//#define __VOLUME_DECOUPLED__
//#define __SUBSURFACE__
#ifdef __KERNEL_CUDA_EXPERIMENTAL__
#define __SUBSURFACE__
#define __CMJ__
#endif
#endif
#ifdef __KERNEL_OPENCL__

View File

@ -942,12 +942,13 @@ static void node_shader_buts_anisotropic(uiLayout *layout, bContext *UNUSED(C),
static void node_shader_buts_subsurface(uiLayout *layout, bContext *C, PointerRNA *ptr)
{
/* SSS does not work on GPU yet */
/* SSS only enabled in Experimental Kernel */
PointerRNA scene = CTX_data_pointer_get(C, "scene");
if (scene.data) {
PointerRNA cscene = RNA_pointer_get(&scene, "cycles");
if (cscene.data && (RNA_enum_get(&cscene, "device") == 1 && U.compute_device_type != 0))
uiItemL(layout, IFACE_("SSS not supported on GPU"), ICON_ERROR);
if (cscene.data && (RNA_enum_get(&cscene, "device") == 1 && U.compute_device_type != 0
&& RNA_enum_get(&cscene, "feature_set") == 0))
uiItemL(layout, IFACE_("Only enabled in experimental GPU kernel"), ICON_ERROR);
}
uiItemR(layout, ptr, "falloff", 0, "", ICON_NONE);