Cycles: Add an experimental CUDA kernel.

Now we build two .cubins per architecture (e.g. kernel_sm_21.cubin and kernel_experimental_sm_21.cubin). The experimental kernel can be used by switching to the Experimental Feature Set: http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/Experimental_Features This enables Subsurface Scattering and Correlated Multi-Jitter Sampling on the GPU, while keeping the stability and performance of the regular kernel. Differential Revision: https://developer.blender.org/D762 Patch by Sergey and myself. Developer / Builder Note: CUDA Toolkit 6.5 is highly recommended for this; also note that building the experimental kernel requires a lot of system memory (~7-8 GB).
Referenced by issue #41639, Pie menu : pie menu calling another pie menu : need to double click (on second menu)
2014-08-26 17:02:03 +02:00 · 2014-08-26 17:02:03 +02:00 · fb3f32760d · 2023-02-14 10:09:24 +01:00
parent f6e049cd5a
commit fb3f32760d
7 changed files with 88 additions and 55 deletions
--- a/5
+++ b/5
@ -984,8 +984,9 @@ if env['OURPLATFORM']!='darwin':
            dir=os.path.join(env['BF_INSTALLDIR'], VERSION, 'scripts', 'addons','cycles', 'lib')
            for arch in env['BF_CYCLES_CUDA_BINARIES_ARCH']:
                kernel_build_dir = os.path.join(B.root_build_dir, 'intern/cycles/kernel')
-                cubin_file = os.path.join(kernel_build_dir, "kernel_%s.cubin" % arch)
-                cubininstall.append(env.Install(dir=dir,source=cubin_file))
+                for suffix in ('', '_experimental'):
+                    cubin_file = os.path.join(kernel_build_dir, "kernel%s_%s.cubin" % (suffix, arch))
+                    cubininstall.append(env.Install(dir=dir,source=cubin_file))

        # osl shaders
        if env['WITH_BF_CYCLES_OSL']:
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@ -154,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
            sub.prop(cscene, "subsurface_samples", text="Subsurface")
            sub.prop(cscene, "volume_samples", text="Volume")

-        if use_cpu(context):
+        if use_cpu(context) or cscene.feature_set == 'EXPERIMENTAL':
            layout.row().prop(cscene, "sampling_pattern", text="Pattern")

        for rl in scene.render.layers:
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@ -197,14 +197,18 @@ public:
 		return true;
 	}

-	string compile_kernel()
+	string compile_kernel(bool experimental)
 	{
 		/* compute cubin name */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);

 		/* attempt to use kernel provided with blender */
-		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+		string cubin;
+		if(experimental)
+			cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
+		else
+			cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
 		if(path_exists(cubin))
 			return cubin;

@ -212,7 +216,10 @@ public:
 		string kernel_path = path_get("kernel");
 		string md5 = path_files_md5_hash(kernel_path);

-		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		if(experimental)
+			cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		else
+			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
 		cubin = path_user_get(path_join("cache", cubin));

 		/* if exists already, use it */
@ -263,6 +270,9 @@ public:
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
 			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
+		
+		if(experimental)
+			command += " -D__KERNEL_CUDA_EXPERIMENTAL__";

 		printf("%s\n", command.c_str());

@ -293,7 +303,7 @@ public:
 			return false;

 		/* get kernel */
-		string cubin = compile_kernel();
+		string cubin = compile_kernel(experimental);

 		if(cubin == "")
 			return false;
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -160,37 +160,50 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)

-	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
-		set(cuda_cubin kernel_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+		if(${experimental})
+			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_cubin kernel_experimental_${arch}.cubin)
+		else()
+			set(cuda_extra_flags "")
+			set(cuda_cubin kernel_${arch}.cubin)
+		endif()

 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 		set(cuda_math_flags "--use_fast_math")

-		if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
-			message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
-		else()
-			add_custom_command(
-				OUTPUT ${cuda_cubin}
-				COMMAND ${CUDA_NVCC_EXECUTABLE}
-				        -arch=${arch}
-				        -m${CUDA_BITS}
-				        --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
-				        -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-				        --ptxas-options="-v"
-				        ${cuda_arch_flags}
-				        ${cuda_version_flags}
-				        ${cuda_math_flags}
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/../util
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/svm
-				        -DCCL_NAMESPACE_BEGIN=
-				        -DCCL_NAMESPACE_END=
-				        -DNVCC
+		add_custom_command(
+			OUTPUT ${cuda_cubin}
+			COMMAND ${CUDA_NVCC_EXECUTABLE}
+					-arch=${arch}
+					-m${CUDA_BITS}
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
+					--ptxas-options="-v"
+					${cuda_arch_flags}
+					${cuda_version_flags}
+					${cuda_math_flags}
+					${cuda_extra_flags}
+					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
+					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-DCCL_NAMESPACE_BEGIN=
+					-DCCL_NAMESPACE_END=
+					-DNVCC

-				DEPENDS ${cuda_sources})
+			DEPENDS ${cuda_sources})

-			delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
-			list(APPEND cuda_cubins ${cuda_cubin})
-		endif()
+		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
+		list(APPEND cuda_cubins ${cuda_cubin})
+
+		unset(cuda_extra_flags)
+	endmacro()
+
+	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
+		# Compile regular kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+
+		# Compile experimental kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} TRUE)
 	endforeach()

 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@ -83,30 +83,35 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
    last_cubin_file = None

+    configs = (("kernel_%s.cubin", ''),
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+
    # add command for each cuda architecture
    for arch in cuda_archs:
-        if cuda_version < 60 and arch == "sm_50":
-            print("Can't build kernel for CUDA sm_50 architecture, skipping")
-            continue
+        for config in configs:
+            # TODO(sergey): Use dict instead of tuple in order to increase readability?
+            name = config[0]
+            extra_flags = config[1]

-        cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
+            cubin_file = os.path.join(build_dir, name % arch)
+            current_flags = nvcc_flags + extra_flags

-        if env['BF_CYCLES_CUDA_ENV']:
-            MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
-            command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-        else:
-            command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
+            if env['BF_CYCLES_CUDA_ENV']:
+                MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
+                command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file)
+            else:
+                command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)

-        kernel.Command(cubin_file, 'kernel.cu', command)
-        kernel.Depends(cubin_file, dependencies)
+            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Depends(cubin_file, dependencies)

-        kernel_binaries.append(cubin_file)
-        
-        if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
-            # trick to compile one kernel at a time to reduce memory usage
-            if last_cubin_file:
-                kernel.Depends(cubin_file, last_cubin_file)
-            last_cubin_file = cubin_file
+            kernel_binaries.append(cubin_file)
+
+            if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
+                # trick to compile one kernel at a time to reduce memory usage
+                if last_cubin_file:
+                    kernel.Depends(cubin_file, last_cubin_file)
+                last_cubin_file = cubin_file

 Return('kernel_binaries')

--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -79,8 +79,11 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_SCATTER__

 /* Experimental on GPU */
-//#define __VOLUME_DECOUPLED__
-//#define __SUBSURFACE__
+#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#define __SUBSURFACE__
+#define __CMJ__
+#endif
+
 #endif

 #ifdef __KERNEL_OPENCL__
--- a/source/blender/editors/space_node/drawnode.c
+++ b/source/blender/editors/space_node/drawnode.c
@ -942,12 +942,13 @@ static void node_shader_buts_anisotropic(uiLayout *layout, bContext *UNUSED(C),

 static void node_shader_buts_subsurface(uiLayout *layout, bContext *C, PointerRNA *ptr)
 {
-	/* SSS does not work on GPU yet */
+	/* SSS only enabled in Experimental Kernel */
 	PointerRNA scene = CTX_data_pointer_get(C, "scene");
 	if (scene.data) {
 		PointerRNA cscene = RNA_pointer_get(&scene, "cycles");
-		if (cscene.data && (RNA_enum_get(&cscene, "device") == 1 && U.compute_device_type != 0))
-			uiItemL(layout, IFACE_("SSS not supported on GPU"), ICON_ERROR);
+		if (cscene.data && (RNA_enum_get(&cscene, "device") == 1 && U.compute_device_type != 0
+			&& RNA_enum_get(&cscene, "feature_set") == 0))
+			uiItemL(layout, IFACE_("Only enabled in experimental GPU kernel"), ICON_ERROR);
 	}

 	uiItemR(layout, ptr, "falloff", 0, "", ICON_NONE);