Cycles CUDA: make CUDA toolkit 6.0 the official supported version.

This also updates the configurations to build kernels for compute capability
5.0 cards, when using and older CUDA toolkit version this will be skipped.

Also includes tweaks to improve performance with this version:
* Increase max registers on sm_30, sm_35 and sm_50
* No longer use texture storage on sm_30
This commit is contained in:
Brecht Van Lommel 2014-04-30 10:54:17 +02:00
parent 4d1a109dde
commit 741f17f05b
14 changed files with 28 additions and 22 deletions

View File

@ -264,7 +264,7 @@ option(WITH_CYCLES_STANDALONE "Build cycles standalone application" OFF)
option(WITH_CYCLES_STANDALONE_GUI "Build cycles standalone with GUI" OFF)
option(WITH_CYCLES_OSL "Build Cycles with OSL support" OFF)
option(WITH_CYCLES_CUDA_BINARIES "Build cycles CUDA binaries" OFF)
set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 CACHE STRING "CUDA architectures to build binaries for")
set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_50 CACHE STRING "CUDA architectures to build binaries for")
mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
unset(PLATFORM_DEFAULT)

View File

@ -2,4 +2,4 @@ BF_BUILDDIR = '../blender-build/linux-glibc211-i686'
BF_INSTALLDIR = '../blender-install/linux-glibc211-i686'
BF_NUMJOBS = 1
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']

View File

@ -2,4 +2,4 @@ BF_BUILDDIR = '../blender-build/linux-glibc211-x86_64'
BF_INSTALLDIR = '../blender-install/linux-glibc211-x86_64'
BF_NUMJOBS = 1
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']

View File

@ -199,7 +199,7 @@ BF_BOOST_LIBPATH = '${BF_BOOST}/lib'
WITH_BF_CYCLES_CUDA_BINARIES = False
BF_CYCLES_CUDA_NVCC = '/usr/local/cuda/bin/nvcc'
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
#Freestyle
WITH_BF_FREESTYLE = True

View File

@ -206,7 +206,7 @@ WITH_BF_CYCLES = WITH_BF_OIIO and WITH_BF_BOOST
WITH_BF_CYCLES_CUDA_BINARIES = False
BF_CYCLES_CUDA_NVCC = '/usr/local/cuda/bin/nvcc'
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
WITH_BF_OPENMP = True

View File

@ -145,7 +145,7 @@ BF_OPENCOLLADA_LIBPATH = '${BF_OPENCOLLADA}/lib/opencollada'
WITH_BF_CYCLES = True
WITH_BF_CYCLES_CUDA_BINARIES = False
BF_CYCLES_CUDA_NVCC = "" # Path to the NVIDIA CUDA compiler
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
WITH_BF_OIIO = True
BF_OIIO = LIBDIR + '/openimageio'
@ -175,7 +175,7 @@ WITH_BF_OPENMP = True
#CUDA
WITH_BF_CYCLES_CUDA_BINARIES = False
#BF_CYCLES_CUDA_NVCC = "" # Path to the nvidia compiler
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
#Freestyle
WITH_BF_FREESTYLE = True

View File

@ -226,7 +226,7 @@ WITH_BF_CYCLES_CUDA_BINARIES = False
if VC_VERSION == '11.0':
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30']
else:
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
#Ray trace optimization
WITH_BF_RAYOPTIMIZATION = True

View File

@ -144,7 +144,7 @@ BF_OPENCOLLADA_LIBPATH = '${BF_OPENCOLLADA}/lib/opencollada'
WITH_BF_CYCLES = True
WITH_BF_CYCLES_CUDA_BINARIES = False
BF_CYCLES_CUDA_NVCC = "" # Path to the NVIDIA CUDA compiler
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
WITH_BF_OIIO = True
BF_OIIO = LIBDIR + '/openimageio'

View File

@ -224,7 +224,7 @@ BF_BOOST_LIBPATH = '${BF_BOOST}/lib'
#CUDA
WITH_BF_CYCLES_CUDA_BINARIES = False
#BF_CYCLES_CUDA_NVCC = "" # Path to the nvidia compiler
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35']
BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_20', 'sm_21', 'sm_30', 'sm_35', 'sm_50']
#Ray trace optimization
WITH_BF_RAYOPTIMIZATION = True

View File

@ -222,7 +222,7 @@ public:
/* In order to use full 6GB of memory on Titan cards, use arrays instead
* of textures. On earlier cards this seems slower, but on Titan it is
* actually slightly faster in tests. */
use_texture_storage = (cuDevArchitecture < 350);
use_texture_storage = (cuDevArchitecture < 300);
cuda_pop_context();
}

View File

@ -146,11 +146,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
# warn for other versions
if(CUDA_VERSION MATCHES "50")
if(CUDA_VERSION MATCHES "60")
else()
message(WARNING
"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
"build may succeed but only CUDA 5.0 is officially supported")
"build may succeed but only CUDA 6.0 is officially supported")
endif()
# build for each arch
@ -162,8 +162,10 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
set(cuda_math_flags "--use_fast_math")
if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
elseif(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping")
else()
add_custom_command(

View File

@ -69,8 +69,8 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
if cuda_version != 50:
print("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported." % (cuda_version/10, cuda_version%10))
if cuda_version != 60:
print("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported." % (cuda_version/10, cuda_version%10))
# nvcc flags
nvcc_flags = "-m%s" % (bits)
@ -85,6 +85,10 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
# add command for each cuda architecture
for arch in cuda_archs:
if cuda_version < 60 and arch == "sm_50":
print("Can't build kernel for CUDA sm_50 architecture, skipping")
continue
cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
if env['BF_CYCLES_CUDA_ENV']:

View File

@ -49,8 +49,8 @@
/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 32
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 5.0 */
#elif __CUDA_ARCH__ == 500
@ -61,8 +61,8 @@
/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 32
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* unknown architecture */
#else

View File

@ -60,7 +60,7 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
/* In order to use full 6GB of memory on Titan cards, use arrays instead
* of textures. On earlier cards this seems slower, but on Titan it is
* actually slightly faster in tests. */
#if __CUDA_ARCH__ < 350
#if __CUDA_ARCH__ < 300
#define __KERNEL_CUDA_TEX_STORAGE__
#endif