Cycles: Add an AVX2 CPU kernel.

This kernel is compiled with AVX2, FMA3, and BMI compiler flags. At the moment only Intel Haswell benefits from this, but future AMD CPUs will have these instructions as well.

Makes rendering on Haswell CPUs a few percent faster. So far this has only been benchmarked with clang on OS X.

Part of my GSoC 2014.
This commit is contained in:
Thomas Dinges 2014-06-13 22:23:58 +02:00
parent b4aa51f8d7
commit 866c7fb6e6
9 changed files with 186 additions and 0 deletions

View File

@ -20,8 +20,10 @@ if(WIN32 AND MSVC)
# /arch:AVX is available for VC2012 (MSVC_VERSION 1700) and above, while
# /arch:AVX2 is only understood by VC2013 (MSVC_VERSION 1800) and above.
# On VC2012 the AVX2 kernel is therefore compiled with /arch:AVX only, so
# the compiler is never handed an option it does not know.
if(NOT MSVC_VERSION LESS 1800)
	set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
	set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2")
elseif(NOT MSVC_VERSION LESS 1700)
	set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
	set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX")
elseif(NOT CMAKE_CL_64)
	# Older 32 bit compilers: fall back to SSE2 code generation for both kernels.
	set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2")
	set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
endif()
# there is no /arch:SSE3, but intrinsics are available anyway
@ -30,11 +32,13 @@ if(WIN32 AND MSVC)
set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
@ -48,6 +52,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2 -mfpmath=sse")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@ -57,6 +62,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
# AVX2 kernel: the AVX flag set plus AVX2, FMA3 and BMI1/BMI2. Every option
# has to start with a plain ASCII '-' to be recognized by the compiler.
set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
endif()
@ -67,6 +73,7 @@ if(CXX_HAS_SSE)
-DWITH_KERNEL_SSE3
-DWITH_KERNEL_SSE41
-DWITH_KERNEL_AVX
-DWITH_KERNEL_AVX2
)
endif()

View File

@ -39,6 +39,7 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
sources.remove(path.join('kernel', 'kernel_sse41.cpp'))
sources.remove(path.join('kernel', 'kernel_avx.cpp'))
sources.remove(path.join('kernel', 'kernel_avx2.cpp'))
incs = []
defs = []
@ -98,6 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc':
if env['MSVC_VERSION'] >= '12.0':
kernel_flags['sse41'] = kernel_flags['sse3']
kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX'
kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2'
else:
# -mavx only available with relatively new gcc/clang
kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse'
@ -106,6 +108,7 @@ else:
if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'):
    kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx'
    # -mavx2, -mfma and -mbmi2 only exist since gcc 4.7; with gcc 4.6 just the
    # AVX kernel is built and no 'avx2' entry (hence no WITH_KERNEL_AVX2) is added.
    if env['C_COMPILER_ID'] != 'gcc' or env['CCVERSION'] >= '4.7':
        kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mbmi -mbmi2'
for kernel_type in kernel_flags.keys():
defs.append('WITH_KERNEL_' + kernel_type.upper())

View File

@ -62,6 +62,7 @@ public:
system_cpu_support_sse3();
system_cpu_support_sse41();
system_cpu_support_avx();
system_cpu_support_avx2();
}
~CPUDevice()
@ -167,6 +168,28 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
for(int y = tile.y; y < tile.y + tile.h; y++) {
for(int x = tile.x; x < tile.x + tile.w; x++) {
kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state,
sample, x, y, tile.offset, tile.stride);
}
}
tile.sample = sample + 1;
task.update_progress(tile);
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@ -293,6 +316,15 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
@ -337,6 +369,15 @@ public:
}
}
else {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
@ -390,6 +431,18 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
for(int sample = 0; sample < task.num_samples; sample++)
kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {

View File

@ -214,12 +214,14 @@ if(CXX_HAS_SSE)
kernel_sse3.cpp
kernel_sse41.cpp
kernel_avx.cpp
kernel_avx2.cpp
)
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()

View File

@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i, int sample);
#endif
/* Entry points of the AVX2 optimized CPU kernel. They are only declared when
 * WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 is defined and carry the same parameter
 * lists as the kernel_cpu_avx_* functions declared above. */
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
/* Path tracing entry point. */
void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
/* Film conversion of the float buffer to byte output. */
void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
/* Film conversion of the float buffer to half float output. */
void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
/* Shader evaluation entry point. */
void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i, int sample);
#endif
CCL_NAMESPACE_END
#endif /* __KERNEL_H__ */

View File

@ -0,0 +1,87 @@
/*
* Copyright 2011-2014 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
#define __KERNEL_SSE2__
#define __KERNEL_SSE3__
#define __KERNEL_SSSE3__
#define __KERNEL_SSE41__
#define __KERNEL_AVX__
#define __KERNEL_AVX2__
#endif
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_math.h"
#include "kernel_types.h"
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
#include "kernel_bake.h"
CCL_NAMESPACE_BEGIN
/* Path Tracing */
/* AVX2 path tracing entry point.
 *
 * Forwards all arguments unchanged to kernel_branched_path_trace() when
 * __BRANCHED_PATH__ is compiled in and the integrator has `branched` set,
 * and to kernel_path_trace() in every other case. */
void kernel_cpu_avx2_path_trace(KernelGlobals *kg,
                                float *buffer,
                                unsigned int *rng_state,
                                int sample,
                                int x,
                                int y,
                                int offset,
                                int stride)
{
#ifdef __BRANCHED_PATH__
	if(kernel_data.integrator.branched) {
		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
		return;
	}
#endif

	kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
/* Film */
/* AVX2 build of the byte film conversion: passes every argument straight
 * through to kernel_film_convert_to_byte(). */
void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg,
                                     uchar4 *rgba,
                                     float *buffer,
                                     float sample_scale,
                                     int x,
                                     int y,
                                     int offset,
                                     int stride)
{
	kernel_film_convert_to_byte(kg, rgba, buffer,
	                            sample_scale,
	                            x, y,
	                            offset, stride);
}
/* AVX2 build of the half float film conversion: passes every argument
 * straight through to kernel_film_convert_to_half_float(). */
void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg,
                                           uchar4 *rgba,
                                           float *buffer,
                                           float sample_scale,
                                           int x,
                                           int y,
                                           int offset,
                                           int stride)
{
	kernel_film_convert_to_half_float(kg, rgba, buffer,
	                                  sample_scale,
	                                  x, y,
	                                  offset, stride);
}
/* Shader Evaluate */
/* AVX2 shader evaluation entry point.
 *
 * `type` is converted to ShaderEvalType; values below SHADER_EVAL_BAKE are
 * handled by kernel_shader_evaluate(), all others by kernel_bake_evaluate(). */
void kernel_cpu_avx2_shader(KernelGlobals *kg,
                            uint4 *input,
                            float4 *output,
                            int type,
                            int i,
                            int sample)
{
	const ShaderEvalType eval_type = (ShaderEvalType)type;

	if(type < SHADER_EVAL_BAKE) {
		kernel_shader_evaluate(kg, input, output, eval_type, i, sample);
		return;
	}

	kernel_bake_evaluate(kg, input, output, eval_type, i, sample);
}
CCL_NAMESPACE_END
#else
/* Placeholder symbol for builds without WITH_CYCLES_OPTIMIZED_KERNEL_AVX2:
 * it keeps this translation unit from being empty, which is needed for some
 * linkers in combination with scons when the file ends up in a library. */
void __dummy_function_cycles_avx2(void);
void __dummy_function_cycles_avx2(void) {}
#endif

View File

@ -65,10 +65,15 @@
#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
#endif
#ifdef WITH_KERNEL_AVX2
#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
#endif
/* MSVC older than _MSC_VER 1700 (e.g. MSVC 2008): no SSE41 (broken blendv
 * intrinsic) and no AVX support, so the SSE4.1, AVX and AVX2 optimized
 * kernels are all switched off again here. */
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
#endif
#endif

View File

@ -127,9 +127,12 @@ struct CPUCapabilities {
bool sse42;
bool sse4a;
bool avx;
bool avx2;
bool xop;
bool fma3;
bool fma4;
bool bmi1;
bool bmi2;
};
static CPUCapabilities& system_cpu_capabilities()
@ -180,6 +183,11 @@ static CPUCapabilities& system_cpu_capabilities()
#endif
caps.avx = (xcr_feature_mask & 0x6) == 0x6;
}
__cpuid(result, 0x00000007);
caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
}
#if 0
@ -221,6 +229,11 @@ bool system_cpu_support_avx()
CPUCapabilities& caps = system_cpu_capabilities();
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
}
bool system_cpu_support_avx2()
{
CPUCapabilities& caps = system_cpu_capabilities();
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
}
#else
bool system_cpu_support_sse2()
@ -242,6 +255,10 @@ bool system_cpu_support_avx()
{
return false;
}
/* Fallback used in the #else branch, where no CPU capability detection is
 * compiled in: AVX2 support is always reported as unavailable. */
bool system_cpu_support_avx2()
{
return false;
}
#endif

View File

@ -28,6 +28,7 @@ bool system_cpu_support_sse2();
bool system_cpu_support_sse3();
bool system_cpu_support_sse41();
bool system_cpu_support_avx();
bool system_cpu_support_avx2();
CCL_NAMESPACE_END