Cycles: remove SSE3 and AVX kernel optimization levels

While keeping SSE2, SSE4.1 and AVX2. This does not affect hardware support, it
only slightly reduces performance for some older CPUs.

To reduce maintenance cost and improve compile times.

Differential Revision: https://developer.blender.org/D16978
This commit is contained in:
Brecht Van Lommel 2023-01-11 16:16:21 +01:00
parent debd912bef
commit a84a8a528d
16 changed files with 5 additions and 154 deletions

View File

@ -85,15 +85,11 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
endif()
@ -126,11 +122,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
endif()
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3")
set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1")
if(CXX_HAS_AVX)
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx")
endif()
set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1")
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
endif()
@ -144,13 +136,8 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
if(CXX_HAS_SSE)
set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2")
set(CYCLES_SSE3_KERNEL_FLAGS "/QxSSSE3")
set(CYCLES_SSE41_KERNEL_FLAGS "/QxSSE4.1")
if(CXX_HAS_AVX)
set(CYCLES_AVX_KERNEL_FLAGS "/arch:AVX")
endif()
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "/QxCORE-AVX2")
endif()
@ -174,13 +161,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2")
endif()
set(CYCLES_SSE3_KERNEL_FLAGS "-xssse3")
set(CYCLES_SSE41_KERNEL_FLAGS "-xsse4.1")
if(CXX_HAS_AVX)
set(CYCLES_AVX_KERNEL_FLAGS "-xavx")
endif()
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "-xcore-avx2")
endif()
@ -190,15 +172,10 @@ endif()
if(CXX_HAS_SSE)
add_definitions(
-DWITH_KERNEL_SSE2
-DWITH_KERNEL_SSE3
-DWITH_KERNEL_SSE41
)
endif()
if(CXX_HAS_AVX)
add_definitions(-DWITH_KERNEL_AVX)
endif()
if(CXX_HAS_AVX2)
add_definitions(-DWITH_KERNEL_AVX2)
endif()

View File

@ -951,9 +951,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
return _cycles.debug_flags_update(scene)
debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True)
debug_use_cpu_avx: BoolProperty(name="AVX", default=True)
debug_use_cpu_sse41: BoolProperty(name="SSE41", default=True)
debug_use_cpu_sse3: BoolProperty(name="SSE3", default=True)
debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True)
debug_bvh_layout: EnumProperty(
name="BVH Layout",

View File

@ -2112,9 +2112,7 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
row = col.row(align=True)
row.prop(cscene, "debug_use_cpu_sse2", toggle=True)
row.prop(cscene, "debug_use_cpu_sse3", toggle=True)
row.prop(cscene, "debug_use_cpu_sse41", toggle=True)
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout", text="BVH")

View File

@ -63,9 +63,7 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene)
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
/* Synchronize CPU flags. */
flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
flags.cpu.avx = get_boolean(cscene, "debug_use_cpu_avx");
flags.cpu.sse41 = get_boolean(cscene, "debug_use_cpu_sse41");
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
/* Synchronize CUDA flags. */

View File

@ -45,9 +45,7 @@ string device_cpu_capabilities()
{
string capabilities = "";
capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
capabilities += system_cpu_support_avx() ? "AVX " : "";
capabilities += system_cpu_support_avx2() ? "AVX2" : "";
if (capabilities[capabilities.size() - 1] == ' ')
capabilities.resize(capabilities.size() - 1);

View File

@ -9,8 +9,7 @@ CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
KERNEL_NAME_EVAL(cpu_sse41, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \

View File

@ -17,13 +17,10 @@ template<typename FunctionType> class CPUKernelFunction {
public:
CPUKernelFunction(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse3,
FunctionType kernel_sse41,
FunctionType kernel_avx,
FunctionType kernel_avx2)
{
kernel_info_ = get_best_kernel_info(
kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2);
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse41, kernel_avx2);
}
template<typename... Args> inline auto operator()(Args... args) const
@ -60,16 +57,12 @@ template<typename FunctionType> class CPUKernelFunction {
KernelInfo get_best_kernel_info(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse3,
FunctionType kernel_sse41,
FunctionType kernel_avx,
FunctionType kernel_avx2)
{
/* Silence warnings about unused variables when compiling without some architectures. */
(void)kernel_sse2;
(void)kernel_sse3;
(void)kernel_sse41;
(void)kernel_avx;
(void)kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@ -78,24 +71,12 @@ template<typename FunctionType> class CPUKernelFunction {
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
return KernelInfo("AVX", kernel_avx);
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
return KernelInfo("SSE4.1", kernel_sse41);
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
return KernelInfo("SSE3", kernel_sse3);
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
return KernelInfo("SSE2", kernel_sse2);

View File

@ -14,9 +14,7 @@ set(INC_SYS
set(SRC_KERNEL_DEVICE_CPU
device/cpu/kernel.cpp
device/cpu/kernel_sse2.cpp
device/cpu/kernel_sse3.cpp
device/cpu/kernel_sse41.cpp
device/cpu/kernel_avx.cpp
device/cpu/kernel_avx2.cpp
)
@ -940,14 +938,9 @@ set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CY
if(CXX_HAS_SSE)
set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()

View File

@ -35,15 +35,9 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem
#define KERNEL_ARCH cpu_sse2
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/device/cpu/kernel_arch.h"

View File

@ -1,26 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug T36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/device/cpu/kernel_arch_impl.h"

View File

@ -1,23 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug T36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/device/cpu/kernel_arch_impl.h"

View File

@ -45,19 +45,6 @@ set(SRC
# Disable AVX tests on macOS. Rosetta has problems running them, and other
# platforms should be enough to verify AVX operations are implemented correctly.
if(NOT APPLE)
if(CXX_HAS_SSE)
list(APPEND SRC
util_float8_sse2_test.cpp
)
set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
util_float8_avx_test.cpp
)
set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
util_float8_avx2_test.cpp

View File

@ -29,9 +29,7 @@ void DebugFlags::CPU::reset()
} while (0)
CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2");
CHECK_CPU_FLAGS(avx, "CYCLES_CPU_NO_AVX");
CHECK_CPU_FLAGS(sse41, "CYCLES_CPU_NO_SSE41");
CHECK_CPU_FLAGS(sse3, "CYCLES_CPU_NO_SSE3");
CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2");
#undef STRINGIFY

View File

@ -26,9 +26,7 @@ class DebugFlags {
/* Flags describing which instructions sets are allowed for use. */
bool avx2 = true;
bool avx = true;
bool sse41 = true;
bool sse3 = true;
bool sse2 = true;
/* Check functions to see whether instructions up to the given one
@ -36,19 +34,11 @@ class DebugFlags {
*/
bool has_avx2()
{
return has_avx() && avx2;
}
bool has_avx()
{
return has_sse41() && avx;
return has_sse41() && avx2;
}
bool has_sse41()
{
return has_sse3() && sse41;
}
bool has_sse3()
{
return has_sse2() && sse3;
return has_sse2() && sse41;
}
bool has_sse2()
{

View File

@ -17,9 +17,6 @@
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# endif
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
/* x86-64
*
@ -30,15 +27,9 @@
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
# ifdef WITH_KERNEL_SSE41
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# endif
# ifdef WITH_KERNEL_AVX
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# endif
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif

View File

@ -17,9 +17,7 @@ int system_console_width();
std::string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_sse2();
bool system_cpu_support_sse3();
bool system_cpu_support_sse41();
bool system_cpu_support_avx();
bool system_cpu_support_avx2();
size_t system_physical_ram();