Cycles: Add support for rendering on Intel GPUs using oneAPI
This patch adds a new Cycles device with similar functionality to the existing GPU devices. Kernel compilation and runtime interaction happen via oneAPI DPC++ compiler and SYCL API. This implementation is primarly focusing on Intel® Arc™ GPUs and other future Intel GPUs. The first supported drivers are 101.1660 on Windows and 22.10.22597 on Linux. The necessary tools for compilation are: - A SYCL compiler such as oneAPI DPC++ compiler or https://github.com/intel/llvm - Intel® oneAPI Level Zero which is used for low level device queries: https://github.com/oneapi-src/level-zero - To optionally generate prebuilt graphics binaries: Intel® Graphics Compiler All are included in Linux precompiled libraries on svn: https://svn.blender.org/svnroot/bf-blender/trunk/lib The same goes for Windows precompiled binaries but for the graphics compiler, available as "Intel® Graphics Offline Compiler for OpenCL™ Code" from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html, for which path can be set as OCLOC_INSTALL_DIR. Being based on the open SYCL standard, this implementation could also be extended to run on other compatible non-Intel hardware in the future. Reviewed By: sergey, brecht Differential Revision: https://developer.blender.org/D15254 Co-authored-by: Nikita Sirgienko <nikita.sirgienko@intel.com> Co-authored-by: Stefan Werner <stefan.werner@intel.com>
This commit is contained in:
parent
302b04a5a3
commit
a02992f131
Notes:
blender-bot
2023-02-13 15:52:36 +01:00
Referenced by issue #99900, Segfault on Linux when running third party Python library function with multithreading enabled Referenced by issue #96840, Cycles oneAPI device
|
@ -454,6 +454,21 @@ if(APPLE)
|
|||
option(WITH_CYCLES_DEVICE_METAL "Enable Cycles Apple Metal compute support" ON)
|
||||
endif()
|
||||
|
||||
# oneAPI
|
||||
if(NOT APPLE)
|
||||
option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
|
||||
option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
|
||||
option(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED "Enable use of SYCL host (CPU) device execution by oneAPI implementation. This option is for debugging purposes and impacts GPU execution." OFF)
|
||||
|
||||
# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
|
||||
SET (CYCLES_ONEAPI_SPIR64_GEN_DEVICES "dg2" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
|
||||
SET (CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
|
||||
|
||||
mark_as_advanced(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
|
||||
mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
|
||||
mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
|
||||
endif()
|
||||
|
||||
# Draw Manager
|
||||
option(WITH_DRAW_DEBUG "Add extra debug capabilities to Draw Manager" OFF)
|
||||
mark_as_advanced(WITH_DRAW_DEBUG)
|
||||
|
|
|
@ -33,6 +33,7 @@ include(cmake/versions.cmake)
|
|||
include(cmake/options.cmake)
|
||||
include(cmake/boost_build_options.cmake)
|
||||
include(cmake/download.cmake)
|
||||
include(cmake/macros.cmake)
|
||||
|
||||
if(ENABLE_MINGW64)
|
||||
include(cmake/setup_mingw64.cmake)
|
||||
|
@ -96,6 +97,15 @@ include(cmake/fmt.cmake)
|
|||
include(cmake/robinmap.cmake)
|
||||
if(NOT APPLE)
|
||||
include(cmake/xr_openxr.cmake)
|
||||
if(NOT WIN32 OR BUILD_MODE STREQUAL Release)
|
||||
include(cmake/dpcpp.cmake)
|
||||
include(cmake/dpcpp_deps.cmake)
|
||||
endif()
|
||||
if(NOT WIN32)
|
||||
include(cmake/igc.cmake)
|
||||
include(cmake/gmmlib.cmake)
|
||||
include(cmake/ocloc.cmake)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# OpenColorIO and dependencies.
|
||||
|
|
|
@ -101,3 +101,19 @@ download_source(ROBINMAP)
|
|||
download_source(IMATH)
|
||||
download_source(PYSTRING)
|
||||
download_source(LEVEL_ZERO)
|
||||
download_source(DPCPP)
|
||||
download_source(VCINTRINSICS)
|
||||
download_source(OPENCLHEADERS)
|
||||
download_source(ICDLOADER)
|
||||
download_source(MP11)
|
||||
download_source(SPIRV_HEADERS)
|
||||
download_source(IGC)
|
||||
download_source(IGC_LLVM)
|
||||
download_source(IGC_OPENCL_CLANG)
|
||||
download_source(IGC_VCINTRINSICS)
|
||||
download_source(IGC_SPIRV_HEADERS)
|
||||
download_source(IGC_SPIRV_TOOLS)
|
||||
download_source(IGC_SPIRV_TRANSLATOR)
|
||||
download_source(GMMLIB)
|
||||
download_source(OCLOC)
|
||||
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
|
||||
if(WIN32)
|
||||
set(LLVM_GENERATOR "Ninja")
|
||||
else()
|
||||
set(LLVM_GENERATOR "Unix Makefiles")
|
||||
endif()
|
||||
|
||||
set(DPCPP_CONFIGURE_ARGS
|
||||
# When external deps dpcpp needs are not found it will automatically
|
||||
# download the during the configure stage using FetchContent. Given
|
||||
# we need to keep an archive of all source used during build for compliance
|
||||
# reasons it CANNOT download anything we do not know about. By setting
|
||||
# this property to ON, all downloads are disabled, and we will have to
|
||||
# provide the missing deps some other way, a build error beats a compliance
|
||||
# violation
|
||||
--cmake-opt FETCHCONTENT_FULLY_DISCONNECTED=ON
|
||||
)
|
||||
set(DPCPP_SOURCE_ROOT ${BUILD_DIR}/dpcpp/src/external_dpcpp/)
|
||||
set(DPCPP_EXTRA_ARGS
|
||||
# When external deps dpcpp needs are not found it will automatically
|
||||
# download the during the configure stage using FetchContent. Given
|
||||
# we need to keep an archive of all source used during build for compliance
|
||||
# reasons it CANNOT download anything we do not know about. By setting
|
||||
# this property to ON, all downloads are disabled, and we will have to
|
||||
# provide the missing deps some other way, a build or configure error
|
||||
# beats a compliance violation
|
||||
-DFETCHCONTENT_FULLY_DISCONNECTED=ON
|
||||
-DLLVMGenXIntrinsics_SOURCE_DIR=${BUILD_DIR}/vcintrinsics/src/external_vcintrinsics/
|
||||
-DOpenCL_HEADERS=file://${PACKAGE_DIR}/${OPENCLHEADERS_FILE}
|
||||
-DOpenCL_LIBRARY_SRC=file://${PACKAGE_DIR}/${ICDLOADER_FILE}
|
||||
-DBOOST_MP11_SOURCE_DIR=${BUILD_DIR}/mp11/src/external_mp11/
|
||||
-DLEVEL_ZERO_LIBRARY=${LIBDIR}/level-zero/lib/${LIBPREFIX}ze_loader${SHAREDLIBEXT}
|
||||
-DLEVEL_ZERO_INCLUDE_DIR=${LIBDIR}/level-zero/include
|
||||
-DLLVM_EXTERNAL_SPIRV_HEADERS_SOURCE_DIR=${BUILD_DIR}/spirvheaders/src/external_spirvheaders/
|
||||
# Below here is copied from an invocation of buildbot/config.py
|
||||
-DLLVM_ENABLE_ASSERTIONS=ON
|
||||
-DLLVM_TARGETS_TO_BUILD=X86
|
||||
-DLLVM_EXTERNAL_PROJECTS=sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw
|
||||
-DLLVM_EXTERNAL_SYCL_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/sycl
|
||||
-DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/llvm-spirv
|
||||
-DLLVM_EXTERNAL_XPTI_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xpti
|
||||
-DXPTI_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xpti
|
||||
-DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xptifw
|
||||
-DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/libdevice
|
||||
-DLLVM_ENABLE_PROJECTS=clang^^sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw
|
||||
-DLIBCLC_TARGETS_TO_BUILD=
|
||||
-DLIBCLC_GENERATE_REMANGLED_VARIANTS=OFF
|
||||
-DSYCL_BUILD_PI_HIP_PLATFORM=AMD
|
||||
-DLLVM_BUILD_TOOLS=ON
|
||||
-DSYCL_ENABLE_WERROR=OFF
|
||||
-DSYCL_INCLUDE_TESTS=ON
|
||||
-DLLVM_ENABLE_DOXYGEN=OFF
|
||||
-DLLVM_ENABLE_SPHINX=OFF
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
-DSYCL_ENABLE_XPTI_TRACING=ON
|
||||
-DLLVM_ENABLE_LLD=OFF
|
||||
-DXPTI_ENABLE_WERROR=OFF
|
||||
-DSYCL_CLANG_EXTRA_FLAGS=
|
||||
-DSYCL_ENABLE_PLUGINS=level_zero
|
||||
-DCMAKE_INSTALL_RPATH=\$ORIGIN
|
||||
-DPython3_ROOT_DIR=${LIBDIR}/python/
|
||||
-DPython3_EXECUTABLE=${PYTHON_BINARY}
|
||||
-DPYTHON_EXECUTABLE=${PYTHON_BINARY}
|
||||
)
|
||||
|
||||
if(WIN32)
|
||||
list(APPEND DPCPP_EXTRA_ARGS -DPython3_FIND_REGISTRY=NEVER)
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(external_dpcpp
|
||||
URL file://${PACKAGE_DIR}/${DPCPP_FILE}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
URL_HASH ${DPCPP_HASH_TYPE}=${DPCPP_HASH}
|
||||
PREFIX ${BUILD_DIR}/dpcpp
|
||||
CMAKE_GENERATOR ${LLVM_GENERATOR}
|
||||
SOURCE_SUBDIR llvm
|
||||
LIST_SEPARATOR ^^
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/dpcpp ${DEFAULT_CMAKE_FLAGS} ${DPCPP_EXTRA_ARGS}
|
||||
#CONFIGURE_COMMAND ${PYTHON_BINARY} ${BUILD_DIR}/dpcpp/src/external_dpcpp/buildbot/configure.py ${DPCPP_CONFIGURE_ARGS}
|
||||
#BUILD_COMMAND echo "." #${PYTHON_BINARY} ${BUILD_DIR}/dpcpp/src/external_dpcpp/buildbot/compile.py
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} --build . -- deploy-sycl-toolchain
|
||||
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/dpcpp/src/external_dpcpp < ${PATCH_DIR}/dpcpp.diff
|
||||
INSTALL_DIR ${LIBDIR}/dpcpp
|
||||
)
|
||||
|
||||
add_dependencies(
|
||||
external_dpcpp
|
||||
external_python
|
||||
external_python_site_packages
|
||||
external_vcintrinsics
|
||||
external_openclheaders
|
||||
external_icdloader
|
||||
external_mp11
|
||||
external_level-zero
|
||||
external_spirvheaders
|
||||
)
|
||||
|
||||
if(BUILD_MODE STREQUAL Release AND WIN32)
|
||||
ExternalProject_Add_Step(external_dpcpp after_install
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang-cl.exe
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang-cpp.exe
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang.exe
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/dpcpp ${HARVEST_TARGET}/dpcpp
|
||||
)
|
||||
endif()
|
|
@ -0,0 +1,62 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
# These are build time requirements for dpcpp
|
||||
# We only have to unpack these dpcpp will build
|
||||
# them.
|
||||
|
||||
ExternalProject_Add(external_vcintrinsics
|
||||
URL file://${PACKAGE_DIR}/${VCINTRINSICS_FILE}
|
||||
URL_HASH ${VCINTRINSICS_HASH_TYPE}=${VCINTRINSICS_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/vcintrinsics
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
)
|
||||
|
||||
# opencl headers do not have to be unpacked, dpcpp will do it
|
||||
# but it wouldn't hurt to do it anyway as an opertunity to validate
|
||||
# the hash is correct.
|
||||
ExternalProject_Add(external_openclheaders
|
||||
URL file://${PACKAGE_DIR}/${OPENCLHEADERS_FILE}
|
||||
URL_HASH ${OPENCLHEADERS_HASH_TYPE}=${OPENCLHEADERS_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/openclheaders
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
)
|
||||
|
||||
# icdloader does not have to be unpacked, dpcpp will do it
|
||||
# but it wouldn't hurt to do it anyway as an opertunity to validate
|
||||
# the hash is correct.
|
||||
ExternalProject_Add(external_icdloader
|
||||
URL file://${PACKAGE_DIR}/${ICDLOADER_FILE}
|
||||
URL_HASH ${ICDLOADER_HASH_TYPE}=${ICDLOADER_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/icdloader
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
)
|
||||
|
||||
ExternalProject_Add(external_mp11
|
||||
URL file://${PACKAGE_DIR}/${MP11_FILE}
|
||||
URL_HASH ${MP11_HASH_TYPE}=${MP11_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/mp11
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
)
|
||||
|
||||
ExternalProject_Add(external_spirvheaders
|
||||
URL file://${PACKAGE_DIR}/${SPIRV_HEADERS_FILE}
|
||||
URL_HASH ${SPIRV_HEADERS_HASH_TYPE}=${SPIRV_HEADERS_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/spirvheaders
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
)
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
set(GMMLIB_EXTRA_ARGS
|
||||
)
|
||||
|
||||
ExternalProject_Add(external_gmmlib
|
||||
URL file://${PACKAGE_DIR}/${GMMLIB_FILE}
|
||||
URL_HASH ${GMMLIB_HASH_TYPE}=${GMMLIB_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/gmmlib
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/gmmlib ${DEFAULT_CMAKE_FLAGS} ${GMMLIB_EXTRA_ARGS}
|
||||
INSTALL_DIR ${LIBDIR}/gmmlib
|
||||
)
|
||||
|
|
@ -192,6 +192,10 @@ harvest(zstd/lib zstd/lib "*.a")
|
|||
if(UNIX AND NOT APPLE)
|
||||
harvest(libglu/lib mesa/lib "*.so*")
|
||||
harvest(mesa/lib64 mesa/lib "*.so*")
|
||||
endif()
|
||||
|
||||
harvest(dpcpp dpcpp "*")
|
||||
harvest(igc dpcpp/lib/igc "*")
|
||||
harvest(ocloc dpcpp/lib/ocloc "*")
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
unpack_only(igc_vcintrinsics)
|
||||
unpack_only(igc_spirv_headers)
|
||||
unpack_only(igc_spirv_tools)
|
||||
|
||||
#
|
||||
# igc_opencl_clang contains patches that need to be applied
|
||||
# to external_igc_llvm and igc_spirv_translator, we unpack
|
||||
# igc_opencl_clang first, then have the patch stages of
|
||||
# external_igc_llvm and igc_spirv_translator apply them.
|
||||
#
|
||||
|
||||
ExternalProject_Add(external_igc_opencl_clang
|
||||
URL file://${PACKAGE_DIR}/${IGC_OPENCL_CLANG_FILE}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
URL_HASH ${IGC_OPENCL_CLANG_HASH_TYPE}=${IGC_OPENCL_CLANG_HASH}
|
||||
PREFIX ${BUILD_DIR}/igc_opencl_clang
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/igc_opencl_clang/src/external_igc_opencl_clang/ < ${PATCH_DIR}/igc_opencl_clang.diff
|
||||
)
|
||||
|
||||
set(IGC_OPENCL_CLANG_PATCH_DIR ${BUILD_DIR}/igc_opencl_clang/src/external_igc_opencl_clang/patches)
|
||||
set(IGC_LLVM_SOURCE_DIR ${BUILD_DIR}/igc_llvm/src/external_igc_llvm)
|
||||
set(IGC_SPIRV_TRANSLATOR_SOURCE_DIR ${BUILD_DIR}/igc_spirv_translator/src/external_igc_spirv_translator)
|
||||
|
||||
ExternalProject_Add(external_igc_llvm
|
||||
URL file://${PACKAGE_DIR}/${IGC_LLVM_FILE}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
URL_HASH ${IGC_LLVM_HASH_TYPE}=${IGC_LLVM_HASH}
|
||||
PREFIX ${BUILD_DIR}/igc_llvm
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0001-OpenCL-3.0-support.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0002-Remove-__IMAGE_SUPPORT__-macro-for-SPIR.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0003-Avoid-calling-ParseCommandLineOptions-in-BackendUtil.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
|
||||
)
|
||||
add_dependencies(
|
||||
external_igc_llvm
|
||||
external_igc_opencl_clang
|
||||
)
|
||||
|
||||
ExternalProject_Add(external_igc_spirv_translator
|
||||
URL file://${PACKAGE_DIR}/${IGC_SPIRV_TRANSLATOR_FILE}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
URL_HASH ${IGC_SPIRV_TRANSLATOR_HASH_TYPE}=${IGC_SPIRV_TRANSLATOR_HASH}
|
||||
PREFIX ${BUILD_DIR}/igc_spirv_translator
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
|
||||
)
|
||||
add_dependencies(
|
||||
external_igc_spirv_translator
|
||||
external_igc_opencl_clang
|
||||
)
|
||||
|
||||
if(WIN32)
|
||||
set(IGC_GENERATOR "Ninja")
|
||||
set(IGC_TARGET Windows64)
|
||||
else()
|
||||
set(IGC_GENERATOR "Unix Makefiles")
|
||||
set(IGC_TARGET Linux64)
|
||||
endif()
|
||||
|
||||
set(IGC_EXTRA_ARGS
|
||||
-DIGC_OPTION__ARCHITECTURE_TARGET=${IGC_TARGET}
|
||||
-DIGC_OPTION__ARCHITECTURE_HOST=${IGC_TARGET}
|
||||
)
|
||||
|
||||
if(UNIX AND NOT APPLE)
|
||||
list(APPEND IGC_EXTRA_ARGS
|
||||
-DFLEX_EXECUTABLE=${LIBDIR}/flex/bin/flex
|
||||
-DFLEX_INCLUDE_DIR=${LIBDIR}/flex/include
|
||||
)
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(external_igc
|
||||
URL file://${PACKAGE_DIR}/${IGC_FILE}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
URL_HASH ${IGC_HASH_TYPE}=${IGC_HASH}
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/igc ${DEFAULT_CMAKE_FLAGS} ${IGC_EXTRA_ARGS}
|
||||
|
||||
# IGC is pretty set in its way where sub projects ought to live, for some it offers
|
||||
# hooks to supply alternatives folders, other are just hardocded with no way to configure
|
||||
# we symlink everything here, since it's less work than trying to convince the cmake
|
||||
# scripts to accept alternative locations.
|
||||
#
|
||||
PATCH_COMMAND ${CMAKE_COMMAND} -E create_symlink ${BUILD_DIR}/igc_llvm/src/external_igc_llvm/ ${BUILD_DIR}/igc/src/llvm-project &&
|
||||
${CMAKE_COMMAND} -E create_symlink ${BUILD_DIR}/igc_opencl_clang/src/external_igc_opencl_clang/ ${BUILD_DIR}/igc/src/llvm-project/llvm/projects/opencl-clang &&
|
||||
${CMAKE_COMMAND} -E create_symlink ${BUILD_DIR}/igc_spirv_translator/src/external_igc_spirv_translator/ ${BUILD_DIR}/igc/src/llvm-project/llvm/projects/llvm-spirv &&
|
||||
${CMAKE_COMMAND} -E create_symlink ${BUILD_DIR}/igc_spirv_tools/src/external_igc_spirv_tools/ ${BUILD_DIR}/igc/src/SPIRV-Tools &&
|
||||
${CMAKE_COMMAND} -E create_symlink ${BUILD_DIR}/igc_spirv_headers/src/external_igc_spirv_headers/ ${BUILD_DIR}/igc/src/SPIRV-Headers &&
|
||||
${CMAKE_COMMAND} -E create_symlink ${BUILD_DIR}/igc_vcintrinsics/src/external_igc_vcintrinsics/ ${BUILD_DIR}/igc/src/vc-intrinsics
|
||||
PREFIX ${BUILD_DIR}/igc
|
||||
INSTALL_DIR ${LIBDIR}/igc
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} --install . --strip
|
||||
CMAKE_GENERATOR ${IGC_GENERATOR}
|
||||
)
|
||||
|
||||
add_dependencies(
|
||||
external_igc
|
||||
external_igc_vcintrinsics
|
||||
external_igc_llvm
|
||||
external_igc_opencl_clang
|
||||
external_igc_vcintrinsics
|
||||
external_igc_spirv_headers
|
||||
external_igc_spirv_tools
|
||||
external_igc_spirv_translator
|
||||
)
|
||||
|
||||
if(UNIX AND NOT APPLE)
|
||||
add_dependencies(
|
||||
external_igc
|
||||
external_flex
|
||||
)
|
||||
endif()
|
|
@ -0,0 +1,18 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
# shorthand to only unpack a certain dependency
|
||||
macro(unpack_only name)
|
||||
string(TOUPPER ${name} UPPER_NAME)
|
||||
set(TARGET_FILE ${${UPPER_NAME}_FILE})
|
||||
set(TARGET_HASH_TYPE ${${UPPER_NAME}_HASH_TYPE})
|
||||
set(TARGET_HASH ${${UPPER_NAME}_HASH})
|
||||
ExternalProject_Add(external_${name}
|
||||
URL file://${PACKAGE_DIR}/${TARGET_FILE}
|
||||
URL_HASH ${TARGET_HASH_TYPE}=${TARGET_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/${name}
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
)
|
||||
endmacro()
|
|
@ -0,0 +1,24 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
set(OCLOC_EXTRA_ARGS
|
||||
-DNEO_SKIP_UNIT_TESTS=1
|
||||
-DNEO_BUILD_WITH_OCL=0
|
||||
-DBUILD_WITH_L0=0
|
||||
-DIGC_DIR=${LIBDIR}/igc
|
||||
-DGMM_DIR=${LIBDIR}/gmmlib
|
||||
)
|
||||
|
||||
ExternalProject_Add(external_ocloc
|
||||
URL file://${PACKAGE_DIR}/${OCLOC_FILE}
|
||||
URL_HASH ${OCLOC_HASH_TYPE}=${OCLOC_HASH}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
PREFIX ${BUILD_DIR}/ocloc
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/ocloc ${DEFAULT_CMAKE_FLAGS} ${OCLOC_EXTRA_ARGS}
|
||||
INSTALL_DIR ${LIBDIR}/ocloc
|
||||
)
|
||||
|
||||
add_dependencies(
|
||||
external_ocloc
|
||||
external_igc
|
||||
external_gmmlib
|
||||
)
|
|
@ -38,6 +38,7 @@ message("BUILD_DIR = ${BUILD_DIR}")
|
|||
if(WIN32)
|
||||
set(PATCH_CMD ${DOWNLOAD_DIR}/mingw/mingw64/msys/1.0/bin/patch.exe)
|
||||
set(LIBEXT ".lib")
|
||||
set(SHAREDLIBEXT ".lib")
|
||||
set(LIBPREFIX "")
|
||||
|
||||
# For OIIO and OSL
|
||||
|
@ -96,6 +97,7 @@ if(WIN32)
|
|||
else()
|
||||
set(PATCH_CMD patch)
|
||||
set(LIBEXT ".a")
|
||||
set(SHAREDLIBEXT ".so")
|
||||
set(LIBPREFIX "lib")
|
||||
|
||||
if(APPLE)
|
||||
|
|
|
@ -502,3 +502,134 @@ set(LEVEL_ZERO_URI https://github.com/oneapi-src/level-zero/archive/refs/tags/${
|
|||
set(LEVEL_ZERO_HASH c39bb05a8e5898aa6c444e1704105b93d3f1888b9c333f8e7e73825ffbfb2617)
|
||||
set(LEVEL_ZERO_HASH_TYPE SHA256)
|
||||
set(LEVEL_ZERO_FILE level-zero-${LEVEL_ZERO_VERSION}.tar.gz)
|
||||
|
||||
set(DPCPP_VERSION 20220620)
|
||||
set(DPCPP_URI https://github.com/intel/llvm/archive/refs/tags/sycl-nightly/${DPCPP_VERSION}.tar.gz)
|
||||
set(DPCPP_HASH a5f41abd5229d28afa92cbd8a5d8d786ee698bf239f722929fd686276bad692c)
|
||||
set(DPCPP_HASH_TYPE SHA256)
|
||||
set(DPCPP_FILE DPCPP-${DPCPP_VERSION}.tar.gz)
|
||||
|
||||
########################
|
||||
### DPCPP DEPS BEGIN ###
|
||||
########################
|
||||
# The following deps are build time requirements for dpcpp, when possible
|
||||
# the source in the dpcpp source tree for the version chosen is documented
|
||||
# by each dep, these will only have to be downloaded and unpacked, dpcpp
|
||||
# will take care of building them, unpack is being done in dpcpp_deps.cmake
|
||||
|
||||
# Source llvm/lib/SYCLLowerIR/CMakeLists.txt
|
||||
set(VCINTRINSICS_VERSION 984bb27baacce6ee5c716c2e64845f2a1928025b)
|
||||
set(VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/${VCINTRINSICS_VERSION}.tar.gz)
|
||||
set(VCINTRINSICS_HASH abea415a15a0dd11fdc94dee8fb462910f2548311b787e02f42509789e1b0d7b)
|
||||
set(VCINTRINSICS_HASH_TYPE SHA256)
|
||||
set(VCINTRINSICS_FILE vc-intrinsics-${VCINTRINSICS_VERSION}.tar.gz)
|
||||
|
||||
# Source opencl/CMakeLists.txt
|
||||
set(OPENCLHEADERS_VERSION dcd5bede6859d26833cd85f0d6bbcee7382dc9b3)
|
||||
set(OPENCLHEADERS_URI https://github.com/KhronosGroup/OpenCL-Headers/archive/${OPENCLHEADERS_VERSION}.tar.gz)
|
||||
set(OPENCLHEADERS_HASH ca8090359654e94f2c41e946b7e9d826253d795ae809ce7c83a7d3c859624693)
|
||||
set(OPENCLHEADERS_HASH_TYPE SHA256)
|
||||
set(OPENCLHEADERS_FILE opencl_headers-${OPENCLHEADERS_VERSION}.tar.gz)
|
||||
|
||||
# Source opencl/CMakeLists.txt
|
||||
set(ICDLOADER_VERSION aec3952654832211636fc4af613710f80e203b0a)
|
||||
set(ICDLOADER_URI https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/${ICDLOADER_VERSION}.tar.gz)
|
||||
set(ICDLOADER_HASH e1880551d67bd8dc31d13de63b94bbfd6b1f315b6145dad1ffcd159b89bda93c)
|
||||
set(ICDLOADER_HASH_TYPE SHA256)
|
||||
set(ICDLOADER_FILE icdloader-${ICDLOADER_VERSION}.tar.gz)
|
||||
|
||||
# Source sycl/cmake/modules/AddBoostMp11Headers.cmake
|
||||
# Using external MP11 here, getting AddBoostMp11Headers.cmake to recognize
|
||||
# our copy in boost directly was more trouble than it was worth.
|
||||
set(MP11_VERSION 7bc4e1ae9b36ec8ee635c3629b59ec525bbe82b9)
|
||||
set(MP11_URI https://github.com/boostorg/mp11/archive/${MP11_VERSION}.tar.gz)
|
||||
set(MP11_HASH 071ee2bd3952ec89882edb3af25dd1816f6b61723f66e42eea32f4d02ceef426)
|
||||
set(MP11_HASH_TYPE SHA256)
|
||||
set(MP11_FILE mp11-${MP11_VERSION}.tar.gz)
|
||||
|
||||
# Source llvm-spirv/CMakeLists.txt (repo)
|
||||
# Source llvm-spirv/spirv-headers-tag.conf (hash)
|
||||
set(SPIRV_HEADERS_VERSION 36c0c1596225e728bd49abb7ef56a3953e7ed468)
|
||||
set(SPIRV_HEADERS_URI https://github.com/KhronosGroup/SPIRV-Headers/archive/${SPIRV_HEADERS_VERSION}.tar.gz)
|
||||
set(SPIRV_HEADERS_HASH 7a5c89633f8740456fe8adee052033e134476d267411d1336c0cb1e587a9229a)
|
||||
set(SPIRV_HEADERS_HASH_TYPE SHA256)
|
||||
set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
|
||||
|
||||
######################
|
||||
### DPCPP DEPS END ###
|
||||
######################
|
||||
|
||||
##########################################
|
||||
### Intel Graphics Compiler DEPS BEGIN ###
|
||||
##########################################
|
||||
# The following deps are build time requirements for the intel graphics
|
||||
# compiler, the versions used are taken from the following location
|
||||
# https://github.com/intel/intel-graphics-compiler/releases
|
||||
|
||||
set(IGC_VERSION 1.0.11222)
|
||||
set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
|
||||
set(IGC_HASH d92f0608dcbb52690855685f9447282e5c09c0ba98ae35fabf114fcf8b1e9fcf)
|
||||
set(IGC_HASH_TYPE SHA256)
|
||||
set(IGC_FILE igc-${IGC_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_LLVM_VERSION llvmorg-11.1.0)
|
||||
set(IGC_LLVM_URI https://github.com/llvm/llvm-project/archive/refs/tags/${IGC_LLVM_VERSION}.tar.gz)
|
||||
set(IGC_LLVM_HASH 53a0719f3f4b0388013cfffd7b10c7d5682eece1929a9553c722348d1f866e79)
|
||||
set(IGC_LLVM_HASH_TYPE SHA256)
|
||||
set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
|
||||
|
||||
# WARNING WARNING WARNING
|
||||
#
|
||||
# IGC_OPENCL_CLANG contains patches for some of its dependencies.
|
||||
#
|
||||
# Whenever IGC_OPENCL_CLANG_VERSION changes, one *MUST* inspect
|
||||
# IGC_OPENCL_CLANG's patches folder and update igc.cmake to account for
|
||||
# any added or removed patches.
|
||||
#
|
||||
# WARNING WARNING WARNING
|
||||
|
||||
set(IGC_OPENCL_CLANG_VERSION bbdd1587f577397a105c900be114b56755d1f7dc)
|
||||
set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
|
||||
set(IGC_OPENCL_CLANG_HASH d08315f1b0d8a6fef33de2b3e6aa7356534c324910634962c72523d970773efc)
|
||||
set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
|
||||
set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_VCINTRINSICS_VERSION v0.4.0)
|
||||
set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
|
||||
set(IGC_VCINTRINSICS_HASH c8b92682ad5031cf9d5b82a40e7d5c0e763cd9278660adbcaa69aab988e4b589)
|
||||
set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
|
||||
set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_SPIRV_HEADERS_VERSION sdk-1.3.204.1)
|
||||
set(IGC_SPIRV_HEADERS_URI https://github.com/KhronosGroup/SPIRV-Headers/archive/refs/tags/${IGC_SPIRV_HEADERS_VERSION}.tar.gz)
|
||||
set(IGC_SPIRV_HEADERS_HASH 262864053968c217d45b24b89044a7736a32361894743dd6cfe788df258c746c)
|
||||
set(IGC_SPIRV_HEADERS_HASH_TYPE SHA256)
|
||||
set(IGC_SPIRV_HEADERS_FILE SPIR-V-Headers-${IGC_SPIRV_HEADERS_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_SPIRV_TOOLS_VERSION sdk-1.3.204.1)
|
||||
set(IGC_SPIRV_TOOLS_URI https://github.com/KhronosGroup/SPIRV-Tools/archive/refs/tags/${IGC_SPIRV_TOOLS_VERSION}.tar.gz)
|
||||
set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc103b087335)
|
||||
set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
|
||||
set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_SPIRV_TRANSLATOR_VERSION 99420daab98998a7e36858befac9c5ed109d4920)
|
||||
set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
|
||||
set(IGC_SPIRV_TRANSLATOR_HASH 77dfb4ddb6bfb993535562c02ddea23f0a0d1c5a0258c1afe7e27c894ff783a8)
|
||||
set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
|
||||
set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
|
||||
|
||||
########################################
|
||||
### Intel Graphics Compiler DEPS END ###
|
||||
########################################
|
||||
|
||||
set(GMMLIB_VERSION intel-gmmlib-22.1.2)
|
||||
set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
|
||||
set(GMMLIB_HASH 3b9a6d5e7e3f5748b3d0a2fb0e980ae943907fece0980bd9c0508e71c838e334)
|
||||
set(GMMLIB_HASH_TYPE SHA256)
|
||||
set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)
|
||||
|
||||
set(OCLOC_VERSION 22.20.23198)
|
||||
set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
|
||||
set(OCLOC_HASH ab22b8bf2560a57fdd3def0e35a62ca75991406f959c0263abb00cd6cd9ae998)
|
||||
set(OCLOC_HASH_TYPE SHA256)
|
||||
set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
diff -Naur external_dpcpp.orig/sycl/source/CMakeLists.txt external_dpcpp/sycl/source/CMakeLists.txt
|
||||
--- external_dpcpp.orig/sycl/source/CMakeLists.txt 2022-05-20 04:19:45.067771362 +0000
|
||||
+++ external_dpcpp/sycl/source/CMakeLists.txt 2022-05-20 04:21:49.708025048 +0000
|
||||
@@ -66,10 +66,10 @@
|
||||
target_compile_options(${LIB_OBJ_NAME} PUBLIC
|
||||
-fvisibility=hidden -fvisibility-inlines-hidden)
|
||||
set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt")
|
||||
- set(abi_linker_script "${CMAKE_CURRENT_SOURCE_DIR}/abi_replacements_linux.txt")
|
||||
- target_link_libraries(
|
||||
- ${LIB_NAME} PRIVATE "-Wl,${abi_linker_script}")
|
||||
- set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${abi_linker_script})
|
||||
+# set(abi_linker_script "${CMAKE_CURRENT_SOURCE_DIR}/abi_replacements_linux.txt")
|
||||
+# target_link_libraries(
|
||||
+# ${LIB_NAME} PRIVATE "-Wl,${abi_linker_script}")
|
||||
+# set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${abi_linker_script})
|
||||
target_link_libraries(
|
||||
${LIB_NAME} PRIVATE "-Wl,--version-script=${linker_script}")
|
||||
set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${linker_script})
|
||||
diff -Naur llvm-sycl-nightly-20220501.orig\opencl/CMakeLists.txt llvm-sycl-nightly-20220501\opencl/CMakeLists.txt
|
||||
--- llvm-sycl-nightly-20220501.orig/opencl/CMakeLists.txt 2022-04-29 13:47:11 -0600
|
||||
+++ llvm-sycl-nightly-20220501/opencl/CMakeLists.txt 2022-05-21 15:25:06 -0600
|
||||
@@ -11,6 +11,11 @@
|
||||
)
|
||||
endif()
|
||||
|
||||
+# Blender code below is determined to use FetchContent_Declare
|
||||
+# temporarily allow it (but feed it our downloaded tarball
|
||||
+# in the OpenCL_HEADERS variable
|
||||
+set(FETCHCONTENT_FULLY_DISCONNECTED OFF)
|
||||
+
|
||||
# Repo URLs
|
||||
|
||||
set(OCL_HEADERS_REPO
|
||||
@@ -77,5 +82,6 @@
|
||||
|
||||
FetchContent_MakeAvailable(ocl-icd)
|
||||
add_library(OpenCL-ICD ALIAS OpenCL)
|
||||
+set(FETCHCONTENT_FULLY_DISCONNECTED ON)
|
||||
|
||||
add_subdirectory(opencl-aot)
|
||||
diff -Naur llvm-sycl-nightly-20220208.orig/libdevice/cmake/modules/SYCLLibdevice.cmake llvm-sycl-nightly-20220208/libdevice/cmake/modules/SYCLLibdevice.cmake
|
||||
--- llvm-sycl-nightly-20220208.orig/libdevice/cmake/modules/SYCLLibdevice.cmake 2022-02-08 09:17:24 -0700
|
||||
+++ llvm-sycl-nightly-20220208/libdevice/cmake/modules/SYCLLibdevice.cmake 2022-05-24 11:35:51 -0600
|
||||
@@ -36,7 +36,9 @@
|
||||
add_custom_target(libsycldevice-obj)
|
||||
add_custom_target(libsycldevice-spv)
|
||||
|
||||
-add_custom_target(libsycldevice DEPENDS
|
||||
+# Blender: add ALL here otherwise this target will not build
|
||||
+# and cause an error due to missing files during the install phase.
|
||||
+add_custom_target(libsycldevice ALL DEPENDS
|
||||
libsycldevice-obj
|
||||
libsycldevice-spv)
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
|
||||
--- external_igc_opencl_clang.orig/CMakeLists.txt 2022-03-16 05:51:10 -0600
|
||||
+++ external_igc_opencl_clang/CMakeLists.txt 2022-05-23 10:40:09 -0600
|
||||
@@ -126,22 +126,24 @@
|
||||
)
|
||||
endif()
|
||||
|
||||
-
|
||||
- set(SPIRV_BASE_REVISION llvm_release_110)
|
||||
- set(TARGET_BRANCH "ocl-open-110")
|
||||
- get_filename_component(LLVM_MONOREPO_DIR ${LLVM_SOURCE_DIR} DIRECTORY)
|
||||
- set(LLVM_PATCHES_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm
|
||||
- ${CMAKE_CURRENT_SOURCE_DIR}/patches/clang)
|
||||
- apply_patches(${LLVM_MONOREPO_DIR}
|
||||
- "${LLVM_PATCHES_DIRS}"
|
||||
- ${LLVM_BASE_REVISION}
|
||||
- ${TARGET_BRANCH}
|
||||
- ret)
|
||||
- apply_patches(${SPIRV_SOURCE_DIR}
|
||||
- ${CMAKE_CURRENT_SOURCE_DIR}/patches/spirv
|
||||
- ${SPIRV_BASE_REVISION}
|
||||
- ${TARGET_BRANCH}
|
||||
- ret)
|
||||
+ #
|
||||
+ # Blender: Why apply these manually in igc.cmake
|
||||
+ #
|
||||
+ #set(SPIRV_BASE_REVISION llvm_release_110)
|
||||
+ #set(TARGET_BRANCH "ocl-open-110")
|
||||
+ #get_filename_component(LLVM_MONOREPO_DIR ${LLVM_SOURCE_DIR} DIRECTORY)
|
||||
+ #set(LLVM_PATCHES_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/patches/llvm
|
||||
+ # ${CMAKE_CURRENT_SOURCE_DIR}/patches/clang)
|
||||
+ #apply_patches(${LLVM_MONOREPO_DIR}
|
||||
+ # "${LLVM_PATCHES_DIRS}"
|
||||
+ # ${LLVM_BASE_REVISION}
|
||||
+ # ${TARGET_BRANCH}
|
||||
+ # ret)
|
||||
+ #apply_patches(${SPIRV_SOURCE_DIR}
|
||||
+ # ${CMAKE_CURRENT_SOURCE_DIR}/patches/spirv
|
||||
+ # ${SPIRV_BASE_REVISION}
|
||||
+ # ${TARGET_BRANCH}
|
||||
+ # ret)
|
||||
endif(NOT USE_PREBUILT_LLVM)
|
||||
|
||||
#
|
|
@ -0,0 +1,56 @@
|
|||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# Copyright 2021-2022 Intel Corporation
|
||||
|
||||
# - Find Level Zero library
|
||||
# Find Level Zero headers and libraries needed by oneAPI implementation
|
||||
# This module defines
|
||||
# LEVEL_ZERO_LIBRARY, libraries to link against in order to use L0.
|
||||
# LEVEL_ZERO_INCLUDE_DIR, directories where L0 headers can be found.
|
||||
# LEVEL_ZERO_ROOT_DIR, The base directory to search for L0 files.
|
||||
# This can also be an environment variable.
|
||||
# LEVEL_ZERO_FOUND, If false, then don't try to use L0.
|
||||
|
||||
IF(NOT LEVEL_ZERO_ROOT_DIR AND NOT $ENV{LEVEL_ZERO_ROOT_DIR} STREQUAL "")
|
||||
SET(LEVEL_ZERO_ROOT_DIR $ENV{LEVEL_ZERO_ROOT_DIR})
|
||||
ENDIF()
|
||||
|
||||
SET(_level_zero_search_dirs
|
||||
${LEVEL_ZERO_ROOT_DIR}
|
||||
/usr/lib
|
||||
/usr/local/lib
|
||||
)
|
||||
|
||||
FIND_LIBRARY(_LEVEL_ZERO_LIBRARY
|
||||
NAMES
|
||||
ze_loader
|
||||
HINTS
|
||||
${_level_zero_search_dirs}
|
||||
PATH_SUFFIXES
|
||||
lib64 lib
|
||||
)
|
||||
|
||||
FIND_PATH(_LEVEL_ZERO_INCLUDE_DIR
|
||||
NAMES
|
||||
level_zero/ze_api.h
|
||||
HINTS
|
||||
${_level_zero_search_dirs}
|
||||
PATH_SUFFIXES
|
||||
include
|
||||
)
|
||||
|
||||
INCLUDE(FindPackageHandleStandardArgs)
|
||||
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LevelZero DEFAULT_MSG _LEVEL_ZERO_LIBRARY _LEVEL_ZERO_INCLUDE_DIR)
|
||||
|
||||
IF(LevelZero_FOUND)
|
||||
SET(LEVEL_ZERO_LIBRARY ${_LEVEL_ZERO_LIBRARY})
|
||||
SET(LEVEL_ZERO_INCLUDE_DIR ${_LEVEL_ZERO_INCLUDE_DIR} ${_LEVEL_ZERO_INCLUDE_PARENT_DIR})
|
||||
SET(LEVEL_ZERO_FOUND TRUE)
|
||||
ELSE()
|
||||
SET(LEVEL_ZERO_FOUND FALSE)
|
||||
ENDIF()
|
||||
|
||||
MARK_AS_ADVANCED(
|
||||
LEVEL_ZERO_LIBRARY
|
||||
LEVEL_ZERO_INCLUDE_DIR
|
||||
)
|
|
@ -0,0 +1,88 @@
|
|||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# Copyright 2021-2022 Intel Corporation
|
||||
|
||||
# - Find SYCL library
|
||||
# Find the native SYCL header and libraries needed by oneAPI implementation
|
||||
# This module defines
|
||||
# SYCL_COMPILER, compiler which will be used for compilation of SYCL code
|
||||
# SYCL_LIBRARY, libraries to link against in order to use SYCL.
|
||||
# SYCL_INCLUDE_DIR, directories where SYCL headers can be found
|
||||
# SYCL_ROOT_DIR, The base directory to search for SYCL files.
|
||||
# This can also be an environment variable.
|
||||
# SYCL_FOUND, If false, then don't try to use SYCL.
|
||||
|
||||
IF(NOT SYCL_ROOT_DIR AND NOT $ENV{SYCL_ROOT_DIR} STREQUAL "")
|
||||
SET(SYCL_ROOT_DIR $ENV{SYCL_ROOT_DIR})
|
||||
ENDIF()
|
||||
|
||||
SET(_sycl_search_dirs
|
||||
${SYCL_ROOT_DIR}
|
||||
/usr/lib
|
||||
/usr/local/lib
|
||||
/opt/intel/oneapi/compiler/latest/linux/
|
||||
C:/Program\ Files\ \(x86\)/Intel/oneAPI/compiler/latest/windows
|
||||
)
|
||||
|
||||
# Find DPC++ compiler.
|
||||
# Since the compiler name is possibly conflicting with the system-wide
|
||||
# CLang start with looking for either dpcpp or clang binary in the given
|
||||
# list of search paths only. If that fails, try to look for a system-wide
|
||||
# dpcpp binary.
|
||||
FIND_PROGRAM(SYCL_COMPILER
|
||||
NAMES
|
||||
dpcpp
|
||||
clang++
|
||||
HINTS
|
||||
${_sycl_search_dirs}
|
||||
PATH_SUFFIXES
|
||||
bin
|
||||
NO_CMAKE_FIND_ROOT_PATH
|
||||
NAMES_PER_DIR
|
||||
)
|
||||
|
||||
# NOTE: No clang++ here so that we do not pick up a system-wide CLang
|
||||
# compiler.
|
||||
if(NOT SYCL_COMPILER)
|
||||
FIND_PROGRAM(SYCL_COMPILER
|
||||
NAMES
|
||||
dpcpp
|
||||
HINTS
|
||||
${_sycl_search_dirs}
|
||||
PATH_SUFFIXES
|
||||
bin
|
||||
)
|
||||
endif()
|
||||
|
||||
FIND_LIBRARY(SYCL_LIBRARY
|
||||
NAMES
|
||||
sycl
|
||||
HINTS
|
||||
${_sycl_search_dirs}
|
||||
PATH_SUFFIXES
|
||||
lib64 lib
|
||||
)
|
||||
|
||||
FIND_PATH(SYCL_INCLUDE_DIR
|
||||
NAMES
|
||||
CL/sycl.hpp
|
||||
HINTS
|
||||
${_sycl_search_dirs}
|
||||
PATH_SUFFIXES
|
||||
include
|
||||
include/sycl
|
||||
)
|
||||
|
||||
INCLUDE(FindPackageHandleStandardArgs)
|
||||
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(SYCL DEFAULT_MSG SYCL_LIBRARY SYCL_INCLUDE_DIR)
|
||||
|
||||
IF(SYCL_FOUND)
|
||||
get_filename_component(_SYCL_INCLUDE_PARENT_DIR ${SYCL_INCLUDE_DIR} DIRECTORY)
|
||||
SET(SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR} ${_SYCL_INCLUDE_PARENT_DIR})
|
||||
ELSE()
|
||||
SET(SYCL_SYCL_FOUND FALSE)
|
||||
ENDIF()
|
||||
|
||||
MARK_AS_ADVANCED(
|
||||
_SYCL_INCLUDE_PARENT_DIR
|
||||
)
|
|
@ -70,7 +70,8 @@ if(NOT WIN32)
|
|||
set(WITH_JACK ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
if(WIN32)
|
||||
set(WITH_WASAPI ON CACHE BOOL "" FORCE)
|
||||
set(WITH_WASAPI ON CACHE BOOL "" FORCE)
|
||||
set(WITH_CYCLES_DEVICE_ONEAPI ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
if(UNIX AND NOT APPLE)
|
||||
set(WITH_DOC_MANPAGE ON CACHE BOOL "" FORCE)
|
||||
|
@ -78,6 +79,11 @@ if(UNIX AND NOT APPLE)
|
|||
set(WITH_PULSEAUDIO ON CACHE BOOL "" FORCE)
|
||||
set(WITH_X11_XINPUT ON CACHE BOOL "" FORCE)
|
||||
set(WITH_X11_XF86VMODE ON CACHE BOOL "" FORCE)
|
||||
|
||||
# Disable oneAPI on Linux for the time being.
|
||||
# The AoT compilation takes too long to be used officially in the buildbot CI/CD and the JIT
|
||||
# compilation has ABI compatibility issues when running builds made on centOS on Ubuntu.
|
||||
set(WITH_CYCLES_DEVICE_ONEAPI OFF CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
if(NOT APPLE)
|
||||
set(WITH_XR_OPENXR ON CACHE BOOL "" FORCE)
|
||||
|
@ -86,4 +92,7 @@ if(NOT APPLE)
|
|||
set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
|
||||
set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
|
||||
set(WITH_CYCLES_HIP_BINARIES ON CACHE BOOL "" FORCE)
|
||||
|
||||
# Disable AoT kernels compilations until buildbot can deliver them in a reasonabel time.
|
||||
set(WITH_CYCLES_ONEAPI_BINARIES OFF CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
|
|
|
@ -38,9 +38,15 @@ if(EXISTS ${LIBDIR})
|
|||
message(STATUS "Using pre-compiled LIBDIR: ${LIBDIR}")
|
||||
|
||||
file(GLOB LIB_SUBDIRS ${LIBDIR}/*)
|
||||
|
||||
# Ignore Mesa software OpenGL libraries, they are not intended to be
|
||||
# linked against but to optionally override at runtime.
|
||||
list(REMOVE_ITEM LIB_SUBDIRS ${LIBDIR}/mesa)
|
||||
|
||||
# Ignore DPC++ as it contains its own copy of LLVM/CLang which we do
|
||||
# not need to be ever discovered for the Blender linking.
|
||||
list(REMOVE_ITEM LIB_SUBDIRS ${LIBDIR}/dpcpp)
|
||||
|
||||
# NOTE: Make sure "proper" compiled zlib comes first before the one
|
||||
# which is a part of OpenCollada. They have different ABI, and we
|
||||
# do need to use the official one.
|
||||
|
@ -271,6 +277,18 @@ if(WITH_CYCLES AND WITH_CYCLES_OSL)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_ONEAPI)
|
||||
set(CYCLES_LEVEL_ZERO ${LIBDIR}/level-zero CACHE PATH "Path to Level Zero installation")
|
||||
if(EXISTS ${CYCLES_LEVEL_ZERO} AND NOT LEVEL_ZERO_ROOT_DIR)
|
||||
set(LEVEL_ZERO_ROOT_DIR ${CYCLES_LEVEL_ZERO})
|
||||
endif()
|
||||
|
||||
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to DPC++ and SYCL installation")
|
||||
if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
|
||||
set(SYCL_ROOT_DIR ${CYCLES_SYCL})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_OPENVDB)
|
||||
find_package_wrapper(OpenVDB)
|
||||
find_package_wrapper(Blosc)
|
||||
|
|
|
@ -950,3 +950,6 @@ endif()
|
|||
|
||||
set(ZSTD_INCLUDE_DIRS ${LIBDIR}/zstd/include)
|
||||
set(ZSTD_LIBRARIES ${LIBDIR}/zstd/lib/zstd_static.lib)
|
||||
|
||||
set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
|
||||
set(SYCL_ROOT_DIR ${LIBDIR}/dpcpp)
|
||||
|
|
|
@ -263,6 +263,10 @@ if(WITH_CYCLES_DEVICE_OPTIX)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (WITH_CYCLES_DEVICE_ONEAPI)
|
||||
add_definitions(-DWITH_ONEAPI)
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_EMBREE)
|
||||
add_definitions(-DWITH_EMBREE)
|
||||
include_directories(
|
||||
|
|
|
@ -118,7 +118,8 @@ enum_device_type = (
|
|||
('CUDA', "CUDA", "CUDA", 1),
|
||||
('OPTIX', "OptiX", "OptiX", 3),
|
||||
('HIP', "HIP", "HIP", 4),
|
||||
('METAL', "Metal", "Metal", 5)
|
||||
('METAL', "Metal", "Metal", 5),
|
||||
('ONEAPI', "oneAPI", "oneAPI", 6)
|
||||
)
|
||||
|
||||
enum_texture_limit = (
|
||||
|
@ -1397,7 +1398,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
|||
|
||||
def get_device_types(self, context):
|
||||
import _cycles
|
||||
has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types()
|
||||
has_cuda, has_optix, has_hip, has_metal, has_oneapi = _cycles.get_device_types()
|
||||
|
||||
list = [('NONE', "None", "Don't use compute device", 0)]
|
||||
if has_cuda:
|
||||
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
|
||||
|
@ -1407,6 +1409,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
|||
list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
|
||||
if has_metal:
|
||||
list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5))
|
||||
if has_oneapi:
|
||||
list.append(('ONEAPI', "oneAPI", "Use oneAPI for GPU acceleration", 6))
|
||||
|
||||
return list
|
||||
|
||||
|
@ -1438,7 +1442,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
|||
|
||||
def update_device_entries(self, device_list):
|
||||
for device in device_list:
|
||||
if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}:
|
||||
if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL', 'ONEAPI'}:
|
||||
continue
|
||||
# Try to find existing Device entry
|
||||
entry = self.find_existing_device_entry(device)
|
||||
|
@ -1482,7 +1486,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
|||
import _cycles
|
||||
# Ensure `self.devices` is not re-allocated when the second call to
|
||||
# get_devices_for_type is made, freeing items from the first list.
|
||||
for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'):
|
||||
for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL', 'ONEAPI'):
|
||||
self.update_device_entries(_cycles.available_devices(device_type))
|
||||
|
||||
# Deprecated: use refresh_devices instead.
|
||||
|
@ -1550,13 +1554,25 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
|||
elif sys.platform.startswith("linux"):
|
||||
col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
|
||||
col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
|
||||
elif device_type == 'ONEAPI':
|
||||
import sys
|
||||
col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
|
||||
if sys.platform.startswith("win"):
|
||||
col.label(text="and Windows driver version 101.1660 or newer", icon='BLANK1')
|
||||
elif sys.platform.startswith("linux"):
|
||||
col.label(text="and Linux driver version xx.xx.20066 or newer", icon='BLANK1')
|
||||
elif device_type == 'METAL':
|
||||
col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
|
||||
col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
|
||||
return
|
||||
|
||||
for device in devices:
|
||||
box.prop(device, "use", text=device.name)
|
||||
import unicodedata
|
||||
box.prop(device, "use", text=device.name
|
||||
.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
|
||||
.replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
|
||||
.replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
|
||||
)
|
||||
|
||||
def draw_impl(self, layout, context):
|
||||
row = layout.row()
|
||||
|
|
|
@ -110,6 +110,10 @@ def use_optix(context):
|
|||
|
||||
return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
|
||||
|
||||
def use_oneapi(context):
|
||||
cscene = context.scene.cycles
|
||||
|
||||
return (get_device_type(context) == 'ONEAPI' and cscene.device == 'GPU')
|
||||
|
||||
def use_multi_device(context):
|
||||
cscene = context.scene.cycles
|
||||
|
|
|
@ -15,6 +15,7 @@ enum ComputeDevice {
|
|||
COMPUTE_DEVICE_OPTIX = 3,
|
||||
COMPUTE_DEVICE_HIP = 4,
|
||||
COMPUTE_DEVICE_METAL = 5,
|
||||
COMPUTE_DEVICE_ONEAPI = 6,
|
||||
|
||||
COMPUTE_DEVICE_NUM
|
||||
};
|
||||
|
@ -76,6 +77,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
|
|||
else if (compute_device == COMPUTE_DEVICE_METAL) {
|
||||
mask |= DEVICE_MASK_METAL;
|
||||
}
|
||||
else if (compute_device == COMPUTE_DEVICE_ONEAPI) {
|
||||
mask |= DEVICE_MASK_ONEAPI;
|
||||
}
|
||||
vector<DeviceInfo> devices = Device::available_devices(mask);
|
||||
|
||||
/* Match device preferences and available devices. */
|
||||
|
|
|
@ -871,18 +871,20 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
|
|||
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
|
||||
{
|
||||
vector<DeviceType> device_types = Device::available_types();
|
||||
bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false;
|
||||
bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false;
|
||||
foreach (DeviceType device_type, device_types) {
|
||||
has_cuda |= (device_type == DEVICE_CUDA);
|
||||
has_optix |= (device_type == DEVICE_OPTIX);
|
||||
has_hip |= (device_type == DEVICE_HIP);
|
||||
has_metal |= (device_type == DEVICE_METAL);
|
||||
has_oneapi |= (device_type == DEVICE_ONEAPI);
|
||||
}
|
||||
PyObject *list = PyTuple_New(4);
|
||||
PyObject *list = PyTuple_New(5);
|
||||
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
|
||||
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
|
||||
PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
|
||||
PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal));
|
||||
PyTuple_SET_ITEM(list, 4, PyBool_FromLong(has_oneapi));
|
||||
return list;
|
||||
}
|
||||
|
||||
|
@ -914,6 +916,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
|
|||
else if (override == "METAL") {
|
||||
BlenderSession::device_override = DEVICE_MASK_METAL;
|
||||
}
|
||||
else if (override == "ONEAPI") {
|
||||
BlenderSession::device_override = DEVICE_MASK_ONEAPI;
|
||||
}
|
||||
else {
|
||||
printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
|
||||
Py_RETURN_FALSE;
|
||||
|
|
|
@ -91,6 +91,8 @@ if(CYCLES_STANDALONE_REPOSITORY)
|
|||
_set_default(USD_ROOT_DIR "${_cycles_lib_dir}/usd")
|
||||
_set_default(WEBP_ROOT_DIR "${_cycles_lib_dir}/webp")
|
||||
_set_default(ZLIB_ROOT "${_cycles_lib_dir}/zlib")
|
||||
_set_default(LEVEL_ZERO_ROOT_DIR "${_cycles_lib_dir}/level-zero")
|
||||
_set_default(SYCL_ROOT_DIR "${_cycles_lib_dir}/dpcpp")
|
||||
|
||||
# Ignore system libraries
|
||||
set(CMAKE_IGNORE_PATH "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES};${CMAKE_SYSTEM_INCLUDE_PATH};${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES};${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}")
|
||||
|
@ -647,3 +649,22 @@ if(WITH_CYCLES_DEVICE_METAL)
|
|||
message(STATUS "Found Metal: ${METAL_LIBRARY}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
###########################################################################
|
||||
# oneAPI
|
||||
###########################################################################
|
||||
|
||||
if (WITH_CYCLES_DEVICE_ONEAPI)
|
||||
find_package(SYCL)
|
||||
find_package(LevelZero)
|
||||
|
||||
if (SYCL_FOUND AND LEVEL_ZERO_FOUND)
|
||||
message(STATUS "Found oneAPI: ${SYCL_LIBRARY}")
|
||||
message(STATUS "Found Level Zero: ${LEVEL_ZERO_LIBRARY}")
|
||||
else()
|
||||
message(STATUS "oneAPI or Level Zero not found, disabling oneAPI device from Cycles")
|
||||
set(WITH_CYCLES_DEVICE_ONEAPI OFF)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
unset(_cycles_lib_dir)
|
||||
|
|
|
@ -82,6 +82,15 @@ set(SRC_HIP
|
|||
hip/util.h
|
||||
)
|
||||
|
||||
set(SRC_ONEAPI
|
||||
oneapi/device_impl.cpp
|
||||
oneapi/device_impl.h
|
||||
oneapi/device.cpp
|
||||
oneapi/device.h
|
||||
oneapi/queue.cpp
|
||||
oneapi/queue.h
|
||||
)
|
||||
|
||||
set(SRC_DUMMY
|
||||
dummy/device.cpp
|
||||
dummy/device.h
|
||||
|
@ -134,6 +143,7 @@ set(SRC
|
|||
${SRC_DUMMY}
|
||||
${SRC_MULTI}
|
||||
${SRC_OPTIX}
|
||||
${SRC_ONEAPI}
|
||||
${SRC_HEADERS}
|
||||
)
|
||||
|
||||
|
@ -181,6 +191,9 @@ if(WITH_CYCLES_DEVICE_METAL)
|
|||
${SRC_METAL}
|
||||
)
|
||||
endif()
|
||||
if (WITH_CYCLES_DEVICE_ONEAPI)
|
||||
add_definitions(-DWITH_ONEAPI)
|
||||
endif()
|
||||
|
||||
if(WITH_OPENIMAGEDENOISE)
|
||||
list(APPEND LIB
|
||||
|
@ -193,6 +206,11 @@ include_directories(SYSTEM ${INC_SYS})
|
|||
|
||||
cycles_add_library(cycles_device "${LIB}" ${SRC})
|
||||
|
||||
if(WITH_CYCLES_DEVICE_ONEAPI)
|
||||
# Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due external project behaviour
|
||||
add_dependencies(cycles_device cycles_kernel_oneapi)
|
||||
endif()
|
||||
|
||||
source_group("cpu" FILES ${SRC_CPU})
|
||||
source_group("cuda" FILES ${SRC_CUDA})
|
||||
source_group("dummy" FILES ${SRC_DUMMY})
|
||||
|
@ -200,4 +218,5 @@ source_group("hip" FILES ${SRC_HIP})
|
|||
source_group("multi" FILES ${SRC_MULTI})
|
||||
source_group("metal" FILES ${SRC_METAL})
|
||||
source_group("optix" FILES ${SRC_OPTIX})
|
||||
source_group("oneapi" FILES ${SRC_ONEAPI})
|
||||
source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS})
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include "device/hip/device.h"
|
||||
#include "device/metal/device.h"
|
||||
#include "device/multi/device.h"
|
||||
#include "device/oneapi/device.h"
|
||||
#include "device/optix/device.h"
|
||||
|
||||
#include "util/foreach.h"
|
||||
|
@ -39,6 +40,7 @@ vector<DeviceInfo> Device::optix_devices;
|
|||
vector<DeviceInfo> Device::cpu_devices;
|
||||
vector<DeviceInfo> Device::hip_devices;
|
||||
vector<DeviceInfo> Device::metal_devices;
|
||||
vector<DeviceInfo> Device::oneapi_devices;
|
||||
uint Device::devices_initialized_mask = 0;
|
||||
|
||||
/* Device */
|
||||
|
@ -101,6 +103,13 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
|||
device = device_metal_create(info, stats, profiler);
|
||||
break;
|
||||
#endif
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
case DEVICE_ONEAPI:
|
||||
device = device_oneapi_create(info, stats, profiler);
|
||||
break;
|
||||
#endif
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -126,6 +135,8 @@ DeviceType Device::type_from_string(const char *name)
|
|||
return DEVICE_HIP;
|
||||
else if (strcmp(name, "METAL") == 0)
|
||||
return DEVICE_METAL;
|
||||
else if (strcmp(name, "ONEAPI") == 0)
|
||||
return DEVICE_ONEAPI;
|
||||
|
||||
return DEVICE_NONE;
|
||||
}
|
||||
|
@ -144,6 +155,8 @@ string Device::string_from_type(DeviceType type)
|
|||
return "HIP";
|
||||
else if (type == DEVICE_METAL)
|
||||
return "METAL";
|
||||
else if (type == DEVICE_ONEAPI)
|
||||
return "ONEAPI";
|
||||
|
||||
return "";
|
||||
}
|
||||
|
@ -163,6 +176,9 @@ vector<DeviceType> Device::available_types()
|
|||
#endif
|
||||
#ifdef WITH_METAL
|
||||
types.push_back(DEVICE_METAL);
|
||||
#endif
|
||||
#ifdef WITH_ONEAPI
|
||||
types.push_back(DEVICE_ONEAPI);
|
||||
#endif
|
||||
return types;
|
||||
}
|
||||
|
@ -219,6 +235,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
if (mask & DEVICE_MASK_ONEAPI) {
|
||||
if (!(devices_initialized_mask & DEVICE_MASK_ONEAPI)) {
|
||||
if (device_oneapi_init()) {
|
||||
device_oneapi_info(oneapi_devices);
|
||||
}
|
||||
devices_initialized_mask |= DEVICE_MASK_ONEAPI;
|
||||
}
|
||||
foreach (DeviceInfo &info, oneapi_devices) {
|
||||
devices.push_back(info);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (mask & DEVICE_MASK_CPU) {
|
||||
if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
|
||||
device_cpu_info(cpu_devices);
|
||||
|
@ -282,6 +312,15 @@ string Device::device_capabilities(uint mask)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
if (mask & DEVICE_MASK_ONEAPI) {
|
||||
if (device_oneapi_init()) {
|
||||
capabilities += "\noneAPI device capabilities:\n";
|
||||
capabilities += device_oneapi_capabilities();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_METAL
|
||||
if (mask & DEVICE_MASK_METAL) {
|
||||
if (device_metal_init()) {
|
||||
|
@ -380,6 +419,7 @@ void Device::free_memory()
|
|||
cuda_devices.free_memory();
|
||||
optix_devices.free_memory();
|
||||
hip_devices.free_memory();
|
||||
oneapi_devices.free_memory();
|
||||
cpu_devices.free_memory();
|
||||
metal_devices.free_memory();
|
||||
}
|
||||
|
|
|
@ -40,6 +40,7 @@ enum DeviceType {
|
|||
DEVICE_OPTIX,
|
||||
DEVICE_HIP,
|
||||
DEVICE_METAL,
|
||||
DEVICE_ONEAPI,
|
||||
DEVICE_DUMMY,
|
||||
};
|
||||
|
||||
|
@ -49,6 +50,7 @@ enum DeviceTypeMask {
|
|||
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
|
||||
DEVICE_MASK_HIP = (1 << DEVICE_HIP),
|
||||
DEVICE_MASK_METAL = (1 << DEVICE_METAL),
|
||||
DEVICE_MASK_ONEAPI = (1 << DEVICE_ONEAPI),
|
||||
DEVICE_MASK_ALL = ~0
|
||||
};
|
||||
|
||||
|
@ -273,6 +275,7 @@ class Device {
|
|||
static vector<DeviceInfo> cpu_devices;
|
||||
static vector<DeviceInfo> hip_devices;
|
||||
static vector<DeviceInfo> metal_devices;
|
||||
static vector<DeviceInfo> oneapi_devices;
|
||||
static uint devices_initialized_mask;
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,181 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#include "device/oneapi/device.h"
|
||||
|
||||
#include "util/log.h"
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
# include "device/device.h"
|
||||
# include "device/oneapi/device_impl.h"
|
||||
|
||||
# include "util/path.h"
|
||||
# include "util/string.h"
|
||||
|
||||
# ifdef __linux__
|
||||
# include <dlfcn.h>
|
||||
# endif
|
||||
#endif /* WITH_ONEAPI */
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
static OneAPIDLLInterface oneapi_dll;
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path))
|
||||
# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle)
|
||||
# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name)
|
||||
#elif __linux__
|
||||
# define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW)
|
||||
# define FREE_SHARED_LIBRARY(handle) dlclose(handle)
|
||||
# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name)
|
||||
#endif
|
||||
|
||||
bool device_oneapi_init()
|
||||
{
|
||||
#if !defined(WITH_ONEAPI)
|
||||
return false;
|
||||
#else
|
||||
|
||||
string lib_path = path_get("lib");
|
||||
# ifdef _WIN32
|
||||
lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll");
|
||||
# else
|
||||
lib_path = path_join(lib_path, "cycles_kernel_oneapi.so");
|
||||
# endif
|
||||
void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str());
|
||||
|
||||
/* This shouldn't happen, but it still makes sense to have a branch for this. */
|
||||
if (lib_handle == NULL) {
|
||||
LOG(ERROR) << "oneAPI kernel shared library cannot be loaded for some reason. This should not "
|
||||
"happen, however, it occurs hence oneAPI rendering will be disabled";
|
||||
return false;
|
||||
}
|
||||
|
||||
# define DLL_INTERFACE_CALL(function, return_type, ...) \
|
||||
(oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \
|
||||
GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \
|
||||
if (oneapi_dll.function == NULL) { \
|
||||
LOG(ERROR) << "oneAPI shared library function \"" << #function \
|
||||
<< "\" has not been loaded from kernel shared - disable oneAPI " \
|
||||
"library disable oneAPI implementation due to this"; \
|
||||
FREE_SHARED_LIBRARY(lib_handle); \
|
||||
return false; \
|
||||
}
|
||||
# include "kernel/device/oneapi/dll_interface_template.h"
|
||||
# undef DLL_INTERFACE_CALL
|
||||
|
||||
VLOG_INFO << "oneAPI kernel shared library has been loaded successfully";
|
||||
|
||||
/* We need to have this oneapi kernel shared library during all life-span of the Blender.
|
||||
* So it is not unloaded because of this.
|
||||
* FREE_SHARED_LIBRARY(lib_handle); */
|
||||
|
||||
/* NOTE(@nsirgien): we need to enable JIT cache from here and
|
||||
* right now this cache policy is controlled by env. variables. */
|
||||
/* NOTE(hallade) we also disable use of copy engine as it
|
||||
* improves stability as of intel/llvm sycl-nightly/20220529.
|
||||
* All these env variable can be set beforehand by end-users and
|
||||
* will in that case -not- be overwritten. */
|
||||
# ifdef _WIN32
|
||||
if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) {
|
||||
_putenv_s("SYCL_CACHE_PERSISTENT", "1");
|
||||
}
|
||||
if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) {
|
||||
_putenv_s("SYCL_CACHE_THRESHOLD", "0");
|
||||
}
|
||||
if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
|
||||
_putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
|
||||
}
|
||||
if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) {
|
||||
_putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0");
|
||||
}
|
||||
# elif __linux__
|
||||
setenv("SYCL_CACHE_PERSISTENT", "1", false);
|
||||
setenv("SYCL_CACHE_THRESHOLD", "0", false);
|
||||
setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
|
||||
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
|
||||
# endif
|
||||
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(__linux__)
|
||||
# undef LOAD_SYCL_SHARED_LIBRARY
|
||||
# undef LOAD_ONEAPI_SHARED_LIBRARY
|
||||
# undef FREE_SHARED_LIBRARY
|
||||
# undef GET_SHARED_LIBRARY_SYMBOL
|
||||
#endif
|
||||
|
||||
Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
{
|
||||
#ifdef WITH_ONEAPI
|
||||
return new OneapiDevice(info, oneapi_dll, stats, profiler);
|
||||
#else
|
||||
(void)info;
|
||||
(void)stats;
|
||||
(void)profiler;
|
||||
|
||||
LOG(FATAL) << "Requested to create oneAPI device while not enabled for this build.";
|
||||
|
||||
return nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr)
|
||||
{
|
||||
vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
|
||||
|
||||
DeviceInfo info;
|
||||
|
||||
info.type = DEVICE_ONEAPI;
|
||||
info.description = name;
|
||||
info.num = num;
|
||||
|
||||
/* NOTE(@nsirgien): Should be unique at least on proper oneapi installation. */
|
||||
info.id = id;
|
||||
|
||||
info.has_nanovdb = true;
|
||||
info.denoisers = 0;
|
||||
|
||||
info.has_gpu_queue = true;
|
||||
|
||||
/* NOTE(@nsirgien): oneAPI right now is focused on one device usage. In future it maybe will
|
||||
* change, but right now peer access from one device to another device is not supported. */
|
||||
info.has_peer_memory = false;
|
||||
|
||||
/* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */
|
||||
info.display_device = false;
|
||||
|
||||
devices->push_back(info);
|
||||
VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
|
||||
}
|
||||
#endif
|
||||
|
||||
void device_oneapi_info(vector<DeviceInfo> &devices)
|
||||
{
|
||||
#ifdef WITH_ONEAPI
|
||||
(oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices);
|
||||
#else /* WITH_ONEAPI */
|
||||
(void)devices;
|
||||
#endif /* WITH_ONEAPI */
|
||||
}
|
||||
|
||||
string device_oneapi_capabilities()
|
||||
{
|
||||
string capabilities;
|
||||
#ifdef WITH_ONEAPI
|
||||
char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)();
|
||||
if (c_capabilities) {
|
||||
capabilities = c_capabilities;
|
||||
(oneapi_dll.oneapi_free)(c_capabilities);
|
||||
}
|
||||
#endif
|
||||
return capabilities;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,24 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "util/string.h"
|
||||
#include "util/vector.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class Device;
|
||||
class DeviceInfo;
|
||||
class Profiler;
|
||||
class Stats;
|
||||
|
||||
bool device_oneapi_init();
|
||||
|
||||
Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
|
||||
|
||||
void device_oneapi_info(vector<DeviceInfo> &devices);
|
||||
|
||||
string device_oneapi_capabilities();
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,426 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
# include "device/oneapi/device_impl.h"
|
||||
|
||||
# include "util/debug.h"
|
||||
# include "util/log.h"
|
||||
|
||||
# include "kernel/device/oneapi/kernel.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
static void queue_error_cb(const char *message, void *user_ptr)
|
||||
{
|
||||
if (user_ptr) {
|
||||
*reinterpret_cast<std::string *>(user_ptr) = message;
|
||||
}
|
||||
}
|
||||
|
||||
OneapiDevice::OneapiDevice(const DeviceInfo &info,
|
||||
OneAPIDLLInterface &oneapi_dll_object,
|
||||
Stats &stats,
|
||||
Profiler &profiler)
|
||||
: Device(info, stats, profiler),
|
||||
device_queue_(nullptr),
|
||||
texture_info_(this, "texture_info", MEM_GLOBAL),
|
||||
kg_memory_(nullptr),
|
||||
kg_memory_device_(nullptr),
|
||||
kg_memory_size_(0),
|
||||
oneapi_dll_(oneapi_dll_object)
|
||||
{
|
||||
need_texture_info_ = false;
|
||||
|
||||
oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
|
||||
|
||||
/* Oneapi calls should be initialised on this moment. */
|
||||
assert(oneapi_dll_.oneapi_create_queue != nullptr);
|
||||
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI queue initialization error: got runtime exception \"" +
|
||||
oneapi_error_string_ + "\"");
|
||||
}
|
||||
else {
|
||||
VLOG_DEBUG << "oneAPI queue has been successfully created for the device \""
|
||||
<< info.description << "\"";
|
||||
assert(device_queue_);
|
||||
}
|
||||
|
||||
size_t globals_segment_size;
|
||||
is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI constant memory initialization got runtime exception \"" +
|
||||
oneapi_error_string_ + "\"");
|
||||
}
|
||||
else {
|
||||
VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
|
||||
}
|
||||
|
||||
kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
|
||||
oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
|
||||
|
||||
kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
|
||||
|
||||
kg_memory_size_ = globals_segment_size;
|
||||
}
|
||||
|
||||
OneapiDevice::~OneapiDevice()
|
||||
{
|
||||
texture_info_.free();
|
||||
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
|
||||
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
|
||||
|
||||
for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
|
||||
delete mt->second;
|
||||
|
||||
if (device_queue_)
|
||||
oneapi_dll_.oneapi_free_queue(device_queue_);
|
||||
}
|
||||
|
||||
bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
|
||||
{
|
||||
return BVH_LAYOUT_BVH2;
|
||||
}
|
||||
|
||||
bool OneapiDevice::load_kernels(const uint requested_features)
|
||||
{
|
||||
assert(device_queue_);
|
||||
/* NOTE(@nsirgien): oneAPI can support compilation of kernel code with sertain feature set
|
||||
* with specialization constants, but it hasn't been implemented yet. */
|
||||
(void)requested_features;
|
||||
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
|
||||
}
|
||||
else {
|
||||
VLOG_INFO << "Runtime compilation done for \"" << info.description << "\"";
|
||||
assert(device_queue_);
|
||||
}
|
||||
return is_finished_ok;
|
||||
}
|
||||
|
||||
void OneapiDevice::load_texture_info()
|
||||
{
|
||||
if (need_texture_info_) {
|
||||
need_texture_info_ = false;
|
||||
texture_info_.copy_to_device();
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::generic_alloc(device_memory &mem)
|
||||
{
|
||||
size_t memory_size = mem.memory_size();
|
||||
|
||||
/* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
|
||||
* we can use USM host memory.
|
||||
* Because of the expected performance impact, implementation of this has had a low priority
|
||||
* and is not implemented yet. */
|
||||
|
||||
assert(device_queue_);
|
||||
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
|
||||
* and shared. For new project it maybe more beneficial to use USM shared memory, because it
|
||||
* provides automatic migration mechanism in order to allow to use the same pointer on host and
|
||||
* on device, without need to worry about explicit memory transfer operations. But for
|
||||
* Blender/Cycles this type of memory is not very suitable in current application architecture,
|
||||
* because Cycles already uses two different pointer for host activity and device activity, and
|
||||
* also has to perform all needed memory transfer operations. So, USM device memory
|
||||
* type has been used for oneAPI device in order to better fit in Cycles architecture. */
|
||||
void *device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
|
||||
if (device_pointer == nullptr) {
|
||||
size_t max_memory_on_device = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
|
||||
set_error("oneAPI kernel - device memory allocation error for " +
|
||||
string_human_readable_size(mem.memory_size()) +
|
||||
", possibly caused by lack of available memory space on the device: " +
|
||||
string_human_readable_size(stats.mem_used) + " of " +
|
||||
string_human_readable_size(max_memory_on_device) + " is already allocated");
|
||||
return;
|
||||
}
|
||||
assert(device_pointer);
|
||||
|
||||
mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
|
||||
mem.device_size = memory_size;
|
||||
|
||||
stats.mem_alloc(memory_size);
|
||||
}
|
||||
|
||||
void OneapiDevice::generic_copy_to(device_memory &mem)
|
||||
{
|
||||
size_t memory_size = mem.memory_size();
|
||||
|
||||
/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
|
||||
assert(mem.host_pointer);
|
||||
assert(device_queue_);
|
||||
oneapi_dll_.oneapi_usm_memcpy(
|
||||
device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
|
||||
}
|
||||
|
||||
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
|
||||
SyclQueue *OneapiDevice::sycl_queue()
|
||||
{
|
||||
return device_queue_;
|
||||
}
|
||||
|
||||
string OneapiDevice::oneapi_error_message()
|
||||
{
|
||||
return string(oneapi_error_string_);
|
||||
}
|
||||
|
||||
OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
|
||||
{
|
||||
return oneapi_dll_;
|
||||
}
|
||||
|
||||
void *OneapiDevice::kernel_globals_device_pointer()
|
||||
{
|
||||
return kg_memory_device_;
|
||||
}
|
||||
|
||||
void OneapiDevice::generic_free(device_memory &mem)
|
||||
{
|
||||
assert(mem.device_pointer);
|
||||
stats.mem_free(mem.device_size);
|
||||
mem.device_size = 0;
|
||||
|
||||
assert(device_queue_);
|
||||
oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
|
||||
mem.device_pointer = 0;
|
||||
}
|
||||
|
||||
void OneapiDevice::mem_alloc(device_memory &mem)
|
||||
{
|
||||
if (mem.type == MEM_TEXTURE) {
|
||||
assert(!"mem_alloc not supported for textures.");
|
||||
}
|
||||
else if (mem.type == MEM_GLOBAL) {
|
||||
assert(!"mem_alloc not supported for global memory.");
|
||||
}
|
||||
else {
|
||||
if (mem.name) {
|
||||
VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")";
|
||||
}
|
||||
generic_alloc(mem);
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::mem_copy_to(device_memory &mem)
|
||||
{
|
||||
if (mem.name) {
|
||||
VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")";
|
||||
}
|
||||
|
||||
if (mem.type == MEM_GLOBAL) {
|
||||
global_free(mem);
|
||||
global_alloc(mem);
|
||||
}
|
||||
else if (mem.type == MEM_TEXTURE) {
|
||||
tex_free((device_texture &)mem);
|
||||
tex_alloc((device_texture &)mem);
|
||||
}
|
||||
else {
|
||||
if (!mem.device_pointer)
|
||||
mem_alloc(mem);
|
||||
|
||||
generic_copy_to(mem);
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
|
||||
{
|
||||
if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
|
||||
assert(!"mem_copy_from not supported for textures.");
|
||||
}
|
||||
else if (mem.host_pointer) {
|
||||
const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
|
||||
const size_t offset = elem * y * w;
|
||||
|
||||
if (mem.name) {
|
||||
VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ") from offset " << offset
|
||||
<< " data " << size << " bytes";
|
||||
}
|
||||
|
||||
assert(device_queue_);
|
||||
|
||||
assert(size != 0);
|
||||
assert(mem.device_pointer);
|
||||
char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
|
||||
char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
|
||||
device_queue_, shifted_host, shifted_device, size);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
|
||||
"\"");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::mem_zero(device_memory &mem)
|
||||
{
|
||||
if (mem.name) {
|
||||
VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")\n";
|
||||
}
|
||||
|
||||
if (!mem.device_pointer) {
|
||||
mem_alloc(mem);
|
||||
}
|
||||
if (!mem.device_pointer) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(device_queue_);
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
|
||||
device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
|
||||
"\"");
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::mem_free(device_memory &mem)
|
||||
{
|
||||
if (mem.name) {
|
||||
VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", "
|
||||
<< string_human_readable_number(mem.device_size) << " bytes. ("
|
||||
<< string_human_readable_size(mem.device_size) << ")\n";
|
||||
}
|
||||
|
||||
if (mem.type == MEM_GLOBAL) {
|
||||
global_free(mem);
|
||||
}
|
||||
else if (mem.type == MEM_TEXTURE) {
|
||||
tex_free((device_texture &)mem);
|
||||
}
|
||||
else {
|
||||
generic_free(mem);
|
||||
}
|
||||
}
|
||||
|
||||
device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
|
||||
{
|
||||
return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
|
||||
mem.memory_elements_size(offset));
|
||||
}
|
||||
|
||||
void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
|
||||
{
|
||||
assert(name);
|
||||
|
||||
VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object "
|
||||
<< string_human_readable_number(size) << " bytes. ("
|
||||
<< string_human_readable_size(size) << ")";
|
||||
|
||||
ConstMemMap::iterator i = const_mem_map_.find(name);
|
||||
device_vector<uchar> *data;
|
||||
|
||||
if (i == const_mem_map_.end()) {
|
||||
data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
|
||||
data->alloc(size);
|
||||
const_mem_map_.insert(ConstMemMap::value_type(name, data));
|
||||
}
|
||||
else {
|
||||
data = i->second;
|
||||
}
|
||||
|
||||
assert(data->memory_size() <= size);
|
||||
memcpy(data->data(), host, size);
|
||||
data->copy_to_device();
|
||||
|
||||
oneapi_dll_.oneapi_set_global_memory(
|
||||
device_queue_, kg_memory_, name, (void *)data->device_pointer);
|
||||
|
||||
oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
|
||||
}
|
||||
|
||||
void OneapiDevice::global_alloc(device_memory &mem)
|
||||
{
|
||||
assert(mem.name);
|
||||
|
||||
size_t size = mem.memory_size();
|
||||
VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
|
||||
<< string_human_readable_number(size) << " bytes. ("
|
||||
<< string_human_readable_size(size) << ")";
|
||||
|
||||
generic_alloc(mem);
|
||||
generic_copy_to(mem);
|
||||
|
||||
oneapi_dll_.oneapi_set_global_memory(
|
||||
device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
|
||||
|
||||
oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
|
||||
}
|
||||
|
||||
void OneapiDevice::global_free(device_memory &mem)
|
||||
{
|
||||
if (mem.device_pointer) {
|
||||
generic_free(mem);
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::tex_alloc(device_texture &mem)
|
||||
{
|
||||
generic_alloc(mem);
|
||||
generic_copy_to(mem);
|
||||
|
||||
/* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */
|
||||
const uint slot = mem.slot;
|
||||
if (slot >= texture_info_.size()) {
|
||||
texture_info_.resize(slot + 128);
|
||||
}
|
||||
|
||||
texture_info_[slot] = mem.info;
|
||||
need_texture_info_ = true;
|
||||
|
||||
texture_info_[slot].data = (uint64_t)mem.device_pointer;
|
||||
}
|
||||
|
||||
void OneapiDevice::tex_free(device_texture &mem)
|
||||
{
|
||||
/* There is no texture memory in SYCL. */
|
||||
if (mem.device_pointer) {
|
||||
generic_free(mem);
|
||||
}
|
||||
}
|
||||
|
||||
unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
|
||||
{
|
||||
return make_unique<OneapiDeviceQueue>(this);
|
||||
}
|
||||
|
||||
bool OneapiDevice::should_use_graphics_interop()
|
||||
{
|
||||
/* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
|
||||
* return false. */
|
||||
return false;
|
||||
}
|
||||
|
||||
void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
|
||||
{
|
||||
assert(device_queue_);
|
||||
return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
|
||||
}
|
||||
|
||||
void OneapiDevice::usm_free(void *usm_ptr)
|
||||
{
|
||||
assert(device_queue_);
|
||||
return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
|
@ -0,0 +1,100 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
# include "device/device.h"
|
||||
# include "device/oneapi/device.h"
|
||||
# include "device/oneapi/queue.h"
|
||||
|
||||
# include "util/map.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class DeviceQueue;
|
||||
|
||||
class OneapiDevice : public Device {
|
||||
private:
|
||||
SyclQueue *device_queue_;
|
||||
|
||||
using ConstMemMap = map<string, device_vector<uchar> *>;
|
||||
ConstMemMap const_mem_map_;
|
||||
device_vector<TextureInfo> texture_info_;
|
||||
bool need_texture_info_;
|
||||
void *kg_memory_;
|
||||
void *kg_memory_device_;
|
||||
size_t kg_memory_size_ = (size_t)0;
|
||||
OneAPIDLLInterface oneapi_dll_;
|
||||
std::string oneapi_error_string_;
|
||||
|
||||
public:
|
||||
virtual BVHLayoutMask get_bvh_layout_mask() const override;
|
||||
|
||||
OneapiDevice(const DeviceInfo &info,
|
||||
OneAPIDLLInterface &oneapi_dll_object,
|
||||
Stats &stats,
|
||||
Profiler &profiler);
|
||||
|
||||
virtual ~OneapiDevice();
|
||||
|
||||
bool check_peer_access(Device *peer_device) override;
|
||||
|
||||
bool load_kernels(const uint requested_features) override;
|
||||
|
||||
void load_texture_info();
|
||||
|
||||
void generic_alloc(device_memory &mem);
|
||||
|
||||
void generic_copy_to(device_memory &mem);
|
||||
|
||||
void generic_free(device_memory &mem);
|
||||
|
||||
SyclQueue *sycl_queue();
|
||||
|
||||
string oneapi_error_message();
|
||||
|
||||
OneAPIDLLInterface oneapi_dll_object();
|
||||
|
||||
void *kernel_globals_device_pointer();
|
||||
|
||||
void mem_alloc(device_memory &mem) override;
|
||||
|
||||
void mem_copy_to(device_memory &mem) override;
|
||||
|
||||
void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
|
||||
|
||||
void mem_copy_from(device_memory &mem)
|
||||
{
|
||||
mem_copy_from(mem, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
void mem_zero(device_memory &mem) override;
|
||||
|
||||
void mem_free(device_memory &mem) override;
|
||||
|
||||
device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
|
||||
|
||||
virtual void const_copy_to(const char *name, void *host, size_t size) override;
|
||||
|
||||
void global_alloc(device_memory &mem);
|
||||
|
||||
void global_free(device_memory &mem);
|
||||
|
||||
void tex_alloc(device_texture &mem);
|
||||
|
||||
void tex_free(device_texture &mem);
|
||||
|
||||
/* Graphics resources interoperability. */
|
||||
virtual bool should_use_graphics_interop() override;
|
||||
|
||||
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
|
||||
|
||||
/* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
|
||||
* side compilation (MSVC). */
|
||||
void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
|
||||
void usm_free(void *usm_ptr);
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
|
@ -0,0 +1,17 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#pragma once
|
||||
|
||||
/* Include kernel header to get access to sycl-specific types, like SyclQueue and
|
||||
* OneAPIDeviceIteratorCallback. */
|
||||
#include "kernel/device/oneapi/kernel.h"
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
struct OneAPIDLLInterface {
|
||||
# define DLL_INTERFACE_CALL(function, return_type, ...) \
|
||||
return_type (*function)(__VA_ARGS__) = nullptr;
|
||||
# include "kernel/device/oneapi/dll_interface_template.h"
|
||||
# undef DLL_INTERFACE_CALL
|
||||
};
|
||||
#endif
|
|
@ -0,0 +1,165 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
# include "device/oneapi/queue.h"
|
||||
# include "device/oneapi/device_impl.h"
|
||||
# include "util/log.h"
|
||||
# include "util/time.h"
|
||||
# include <iomanip>
|
||||
# include <vector>
|
||||
|
||||
# include "kernel/device/oneapi/kernel.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
struct KernelExecutionInfo {
|
||||
double elapsed_summary = 0.0;
|
||||
int enqueue_count = 0;
|
||||
};
|
||||
|
||||
/* OneapiDeviceQueue */
|
||||
|
||||
OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
|
||||
: DeviceQueue(device),
|
||||
oneapi_device_(device),
|
||||
oneapi_dll_(device->oneapi_dll_object()),
|
||||
kernel_context_(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
OneapiDeviceQueue::~OneapiDeviceQueue()
|
||||
{
|
||||
delete kernel_context_;
|
||||
}
|
||||
|
||||
int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
|
||||
{
|
||||
int num_states;
|
||||
|
||||
/* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */
|
||||
const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
|
||||
oneapi_device_->sycl_queue());
|
||||
if (compute_units >= 128) {
|
||||
/* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */
|
||||
int base = 1024 * 1024;
|
||||
/* linear dependency (with coefficient less that 1) from amount of compute units. */
|
||||
num_states = (base * (compute_units / 128)) * 3 / 4;
|
||||
|
||||
/* Limit amount of integrator states by one quarter of device memory, because
|
||||
* other allocations will need some space as well
|
||||
* TODO: base this calculation on the how many states what the GPU is actually capable of
|
||||
* running, with some headroom to improve occupancy. If the texture don't fit, offload into
|
||||
* unified memory. */
|
||||
size_t states_memory_size = num_states * state_size;
|
||||
size_t device_memory_amount =
|
||||
(oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue());
|
||||
if (states_memory_size >= device_memory_amount / 4) {
|
||||
num_states = device_memory_amount / 4 / state_size;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU
|
||||
* memory. */
|
||||
num_states = 1024 * 512;
|
||||
}
|
||||
|
||||
VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
|
||||
<< string_human_readable_size(num_states * state_size);
|
||||
|
||||
return num_states;
|
||||
}
|
||||
|
||||
int OneapiDeviceQueue::num_concurrent_busy_states() const
|
||||
{
|
||||
const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
|
||||
oneapi_device_->sycl_queue());
|
||||
if (compute_units >= 128) {
|
||||
return 1024 * 1024;
|
||||
}
|
||||
else {
|
||||
return 1024 * 512;
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDeviceQueue::init_execution()
|
||||
{
|
||||
oneapi_device_->load_texture_info();
|
||||
|
||||
SyclQueue *device_queue = oneapi_device_->sycl_queue();
|
||||
void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
|
||||
assert(device_queue);
|
||||
assert(kg_dptr);
|
||||
kernel_context_ = new KernelContext{device_queue, kg_dptr};
|
||||
|
||||
debug_init_execution();
|
||||
}
|
||||
|
||||
bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
|
||||
const int signed_kernel_work_size,
|
||||
DeviceKernelArguments const &_args)
|
||||
{
|
||||
if (oneapi_device_->have_error()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void **args = const_cast<void **>(_args.values);
|
||||
|
||||
debug_enqueue(kernel, signed_kernel_work_size);
|
||||
assert(signed_kernel_work_size >= 0);
|
||||
size_t kernel_work_size = (size_t)signed_kernel_work_size;
|
||||
|
||||
size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
|
||||
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
|
||||
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
|
||||
|
||||
assert(kernel_context_);
|
||||
|
||||
/* Call the oneAPI kernel DLL to launch the requested kernel. */
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
|
||||
kernel_context_, kernel, uniformed_kernel_work_size, args);
|
||||
|
||||
if (is_finished_ok == false) {
|
||||
oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) +
|
||||
"\" execution error: got runtime exception \"" +
|
||||
oneapi_device_->oneapi_error_message() + "\"");
|
||||
}
|
||||
|
||||
return is_finished_ok;
|
||||
}
|
||||
|
||||
bool OneapiDeviceQueue::synchronize()
|
||||
{
|
||||
if (oneapi_device_->have_error()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
|
||||
if (is_finished_ok == false)
|
||||
oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
|
||||
oneapi_device_->oneapi_error_message() + "\"");
|
||||
|
||||
debug_synchronize();
|
||||
|
||||
return !(oneapi_device_->have_error());
|
||||
}
|
||||
|
||||
void OneapiDeviceQueue::zero_to_device(device_memory &mem)
|
||||
{
|
||||
oneapi_device_->mem_zero(mem);
|
||||
}
|
||||
|
||||
void OneapiDeviceQueue::copy_to_device(device_memory &mem)
|
||||
{
|
||||
oneapi_device_->mem_copy_to(mem);
|
||||
}
|
||||
|
||||
void OneapiDeviceQueue::copy_from_device(device_memory &mem)
|
||||
{
|
||||
oneapi_device_->mem_copy_from(mem);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_ONEAPI */
|
|
@ -0,0 +1,51 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
# include "device/kernel.h"
|
||||
# include "device/memory.h"
|
||||
# include "device/queue.h"
|
||||
|
||||
# include "device/oneapi/device.h"
|
||||
# include "device/oneapi/dll_interface.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class OneapiDevice;
|
||||
class device_memory;
|
||||
|
||||
/* Base class for Oneapi queues. */
|
||||
class OneapiDeviceQueue : public DeviceQueue {
|
||||
public:
|
||||
explicit OneapiDeviceQueue(OneapiDevice *device);
|
||||
~OneapiDeviceQueue();
|
||||
|
||||
virtual int num_concurrent_states(const size_t state_size) const override;
|
||||
|
||||
virtual int num_concurrent_busy_states() const override;
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
||||
virtual bool enqueue(DeviceKernel kernel,
|
||||
const int kernel_work_size,
|
||||
DeviceKernelArguments const &args) override;
|
||||
|
||||
virtual bool synchronize() override;
|
||||
|
||||
virtual void zero_to_device(device_memory &mem) override;
|
||||
virtual void copy_to_device(device_memory &mem) override;
|
||||
virtual void copy_from_device(device_memory &mem) override;
|
||||
|
||||
protected:
|
||||
OneapiDevice *oneapi_device_;
|
||||
OneAPIDLLInterface oneapi_dll_;
|
||||
KernelContext *kernel_context_;
|
||||
bool with_kernel_statistics_;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_ONEAPI */
|
|
@ -1103,6 +1103,8 @@ static const char *device_type_for_description(const DeviceType type)
|
|||
return "OptiX";
|
||||
case DEVICE_HIP:
|
||||
return "HIP";
|
||||
case DEVICE_ONEAPI:
|
||||
return "oneAPI";
|
||||
case DEVICE_DUMMY:
|
||||
return "Dummy";
|
||||
case DEVICE_MULTI:
|
||||
|
|
|
@ -37,6 +37,10 @@ set(SRC_KERNEL_DEVICE_OPTIX
|
|||
device/optix/kernel_shader_raytrace.cu
|
||||
)
|
||||
|
||||
set(SRC_KERNEL_DEVICE_ONEAPI
|
||||
device/oneapi/kernel.cpp
|
||||
)
|
||||
|
||||
set(SRC_KERNEL_DEVICE_CPU_HEADERS
|
||||
device/cpu/compat.h
|
||||
device/cpu/image.h
|
||||
|
@ -78,6 +82,17 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS
|
|||
device/metal/globals.h
|
||||
)
|
||||
|
||||
set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS
|
||||
device/oneapi/compat.h
|
||||
device/oneapi/context_begin.h
|
||||
device/oneapi/context_end.h
|
||||
device/oneapi/device_id.h
|
||||
device/oneapi/globals.h
|
||||
device/oneapi/image.h
|
||||
device/oneapi/kernel.h
|
||||
device/oneapi/kernel_templates.h
|
||||
)
|
||||
|
||||
set(SRC_KERNEL_CLOSURE_HEADERS
|
||||
closure/alloc.h
|
||||
closure/bsdf.h
|
||||
|
@ -687,6 +702,212 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
|
|||
cycles_set_solution_folder(cycles_kernel_optix)
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_ONEAPI)
|
||||
if(WIN32)
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
|
||||
else()
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so)
|
||||
endif()
|
||||
|
||||
set(cycles_oneapi_kernel_sources
|
||||
${SRC_KERNEL_DEVICE_ONEAPI}
|
||||
${SRC_KERNEL_HEADERS}
|
||||
${SRC_KERNEL_DEVICE_GPU_HEADERS}
|
||||
${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}
|
||||
${SRC_UTIL_HEADERS}
|
||||
)
|
||||
|
||||
# SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options
|
||||
set(sycl_compiler_flags
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI}
|
||||
-fsycl
|
||||
-fsycl-unnamed-lambda
|
||||
-fdelayed-template-parsing
|
||||
-mllvm -inlinedefault-threshold=300
|
||||
-mllvm -inlinehint-threshold=400
|
||||
-shared
|
||||
-DWITH_ONEAPI
|
||||
-ffast-math
|
||||
-DNDEBUG
|
||||
-O2
|
||||
-o ${cycles_kernel_oneapi_lib}
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR}/..
|
||||
-I${LEVEL_ZERO_INCLUDE_DIR}
|
||||
${LEVEL_ZERO_LIBRARY}
|
||||
${SYCL_CPP_FLAGS}
|
||||
)
|
||||
|
||||
|
||||
if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
|
||||
list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
|
||||
endif()
|
||||
|
||||
# Set defaults for spir64 and spir64_gen options
|
||||
if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
|
||||
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
|
||||
endif()
|
||||
if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen)
|
||||
SET (CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target")
|
||||
endif()
|
||||
# enabling zebin (graphics binary format with improved compatibility) on Windows only while support on Linux isn't available yet
|
||||
if(WIN32)
|
||||
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ")
|
||||
endif()
|
||||
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
|
||||
|
||||
if (WITH_CYCLES_ONEAPI_BINARIES)
|
||||
# Iterate over all targest and their options
|
||||
list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string)
|
||||
list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
|
||||
foreach(target ${CYCLES_ONEAPI_SYCL_TARGETS})
|
||||
if(DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_${target})
|
||||
list (APPEND sycl_compiler_flags -Xsycl-target-backend=${target} "${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}")
|
||||
endif()
|
||||
endforeach()
|
||||
else()
|
||||
# If AOT is disabled, build for spir64
|
||||
list(APPEND sycl_compiler_flags
|
||||
-fsycl-targets=spir64
|
||||
-Xsycl-target-backend=spir64 "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}")
|
||||
endif()
|
||||
|
||||
if(WITH_NANOVDB)
|
||||
list(APPEND sycl_compiler_flags
|
||||
-DWITH_NANOVDB
|
||||
-I"${NANOVDB_INCLUDE_DIR}")
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEBUG)
|
||||
list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG)
|
||||
endif()
|
||||
|
||||
get_filename_component(sycl_compiler_root ${SYCL_COMPILER} DIRECTORY)
|
||||
get_filename_component(sycl_compiler_compiler_name ${SYCL_COMPILER} NAME_WE)
|
||||
|
||||
if(NOT OCLOC_INSTALL_DIR)
|
||||
get_filename_component(OCLOC_INSTALL_DIR "${sycl_compiler_root}/../lib/ocloc" ABSOLUTE)
|
||||
endif()
|
||||
if(WITH_CYCLES_ONEAPI_BINARIES AND NOT EXISTS ${OCLOC_INSTALL_DIR})
|
||||
message(FATAL_ERROR "WITH_CYCLES_ONEAPI_BINARIES requires ocloc but ${OCLOC_INSTALL_DIR} directory doesn't exist."
|
||||
" A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.")
|
||||
endif()
|
||||
|
||||
if(UNIX AND NOT APPLE)
|
||||
if(NOT WITH_CXX11_ABI)
|
||||
check_library_exists(sycl
|
||||
_ZN2cl4sycl7handler22verifyUsedKernelBundleERKSs ${sycl_compiler_root}/../lib SYCL_NO_CXX11_ABI)
|
||||
if(SYCL_NO_CXX11_ABI)
|
||||
list(APPEND sycl_compiler_flags -D_GLIBCXX_USE_CXX11_ABI=0)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
list(APPEND sycl_compiler_flags
|
||||
-fms-extensions
|
||||
-fms-compatibility
|
||||
-D_WINDLL
|
||||
-D_MBCS
|
||||
-DWIN32
|
||||
-D_WINDOWS
|
||||
-D_CRT_NONSTDC_NO_DEPRECATE
|
||||
-D_CRT_SECURE_NO_DEPRECATE
|
||||
-DONEAPI_EXPORT)
|
||||
|
||||
if(sycl_compiler_compiler_name MATCHES "dpcpp")
|
||||
# The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables.
|
||||
add_custom_command(
|
||||
OUTPUT ${cycles_kernel_oneapi_lib}
|
||||
COMMAND "${sycl_compiler_root}/../../env/vars.bat"
|
||||
COMMAND ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
|
||||
DEPENDS ${cycles_oneapi_kernel_sources})
|
||||
else()
|
||||
# The open source SYCL compiler just goes by clang++ and does not have such a script.
|
||||
# Set the variables manually.
|
||||
string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR})
|
||||
if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows
|
||||
get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY)
|
||||
string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir})
|
||||
get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE)
|
||||
else()
|
||||
set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION})
|
||||
endif()
|
||||
list(APPEND sycl_compiler_flags
|
||||
-L "${MSVC_TOOLS_DIR}/lib/x64"
|
||||
-L "${WINDOWS_KIT_DIR}/um/x64"
|
||||
-L "${WINDOWS_KIT_DIR}/ucrt/x64")
|
||||
add_custom_command(
|
||||
OUTPUT ${cycles_kernel_oneapi_lib}
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
"LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib
|
||||
"PATH=${OCLOC_INSTALL_DIR};${sycl_compiler_root}"
|
||||
${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
|
||||
DEPENDS ${cycles_oneapi_kernel_sources})
|
||||
endif()
|
||||
else()
|
||||
list(APPEND sycl_compiler_flags -fPIC)
|
||||
|
||||
# avoid getting __FAST_MATH__ to be defined for the graphics compiler on CentOS 7 until the compile-time issue it triggers gets fixed.
|
||||
if(WITH_CYCLES_ONEAPI_BINARIES)
|
||||
list(APPEND sycl_compiler_flags -fhonor-nans)
|
||||
endif()
|
||||
|
||||
# add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and
|
||||
# libpi_level_zero.so can be placed next to it and get found.
|
||||
list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN')
|
||||
|
||||
# The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables.
|
||||
if(sycl_compiler_compiler_name MATCHES "dpcpp")
|
||||
add_custom_command(
|
||||
OUTPUT ${cycles_kernel_oneapi_lib}
|
||||
COMMAND bash -c \"source ${sycl_compiler_root}/../../env/vars.sh&&${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}\"
|
||||
DEPENDS ${cycles_oneapi_kernel_sources})
|
||||
else()
|
||||
# The open source SYCL compiler just goes by clang++ and does not have such a script.
|
||||
# Set the variables manually.
|
||||
if(NOT IGC_INSTALL_DIR)
|
||||
get_filename_component(IGC_INSTALL_DIR "${sycl_compiler_root}/../lib/igc" ABSOLUTE)
|
||||
endif()
|
||||
add_custom_command(
|
||||
OUTPUT ${cycles_kernel_oneapi_lib}
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
"LD_LIBRARY_PATH=${sycl_compiler_root}/../lib:${OCLOC_INSTALL_DIR}/lib:${IGC_INSTALL_DIR}/lib"
|
||||
"PATH=${OCLOC_INSTALL_DIR}/bin:${sycl_compiler_root}:$ENV{PATH}" # env PATH is for compiler to find ld
|
||||
${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
|
||||
DEPENDS ${cycles_oneapi_kernel_sources})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# install dynamic libraries required at runtime
|
||||
if(WIN32)
|
||||
set(SYCL_RUNTIME_DEPENDENCIES
|
||||
sycl.dll
|
||||
pi_level_zero.dll
|
||||
)
|
||||
if(NOT WITH_BLENDER)
|
||||
# For the Cycles standalone put libraries next to the Cycles application.
|
||||
delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH})
|
||||
else()
|
||||
# For Blender put the libraries next to the Blender executable.
|
||||
#
|
||||
# Note that the installation path in the delayed_install is relative to the versioned folder,
|
||||
# which means we need to go one level up.
|
||||
delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../")
|
||||
endif()
|
||||
elseif(UNIX AND NOT APPLE)
|
||||
file(GLOB SYCL_RUNTIME_DEPENDENCIES
|
||||
${sycl_compiler_root}/../lib/libsycl.so
|
||||
${sycl_compiler_root}/../lib/libsycl.so.[0-9]
|
||||
${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
|
||||
)
|
||||
list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so)
|
||||
delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
endif()
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib})
|
||||
endif()
|
||||
|
||||
# OSL module
|
||||
|
||||
if(WITH_CYCLES_OSL)
|
||||
|
@ -752,6 +973,7 @@ cycles_add_library(cycles_kernel "${LIB}"
|
|||
${SRC_KERNEL_DEVICE_HIP_HEADERS}
|
||||
${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
|
||||
${SRC_KERNEL_DEVICE_METAL_HEADERS}
|
||||
${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}
|
||||
)
|
||||
|
||||
source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
|
||||
|
@ -764,6 +986,7 @@ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
|
|||
source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
|
||||
source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
|
||||
source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
|
||||
source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS})
|
||||
source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
|
||||
source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
|
||||
source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
|
||||
|
@ -782,6 +1005,9 @@ endif()
|
|||
if(WITH_CYCLES_HIP)
|
||||
add_dependencies(cycles_kernel cycles_kernel_hip)
|
||||
endif()
|
||||
if(WITH_CYCLES_DEVICE_ONEAPI)
|
||||
add_dependencies(cycles_kernel cycles_kernel_oneapi)
|
||||
endif()
|
||||
|
||||
# Install kernel source for runtime compilation
|
||||
|
||||
|
|
|
@ -14,6 +14,8 @@
|
|||
|
||||
#ifdef __KERNEL_METAL__
|
||||
# include "kernel/device/metal/context_begin.h"
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
# include "kernel/device/oneapi/context_begin.h"
|
||||
#endif
|
||||
|
||||
#include "kernel/device/gpu/work_stealing.h"
|
||||
|
@ -40,6 +42,8 @@
|
|||
|
||||
#ifdef __KERNEL_METAL__
|
||||
# include "kernel/device/metal/context_end.h"
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
# include "kernel/device/oneapi/context_end.h"
|
||||
#endif
|
||||
|
||||
#include "kernel/film/read.h"
|
||||
|
|
|
@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
|
|||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_METAL__
|
||||
/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
|
||||
* ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
|
||||
* and keep device specific code in compat.h */
|
||||
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
template<typename IsActiveOp>
|
||||
void cpu_serial_active_index_array_impl(const uint num_states,
|
||||
ccl_global int *ccl_restrict indices,
|
||||
ccl_global int *ccl_restrict num_indices,
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
int write_index = 0;
|
||||
for (int state_index = 0; state_index < num_states; state_index++) {
|
||||
if (is_active_op(state_index))
|
||||
indices[write_index++] = state_index;
|
||||
}
|
||||
*num_indices = write_index;
|
||||
return;
|
||||
}
|
||||
# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
|
||||
|
||||
template<typename IsActiveOp>
|
||||
void gpu_parallel_active_index_array_impl(const uint num_states,
|
||||
ccl_global int *ccl_restrict indices,
|
||||
ccl_global int *ccl_restrict num_indices,
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
|
||||
const uint blocksize = item_id.get_local_range(0);
|
||||
|
||||
sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
|
||||
sycl::access::address_space::local_space>
|
||||
ptr = sycl::ext::oneapi::group_local_memory<
|
||||
int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
|
||||
int *warp_offset = *ptr;
|
||||
|
||||
/* NOTE(@nsirgien): Here we calculate the same value as below but
|
||||
* faster for DPC++ : seems CUDA converting "%", "/", "*" based calculations below into
|
||||
* something faster already but DPC++ doesn't, so it's better to use
|
||||
* direct request of needed parameters - switching from this computation to computation below
|
||||
* will cause 2.5x performance slowdown. */
|
||||
const uint thread_index = item_id.get_local_id(0);
|
||||
const uint thread_warp = item_id.get_sub_group().get_local_id();
|
||||
|
||||
const uint warp_index = item_id.get_sub_group().get_group_id();
|
||||
const uint num_warps = item_id.get_sub_group().get_group_range()[0];
|
||||
|
||||
const uint state_index = item_id.get_global_id(0);
|
||||
|
||||
/* Test if state corresponding to this thread is active. */
|
||||
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
|
||||
#else /* !__KERNEL__ONEAPI__ */
|
||||
# ifndef __KERNEL_METAL__
|
||||
template<uint blocksize, typename IsActiveOp>
|
||||
__device__
|
||||
#endif
|
||||
# endif
|
||||
void
|
||||
gpu_parallel_active_index_array_impl(const uint num_states,
|
||||
ccl_global int *indices,
|
||||
ccl_global int *num_indices,
|
||||
#ifdef __KERNEL_METAL__
|
||||
# ifdef __KERNEL_METAL__
|
||||
const uint is_active,
|
||||
const uint blocksize,
|
||||
const int thread_index,
|
||||
|
@ -37,7 +90,7 @@ __device__
|
|||
const int num_warps,
|
||||
threadgroup int *warp_offset)
|
||||
{
|
||||
#else
|
||||
# else
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
extern ccl_gpu_shared int warp_offset[];
|
||||
|
@ -52,18 +105,33 @@ __device__
|
|||
|
||||
/* Test if state corresponding to this thread is active. */
|
||||
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
|
||||
#endif
|
||||
|
||||
# endif
|
||||
#endif /* !__KERNEL_ONEAPI__ */
|
||||
/* For each thread within a warp compute how many other active states precede it. */
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
const uint thread_offset = sycl::exclusive_scan_over_group(
|
||||
item_id.get_sub_group(), is_active, std::plus<>());
|
||||
#else
|
||||
const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
|
||||
ccl_gpu_thread_mask(thread_warp));
|
||||
#endif
|
||||
|
||||
/* Last thread in warp stores number of active states for each warp. */
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
|
||||
#else
|
||||
if (thread_warp == ccl_gpu_warp_size - 1) {
|
||||
#endif
|
||||
warp_offset[warp_index] = thread_offset + is_active;
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
/* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
|
||||
* so faster local barriers can be used. */
|
||||
ccl_gpu_local_syncthreads();
|
||||
#else
|
||||
ccl_gpu_syncthreads();
|
||||
#endif
|
||||
|
||||
/* Last thread in block converts per-warp sizes to offsets, increments global size of
|
||||
* index array and gets offset to write to. */
|
||||
|
@ -80,7 +148,13 @@ __device__
|
|||
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
/* NOTE(@nsirgien): For us here only important local memory writing (warp_offset),
|
||||
* so faster local barriers can be used. */
|
||||
ccl_gpu_local_syncthreads();
|
||||
#else
|
||||
ccl_gpu_syncthreads();
|
||||
#endif
|
||||
|
||||
/* Write to index array. */
|
||||
if (is_active) {
|
||||
|
@ -107,7 +181,19 @@ __device__
|
|||
simd_group_index, \
|
||||
num_simd_groups, \
|
||||
simdgroup_offset)
|
||||
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
# define gpu_parallel_active_index_array( \
|
||||
blocksize, num_states, indices, num_indices, is_active_op) \
|
||||
if (ccl_gpu_global_size_x() == 1) \
|
||||
cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
|
||||
else \
|
||||
gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
|
||||
# else
|
||||
# define gpu_parallel_active_index_array( \
|
||||
blocksize, num_states, indices, num_indices, is_active_op) \
|
||||
gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
|
||||
# endif
|
||||
#else
|
||||
|
||||
# define gpu_parallel_active_index_array( \
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#pragma once
|
||||
|
||||
#define __KERNEL_GPU__
|
||||
#define __KERNEL_ONEAPI__
|
||||
|
||||
#define CCL_NAMESPACE_BEGIN
|
||||
#define CCL_NAMESPACE_END
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#ifndef __NODES_MAX_GROUP__
|
||||
# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
|
||||
#endif
|
||||
#ifndef __NODES_FEATURES__
|
||||
# define __NODES_FEATURES__ NODE_FEATURE_ALL
|
||||
#endif
|
||||
|
||||
/* This one does not have an abstraction.
|
||||
* It's used by other devices directly.
|
||||
*/
|
||||
|
||||
#define __device__
|
||||
|
||||
/* Qualifier wrappers for different names on different devices */
|
||||
|
||||
#define ccl_device
|
||||
#define ccl_global
|
||||
#define ccl_always_inline __attribute__((always_inline))
|
||||
#define ccl_device_inline inline
|
||||
#define ccl_noinline
|
||||
#define ccl_inline_constant const constexpr
|
||||
#define ccl_static_constant const
|
||||
#define ccl_device_forceinline __attribute__((always_inline))
|
||||
#define ccl_device_noinline ccl_device ccl_noinline
|
||||
#define ccl_device_noinline_cpu ccl_device
|
||||
#define ccl_device_inline_method ccl_device
|
||||
#define ccl_restrict __restrict__
|
||||
#define ccl_loop_no_unroll
|
||||
#define ccl_optional_struct_init
|
||||
#define ccl_private
|
||||
#define ATTR_FALLTHROUGH __attribute__((fallthrough))
|
||||
#define ccl_constant const
|
||||
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
|
||||
#define ccl_align(n) __attribute__((aligned(n)))
|
||||
#define kernel_assert(cond)
|
||||
#define ccl_may_alias
|
||||
|
||||
/* clang-format off */
|
||||
|
||||
/* kernel.h adapters */
|
||||
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
|
||||
#define ccl_gpu_kernel_threads(block_num_threads)
|
||||
|
||||
#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
# define KG_ND_ITEMS \
|
||||
kg->nd_item_local_id_0 = item.get_local_id(0); \
|
||||
kg->nd_item_local_range_0 = item.get_local_range(0); \
|
||||
kg->nd_item_group_0 = item.get_group(0); \
|
||||
kg->nd_item_group_range_0 = item.get_group_range(0); \
|
||||
kg->nd_item_global_id_0 = item.get_global_id(0); \
|
||||
kg->nd_item_global_range_0 = item.get_global_range(0);
|
||||
#else
|
||||
# define KG_ND_ITEMS
|
||||
#endif
|
||||
|
||||
#define ccl_gpu_kernel_signature(name, ...) \
|
||||
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
||||
size_t kernel_global_size, \
|
||||
size_t kernel_local_size, \
|
||||
sycl::handler &cgh, \
|
||||
__VA_ARGS__) { \
|
||||
(kg); \
|
||||
cgh.parallel_for<class kernel_##name>( \
|
||||
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
|
||||
[=](sycl::nd_item<1> item) { \
|
||||
KG_ND_ITEMS
|
||||
|
||||
#define ccl_gpu_kernel_postfix \
|
||||
}); \
|
||||
}
|
||||
|
||||
#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
|
||||
|
||||
#define ccl_gpu_kernel_lambda(func, ...) \
|
||||
struct KernelLambda \
|
||||
{ \
|
||||
KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \
|
||||
ccl_private const ONEAPIKernelContext *kg; \
|
||||
__VA_ARGS__; \
|
||||
int operator()(const int state) const { return (func); } \
|
||||
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
|
||||
|
||||
/* GPU thread, block, grid size and index */
|
||||
#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
|
||||
# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
|
||||
# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
|
||||
# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
|
||||
# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
|
||||
# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
|
||||
#else
|
||||
# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
|
||||
# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
|
||||
# define ccl_gpu_block_idx_x (kg->nd_item_group_0)
|
||||
# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
|
||||
# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
|
||||
# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
|
||||
#endif
|
||||
|
||||
|
||||
/* GPU warp synchronization */
|
||||
|
||||
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
|
||||
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
#define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
|
||||
#else
|
||||
#define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
|
||||
#endif
|
||||
|
||||
/* Debug defines */
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
# define CONSTANT __attribute__((opencl_constant))
|
||||
#else
|
||||
# define CONSTANT
|
||||
#endif
|
||||
|
||||
#define sycl_printf(format, ...) { \
|
||||
static const CONSTANT char fmt[] = format; \
|
||||
sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \
|
||||
}
|
||||
|
||||
#define sycl_printf_(format) { \
|
||||
static const CONSTANT char fmt[] = format; \
|
||||
sycl::ext::oneapi::experimental::printf(fmt); \
|
||||
}
|
||||
|
||||
/* GPU texture objects */
|
||||
|
||||
/* clang-format on */
|
||||
|
||||
/* Types */
|
||||
/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
|
||||
* because these types have different interfaces from blender version */
|
||||
|
||||
using uchar = unsigned char;
|
||||
using sycl::half;
|
||||
|
||||
struct float3 {
|
||||
float x, y, z;
|
||||
};
|
||||
|
||||
ccl_always_inline float3 make_float3(float x, float y, float z)
|
||||
{
|
||||
return {x, y, z};
|
||||
}
|
||||
ccl_always_inline float3 make_float3(float x)
|
||||
{
|
||||
return {x, x, x};
|
||||
}
|
||||
|
||||
/* math functions */
|
||||
#define fabsf(x) sycl::fabs((x))
|
||||
#define copysignf(x, y) sycl::copysign((x), (y))
|
||||
#define asinf(x) sycl::asin((x))
|
||||
#define acosf(x) sycl::acos((x))
|
||||
#define atanf(x) sycl::atan((x))
|
||||
#define floorf(x) sycl::floor((x))
|
||||
#define ceilf(x) sycl::ceil((x))
|
||||
#define sinhf(x) sycl::sinh((x))
|
||||
#define coshf(x) sycl::cosh((x))
|
||||
#define tanhf(x) sycl::tanh((x))
|
||||
#define hypotf(x, y) sycl::hypot((x), (y))
|
||||
#define atan2f(x, y) sycl::atan2((x), (y))
|
||||
#define fmaxf(x, y) sycl::fmax((x), (y))
|
||||
#define fminf(x, y) sycl::fmin((x), (y))
|
||||
#define fmodf(x, y) sycl::fmod((x), (y))
|
||||
#define lgammaf(x) sycl::lgamma((x))
|
||||
|
||||
#define __forceinline __attribute__((always_inline))
|
||||
|
||||
/* Types */
|
||||
#include "util/half.h"
|
||||
#include "util/types.h"
|
||||
|
||||
/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they
|
||||
* include oneAPI headers, which transitively include math.h headers which will cause redefintions
|
||||
* of the math defines because math.h also uses them and having them defined before math.h include
|
||||
* is actually UB. */
|
||||
/* Use fast math functions - get them from sycl::native namespace for native math function
|
||||
* implementations */
|
||||
#define cosf(x) sycl::native::cos(((float)(x)))
|
||||
#define sinf(x) sycl::native::sin(((float)(x)))
|
||||
#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
|
||||
#define tanf(x) sycl::native::tan(((float)(x)))
|
||||
#define logf(x) sycl::native::log(((float)(x)))
|
||||
#define expf(x) sycl::native::exp(((float)(x)))
|
|
@ -0,0 +1,13 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
# include <nanovdb/NanoVDB.h>
|
||||
# include <nanovdb/util/SampleFromVoxels.h>
|
||||
#endif
|
||||
|
||||
/* clang-format off */
|
||||
struct ONEAPIKernelContext : public KernelGlobalsGPU {
|
||||
public:
|
||||
# include "kernel/device/oneapi/image.h"
|
||||
/* clang-format on */
|
|
@ -0,0 +1,7 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
}
|
||||
; /* end of ONEAPIKernelContext class definition */
|
||||
|
||||
#undef kernel_integrator_state
|
||||
#define kernel_integrator_state (*(kg->integrator_state))
|
|
@ -0,0 +1,11 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#pragma once
|
||||
|
||||
/* from public source :
|
||||
* https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/include/pci_ids/iris_pci_ids.h */
|
||||
const static std::set<uint32_t> intel_arc_alchemist_device_ids = {
|
||||
0x4f80, 0x4f81, 0x4f82, 0x4f83, 0x4f84, 0x4f87, 0x4f88, 0x5690, 0x5691,
|
||||
0x5692, 0x5693, 0x5694, 0x5695, 0x5696, 0x5697, 0x56a0, 0x56a1, 0x56a2,
|
||||
0x56a3, 0x56a4, 0x56a5, 0x56a6, 0x56b0, 0x56b1, 0x56b2, 0x56b3};
|
|
@ -0,0 +1,50 @@
|
|||
/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
|
||||
DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
|
||||
DLL_INTERFACE_CALL(oneapi_free, void, void *)
|
||||
DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
|
||||
DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
|
||||
DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(
|
||||
oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
|
||||
DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
|
||||
DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
|
||||
|
||||
DLL_INTERFACE_CALL(
|
||||
oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
|
||||
DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(oneapi_usm_memset,
|
||||
bool,
|
||||
SyclQueue *queue,
|
||||
void *usm_ptr,
|
||||
unsigned char value,
|
||||
size_t num_bytes)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
|
||||
|
||||
/* Operation with Kernel globals structure - map of global/constant allocation - filled before
|
||||
* render/kernel execution As we don't know in cycles sizeof this - Cycles will manage just as
|
||||
* pointer. */
|
||||
DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
|
||||
DLL_INTERFACE_CALL(oneapi_set_global_memory,
|
||||
void,
|
||||
SyclQueue *queue,
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
|
||||
size_t,
|
||||
SyclQueue *queue,
|
||||
const DeviceKernel kernel,
|
||||
const size_t kernel_global_size)
|
||||
DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
|
||||
bool,
|
||||
KernelContext *context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
void **args)
|
|
@ -0,0 +1,47 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "kernel/integrator/state.h"
|
||||
#include "kernel/types.h"
|
||||
#include "kernel/util/profiling.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* NOTE(@nsirgien): With SYCL we can't declare __constant__ global variable, which will be
|
||||
* accessible from device code, like it has been done for Cycles CUDA backend. So, the backend will
|
||||
* allocate this "constant" memory regions and store pointers to them in oneAPI context class */
|
||||
|
||||
struct IntegratorStateGPU;
|
||||
struct IntegratorQueueCounter;
|
||||
|
||||
typedef struct KernelGlobalsGPU {
|
||||
|
||||
#define KERNEL_DATA_ARRAY(type, name) const type *__##name = nullptr;
|
||||
#include "kernel/data_arrays.h"
|
||||
#undef KERNEL_DATA_ARRAY
|
||||
IntegratorStateGPU *integrator_state;
|
||||
const KernelData *__data;
|
||||
#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
size_t nd_item_local_id_0;
|
||||
size_t nd_item_local_range_0;
|
||||
size_t nd_item_group_0;
|
||||
size_t nd_item_group_range_0;
|
||||
|
||||
size_t nd_item_global_id_0;
|
||||
size_t nd_item_global_range_0;
|
||||
#endif
|
||||
} KernelGlobalsGPU;
|
||||
|
||||
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
|
||||
|
||||
#define kernel_data (*(__data))
|
||||
#define kernel_integrator_state (*(integrator_state))
|
||||
|
||||
/* data lookup defines */
|
||||
|
||||
#define kernel_data_fetch(name, index) __##name[index]
|
||||
#define kernel_data_array(name) __##name
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,385 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* For oneAPI implementation we do manual lookup and interpolation. */
|
||||
/* TODO: share implementation with ../cpu/image.h. */
|
||||
|
||||
template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, int index)
|
||||
{
|
||||
return reinterpret_cast<ccl_global T *>(info.data)[index];
|
||||
}
|
||||
|
||||
ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
|
||||
{
|
||||
x %= width;
|
||||
if (x < 0)
|
||||
x += width;
|
||||
return x;
|
||||
}
|
||||
|
||||
ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
|
||||
{
|
||||
return clamp(x, 0, width - 1);
|
||||
}
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info, int x, int y, int z)
|
||||
{
|
||||
const int data_offset = x + info.width * y + info.width * info.height * z;
|
||||
const int texture_type = info.data_type;
|
||||
|
||||
/* Float4 */
|
||||
if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
|
||||
return tex_fetch<float4>(info, data_offset);
|
||||
}
|
||||
/* Byte4 */
|
||||
else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
|
||||
uchar4 r = tex_fetch<uchar4>(info, data_offset);
|
||||
float f = 1.0f / 255.0f;
|
||||
return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
|
||||
}
|
||||
/* Ushort4 */
|
||||
else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
|
||||
ushort4 r = tex_fetch<ushort4>(info, data_offset);
|
||||
float f = 1.0f / 65535.f;
|
||||
return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
|
||||
}
|
||||
/* Float */
|
||||
else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
|
||||
float f = tex_fetch<float>(info, data_offset);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
/* UShort */
|
||||
else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
|
||||
ushort r = tex_fetch<ushort>(info, data_offset);
|
||||
float f = r * (1.0f / 65535.0f);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
else if (texture_type == IMAGE_DATA_TYPE_HALF) {
|
||||
float f = tex_fetch<half>(info, data_offset);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
|
||||
half4 r = tex_fetch<half4>(info, data_offset);
|
||||
return make_float4(r.x, r.y, r.z, r.w);
|
||||
}
|
||||
/* Byte */
|
||||
else {
|
||||
uchar r = tex_fetch<uchar>(info, data_offset);
|
||||
float f = r * (1.0f / 255.0f);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
/* Wrap */
|
||||
if (info.extension == EXTENSION_REPEAT) {
|
||||
x = svm_image_texture_wrap_periodic(x, info.width);
|
||||
y = svm_image_texture_wrap_periodic(y, info.height);
|
||||
}
|
||||
else {
|
||||
x = svm_image_texture_wrap_clamp(x, info.width);
|
||||
y = svm_image_texture_wrap_clamp(y, info.height);
|
||||
}
|
||||
|
||||
return svm_image_texture_read(info, x, y, 0);
|
||||
}
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
/* Wrap */
|
||||
if (info.extension == EXTENSION_REPEAT) {
|
||||
x = svm_image_texture_wrap_periodic(x, info.width);
|
||||
y = svm_image_texture_wrap_periodic(y, info.height);
|
||||
z = svm_image_texture_wrap_periodic(z, info.depth);
|
||||
}
|
||||
else {
|
||||
x = svm_image_texture_wrap_clamp(x, info.width);
|
||||
y = svm_image_texture_wrap_clamp(y, info.height);
|
||||
z = svm_image_texture_wrap_clamp(z, info.depth);
|
||||
}
|
||||
|
||||
return svm_image_texture_read(info, x, y, z);
|
||||
}
|
||||
|
||||
static float svm_image_texture_frac(float x, int *ix)
|
||||
{
|
||||
int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
|
||||
*ix = i;
|
||||
return x - (float)i;
|
||||
}
|
||||
|
||||
#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
|
||||
{ \
|
||||
u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
|
||||
u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
|
||||
u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
|
||||
u[3] = (1.0f / 6.0f) * t * t * t; \
|
||||
} \
|
||||
(void)0
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float y)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
if (info.extension == EXTENSION_CLIP) {
|
||||
if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
if (info.interpolation == INTERPOLATION_CLOSEST) {
|
||||
/* Closest interpolation. */
|
||||
int ix, iy;
|
||||
svm_image_texture_frac(x * info.width, &ix);
|
||||
svm_image_texture_frac(y * info.height, &iy);
|
||||
|
||||
return svm_image_texture_read_2d(id, ix, iy);
|
||||
}
|
||||
else if (info.interpolation == INTERPOLATION_LINEAR) {
|
||||
/* Bilinear interpolation. */
|
||||
int ix, iy;
|
||||
float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
|
||||
|
||||
float4 r;
|
||||
r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy);
|
||||
r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy);
|
||||
r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1);
|
||||
r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1);
|
||||
return r;
|
||||
}
|
||||
else {
|
||||
/* Bicubic interpolation. */
|
||||
int ix, iy;
|
||||
float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
|
||||
|
||||
float u[4], v[4];
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
|
||||
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
for (int y = 0; y < 4; y++) {
|
||||
for (int x = 0; x < 4; x++) {
|
||||
float weight = u[x] * v[y];
|
||||
r += weight * svm_image_texture_read_2d(id, ix + x - 1, iy + y - 1);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
template<typename T> struct NanoVDBInterpolator {
|
||||
|
||||
typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
|
||||
|
||||
static ccl_always_inline float4 read(float r)
|
||||
{
|
||||
return make_float4(r, r, r, 1.0f);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(nanovdb::Vec3f r)
|
||||
{
|
||||
return make_float4(r[0], r[1], r[2], 1.0f);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
|
||||
float x,
|
||||
float y,
|
||||
float z)
|
||||
{
|
||||
const nanovdb::Vec3f xyz(x, y, z);
|
||||
return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
|
||||
float x,
|
||||
float y,
|
||||
float z)
|
||||
{
|
||||
const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
|
||||
return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
|
||||
}
|
||||
|
||||
static float4 interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
int nix, niy, niz;
|
||||
int pix, piy, piz;
|
||||
int nnix, nniy, nniz;
|
||||
/* Tricubic b-spline interpolation. */
|
||||
const float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
const float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
const float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
pix = ix - 1;
|
||||
piy = iy - 1;
|
||||
piz = iz - 1;
|
||||
nix = ix + 1;
|
||||
niy = iy + 1;
|
||||
niz = iz + 1;
|
||||
nnix = ix + 2;
|
||||
nniy = iy + 2;
|
||||
nniz = iz + 2;
|
||||
|
||||
const int xc[4] = {pix, ix, nix, nnix};
|
||||
const int yc[4] = {piy, iy, niy, nniy};
|
||||
const int zc[4] = {piz, iz, niz, nniz};
|
||||
float u[4], v[4], w[4];
|
||||
|
||||
/* Some helper macro to keep code reasonable size,
|
||||
* let compiler to inline all the matrix multiplications.
|
||||
*/
|
||||
# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
|
||||
# define COL_TERM(col, row) \
|
||||
(v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
|
||||
u[3] * DATA(3, col, row)))
|
||||
# define ROW_TERM(row) \
|
||||
(w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
|
||||
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
|
||||
|
||||
/* Actual interpolation. */
|
||||
return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
|
||||
|
||||
# undef COL_TERM
|
||||
# undef ROW_TERM
|
||||
# undef DATA
|
||||
}
|
||||
|
||||
static ccl_always_inline float4
|
||||
interp_3d(const TextureInfo &info, float x, float y, float z, int interp)
|
||||
{
|
||||
using namespace nanovdb;
|
||||
|
||||
NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
|
||||
AccessorType acc = grid->getAccessor();
|
||||
|
||||
switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
|
||||
case INTERPOLATION_CLOSEST:
|
||||
return interp_3d_closest(acc, x, y, z);
|
||||
case INTERPOLATION_LINEAR:
|
||||
return interp_3d_linear(acc, x, y, z);
|
||||
default:
|
||||
return interp_3d_cubic(acc, x, y, z);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif /* WITH_NANOVDB */
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, int interp)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
if (info.use_transform_3d) {
|
||||
Transform tfm = info.transform_3d;
|
||||
P = transform_point(&tfm, P);
|
||||
}
|
||||
|
||||
float x = P.x;
|
||||
float y = P.y;
|
||||
float z = P.z;
|
||||
|
||||
uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
|
||||
return NanoVDBInterpolator<float>::interp_3d(info, x, y, z, interpolation);
|
||||
}
|
||||
else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
|
||||
return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, x, y, z, interpolation);
|
||||
}
|
||||
else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) {
|
||||
return NanoVDBInterpolator<nanovdb::FpN>::interp_3d(info, x, y, z, interpolation);
|
||||
}
|
||||
else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
|
||||
return NanoVDBInterpolator<nanovdb::Fp16>::interp_3d(info, x, y, z, interpolation);
|
||||
}
|
||||
#else
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
|
||||
info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
|
||||
info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
|
||||
info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
|
||||
return make_float4(
|
||||
TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
if (info.extension == EXTENSION_CLIP) {
|
||||
if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
x *= info.width;
|
||||
y *= info.height;
|
||||
z *= info.depth;
|
||||
}
|
||||
|
||||
if (interpolation == INTERPOLATION_CLOSEST) {
|
||||
/* Closest interpolation. */
|
||||
int ix, iy, iz;
|
||||
svm_image_texture_frac(x, &ix);
|
||||
svm_image_texture_frac(y, &iy);
|
||||
svm_image_texture_frac(z, &iz);
|
||||
|
||||
return svm_image_texture_read_3d(id, ix, iy, iz);
|
||||
}
|
||||
else if (interpolation == INTERPOLATION_LINEAR) {
|
||||
/* Trilinear interpolation. */
|
||||
int ix, iy, iz;
|
||||
float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
|
||||
float4 r;
|
||||
r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz);
|
||||
r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz);
|
||||
r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz);
|
||||
r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz);
|
||||
|
||||
r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1);
|
||||
r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1);
|
||||
r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1);
|
||||
r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1);
|
||||
return r;
|
||||
}
|
||||
else {
|
||||
/* Tricubic interpolation. */
|
||||
int ix, iy, iz;
|
||||
float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
|
||||
float u[4], v[4], w[4];
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
|
||||
|
||||
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
for (int z = 0; z < 4; z++) {
|
||||
for (int y = 0; y < 4; y++) {
|
||||
for (int x = 0; x < 4; x++) {
|
||||
float weight = u[x] * v[y] * w[z];
|
||||
r += weight * svm_image_texture_read_3d(id, ix + x - 1, iy + y - 1, iz + z - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
#undef SET_CUBIC_SPLINE_WEIGHTS
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,884 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
/* clang-format off */
|
||||
# include "kernel.h"
|
||||
# include <iostream>
|
||||
# include <map>
|
||||
# include <set>
|
||||
|
||||
# include <level_zero/ze_api.h>
|
||||
# include <CL/sycl.hpp>
|
||||
# include <ext/oneapi/backend/level_zero.hpp>
|
||||
|
||||
# include "kernel/device/oneapi/compat.h"
|
||||
# include "kernel/device/oneapi/device_id.h"
|
||||
# include "kernel/device/oneapi/globals.h"
|
||||
# include "kernel/device/oneapi/kernel_templates.h"
|
||||
|
||||
# include "kernel/device/gpu/kernel.h"
|
||||
/* clang-format on */
|
||||
|
||||
static OneAPIErrorCallback s_error_cb = nullptr;
|
||||
static void *s_error_user_ptr = nullptr;
|
||||
|
||||
static std::vector<sycl::device> oneapi_available_devices();
|
||||
|
||||
void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
|
||||
{
|
||||
s_error_cb = cb;
|
||||
s_error_user_ptr = user_ptr;
|
||||
}
|
||||
|
||||
void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
|
||||
{
|
||||
# ifdef _DEBUG
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
sycl::info::device_type device_type =
|
||||
queue->get_device().get_info<sycl::info::device::device_type>();
|
||||
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
|
||||
(void)usm_type;
|
||||
assert(usm_type == sycl::usm::alloc::device ||
|
||||
((device_type == sycl::info::device_type::host ||
|
||||
device_type == sycl::info::device_type::is_cpu || allow_host) &&
|
||||
usm_type == sycl::usm::alloc::host));
|
||||
# endif
|
||||
}
|
||||
|
||||
bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
|
||||
{
|
||||
bool finished_correct = true;
|
||||
try {
|
||||
std::vector<sycl::device> devices = oneapi_available_devices();
|
||||
if (device_index < 0 || device_index >= devices.size()) {
|
||||
return false;
|
||||
}
|
||||
sycl::queue *created_queue = new sycl::queue(devices[device_index],
|
||||
sycl::property::queue::in_order());
|
||||
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
finished_correct = false;
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
}
|
||||
return finished_correct;
|
||||
}
|
||||
|
||||
void oneapi_free_queue(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
delete queue;
|
||||
}
|
||||
|
||||
void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
|
||||
}
|
||||
|
||||
void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
return sycl::malloc_device(memory_size, *queue);
|
||||
}
|
||||
|
||||
void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
oneapi_check_usm(queue_, usm_ptr, true);
|
||||
sycl::free(usm_ptr, *queue);
|
||||
}
|
||||
|
||||
bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
oneapi_check_usm(queue_, dest, true);
|
||||
oneapi_check_usm(queue_, src, true);
|
||||
try {
|
||||
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
|
||||
mem_event.wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
oneapi_check_usm(queue_, usm_ptr, true);
|
||||
try {
|
||||
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
|
||||
mem_event.wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool oneapi_queue_synchronize(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
try {
|
||||
queue->wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
|
||||
* also trigger runtime compilation of all existing oneAPI kernels */
|
||||
bool oneapi_run_test_kernel(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
size_t N = 8;
|
||||
sycl::buffer<float, 1> A(N);
|
||||
sycl::buffer<float, 1> B(N);
|
||||
|
||||
{
|
||||
sycl::host_accessor A_host_acc(A, sycl::write_only);
|
||||
for (size_t i = (size_t)0; i < N; i++)
|
||||
A_host_acc[i] = rand() % 32;
|
||||
}
|
||||
|
||||
try {
|
||||
queue->submit([&](sycl::handler &cgh) {
|
||||
sycl::accessor A_acc(A, cgh, sycl::read_only);
|
||||
sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
|
||||
|
||||
cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
|
||||
});
|
||||
queue->wait_and_throw();
|
||||
|
||||
sycl::host_accessor A_host_acc(A, sycl::read_only);
|
||||
sycl::host_accessor B_host_acc(B, sycl::read_only);
|
||||
|
||||
for (size_t i = (size_t)0; i < N; i++) {
|
||||
float result = A_host_acc[i] + B_host_acc[i];
|
||||
(void)result;
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
|
||||
{
|
||||
kernel_global_size = sizeof(KernelGlobalsGPU);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void oneapi_set_global_memory(SyclQueue *queue_,
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer)
|
||||
{
|
||||
assert(queue_);
|
||||
assert(kernel_globals);
|
||||
assert(memory_name);
|
||||
assert(memory_device_pointer);
|
||||
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
|
||||
oneapi_check_usm(queue_, memory_device_pointer);
|
||||
oneapi_check_usm(queue_, kernel_globals, true);
|
||||
|
||||
std::string matched_name(memory_name);
|
||||
|
||||
/* This macro will change global ptr of KernelGlobals via name matching. */
|
||||
# define KERNEL_DATA_ARRAY(type, name) \
|
||||
else if (#name == matched_name) \
|
||||
{ \
|
||||
globals->__##name = (type *)memory_device_pointer; \
|
||||
return; \
|
||||
}
|
||||
if (false) {
|
||||
}
|
||||
else if ("integrator_state" == matched_name) {
|
||||
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
|
||||
return;
|
||||
}
|
||||
KERNEL_DATA_ARRAY(KernelData, data)
|
||||
# include "kernel/data_arrays.h"
|
||||
else
|
||||
{
|
||||
std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
|
||||
<< std::endl;
|
||||
assert(false);
|
||||
}
|
||||
# undef KERNEL_DATA_ARRAY
|
||||
}
|
||||
|
||||
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
|
||||
/* TODO: Move below function to oneapi/queue.cpp. */
|
||||
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
|
||||
const DeviceKernel kernel,
|
||||
const size_t kernel_global_size)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
(void)kernel_global_size;
|
||||
const static size_t preferred_work_group_size_intersect_shading = 32;
|
||||
const static size_t preferred_work_group_size_technical = 1024;
|
||||
|
||||
size_t preferred_work_group_size = 0;
|
||||
switch (kernel) {
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
|
||||
preferred_work_group_size = preferred_work_group_size_intersect_shading;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
|
||||
case DEVICE_KERNEL_INTEGRATOR_RESET:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
|
||||
preferred_work_group_size = preferred_work_group_size_technical;
|
||||
break;
|
||||
|
||||
default:
|
||||
preferred_work_group_size = 512;
|
||||
}
|
||||
|
||||
const size_t limit_work_group_size =
|
||||
queue->get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
return std::min(limit_work_group_size, preferred_work_group_size);
|
||||
}
|
||||
|
||||
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
void **args)
|
||||
{
|
||||
bool success = true;
|
||||
::DeviceKernel device_kernel = (::DeviceKernel)kernel;
|
||||
KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals;
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue);
|
||||
assert(queue);
|
||||
if (!queue) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t local_size = oneapi_kernel_preferred_local_size(
|
||||
kernel_context->queue, device_kernel, global_size);
|
||||
assert(global_size % local_size == 0);
|
||||
|
||||
/* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we
|
||||
* overwrite it outside of oneapi_kernel_preferred_local_size. */
|
||||
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) {
|
||||
local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/* Kernels listed below need a specific number of work groups. */
|
||||
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) {
|
||||
int num_states = *((int *)(args[0]));
|
||||
/* Round up to the next work-group. */
|
||||
size_t groups_count = (num_states + local_size - 1) / local_size;
|
||||
/* NOTE(@nsirgien): As for now non-uniform workgroups don't work on most oneAPI devices, we
|
||||
* extend work size to fit uniformity requirements. */
|
||||
global_size = groups_count * local_size;
|
||||
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
if (queue->get_device().is_host()) {
|
||||
global_size = 1;
|
||||
local_size = 1;
|
||||
}
|
||||
# endif
|
||||
}
|
||||
|
||||
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
|
||||
# if defined(_WIN32)
|
||||
# pragma warning(error : 4062)
|
||||
# elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic error "-Wswitch"
|
||||
# endif
|
||||
|
||||
try {
|
||||
queue->submit([&](sycl::handler &cgh) {
|
||||
switch (device_kernel) {
|
||||
case DEVICE_KERNEL_INTEGRATOR_RESET: {
|
||||
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_intersect_subsurface);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_intersect_volume_stack);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_shade_surface_raytrace);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_queued_shadow_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_terminated_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_terminated_shadow_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_compact_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_compact_shadow_paths_array);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_adaptive_sampling_convergence_check);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: {
|
||||
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_shader_eval_curve_shadow_transparency);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_PREFIX_SUM: {
|
||||
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum);
|
||||
break;
|
||||
}
|
||||
|
||||
/* clang-format off */
|
||||
# define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
|
||||
case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \
|
||||
oneapi_call(kg, cgh, \
|
||||
global_size, \
|
||||
local_size, \
|
||||
args, \
|
||||
oneapi_kernel_film_convert_##variant); \
|
||||
break; \
|
||||
}
|
||||
|
||||
# define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \
|
||||
DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
|
||||
DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba)
|
||||
|
||||
DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH);
|
||||
DEVICE_KERNEL_FILM_CONVERT(mist, MIST);
|
||||
DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
|
||||
DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
|
||||
DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
|
||||
DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
|
||||
DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
|
||||
DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);
|
||||
DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER);
|
||||
DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow,
|
||||
SHADOW_CATCHER_MATTE_WITH_SHADOW);
|
||||
DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED);
|
||||
DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4);
|
||||
|
||||
# undef DEVICE_KERNEL_FILM_CONVERT
|
||||
# undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL
|
||||
/* clang-format on */
|
||||
|
||||
case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_filter_guiding_set_fake_albedo);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_compact_shadow_states);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: {
|
||||
oneapi_call(kg,
|
||||
cgh,
|
||||
global_size,
|
||||
local_size,
|
||||
args,
|
||||
oneapi_kernel_integrator_shadow_catcher_count_possible_splits);
|
||||
break;
|
||||
}
|
||||
/* Unsupported kernels */
|
||||
case DEVICE_KERNEL_NUM:
|
||||
case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
|
||||
assert(0);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Unknown kernel. */
|
||||
assert(0);
|
||||
return false;
|
||||
});
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
# if defined(_WIN32)
|
||||
# pragma warning(default : 4062)
|
||||
# elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic pop
|
||||
# endif
|
||||
return success;
|
||||
}
|
||||
|
||||
static const int lowest_supported_driver_version_win = 1011660;
|
||||
static const int lowest_supported_driver_version_neo = 20066;
|
||||
|
||||
static int parse_driver_build_version(const sycl::device &device)
|
||||
{
|
||||
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
|
||||
int driver_build_version = 0;
|
||||
|
||||
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
|
||||
if (second_dot_position == std::string::npos) {
|
||||
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
|
||||
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
|
||||
<< " xx.xx.xxx.xxxx (Windows) for device \""
|
||||
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
|
||||
}
|
||||
else {
|
||||
try {
|
||||
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
|
||||
if (third_dot_position != std::string::npos) {
|
||||
const std::string &third_number_substr = driver_version.substr(
|
||||
second_dot_position + 1, third_dot_position - second_dot_position - 1);
|
||||
const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
|
||||
if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
|
||||
driver_build_version = std::stoi(third_number_substr) * 10000 +
|
||||
std::stoi(forth_number_substr);
|
||||
}
|
||||
else {
|
||||
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
|
||||
driver_build_version = std::stoi(third_number_substr);
|
||||
}
|
||||
}
|
||||
catch (std::invalid_argument &e) {
|
||||
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
|
||||
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
|
||||
<< " xx.xx.xxx.xxxx (Windows) for device \""
|
||||
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return driver_build_version;
|
||||
}
|
||||
|
||||
static std::vector<sycl::device> oneapi_available_devices()
|
||||
{
|
||||
bool allow_all_devices = false;
|
||||
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
|
||||
allow_all_devices = true;
|
||||
|
||||
/* Host device is useful only for debugging at the moment
|
||||
* so we hide this device with default build settings. */
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
bool allow_host = true;
|
||||
# else
|
||||
bool allow_host = false;
|
||||
# endif
|
||||
|
||||
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
|
||||
|
||||
std::vector<sycl::device> available_devices;
|
||||
for (const sycl::platform &platform : oneapi_platforms) {
|
||||
/* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
|
||||
*/
|
||||
if (platform.get_backend() == sycl::backend::opencl) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices =
|
||||
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
|
||||
platform.get_devices(sycl::info::device_type::gpu);
|
||||
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
if (allow_all_devices) {
|
||||
/* still filter out host device if build doesn't support it. */
|
||||
if (allow_host || !device.is_host()) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
else {
|
||||
bool filter_out = false;
|
||||
|
||||
/* For now we support all Intel(R) Arc(TM) devices
|
||||
* and any future GPU with more than 128 execution units
|
||||
* official support can be broaden to older and smaller GPUs once ready. */
|
||||
if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
|
||||
ze_device_handle_t ze_device = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
|
||||
device);
|
||||
ze_device_properties_t props = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
|
||||
zeDeviceGetProperties(ze_device, &props);
|
||||
bool is_dg2 = (intel_arc_alchemist_device_ids.find(props.deviceId) !=
|
||||
intel_arc_alchemist_device_ids.end());
|
||||
int number_of_eus = props.numEUsPerSubslice * props.numSubslicesPerSlice *
|
||||
props.numSlices;
|
||||
if (!is_dg2 || number_of_eus < 128)
|
||||
filter_out = true;
|
||||
|
||||
/* if not already filtered out, check driver version. */
|
||||
if (!filter_out) {
|
||||
int driver_build_version = parse_driver_build_version(device);
|
||||
if ((driver_build_version > 100000 &&
|
||||
driver_build_version < lowest_supported_driver_version_win) ||
|
||||
(driver_build_version > 0 &&
|
||||
driver_build_version < lowest_supported_driver_version_neo)) {
|
||||
filter_out = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!allow_host && device.is_host()) {
|
||||
filter_out = true;
|
||||
}
|
||||
else if (!allow_all_devices) {
|
||||
filter_out = true;
|
||||
}
|
||||
|
||||
if (!filter_out) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return available_devices;
|
||||
}
|
||||
|
||||
char *oneapi_device_capabilities()
|
||||
{
|
||||
std::stringstream capabilities;
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
const std::string &name = device.get_info<sycl::info::device::name>();
|
||||
|
||||
capabilities << std::string("\t") << name << "\n";
|
||||
# define WRITE_ATTR(attribute_name, attribute_variable) \
|
||||
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
|
||||
<< "\n";
|
||||
# define GET_NUM_ATTR(attribute) \
|
||||
{ \
|
||||
size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
|
||||
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
|
||||
}
|
||||
|
||||
GET_NUM_ATTR(vendor_id)
|
||||
GET_NUM_ATTR(max_compute_units)
|
||||
GET_NUM_ATTR(max_work_item_dimensions)
|
||||
|
||||
sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>();
|
||||
WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
|
||||
WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
|
||||
WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
|
||||
|
||||
GET_NUM_ATTR(max_work_group_size)
|
||||
GET_NUM_ATTR(max_num_sub_groups)
|
||||
GET_NUM_ATTR(sub_group_independent_forward_progress)
|
||||
|
||||
GET_NUM_ATTR(preferred_vector_width_char)
|
||||
GET_NUM_ATTR(preferred_vector_width_short)
|
||||
GET_NUM_ATTR(preferred_vector_width_int)
|
||||
GET_NUM_ATTR(preferred_vector_width_long)
|
||||
GET_NUM_ATTR(preferred_vector_width_float)
|
||||
GET_NUM_ATTR(preferred_vector_width_double)
|
||||
GET_NUM_ATTR(preferred_vector_width_half)
|
||||
|
||||
GET_NUM_ATTR(native_vector_width_char)
|
||||
GET_NUM_ATTR(native_vector_width_short)
|
||||
GET_NUM_ATTR(native_vector_width_int)
|
||||
GET_NUM_ATTR(native_vector_width_long)
|
||||
GET_NUM_ATTR(native_vector_width_float)
|
||||
GET_NUM_ATTR(native_vector_width_double)
|
||||
GET_NUM_ATTR(native_vector_width_half)
|
||||
|
||||
size_t max_clock_frequency =
|
||||
(size_t)(device.is_host() ? (size_t)0 :
|
||||
device.get_info<sycl::info::device::max_clock_frequency>());
|
||||
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
|
||||
|
||||
GET_NUM_ATTR(address_bits)
|
||||
GET_NUM_ATTR(max_mem_alloc_size)
|
||||
|
||||
/* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
|
||||
* supported so we always return false, even if device supports HW texture usage acceleration.
|
||||
*/
|
||||
bool image_support = false;
|
||||
WRITE_ATTR("image_support", (size_t)image_support)
|
||||
|
||||
GET_NUM_ATTR(max_parameter_size)
|
||||
GET_NUM_ATTR(mem_base_addr_align)
|
||||
GET_NUM_ATTR(global_mem_size)
|
||||
GET_NUM_ATTR(local_mem_size)
|
||||
GET_NUM_ATTR(error_correction_support)
|
||||
GET_NUM_ATTR(profiling_timer_resolution)
|
||||
GET_NUM_ATTR(is_available)
|
||||
|
||||
# undef GET_NUM_ATTR
|
||||
# undef WRITE_ATTR
|
||||
capabilities << "\n";
|
||||
}
|
||||
|
||||
return ::strdup(capabilities.str().c_str());
|
||||
}
|
||||
|
||||
void oneapi_free(void *p)
|
||||
{
|
||||
if (p) {
|
||||
::free(p);
|
||||
}
|
||||
}
|
||||
|
||||
void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
|
||||
{
|
||||
int num = 0;
|
||||
std::vector<sycl::device> devices = oneapi_available_devices();
|
||||
for (sycl::device &device : devices) {
|
||||
const std::string &platform_name =
|
||||
device.get_platform().get_info<sycl::info::platform::name>();
|
||||
std::string name = device.get_info<sycl::info::device::name>();
|
||||
std::string id = "ONEAPI_" + platform_name + "_" + name;
|
||||
(cb)(id.c_str(), name.c_str(), num, user_ptr);
|
||||
num++;
|
||||
}
|
||||
}
|
||||
|
||||
size_t oneapi_get_memcapacity(SyclQueue *queue)
|
||||
{
|
||||
return reinterpret_cast<sycl::queue *>(queue)
|
||||
->get_device()
|
||||
.get_info<sycl::info::device::global_mem_size>();
|
||||
}
|
||||
|
||||
size_t oneapi_get_compute_units_amount(SyclQueue *queue)
|
||||
{
|
||||
return reinterpret_cast<sycl::queue *>(queue)
|
||||
->get_device()
|
||||
.get_info<sycl::info::device::max_compute_units>();
|
||||
}
|
||||
|
||||
#endif /* WITH_ONEAPI */
|
|
@ -0,0 +1,57 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2021-2022 Intel Corporation */
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
# include <stddef.h>
|
||||
|
||||
/* NOTE(@nsirgien): Should match underlying type in the declaration inside "kernel/types.h"
|
||||
* TODO: use kernel/types.h directly. */
|
||||
enum DeviceKernel : int;
|
||||
|
||||
# ifndef CYCLES_KERNEL_ONEAPI_EXPORT
|
||||
# ifdef _WIN32
|
||||
# if defined(ONEAPI_EXPORT)
|
||||
# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllexport)
|
||||
# else
|
||||
# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define CYCLES_KERNEL_ONEAPI_EXPORT
|
||||
# endif
|
||||
# endif
|
||||
|
||||
class SyclQueue;
|
||||
|
||||
typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
|
||||
const char *name,
|
||||
int num,
|
||||
void *user_ptr);
|
||||
|
||||
typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
|
||||
|
||||
struct KernelContext {
|
||||
/* Queue, associated with selected device */
|
||||
SyclQueue *queue;
|
||||
/* Pointer to USM device memory with all global/constant allocation on this device */
|
||||
void *kernel_globals;
|
||||
};
|
||||
|
||||
/* Use extern C linking so that the symbols can be easily load from the dynamic library at runtime.
|
||||
*/
|
||||
# ifdef __cplusplus
|
||||
extern "C" {
|
||||
# endif
|
||||
|
||||
# define DLL_INTERFACE_CALL(function, return_type, ...) \
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
|
||||
# include "kernel/device/oneapi/dll_interface_template.h"
|
||||
# undef DLL_INTERFACE_CALL
|
||||
|
||||
# ifdef __cplusplus
|
||||
}
|
||||
# endif
|
||||
|
||||
#endif /* WITH_ONEAPI */
|
|
@ -0,0 +1,121 @@
|
|||
#pragma once
|
||||
|
||||
/* Some macro magic to generate templates for kernel arguments.
|
||||
The resulting oneapi_call() template allows to call a SYCL/C++ kernel
|
||||
with typed arguments by only giving it a void **args as given by Cycles.
|
||||
The template will automatically cast from void* to the expectd type.
|
||||
*/
|
||||
|
||||
/* When expanded by the preprocessor, the generated templates will look like this example: */
|
||||
#if 0
|
||||
template<typename T0, typename T1, typename T2>
|
||||
void oneapi_call(
|
||||
KernelGlobalsGPU *kg,
|
||||
sycl::handler &cgh,
|
||||
size_t global_size,
|
||||
size_t local_size,
|
||||
void **args,
|
||||
void (*func)(const KernelGlobalsGPU *, size_t, size_t, sycl::handler &, T0, T1, T2))
|
||||
{
|
||||
func(kg, global_size, local_size, cgh, *(T0 *)(args[0]), *(T1 *)(args[1]), *(T2 *)(args[2]));
|
||||
}
|
||||
#endif
|
||||
|
||||
/* clang-format off */
|
||||
#define ONEAPI_TYP(x) typename T##x
|
||||
#define ONEAPI_CAST(x) *(T##x *)(args[x])
|
||||
#define ONEAPI_T(x) T##x
|
||||
|
||||
#define ONEAPI_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, N, ...) N
|
||||
#define ONEAPI_0(_call, ...)
|
||||
#define ONEAPI_1(_call, x) _call(x)
|
||||
#define ONEAPI_2(_call, x, ...) _call(x), ONEAPI_1(_call, __VA_ARGS__)
|
||||
#define ONEAPI_3(_call, x, ...) _call(x), ONEAPI_2(_call, __VA_ARGS__)
|
||||
#define ONEAPI_4(_call, x, ...) _call(x), ONEAPI_3(_call, __VA_ARGS__)
|
||||
#define ONEAPI_5(_call, x, ...) _call(x), ONEAPI_4(_call, __VA_ARGS__)
|
||||
#define ONEAPI_6(_call, x, ...) _call(x), ONEAPI_5(_call, __VA_ARGS__)
|
||||
#define ONEAPI_7(_call, x, ...) _call(x), ONEAPI_6(_call, __VA_ARGS__)
|
||||
#define ONEAPI_8(_call, x, ...) _call(x), ONEAPI_7(_call, __VA_ARGS__)
|
||||
#define ONEAPI_9(_call, x, ...) _call(x), ONEAPI_8(_call, __VA_ARGS__)
|
||||
#define ONEAPI_10(_call, x, ...) _call(x), ONEAPI_9(_call, __VA_ARGS__)
|
||||
#define ONEAPI_11(_call, x, ...) _call(x), ONEAPI_10(_call, __VA_ARGS__)
|
||||
#define ONEAPI_12(_call, x, ...) _call(x), ONEAPI_11(_call, __VA_ARGS__)
|
||||
#define ONEAPI_13(_call, x, ...) _call(x), ONEAPI_12(_call, __VA_ARGS__)
|
||||
#define ONEAPI_14(_call, x, ...) _call(x), ONEAPI_13(_call, __VA_ARGS__)
|
||||
#define ONEAPI_15(_call, x, ...) _call(x), ONEAPI_14(_call, __VA_ARGS__)
|
||||
#define ONEAPI_16(_call, x, ...) _call(x), ONEAPI_15(_call, __VA_ARGS__)
|
||||
#define ONEAPI_17(_call, x, ...) _call(x), ONEAPI_16(_call, __VA_ARGS__)
|
||||
#define ONEAPI_18(_call, x, ...) _call(x), ONEAPI_17(_call, __VA_ARGS__)
|
||||
#define ONEAPI_19(_call, x, ...) _call(x), ONEAPI_18(_call, __VA_ARGS__)
|
||||
#define ONEAPI_20(_call, x, ...) _call(x), ONEAPI_19(_call, __VA_ARGS__)
|
||||
#define ONEAPI_21(_call, x, ...) _call(x), ONEAPI_20(_call, __VA_ARGS__)
|
||||
|
||||
#define ONEAPI_CALL_FOR(x, ...) \
|
||||
ONEAPI_GET_NTH_ARG("ignored", \
|
||||
##__VA_ARGS__, \
|
||||
ONEAPI_21, \
|
||||
ONEAPI_20, \
|
||||
ONEAPI_19, \
|
||||
ONEAPI_18, \
|
||||
ONEAPI_17, \
|
||||
ONEAPI_16, \
|
||||
ONEAPI_15, \
|
||||
ONEAPI_14, \
|
||||
ONEAPI_13, \
|
||||
ONEAPI_12, \
|
||||
ONEAPI_11, \
|
||||
ONEAPI_10, \
|
||||
ONEAPI_9, \
|
||||
ONEAPI_8, \
|
||||
ONEAPI_7, \
|
||||
ONEAPI_6, \
|
||||
ONEAPI_5, \
|
||||
ONEAPI_4, \
|
||||
ONEAPI_3, \
|
||||
ONEAPI_2, \
|
||||
ONEAPI_1, \
|
||||
ONEAPI_0) \
|
||||
(x, ##__VA_ARGS__)
|
||||
|
||||
/* This template automatically casts entries in the void **args array to the types requested by the kernel func.
|
||||
Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
|
||||
#define oneapi_template(...) \
|
||||
template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \
|
||||
void oneapi_call( \
|
||||
KernelGlobalsGPU *kg, \
|
||||
sycl::handler &cgh, \
|
||||
size_t global_size, \
|
||||
size_t local_size, \
|
||||
void **args, \
|
||||
void (*func)(KernelGlobalsGPU*, size_t, size_t, sycl::handler &, ONEAPI_CALL_FOR(ONEAPI_T, __VA_ARGS__))) \
|
||||
{ \
|
||||
func(kg, \
|
||||
global_size, \
|
||||
local_size, \
|
||||
cgh, \
|
||||
ONEAPI_CALL_FOR(ONEAPI_CAST, __VA_ARGS__)); \
|
||||
}
|
||||
|
||||
oneapi_template(0)
|
||||
oneapi_template(0, 1)
|
||||
oneapi_template(0, 1, 2)
|
||||
oneapi_template(0, 1, 2, 3)
|
||||
oneapi_template(0, 1, 2, 3, 4)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
|
||||
oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
|
||||
|
||||
/* clang-format on */
|
|
@ -1571,7 +1571,7 @@ static_assert_align(KernelShaderEvalInput, 16);
|
|||
* If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified.
|
||||
* The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. */
|
||||
|
||||
typedef enum DeviceKernel {
|
||||
typedef enum DeviceKernel : int {
|
||||
DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0,
|
||||
DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE,
|
||||
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
|
||||
|
|
|
@ -106,6 +106,116 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
|
|||
|
||||
# endif /* __KERNEL_METAL__ */
|
||||
|
||||
# ifdef __KERNEL_ONEAPI__
|
||||
|
||||
ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x)
|
||||
{
|
||||
sycl::atomic_ref<float,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_add(x);
|
||||
}
|
||||
|
||||
ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source,
|
||||
float old_val,
|
||||
float new_val)
|
||||
{
|
||||
sycl::atomic_ref<float,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*source);
|
||||
atomic.compare_exchange_weak(old_val, new_val);
|
||||
return old_val;
|
||||
}
|
||||
|
||||
ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p,
|
||||
unsigned int x)
|
||||
{
|
||||
sycl::atomic_ref<unsigned int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_add(x);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
|
||||
{
|
||||
sycl::atomic_ref<int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_add(x);
|
||||
}
|
||||
|
||||
ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
|
||||
unsigned int x)
|
||||
{
|
||||
sycl::atomic_ref<unsigned int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_sub(x);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x)
|
||||
{
|
||||
sycl::atomic_ref<int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_sub(x);
|
||||
}
|
||||
|
||||
ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p)
|
||||
{
|
||||
return atomic_fetch_and_add_uint32(p, 1);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p)
|
||||
{
|
||||
return atomic_fetch_and_add_uint32(p, 1);
|
||||
}
|
||||
|
||||
ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p)
|
||||
{
|
||||
return atomic_fetch_and_sub_uint32(p, 1);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p)
|
||||
{
|
||||
return atomic_fetch_and_sub_uint32(p, 1);
|
||||
}
|
||||
|
||||
ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p,
|
||||
unsigned int x)
|
||||
{
|
||||
sycl::atomic_ref<unsigned int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_or(x);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
|
||||
{
|
||||
sycl::atomic_ref<int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::ext_intel_global_device_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_or(x);
|
||||
}
|
||||
|
||||
# endif /* __KERNEL_ONEAPI__ */
|
||||
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
|
||||
#endif /* __UTIL_ATOMIC_H__ */
|
||||
|
|
|
@ -35,7 +35,7 @@ ccl_device_inline float half_to_float(half h_in)
|
|||
#else
|
||||
|
||||
/* CUDA has its own half data type, no need to define then */
|
||||
# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
|
||||
# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__)
|
||||
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
|
||||
* unsigned shorts. */
|
||||
class half {
|
||||
|
@ -73,7 +73,7 @@ struct half4 {
|
|||
|
||||
ccl_device_inline half float_to_half_image(float f)
|
||||
{
|
||||
#if defined(__KERNEL_METAL__)
|
||||
#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
|
||||
return half(min(f, 65504.0f));
|
||||
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
return __float2half(min(f, 65504.0f));
|
||||
|
@ -103,6 +103,8 @@ ccl_device_inline float half_to_float_image(half h)
|
|||
{
|
||||
#if defined(__KERNEL_METAL__)
|
||||
return half_to_float(h);
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
return float(h);
|
||||
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
return __half2float(h);
|
||||
#else
|
||||
|
@ -136,7 +138,7 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
|
|||
|
||||
ccl_device_inline half float_to_half_display(const float f)
|
||||
{
|
||||
#if defined(__KERNEL_METAL__)
|
||||
#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
|
||||
return half(min(f, 65504.0f));
|
||||
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
return __float2half(min(f, 65504.0f));
|
||||
|
|
|
@ -79,7 +79,7 @@ CCL_NAMESPACE_BEGIN
|
|||
|
||||
/* Scalar */
|
||||
|
||||
#ifndef __HIP__
|
||||
#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
|
||||
# ifdef _WIN32
|
||||
ccl_device_inline float fmaxf(float a, float b)
|
||||
{
|
||||
|
@ -92,12 +92,18 @@ ccl_device_inline float fminf(float a, float b)
|
|||
}
|
||||
|
||||
# endif /* _WIN32 */
|
||||
#endif /* __HIP__ */
|
||||
#endif /* __HIP__, __KERNEL_ONEAPI__ */
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
# ifndef __KERNEL_ONEAPI__
|
||||
using std::isfinite;
|
||||
using std::isnan;
|
||||
using std::sqrt;
|
||||
# else
|
||||
using sycl::sqrt;
|
||||
# define isfinite(x) sycl::isfinite((x))
|
||||
# define isnan(x) sycl::isnan((x))
|
||||
# endif
|
||||
|
||||
ccl_device_inline int abs(int x)
|
||||
{
|
||||
|
@ -793,6 +799,8 @@ ccl_device_inline uint popcount(uint x)
|
|||
return i & 1;
|
||||
}
|
||||
# endif
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
# define popcount(x) sycl::popcount(x)
|
||||
#elif defined(__KERNEL_HIP__)
|
||||
/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
|
||||
# define popcount(x) __popcll(x)
|
||||
|
@ -806,6 +814,8 @@ ccl_device_inline uint count_leading_zeros(uint x)
|
|||
return __clz(x);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return clz(x);
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
return sycl::clz(x);
|
||||
#else
|
||||
assert(x != 0);
|
||||
# ifdef _MSC_VER
|
||||
|
@ -824,6 +834,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
|
|||
return (__ffs(x) - 1);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return ctz(x);
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
return sycl::ctz(x);
|
||||
#else
|
||||
assert(x != 0);
|
||||
# ifdef _MSC_VER
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct float2 {
|
||||
float x, y;
|
||||
|
||||
|
@ -20,7 +20,7 @@ struct float2 {
|
|||
|
||||
ccl_device_inline float2 make_float2(float x, float y);
|
||||
ccl_device_inline void print_float2(const char *label, const float2 &a);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
__forceinline float float2::operator[](int i) const
|
||||
{
|
||||
util_assert(i >= 0);
|
||||
|
@ -39,7 +39,7 @@ ccl_device_inline void print_float2(const char *label, const float2 &a)
|
|||
{
|
||||
printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__)
|
||||
struct ccl_try_align(16) float3
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
|
@ -40,7 +40,7 @@ struct ccl_try_align(16) float3
|
|||
ccl_device_inline float3 make_float3(float f);
|
||||
ccl_device_inline float3 make_float3(float x, float y, float z);
|
||||
ccl_device_inline void print_float3(const char *label, const float3 &a);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) */
|
||||
|
||||
/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
|
||||
* CPU SIMD instructions can be used. */
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__)
|
||||
# ifdef __KERNEL_SSE__
|
||||
__forceinline float3::float3()
|
||||
{
|
||||
|
@ -83,7 +83,7 @@ ccl_device_inline void print_float3(const char *label, const float3 &a)
|
|||
{
|
||||
printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct int4;
|
||||
|
||||
struct ccl_try_align(16) float4
|
||||
|
@ -43,7 +43,7 @@ ccl_device_inline float4 make_float4(float f);
|
|||
ccl_device_inline float4 make_float4(float x, float y, float z, float w);
|
||||
ccl_device_inline float4 make_float4(const int4 &i);
|
||||
ccl_device_inline void print_float4(const char *label, const float4 &a);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
# ifdef __KERNEL_SSE__
|
||||
__forceinline float4::float4()
|
||||
{
|
||||
|
@ -89,7 +89,7 @@ ccl_device_inline void print_float4(const char *label, const float4 &a)
|
|||
{
|
||||
printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
|
||||
struct ccl_try_align(32) float8
|
||||
{
|
||||
|
@ -43,7 +43,7 @@ struct ccl_try_align(32) float8
|
|||
ccl_device_inline float8 make_float8(float f);
|
||||
ccl_device_inline float8
|
||||
make_float8(float a, float b, float c, float d, float e, float f, float g, float h);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
# ifdef __KERNEL_AVX2__
|
||||
__forceinline float8::float8()
|
||||
{
|
||||
|
@ -81,7 +81,7 @@ make_float8(float a, float b, float c, float d, float e, float f, float g, float
|
|||
return r;
|
||||
}
|
||||
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct int2 {
|
||||
int x, y;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct int2 {
|
|||
};
|
||||
|
||||
ccl_device_inline int2 make_int2(int x, int y);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
int int2::operator[](int i) const
|
||||
{
|
||||
util_assert(i >= 0);
|
||||
|
@ -30,7 +30,7 @@ ccl_device_inline int2 make_int2(int x, int y)
|
|||
int2 a = {x, y};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct ccl_try_align(16) int3
|
||||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
|
@ -40,7 +40,7 @@ struct ccl_try_align(16) int3
|
|||
ccl_device_inline int3 make_int3(int i);
|
||||
ccl_device_inline int3 make_int3(int x, int y, int z);
|
||||
ccl_device_inline void print_int3(const char *label, const int3 &a);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
# ifdef __KERNEL_SSE__
|
||||
__forceinline int3::int3()
|
||||
{
|
||||
|
@ -84,7 +84,7 @@ ccl_device_inline void print_int3(const char *label, const int3 &a)
|
|||
{
|
||||
printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
|
||||
struct float3;
|
||||
struct float4;
|
||||
|
@ -46,7 +46,7 @@ ccl_device_inline int4 make_int4(int x, int y, int z, int w);
|
|||
ccl_device_inline int4 make_int4(const float3 &f);
|
||||
ccl_device_inline int4 make_int4(const float4 &f);
|
||||
ccl_device_inline void print_int4(const char *label, const int4 &a);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
# ifdef __KERNEL_SSE__
|
||||
__forceinline int4::int4()
|
||||
{
|
||||
|
@ -83,6 +83,8 @@ ccl_device_inline int4 make_int4(const float3 &f)
|
|||
{
|
||||
# ifdef __KERNEL_SSE__
|
||||
int4 a(_mm_cvtps_epi32(f.m128));
|
||||
# elif defined(__KERNEL_ONEAPI__)
|
||||
int4 a = {(int)f.x, (int)f.y, (int)f.z, 0};
|
||||
# else
|
||||
int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
|
||||
# endif
|
||||
|
@ -103,7 +105,7 @@ ccl_device_inline void print_int4(const char *label, const int4 &a)
|
|||
{
|
||||
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct uchar2 {
|
||||
uchar x, y;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct uchar2 {
|
|||
};
|
||||
|
||||
ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
uchar uchar2::operator[](int i) const
|
||||
{
|
||||
util_assert(i >= 0);
|
||||
|
@ -30,7 +30,7 @@ ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
|
|||
uchar2 a = {x, y};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct uchar3 {
|
||||
uchar x, y, z;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct uchar3 {
|
|||
};
|
||||
|
||||
ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
uchar uchar3::operator[](int i) const
|
||||
{
|
||||
util_assert(i >= 0);
|
||||
|
@ -30,7 +30,7 @@ ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
|
|||
uchar3 a = {x, y, z};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct uchar4 {
|
||||
uchar x, y, z, w;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct uchar4 {
|
|||
};
|
||||
|
||||
ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
uchar uchar4::operator[](int i) const
|
||||
{
|
||||
util_assert(i >= 0);
|
||||
|
@ -30,7 +30,7 @@ ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
|
|||
uchar4 a = {x, y, z, w};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct uint2 {
|
||||
uint x, y;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct uint2 {
|
|||
};
|
||||
|
||||
ccl_device_inline uint2 make_uint2(uint x, uint y);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
__forceinline uint uint2::operator[](uint i) const
|
||||
{
|
||||
util_assert(i < 2);
|
||||
|
@ -28,7 +28,7 @@ ccl_device_inline uint2 make_uint2(uint x, uint y)
|
|||
uint2 a = {x, y};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct uint3 {
|
||||
uint x, y, z;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct uint3 {
|
|||
};
|
||||
|
||||
ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
__forceinline uint uint3::operator[](uint i) const
|
||||
{
|
||||
util_assert(i < 3);
|
||||
|
@ -28,7 +28,7 @@ ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
|
|||
uint3 a = {x, y, z};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
struct uint4 {
|
||||
uint x, y, z, w;
|
||||
|
||||
|
@ -19,7 +19,7 @@ struct uint4 {
|
|||
};
|
||||
|
||||
ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
__forceinline uint uint4::operator[](uint i) const
|
||||
{
|
||||
util_assert(i < 3);
|
||||
|
@ -28,7 +28,7 @@ ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
|
|||
uint4 a = {x, y, z, w};
|
||||
return a;
|
||||
}
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
|
||||
|
||||
struct ushort4 {
|
||||
uint16_t x, y, z, w;
|
||||
|
|
|
@ -9,6 +9,9 @@ global:
|
|||
*;
|
||||
*_boost*;
|
||||
local:
|
||||
__once_proxy;
|
||||
_ZSt11__once_call;
|
||||
_ZSt15__once_callable;
|
||||
al*;
|
||||
*Alembic*;
|
||||
av*;
|
||||
|
|
Loading…
Reference in New Issue