Cycles: use direct linking for oneAPI backend

This is a minimal set of changes, allowing a lot of cleanup that can
happen afterward as it allows sycl method and objects to be used outside
of kernel.cpp.

Reviewed By: brecht, sergey

Differential Revision: https://developer.blender.org/D15397
This commit is contained in:
Xavier Hallade 2022-10-06 18:35:51 +02:00
parent fc0b1627eb
commit 7eeeaec6da
14 changed files with 594 additions and 709 deletions

View File

@ -335,10 +335,18 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
set(LEVEL_ZERO_ROOT_DIR ${CYCLES_LEVEL_ZERO})
endif()
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to DPC++ and SYCL installation")
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to oneAPI DPC++ compiler")
if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
set(SYCL_ROOT_DIR ${CYCLES_SYCL})
endif()
file(GLOB _sycl_runtime_libraries
${SYCL_ROOT_DIR}/lib/libsycl.so
${SYCL_ROOT_DIR}/lib/libsycl.so.[0-9]
${SYCL_ROOT_DIR}/lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
${SYCL_ROOT_DIR}/lib/libpi_level_zero.so
)
list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
unset(_sycl_runtime_libraries)
endif()
if(WITH_OPENVDB)

View File

@ -952,5 +952,17 @@ endif()
set(ZSTD_INCLUDE_DIRS ${LIBDIR}/zstd/include)
set(ZSTD_LIBRARIES ${LIBDIR}/zstd/lib/zstd_static.lib)
set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
set(SYCL_ROOT_DIR ${LIBDIR}/dpcpp)
if(WITH_CYCLES_DEVICE_ONEAPI)
set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to oneAPI DPC++ compiler")
if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
set(SYCL_ROOT_DIR ${CYCLES_SYCL})
endif()
file(GLOB _sycl_runtime_libraries
${SYCL_ROOT_DIR}/bin/sycl.dll
${SYCL_ROOT_DIR}/bin/sycl[0-9].dll
${SYCL_ROOT_DIR}/bin/pi_level_zero.dll
)
list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
unset(_sycl_runtime_libraries)
endif()

View File

@ -142,7 +142,6 @@ set(SRC
${SRC_DUMMY}
${SRC_MULTI}
${SRC_OPTIX}
${SRC_ONEAPI}
${SRC_HEADERS}
)
@ -188,7 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
)
endif()
if (WITH_CYCLES_DEVICE_ONEAPI)
if(WIN32)
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
else()
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
endif()
list(APPEND LIB
${SYCL_LIBRARY}
${cycles_kernel_oneapi_lib}
)
add_definitions(-DWITH_ONEAPI)
list(APPEND SRC
${SRC_ONEAPI}
)
list(APPEND INC_SYS
${SYCL_INCLUDE_DIR}
)
endif()
if(WITH_OPENIMAGEDENOISE)

View File

@ -19,62 +19,12 @@
CCL_NAMESPACE_BEGIN
#ifdef WITH_ONEAPI
static OneAPIDLLInterface oneapi_dll;
#endif
#ifdef _WIN32
# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path))
# define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() GetLastError()
# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle)
# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name)
#elif __linux__
# define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW)
# define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() dlerror()
# define FREE_SHARED_LIBRARY(handle) dlclose(handle)
# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name)
#endif
bool device_oneapi_init()
{
#if !defined(WITH_ONEAPI)
return false;
#else
string lib_path = path_get("lib");
# ifdef _WIN32
lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll");
# else
lib_path = path_join(lib_path, "cycles_kernel_oneapi.so");
# endif
void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str());
/* This shouldn't happen, but it still makes sense to have a branch for this. */
if (lib_handle == NULL) {
LOG(ERROR) << "oneAPI kernel shared library cannot be loaded: "
<< LOAD_ONEAPI_SHARED_LIBRARY_ERROR();
return false;
}
# define DLL_INTERFACE_CALL(function, return_type, ...) \
(oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \
GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \
if (oneapi_dll.function == NULL) { \
LOG(ERROR) << "oneAPI shared library function \"" << #function \
<< "\" has not been loaded from kernel shared - disable oneAPI " \
"library disable oneAPI implementation due to this"; \
FREE_SHARED_LIBRARY(lib_handle); \
return false; \
}
# include "kernel/device/oneapi/dll_interface_template.h"
# undef DLL_INTERFACE_CALL
VLOG_INFO << "oneAPI kernel shared library has been loaded successfully";
/* We need to have this oneapi kernel shared library during all life-span of the Blender.
* So it is not unloaded because of this.
* FREE_SHARED_LIBRARY(lib_handle); */
/* NOTE(@nsirgien): we need to enable JIT cache from here and
* right now this cache policy is controlled by env. variables. */
/* NOTE(hallade) we also disable use of copy engine as it
@ -109,17 +59,10 @@ bool device_oneapi_init()
#endif
}
#if defined(_WIN32) || defined(__linux__)
# undef LOAD_SYCL_SHARED_LIBRARY
# undef LOAD_ONEAPI_SHARED_LIBRARY
# undef FREE_SHARED_LIBRARY
# undef GET_SHARED_LIBRARY_SYMBOL
#endif
Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_ONEAPI
return new OneapiDevice(info, oneapi_dll, stats, profiler);
return new OneapiDevice(info, stats, profiler);
#else
(void)info;
(void)stats;
@ -165,7 +108,7 @@ static void device_iterator_cb(const char *id, const char *name, int num, void *
void device_oneapi_info(vector<DeviceInfo> &devices)
{
#ifdef WITH_ONEAPI
(oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices);
OneapiDevice::iterate_devices(device_iterator_cb, &devices);
#else /* WITH_ONEAPI */
(void)devices;
#endif /* WITH_ONEAPI */
@ -175,10 +118,10 @@ string device_oneapi_capabilities()
{
string capabilities;
#ifdef WITH_ONEAPI
char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)();
char *c_capabilities = OneapiDevice::device_capabilities();
if (c_capabilities) {
capabilities = c_capabilities;
(oneapi_dll.oneapi_free)(c_capabilities);
free(c_capabilities);
}
#endif
return capabilities;

View File

@ -8,7 +8,7 @@
# include "util/debug.h"
# include "util/log.h"
# include "kernel/device/oneapi/kernel.h"
# include "kernel/device/oneapi/globals.h"
CCL_NAMESPACE_BEGIN
@ -19,26 +19,19 @@ static void queue_error_cb(const char *message, void *user_ptr)
}
}
OneapiDevice::OneapiDevice(const DeviceInfo &info,
OneAPIDLLInterface &oneapi_dll_object,
Stats &stats,
Profiler &profiler)
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler),
device_queue_(nullptr),
texture_info_(this, "texture_info", MEM_GLOBAL),
kg_memory_(nullptr),
kg_memory_device_(nullptr),
kg_memory_size_(0),
oneapi_dll_(oneapi_dll_object)
kg_memory_size_(0)
{
need_texture_info_ = false;
oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
/* OneAPI calls should be initialized on this moment. */
assert(oneapi_dll_.oneapi_create_queue != nullptr);
bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
bool is_finished_ok = create_queue(device_queue_, info.num);
if (is_finished_ok == false) {
set_error("oneAPI queue initialization error: got runtime exception \"" +
oneapi_error_string_ + "\"");
@ -50,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
}
size_t globals_segment_size;
is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size);
if (is_finished_ok == false) {
set_error("oneAPI constant memory initialization got runtime exception \"" +
oneapi_error_string_ + "\"");
@ -59,27 +52,27 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
}
kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);
kg_memory_size_ = globals_segment_size;
max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
max_memory_on_device_ = get_memcapacity();
}
OneapiDevice::~OneapiDevice()
{
texture_info_.free();
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
usm_free(device_queue_, kg_memory_);
usm_free(device_queue_, kg_memory_device_);
for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
delete mt->second;
if (device_queue_)
oneapi_dll_.oneapi_free_queue(device_queue_);
free_queue(device_queue_);
}
bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
@ -99,7 +92,7 @@ bool OneapiDevice::load_kernels(const uint requested_features)
* with specialization constants, but it hasn't been implemented yet. */
(void)requested_features;
bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
if (is_finished_ok == false) {
set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
}
@ -138,7 +131,7 @@ void OneapiDevice::generic_alloc(device_memory &mem)
* type has been used for oneAPI device in order to better fit in Cycles architecture. */
void *device_pointer = nullptr;
if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
device_pointer = usm_alloc_device(device_queue_, memory_size);
if (device_pointer == nullptr) {
set_error("oneAPI kernel - device memory allocation error for " +
string_human_readable_size(mem.memory_size()) +
@ -163,8 +156,7 @@ void OneapiDevice::generic_copy_to(device_memory &mem)
/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
assert(mem.host_pointer);
assert(device_queue_);
oneapi_dll_.oneapi_usm_memcpy(
device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
}
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
@ -178,11 +170,6 @@ string OneapiDevice::oneapi_error_message()
return string(oneapi_error_string_);
}
OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
{
return oneapi_dll_;
}
void *OneapiDevice::kernel_globals_device_pointer()
{
return kg_memory_device_;
@ -198,7 +185,7 @@ void OneapiDevice::generic_free(device_memory &mem)
mem.device_size = 0;
assert(device_queue_);
oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
usm_free(device_queue_, (void *)mem.device_pointer);
mem.device_pointer = 0;
}
@ -266,8 +253,7 @@ void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t
if (mem.device_pointer) {
char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
device_queue_, shifted_host, shifted_device, size);
bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
if (is_finished_ok == false) {
set_error("oneAPI memory operation error: got runtime exception \"" +
oneapi_error_string_ + "\"");
@ -292,7 +278,7 @@ void OneapiDevice::mem_zero(device_memory &mem)
}
assert(device_queue_);
bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
bool is_finished_ok = usm_memset(
device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
if (is_finished_ok == false) {
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
@ -349,10 +335,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
memcpy(data->data(), host, size);
data->copy_to_device();
oneapi_dll_.oneapi_set_global_memory(
device_queue_, kg_memory_, name, (void *)data->device_pointer);
set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);
oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}
void OneapiDevice::global_alloc(device_memory &mem)
@ -367,10 +352,9 @@ void OneapiDevice::global_alloc(device_memory &mem)
generic_alloc(mem);
generic_copy_to(mem);
oneapi_dll_.oneapi_set_global_memory(
device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}
void OneapiDevice::global_free(device_memory &mem)
@ -410,18 +394,6 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
return make_unique<OneapiDeviceQueue>(this);
}
int OneapiDevice::get_num_multiprocessors()
{
assert(device_queue_);
return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
}
int OneapiDevice::get_max_num_threads_per_multiprocessor()
{
assert(device_queue_);
return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
}
bool OneapiDevice::should_use_graphics_interop()
{
/* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
@ -432,13 +404,460 @@ bool OneapiDevice::should_use_graphics_interop()
void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
{
assert(device_queue_);
return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
}
void OneapiDevice::usm_free(void *usm_ptr)
{
assert(device_queue_);
return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
return usm_free(device_queue_, usm_ptr);
}
void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
# ifdef _DEBUG
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
sycl::info::device_type device_type =
queue->get_device().get_info<sycl::info::device::device_type>();
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
assert(usm_type == sycl::usm::alloc::device ||
((device_type == sycl::info::device_type::host ||
device_type == sycl::info::device_type::cpu || allow_host) &&
usm_type == sycl::usm::alloc::host));
# endif
}
bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
{
bool finished_correct = true;
try {
std::vector<sycl::device> devices = OneapiDevice::available_devices();
if (device_index < 0 || device_index >= devices.size()) {
return false;
}
sycl::queue *created_queue = new sycl::queue(devices[device_index],
sycl::property::queue::in_order());
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
}
catch (sycl::exception const &e) {
finished_correct = false;
oneapi_error_string_ = e.what();
}
return finished_correct;
}
void OneapiDevice::free_queue(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
delete queue;
}
void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}
void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::malloc_device(memory_size, *queue);
}
void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
sycl::free(usm_ptr, *queue);
}
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, dest, true);
OneapiDevice::check_usm(queue_, src, true);
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
try {
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
*/
mem_event.wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
oneapi_error_string_ = e.what();
return false;
}
# else
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
src_type == sycl::usm::alloc::unknown;
/* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
* may not wait until the end of the transfer before using the memory.
*/
if (from_device_to_host || host_or_device_memop_with_offset)
mem_event.wait();
return true;
# endif
}
bool OneapiDevice::usm_memset(SyclQueue *queue_,
void *usm_ptr,
unsigned char value,
size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
# ifdef WITH_CYCLES_DEBUG
try {
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
*/
mem_event.wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
oneapi_error_string_ = e.what();
return false;
}
# else
(void)mem_event;
return true;
# endif
}
bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
try {
queue->wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
oneapi_error_string_ = e.what();
return false;
}
}
bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
{
kernel_global_size = sizeof(KernelGlobalsGPU);
return true;
}
void OneapiDevice::set_global_memory(SyclQueue *queue_,
void *kernel_globals,
const char *memory_name,
void *memory_device_pointer)
{
assert(queue_);
assert(kernel_globals);
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
OneapiDevice::check_usm(queue_, memory_device_pointer);
OneapiDevice::check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
/* This macro will change global ptr of KernelGlobals via name matching. */
# define KERNEL_DATA_ARRAY(type, name) \
else if (#name == matched_name) \
{ \
globals->__##name = (type *)memory_device_pointer; \
return; \
}
if (false) {
}
else if ("integrator_state" == matched_name) {
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
return;
}
KERNEL_DATA_ARRAY(KernelData, data)
# include "kernel/data_arrays.h"
else
{
std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
<< std::endl;
assert(false);
}
# undef KERNEL_DATA_ARRAY
}
bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
int kernel,
size_t global_size,
void **args)
{
return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
}
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
/* The same min compute-runtime version is currently required across Windows and Linux.
* For Windows driver 101.3430, compute-runtime version is 23904. */
static const int lowest_supported_driver_version_win = 1013430;
static const int lowest_supported_driver_version_neo = 23904;
int OneapiDevice::parse_driver_build_version(const sycl::device &device)
{
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
int driver_build_version = 0;
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
if (second_dot_position == std::string::npos) {
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
else {
try {
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
if (third_dot_position != std::string::npos) {
const std::string &third_number_substr = driver_version.substr(
second_dot_position + 1, third_dot_position - second_dot_position - 1);
const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
driver_build_version = std::stoi(third_number_substr) * 10000 +
std::stoi(forth_number_substr);
}
else {
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
driver_build_version = std::stoi(third_number_substr);
}
}
catch (std::invalid_argument &) {
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
}
return driver_build_version;
}
std::vector<sycl::device> OneapiDevice::available_devices()
{
bool allow_all_devices = false;
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
allow_all_devices = true;
/* Host device is useful only for debugging at the moment
* so we hide this device with default build settings. */
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
bool allow_host = true;
# else
bool allow_host = false;
# endif
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
std::vector<sycl::device> available_devices;
for (const sycl::platform &platform : oneapi_platforms) {
/* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
*/
if (platform.get_backend() == sycl::backend::opencl) {
continue;
}
const std::vector<sycl::device> &oneapi_devices =
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
if (allow_all_devices) {
/* still filter out host device if build doesn't support it. */
if (allow_host || !device.is_host()) {
available_devices.push_back(device);
}
}
else {
bool filter_out = false;
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
* Official support can be broaden to older and smaller GPUs once ready. */
if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
/* Filtered-out defaults in-case these values aren't available through too old L0
* runtime. */
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
threads_per_eu =
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
if (number_of_eus <= 96 && threads_per_eu == 7) {
filter_out = true;
}
/* if not already filtered out, check driver version. */
if (!filter_out) {
int driver_build_version = parse_driver_build_version(device);
if ((driver_build_version > 100000 &&
driver_build_version < lowest_supported_driver_version_win) ||
driver_build_version < lowest_supported_driver_version_neo) {
filter_out = true;
}
}
}
else if (!allow_host && device.is_host()) {
filter_out = true;
}
else if (!allow_all_devices) {
filter_out = true;
}
if (!filter_out) {
available_devices.push_back(device);
}
}
}
}
return available_devices;
}
char *OneapiDevice::device_capabilities()
{
std::stringstream capabilities;
const std::vector<sycl::device> &oneapi_devices = available_devices();
for (const sycl::device &device : oneapi_devices) {
const std::string &name = device.get_info<sycl::info::device::name>();
capabilities << std::string("\t") << name << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
<< "\n";
# define GET_NUM_ATTR(attribute) \
{ \
size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
}
GET_NUM_ATTR(vendor_id)
GET_NUM_ATTR(max_compute_units)
GET_NUM_ATTR(max_work_item_dimensions)
sycl::id<3> max_work_item_sizes =
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
GET_NUM_ATTR(max_work_group_size)
GET_NUM_ATTR(max_num_sub_groups)
GET_NUM_ATTR(sub_group_independent_forward_progress)
GET_NUM_ATTR(preferred_vector_width_char)
GET_NUM_ATTR(preferred_vector_width_short)
GET_NUM_ATTR(preferred_vector_width_int)
GET_NUM_ATTR(preferred_vector_width_long)
GET_NUM_ATTR(preferred_vector_width_float)
GET_NUM_ATTR(preferred_vector_width_double)
GET_NUM_ATTR(preferred_vector_width_half)
GET_NUM_ATTR(native_vector_width_char)
GET_NUM_ATTR(native_vector_width_short)
GET_NUM_ATTR(native_vector_width_int)
GET_NUM_ATTR(native_vector_width_long)
GET_NUM_ATTR(native_vector_width_float)
GET_NUM_ATTR(native_vector_width_double)
GET_NUM_ATTR(native_vector_width_half)
size_t max_clock_frequency =
(size_t)(device.is_host() ? (size_t)0 :
device.get_info<sycl::info::device::max_clock_frequency>());
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
GET_NUM_ATTR(address_bits)
GET_NUM_ATTR(max_mem_alloc_size)
/* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
* supported so we always return false, even if device supports HW texture usage acceleration.
*/
bool image_support = false;
WRITE_ATTR("image_support", (size_t)image_support)
GET_NUM_ATTR(max_parameter_size)
GET_NUM_ATTR(mem_base_addr_align)
GET_NUM_ATTR(global_mem_size)
GET_NUM_ATTR(local_mem_size)
GET_NUM_ATTR(error_correction_support)
GET_NUM_ATTR(profiling_timer_resolution)
GET_NUM_ATTR(is_available)
# undef GET_NUM_ATTR
# undef WRITE_ATTR
capabilities << "\n";
}
return ::strdup(capabilities.str().c_str());
}
void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
int num = 0;
std::vector<sycl::device> devices = OneapiDevice::available_devices();
for (sycl::device &device : devices) {
const std::string &platform_name =
device.get_platform().get_info<sycl::info::platform::name>();
std::string name = device.get_info<sycl::info::device::name>();
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, user_ptr);
num++;
}
}
size_t OneapiDevice::get_memcapacity()
{
return reinterpret_cast<sycl::queue *>(device_queue_)
->get_device()
.get_info<sycl::info::device::global_mem_size>();
}
int OneapiDevice::get_num_multiprocessors()
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
else
return 0;
}
int OneapiDevice::get_max_num_threads_per_multiprocessor()
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
else
return 0;
}
CCL_NAMESPACE_END

View File

@ -3,9 +3,12 @@
#ifdef WITH_ONEAPI
# include <CL/sycl.hpp>
# include "device/device.h"
# include "device/oneapi/device.h"
# include "device/oneapi/queue.h"
# include "kernel/device/oneapi/kernel.h"
# include "util/map.h"
@ -13,6 +16,11 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
const char *name,
int num,
void *user_ptr);
class OneapiDevice : public Device {
private:
SyclQueue *device_queue_;
@ -25,16 +33,12 @@ class OneapiDevice : public Device {
void *kg_memory_device_;
size_t kg_memory_size_ = (size_t)0;
size_t max_memory_on_device_ = (size_t)0;
OneAPIDLLInterface oneapi_dll_;
std::string oneapi_error_string_;
public:
virtual BVHLayoutMask get_bvh_layout_mask() const override;
OneapiDevice(const DeviceInfo &info,
OneAPIDLLInterface &oneapi_dll_object,
Stats &stats,
Profiler &profiler);
OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
virtual ~OneapiDevice();
@ -50,12 +54,8 @@ class OneapiDevice : public Device {
void generic_free(device_memory &mem);
SyclQueue *sycl_queue();
string oneapi_error_message();
OneAPIDLLInterface oneapi_dll_object();
void *kernel_globals_device_pointer();
void mem_alloc(device_memory &mem) override;
@ -90,13 +90,37 @@ class OneapiDevice : public Device {
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
int get_num_multiprocessors();
int get_max_num_threads_per_multiprocessor();
/* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
* side compilation (MSVC). */
void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
void usm_free(void *usm_ptr);
static std::vector<sycl::device> available_devices();
static char *device_capabilities();
static int parse_driver_build_version(const sycl::device &device);
static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);
size_t get_memcapacity();
int get_num_multiprocessors();
int get_max_num_threads_per_multiprocessor();
bool queue_synchronize(SyclQueue *queue);
bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size);
void set_global_memory(SyclQueue *queue,
void *kernel_globals,
const char *memory_name,
void *memory_device_pointer);
bool enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, void **args);
SyclQueue *sycl_queue();
protected:
void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
bool create_queue(SyclQueue *&external_queue, int device_index);
void free_queue(SyclQueue *queue);
void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment);
void *usm_alloc_device(SyclQueue *queue, size_t memory_size);
void usm_free(SyclQueue *queue, void *usm_ptr);
bool usm_memcpy(SyclQueue *queue, void *dest, void *src, size_t num_bytes);
bool usm_memset(SyclQueue *queue, void *usm_ptr, unsigned char value, size_t num_bytes);
};
CCL_NAMESPACE_END

View File

@ -1,17 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
/* Include kernel header to get access to SYCL-specific types, like SyclQueue and
* OneAPIDeviceIteratorCallback. */
#include "kernel/device/oneapi/kernel.h"
#ifdef WITH_ONEAPI
struct OneAPIDLLInterface {
# define DLL_INTERFACE_CALL(function, return_type, ...) \
return_type (*function)(__VA_ARGS__) = nullptr;
# include "kernel/device/oneapi/dll_interface_template.h"
# undef DLL_INTERFACE_CALL
};
#endif

View File

@ -22,10 +22,7 @@ struct KernelExecutionInfo {
/* OneapiDeviceQueue */
OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
: DeviceQueue(device),
oneapi_device_(device),
oneapi_dll_(device->oneapi_dll_object()),
kernel_context_(nullptr)
: DeviceQueue(device), oneapi_device_(device), kernel_context_(nullptr)
{
}
@ -81,14 +78,14 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
assert(signed_kernel_work_size >= 0);
size_t kernel_work_size = (size_t)signed_kernel_work_size;
size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
size_t kernel_local_size = oneapi_kernel_preferred_local_size(
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
assert(kernel_context_);
/* Call the oneAPI kernel DLL to launch the requested kernel. */
bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
bool is_finished_ok = oneapi_device_->enqueue_kernel(
kernel_context_, kernel, uniformed_kernel_work_size, args);
if (is_finished_ok == false) {
@ -108,7 +105,7 @@ bool OneapiDeviceQueue::synchronize()
return false;
}
bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
bool is_finished_ok = oneapi_device_->queue_synchronize(oneapi_device_->sycl_queue());
if (is_finished_ok == false)
oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
oneapi_device_->oneapi_error_message() + "\"");

View File

@ -10,7 +10,7 @@
# include "device/queue.h"
# include "device/oneapi/device.h"
# include "device/oneapi/dll_interface.h"
# include "kernel/device/oneapi/kernel.h"
CCL_NAMESPACE_BEGIN
@ -41,9 +41,7 @@ class OneapiDeviceQueue : public DeviceQueue {
protected:
OneapiDevice *oneapi_device_;
OneAPIDLLInterface oneapi_dll_;
KernelContext *kernel_context_;
bool with_kernel_statistics_;
};
CCL_NAMESPACE_END

View File

@ -716,7 +716,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
if(WIN32)
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
else()
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so)
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so)
endif()
set(cycles_oneapi_kernel_sources
@ -815,6 +815,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
if(WIN32)
list(APPEND sycl_compiler_flags
-fuse-ld=link
-fms-extensions
-fms-compatibility
-D_WINDLL
@ -888,33 +889,24 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
endif()
endif()
# install dynamic libraries required at runtime
if(WIN32)
set(SYCL_RUNTIME_DEPENDENCIES
sycl.dll
pi_level_zero.dll
)
if(NOT WITH_BLENDER)
# For the Cycles standalone put libraries next to the Cycles application.
delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH})
else()
# For Blender put the libraries next to the Blender executable.
#
# Note that the installation path in the delayed_install is relative to the versioned folder,
# which means we need to go one level up.
delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../")
endif()
elseif(UNIX AND NOT APPLE)
file(GLOB SYCL_RUNTIME_DEPENDENCIES
${sycl_compiler_root}/../lib/libsycl.so
${sycl_compiler_root}/../lib/libsycl.so.[0-9]
${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
)
list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so)
delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib)
if(NOT WITH_BLENDER)
# For the Cycles standalone put libraries next to the Cycles application.
set(cycles_oneapi_target_path ${CYCLES_INSTALL_PATH})
else()
# For Blender put the libraries next to the Blender executable.
#
# Note that the installation path in the delayed_install is relative to the versioned folder,
# which means we need to go one level up.
set(cycles_oneapi_target_path "../")
endif()
# install dynamic libraries required at runtime
if(WIN32)
delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path})
elseif(UNIX AND NOT APPLE)
delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path}/lib)
endif()
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib)
add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib})
endif()

View File

@ -1,54 +0,0 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2022 Intel Corporation */
/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
DLL_INTERFACE_CALL(oneapi_free, void, void *)
DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
DLL_INTERFACE_CALL(
oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
DLL_INTERFACE_CALL(
oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
DLL_INTERFACE_CALL(oneapi_usm_memset,
bool,
SyclQueue *queue,
void *usm_ptr,
unsigned char value,
size_t num_bytes)
DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
/* Operation with Kernel globals structure - map of global/constant allocation - filled before
* render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as
* pointer. */
DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
DLL_INTERFACE_CALL(oneapi_set_global_memory,
void,
SyclQueue *queue,
void *kernel_globals,
const char *memory_name,
void *memory_device_pointer)
DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
size_t,
SyclQueue *queue,
const DeviceKernel kernel,
const size_t kernel_global_size)
DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
bool,
KernelContext *context,
int kernel,
size_t global_size,
void **args)

View File

@ -3,7 +3,6 @@
#ifdef WITH_ONEAPI
/* clang-format off */
# include "kernel.h"
# include <iostream>
# include <map>
@ -16,163 +15,16 @@
# include "kernel/device/oneapi/kernel_templates.h"
# include "kernel/device/gpu/kernel.h"
/* clang-format on */
static OneAPIErrorCallback s_error_cb = nullptr;
static void *s_error_user_ptr = nullptr;
static std::vector<sycl::device> oneapi_available_devices();
void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
{
s_error_cb = cb;
s_error_user_ptr = user_ptr;
}
void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
# ifdef _DEBUG
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
sycl::info::device_type device_type =
queue->get_device().get_info<sycl::info::device::device_type>();
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
assert(usm_type == sycl::usm::alloc::device ||
((device_type == sycl::info::device_type::host ||
device_type == sycl::info::device_type::is_cpu || allow_host) &&
usm_type == sycl::usm::alloc::host));
# endif
}
bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
{
bool finished_correct = true;
try {
std::vector<sycl::device> devices = oneapi_available_devices();
if (device_index < 0 || device_index >= devices.size()) {
return false;
}
sycl::queue *created_queue = new sycl::queue(devices[device_index],
sycl::property::queue::in_order());
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
}
catch (sycl::exception const &e) {
finished_correct = false;
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
}
return finished_correct;
}
void oneapi_free_queue(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
delete queue;
}
void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}
void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::malloc_device(memory_size, *queue);
}
void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
oneapi_check_usm(queue_, usm_ptr, true);
sycl::free(usm_ptr, *queue);
}
bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
oneapi_check_usm(queue_, dest, true);
oneapi_check_usm(queue_, src, true);
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
try {
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
*/
mem_event.wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
# else
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
src_type == sycl::usm::alloc::unknown;
/* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
* may not wait until the end of the transfer before using the memory.
*/
if (from_device_to_host || host_or_device_memop_with_offset)
mem_event.wait();
return true;
# endif
}
bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
oneapi_check_usm(queue_, usm_ptr, true);
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
# ifdef WITH_CYCLES_DEBUG
try {
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
*/
mem_event.wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
# else
(void)mem_event;
return true;
# endif
}
bool oneapi_queue_synchronize(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
try {
queue->wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
}
/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
* also trigger runtime compilation of all existing oneAPI kernels */
bool oneapi_run_test_kernel(SyclQueue *queue_)
@ -216,60 +68,13 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
return true;
}
bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
{
kernel_global_size = sizeof(KernelGlobalsGPU);
return true;
}
void oneapi_set_global_memory(SyclQueue *queue_,
void *kernel_globals,
const char *memory_name,
void *memory_device_pointer)
{
assert(queue_);
assert(kernel_globals);
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
oneapi_check_usm(queue_, memory_device_pointer);
oneapi_check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
/* This macro will change global ptr of KernelGlobals via name matching. */
# define KERNEL_DATA_ARRAY(type, name) \
else if (#name == matched_name) \
{ \
globals->__##name = (type *)memory_device_pointer; \
return; \
}
if (false) {
}
else if ("integrator_state" == matched_name) {
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
return;
}
KERNEL_DATA_ARRAY(KernelData, data)
# include "kernel/data_arrays.h"
else
{
std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
<< std::endl;
assert(false);
}
# undef KERNEL_DATA_ARRAY
}
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
/* TODO: Move below function to oneapi/queue.cpp. */
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
const DeviceKernel kernel,
const size_t kernel_global_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
assert(queue);
(void)kernel_global_size;
const static size_t preferred_work_group_size_intersect_shading = 32;
const static size_t preferred_work_group_size_technical = 1024;
@ -311,8 +116,10 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
preferred_work_group_size = 512;
}
const size_t limit_work_group_size =
queue->get_device().get_info<sycl::info::device::max_work_group_size>();
const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
->get_device()
.get_info<sycl::info::device::max_work_group_size>();
return std::min(limit_work_group_size, preferred_work_group_size);
}
@ -664,266 +471,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
# endif
return success;
}
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
/* The same min compute-runtime version is currently required across Windows and Linux.
* For Windows driver 101.3430, compute-runtime version is 23904. */
static const int lowest_supported_driver_version_win = 1013430;
static const int lowest_supported_driver_version_neo = 23904;
static int parse_driver_build_version(const sycl::device &device)
{
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
int driver_build_version = 0;
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
if (second_dot_position == std::string::npos) {
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
else {
try {
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
if (third_dot_position != std::string::npos) {
const std::string &third_number_substr = driver_version.substr(
second_dot_position + 1, third_dot_position - second_dot_position - 1);
const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
driver_build_version = std::stoi(third_number_substr) * 10000 +
std::stoi(forth_number_substr);
}
else {
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
driver_build_version = std::stoi(third_number_substr);
}
}
catch (std::invalid_argument &e) {
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
}
return driver_build_version;
}
static std::vector<sycl::device> oneapi_available_devices()
{
bool allow_all_devices = false;
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
allow_all_devices = true;
/* Host device is useful only for debugging at the moment
* so we hide this device with default build settings. */
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
bool allow_host = true;
# else
bool allow_host = false;
# endif
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
std::vector<sycl::device> available_devices;
for (const sycl::platform &platform : oneapi_platforms) {
/* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
*/
if (platform.get_backend() == sycl::backend::opencl) {
continue;
}
const std::vector<sycl::device> &oneapi_devices =
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
if (allow_all_devices) {
/* still filter out host device if build doesn't support it. */
if (allow_host || !device.is_host()) {
available_devices.push_back(device);
}
}
else {
bool filter_out = false;
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
* Official support can be broaden to older and smaller GPUs once ready. */
if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
/* Filtered-out defaults in-case these values aren't available through too old L0
* runtime. */
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
threads_per_eu =
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
if (number_of_eus <= 96 && threads_per_eu == 7) {
filter_out = true;
}
/* if not already filtered out, check driver version. */
if (!filter_out) {
int driver_build_version = parse_driver_build_version(device);
if ((driver_build_version > 100000 &&
driver_build_version < lowest_supported_driver_version_win) ||
driver_build_version < lowest_supported_driver_version_neo) {
filter_out = true;
}
}
}
else if (!allow_host && device.is_host()) {
filter_out = true;
}
else if (!allow_all_devices) {
filter_out = true;
}
if (!filter_out) {
available_devices.push_back(device);
}
}
}
}
return available_devices;
}
char *oneapi_device_capabilities()
{
std::stringstream capabilities;
const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
for (const sycl::device &device : oneapi_devices) {
const std::string &name = device.get_info<sycl::info::device::name>();
capabilities << std::string("\t") << name << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
<< "\n";
# define GET_NUM_ATTR(attribute) \
{ \
size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
}
GET_NUM_ATTR(vendor_id)
GET_NUM_ATTR(max_compute_units)
GET_NUM_ATTR(max_work_item_dimensions)
sycl::id<3> max_work_item_sizes =
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
GET_NUM_ATTR(max_work_group_size)
GET_NUM_ATTR(max_num_sub_groups)
GET_NUM_ATTR(sub_group_independent_forward_progress)
GET_NUM_ATTR(preferred_vector_width_char)
GET_NUM_ATTR(preferred_vector_width_short)
GET_NUM_ATTR(preferred_vector_width_int)
GET_NUM_ATTR(preferred_vector_width_long)
GET_NUM_ATTR(preferred_vector_width_float)
GET_NUM_ATTR(preferred_vector_width_double)
GET_NUM_ATTR(preferred_vector_width_half)
GET_NUM_ATTR(native_vector_width_char)
GET_NUM_ATTR(native_vector_width_short)
GET_NUM_ATTR(native_vector_width_int)
GET_NUM_ATTR(native_vector_width_long)
GET_NUM_ATTR(native_vector_width_float)
GET_NUM_ATTR(native_vector_width_double)
GET_NUM_ATTR(native_vector_width_half)
size_t max_clock_frequency =
(size_t)(device.is_host() ? (size_t)0 :
device.get_info<sycl::info::device::max_clock_frequency>());
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
GET_NUM_ATTR(address_bits)
GET_NUM_ATTR(max_mem_alloc_size)
/* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
* supported so we always return false, even if device supports HW texture usage acceleration.
*/
bool image_support = false;
WRITE_ATTR("image_support", (size_t)image_support)
GET_NUM_ATTR(max_parameter_size)
GET_NUM_ATTR(mem_base_addr_align)
GET_NUM_ATTR(global_mem_size)
GET_NUM_ATTR(local_mem_size)
GET_NUM_ATTR(error_correction_support)
GET_NUM_ATTR(profiling_timer_resolution)
GET_NUM_ATTR(is_available)
# undef GET_NUM_ATTR
# undef WRITE_ATTR
capabilities << "\n";
}
return ::strdup(capabilities.str().c_str());
}
void oneapi_free(void *p)
{
if (p) {
::free(p);
}
}
void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
int num = 0;
std::vector<sycl::device> devices = oneapi_available_devices();
for (sycl::device &device : devices) {
const std::string &platform_name =
device.get_platform().get_info<sycl::info::platform::name>();
std::string name = device.get_info<sycl::info::device::name>();
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, user_ptr);
num++;
}
}
size_t oneapi_get_memcapacity(SyclQueue *queue)
{
return reinterpret_cast<sycl::queue *>(queue)
->get_device()
.get_info<sycl::info::device::global_mem_size>();
}
int oneapi_get_num_multiprocessors(SyclQueue *queue)
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
else
return 0;
}
int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
else
return 0;
}
#endif /* WITH_ONEAPI */

View File

@ -25,11 +25,6 @@ enum DeviceKernel : int;
class SyclQueue;
typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
const char *name,
int num,
void *user_ptr);
typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
struct KernelContext {
@ -45,13 +40,15 @@ struct KernelContext {
extern "C" {
# endif
# define DLL_INTERFACE_CALL(function, return_type, ...) \
CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
# include "kernel/device/oneapi/dll_interface_template.h"
# undef DLL_INTERFACE_CALL
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
int kernel,
size_t global_size,
void **args);
# ifdef __cplusplus
}
# endif
#endif /* WITH_ONEAPI */

View File

@ -1076,6 +1076,13 @@ elseif(WIN32)
DESTINATION ${TARGETDIR_VER}/python/lib/site-packages
)
endif()
if(PLATFORM_BUNDLED_LIBRARIES)
install(
FILES ${PLATFORM_BUNDLED_LIBRARIES}
DESTINATION ${TARGETDIR_LIB}
)
endif()
elseif(APPLE)
if(NOT WITH_PYTHON_MODULE)
# Uppercase name for app bundle.