Cycles: use direct linking for oneAPI backend
This is a minimal set of changes, allowing a lot of cleanup that can happen afterward as it allows sycl method and objects to be used outside of kernel.cpp. Reviewed By: brecht, sergey Differential Revision: https://developer.blender.org/D15397
This commit is contained in:
parent
fc0b1627eb
commit
7eeeaec6da
|
@ -335,10 +335,18 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
set(LEVEL_ZERO_ROOT_DIR ${CYCLES_LEVEL_ZERO})
|
||||
endif()
|
||||
|
||||
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to DPC++ and SYCL installation")
|
||||
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to oneAPI DPC++ compiler")
|
||||
if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
|
||||
set(SYCL_ROOT_DIR ${CYCLES_SYCL})
|
||||
endif()
|
||||
file(GLOB _sycl_runtime_libraries
|
||||
${SYCL_ROOT_DIR}/lib/libsycl.so
|
||||
${SYCL_ROOT_DIR}/lib/libsycl.so.[0-9]
|
||||
${SYCL_ROOT_DIR}/lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
|
||||
${SYCL_ROOT_DIR}/lib/libpi_level_zero.so
|
||||
)
|
||||
list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
|
||||
unset(_sycl_runtime_libraries)
|
||||
endif()
|
||||
|
||||
if(WITH_OPENVDB)
|
||||
|
|
|
@ -952,5 +952,17 @@ endif()
|
|||
set(ZSTD_INCLUDE_DIRS ${LIBDIR}/zstd/include)
|
||||
set(ZSTD_LIBRARIES ${LIBDIR}/zstd/lib/zstd_static.lib)
|
||||
|
||||
set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
|
||||
set(SYCL_ROOT_DIR ${LIBDIR}/dpcpp)
|
||||
if(WITH_CYCLES_DEVICE_ONEAPI)
|
||||
set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
|
||||
set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to oneAPI DPC++ compiler")
|
||||
if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
|
||||
set(SYCL_ROOT_DIR ${CYCLES_SYCL})
|
||||
endif()
|
||||
file(GLOB _sycl_runtime_libraries
|
||||
${SYCL_ROOT_DIR}/bin/sycl.dll
|
||||
${SYCL_ROOT_DIR}/bin/sycl[0-9].dll
|
||||
${SYCL_ROOT_DIR}/bin/pi_level_zero.dll
|
||||
)
|
||||
list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
|
||||
unset(_sycl_runtime_libraries)
|
||||
endif()
|
||||
|
|
|
@ -142,7 +142,6 @@ set(SRC
|
|||
${SRC_DUMMY}
|
||||
${SRC_MULTI}
|
||||
${SRC_OPTIX}
|
||||
${SRC_ONEAPI}
|
||||
${SRC_HEADERS}
|
||||
)
|
||||
|
||||
|
@ -188,7 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
|
|||
)
|
||||
endif()
|
||||
if (WITH_CYCLES_DEVICE_ONEAPI)
|
||||
if(WIN32)
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
|
||||
else()
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
|
||||
endif()
|
||||
list(APPEND LIB
|
||||
${SYCL_LIBRARY}
|
||||
${cycles_kernel_oneapi_lib}
|
||||
)
|
||||
add_definitions(-DWITH_ONEAPI)
|
||||
list(APPEND SRC
|
||||
${SRC_ONEAPI}
|
||||
)
|
||||
list(APPEND INC_SYS
|
||||
${SYCL_INCLUDE_DIR}
|
||||
)
|
||||
endif()
|
||||
|
||||
if(WITH_OPENIMAGEDENOISE)
|
||||
|
|
|
@ -19,62 +19,12 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
static OneAPIDLLInterface oneapi_dll;
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path))
|
||||
# define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() GetLastError()
|
||||
# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle)
|
||||
# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name)
|
||||
#elif __linux__
|
||||
# define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW)
|
||||
# define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() dlerror()
|
||||
# define FREE_SHARED_LIBRARY(handle) dlclose(handle)
|
||||
# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name)
|
||||
#endif
|
||||
|
||||
bool device_oneapi_init()
|
||||
{
|
||||
#if !defined(WITH_ONEAPI)
|
||||
return false;
|
||||
#else
|
||||
|
||||
string lib_path = path_get("lib");
|
||||
# ifdef _WIN32
|
||||
lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll");
|
||||
# else
|
||||
lib_path = path_join(lib_path, "cycles_kernel_oneapi.so");
|
||||
# endif
|
||||
void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str());
|
||||
|
||||
/* This shouldn't happen, but it still makes sense to have a branch for this. */
|
||||
if (lib_handle == NULL) {
|
||||
LOG(ERROR) << "oneAPI kernel shared library cannot be loaded: "
|
||||
<< LOAD_ONEAPI_SHARED_LIBRARY_ERROR();
|
||||
return false;
|
||||
}
|
||||
|
||||
# define DLL_INTERFACE_CALL(function, return_type, ...) \
|
||||
(oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \
|
||||
GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \
|
||||
if (oneapi_dll.function == NULL) { \
|
||||
LOG(ERROR) << "oneAPI shared library function \"" << #function \
|
||||
<< "\" has not been loaded from kernel shared - disable oneAPI " \
|
||||
"library disable oneAPI implementation due to this"; \
|
||||
FREE_SHARED_LIBRARY(lib_handle); \
|
||||
return false; \
|
||||
}
|
||||
# include "kernel/device/oneapi/dll_interface_template.h"
|
||||
# undef DLL_INTERFACE_CALL
|
||||
|
||||
VLOG_INFO << "oneAPI kernel shared library has been loaded successfully";
|
||||
|
||||
/* We need to have this oneapi kernel shared library during all life-span of the Blender.
|
||||
* So it is not unloaded because of this.
|
||||
* FREE_SHARED_LIBRARY(lib_handle); */
|
||||
|
||||
/* NOTE(@nsirgien): we need to enable JIT cache from here and
|
||||
* right now this cache policy is controlled by env. variables. */
|
||||
/* NOTE(hallade) we also disable use of copy engine as it
|
||||
|
@ -109,17 +59,10 @@ bool device_oneapi_init()
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(__linux__)
|
||||
# undef LOAD_SYCL_SHARED_LIBRARY
|
||||
# undef LOAD_ONEAPI_SHARED_LIBRARY
|
||||
# undef FREE_SHARED_LIBRARY
|
||||
# undef GET_SHARED_LIBRARY_SYMBOL
|
||||
#endif
|
||||
|
||||
Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
{
|
||||
#ifdef WITH_ONEAPI
|
||||
return new OneapiDevice(info, oneapi_dll, stats, profiler);
|
||||
return new OneapiDevice(info, stats, profiler);
|
||||
#else
|
||||
(void)info;
|
||||
(void)stats;
|
||||
|
@ -165,7 +108,7 @@ static void device_iterator_cb(const char *id, const char *name, int num, void *
|
|||
void device_oneapi_info(vector<DeviceInfo> &devices)
|
||||
{
|
||||
#ifdef WITH_ONEAPI
|
||||
(oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices);
|
||||
OneapiDevice::iterate_devices(device_iterator_cb, &devices);
|
||||
#else /* WITH_ONEAPI */
|
||||
(void)devices;
|
||||
#endif /* WITH_ONEAPI */
|
||||
|
@ -175,10 +118,10 @@ string device_oneapi_capabilities()
|
|||
{
|
||||
string capabilities;
|
||||
#ifdef WITH_ONEAPI
|
||||
char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)();
|
||||
char *c_capabilities = OneapiDevice::device_capabilities();
|
||||
if (c_capabilities) {
|
||||
capabilities = c_capabilities;
|
||||
(oneapi_dll.oneapi_free)(c_capabilities);
|
||||
free(c_capabilities);
|
||||
}
|
||||
#endif
|
||||
return capabilities;
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
# include "util/debug.h"
|
||||
# include "util/log.h"
|
||||
|
||||
# include "kernel/device/oneapi/kernel.h"
|
||||
# include "kernel/device/oneapi/globals.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -19,26 +19,19 @@ static void queue_error_cb(const char *message, void *user_ptr)
|
|||
}
|
||||
}
|
||||
|
||||
OneapiDevice::OneapiDevice(const DeviceInfo &info,
|
||||
OneAPIDLLInterface &oneapi_dll_object,
|
||||
Stats &stats,
|
||||
Profiler &profiler)
|
||||
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
: Device(info, stats, profiler),
|
||||
device_queue_(nullptr),
|
||||
texture_info_(this, "texture_info", MEM_GLOBAL),
|
||||
kg_memory_(nullptr),
|
||||
kg_memory_device_(nullptr),
|
||||
kg_memory_size_(0),
|
||||
oneapi_dll_(oneapi_dll_object)
|
||||
kg_memory_size_(0)
|
||||
{
|
||||
need_texture_info_ = false;
|
||||
|
||||
oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
|
||||
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
|
||||
|
||||
/* OneAPI calls should be initialized on this moment. */
|
||||
assert(oneapi_dll_.oneapi_create_queue != nullptr);
|
||||
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
|
||||
bool is_finished_ok = create_queue(device_queue_, info.num);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI queue initialization error: got runtime exception \"" +
|
||||
oneapi_error_string_ + "\"");
|
||||
|
@ -50,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
|
|||
}
|
||||
|
||||
size_t globals_segment_size;
|
||||
is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
|
||||
is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI constant memory initialization got runtime exception \"" +
|
||||
oneapi_error_string_ + "\"");
|
||||
|
@ -59,27 +52,27 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
|
|||
VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
|
||||
}
|
||||
|
||||
kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
|
||||
oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
|
||||
kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
|
||||
usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
|
||||
|
||||
kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
|
||||
kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);
|
||||
|
||||
kg_memory_size_ = globals_segment_size;
|
||||
|
||||
max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
|
||||
max_memory_on_device_ = get_memcapacity();
|
||||
}
|
||||
|
||||
OneapiDevice::~OneapiDevice()
|
||||
{
|
||||
texture_info_.free();
|
||||
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
|
||||
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
|
||||
usm_free(device_queue_, kg_memory_);
|
||||
usm_free(device_queue_, kg_memory_device_);
|
||||
|
||||
for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
|
||||
delete mt->second;
|
||||
|
||||
if (device_queue_)
|
||||
oneapi_dll_.oneapi_free_queue(device_queue_);
|
||||
free_queue(device_queue_);
|
||||
}
|
||||
|
||||
bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
|
||||
|
@ -99,7 +92,7 @@ bool OneapiDevice::load_kernels(const uint requested_features)
|
|||
* with specialization constants, but it hasn't been implemented yet. */
|
||||
(void)requested_features;
|
||||
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
|
||||
bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
|
||||
}
|
||||
|
@ -138,7 +131,7 @@ void OneapiDevice::generic_alloc(device_memory &mem)
|
|||
* type has been used for oneAPI device in order to better fit in Cycles architecture. */
|
||||
void *device_pointer = nullptr;
|
||||
if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
|
||||
device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
|
||||
device_pointer = usm_alloc_device(device_queue_, memory_size);
|
||||
if (device_pointer == nullptr) {
|
||||
set_error("oneAPI kernel - device memory allocation error for " +
|
||||
string_human_readable_size(mem.memory_size()) +
|
||||
|
@ -163,8 +156,7 @@ void OneapiDevice::generic_copy_to(device_memory &mem)
|
|||
/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
|
||||
assert(mem.host_pointer);
|
||||
assert(device_queue_);
|
||||
oneapi_dll_.oneapi_usm_memcpy(
|
||||
device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
|
||||
usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
|
||||
}
|
||||
|
||||
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
|
||||
|
@ -178,11 +170,6 @@ string OneapiDevice::oneapi_error_message()
|
|||
return string(oneapi_error_string_);
|
||||
}
|
||||
|
||||
OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
|
||||
{
|
||||
return oneapi_dll_;
|
||||
}
|
||||
|
||||
void *OneapiDevice::kernel_globals_device_pointer()
|
||||
{
|
||||
return kg_memory_device_;
|
||||
|
@ -198,7 +185,7 @@ void OneapiDevice::generic_free(device_memory &mem)
|
|||
mem.device_size = 0;
|
||||
|
||||
assert(device_queue_);
|
||||
oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
|
||||
usm_free(device_queue_, (void *)mem.device_pointer);
|
||||
mem.device_pointer = 0;
|
||||
}
|
||||
|
||||
|
@ -266,8 +253,7 @@ void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t
|
|||
if (mem.device_pointer) {
|
||||
char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
|
||||
char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
|
||||
device_queue_, shifted_host, shifted_device, size);
|
||||
bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI memory operation error: got runtime exception \"" +
|
||||
oneapi_error_string_ + "\"");
|
||||
|
@ -292,7 +278,7 @@ void OneapiDevice::mem_zero(device_memory &mem)
|
|||
}
|
||||
|
||||
assert(device_queue_);
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
|
||||
bool is_finished_ok = usm_memset(
|
||||
device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
|
||||
if (is_finished_ok == false) {
|
||||
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
|
||||
|
@ -349,10 +335,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
|
|||
memcpy(data->data(), host, size);
|
||||
data->copy_to_device();
|
||||
|
||||
oneapi_dll_.oneapi_set_global_memory(
|
||||
device_queue_, kg_memory_, name, (void *)data->device_pointer);
|
||||
set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);
|
||||
|
||||
oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
|
||||
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
|
||||
}
|
||||
|
||||
void OneapiDevice::global_alloc(device_memory &mem)
|
||||
|
@ -367,10 +352,9 @@ void OneapiDevice::global_alloc(device_memory &mem)
|
|||
generic_alloc(mem);
|
||||
generic_copy_to(mem);
|
||||
|
||||
oneapi_dll_.oneapi_set_global_memory(
|
||||
device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
|
||||
set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
|
||||
|
||||
oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
|
||||
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
|
||||
}
|
||||
|
||||
void OneapiDevice::global_free(device_memory &mem)
|
||||
|
@ -410,18 +394,6 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
|
|||
return make_unique<OneapiDeviceQueue>(this);
|
||||
}
|
||||
|
||||
int OneapiDevice::get_num_multiprocessors()
|
||||
{
|
||||
assert(device_queue_);
|
||||
return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
|
||||
}
|
||||
|
||||
int OneapiDevice::get_max_num_threads_per_multiprocessor()
|
||||
{
|
||||
assert(device_queue_);
|
||||
return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
|
||||
}
|
||||
|
||||
bool OneapiDevice::should_use_graphics_interop()
|
||||
{
|
||||
/* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
|
||||
|
@ -432,13 +404,460 @@ bool OneapiDevice::should_use_graphics_interop()
|
|||
void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
|
||||
{
|
||||
assert(device_queue_);
|
||||
return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
|
||||
return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
|
||||
}
|
||||
|
||||
void OneapiDevice::usm_free(void *usm_ptr)
|
||||
{
|
||||
assert(device_queue_);
|
||||
return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
|
||||
return usm_free(device_queue_, usm_ptr);
|
||||
}
|
||||
|
||||
void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
|
||||
{
|
||||
# ifdef _DEBUG
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
sycl::info::device_type device_type =
|
||||
queue->get_device().get_info<sycl::info::device::device_type>();
|
||||
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
|
||||
(void)usm_type;
|
||||
assert(usm_type == sycl::usm::alloc::device ||
|
||||
((device_type == sycl::info::device_type::host ||
|
||||
device_type == sycl::info::device_type::cpu || allow_host) &&
|
||||
usm_type == sycl::usm::alloc::host));
|
||||
# endif
|
||||
}
|
||||
|
||||
bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
|
||||
{
|
||||
bool finished_correct = true;
|
||||
try {
|
||||
std::vector<sycl::device> devices = OneapiDevice::available_devices();
|
||||
if (device_index < 0 || device_index >= devices.size()) {
|
||||
return false;
|
||||
}
|
||||
sycl::queue *created_queue = new sycl::queue(devices[device_index],
|
||||
sycl::property::queue::in_order());
|
||||
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
finished_correct = false;
|
||||
oneapi_error_string_ = e.what();
|
||||
}
|
||||
return finished_correct;
|
||||
}
|
||||
|
||||
void OneapiDevice::free_queue(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
delete queue;
|
||||
}
|
||||
|
||||
void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
|
||||
}
|
||||
|
||||
void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
return sycl::malloc_device(memory_size, *queue);
|
||||
}
|
||||
|
||||
void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
OneapiDevice::check_usm(queue_, usm_ptr, true);
|
||||
sycl::free(usm_ptr, *queue);
|
||||
}
|
||||
|
||||
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
OneapiDevice::check_usm(queue_, dest, true);
|
||||
OneapiDevice::check_usm(queue_, src, true);
|
||||
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
|
||||
# ifdef WITH_CYCLES_DEBUG
|
||||
try {
|
||||
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
|
||||
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
|
||||
*/
|
||||
mem_event.wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
oneapi_error_string_ = e.what();
|
||||
return false;
|
||||
}
|
||||
# else
|
||||
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
|
||||
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
|
||||
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
|
||||
src_type == sycl::usm::alloc::device;
|
||||
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
|
||||
src_type == sycl::usm::alloc::unknown;
|
||||
/* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
|
||||
* may not wait until the end of the transfer before using the memory.
|
||||
*/
|
||||
if (from_device_to_host || host_or_device_memop_with_offset)
|
||||
mem_event.wait();
|
||||
return true;
|
||||
# endif
|
||||
}
|
||||
|
||||
bool OneapiDevice::usm_memset(SyclQueue *queue_,
|
||||
void *usm_ptr,
|
||||
unsigned char value,
|
||||
size_t num_bytes)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
OneapiDevice::check_usm(queue_, usm_ptr, true);
|
||||
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
|
||||
# ifdef WITH_CYCLES_DEBUG
|
||||
try {
|
||||
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
|
||||
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
|
||||
*/
|
||||
mem_event.wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
oneapi_error_string_ = e.what();
|
||||
return false;
|
||||
}
|
||||
# else
|
||||
(void)mem_event;
|
||||
return true;
|
||||
# endif
|
||||
}
|
||||
|
||||
bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
try {
|
||||
queue->wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
oneapi_error_string_ = e.what();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
|
||||
{
|
||||
kernel_global_size = sizeof(KernelGlobalsGPU);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void OneapiDevice::set_global_memory(SyclQueue *queue_,
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer)
|
||||
{
|
||||
assert(queue_);
|
||||
assert(kernel_globals);
|
||||
assert(memory_name);
|
||||
assert(memory_device_pointer);
|
||||
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
|
||||
OneapiDevice::check_usm(queue_, memory_device_pointer);
|
||||
OneapiDevice::check_usm(queue_, kernel_globals, true);
|
||||
|
||||
std::string matched_name(memory_name);
|
||||
|
||||
/* This macro will change global ptr of KernelGlobals via name matching. */
|
||||
# define KERNEL_DATA_ARRAY(type, name) \
|
||||
else if (#name == matched_name) \
|
||||
{ \
|
||||
globals->__##name = (type *)memory_device_pointer; \
|
||||
return; \
|
||||
}
|
||||
if (false) {
|
||||
}
|
||||
else if ("integrator_state" == matched_name) {
|
||||
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
|
||||
return;
|
||||
}
|
||||
KERNEL_DATA_ARRAY(KernelData, data)
|
||||
# include "kernel/data_arrays.h"
|
||||
else
|
||||
{
|
||||
std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
|
||||
<< std::endl;
|
||||
assert(false);
|
||||
}
|
||||
# undef KERNEL_DATA_ARRAY
|
||||
}
|
||||
|
||||
bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
void **args)
|
||||
{
|
||||
return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
|
||||
}
|
||||
|
||||
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
|
||||
* since Windows driver 101.3268. */
|
||||
/* The same min compute-runtime version is currently required across Windows and Linux.
|
||||
* For Windows driver 101.3430, compute-runtime version is 23904. */
|
||||
static const int lowest_supported_driver_version_win = 1013430;
|
||||
static const int lowest_supported_driver_version_neo = 23904;
|
||||
|
||||
int OneapiDevice::parse_driver_build_version(const sycl::device &device)
|
||||
{
|
||||
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
|
||||
int driver_build_version = 0;
|
||||
|
||||
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
|
||||
if (second_dot_position == std::string::npos) {
|
||||
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
|
||||
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
|
||||
<< " xx.xx.xxx.xxxx (Windows) for device \""
|
||||
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
|
||||
}
|
||||
else {
|
||||
try {
|
||||
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
|
||||
if (third_dot_position != std::string::npos) {
|
||||
const std::string &third_number_substr = driver_version.substr(
|
||||
second_dot_position + 1, third_dot_position - second_dot_position - 1);
|
||||
const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
|
||||
if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
|
||||
driver_build_version = std::stoi(third_number_substr) * 10000 +
|
||||
std::stoi(forth_number_substr);
|
||||
}
|
||||
else {
|
||||
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
|
||||
driver_build_version = std::stoi(third_number_substr);
|
||||
}
|
||||
}
|
||||
catch (std::invalid_argument &) {
|
||||
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
|
||||
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
|
||||
<< " xx.xx.xxx.xxxx (Windows) for device \""
|
||||
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return driver_build_version;
|
||||
}
|
||||
|
||||
std::vector<sycl::device> OneapiDevice::available_devices()
|
||||
{
|
||||
bool allow_all_devices = false;
|
||||
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
|
||||
allow_all_devices = true;
|
||||
|
||||
/* Host device is useful only for debugging at the moment
|
||||
* so we hide this device with default build settings. */
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
bool allow_host = true;
|
||||
# else
|
||||
bool allow_host = false;
|
||||
# endif
|
||||
|
||||
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
|
||||
|
||||
std::vector<sycl::device> available_devices;
|
||||
for (const sycl::platform &platform : oneapi_platforms) {
|
||||
/* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
|
||||
*/
|
||||
if (platform.get_backend() == sycl::backend::opencl) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices =
|
||||
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
|
||||
platform.get_devices(sycl::info::device_type::gpu);
|
||||
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
if (allow_all_devices) {
|
||||
/* still filter out host device if build doesn't support it. */
|
||||
if (allow_host || !device.is_host()) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
else {
|
||||
bool filter_out = false;
|
||||
|
||||
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
|
||||
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
|
||||
* Official support can be broaden to older and smaller GPUs once ready. */
|
||||
if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
|
||||
/* Filtered-out defaults in-case these values aren't available through too old L0
|
||||
* runtime. */
|
||||
int number_of_eus = 96;
|
||||
int threads_per_eu = 7;
|
||||
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
|
||||
number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
|
||||
}
|
||||
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
|
||||
threads_per_eu =
|
||||
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
|
||||
}
|
||||
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
|
||||
if (number_of_eus <= 96 && threads_per_eu == 7) {
|
||||
filter_out = true;
|
||||
}
|
||||
/* if not already filtered out, check driver version. */
|
||||
if (!filter_out) {
|
||||
int driver_build_version = parse_driver_build_version(device);
|
||||
if ((driver_build_version > 100000 &&
|
||||
driver_build_version < lowest_supported_driver_version_win) ||
|
||||
driver_build_version < lowest_supported_driver_version_neo) {
|
||||
filter_out = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!allow_host && device.is_host()) {
|
||||
filter_out = true;
|
||||
}
|
||||
else if (!allow_all_devices) {
|
||||
filter_out = true;
|
||||
}
|
||||
|
||||
if (!filter_out) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return available_devices;
|
||||
}
|
||||
|
||||
char *OneapiDevice::device_capabilities()
|
||||
{
|
||||
std::stringstream capabilities;
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices = available_devices();
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
const std::string &name = device.get_info<sycl::info::device::name>();
|
||||
|
||||
capabilities << std::string("\t") << name << "\n";
|
||||
# define WRITE_ATTR(attribute_name, attribute_variable) \
|
||||
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
|
||||
<< "\n";
|
||||
# define GET_NUM_ATTR(attribute) \
|
||||
{ \
|
||||
size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
|
||||
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
|
||||
}
|
||||
|
||||
GET_NUM_ATTR(vendor_id)
|
||||
GET_NUM_ATTR(max_compute_units)
|
||||
GET_NUM_ATTR(max_work_item_dimensions)
|
||||
|
||||
sycl::id<3> max_work_item_sizes =
|
||||
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
|
||||
WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
|
||||
WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
|
||||
WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
|
||||
|
||||
GET_NUM_ATTR(max_work_group_size)
|
||||
GET_NUM_ATTR(max_num_sub_groups)
|
||||
GET_NUM_ATTR(sub_group_independent_forward_progress)
|
||||
|
||||
GET_NUM_ATTR(preferred_vector_width_char)
|
||||
GET_NUM_ATTR(preferred_vector_width_short)
|
||||
GET_NUM_ATTR(preferred_vector_width_int)
|
||||
GET_NUM_ATTR(preferred_vector_width_long)
|
||||
GET_NUM_ATTR(preferred_vector_width_float)
|
||||
GET_NUM_ATTR(preferred_vector_width_double)
|
||||
GET_NUM_ATTR(preferred_vector_width_half)
|
||||
|
||||
GET_NUM_ATTR(native_vector_width_char)
|
||||
GET_NUM_ATTR(native_vector_width_short)
|
||||
GET_NUM_ATTR(native_vector_width_int)
|
||||
GET_NUM_ATTR(native_vector_width_long)
|
||||
GET_NUM_ATTR(native_vector_width_float)
|
||||
GET_NUM_ATTR(native_vector_width_double)
|
||||
GET_NUM_ATTR(native_vector_width_half)
|
||||
|
||||
size_t max_clock_frequency =
|
||||
(size_t)(device.is_host() ? (size_t)0 :
|
||||
device.get_info<sycl::info::device::max_clock_frequency>());
|
||||
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
|
||||
|
||||
GET_NUM_ATTR(address_bits)
|
||||
GET_NUM_ATTR(max_mem_alloc_size)
|
||||
|
||||
/* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
|
||||
* supported so we always return false, even if device supports HW texture usage acceleration.
|
||||
*/
|
||||
bool image_support = false;
|
||||
WRITE_ATTR("image_support", (size_t)image_support)
|
||||
|
||||
GET_NUM_ATTR(max_parameter_size)
|
||||
GET_NUM_ATTR(mem_base_addr_align)
|
||||
GET_NUM_ATTR(global_mem_size)
|
||||
GET_NUM_ATTR(local_mem_size)
|
||||
GET_NUM_ATTR(error_correction_support)
|
||||
GET_NUM_ATTR(profiling_timer_resolution)
|
||||
GET_NUM_ATTR(is_available)
|
||||
|
||||
# undef GET_NUM_ATTR
|
||||
# undef WRITE_ATTR
|
||||
capabilities << "\n";
|
||||
}
|
||||
|
||||
return ::strdup(capabilities.str().c_str());
|
||||
}
|
||||
|
||||
void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
|
||||
{
|
||||
int num = 0;
|
||||
std::vector<sycl::device> devices = OneapiDevice::available_devices();
|
||||
for (sycl::device &device : devices) {
|
||||
const std::string &platform_name =
|
||||
device.get_platform().get_info<sycl::info::platform::name>();
|
||||
std::string name = device.get_info<sycl::info::device::name>();
|
||||
std::string id = "ONEAPI_" + platform_name + "_" + name;
|
||||
if (device.has(sycl::aspect::ext_intel_pci_address)) {
|
||||
id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
|
||||
}
|
||||
(cb)(id.c_str(), name.c_str(), num, user_ptr);
|
||||
num++;
|
||||
}
|
||||
}
|
||||
|
||||
size_t OneapiDevice::get_memcapacity()
|
||||
{
|
||||
return reinterpret_cast<sycl::queue *>(device_queue_)
|
||||
->get_device()
|
||||
.get_info<sycl::info::device::global_mem_size>();
|
||||
}
|
||||
|
||||
int OneapiDevice::get_num_multiprocessors()
|
||||
{
|
||||
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
|
||||
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
|
||||
return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
int OneapiDevice::get_max_num_threads_per_multiprocessor()
|
||||
{
|
||||
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
|
||||
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
|
||||
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
|
||||
return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
|
||||
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -3,9 +3,12 @@
|
|||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
# include <CL/sycl.hpp>
|
||||
|
||||
# include "device/device.h"
|
||||
# include "device/oneapi/device.h"
|
||||
# include "device/oneapi/queue.h"
|
||||
# include "kernel/device/oneapi/kernel.h"
|
||||
|
||||
# include "util/map.h"
|
||||
|
||||
|
@ -13,6 +16,11 @@ CCL_NAMESPACE_BEGIN
|
|||
|
||||
class DeviceQueue;
|
||||
|
||||
typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
|
||||
const char *name,
|
||||
int num,
|
||||
void *user_ptr);
|
||||
|
||||
class OneapiDevice : public Device {
|
||||
private:
|
||||
SyclQueue *device_queue_;
|
||||
|
@ -25,16 +33,12 @@ class OneapiDevice : public Device {
|
|||
void *kg_memory_device_;
|
||||
size_t kg_memory_size_ = (size_t)0;
|
||||
size_t max_memory_on_device_ = (size_t)0;
|
||||
OneAPIDLLInterface oneapi_dll_;
|
||||
std::string oneapi_error_string_;
|
||||
|
||||
public:
|
||||
virtual BVHLayoutMask get_bvh_layout_mask() const override;
|
||||
|
||||
OneapiDevice(const DeviceInfo &info,
|
||||
OneAPIDLLInterface &oneapi_dll_object,
|
||||
Stats &stats,
|
||||
Profiler &profiler);
|
||||
OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
|
||||
|
||||
virtual ~OneapiDevice();
|
||||
|
||||
|
@ -50,12 +54,8 @@ class OneapiDevice : public Device {
|
|||
|
||||
void generic_free(device_memory &mem);
|
||||
|
||||
SyclQueue *sycl_queue();
|
||||
|
||||
string oneapi_error_message();
|
||||
|
||||
OneAPIDLLInterface oneapi_dll_object();
|
||||
|
||||
void *kernel_globals_device_pointer();
|
||||
|
||||
void mem_alloc(device_memory &mem) override;
|
||||
|
@ -90,13 +90,37 @@ class OneapiDevice : public Device {
|
|||
|
||||
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
|
||||
|
||||
int get_num_multiprocessors();
|
||||
int get_max_num_threads_per_multiprocessor();
|
||||
|
||||
/* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
|
||||
* side compilation (MSVC). */
|
||||
void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
|
||||
void usm_free(void *usm_ptr);
|
||||
|
||||
static std::vector<sycl::device> available_devices();
|
||||
static char *device_capabilities();
|
||||
static int parse_driver_build_version(const sycl::device &device);
|
||||
static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);
|
||||
|
||||
size_t get_memcapacity();
|
||||
int get_num_multiprocessors();
|
||||
int get_max_num_threads_per_multiprocessor();
|
||||
bool queue_synchronize(SyclQueue *queue);
|
||||
bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size);
|
||||
void set_global_memory(SyclQueue *queue,
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer);
|
||||
bool enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, void **args);
|
||||
SyclQueue *sycl_queue();
|
||||
|
||||
protected:
|
||||
void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
|
||||
bool create_queue(SyclQueue *&external_queue, int device_index);
|
||||
void free_queue(SyclQueue *queue);
|
||||
void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment);
|
||||
void *usm_alloc_device(SyclQueue *queue, size_t memory_size);
|
||||
void usm_free(SyclQueue *queue, void *usm_ptr);
|
||||
bool usm_memcpy(SyclQueue *queue, void *dest, void *src, size_t num_bytes);
|
||||
bool usm_memset(SyclQueue *queue, void *usm_ptr, unsigned char value, size_t num_bytes);
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -1,17 +0,0 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#pragma once
|
||||
|
||||
/* Include kernel header to get access to SYCL-specific types, like SyclQueue and
|
||||
* OneAPIDeviceIteratorCallback. */
|
||||
#include "kernel/device/oneapi/kernel.h"
|
||||
|
||||
#ifdef WITH_ONEAPI
|
||||
struct OneAPIDLLInterface {
|
||||
# define DLL_INTERFACE_CALL(function, return_type, ...) \
|
||||
return_type (*function)(__VA_ARGS__) = nullptr;
|
||||
# include "kernel/device/oneapi/dll_interface_template.h"
|
||||
# undef DLL_INTERFACE_CALL
|
||||
};
|
||||
#endif
|
|
@ -22,10 +22,7 @@ struct KernelExecutionInfo {
|
|||
/* OneapiDeviceQueue */
|
||||
|
||||
OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
|
||||
: DeviceQueue(device),
|
||||
oneapi_device_(device),
|
||||
oneapi_dll_(device->oneapi_dll_object()),
|
||||
kernel_context_(nullptr)
|
||||
: DeviceQueue(device), oneapi_device_(device), kernel_context_(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -81,14 +78,14 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
assert(signed_kernel_work_size >= 0);
|
||||
size_t kernel_work_size = (size_t)signed_kernel_work_size;
|
||||
|
||||
size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
|
||||
size_t kernel_local_size = oneapi_kernel_preferred_local_size(
|
||||
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
|
||||
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
|
||||
|
||||
assert(kernel_context_);
|
||||
|
||||
/* Call the oneAPI kernel DLL to launch the requested kernel. */
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
|
||||
bool is_finished_ok = oneapi_device_->enqueue_kernel(
|
||||
kernel_context_, kernel, uniformed_kernel_work_size, args);
|
||||
|
||||
if (is_finished_ok == false) {
|
||||
|
@ -108,7 +105,7 @@ bool OneapiDeviceQueue::synchronize()
|
|||
return false;
|
||||
}
|
||||
|
||||
bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
|
||||
bool is_finished_ok = oneapi_device_->queue_synchronize(oneapi_device_->sycl_queue());
|
||||
if (is_finished_ok == false)
|
||||
oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
|
||||
oneapi_device_->oneapi_error_message() + "\"");
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
# include "device/queue.h"
|
||||
|
||||
# include "device/oneapi/device.h"
|
||||
# include "device/oneapi/dll_interface.h"
|
||||
# include "kernel/device/oneapi/kernel.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -41,9 +41,7 @@ class OneapiDeviceQueue : public DeviceQueue {
|
|||
|
||||
protected:
|
||||
OneapiDevice *oneapi_device_;
|
||||
OneAPIDLLInterface oneapi_dll_;
|
||||
KernelContext *kernel_context_;
|
||||
bool with_kernel_statistics_;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -716,7 +716,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
if(WIN32)
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
|
||||
else()
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so)
|
||||
set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so)
|
||||
endif()
|
||||
|
||||
set(cycles_oneapi_kernel_sources
|
||||
|
@ -815,6 +815,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
|
||||
if(WIN32)
|
||||
list(APPEND sycl_compiler_flags
|
||||
-fuse-ld=link
|
||||
-fms-extensions
|
||||
-fms-compatibility
|
||||
-D_WINDLL
|
||||
|
@ -888,33 +889,24 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
# install dynamic libraries required at runtime
|
||||
if(WIN32)
|
||||
set(SYCL_RUNTIME_DEPENDENCIES
|
||||
sycl.dll
|
||||
pi_level_zero.dll
|
||||
)
|
||||
if(NOT WITH_BLENDER)
|
||||
# For the Cycles standalone put libraries next to the Cycles application.
|
||||
delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH})
|
||||
else()
|
||||
# For Blender put the libraries next to the Blender executable.
|
||||
#
|
||||
# Note that the installation path in the delayed_install is relative to the versioned folder,
|
||||
# which means we need to go one level up.
|
||||
delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../")
|
||||
endif()
|
||||
elseif(UNIX AND NOT APPLE)
|
||||
file(GLOB SYCL_RUNTIME_DEPENDENCIES
|
||||
${sycl_compiler_root}/../lib/libsycl.so
|
||||
${sycl_compiler_root}/../lib/libsycl.so.[0-9]
|
||||
${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
|
||||
)
|
||||
list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so)
|
||||
delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
if(NOT WITH_BLENDER)
|
||||
# For the Cycles standalone put libraries next to the Cycles application.
|
||||
set(cycles_oneapi_target_path ${CYCLES_INSTALL_PATH})
|
||||
else()
|
||||
# For Blender put the libraries next to the Blender executable.
|
||||
#
|
||||
# Note that the installation path in the delayed_install is relative to the versioned folder,
|
||||
# which means we need to go one level up.
|
||||
set(cycles_oneapi_target_path "../")
|
||||
endif()
|
||||
|
||||
# install dynamic libraries required at runtime
|
||||
if(WIN32)
|
||||
delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path})
|
||||
elseif(UNIX AND NOT APPLE)
|
||||
delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path}/lib)
|
||||
endif()
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib})
|
||||
endif()
|
||||
|
||||
|
|
|
@ -1,54 +0,0 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2022 Intel Corporation */
|
||||
|
||||
/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
|
||||
DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
|
||||
DLL_INTERFACE_CALL(oneapi_free, void, void *)
|
||||
DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
|
||||
DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
|
||||
DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(
|
||||
oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
|
||||
DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
|
||||
DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
|
||||
|
||||
DLL_INTERFACE_CALL(
|
||||
oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
|
||||
DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
|
||||
DLL_INTERFACE_CALL(oneapi_usm_memset,
|
||||
bool,
|
||||
SyclQueue *queue,
|
||||
void *usm_ptr,
|
||||
unsigned char value,
|
||||
size_t num_bytes)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
|
||||
|
||||
/* Operation with Kernel globals structure - map of global/constant allocation - filled before
|
||||
* render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as
|
||||
* pointer. */
|
||||
DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
|
||||
DLL_INTERFACE_CALL(oneapi_set_global_memory,
|
||||
void,
|
||||
SyclQueue *queue,
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer)
|
||||
|
||||
DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
|
||||
size_t,
|
||||
SyclQueue *queue,
|
||||
const DeviceKernel kernel,
|
||||
const size_t kernel_global_size)
|
||||
DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
|
||||
bool,
|
||||
KernelContext *context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
void **args)
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
#ifdef WITH_ONEAPI
|
||||
|
||||
/* clang-format off */
|
||||
# include "kernel.h"
|
||||
# include <iostream>
|
||||
# include <map>
|
||||
|
@ -16,163 +15,16 @@
|
|||
# include "kernel/device/oneapi/kernel_templates.h"
|
||||
|
||||
# include "kernel/device/gpu/kernel.h"
|
||||
/* clang-format on */
|
||||
|
||||
static OneAPIErrorCallback s_error_cb = nullptr;
|
||||
static void *s_error_user_ptr = nullptr;
|
||||
|
||||
static std::vector<sycl::device> oneapi_available_devices();
|
||||
|
||||
void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
|
||||
{
|
||||
s_error_cb = cb;
|
||||
s_error_user_ptr = user_ptr;
|
||||
}
|
||||
|
||||
void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
|
||||
{
|
||||
# ifdef _DEBUG
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
sycl::info::device_type device_type =
|
||||
queue->get_device().get_info<sycl::info::device::device_type>();
|
||||
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
|
||||
(void)usm_type;
|
||||
assert(usm_type == sycl::usm::alloc::device ||
|
||||
((device_type == sycl::info::device_type::host ||
|
||||
device_type == sycl::info::device_type::is_cpu || allow_host) &&
|
||||
usm_type == sycl::usm::alloc::host));
|
||||
# endif
|
||||
}
|
||||
|
||||
bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
|
||||
{
|
||||
bool finished_correct = true;
|
||||
try {
|
||||
std::vector<sycl::device> devices = oneapi_available_devices();
|
||||
if (device_index < 0 || device_index >= devices.size()) {
|
||||
return false;
|
||||
}
|
||||
sycl::queue *created_queue = new sycl::queue(devices[device_index],
|
||||
sycl::property::queue::in_order());
|
||||
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
finished_correct = false;
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
}
|
||||
return finished_correct;
|
||||
}
|
||||
|
||||
void oneapi_free_queue(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
delete queue;
|
||||
}
|
||||
|
||||
void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
|
||||
}
|
||||
|
||||
void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
return sycl::malloc_device(memory_size, *queue);
|
||||
}
|
||||
|
||||
void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
oneapi_check_usm(queue_, usm_ptr, true);
|
||||
sycl::free(usm_ptr, *queue);
|
||||
}
|
||||
|
||||
bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
oneapi_check_usm(queue_, dest, true);
|
||||
oneapi_check_usm(queue_, src, true);
|
||||
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
|
||||
# ifdef WITH_CYCLES_DEBUG
|
||||
try {
|
||||
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
|
||||
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
|
||||
*/
|
||||
mem_event.wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
# else
|
||||
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
|
||||
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
|
||||
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
|
||||
src_type == sycl::usm::alloc::device;
|
||||
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
|
||||
src_type == sycl::usm::alloc::unknown;
|
||||
/* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
|
||||
* may not wait until the end of the transfer before using the memory.
|
||||
*/
|
||||
if (from_device_to_host || host_or_device_memop_with_offset)
|
||||
mem_event.wait();
|
||||
return true;
|
||||
# endif
|
||||
}
|
||||
|
||||
bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
oneapi_check_usm(queue_, usm_ptr, true);
|
||||
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
|
||||
# ifdef WITH_CYCLES_DEBUG
|
||||
try {
|
||||
/* NOTE(@nsirgien) Waiting on memory operation may give more precise error
|
||||
* messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
|
||||
*/
|
||||
mem_event.wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
# else
|
||||
(void)mem_event;
|
||||
return true;
|
||||
# endif
|
||||
}
|
||||
|
||||
bool oneapi_queue_synchronize(SyclQueue *queue_)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
try {
|
||||
queue->wait_and_throw();
|
||||
return true;
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
if (s_error_cb) {
|
||||
s_error_cb(e.what(), s_error_user_ptr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
|
||||
* also trigger runtime compilation of all existing oneAPI kernels */
|
||||
bool oneapi_run_test_kernel(SyclQueue *queue_)
|
||||
|
@ -216,60 +68,13 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
|
||||
{
|
||||
kernel_global_size = sizeof(KernelGlobalsGPU);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void oneapi_set_global_memory(SyclQueue *queue_,
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer)
|
||||
{
|
||||
assert(queue_);
|
||||
assert(kernel_globals);
|
||||
assert(memory_name);
|
||||
assert(memory_device_pointer);
|
||||
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
|
||||
oneapi_check_usm(queue_, memory_device_pointer);
|
||||
oneapi_check_usm(queue_, kernel_globals, true);
|
||||
|
||||
std::string matched_name(memory_name);
|
||||
|
||||
/* This macro will change global ptr of KernelGlobals via name matching. */
|
||||
# define KERNEL_DATA_ARRAY(type, name) \
|
||||
else if (#name == matched_name) \
|
||||
{ \
|
||||
globals->__##name = (type *)memory_device_pointer; \
|
||||
return; \
|
||||
}
|
||||
if (false) {
|
||||
}
|
||||
else if ("integrator_state" == matched_name) {
|
||||
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
|
||||
return;
|
||||
}
|
||||
KERNEL_DATA_ARRAY(KernelData, data)
|
||||
# include "kernel/data_arrays.h"
|
||||
else
|
||||
{
|
||||
std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
|
||||
<< std::endl;
|
||||
assert(false);
|
||||
}
|
||||
# undef KERNEL_DATA_ARRAY
|
||||
}
|
||||
|
||||
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
|
||||
/* TODO: Move below function to oneapi/queue.cpp. */
|
||||
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
|
||||
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
|
||||
const DeviceKernel kernel,
|
||||
const size_t kernel_global_size)
|
||||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
assert(queue);
|
||||
(void)kernel_global_size;
|
||||
const static size_t preferred_work_group_size_intersect_shading = 32;
|
||||
const static size_t preferred_work_group_size_technical = 1024;
|
||||
|
@ -311,8 +116,10 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
|
|||
preferred_work_group_size = 512;
|
||||
}
|
||||
|
||||
const size_t limit_work_group_size =
|
||||
queue->get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
|
||||
->get_device()
|
||||
.get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
return std::min(limit_work_group_size, preferred_work_group_size);
|
||||
}
|
||||
|
||||
|
@ -664,266 +471,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
|||
# endif
|
||||
return success;
|
||||
}
|
||||
|
||||
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
|
||||
* since Windows driver 101.3268. */
|
||||
/* The same min compute-runtime version is currently required across Windows and Linux.
|
||||
* For Windows driver 101.3430, compute-runtime version is 23904. */
|
||||
static const int lowest_supported_driver_version_win = 1013430;
|
||||
static const int lowest_supported_driver_version_neo = 23904;
|
||||
|
||||
static int parse_driver_build_version(const sycl::device &device)
|
||||
{
|
||||
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
|
||||
int driver_build_version = 0;
|
||||
|
||||
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
|
||||
if (second_dot_position == std::string::npos) {
|
||||
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
|
||||
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
|
||||
<< " xx.xx.xxx.xxxx (Windows) for device \""
|
||||
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
|
||||
}
|
||||
else {
|
||||
try {
|
||||
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
|
||||
if (third_dot_position != std::string::npos) {
|
||||
const std::string &third_number_substr = driver_version.substr(
|
||||
second_dot_position + 1, third_dot_position - second_dot_position - 1);
|
||||
const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
|
||||
if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
|
||||
driver_build_version = std::stoi(third_number_substr) * 10000 +
|
||||
std::stoi(forth_number_substr);
|
||||
}
|
||||
else {
|
||||
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
|
||||
driver_build_version = std::stoi(third_number_substr);
|
||||
}
|
||||
}
|
||||
catch (std::invalid_argument &e) {
|
||||
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
|
||||
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
|
||||
<< " xx.xx.xxx.xxxx (Windows) for device \""
|
||||
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return driver_build_version;
|
||||
}
|
||||
|
||||
static std::vector<sycl::device> oneapi_available_devices()
|
||||
{
|
||||
bool allow_all_devices = false;
|
||||
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
|
||||
allow_all_devices = true;
|
||||
|
||||
/* Host device is useful only for debugging at the moment
|
||||
* so we hide this device with default build settings. */
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
bool allow_host = true;
|
||||
# else
|
||||
bool allow_host = false;
|
||||
# endif
|
||||
|
||||
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
|
||||
|
||||
std::vector<sycl::device> available_devices;
|
||||
for (const sycl::platform &platform : oneapi_platforms) {
|
||||
/* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
|
||||
*/
|
||||
if (platform.get_backend() == sycl::backend::opencl) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices =
|
||||
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
|
||||
platform.get_devices(sycl::info::device_type::gpu);
|
||||
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
if (allow_all_devices) {
|
||||
/* still filter out host device if build doesn't support it. */
|
||||
if (allow_host || !device.is_host()) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
else {
|
||||
bool filter_out = false;
|
||||
|
||||
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
|
||||
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
|
||||
* Official support can be broaden to older and smaller GPUs once ready. */
|
||||
if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
|
||||
/* Filtered-out defaults in-case these values aren't available through too old L0
|
||||
* runtime. */
|
||||
int number_of_eus = 96;
|
||||
int threads_per_eu = 7;
|
||||
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
|
||||
number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
|
||||
}
|
||||
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
|
||||
threads_per_eu =
|
||||
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
|
||||
}
|
||||
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
|
||||
if (number_of_eus <= 96 && threads_per_eu == 7) {
|
||||
filter_out = true;
|
||||
}
|
||||
/* if not already filtered out, check driver version. */
|
||||
if (!filter_out) {
|
||||
int driver_build_version = parse_driver_build_version(device);
|
||||
if ((driver_build_version > 100000 &&
|
||||
driver_build_version < lowest_supported_driver_version_win) ||
|
||||
driver_build_version < lowest_supported_driver_version_neo) {
|
||||
filter_out = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!allow_host && device.is_host()) {
|
||||
filter_out = true;
|
||||
}
|
||||
else if (!allow_all_devices) {
|
||||
filter_out = true;
|
||||
}
|
||||
|
||||
if (!filter_out) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return available_devices;
|
||||
}
|
||||
|
||||
char *oneapi_device_capabilities()
|
||||
{
|
||||
std::stringstream capabilities;
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
const std::string &name = device.get_info<sycl::info::device::name>();
|
||||
|
||||
capabilities << std::string("\t") << name << "\n";
|
||||
# define WRITE_ATTR(attribute_name, attribute_variable) \
|
||||
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
|
||||
<< "\n";
|
||||
# define GET_NUM_ATTR(attribute) \
|
||||
{ \
|
||||
size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
|
||||
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
|
||||
}
|
||||
|
||||
GET_NUM_ATTR(vendor_id)
|
||||
GET_NUM_ATTR(max_compute_units)
|
||||
GET_NUM_ATTR(max_work_item_dimensions)
|
||||
|
||||
sycl::id<3> max_work_item_sizes =
|
||||
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
|
||||
WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
|
||||
WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
|
||||
WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
|
||||
|
||||
GET_NUM_ATTR(max_work_group_size)
|
||||
GET_NUM_ATTR(max_num_sub_groups)
|
||||
GET_NUM_ATTR(sub_group_independent_forward_progress)
|
||||
|
||||
GET_NUM_ATTR(preferred_vector_width_char)
|
||||
GET_NUM_ATTR(preferred_vector_width_short)
|
||||
GET_NUM_ATTR(preferred_vector_width_int)
|
||||
GET_NUM_ATTR(preferred_vector_width_long)
|
||||
GET_NUM_ATTR(preferred_vector_width_float)
|
||||
GET_NUM_ATTR(preferred_vector_width_double)
|
||||
GET_NUM_ATTR(preferred_vector_width_half)
|
||||
|
||||
GET_NUM_ATTR(native_vector_width_char)
|
||||
GET_NUM_ATTR(native_vector_width_short)
|
||||
GET_NUM_ATTR(native_vector_width_int)
|
||||
GET_NUM_ATTR(native_vector_width_long)
|
||||
GET_NUM_ATTR(native_vector_width_float)
|
||||
GET_NUM_ATTR(native_vector_width_double)
|
||||
GET_NUM_ATTR(native_vector_width_half)
|
||||
|
||||
size_t max_clock_frequency =
|
||||
(size_t)(device.is_host() ? (size_t)0 :
|
||||
device.get_info<sycl::info::device::max_clock_frequency>());
|
||||
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
|
||||
|
||||
GET_NUM_ATTR(address_bits)
|
||||
GET_NUM_ATTR(max_mem_alloc_size)
|
||||
|
||||
/* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
|
||||
* supported so we always return false, even if device supports HW texture usage acceleration.
|
||||
*/
|
||||
bool image_support = false;
|
||||
WRITE_ATTR("image_support", (size_t)image_support)
|
||||
|
||||
GET_NUM_ATTR(max_parameter_size)
|
||||
GET_NUM_ATTR(mem_base_addr_align)
|
||||
GET_NUM_ATTR(global_mem_size)
|
||||
GET_NUM_ATTR(local_mem_size)
|
||||
GET_NUM_ATTR(error_correction_support)
|
||||
GET_NUM_ATTR(profiling_timer_resolution)
|
||||
GET_NUM_ATTR(is_available)
|
||||
|
||||
# undef GET_NUM_ATTR
|
||||
# undef WRITE_ATTR
|
||||
capabilities << "\n";
|
||||
}
|
||||
|
||||
return ::strdup(capabilities.str().c_str());
|
||||
}
|
||||
|
||||
/* Release memory previously handed out by the oneAPI helpers
 * (e.g. the strdup'ed capabilities string). Safe to call with NULL. */
void oneapi_free(void *p)
{
  /* The C standard guarantees free(NULL) is a no-op, so the previous
   * explicit NULL check was redundant and has been dropped. */
  ::free(p);
}
|
||||
|
||||
/* Invoke `cb` once for every available oneAPI device, passing a stable
 * string identifier, a human-readable name, the device ordinal and
 * `user_ptr` through unchanged. */
void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
  std::vector<sycl::device> available_devices = oneapi_available_devices();
  int device_index = 0;
  for (sycl::device &dev : available_devices) {
    const std::string &platform_name =
        dev.get_platform().get_info<sycl::info::platform::name>();
    std::string device_name = dev.get_info<sycl::info::device::name>();
    /* The identifier combines platform and device names; append the PCI
     * address when exposed so identical devices stay distinguishable. */
    std::string id = "ONEAPI_" + platform_name + "_" + device_name;
    if (dev.has(sycl::aspect::ext_intel_pci_address)) {
      id += "_" + dev.get_info<sycl::info::device::ext_intel_pci_address>();
    }
    cb(id.c_str(), device_name.c_str(), device_index, user_ptr);
    ++device_index;
  }
}
|
||||
|
||||
/* Total global memory size in bytes of the device behind `queue`. */
size_t oneapi_get_memcapacity(SyclQueue *queue)
{
  sycl::queue *queue_impl = reinterpret_cast<sycl::queue *>(queue);
  const sycl::device device = queue_impl->get_device();
  return device.get_info<sycl::info::device::global_mem_size>();
}
|
||||
|
||||
/* Execution-unit count of the device behind `queue`, or 0 when the
 * Intel EU-count device aspect is not supported. */
int oneapi_get_num_multiprocessors(SyclQueue *queue)
{
  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
  /* Guard clause: the EU-count query is an Intel extension and may be absent. */
  if (!device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
    return 0;
  }
  return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
|
||||
|
||||
/* Maximum hardware threads per EU times SIMD width for the device behind
 * `queue`, or 0 when the required Intel device aspects are unavailable. */
int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
{
  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
  const bool has_simd_width = device.has(sycl::aspect::ext_intel_gpu_eu_simd_width);
  const bool has_threads_per_eu = device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu);
  /* Both Intel extension queries must be present to compute the product. */
  if (!has_simd_width || !has_threads_per_eu) {
    return 0;
  }
  return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
         device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
|
||||
|
||||
#endif /* WITH_ONEAPI */
|
||||
|
|
|
@ -25,11 +25,6 @@ enum DeviceKernel : int;
|
|||
|
||||
class SyclQueue;
|
||||
|
||||
typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
|
||||
const char *name,
|
||||
int num,
|
||||
void *user_ptr);
|
||||
|
||||
typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
|
||||
|
||||
struct KernelContext {
|
||||
|
@ -45,13 +40,15 @@ struct KernelContext {
|
|||
extern "C" {
|
||||
# endif
|
||||
|
||||
# define DLL_INTERFACE_CALL(function, return_type, ...) \
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
|
||||
# include "kernel/device/oneapi/dll_interface_template.h"
|
||||
# undef DLL_INTERFACE_CALL
|
||||
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
|
||||
SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
void **args);
|
||||
# ifdef __cplusplus
|
||||
}
|
||||
# endif
|
||||
|
||||
#endif /* WITH_ONEAPI */
|
||||
|
|
|
@ -1076,6 +1076,13 @@ elseif(WIN32)
|
|||
DESTINATION ${TARGETDIR_VER}/python/lib/site-packages
|
||||
)
|
||||
endif()
|
||||
|
||||
if(PLATFORM_BUNDLED_LIBRARIES)
|
||||
install(
|
||||
FILES ${PLATFORM_BUNDLED_LIBRARIES}
|
||||
DESTINATION ${TARGETDIR_LIB}
|
||||
)
|
||||
endif()
|
||||
elseif(APPLE)
|
||||
if(NOT WITH_PYTHON_MODULE)
|
||||
# Uppercase name for app bundle.
|
||||
|
|
Loading…
Reference in New Issue