Cycles: use direct linking for oneAPI backend

This is a minimal set of changes, allowing a lot of cleanup that can happen afterward as it allows sycl method and objects to be used outside of kernel.cpp. Reviewed By: brecht, sergey Differential Revision: https://developer.blender.org/D15397
2022-10-06 18:35:51 +02:00 · 2022-10-06 18:35:51 +02:00 · 7eeeaec6da
parent fc0b1627eb
commit 7eeeaec6da
14 changed files with 594 additions and 709 deletions
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@ -335,10 +335,18 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
    set(LEVEL_ZERO_ROOT_DIR ${CYCLES_LEVEL_ZERO})
  endif()

-  set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to DPC++ and SYCL installation")
+  set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to oneAPI DPC++ compiler")
  if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
    set(SYCL_ROOT_DIR ${CYCLES_SYCL})
  endif()
+  file(GLOB _sycl_runtime_libraries
+    ${SYCL_ROOT_DIR}/lib/libsycl.so
+    ${SYCL_ROOT_DIR}/lib/libsycl.so.[0-9]
+    ${SYCL_ROOT_DIR}/lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
+    ${SYCL_ROOT_DIR}/lib/libpi_level_zero.so
+  )
+  list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
+  unset(_sycl_runtime_libraries)
 endif()

 if(WITH_OPENVDB)
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@ -952,5 +952,17 @@ endif()
 set(ZSTD_INCLUDE_DIRS ${LIBDIR}/zstd/include)
 set(ZSTD_LIBRARIES ${LIBDIR}/zstd/lib/zstd_static.lib)

-set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
-set(SYCL_ROOT_DIR ${LIBDIR}/dpcpp)
+if(WITH_CYCLES_DEVICE_ONEAPI)
+  set(LEVEL_ZERO_ROOT_DIR ${LIBDIR}/level_zero)
+  set(CYCLES_SYCL ${LIBDIR}/dpcpp CACHE PATH "Path to oneAPI DPC++ compiler")
+  if(EXISTS ${CYCLES_SYCL} AND NOT SYCL_ROOT_DIR)
+    set(SYCL_ROOT_DIR ${CYCLES_SYCL})
+  endif()
+  file(GLOB _sycl_runtime_libraries
+    ${SYCL_ROOT_DIR}/bin/sycl.dll
+    ${SYCL_ROOT_DIR}/bin/sycl[0-9].dll
+    ${SYCL_ROOT_DIR}/bin/pi_level_zero.dll
+  )
+  list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
+  unset(_sycl_runtime_libraries)
+endif()
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@ -142,7 +142,6 @@ set(SRC
  ${SRC_DUMMY}
  ${SRC_MULTI}
  ${SRC_OPTIX}
-  ${SRC_ONEAPI}
  ${SRC_HEADERS}
 )

@ -188,7 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
  )
 endif()
 if (WITH_CYCLES_DEVICE_ONEAPI)
+  if(WIN32)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
+  else()
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
+  endif()
+  list(APPEND LIB
+    ${SYCL_LIBRARY}
+    ${cycles_kernel_oneapi_lib}
+  )
  add_definitions(-DWITH_ONEAPI)
+  list(APPEND SRC
+    ${SRC_ONEAPI}
+  )
+  list(APPEND INC_SYS
+    ${SYCL_INCLUDE_DIR}
+  )
 endif()

 if(WITH_OPENIMAGEDENOISE)
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@ -19,62 +19,12 @@

 CCL_NAMESPACE_BEGIN

-#ifdef WITH_ONEAPI
-static OneAPIDLLInterface oneapi_dll;
-#endif
-
-#ifdef _WIN32
-#  define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path))
-#  define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() GetLastError()
-#  define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle)
-#  define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name)
-#elif __linux__
-#  define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW)
-#  define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() dlerror()
-#  define FREE_SHARED_LIBRARY(handle) dlclose(handle)
-#  define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name)
-#endif
-
 bool device_oneapi_init()
 {
 #if !defined(WITH_ONEAPI)
  return false;
 #else

-  string lib_path = path_get("lib");
-#  ifdef _WIN32
-  lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll");
-#  else
-  lib_path = path_join(lib_path, "cycles_kernel_oneapi.so");
-#  endif
-  void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str());
-
-  /* This shouldn't happen, but it still makes sense to have a branch for this. */
-  if (lib_handle == NULL) {
-    LOG(ERROR) << "oneAPI kernel shared library cannot be loaded: "
-               << LOAD_ONEAPI_SHARED_LIBRARY_ERROR();
-    return false;
-  }
-
-#  define DLL_INTERFACE_CALL(function, return_type, ...) \
-    (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \
-        GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \
-    if (oneapi_dll.function == NULL) { \
-      LOG(ERROR) << "oneAPI shared library function \"" << #function \
-                 << "\" has not been loaded from kernel shared  - disable oneAPI " \
-                    "library disable oneAPI implementation due to this"; \
-      FREE_SHARED_LIBRARY(lib_handle); \
-      return false; \
-    }
-#  include "kernel/device/oneapi/dll_interface_template.h"
-#  undef DLL_INTERFACE_CALL
-
-  VLOG_INFO << "oneAPI kernel shared library has been loaded successfully";
-
-  /* We need to have this oneapi kernel shared library during all life-span of the Blender.
-   * So it is not unloaded because of this.
-   * FREE_SHARED_LIBRARY(lib_handle); */
-
  /* NOTE(@nsirgien): we need to enable JIT cache from here and
   * right now this cache policy is controlled by env. variables. */
  /* NOTE(hallade) we also disable use of copy engine as it
@ -109,17 +59,10 @@ bool device_oneapi_init()
 #endif
 }

-#if defined(_WIN32) || defined(__linux__)
-#  undef LOAD_SYCL_SHARED_LIBRARY
-#  undef LOAD_ONEAPI_SHARED_LIBRARY
-#  undef FREE_SHARED_LIBRARY
-#  undef GET_SHARED_LIBRARY_SYMBOL
-#endif
-
 Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
 {
 #ifdef WITH_ONEAPI
-  return new OneapiDevice(info, oneapi_dll, stats, profiler);
+  return new OneapiDevice(info, stats, profiler);
 #else
  (void)info;
  (void)stats;
@ -165,7 +108,7 @@ static void device_iterator_cb(const char *id, const char *name, int num, void *
 void device_oneapi_info(vector<DeviceInfo> &devices)
 {
 #ifdef WITH_ONEAPI
-  (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices);
+  OneapiDevice::iterate_devices(device_iterator_cb, &devices);
 #else  /* WITH_ONEAPI */
  (void)devices;
 #endif /* WITH_ONEAPI */
@ -175,10 +118,10 @@ string device_oneapi_capabilities()
 {
  string capabilities;
 #ifdef WITH_ONEAPI
-  char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)();
+  char *c_capabilities = OneapiDevice::device_capabilities();
  if (c_capabilities) {
    capabilities = c_capabilities;
-    (oneapi_dll.oneapi_free)(c_capabilities);
+    free(c_capabilities);
  }
 #endif
  return capabilities;
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@ -8,7 +8,7 @@
 #  include "util/debug.h"
 #  include "util/log.h"

-#  include "kernel/device/oneapi/kernel.h"
+#  include "kernel/device/oneapi/globals.h"

 CCL_NAMESPACE_BEGIN

@ -19,26 +19,19 @@ static void queue_error_cb(const char *message, void *user_ptr)
  }
 }

-OneapiDevice::OneapiDevice(const DeviceInfo &info,
-                           OneAPIDLLInterface &oneapi_dll_object,
-                           Stats &stats,
-                           Profiler &profiler)
+OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
    : Device(info, stats, profiler),
      device_queue_(nullptr),
      texture_info_(this, "texture_info", MEM_GLOBAL),
      kg_memory_(nullptr),
      kg_memory_device_(nullptr),
-      kg_memory_size_(0),
-      oneapi_dll_(oneapi_dll_object)
+      kg_memory_size_(0)
 {
  need_texture_info_ = false;

-  oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
+  oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);

-  /* OneAPI calls should be initialized on this moment. */
-  assert(oneapi_dll_.oneapi_create_queue != nullptr);
-
-  bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
+  bool is_finished_ok = create_queue(device_queue_, info.num);
  if (is_finished_ok == false) {
    set_error("oneAPI queue initialization error: got runtime exception \"" +
              oneapi_error_string_ + "\"");
@ -50,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
  }

  size_t globals_segment_size;
-  is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
+  is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size);
  if (is_finished_ok == false) {
    set_error("oneAPI constant memory initialization got runtime exception \"" +
              oneapi_error_string_ + "\"");
@ -59,27 +52,27 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
    VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
  }

-  kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
-  oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
+  kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
+  usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);

-  kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
+  kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);

  kg_memory_size_ = globals_segment_size;

-  max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
+  max_memory_on_device_ = get_memcapacity();
 }

 OneapiDevice::~OneapiDevice()
 {
  texture_info_.free();
-  oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
-  oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
+  usm_free(device_queue_, kg_memory_);
+  usm_free(device_queue_, kg_memory_device_);

  for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
    delete mt->second;

  if (device_queue_)
-    oneapi_dll_.oneapi_free_queue(device_queue_);
+    free_queue(device_queue_);
 }

 bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
@ -99,7 +92,7 @@ bool OneapiDevice::load_kernels(const uint requested_features)
   * with specialization constants, but it hasn't been implemented yet. */
  (void)requested_features;

-  bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
+  bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
  if (is_finished_ok == false) {
    set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
  }
@ -138,7 +131,7 @@ void OneapiDevice::generic_alloc(device_memory &mem)
   * type has been used for oneAPI device in order to better fit in Cycles architecture. */
  void *device_pointer = nullptr;
  if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
-    device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
+    device_pointer = usm_alloc_device(device_queue_, memory_size);
  if (device_pointer == nullptr) {
    set_error("oneAPI kernel - device memory allocation error for " +
              string_human_readable_size(mem.memory_size()) +
@ -163,8 +156,7 @@ void OneapiDevice::generic_copy_to(device_memory &mem)
  /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
  assert(mem.host_pointer);
  assert(device_queue_);
-  oneapi_dll_.oneapi_usm_memcpy(
-      device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
+  usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
 }

 /* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
@ -178,11 +170,6 @@ string OneapiDevice::oneapi_error_message()
  return string(oneapi_error_string_);
 }

-OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
-{
-  return oneapi_dll_;
-}
-
 void *OneapiDevice::kernel_globals_device_pointer()
 {
  return kg_memory_device_;
@ -198,7 +185,7 @@ void OneapiDevice::generic_free(device_memory &mem)
  mem.device_size = 0;

  assert(device_queue_);
-  oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
+  usm_free(device_queue_, (void *)mem.device_pointer);
  mem.device_pointer = 0;
 }

@ -266,8 +253,7 @@ void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t
    if (mem.device_pointer) {
      char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
      char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
-      bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
-          device_queue_, shifted_host, shifted_device, size);
+      bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
      if (is_finished_ok == false) {
        set_error("oneAPI memory operation error: got runtime exception \"" +
                  oneapi_error_string_ + "\"");
@ -292,7 +278,7 @@ void OneapiDevice::mem_zero(device_memory &mem)
  }

  assert(device_queue_);
-  bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
+  bool is_finished_ok = usm_memset(
      device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
  if (is_finished_ok == false) {
    set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
@ -349,10 +335,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
  memcpy(data->data(), host, size);
  data->copy_to_device();

-  oneapi_dll_.oneapi_set_global_memory(
-      device_queue_, kg_memory_, name, (void *)data->device_pointer);
+  set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);

-  oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
 }

 void OneapiDevice::global_alloc(device_memory &mem)
@ -367,10 +352,9 @@ void OneapiDevice::global_alloc(device_memory &mem)
  generic_alloc(mem);
  generic_copy_to(mem);

-  oneapi_dll_.oneapi_set_global_memory(
-      device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
+  set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);

-  oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
 }

 void OneapiDevice::global_free(device_memory &mem)
@ -410,18 +394,6 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
  return make_unique<OneapiDeviceQueue>(this);
 }

-int OneapiDevice::get_num_multiprocessors()
-{
-  assert(device_queue_);
-  return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
-}
-
-int OneapiDevice::get_max_num_threads_per_multiprocessor()
-{
-  assert(device_queue_);
-  return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
-}
-
 bool OneapiDevice::should_use_graphics_interop()
 {
  /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
@ -432,13 +404,460 @@ bool OneapiDevice::should_use_graphics_interop()
 void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
 {
  assert(device_queue_);
-  return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
+  return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
 }

 void OneapiDevice::usm_free(void *usm_ptr)
 {
  assert(device_queue_);
-  return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
+  return usm_free(device_queue_, usm_ptr);
+}
+
+void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
+{
+#  ifdef _DEBUG
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  sycl::info::device_type device_type =
+      queue->get_device().get_info<sycl::info::device::device_type>();
+  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
+  (void)usm_type;
+  assert(usm_type == sycl::usm::alloc::device ||
+         ((device_type == sycl::info::device_type::host ||
+           device_type == sycl::info::device_type::cpu || allow_host) &&
+          usm_type == sycl::usm::alloc::host));
+#  endif
+}
+
+bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
+{
+  bool finished_correct = true;
+  try {
+    std::vector<sycl::device> devices = OneapiDevice::available_devices();
+    if (device_index < 0 || device_index >= devices.size()) {
+      return false;
+    }
+    sycl::queue *created_queue = new sycl::queue(devices[device_index],
+                                                 sycl::property::queue::in_order());
+    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
+  }
+  catch (sycl::exception const &e) {
+    finished_correct = false;
+    oneapi_error_string_ = e.what();
+  }
+  return finished_correct;
+}
+
+void OneapiDevice::free_queue(SyclQueue *queue_)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  delete queue;
+}
+
+void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
+}
+
+void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::malloc_device(memory_size, *queue);
+}
+
+void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  OneapiDevice::check_usm(queue_, usm_ptr, true);
+  sycl::free(usm_ptr, *queue);
+}
+
+bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  OneapiDevice::check_usm(queue_, dest, true);
+  OneapiDevice::check_usm(queue_, src, true);
+  sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
+#  ifdef WITH_CYCLES_DEBUG
+  try {
+    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
+     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
+     */
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    oneapi_error_string_ = e.what();
+    return false;
+  }
+#  else
+  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
+  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
+  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
+                             src_type == sycl::usm::alloc::device;
+  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
+                                          src_type == sycl::usm::alloc::unknown;
+  /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
+   * may not wait until the end of the transfer before using the memory.
+   */
+  if (from_device_to_host || host_or_device_memop_with_offset)
+    mem_event.wait();
+  return true;
+#  endif
+}
+
+bool OneapiDevice::usm_memset(SyclQueue *queue_,
+                              void *usm_ptr,
+                              unsigned char value,
+                              size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  OneapiDevice::check_usm(queue_, usm_ptr, true);
+  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
+#  ifdef WITH_CYCLES_DEBUG
+  try {
+    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
+     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
+     */
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    oneapi_error_string_ = e.what();
+    return false;
+  }
+#  else
+  (void)mem_event;
+  return true;
+#  endif
+}
+
+bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  try {
+    queue->wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    oneapi_error_string_ = e.what();
+    return false;
+  }
+}
+
+bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
+{
+  kernel_global_size = sizeof(KernelGlobalsGPU);
+
+  return true;
+}
+
+void OneapiDevice::set_global_memory(SyclQueue *queue_,
+                                     void *kernel_globals,
+                                     const char *memory_name,
+                                     void *memory_device_pointer)
+{
+  assert(queue_);
+  assert(kernel_globals);
+  assert(memory_name);
+  assert(memory_device_pointer);
+  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
+  OneapiDevice::check_usm(queue_, memory_device_pointer);
+  OneapiDevice::check_usm(queue_, kernel_globals, true);
+
+  std::string matched_name(memory_name);
+
+/* This macro will change global ptr of KernelGlobals via name matching. */
+#  define KERNEL_DATA_ARRAY(type, name) \
+    else if (#name == matched_name) \
+    { \
+      globals->__##name = (type *)memory_device_pointer; \
+      return; \
+    }
+  if (false) {
+  }
+  else if ("integrator_state" == matched_name) {
+    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
+    return;
+  }
+  KERNEL_DATA_ARRAY(KernelData, data)
+#  include "kernel/data_arrays.h"
+  else
+  {
+    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
+              << std::endl;
+    assert(false);
+  }
+#  undef KERNEL_DATA_ARRAY
+}
+
+bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
+                                  int kernel,
+                                  size_t global_size,
+                                  void **args)
+{
+  return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
+}
+
+/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
+ * since Windows driver 101.3268. */
+/* The same min compute-runtime version is currently required across Windows and Linux.
+ * For Windows driver 101.3430, compute-runtime version is 23904. */
+static const int lowest_supported_driver_version_win = 1013430;
+static const int lowest_supported_driver_version_neo = 23904;
+
+int OneapiDevice::parse_driver_build_version(const sycl::device &device)
+{
+  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
+  int driver_build_version = 0;
+
+  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
+  if (second_dot_position == std::string::npos) {
+    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+              << " xx.xx.xxx.xxxx (Windows) for device \""
+              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+  }
+  else {
+    try {
+      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
+      if (third_dot_position != std::string::npos) {
+        const std::string &third_number_substr = driver_version.substr(
+            second_dot_position + 1, third_dot_position - second_dot_position - 1);
+        const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
+        if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
+          driver_build_version = std::stoi(third_number_substr) * 10000 +
+                                 std::stoi(forth_number_substr);
+      }
+      else {
+        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
+        driver_build_version = std::stoi(third_number_substr);
+      }
+    }
+    catch (std::invalid_argument &) {
+      std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+                << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+                << " xx.xx.xxx.xxxx (Windows) for device \""
+                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+    }
+  }
+
+  return driver_build_version;
+}
+
+std::vector<sycl::device> OneapiDevice::available_devices()
+{
+  bool allow_all_devices = false;
+  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
+    allow_all_devices = true;
+
+    /* Host device is useful only for debugging at the moment
+     * so we hide this device with default build settings. */
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+  bool allow_host = true;
+#  else
+  bool allow_host = false;
+#  endif
+
+  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
+
+  std::vector<sycl::device> available_devices;
+  for (const sycl::platform &platform : oneapi_platforms) {
+    /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
+     */
+    if (platform.get_backend() == sycl::backend::opencl) {
+      continue;
+    }
+
+    const std::vector<sycl::device> &oneapi_devices =
+        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
+                                            platform.get_devices(sycl::info::device_type::gpu);
+
+    for (const sycl::device &device : oneapi_devices) {
+      if (allow_all_devices) {
+        /* still filter out host device if build doesn't support it. */
+        if (allow_host || !device.is_host()) {
+          available_devices.push_back(device);
+        }
+      }
+      else {
+        bool filter_out = false;
+
+        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
+         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
+         * Official support can be broaden to older and smaller GPUs once ready. */
+        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
+          /* Filtered-out defaults in-case these values aren't available through too old L0
+           * runtime. */
+          int number_of_eus = 96;
+          int threads_per_eu = 7;
+          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+          }
+          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+            threads_per_eu =
+                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+          }
+          /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
+          if (number_of_eus <= 96 && threads_per_eu == 7) {
+            filter_out = true;
+          }
+          /* if not already filtered out, check driver version. */
+          if (!filter_out) {
+            int driver_build_version = parse_driver_build_version(device);
+            if ((driver_build_version > 100000 &&
+                 driver_build_version < lowest_supported_driver_version_win) ||
+                driver_build_version < lowest_supported_driver_version_neo) {
+              filter_out = true;
+            }
+          }
+        }
+        else if (!allow_host && device.is_host()) {
+          filter_out = true;
+        }
+        else if (!allow_all_devices) {
+          filter_out = true;
+        }
+
+        if (!filter_out) {
+          available_devices.push_back(device);
+        }
+      }
+    }
+  }
+
+  return available_devices;
+}
+
+char *OneapiDevice::device_capabilities()
+{
+  std::stringstream capabilities;
+
+  const std::vector<sycl::device> &oneapi_devices = available_devices();
+  for (const sycl::device &device : oneapi_devices) {
+    const std::string &name = device.get_info<sycl::info::device::name>();
+
+    capabilities << std::string("\t") << name << "\n";
+#  define WRITE_ATTR(attribute_name, attribute_variable) \
+    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
+                 << "\n";
+#  define GET_NUM_ATTR(attribute) \
+    { \
+      size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
+      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
+    }
+
+    GET_NUM_ATTR(vendor_id)
+    GET_NUM_ATTR(max_compute_units)
+    GET_NUM_ATTR(max_work_item_dimensions)
+
+    sycl::id<3> max_work_item_sizes =
+        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
+    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
+    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
+
+    GET_NUM_ATTR(max_work_group_size)
+    GET_NUM_ATTR(max_num_sub_groups)
+    GET_NUM_ATTR(sub_group_independent_forward_progress)
+
+    GET_NUM_ATTR(preferred_vector_width_char)
+    GET_NUM_ATTR(preferred_vector_width_short)
+    GET_NUM_ATTR(preferred_vector_width_int)
+    GET_NUM_ATTR(preferred_vector_width_long)
+    GET_NUM_ATTR(preferred_vector_width_float)
+    GET_NUM_ATTR(preferred_vector_width_double)
+    GET_NUM_ATTR(preferred_vector_width_half)
+
+    GET_NUM_ATTR(native_vector_width_char)
+    GET_NUM_ATTR(native_vector_width_short)
+    GET_NUM_ATTR(native_vector_width_int)
+    GET_NUM_ATTR(native_vector_width_long)
+    GET_NUM_ATTR(native_vector_width_float)
+    GET_NUM_ATTR(native_vector_width_double)
+    GET_NUM_ATTR(native_vector_width_half)
+
+    size_t max_clock_frequency =
+        (size_t)(device.is_host() ? (size_t)0 :
+                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    WRITE_ATTR("max_clock_frequency", max_clock_frequency)
+
+    GET_NUM_ATTR(address_bits)
+    GET_NUM_ATTR(max_mem_alloc_size)
+
+    /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
+     * supported so we always return false, even if device supports HW texture usage acceleration.
+     */
+    bool image_support = false;
+    WRITE_ATTR("image_support", (size_t)image_support)
+
+    GET_NUM_ATTR(max_parameter_size)
+    GET_NUM_ATTR(mem_base_addr_align)
+    GET_NUM_ATTR(global_mem_size)
+    GET_NUM_ATTR(local_mem_size)
+    GET_NUM_ATTR(error_correction_support)
+    GET_NUM_ATTR(profiling_timer_resolution)
+    GET_NUM_ATTR(is_available)
+
+#  undef GET_NUM_ATTR
+#  undef WRITE_ATTR
+    capabilities << "\n";
+  }
+
+  return ::strdup(capabilities.str().c_str());
+}
+
+void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
+{
+  int num = 0;
+  std::vector<sycl::device> devices = OneapiDevice::available_devices();
+  for (sycl::device &device : devices) {
+    const std::string &platform_name =
+        device.get_platform().get_info<sycl::info::platform::name>();
+    std::string name = device.get_info<sycl::info::device::name>();
+    std::string id = "ONEAPI_" + platform_name + "_" + name;
+    if (device.has(sycl::aspect::ext_intel_pci_address)) {
+      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+    }
+    (cb)(id.c_str(), name.c_str(), num, user_ptr);
+    num++;
+  }
+}
+
+size_t OneapiDevice::get_memcapacity()
+{
+  return reinterpret_cast<sycl::queue *>(device_queue_)
+      ->get_device()
+      .get_info<sycl::info::device::global_mem_size>();
+}
+
+int OneapiDevice::get_num_multiprocessors()
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
+  if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+  }
+  else
+    return 0;
+}
+
+int OneapiDevice::get_max_num_threads_per_multiprocessor()
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
+  if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
+      device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
+           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+  }
+  else
+    return 0;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@ -3,9 +3,12 @@

 #ifdef WITH_ONEAPI

+#  include <CL/sycl.hpp>
+
 #  include "device/device.h"
 #  include "device/oneapi/device.h"
 #  include "device/oneapi/queue.h"
+#  include "kernel/device/oneapi/kernel.h"

 #  include "util/map.h"

@ -13,6 +16,11 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

+typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
+                                             const char *name,
+                                             int num,
+                                             void *user_ptr);
+
 class OneapiDevice : public Device {
 private:
  SyclQueue *device_queue_;
@ -25,16 +33,12 @@ class OneapiDevice : public Device {
  void *kg_memory_device_;
  size_t kg_memory_size_ = (size_t)0;
  size_t max_memory_on_device_ = (size_t)0;
-  OneAPIDLLInterface oneapi_dll_;
  std::string oneapi_error_string_;

 public:
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

-  OneapiDevice(const DeviceInfo &info,
-               OneAPIDLLInterface &oneapi_dll_object,
-               Stats &stats,
-               Profiler &profiler);
+  OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);

  virtual ~OneapiDevice();

@ -50,12 +54,8 @@ class OneapiDevice : public Device {

  void generic_free(device_memory &mem);

-  SyclQueue *sycl_queue();
-
  string oneapi_error_message();

-  OneAPIDLLInterface oneapi_dll_object();
-
  void *kernel_globals_device_pointer();

  void mem_alloc(device_memory &mem) override;
@ -90,13 +90,37 @@ class OneapiDevice : public Device {

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

-  int get_num_multiprocessors();
-  int get_max_num_threads_per_multiprocessor();
-
  /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
   * side compilation (MSVC). */
  void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
  void usm_free(void *usm_ptr);
+
+  static std::vector<sycl::device> available_devices();
+  static char *device_capabilities();
+  static int parse_driver_build_version(const sycl::device &device);
+  static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);
+
+  size_t get_memcapacity();
+  int get_num_multiprocessors();
+  int get_max_num_threads_per_multiprocessor();
+  bool queue_synchronize(SyclQueue *queue);
+  bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size);
+  void set_global_memory(SyclQueue *queue,
+                         void *kernel_globals,
+                         const char *memory_name,
+                         void *memory_device_pointer);
+  bool enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, void **args);
+  SyclQueue *sycl_queue();
+
+ protected:
+  void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
+  bool create_queue(SyclQueue *&external_queue, int device_index);
+  void free_queue(SyclQueue *queue);
+  void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment);
+  void *usm_alloc_device(SyclQueue *queue, size_t memory_size);
+  void usm_free(SyclQueue *queue, void *usm_ptr);
+  bool usm_memcpy(SyclQueue *queue, void *dest, void *src, size_t num_bytes);
+  bool usm_memset(SyclQueue *queue, void *usm_ptr, unsigned char value, size_t num_bytes);
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/device/oneapi/dll_interface.h
+++ b/intern/cycles/device/oneapi/dll_interface.h
@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#pragma once
-
-/* Include kernel header to get access to SYCL-specific types, like SyclQueue and
- * OneAPIDeviceIteratorCallback. */
-#include "kernel/device/oneapi/kernel.h"
-
-#ifdef WITH_ONEAPI
-struct OneAPIDLLInterface {
-#  define DLL_INTERFACE_CALL(function, return_type, ...) \
-    return_type (*function)(__VA_ARGS__) = nullptr;
-#  include "kernel/device/oneapi/dll_interface_template.h"
-#  undef DLL_INTERFACE_CALL
-};
-#endif
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@ -22,10 +22,7 @@ struct KernelExecutionInfo {
 /* OneapiDeviceQueue */

 OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
-    : DeviceQueue(device),
-      oneapi_device_(device),
-      oneapi_dll_(device->oneapi_dll_object()),
-      kernel_context_(nullptr)
+    : DeviceQueue(device), oneapi_device_(device), kernel_context_(nullptr)
 {
 }

@ -81,14 +78,14 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
  assert(signed_kernel_work_size >= 0);
  size_t kernel_work_size = (size_t)signed_kernel_work_size;

-  size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
+  size_t kernel_local_size = oneapi_kernel_preferred_local_size(
      kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
  size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);

  assert(kernel_context_);

  /* Call the oneAPI kernel DLL to launch the requested kernel. */
-  bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
+  bool is_finished_ok = oneapi_device_->enqueue_kernel(
      kernel_context_, kernel, uniformed_kernel_work_size, args);

  if (is_finished_ok == false) {
@ -108,7 +105,7 @@ bool OneapiDeviceQueue::synchronize()
    return false;
  }

-  bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
+  bool is_finished_ok = oneapi_device_->queue_synchronize(oneapi_device_->sycl_queue());
  if (is_finished_ok == false)
    oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
                              oneapi_device_->oneapi_error_message() + "\"");
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@ -10,7 +10,7 @@
 #  include "device/queue.h"

 #  include "device/oneapi/device.h"
-#  include "device/oneapi/dll_interface.h"
+#  include "kernel/device/oneapi/kernel.h"

 CCL_NAMESPACE_BEGIN

@ -41,9 +41,7 @@ class OneapiDeviceQueue : public DeviceQueue {

 protected:
  OneapiDevice *oneapi_device_;
-  OneAPIDLLInterface oneapi_dll_;
  KernelContext *kernel_context_;
-  bool with_kernel_statistics_;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -716,7 +716,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
  if(WIN32)
    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
  else()
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so)
  endif()

  set(cycles_oneapi_kernel_sources
@ -815,6 +815,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)

  if(WIN32)
    list(APPEND sycl_compiler_flags
+    -fuse-ld=link
    -fms-extensions
    -fms-compatibility
    -D_WINDLL
@ -888,33 +889,24 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
    endif()
  endif()

-  # install dynamic libraries required at runtime
-  if(WIN32)
-    set(SYCL_RUNTIME_DEPENDENCIES
-        sycl.dll
-        pi_level_zero.dll
-    )
-    if(NOT WITH_BLENDER)
-      # For the Cycles standalone put libraries next to the Cycles application.
-      delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH})
-    else()
-      # For Blender put the libraries next to the Blender executable.
-      #
-      # Note that the installation path in the delayed_install is relative to the versioned folder,
-      # which means we need to go one level up.
-      delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../")
-    endif()
-  elseif(UNIX AND NOT APPLE)
-    file(GLOB SYCL_RUNTIME_DEPENDENCIES
-              ${sycl_compiler_root}/../lib/libsycl.so
-              ${sycl_compiler_root}/../lib/libsycl.so.[0-9]
-              ${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
-    )
-    list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so)
-    delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib)
+  if(NOT WITH_BLENDER)
+    # For the Cycles standalone put libraries next to the Cycles application.
+    set(cycles_oneapi_target_path ${CYCLES_INSTALL_PATH})
+  else()
+    # For Blender put the libraries next to the Blender executable.
+    #
+    # Note that the installation path in the delayed_install is relative to the versioned folder,
+    # which means we need to go one level up.
+    set(cycles_oneapi_target_path "../")
+  endif()
+
+  # install dynamic libraries required at runtime
+  if(WIN32)
+    delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path})
+  elseif(UNIX AND NOT APPLE)
+    delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path}/lib)
  endif()

-  delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib)
  add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib})
 endif()

--- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h
+++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2022 Intel Corporation */
-
-/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
-DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
-DLL_INTERFACE_CALL(oneapi_free, void, void *)
-DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
-
-DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
-DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
-
-DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
-DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
-DLL_INTERFACE_CALL(
-    oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
-DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
-DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
-
-DLL_INTERFACE_CALL(
-    oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
-DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_usm_memset,
-                   bool,
-                   SyclQueue *queue,
-                   void *usm_ptr,
-                   unsigned char value,
-                   size_t num_bytes)
-
-DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
-
-/* Operation with Kernel globals structure - map of global/constant allocation - filled before
- * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as
- * pointer. */
-DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_set_global_memory,
-                   void,
-                   SyclQueue *queue,
-                   void *kernel_globals,
-                   const char *memory_name,
-                   void *memory_device_pointer)
-
-DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
-                   size_t,
-                   SyclQueue *queue,
-                   const DeviceKernel kernel,
-                   const size_t kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
-                   bool,
-                   KernelContext *context,
-                   int kernel,
-                   size_t global_size,
-                   void **args)
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@ -3,7 +3,6 @@

 #ifdef WITH_ONEAPI

-/* clang-format off */
 #  include "kernel.h"
 #  include <iostream>
 #  include <map>
@ -16,163 +15,16 @@
 #  include "kernel/device/oneapi/kernel_templates.h"

 #  include "kernel/device/gpu/kernel.h"
-/* clang-format on */

 static OneAPIErrorCallback s_error_cb = nullptr;
 static void *s_error_user_ptr = nullptr;

-static std::vector<sycl::device> oneapi_available_devices();
-
 void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
 {
  s_error_cb = cb;
  s_error_user_ptr = user_ptr;
 }

-void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
-{
-#  ifdef _DEBUG
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  sycl::info::device_type device_type =
-      queue->get_device().get_info<sycl::info::device::device_type>();
-  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
-  (void)usm_type;
-  assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::is_cpu || allow_host) &&
-          usm_type == sycl::usm::alloc::host));
-#  endif
-}
-
-bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
-{
-  bool finished_correct = true;
-  try {
-    std::vector<sycl::device> devices = oneapi_available_devices();
-    if (device_index < 0 || device_index >= devices.size()) {
-      return false;
-    }
-    sycl::queue *created_queue = new sycl::queue(devices[device_index],
-                                                 sycl::property::queue::in_order());
-    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
-  }
-  catch (sycl::exception const &e) {
-    finished_correct = false;
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-  }
-  return finished_correct;
-}
-
-void oneapi_free_queue(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  delete queue;
-}
-
-void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
-}
-
-void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  return sycl::malloc_device(memory_size, *queue);
-}
-
-void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, usm_ptr, true);
-  sycl::free(usm_ptr, *queue);
-}
-
-bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, dest, true);
-  oneapi_check_usm(queue_, src, true);
-  sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
-#  ifdef WITH_CYCLES_DEBUG
-  try {
-    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
-     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
-     */
-    mem_event.wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-#  else
-  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
-  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
-  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
-                             src_type == sycl::usm::alloc::device;
-  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
-                                          src_type == sycl::usm::alloc::unknown;
-  /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
-   * may not wait until the end of the transfer before using the memory.
-   */
-  if (from_device_to_host || host_or_device_memop_with_offset)
-    mem_event.wait();
-  return true;
-#  endif
-}
-
-bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, usm_ptr, true);
-  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
-#  ifdef WITH_CYCLES_DEBUG
-  try {
-    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
-     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
-     */
-    mem_event.wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-#  else
-  (void)mem_event;
-  return true;
-#  endif
-}
-
-bool oneapi_queue_synchronize(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  try {
-    queue->wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-}
-
 /* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
 * also trigger runtime compilation of all existing oneAPI kernels */
 bool oneapi_run_test_kernel(SyclQueue *queue_)
@ -216,60 +68,13 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
  return true;
 }

-bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
-{
-  kernel_global_size = sizeof(KernelGlobalsGPU);
-
-  return true;
-}
-
-void oneapi_set_global_memory(SyclQueue *queue_,
-                              void *kernel_globals,
-                              const char *memory_name,
-                              void *memory_device_pointer)
-{
-  assert(queue_);
-  assert(kernel_globals);
-  assert(memory_name);
-  assert(memory_device_pointer);
-  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
-  oneapi_check_usm(queue_, memory_device_pointer);
-  oneapi_check_usm(queue_, kernel_globals, true);
-
-  std::string matched_name(memory_name);
-
-/* This macro will change global ptr of KernelGlobals via name matching. */
-#  define KERNEL_DATA_ARRAY(type, name) \
-    else if (#name == matched_name) \
-    { \
-      globals->__##name = (type *)memory_device_pointer; \
-      return; \
-    }
-  if (false) {
-  }
-  else if ("integrator_state" == matched_name) {
-    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
-    return;
-  }
-  KERNEL_DATA_ARRAY(KernelData, data)
-#  include "kernel/data_arrays.h"
-  else
-  {
-    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
-              << std::endl;
-    assert(false);
-  }
-#  undef KERNEL_DATA_ARRAY
-}
-
 /* TODO: Move device information to OneapiDevice initialized on creation and use it. */
 /* TODO: Move below function to oneapi/queue.cpp. */
-size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
+size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
                                          const DeviceKernel kernel,
                                          const size_t kernel_global_size)
 {
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  assert(queue);
  (void)kernel_global_size;
  const static size_t preferred_work_group_size_intersect_shading = 32;
  const static size_t preferred_work_group_size_technical = 1024;
@ -311,8 +116,10 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
      preferred_work_group_size = 512;
  }

-  const size_t limit_work_group_size =
-      queue->get_device().get_info<sycl::info::device::max_work_group_size>();
+  const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
+                                           ->get_device()
+                                           .get_info<sycl::info::device::max_work_group_size>();
+
  return std::min(limit_work_group_size, preferred_work_group_size);
 }

@ -664,266 +471,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 #  endif
  return success;
 }
-
-/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
- * since Windows driver 101.3268. */
-/* The same min compute-runtime version is currently required across Windows and Linux.
- * For Windows driver 101.3430, compute-runtime version is 23904. */
-static const int lowest_supported_driver_version_win = 1013430;
-static const int lowest_supported_driver_version_neo = 23904;
-
-static int parse_driver_build_version(const sycl::device &device)
-{
-  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
-  int driver_build_version = 0;
-
-  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
-  if (second_dot_position == std::string::npos) {
-    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
-              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
-              << " xx.xx.xxx.xxxx (Windows) for device \""
-              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
-  }
-  else {
-    try {
-      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
-      if (third_dot_position != std::string::npos) {
-        const std::string &third_number_substr = driver_version.substr(
-            second_dot_position + 1, third_dot_position - second_dot_position - 1);
-        const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
-        if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
-          driver_build_version = std::stoi(third_number_substr) * 10000 +
-                                 std::stoi(forth_number_substr);
-      }
-      else {
-        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
-        driver_build_version = std::stoi(third_number_substr);
-      }
-    }
-    catch (std::invalid_argument &e) {
-      std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
-                << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
-                << " xx.xx.xxx.xxxx (Windows) for device \""
-                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
-    }
-  }
-
-  return driver_build_version;
-}
-
-static std::vector<sycl::device> oneapi_available_devices()
-{
-  bool allow_all_devices = false;
-  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
-    allow_all_devices = true;
-
-    /* Host device is useful only for debugging at the moment
-     * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
-
-  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
-
-  std::vector<sycl::device> available_devices;
-  for (const sycl::platform &platform : oneapi_platforms) {
-    /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
-     */
-    if (platform.get_backend() == sycl::backend::opencl) {
-      continue;
-    }
-
-    const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
-
-    for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
-        bool filter_out = false;
-
-        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
-         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
-         * Official support can be broaden to older and smaller GPUs once ready. */
-        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
-          /* Filtered-out defaults in-case these values aren't available through too old L0
-           * runtime. */
-          int number_of_eus = 96;
-          int threads_per_eu = 7;
-          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
-          }
-          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-            threads_per_eu =
-                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
-          }
-          /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
-          if (number_of_eus <= 96 && threads_per_eu == 7) {
-            filter_out = true;
-          }
-          /* if not already filtered out, check driver version. */
-          if (!filter_out) {
-            int driver_build_version = parse_driver_build_version(device);
-            if ((driver_build_version > 100000 &&
-                 driver_build_version < lowest_supported_driver_version_win) ||
-                driver_build_version < lowest_supported_driver_version_neo) {
-              filter_out = true;
-            }
-          }
-        }
-        else if (!allow_host && device.is_host()) {
-          filter_out = true;
-        }
-        else if (!allow_all_devices) {
-          filter_out = true;
-        }
-
-        if (!filter_out) {
-          available_devices.push_back(device);
-        }
-      }
-    }
-  }
-
-  return available_devices;
-}
-
-char *oneapi_device_capabilities()
-{
-  std::stringstream capabilities;
-
-  const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
-  for (const sycl::device &device : oneapi_devices) {
-    const std::string &name = device.get_info<sycl::info::device::name>();
-
-    capabilities << std::string("\t") << name << "\n";
-#  define WRITE_ATTR(attribute_name, attribute_variable) \
-    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
-                 << "\n";
-#  define GET_NUM_ATTR(attribute) \
-    { \
-      size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
-      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
-    }
-
-    GET_NUM_ATTR(vendor_id)
-    GET_NUM_ATTR(max_compute_units)
-    GET_NUM_ATTR(max_work_item_dimensions)
-
-    sycl::id<3> max_work_item_sizes =
-        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
-    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
-    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
-    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
-
-    GET_NUM_ATTR(max_work_group_size)
-    GET_NUM_ATTR(max_num_sub_groups)
-    GET_NUM_ATTR(sub_group_independent_forward_progress)
-
-    GET_NUM_ATTR(preferred_vector_width_char)
-    GET_NUM_ATTR(preferred_vector_width_short)
-    GET_NUM_ATTR(preferred_vector_width_int)
-    GET_NUM_ATTR(preferred_vector_width_long)
-    GET_NUM_ATTR(preferred_vector_width_float)
-    GET_NUM_ATTR(preferred_vector_width_double)
-    GET_NUM_ATTR(preferred_vector_width_half)
-
-    GET_NUM_ATTR(native_vector_width_char)
-    GET_NUM_ATTR(native_vector_width_short)
-    GET_NUM_ATTR(native_vector_width_int)
-    GET_NUM_ATTR(native_vector_width_long)
-    GET_NUM_ATTR(native_vector_width_float)
-    GET_NUM_ATTR(native_vector_width_double)
-    GET_NUM_ATTR(native_vector_width_half)
-
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
-    WRITE_ATTR("max_clock_frequency", max_clock_frequency)
-
-    GET_NUM_ATTR(address_bits)
-    GET_NUM_ATTR(max_mem_alloc_size)
-
-    /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
-     * supported so we always return false, even if device supports HW texture usage acceleration.
-     */
-    bool image_support = false;
-    WRITE_ATTR("image_support", (size_t)image_support)
-
-    GET_NUM_ATTR(max_parameter_size)
-    GET_NUM_ATTR(mem_base_addr_align)
-    GET_NUM_ATTR(global_mem_size)
-    GET_NUM_ATTR(local_mem_size)
-    GET_NUM_ATTR(error_correction_support)
-    GET_NUM_ATTR(profiling_timer_resolution)
-    GET_NUM_ATTR(is_available)
-
-#  undef GET_NUM_ATTR
-#  undef WRITE_ATTR
-    capabilities << "\n";
-  }
-
-  return ::strdup(capabilities.str().c_str());
-}
-
-void oneapi_free(void *p)
-{
-  if (p) {
-    ::free(p);
-  }
-}
-
-void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
-{
-  int num = 0;
-  std::vector<sycl::device> devices = oneapi_available_devices();
-  for (sycl::device &device : devices) {
-    const std::string &platform_name =
-        device.get_platform().get_info<sycl::info::platform::name>();
-    std::string name = device.get_info<sycl::info::device::name>();
-    std::string id = "ONEAPI_" + platform_name + "_" + name;
-    if (device.has(sycl::aspect::ext_intel_pci_address)) {
-      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
-    }
-    (cb)(id.c_str(), name.c_str(), num, user_ptr);
-    num++;
-  }
-}
-
-size_t oneapi_get_memcapacity(SyclQueue *queue)
-{
-  return reinterpret_cast<sycl::queue *>(queue)
-      ->get_device()
-      .get_info<sycl::info::device::global_mem_size>();
-}
-
-int oneapi_get_num_multiprocessors(SyclQueue *queue)
-{
-  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
-  if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
-  }
-  else
-    return 0;
-}
-
-int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
-{
-  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
-  if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
-      device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
-           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
-  }
-  else
-    return 0;
-}
-
 #endif /* WITH_ONEAPI */
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@ -25,11 +25,6 @@ enum DeviceKernel : int;

 class SyclQueue;

-typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
-                                             const char *name,
-                                             int num,
-                                             void *user_ptr);
-
 typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);

 struct KernelContext {
@ -45,13 +40,15 @@ struct KernelContext {
 extern "C" {
 #  endif

-#  define DLL_INTERFACE_CALL(function, return_type, ...) \
-    CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
-#  include "kernel/device/oneapi/dll_interface_template.h"
-#  undef DLL_INTERFACE_CALL
-
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
+CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
+CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
+    SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
+                                                       int kernel,
+                                                       size_t global_size,
+                                                       void **args);
 #  ifdef __cplusplus
 }
 #  endif
-
 #endif /* WITH_ONEAPI */
--- a/source/creator/CMakeLists.txt
+++ b/source/creator/CMakeLists.txt
@ -1076,6 +1076,13 @@ elseif(WIN32)
      DESTINATION ${TARGETDIR_VER}/python/lib/site-packages
    )
  endif()
+
+  if(PLATFORM_BUNDLED_LIBRARIES)
+    install(
+      FILES ${PLATFORM_BUNDLED_LIBRARIES}
+      DESTINATION ${TARGETDIR_LIB}
+    )
+  endif()
 elseif(APPLE)
  if(NOT WITH_PYTHON_MODULE)
    # Uppercase name for app bundle.