Cycles: oneAPI: add support for SYCL host task

This functionality is intended only for debugging the SYCL implementation
via single-threaded CPU execution and is disabled by default.
The host device was deprecated in the SYCL 2020 spec, and we removed it
in 305b92e05f.
Since this is still very useful for debugging, we are restoring similar
functionality here through the SYCL 2020 host task.
Nikita Sirgienko 2023-01-03 20:45:57 +01:00
parent 7355d64f2b
commit 858fffc2df
8 changed files with 115 additions and 17 deletions
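
For reference, a SYCL 2020 host task is submitted through the command-group handler like a kernel, but its body runs as ordinary single-threaded CPU code, which is what makes it steppable in a debugger. A minimal standalone sketch (generic SYCL, not Cycles code):

#include <sycl/sycl.hpp>

int main()
{
  sycl::queue q;
  int *data = sycl::malloc_host<int>(16, q);

  q.submit([&](sycl::handler &cgh) {
     /* Runs on the host as a single-threaded task, so it can be
      * stepped through with an ordinary CPU debugger. */
     cgh.host_task([=]() {
       for (int i = 0; i < 16; i++) {
         data[i] = i;
       }
     });
   }).wait();

  sycl::free(data, q);
  return 0;
}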


@@ -501,12 +501,14 @@ endif()
if(NOT APPLE)
option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
option(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION "Switch target of oneAPI implementation from SYCL devices to Host Task (single thread on CPU). This option is only for debugging purposes." OFF)
# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
# acm-g10 is the target for the first Intel Arc Alchemist GPUs.
set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "acm-g10" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
mark_as_advanced(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
endif()


@@ -163,6 +163,9 @@ if(WITH_CYCLES_DEVICE_METAL)
endif()
if(WITH_CYCLES_DEVICE_ONEAPI)
if(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
add_definitions(-DWITH_ONEAPI_SYCL_HOST_TASK)
endif()
if(WITH_CYCLES_ONEAPI_BINARIES)
set(cycles_kernel_oneapi_lib_suffix "_aot")
else()


@@ -429,7 +429,12 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
queue->get_device().get_info<sycl::info::device::device_type>();
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
assert(usm_type == sycl::usm::alloc::device ||
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
# else
const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
# endif
assert(usm_type == main_memory_type ||
(usm_type == sycl::usm::alloc::host &&
(allow_host || device_type == sycl::info::device_type::cpu)) ||
usm_type == sycl::usm::alloc::unknown);
@@ -478,7 +483,11 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
return sycl::malloc_device(memory_size, *queue);
# else
return sycl::malloc_host(memory_size, *queue);
# endif
}
void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
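
The switch to sycl::malloc_host follows from how host tasks execute: they run on the CPU and dereference USM pointers directly, so the "device" allocations must live in host-accessible memory. That is also why check_usm above swaps its expected allocation kind; sycl::get_pointer_type reports such pointers as sycl::usm::alloc::host. A minimal standalone sketch of that behavior (generic SYCL, not Cycles code):

#include <sycl/sycl.hpp>
#include <cassert>

int main()
{
  sycl::queue q;
  /* Host USM: accessible from host code, unlike sycl::malloc_device. */
  void *p = sycl::malloc_host(1024, q);
  assert(sycl::get_pointer_type(p, q.get_context()) == sycl::usm::alloc::host);
  sycl::free(p, q);
  return 0;
}
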
@@ -736,7 +745,11 @@ char *OneapiDevice::device_capabilities()
const std::vector<sycl::device> &oneapi_devices = available_devices();
for (const sycl::device &device : oneapi_devices) {
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
const std::string &name = device.get_info<sycl::info::device::name>();
# else
const std::string &name = "SYCL Host Task (Debug)";
# endif
capabilities << std::string("\t") << name << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
@@ -813,7 +826,11 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
for (sycl::device &device : devices) {
const std::string &platform_name =
device.get_platform().get_info<sycl::info::platform::name>();
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
std::string name = device.get_info<sycl::info::device::name>();
# else
std::string name = "SYCL Host Task (Debug)";
# endif
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());


@@ -752,6 +752,10 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
${SYCL_CPP_FLAGS}
)
if (WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_TASK)
endif()
# Set defaults for spir64 and spir64_gen options
if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
@@ -763,7 +767,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ")
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
if(WITH_CYCLES_ONEAPI_BINARIES)
# Host execution won't use GPU binaries, no need to compile them.
if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
# AoT binaries aren't currently reused when calling sycl::build.
list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
# Iterate over all targets and their options


@@ -30,6 +30,16 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
ccl_global int *ccl_restrict num_indices,
IsActiveOp is_active_op)
{
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
int write_index = 0;
for (int state_index = 0; state_index < num_states; state_index++) {
if (is_active_op(state_index))
indices[write_index++] = state_index;
}
*num_indices = write_index;
return;
# endif /* WITH_ONEAPI_SYCL_HOST_TASK */
const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
const uint blocksize = item_id.get_local_range(0);
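
The host-task branch above is a plain serial stream compaction: it collects the indices of all active states in order, replacing the work-group scan used on real devices. A self-contained illustration of the same loop on hypothetical data:

#include <cassert>

int main()
{
  /* States 1 and 3 are active; compaction gathers their indices in order. */
  const int num_states = 4;
  const bool is_active[4] = {false, true, false, true};
  int indices[4];
  int num_indices = 0;

  int write_index = 0;
  for (int state_index = 0; state_index < num_states; state_index++) {
    if (is_active[state_index]) {
      indices[write_index++] = state_index;
    }
  }
  num_indices = write_index;

  assert(num_indices == 2 && indices[0] == 1 && indices[1] == 3);
  return 0;
}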


@@ -56,7 +56,8 @@
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)
#define ccl_gpu_kernel_signature(name, ...) \
#ifndef WITH_ONEAPI_SYCL_HOST_TASK
# define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
size_t kernel_local_size, \
@@ -67,9 +68,37 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
[=](sycl::nd_item<1> item) {
#define ccl_gpu_kernel_postfix \
# define ccl_gpu_kernel_postfix \
}); \
}
#else
/* An additional anonymous lambda is required to handle all "return" statements in the kernel code. */
# define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
size_t kernel_local_size, \
sycl::handler &cgh, \
__VA_ARGS__) { \
(kg); \
(kernel_local_size); \
cgh.host_task( \
[=]() {\
for (size_t gid = (size_t)0; gid < kernel_global_size; gid++) { \
kg->nd_item_local_id_0 = 0; \
kg->nd_item_local_range_0 = 1; \
kg->nd_item_group_id_0 = gid; \
kg->nd_item_group_range_0 = kernel_global_size; \
kg->nd_item_global_id_0 = gid; \
kg->nd_item_global_range_0 = kernel_global_size; \
auto kernel = [=]() {
# define ccl_gpu_kernel_postfix \
}; \
kernel(); \
} \
}); \
}
#endif
#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
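
To see why the comment above calls for an additional anonymous lambda: the kernel body is wrapped in an inner `kernel` lambda so that any `return` inside it exits only the current emulated work item, and the `gid` loop continues. Hand-tracing the expansion of `ccl_gpu_kernel_signature(foo, int x) { ... } ccl_gpu_kernel_postfix` in host-task mode gives roughly the following (a sketch with stand-in types, ccl_restrict dropped for brevity):

#include <cstddef>
#include <sycl/sycl.hpp>

/* Stand-in for the Cycles struct, with just the fake nd_item fields. */
struct KernelGlobalsGPU {
  size_t nd_item_local_id_0, nd_item_local_range_0;
  size_t nd_item_group_id_0, nd_item_group_range_0;
  size_t nd_item_global_id_0, nd_item_global_range_0;
};

void oneapi_kernel_foo(KernelGlobalsGPU *kg,
                       size_t kernel_global_size,
                       size_t kernel_local_size,
                       sycl::handler &cgh,
                       int x)
{
  (kg);                /* silence unused warnings, as in the macro */
  (kernel_local_size);
  cgh.host_task([=]() {
    for (size_t gid = 0; gid < kernel_global_size; gid++) {
      /* Fake nd_item state read back by the ccl_gpu_* macros. */
      kg->nd_item_local_id_0 = 0;
      kg->nd_item_local_range_0 = 1;
      kg->nd_item_group_id_0 = gid;
      kg->nd_item_group_range_0 = kernel_global_size;
      kg->nd_item_global_id_0 = gid;
      kg->nd_item_global_range_0 = kernel_global_size;
      auto kernel = [=]() {
        /* Kernel body goes here; a `return` exits only this lambda,
         * so the gid loop above keeps running. */
      };
      kernel();
    }
  });
}
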
@@ -83,23 +112,40 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
/* GPU thread, block, grid size and index */
#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
#ifndef WITH_ONEAPI_SYCL_HOST_TASK
# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
/* GPU warp synchronization */
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
#ifdef __SYCL_DEVICE_ONLY__
#define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
# define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
# define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
# ifdef __SYCL_DEVICE_ONLY__
# define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
# else
# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
# endif
#else
#define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
# define ccl_gpu_block_idx_x (kg->nd_item_group_id_0)
# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
# define ccl_gpu_warp_size (1)
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
# define ccl_gpu_syncthreads()
# define ccl_gpu_local_syncthreads()
# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
#endif
/* Debug defines */


@@ -23,6 +23,15 @@ typedef struct KernelGlobalsGPU {
#undef KERNEL_DATA_ARRAY
IntegratorStateGPU *integrator_state;
const KernelData *__data;
#ifdef WITH_ONEAPI_SYCL_HOST_TASK
size_t nd_item_local_id_0;
size_t nd_item_local_range_0;
size_t nd_item_group_id_0;
size_t nd_item_group_range_0;
size_t nd_item_global_id_0;
size_t nd_item_global_range_0;
#endif
} KernelGlobalsGPU;
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;


@@ -230,6 +230,12 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
/* NOTE(@nsirgien): For now, non-uniform work-groups don't work on most oneAPI devices,
* so we extend the work size to fit uniformity requirements. */
global_size = groups_count * local_size;
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
/* The path array implementation is serial in the case of SYCL Host Task execution. */
global_size = 1;
local_size = 1;
# endif
}
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
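
This ties back to the serial gpu_parallel_active_index_array_impl path above: since that implementation visits every state itself, a single host-task invocation is enough, and the gid loop in the host-task ccl_gpu_kernel_signature runs exactly once. A small sketch of the size computation with made-up numbers:

#include <cstddef>

int main()
{
  /* Made-up numbers for illustration. */
  std::size_t groups_count = 64;
  std::size_t local_size = 1024;

  /* Regular device path: work size is padded to uniform work-groups. */
  std::size_t global_size = groups_count * local_size; /* 65536 */

#ifdef WITH_ONEAPI_SYCL_HOST_TASK
  /* Host-task path: the serial active-index implementation walks all
   * states itself, so one emulated work item is enough. */
  global_size = 1;
  local_size = 1;
#endif

  (void)global_size;
  (void)local_size;
  return 0;
}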