Cycles: oneAPI: add support for SYCL host task
This functionality is related only to debugging of SYCL implementation
via single-threaded CPU execution and is disabled by default.
The host device was deprecated in the SYCL 2020 specification and we removed it in 305b92e05f.
Since this is still very useful for debugging, we're restoring a
similar functionality here through SYCL 2020 Host Task.
This commit is contained in:
parent
7355d64f2b
commit
858fffc2df
|
@ -501,12 +501,14 @@ endif()
|
|||
if(NOT APPLE)
|
||||
option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
|
||||
option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
|
||||
option(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION "Switch target of oneAPI implementation from SYCL devices to Host Task (single thread on CPU). This option is only for debugging purposes." OFF)
|
||||
|
||||
# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
|
||||
# acm-g10 is the target for the first Intel Arc Alchemist GPUs.
|
||||
set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "acm-g10" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
|
||||
set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
|
||||
|
||||
mark_as_advanced(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
|
||||
mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
|
||||
mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
|
||||
endif()
|
||||
|
|
|
@ -163,6 +163,9 @@ if(WITH_CYCLES_DEVICE_METAL)
|
|||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_ONEAPI)
|
||||
if(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
|
||||
add_definitions(-DWITH_ONEAPI_SYCL_HOST_TASK)
|
||||
endif()
|
||||
if(WITH_CYCLES_ONEAPI_BINARIES)
|
||||
set(cycles_kernel_oneapi_lib_suffix "_aot")
|
||||
else()
|
||||
|
|
|
@ -429,7 +429,12 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
|
|||
queue->get_device().get_info<sycl::info::device::device_type>();
|
||||
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
|
||||
(void)usm_type;
|
||||
assert(usm_type == sycl::usm::alloc::device ||
|
||||
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
|
||||
# else
|
||||
const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
|
||||
# endif
|
||||
assert(usm_type == main_memory_type ||
|
||||
(usm_type == sycl::usm::alloc::host &&
|
||||
(allow_host || device_type == sycl::info::device_type::cpu)) ||
|
||||
usm_type == sycl::usm::alloc::unknown);
|
||||
|
@ -478,7 +483,11 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
|
|||
{
|
||||
assert(queue_);
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
||||
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
return sycl::malloc_device(memory_size, *queue);
|
||||
# else
|
||||
return sycl::malloc_host(memory_size, *queue);
|
||||
# endif
|
||||
}
|
||||
|
||||
void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
|
||||
|
@ -736,7 +745,11 @@ char *OneapiDevice::device_capabilities()
|
|||
|
||||
const std::vector<sycl::device> &oneapi_devices = available_devices();
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
const std::string &name = device.get_info<sycl::info::device::name>();
|
||||
# else
|
||||
const std::string &name = "SYCL Host Task (Debug)";
|
||||
# endif
|
||||
|
||||
capabilities << std::string("\t") << name << "\n";
|
||||
# define WRITE_ATTR(attribute_name, attribute_variable) \
|
||||
|
@ -813,7 +826,11 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
|
|||
for (sycl::device &device : devices) {
|
||||
const std::string &platform_name =
|
||||
device.get_platform().get_info<sycl::info::platform::name>();
|
||||
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
std::string name = device.get_info<sycl::info::device::name>();
|
||||
# else
|
||||
std::string name = "SYCL Host Task (Debug)";
|
||||
# endif
|
||||
std::string id = "ONEAPI_" + platform_name + "_" + name;
|
||||
if (device.has(sycl::aspect::ext_intel_pci_address)) {
|
||||
id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
|
||||
|
|
|
@ -752,6 +752,10 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
${SYCL_CPP_FLAGS}
|
||||
)
|
||||
|
||||
if (WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
|
||||
list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_TASK)
|
||||
endif()
|
||||
|
||||
# Set defaults for spir64 and spir64_gen options
|
||||
if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
|
||||
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
|
||||
|
@ -763,7 +767,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ")
|
||||
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
|
||||
|
||||
if(WITH_CYCLES_ONEAPI_BINARIES)
|
||||
# Host execution won't use GPU binaries, no need to compile them.
|
||||
if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
|
||||
# AoT binaries aren't currently reused when calling sycl::build.
|
||||
list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
|
||||
# Iterate over all targets and their options
|
||||
|
|
|
@ -30,6 +30,16 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
|
|||
ccl_global int *ccl_restrict num_indices,
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
int write_index = 0;
|
||||
for (int state_index = 0; state_index < num_states; state_index++) {
|
||||
if (is_active_op(state_index))
|
||||
indices[write_index++] = state_index;
|
||||
}
|
||||
*num_indices = write_index;
|
||||
return;
|
||||
# endif /* WITH_ONEAPI_SYCL_HOST_TASK */
|
||||
|
||||
const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
|
||||
const uint blocksize = item_id.get_local_range(0);
|
||||
|
||||
|
|
|
@ -56,7 +56,8 @@
|
|||
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
|
||||
#define ccl_gpu_kernel_threads(block_num_threads)
|
||||
|
||||
#define ccl_gpu_kernel_signature(name, ...) \
|
||||
#ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
# define ccl_gpu_kernel_signature(name, ...) \
|
||||
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
||||
size_t kernel_global_size, \
|
||||
size_t kernel_local_size, \
|
||||
|
@ -67,9 +68,37 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
|||
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
|
||||
[=](sycl::nd_item<1> item) {
|
||||
|
||||
#define ccl_gpu_kernel_postfix \
|
||||
# define ccl_gpu_kernel_postfix \
|
||||
}); \
|
||||
}
|
||||
#else
|
||||
/* Additional anonymous lambda is required to handle all "return" statements in the kernel code */
|
||||
# define ccl_gpu_kernel_signature(name, ...) \
|
||||
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
||||
size_t kernel_global_size, \
|
||||
size_t kernel_local_size, \
|
||||
sycl::handler &cgh, \
|
||||
__VA_ARGS__) { \
|
||||
(kg); \
|
||||
(kernel_local_size); \
|
||||
cgh.host_task( \
|
||||
[=]() {\
|
||||
for (size_t gid = (size_t)0; gid < kernel_global_size; gid++) { \
|
||||
kg->nd_item_local_id_0 = 0; \
|
||||
kg->nd_item_local_range_0 = 1; \
|
||||
kg->nd_item_group_id_0 = gid; \
|
||||
kg->nd_item_group_range_0 = kernel_global_size; \
|
||||
kg->nd_item_global_id_0 = gid; \
|
||||
kg->nd_item_global_range_0 = kernel_global_size; \
|
||||
auto kernel = [=]() {
|
||||
|
||||
# define ccl_gpu_kernel_postfix \
|
||||
}; \
|
||||
kernel(); \
|
||||
} \
|
||||
}); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
|
||||
|
||||
|
@ -83,23 +112,40 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
|||
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
|
||||
|
||||
/* GPU thread, block, grid size and index */
|
||||
#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
|
||||
#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
|
||||
#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
|
||||
#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
|
||||
#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
|
||||
#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
|
||||
#ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
|
||||
# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
|
||||
# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
|
||||
# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
|
||||
# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
|
||||
# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
|
||||
|
||||
/* GPU warp synchronization */
|
||||
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
|
||||
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
#define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
|
||||
# define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
|
||||
# define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
|
||||
# ifdef __SYCL_DEVICE_ONLY__
|
||||
# define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
|
||||
# else
|
||||
# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
|
||||
# endif
|
||||
#else
|
||||
#define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
|
||||
# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
|
||||
# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
|
||||
# define ccl_gpu_block_idx_x (kg->nd_item_group_id_0)
|
||||
# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
|
||||
# define ccl_gpu_warp_size (1)
|
||||
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
|
||||
# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
|
||||
|
||||
# define ccl_gpu_syncthreads()
|
||||
# define ccl_gpu_local_syncthreads()
|
||||
# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
|
||||
#endif
|
||||
|
||||
/* Debug defines */
|
||||
|
|
|
@ -23,6 +23,15 @@ typedef struct KernelGlobalsGPU {
|
|||
#undef KERNEL_DATA_ARRAY
|
||||
IntegratorStateGPU *integrator_state;
|
||||
const KernelData *__data;
|
||||
|
||||
#ifdef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
size_t nd_item_local_id_0;
|
||||
size_t nd_item_local_range_0;
|
||||
size_t nd_item_group_id_0;
|
||||
size_t nd_item_group_range_0;
|
||||
size_t nd_item_global_id_0;
|
||||
size_t nd_item_global_range_0;
|
||||
#endif
|
||||
} KernelGlobalsGPU;
|
||||
|
||||
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
|
||||
|
|
|
@ -230,6 +230,12 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
|||
/* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
|
||||
* we extend work size to fit uniformity requirements. */
|
||||
global_size = groups_count * local_size;
|
||||
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
/* Path array implementation is serial in case of SYCL Host Task execution. */
|
||||
global_size = 1;
|
||||
local_size = 1;
|
||||
# endif
|
||||
}
|
||||
|
||||
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
|
||||
|
|
Loading…
Reference in New Issue