Cycles: oneAPI: remove use of SYCL host device
The host device is deprecated in the SYCL 2020 specification; the CPU device or standard C++ should be used instead.
This commit is contained in:
parent
4776a74bf7
commit
305b92e05f
Notes:
blender-bot
2023-02-14 02:45:41 +01:00
Referenced by commit 858fffc2df, Cycles: oneAPI: add support for SYCL host task
Referenced by commit 454dd3f7f0, Cycles: fix up logic in oneAPI devices filtering
|
@ -489,14 +489,12 @@ endif()
|
|||
if(NOT APPLE)
|
||||
option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
|
||||
option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
|
||||
option(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED "Enable use of SYCL host (CPU) device execution by oneAPI implementation. This option is for debugging purposes and impacts GPU execution." OFF)
|
||||
|
||||
# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
|
||||
# acm-g10 is the architecture for the first Arc Alchemist GPUs but we'll keep using dg2 until IGC dependency is updated to support acm-g10.
|
||||
set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "dg2" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
|
||||
set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
|
||||
|
||||
mark_as_advanced(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
|
||||
mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
|
||||
mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
|
||||
endif()
|
||||
|
|
|
@ -39,7 +39,7 @@ bool device_oneapi_init()
|
|||
_putenv_s("SYCL_CACHE_THRESHOLD", "0");
|
||||
}
|
||||
if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
|
||||
_putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
|
||||
_putenv_s("SYCL_DEVICE_FILTER", "level_zero");
|
||||
}
|
||||
if (getenv("SYCL_ENABLE_PCI") == nullptr) {
|
||||
_putenv_s("SYCL_ENABLE_PCI", "1");
|
||||
|
@ -50,7 +50,7 @@ bool device_oneapi_init()
|
|||
# elif __linux__
|
||||
setenv("SYCL_CACHE_PERSISTENT", "1", false);
|
||||
setenv("SYCL_CACHE_THRESHOLD", "0", false);
|
||||
setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
|
||||
setenv("SYCL_DEVICE_FILTER", "level_zero", false);
|
||||
setenv("SYCL_ENABLE_PCI", "1", false);
|
||||
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
|
||||
# endif
|
||||
|
|
|
@ -430,8 +430,7 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
|
|||
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
|
||||
(void)usm_type;
|
||||
assert(usm_type == sycl::usm::alloc::device ||
|
||||
((device_type == sycl::info::device_type::host ||
|
||||
device_type == sycl::info::device_type::cpu || allow_host) &&
|
||||
((device_type == sycl::info::device_type::cpu || allow_host) &&
|
||||
usm_type == sycl::usm::alloc::host ||
|
||||
usm_type == sycl::usm::alloc::unknown));
|
||||
# else
|
||||
|
@ -672,14 +671,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
|
|||
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
|
||||
allow_all_devices = true;
|
||||
|
||||
/* Host device is useful only for debugging at the moment
|
||||
* so we hide this device with default build settings. */
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
bool allow_host = true;
|
||||
# else
|
||||
bool allow_host = false;
|
||||
# endif
|
||||
|
||||
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
|
||||
|
||||
std::vector<sycl::device> available_devices;
|
||||
|
@ -691,17 +682,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
|
|||
}
|
||||
|
||||
const std::vector<sycl::device> &oneapi_devices =
|
||||
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
|
||||
platform.get_devices(sycl::info::device_type::gpu);
|
||||
(allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
|
||||
platform.get_devices(sycl::info::device_type::gpu);
|
||||
|
||||
for (const sycl::device &device : oneapi_devices) {
|
||||
if (allow_all_devices) {
|
||||
/* still filter out host device if build doesn't support it. */
|
||||
if (allow_host || !device.is_host()) {
|
||||
available_devices.push_back(device);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (!allow_all_devices) {
|
||||
bool filter_out = false;
|
||||
|
||||
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
|
||||
|
@ -733,9 +718,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
|
|||
}
|
||||
}
|
||||
}
|
||||
else if (!allow_host && device.is_host()) {
|
||||
filter_out = true;
|
||||
}
|
||||
else if (!allow_all_devices) {
|
||||
filter_out = true;
|
||||
}
|
||||
|
@ -798,9 +780,7 @@ char *OneapiDevice::device_capabilities()
|
|||
GET_NUM_ATTR(native_vector_width_double)
|
||||
GET_NUM_ATTR(native_vector_width_half)
|
||||
|
||||
size_t max_clock_frequency =
|
||||
(size_t)(device.is_host() ? (size_t)0 :
|
||||
device.get_info<sycl::info::device::max_clock_frequency>());
|
||||
size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
|
||||
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
|
||||
|
||||
GET_NUM_ATTR(address_bits)
|
||||
|
|
|
@ -752,10 +752,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
${SYCL_CPP_FLAGS}
|
||||
)
|
||||
|
||||
if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
|
||||
list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
|
||||
endif()
|
||||
|
||||
# Set defaults for spir64 and spir64_gen options
|
||||
if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
|
||||
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
|
||||
|
|
|
@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
|
|||
* and keep device specific code in compat.h */
|
||||
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
template<typename IsActiveOp>
|
||||
void cpu_serial_active_index_array_impl(const uint num_states,
|
||||
ccl_global int *ccl_restrict indices,
|
||||
ccl_global int *ccl_restrict num_indices,
|
||||
IsActiveOp is_active_op)
|
||||
{
|
||||
int write_index = 0;
|
||||
for (int state_index = 0; state_index < num_states; state_index++) {
|
||||
if (is_active_op(state_index))
|
||||
indices[write_index++] = state_index;
|
||||
}
|
||||
*num_indices = write_index;
|
||||
return;
|
||||
}
|
||||
# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
|
||||
|
||||
template<typename IsActiveOp>
|
||||
void gpu_parallel_active_index_array_impl(const uint num_states,
|
||||
|
@ -182,18 +166,11 @@ __device__
|
|||
num_simd_groups, \
|
||||
simdgroup_offset)
|
||||
#elif defined(__KERNEL_ONEAPI__)
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
# define gpu_parallel_active_index_array( \
|
||||
blocksize, num_states, indices, num_indices, is_active_op) \
|
||||
if (ccl_gpu_global_size_x() == 1) \
|
||||
cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
|
||||
else \
|
||||
gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
|
||||
# else
|
||||
# define gpu_parallel_active_index_array( \
|
||||
blocksize, num_states, indices, num_indices, is_active_op) \
|
||||
gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
|
||||
# endif
|
||||
|
||||
# define gpu_parallel_active_index_array( \
|
||||
blocksize, num_states, indices, num_indices, is_active_op) \
|
||||
gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
|
||||
|
||||
#else
|
||||
|
||||
# define gpu_parallel_active_index_array( \
|
||||
|
|
|
@ -55,18 +55,6 @@
|
|||
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
|
||||
#define ccl_gpu_kernel_threads(block_num_threads)
|
||||
|
||||
#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
# define KG_ND_ITEMS \
|
||||
kg->nd_item_local_id_0 = item.get_local_id(0); \
|
||||
kg->nd_item_local_range_0 = item.get_local_range(0); \
|
||||
kg->nd_item_group_0 = item.get_group(0); \
|
||||
kg->nd_item_group_range_0 = item.get_group_range(0); \
|
||||
kg->nd_item_global_id_0 = item.get_global_id(0); \
|
||||
kg->nd_item_global_range_0 = item.get_global_range(0);
|
||||
#else
|
||||
# define KG_ND_ITEMS
|
||||
#endif
|
||||
|
||||
#define ccl_gpu_kernel_signature(name, ...) \
|
||||
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
||||
size_t kernel_global_size, \
|
||||
|
@ -76,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
|||
(kg); \
|
||||
cgh.parallel_for<class kernel_##name>( \
|
||||
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
|
||||
[=](sycl::nd_item<1> item) { \
|
||||
KG_ND_ITEMS
|
||||
[=](sycl::nd_item<1> item) {
|
||||
|
||||
#define ccl_gpu_kernel_postfix \
|
||||
}); \
|
||||
|
@ -95,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
|||
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
|
||||
|
||||
/* GPU thread, block, grid size and index */
|
||||
#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
|
||||
# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
|
||||
# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
|
||||
# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
|
||||
# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
|
||||
# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
|
||||
#else
|
||||
# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
|
||||
# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
|
||||
# define ccl_gpu_block_idx_x (kg->nd_item_group_0)
|
||||
# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
|
||||
# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
|
||||
# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
|
||||
#endif
|
||||
#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
|
||||
#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
|
||||
#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
|
||||
#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
|
||||
#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
|
||||
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
|
||||
|
||||
#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
|
||||
#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
|
||||
|
||||
/* GPU warp synchronization */
|
||||
|
||||
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
|
||||
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
|
|
|
@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
|
|||
#undef KERNEL_DATA_ARRAY
|
||||
IntegratorStateGPU *integrator_state;
|
||||
const KernelData *__data;
|
||||
#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
size_t nd_item_local_id_0;
|
||||
size_t nd_item_local_range_0;
|
||||
size_t nd_item_group_0;
|
||||
size_t nd_item_group_range_0;
|
||||
|
||||
size_t nd_item_global_id_0;
|
||||
size_t nd_item_global_range_0;
|
||||
#endif
|
||||
} KernelGlobalsGPU;
|
||||
|
||||
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
|
||||
|
|
|
@ -230,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
|||
/* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
|
||||
* we extend work size to fit uniformity requirements. */
|
||||
global_size = groups_count * local_size;
|
||||
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
|
||||
if (queue->get_device().is_host()) {
|
||||
global_size = 1;
|
||||
local_size = 1;
|
||||
}
|
||||
# endif
|
||||
}
|
||||
|
||||
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
|
||||
|
|
Loading…
Reference in New Issue