Merge branch 'refactor-mesh-corners-generic' into refactor-mesh-face-generic

Hans Goudey 2023-02-06 17:21:34 -05:00
commit 709ec5ea3a
219 changed files with 2877 additions and 2604 deletions

View File

@@ -1722,13 +1722,20 @@ class CyclesPreferences(bpy.types.AddonPreferences):
row.prop(self, "peer_memory")
if compute_device_type == 'METAL':
import platform
# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
if platform.machine() == 'arm64':
import platform, re
isNavi2 = False
for device in devices:
obj = re.search("((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name)
if obj:
isNavi2 = True
# MetalRT only works on Apple Silicon and Navi2
if platform.machine() == 'arm64' or isNavi2:
col = layout.column()
col.use_property_split = True
col.prop(self, "kernel_optimization_level")
# Kernel specialization is only supported on Apple Silicon
if platform.machine() == 'arm64':
col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt")
def draw(self, context):

View File

@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
first_error = true;
cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
int value;
cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
can_map_host = value != 0;
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
# endif
}
void CUDADevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(cuda_mem_map_mutex);
foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
CUDAContextScope scope(this);
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
cuMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
cuMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = cuMemAlloc(&device_pointer, size);
if (mem_alloc_result == CUDA_SUCCESS) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
(mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
}
if (mem_alloc_result == CUDA_SUCCESS) {
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != CUDA_SUCCESS) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(cuda_mem_map_mutex);
CUDAMem *cmem = &cuda_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* CUDA memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void CUDADevice::generic_copy_to(device_memory &mem)
bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
CUDAContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(
cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::generic_free(device_memory &mem)
void CUDADevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
CUDAContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free mapped host memory. */
cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
cuMemFreeHost(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
cuda_assert(cuMemFree(mem.device_pointer));
}
bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
CUDAContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
CUresult mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
return mem_alloc_result == CUDA_SUCCESS;
}
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
void CUDADevice::free_host(void *shared_pointer)
{
CUDAContextScope scope(this);
cuMemFreeHost(shared_pointer);
}
bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
CUDAContextScope scope(this);
cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
return true;
}
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const CUDAContextScope scope(this);
cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}
void CUDADevice::mem_alloc(device_memory &mem)
@@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}
@@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}
CUDAMem *cmem = NULL;
Mem *cmem = NULL;
CUarray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (CUarray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(cmem.array);
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();

View File

@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class CUDADevice : public Device {
class CUDADevice : public GPUDevice {
friend class CUDAContextScope;
@@ -29,36 +29,11 @@ class CUDADevice : public Device {
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
struct CUDAMem {
CUDAMem() : texobject(0), array(0), use_mapped_host(false)
{
}
CUtexObject texobject;
CUarray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
thread_mutex cuda_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
CUDADeviceKernels kernels;
static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;

View File

@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
return nullptr;
}
GPUDevice::~GPUDevice() noexcept(false)
{
}
bool GPUDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
return true;
}
else {
return false;
}
}
void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
size_t preferred_working_headroom)
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower than the texture one so that
* some space is left after all texture memory allocations. */
device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
32 * 1024 * 1024LL; // 32MB
device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
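As a quick sanity check of the limit rule above ("leave at least half or 4 GB free, whichever is smaller"), the following standalone sketch reproduces the computation with worked values; host_map_limit is a hypothetical helper written for illustration, not part of this commit:
#include <cassert>
#include <cstddef>
/* Mapped-host limit rule from GPUDevice::init_host_memory(): leave free the
 * smaller of half the system RAM or 4 GB. */
static size_t host_map_limit(size_t system_ram)
{
  const size_t default_limit = 4ULL * 1024 * 1024 * 1024;
  if (system_ram == 0) {
    return 0; /* Unknown RAM: mapped host memory disabled. */
  }
  return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
}
int main()
{
  const size_t GB = 1024ULL * 1024 * 1024;
  assert(host_map_limit(32 * GB) == 28 * GB); /* Half (16 GB) > 4 GB, so 4 GB stays free. */
  assert(host_map_limit(4 * GB) == 2 * GB);   /* Half (2 GB) <= 4 GB, so half stays free. */
  return 0;
}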
void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(device_mem_map_mutex);
foreach (MemMap::value_type &pair, device_mem_map) {
device_memory &mem = *pair.first;
Mem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple backend devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
void *device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
bool mem_alloc_result = false;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
get_device_memory_info(total, free);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
get_device_memory_info(total, free);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = alloc_device(device_pointer, size);
if (mem_alloc_result) {
device_mem_in_use += size;
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = true;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = alloc_host(shared_pointer, size);
assert((mem_alloc_result && shared_pointer != 0) ||
(!mem_alloc_result && shared_pointer == 0));
}
if (mem_alloc_result) {
assert(transform_host_pointer(device_pointer, shared_pointer));
map_host_used += size;
status = " in host memory";
}
}
if (!mem_alloc_result) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(device_mem_map_mutex);
Mem *cmem = &device_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free mapped host memory. */
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
free_host(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
}
}
void GPUDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
* copy data from mem.host_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
}
}
/* DeviceInfo */
CCL_NAMESPACE_END

View File

@@ -309,6 +309,93 @@ class Device {
static uint devices_initialized_mask;
};
/* A GPU device with common functionality shared between GPU backends. */
class GPUDevice : public Device {
protected:
GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
: Device(info_, stats_, profiler_),
texture_info(this, "texture_info", MEM_GLOBAL),
need_texture_info(false),
can_map_host(false),
map_host_used(0),
map_host_limit(0),
device_texture_headroom(0),
device_working_headroom(0),
device_mem_map(),
device_mem_map_mutex(),
move_texture_to_host(false),
device_mem_in_use(0)
{
}
public:
virtual ~GPUDevice() noexcept(false);
/* For GPUs that can use bindless textures in some way or another. */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */
virtual bool load_texture_info();
protected:
/* Memory allocation, only accessed through device_memory. */
friend class device_memory;
bool can_map_host;
size_t map_host_used;
size_t map_host_limit;
size_t device_texture_headroom;
size_t device_working_headroom;
typedef unsigned long long texMemObject;
typedef unsigned long long arrayMemObject;
struct Mem {
Mem() : texobject(0), array(0), use_mapped_host(false)
{
}
texMemObject texobject;
arrayMemObject array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, Mem> MemMap;
MemMap device_mem_map;
thread_mutex device_mem_map_mutex;
bool move_texture_to_host;
/* Simple counter that tracks the amount of device memory in use. */
size_t device_mem_in_use;
virtual void init_host_memory(size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(size_t size, bool for_texture);
/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */
virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
virtual void generic_free(device_memory &mem);
virtual void generic_copy_to(device_memory &mem);
/* total: amount of device memory; free: amount of available device memory. */
virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
virtual void free_device(void *device_pointer) = 0;
virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
virtual void free_host(void *shared_pointer) = 0;
/* This function should return the device pointer corresponding to the shared pointer,
* which is a host buffer allocated in `alloc_host`. It should return `true` if such
* an address transformation is possible and `false` otherwise. */
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_H__ */
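To make the shape of the new abstraction concrete, here is a hypothetical minimal backend (not part of this commit) showing which hooks a subclass supplies, while generic_alloc()/generic_free()/generic_copy_to() in the base class drive the headroom checks, the mapped-host fallback, and the shared device_mem_map; the remaining Device virtuals (mem_alloc() and friends) are omitted for brevity:
#include <cstdlib>
#include <cstring>
/* Sketch of a unified-memory backend; real subclasses (CUDADevice, HIPDevice)
 * wrap their driver APIs instead of malloc()/free(). */
class ToyDevice : public GPUDevice {
 public:
  ToyDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
      : GPUDevice(info, stats, profiler)
  {
    can_map_host = true;
    init_host_memory(); /* Default 128 MB texture / 32 MB working headroom. */
  }
  void get_device_memory_info(size_t &total, size_t &free) override
  {
    total = free = 1024 * 1024 * 1024; /* Pretend 1 GB is always available. */
  }
  bool alloc_device(void *&device_pointer, size_t size) override
  {
    return (device_pointer = malloc(size)) != nullptr;
  }
  void free_device(void *device_pointer) override
  {
    free(device_pointer);
  }
  bool alloc_host(void *&shared_pointer, size_t size) override
  {
    return (shared_pointer = malloc(size)) != nullptr;
  }
  void free_host(void *shared_pointer) override
  {
    free(shared_pointer);
  }
  bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    device_pointer = shared_pointer; /* Unified memory: same address on both sides. */
    return true;
  }
  void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    memcpy(device_pointer, host_pointer, size);
  }
};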

View File

@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
static_assert(sizeof(arrayMemObject) == sizeof(hArray));
first_error = true;
hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* hipDeviceMapHost for mapping host memory when out of device memory.
* hipDeviceLmemResizeToMax for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
int value;
hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
can_map_host = value != 0;
hip_assert(
hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
# endif
}
void HIPDevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void HIPDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(hip_mem_map_mutex);
foreach (HIPMemMap::value_type &pair, hip_mem_map) {
device_memory &mem = *pair.first;
HIPMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
{
HIPContextScope scope(this);
hipDeviceptr_t device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
hipError_t mem_alloc_result = hipErrorOutOfMemory;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
hipMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
hipMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = hipMalloc(&device_pointer, size);
if (mem_alloc_result == hipSuccess) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != hipSuccess && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = hipSuccess;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
}
if (mem_alloc_result == hipSuccess) {
hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != hipSuccess) {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(hip_mem_map_mutex);
HIPMem *cmem = &hip_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* HIP memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void HIPDevice::generic_copy_to(device_memory &mem)
bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
HIPContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(
hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
return mem_alloc_result == hipSuccess;
}
void HIPDevice::generic_free(device_memory &mem)
void HIPDevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
HIPContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free mapped host memory. */
hip_assert(hipFree((hipDeviceptr_t)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
hipHostFree(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
hip_assert(hipFree(mem.device_pointer));
}
bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
{
HIPContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hipError_t mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
hip_mem_map.erase(hip_mem_map.find(&mem));
}
return mem_alloc_result == hipSuccess;
}
void HIPDevice::free_host(void *shared_pointer)
{
HIPContextScope scope(this);
hipHostFree(shared_pointer);
}
bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
HIPContextScope scope(this);
hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
return true;
}
void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const HIPContextScope scope(this);
hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
}
void HIPDevice::mem_alloc(device_memory &mem)
@@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
}
@@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}
HIPMem *cmem = NULL;
Mem *cmem = NULL;
hArray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (hArray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(cmem.array);
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();

View File

@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class HIPDevice : public Device {
class HIPDevice : public GPUDevice {
friend class HIPContextScope;
@@ -26,36 +26,11 @@ class HIPDevice : public Device {
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;
struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}
hipTextureObject_t texobject;
hArray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
HIPDeviceKernels kernels;
static bool have_precompiled_kernels();
@@ -81,17 +56,13 @@ class HIPDevice : public Device {
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;

View File

@@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:

View File

@@ -247,6 +247,8 @@ class device_memory {
bool is_resident(Device *sub_device) const;
protected:
friend class Device;
friend class GPUDevice;
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;

View File

@@ -21,6 +21,7 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;
bool motion_blur = false;

View File

@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
// Allocate lookup buffer for the worst-case scenario.
uint64_t count = objects.size();
blas_lookup.resize(count);
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;

View File

@@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
devices.push_back(info);
device_index++;
}

View File

@@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
@@ -105,6 +110,8 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool use_local_atomic_sort() const;
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);

View File

@@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// Prepare the BLAS argument encoder.
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@@ -271,6 +289,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@@ -278,6 +301,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@@ -1231,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
// Allocate required buffers for the BLAS array.
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}
count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
memcpy([blas_lookup_buffer contents],
bvhMetalRT->blas_lookup.data(),
blas_lookup_buffer.allocatedSize);
if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

View File

@@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};
/* Pipeline State Object types */
enum MetalPipelineType {

View File

@@ -87,6 +87,9 @@ struct ShaderCache {
break;
}
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();
@@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

View File

@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;
virtual void init_execution() override;

View File

@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all acceleration structure resources as used. */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
default:
break;
}
if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}
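/* Illustrative sizing (hypothetical scene values, not from this patch): with
 * max_shaders = 512, the sort passes request round_up(512 * sizeof(int), 16) =
 * 2048 bytes of threadgroup memory, well under the 32 KB asserted above. */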
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);

View File

@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}
/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
}
# endif

View File

@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
vector<OptixStackSizes> osl_stack_size(osl_groups.size());
/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
0,
&pipelines[PIP_SHADE]));
const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
return !have_error();

View File

@ -112,6 +112,13 @@ class DeviceQueue {
return 65536;
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
* INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.

View File

@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
}
@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
compute_sorted_queued_paths(kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;
int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
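    /* Each pass above launches one GPU_PARALLEL_SORT_BLOCK_SIZE (1024-thread)
     * threadgroup per sort partition. */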
return;
}
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);
@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
&d_prefix_sum,
&d_queued_kernel);
queue_->enqueue(kernel, work_size, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
}
}

View File

@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();
@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;

View File

@ -661,7 +661,8 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
#endif
}
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_private const ShaderClosure *sc)
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
Spectrum albedo = sc->weight;
/* Some closures include additional components such as Fresnel terms that cause their albedo to

View File

@ -519,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
@ -613,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturatef(bsdf->alpha_x);

View File

@ -90,8 +90,10 @@ ccl_device float schlick_fresnel(float u)
}
/* Calculate the fresnel color, which is a blend between white and the F0 color */
ccl_device_forceinline Spectrum
interpolate_fresnel_color(float3 L, float3 H, float ior, Spectrum F0)
ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
float3 H,
float ior,
Spectrum F0)
{
/* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
* The reason why we use this remapping instead of directly doing the

View File

@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,

View File

@ -178,7 +178,7 @@ __device__
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
(threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)
# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \

View File

@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *buckets,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Zero the bucket sizes. */
if (local_id < max_shaders) {
atomic_store_local(&buckets[local_id], 0);
}
ccl_gpu_syncthreads();
/* Determine bucket sizes within the partitions. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
}
}
ccl_gpu_syncthreads();
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
if (local_id == 0) {
int offset = 0;
for (int i = 0; i < max_shaders; i++) {
partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
offset = offset + atomic_load_local(&buckets[i]);
}
/* Store the number of active states in this partition. */
partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
}
}
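/* Worked example (hypothetical numbers): with max_shaders = 4 and bucket sizes
 * {3, 0, 5, 2} in this partition, the stored offsets are {0, 3, 3, 8} and the
 * active-state count written at index max_shaders is 10. */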
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
const int num_states_limit,
ccl_global int *indices,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *local_offset,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Calculate each partition's global offset from the prefix sum of the active state counts per
* partition. */
if (local_id < max_shaders) {
int partition_offset = 0;
for (int i = 0; i < uint(grid_id); i++) {
int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
partition_offset += partition_key_count;
}
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
ccl_gpu_syncthreads();
/* Write the sorted active indices. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
}
}
}
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
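To make the two passes concrete, the following is a minimal single-threaded C++ sketch of the same partitioned counting sort. It is illustrative only: the helper names are hypothetical, and the real kernels additionally run one threadgroup per partition, filter states by queued_kernel, and clamp writes to num_states_limit.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>
/* Bucket pass: per-partition exclusive prefix sums of key counts, with the
 * per-partition active count stored in the final (max_shaders) slot. */
static void bucket_pass(const std::vector<uint32_t> &keys,
                        const int max_shaders,
                        const int partition_size,
                        std::vector<int> &offsets)
{
  const int num_partitions = int(offsets.size()) / (max_shaders + 1);
  for (int p = 0; p < num_partitions; p++) {
    std::vector<int> buckets(max_shaders, 0);
    const int begin = p * partition_size;
    const int end = std::min(int(keys.size()), begin + partition_size);
    for (int i = begin; i < end; i++) {
      buckets[keys[i] % max_shaders]++;
    }
    /* Exclusive prefix sum; the final slot holds the active count. */
    int offset = 0;
    for (int s = 0; s < max_shaders; s++) {
      offsets[p * (max_shaders + 1) + s] = offset;
      offset += buckets[s];
    }
    offsets[p * (max_shaders + 1) + max_shaders] = offset;
  }
}
/* Write pass: convert local offsets to global ones, then scatter state
 * indices so they are grouped by key within each partition. */
static std::vector<int> write_pass(const std::vector<uint32_t> &keys,
                                   const int max_shaders,
                                   const int partition_size,
                                   std::vector<int> offsets)
{
  const int num_partitions = int(offsets.size()) / (max_shaders + 1);
  std::vector<int> indices(keys.size(), -1);
  int partition_base = 0;
  for (int p = 0; p < num_partitions; p++) {
    int *local = offsets.data() + p * (max_shaders + 1);
    const int begin = p * partition_size;
    const int end = std::min(int(keys.size()), begin + partition_size);
    for (int i = begin; i < end; i++) {
      indices[partition_base + local[keys[i] % max_shaders]++] = i;
    }
    partition_base += local[max_shaders]; /* Active count slot is untouched. */
  }
  return indices;
}
int main()
{
  const std::vector<uint32_t> keys = {2, 0, 1, 2, 0, 3, 1, 0};
  const int max_shaders = 4;
  const int partition_size = 4;
  std::vector<int> offsets(2 * (max_shaders + 1), 0);
  bucket_pass(keys, max_shaders, partition_size, offsets);
  for (const int i : write_pass(keys, max_shaders, partition_size, offsets)) {
    std::cout << i << ' '; /* Prints: 1 2 0 3 4 7 6 5 */
  }
  std::cout << '\n';
  return 0;
}
For the sample keys, the printed order groups states by shader key within each 4-state partition, matching what the GPU write pass produces.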
template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,

View File

@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
kernel_assert(!"Invalid ift_local");
return false;
}
# endif
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
metalrt_intersector_type metalrt_intersect;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
if (local_isect) {
local_isect->num_hits = 0;
}
kernel_assert(!"Invalid ift_local_prim");
return false;
}
# endif
MetalRTIntersectionLocalPayload payload;
payload.self = ray->self;
@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
payload.result = false;
typename metalrt_intersector_type::result_type intersection;
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
# if defined(__METALRT_MOTION__)
metalrt_intersector_type metalrt_intersect;
typename metalrt_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
# else
metalrt_blas_intersector_type metalrt_intersect;
typename metalrt_blas_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
/* If we know we are going to get at most one hit, as with a random-walk SSS,
 * we can optimize by accepting the first hit. */
if (max_hits == 1) {
metalrt_intersect.accept_any_intersection(true);
}
int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
/* Transform the ray into the object's local space. */
Transform itfm = kernel_data_fetch(objects, local_object).itfm;
r.origin = transform_point(&itfm, r.origin);
r.direction = transform_direction(&itfm, r.direction);
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
r,
metal_ancillaries->blas_accel_structs[blas_index].blas,
metal_ancillaries->ift_local_prim,
payload);
# endif
if (lcg_state) {

View File

@ -105,10 +105,11 @@ struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -117,22 +118,24 @@ struct kernel_gpu_##name \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
const ushort metal_grid_id [[threadgroup_position_in_grid]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
# if defined(__METALRT_MOTION__)
# define METALRT_TAGS instancing, instance_motion, primitive_motion
# define METALRT_BLAS_TAGS , primitive_motion
# else
# define METALRT_TAGS instancing
# define METALRT_BLAS_TAGS
# endif /* __METALRT_MOTION__ */
typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
# if defined(__METALRT_MOTION__)
typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data, primitive_motion>
metalrt_blas_intersector_type;
# else
typedef acceleration_structure<> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
# endif
#endif /* __METALRT__ */
@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};
#ifdef __METALRT__
struct MetalRTBlasWrapper {
metalrt_blas_as_type blas;
};
#endif
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
@ -291,6 +312,9 @@ struct MetalAncillaries {
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_local;
metalrt_blas_ift_type ift_local_prim;
constant MetalRTBlasWrapper *blas_accel_structs;
constant int *blas_userID_to_index_lookUp;
#endif
};

View File

@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
#endif
}
[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri_prim(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint primitive_id [[primitive_id]],
float2 barycentrics [[barycentric_coord]],
float ray_tmax [[distance]])
{
  /* instance_id, a.k.a. the user_id, has been removed. If this function is hit, the SSS
   * traversal was optimized to start from a primitive acceleration structure instead of the
   * root of the global AS. This means we are always intersecting the correct object, so there
   * is no need to check the user_id. */
return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
}
[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
return result;
}
[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
{
/* Unused function. */
BoundingBoxIntersectionResult result;
result.distance = ray_tmax;
result.accept = false;
result.continue_search = false;
return result;
}
template<uint intersection_type>
bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,

View File

@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
oneapi_call(kg,
cgh,

View File

@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
/* Partition/key offsets used when writing sorted active indices. */
ccl_global int *sort_partition_key_offsets;
/* Divisor used to partition active indices by locality when sorting by material. */
uint sort_partition_divisor;
} IntegratorStateGPU;
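/* Illustrative arithmetic (hypothetical values, not from this patch): with
 * max_num_paths_ = 1000000 and num_sort_partitions_ = 16, the divisor is
 * divide_up(1000000, 16) = 62500, so path state index 200000 belongs to
 * partition 200000 / 62500 = 3. */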

View File

@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
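  /* With local atomic sorting the global sort_key_counter buckets are not
   * allocated, so skip the per-key increment below. */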
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
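  /* With local atomic sorting the global sort_key_counter buckets are not
   * allocated, so skip the per-key increment below. */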
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

View File

@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
/* Beckmann */
else {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device void osl_closure_microfacet_ggx_aniso_setup(
@ -652,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device void osl_closure_microfacet_beckmann_aniso_setup(

View File

@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__
/* TODO: solve internal compiler errors and enable light tree on HIP. */
#ifdef __KERNEL_HIP__
/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
# undef __LIGHT_TREE__
#endif
@ -1508,6 +1509,8 @@ typedef enum DeviceKernel : int {
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,

View File

@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
return new_value.float_value;
}
# define atomic_fetch_and_add_uint32(p, x) \
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_sub_uint32(p, x) \
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_inc_uint32(p) \
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_dec_uint32(p) \
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_or_uint32(p, x) \
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
{
return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
{
return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
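/* A hypothetical usage sketch: with the overloads above, the same helper name
 * now compiles against either address space.
 *
 *   threadgroup int buckets[64];
 *   atomic_fetch_and_add_uint32(&buckets[key], 1);   // threadgroup overload
 *
 *   device int *counters;  // e.g. a global sort counter buffer
 *   atomic_fetch_and_add_uint32(&counters[key], 1);  // device overload
 */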
ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
const float old_val,

View File

@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
{
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
use_local_atomic_sort = (atoi(str) != 0);
}
DebugFlags::OptiX::OptiX()

View File

@ -97,6 +97,9 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not. */
bool adaptive_compile = false;
/* Whether local atomic sorting is enabled or not. */
bool use_local_atomic_sort = true;
};
/* Get instance of debug flags registry. */

View File

@ -82,6 +82,8 @@
#include "CLG_log.h"
#ifdef USE_EVENT_BACKGROUND_THREAD
# include "GHOST_TimerTask.h"
# include <pthread.h>
#endif
@ -768,7 +770,12 @@ struct GWL_Seat {
int32_t rate = 0;
/** Time (milliseconds) after which to start repeating keys. */
int32_t delay = 0;
/** Timer for key repeats. */
/**
* Timer for key repeats.
*
* \note For as long as #USE_EVENT_BACKGROUND_THREAD is defined, any access to this
* (including null checks) must lock `timer_mutex` first.
*/
GHOST_ITimerTask *timer = nullptr;
} key_repeat;
@ -832,6 +839,42 @@ static bool gwl_seat_key_depressed_suppress_warning(const GWL_Seat *seat)
return suppress_warning;
}
/**
* \note Caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_add(GWL_Seat *seat,
GHOST_TimerProcPtr key_repeat_fn,
GHOST_TUserDataPtr payload,
const bool use_delay)
{
GHOST_SystemWayland *system = seat->system;
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerTask *timer = new GHOST_TimerTask(
system->getMilliSeconds() + time_start, time_step, key_repeat_fn, payload);
seat->key_repeat.timer = timer;
system->ghost_timer_manager()->addTimer(timer);
#else
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
#endif
}
/**
* \note The caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_remove(GWL_Seat *seat)
{
GHOST_SystemWayland *system = seat->system;
#ifdef USE_EVENT_BACKGROUND_THREAD
system->ghost_timer_manager()->removeTimer(
static_cast<GHOST_TimerTask *>(seat->key_repeat.timer));
#else
system->removeTimer(seat->key_repeat.timer);
#endif
seat->key_repeat.timer = nullptr;
}
/** \} */
/* -------------------------------------------------------------------- */
@ -906,6 +949,16 @@ struct GWL_Display {
/** Guard against multiple threads accessing `events_pending` at once. */
std::mutex events_pending_mutex;
/**
* A separate timer queue, needed so the WAYLAND thread can lock access.
* Using the system's #GHOST_System::getTimerManager is not thread safe because
* access to the timer outside of WAYLAND-specific logic will not lock.
*
* Needed because #GHOST_System::dispatchEvents fires timers
* outside of WAYLAND (without locking the `timer_mutex`).
*/
GHOST_TimerManager *ghost_timer_manager;
#endif /* USE_EVENT_BACKGROUND_THREAD */
};
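/* The locking pattern this implies, repeated at call sites below (sketch):
 *
 *   {
 * #ifdef USE_EVENT_BACKGROUND_THREAD
 *     std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
 * #endif
 *     if (seat->key_repeat.timer) {
 *       ...
 *     }
 *   }
 */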
@ -922,6 +975,9 @@ static void gwl_display_destroy(GWL_Display *display)
ghost_wl_display_lock_without_input(display->wl_display, display->system->server_mutex);
display->events_pthread_is_active = false;
}
delete display->ghost_timer_manager;
display->ghost_timer_manager = nullptr;
#endif
/* For typical WAYLAND use this will always be set.
@ -3718,9 +3774,14 @@ static void keyboard_handle_leave(void *data,
GWL_Seat *seat = static_cast<GWL_Seat *>(data);
seat->keyboard.wl_surface_window = nullptr;
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@ -3780,36 +3841,32 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
return sym;
}
/**
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_cancel(GWL_Seat *seat)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller must check for timer");
delete static_cast<GWL_KeyRepeatPlayload *>(seat->key_repeat.timer->getUserData());
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
}
/**
* Restart the key-repeat timer.
* \param use_delay: When false, use the interval
* (prevents pause when the setting changes while the key is held).
*
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_reset(GWL_Seat *seat, const bool use_delay)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller must check for timer");
GHOST_SystemWayland *system = seat->system;
GHOST_ITimerTask *timer = seat->key_repeat.timer;
GHOST_TimerProcPtr key_repeat_fn = timer->getTimerProc();
GHOST_TimerProcPtr key_repeat_fn = seat->key_repeat.timer->getTimerProc();
GHOST_TUserDataPtr payload = seat->key_repeat.timer->getUserData();
seat->system->removeTimer(seat->key_repeat.timer);
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
gwl_seat_key_repeat_timer_remove(seat);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, payload, use_delay);
}
static void keyboard_handle_key(void *data,
@ -3848,6 +3905,11 @@ static void keyboard_handle_key(void *data,
break;
}
#ifdef USE_EVENT_BACKGROUND_THREAD
/* Any access to `seat->key_repeat.timer` must lock. */
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
struct GWL_KeyRepeatPlayload *key_repeat_payload = nullptr;
/* Delete previous timer. */
@ -3886,23 +3948,14 @@ static void keyboard_handle_key(void *data,
break;
}
case RESET: {
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* The payload will be added again. */
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
break;
}
case CANCEL: {
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
delete key_repeat_payload;
key_repeat_payload = nullptr;
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
break;
}
}
@ -3956,8 +4009,8 @@ static void keyboard_handle_key(void *data,
utf8_buf));
}
};
seat->key_repeat.timer = seat->system->installTimer(
seat->key_repeat.delay, 1000 / seat->key_repeat.rate, key_repeat_fn, key_repeat_payload);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, key_repeat_payload, true);
}
}
@ -3982,8 +4035,13 @@ static void keyboard_handle_modifiers(void *data,
/* A modifier changed so reset the timer,
* see comment in #keyboard_handle_key regarding this behavior. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, true);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, true);
}
}
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@ -4002,9 +4060,14 @@ static void keyboard_repeat_handle_info(void *data,
seat->key_repeat.rate = rate;
seat->key_repeat.delay = delay;
/* It is unlikely, but possible, that this setting changes while repeating. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, false);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* It is unlikely, but possible, that this setting changes while repeating. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, false);
}
}
}
@ -4275,8 +4338,14 @@ static void gwl_seat_capability_keyboard_disable(GWL_Seat *seat)
if (!seat->wl_keyboard) {
return;
}
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}
wl_keyboard_destroy(seat->wl_keyboard);
seat->wl_keyboard = nullptr;
@ -5411,6 +5480,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)
#ifdef USE_EVENT_BACKGROUND_THREAD
gwl_display_event_thread_create(display_);
display_->ghost_timer_manager = new GHOST_TimerManager();
#endif
}
@ -5491,10 +5562,16 @@ bool GHOST_SystemWayland::processEvents(bool waitForEvent)
#endif /* USE_EVENT_BACKGROUND_THREAD */
{
const uint64_t now = getMilliSeconds();
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
{
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
if (ghost_timer_manager()->fireTimers(now)) {
any_processed = true;
}
}
#endif
if (getTimerManager()->fireTimers(getMilliSeconds())) {
if (getTimerManager()->fireTimers(now)) {
any_processed = true;
}
}
@ -6717,6 +6794,13 @@ struct wl_shm *GHOST_SystemWayland::wl_shm() const
return display_->wl_shm;
}
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerManager *GHOST_SystemWayland::ghost_timer_manager()
{
return display_->ghost_timer_manager;
}
#endif
/** \} */
/* -------------------------------------------------------------------- */

View File

@ -165,6 +165,16 @@ class GHOST_SystemWayland : public GHOST_System {
bool cursor_grab_use_software_display_get(const GHOST_TGrabCursorMode mode);
#ifdef USE_EVENT_BACKGROUND_THREAD
/**
* Return a separate WAYLAND local timer manager to #GHOST_System::getTimerManager
* Manipulation & access must lock with #GHOST_SystemWayland::timer_mutex.
*
* See #GWL_Display::ghost_timer_manager doc-string for details on why this is needed.
*/
GHOST_TimerManager *ghost_timer_manager();
#endif
/* WAYLAND direct-data access. */
struct wl_display *wl_display();
@ -233,7 +243,14 @@ class GHOST_SystemWayland : public GHOST_System {
* from running at the same time. */
std::mutex *server_mutex = nullptr;
/** Threads must lock this before manipulating timers. */
/**
* Threads must lock this before manipulating #GWL_Display::ghost_timer_manager.
*
* \note Using a lock separate from `server_mutex` is necessary because the
* server lock is already held when calling `ghost_wl_display_event_pump`.
* If manipulating the timer used the `server_mutex`, event pump can indirectly
* handle key up/down events which would lock `server_mutex` causing a dead-lock.
*/
std::mutex *timer_mutex = nullptr;
std::thread::id main_thread_id;

View File

@ -6,8 +6,7 @@
#pragma once
#include "BLI_float3x3.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_span.hh"
struct Depsgraph;
@ -38,7 +37,7 @@ struct GeometryDeformation {
return translation;
}
const float3x3 &deform_mat = this->deform_mats[position_i];
return deform_mat.inverted() * translation;
return math::transform_point(math::invert(deform_mat), translation);
}
};

View File

@ -13,10 +13,9 @@
#include "BLI_bounds_types.hh"
#include "BLI_cache_mutex.hh"
#include "BLI_float3x3.hh"
#include "BLI_float4x4.hh"
#include "BLI_generic_virtual_array.hh"
#include "BLI_index_mask.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_offset_indices.hh"
#include "BLI_shared_cache.hh"

View File

@ -2,7 +2,7 @@
#pragma once
#include "BLI_float4x4.hh"
#include "BLI_math_matrix_types.hh"
#include "BKE_geometry_set.hh"

View File

@ -19,7 +19,7 @@
#include <mutex>
#include "BLI_float4x4.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_vector.hh"
#include "BLI_vector_set.hh"

View File

@ -497,6 +497,7 @@ void BKE_lib_id_expand_local(struct Main *bmain, struct ID *id, int flags);
*
* Only for local IDs (linked ones already have a unique ID in their library).
*
* \param name: The new name of the given ID; if NULL, the current name of the given ID is used instead.
* \param do_linked_data: if true, also ensure a unique name in case the given \a id is linked
* (otherwise, just ensure that it is properly sorted).
*

View File

@ -142,6 +142,9 @@ enum {
/** Also process internal ID pointers like `ID.newid` or `ID.orig_id`.
* WARNING: Dangerous, use with caution. */
IDWALK_DO_INTERNAL_RUNTIME_POINTERS = (1 << 9),
/** Also process the ID.lib pointer. It is an option because this pointer can usually be fully
 * ignored. */
IDWALK_DO_LIBRARY_POINTER = (1 << 10),
};
typedef struct LibraryForeachIDData LibraryForeachIDData;

View File

@ -222,8 +222,12 @@ void BKE_id_remapper_clear(struct IDRemapper *id_remapper);
bool BKE_id_remapper_is_empty(const struct IDRemapper *id_remapper);
/** Free the given ID Remapper. */
void BKE_id_remapper_free(struct IDRemapper *id_remapper);
/** Add a new remapping. */
/** Add a new remapping. Does not replace an existing mapping for `old_id`, if any. */
void BKE_id_remapper_add(struct IDRemapper *id_remapper, struct ID *old_id, struct ID *new_id);
/** Add a new remapping, replacing a potential already existing mapping of `old_id`. */
void BKE_id_remapper_add_overwrite(struct IDRemapper *id_remapper,
struct ID *old_id,
struct ID *new_id);
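/* A hypothetical call sequence illustrating the difference:
 *
 *   BKE_id_remapper_add(remapper, old_id, id_a);
 *   BKE_id_remapper_add(remapper, old_id, id_b);            // No effect: old_id already mapped to id_a.
 *   BKE_id_remapper_add_overwrite(remapper, old_id, id_b);  // Replaces the mapping: old_id -> id_b.
 */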
/**
* Apply a remapping.

View File

@ -29,6 +29,15 @@ struct UniqueName_Map;
struct UniqueName_Map *BKE_main_namemap_create(void) ATTR_WARN_UNUSED_RESULT;
void BKE_main_namemap_destroy(struct UniqueName_Map **r_name_map) ATTR_NONNULL();
/**
* Destroy all name_maps in the given bmain:
* - In bmain itself for local IDs.
* - In the split bmains in the list, if any (for linked IDs in some cases, e.g. if called during
*   readfile code).
* - In all of the libraries' IDs (for linked IDs).
*/
void BKE_main_namemap_clear(struct Main *bmain) ATTR_NONNULL();
/**
* Ensures the given name is unique within the given ID type.
*

View File

@ -8,7 +8,7 @@
*/
#include "BLI_array.hh"
#include "BLI_float4x4.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_mesh_boolean.hh"
#include "BLI_span.hh"

View File

@ -36,7 +36,7 @@ void BKE_mesh_legacy_convert_uvs_to_generic(Mesh *mesh);
* Move face sets to the legacy type from a generic type.
*/
void BKE_mesh_legacy_face_set_from_generic(
Mesh *mesh, blender::MutableSpan<CustomDataLayer> poly_layers_to_write);
blender::MutableSpan<CustomDataLayer> poly_layers_to_write);
/**
* Copy face sets to the generic data type from the legacy type.
*/

View File

@ -25,6 +25,7 @@
struct BVHCache;
struct EditMeshData;
struct Mesh;
struct MLoopTri;
struct ShrinkwrapBoundaryData;
struct SubdivCCG;
@ -167,10 +168,11 @@ struct MeshRuntime {
SharedCache<LooseEdgeCache> loose_edges_cache;
/**
* A #BLI_bitmap containing tags for the center vertices of subdivided polygons, set by the
* subdivision surface modifier and used by drawing code instead of polygon center face dots.
* A bit vector the size of the number of vertices, set to true for the center vertices of
* subdivided polygons. The values are set by the subdivision surface modifier and used by
* drawing code instead of polygon center face dots. Otherwise this will be empty.
*/
uint32_t *subsurf_face_dot_tags = nullptr;
BitVector<> subsurf_face_dot_tags;
MeshRuntime() = default;
~MeshRuntime();

View File

@ -612,3 +612,25 @@ void BKE_modifier_blend_read_lib(struct BlendLibReader *reader, struct Object *o
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
namespace blender::bke {
/**
* A convenience class that can be used to set `ModifierData::execution_time` based on the lifetime
* of this class.
*/
class ScopedModifierTimer {
private:
ModifierData &md_;
double start_time_;
public:
ScopedModifierTimer(ModifierData &md);
~ScopedModifierTimer();
};
} // namespace blender::bke
#endif
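/* As used later in this commit (see mesh_calc_modifiers), callers simply
 * declare the timer at the top of a modifier's evaluation scope:
 *
 *   blender::bke::ScopedModifierTimer modifier_timer{*md};
 *
 * Per the doc-string above, construction records the start time and
 * destruction stores the elapsed time in ModifierData::execution_time. */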

View File

@ -133,19 +133,19 @@ void BKE_nlastrips_sort_strips(ListBase *strips);
/**
* Add the given NLA-Strip to the given list of strips, assuming that it
* isn't currently a member of another list, NULL, or conflicting with existing
* strips position.
* isn't currently a member of another list, NULL, or conflicting with existing
* strips position.
*/
void BKE_nlastrips_add_strip_unsafe(ListBase *strips, struct NlaStrip *strip);
/**
* @brief NULL checks incoming strip and verifies no overlap / invalid
* configuration against other strips in NLA Track.
* \brief NULL checks incoming strip and verifies no overlap / invalid
* configuration against other strips in NLA Track.
*
* @param strips
* @param strip
* @return true
* @return false
* \param strips: List of strips to add to.
* \param strip: Strip to add.
* \return True when the strip was added, false when it is NULL or conflicts with existing strips.
*/
bool BKE_nlastrips_add_strip(ListBase *strips, struct NlaStrip *strip);

View File

@ -161,7 +161,7 @@ bool BKE_volume_save(const struct Volume *volume,
* file or copy shared grids to make them writeable. */
#ifdef __cplusplus
# include "BLI_float4x4.hh"
# include "BLI_math_matrix_types.hh"
# include "BLI_math_vector_types.hh"
# include "BLI_string_ref.hh"

View File

@ -676,23 +676,17 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
GeometrySet **r_geometry_set)
{
using namespace blender::bke;
/* Input and final mesh. Final mesh is only created the moment the first
* constructive modifier is executed, or a deform modifier needs normals
* or certain data layers. */
/* Input mesh shouldn't be modified. */
Mesh *mesh_input = (Mesh *)ob->data;
/* The final mesh is the result of calculating all enabled modifiers. */
Mesh *mesh_final = nullptr;
/* The result of calculating all leading deform modifiers. */
Mesh *mesh_deform = nullptr;
/* This geometry set contains the non-mesh data that might be generated by modifiers. */
GeometrySet geometry_set_final;
BLI_assert((mesh_input->id.tag & LIB_TAG_COPIED_ON_WRITE_EVAL_RESULT) == 0);
/* TODO: Remove use of `deformed_verts` in mesh modifier stack
* since mesh positions are now stored in a contiguous array. */
float(*deformed_verts)[3] = nullptr;
int num_deformed_verts = mesh_input->totvert;
bool isPrevDeform = false;
/* Mesh with constructive modifiers but no deformation applied. Tracked
* along with final mesh if undeformed / orco coordinates are requested
* for texturing. */
@ -775,20 +769,16 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
}
if (mti->type == eModifierTypeType_OnlyDeform && !sculpt_dyntopo) {
if (!deformed_verts) {
deformed_verts = BKE_mesh_vert_coords_alloc(mesh_input, &num_deformed_verts);
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (!mesh_final) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
else if (isPrevDeform && mti->dependsOnNormals && mti->dependsOnNormals(md)) {
if (mesh_final == nullptr) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
}
BKE_modifier_deform_verts(md, &mectx, mesh_final, deformed_verts, num_deformed_verts);
isPrevDeform = true;
BKE_modifier_deform_verts(md,
&mectx,
mesh_final,
BKE_mesh_vert_positions_for_write(mesh_final),
mesh_final->totvert);
}
else {
break;
@ -800,10 +790,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
* coordinates (like vertex paint). */
if (r_deform) {
mesh_deform = BKE_mesh_copy_for_eval(mesh_input, true);
if (deformed_verts) {
BKE_mesh_vert_coords_apply(mesh_deform, deformed_verts);
}
}
}
@ -861,6 +847,8 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
/* Add orco mesh as layer if needed by this modifier. */
if (mesh_final && mesh_orco && mti->requiredDataMask) {
CustomData_MeshMasks mask = {0};
@ -870,36 +858,19 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
}
}
/* How to apply modifier depends on (a) what we already have as
* a result of previous modifiers (could be a Mesh or just
* deformed vertices) and (b) what type the modifier is. */
if (mti->type == eModifierTypeType_OnlyDeform) {
/* No existing verts to deform, need to build them. */
if (!deformed_verts) {
if (mesh_final) {
/* Deforming a mesh, read the vertex locations
* out of the mesh and deform them. Once done with this
* run of deformers verts will be written back. */
deformed_verts = BKE_mesh_vert_coords_alloc(mesh_final, &num_deformed_verts);
}
else {
deformed_verts = BKE_mesh_vert_coords_alloc(mesh_input, &num_deformed_verts);
}
if (!mesh_final) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
/* if this is not the last modifier in the stack then recalculate the normals
* to avoid giving bogus normals to the next modifier see: T23673. */
else if (isPrevDeform && mti->dependsOnNormals && mti->dependsOnNormals(md)) {
if (mesh_final == nullptr) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
}
BKE_modifier_deform_verts(md, &mectx, mesh_final, deformed_verts, num_deformed_verts);
BKE_modifier_deform_verts(md,
&mectx,
mesh_final,
BKE_mesh_vert_positions_for_write(mesh_final),
mesh_final->totvert);
}
else {
bool check_for_needs_mapping = false;
/* apply vertex coordinates or build a Mesh as necessary */
if (mesh_final != nullptr) {
if (have_non_onlydeform_modifiers_appled == false) {
/* If we only deformed, we won't have initialized #CD_ORIGINDEX.
@ -913,10 +884,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
check_for_needs_mapping = true;
}
if (deformed_verts) {
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
}
have_non_onlydeform_modifiers_appled = true;
/* determine which data layers are needed by following modifiers */
@ -997,11 +964,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
BKE_id_free(nullptr, mesh_final);
}
mesh_final = mesh_next;
if (deformed_verts) {
MEM_freeN(deformed_verts);
deformed_verts = nullptr;
}
}
/* create an orco mesh in parallel */
@ -1072,8 +1034,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
mesh_final->runtime->deformed_only = false;
}
isPrevDeform = (mti->type == eModifierTypeType_OnlyDeform);
if (sculpt_mode && md->type == eModifierType_Multires) {
multires_applied = true;
}
@ -1085,22 +1045,14 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
BKE_modifier_free_temporary_data(md);
}
/* Yay, we are done. If we have a Mesh and deformed vertices,
* we need to apply these back onto the Mesh. If we have no
* Mesh then we need to build one. */
if (mesh_final == nullptr) {
if (deformed_verts == nullptr && allow_shared_mesh) {
if (allow_shared_mesh) {
mesh_final = mesh_input;
}
else {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
}
}
if (deformed_verts) {
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
MEM_freeN(deformed_verts);
deformed_verts = nullptr;
}
/* Denotes whether the object the modifier stack came from owns the mesh, or whether the
 * mesh is shared across multiple objects since there are no effective modifiers. */
@ -1338,6 +1290,8 @@ static void editbmesh_calc_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
/* Add an orco mesh as layer if needed by this modifier. */
if (mesh_final && mesh_orco && mti->requiredDataMask) {
CustomData_MeshMasks mask = {0};

View File

@ -214,6 +214,8 @@ static void setup_app_data(bContext *C,
SWAP(ListBase, bmain->wm, bfd->main->wm);
SWAP(ListBase, bmain->workspaces, bfd->main->workspaces);
SWAP(ListBase, bmain->screens, bfd->main->screens);
/* NOTE: UI IDs are assumed to be only local data-blocks, so no need to call
* #BKE_main_namemap_clear here (otherwise, the swapping would fail in many funny ways). */
if (bmain->name_map != nullptr) {
BKE_main_namemap_destroy(&bmain->name_map);
}

View File

@ -138,7 +138,7 @@ static void cdDM_recalc_looptri(DerivedMesh *dm)
BLI_assert(cddm->dm.looptris.array == NULL);
atomic_cas_ptr(
(void **)&cddm->dm.looptris.array, cddm->dm.looptris.array, cddm->dm.looptris.array_wip);
cddm->dm.looptris.array_wip = NULL;
cddm->dm.looptris.array_wip = nullptr;
}
static void cdDM_free_internal(CDDerivedMesh *cddm)
@ -241,7 +241,7 @@ static DerivedMesh *cdDM_from_mesh_ex(Mesh *mesh,
#if 0
cddm->mface = CustomData_get_layer(&dm->faceData, CD_MFACE);
#else
cddm->mface = NULL;
cddm->mface = nullptr;
#endif
/* commented since even when CD_ORIGINDEX was first added this line fails

View File

@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_array.hh"
#include "BLI_math_matrix.hh"
#include "BLI_set.hh"
#include "BLI_task.hh"
@ -175,25 +176,26 @@ static void fill_mesh_positions(const int main_point_num,
{
if (profile_point_num == 1) {
for (const int i_ring : IndexRange(main_point_num)) {
float4x4 point_matrix = float4x4::from_normalized_axis_data(
float4x4 point_matrix = math::from_orthonormal_axes<float4x4>(
main_positions[i_ring], normals[i_ring], tangents[i_ring]);
if (!radii.is_empty()) {
point_matrix.apply_scale(radii[i_ring]);
point_matrix = math::scale(point_matrix, float3(radii[i_ring]));
}
mesh_positions[i_ring] = point_matrix * profile_positions.first();
mesh_positions[i_ring] = math::transform_point(point_matrix, profile_positions.first());
}
}
else {
for (const int i_ring : IndexRange(main_point_num)) {
float4x4 point_matrix = float4x4::from_normalized_axis_data(
float4x4 point_matrix = math::from_orthonormal_axes<float4x4>(
main_positions[i_ring], normals[i_ring], tangents[i_ring]);
if (!radii.is_empty()) {
point_matrix.apply_scale(radii[i_ring]);
point_matrix = math::scale(point_matrix, float3(radii[i_ring]));
}
const int ring_vert_start = i_ring * profile_point_num;
for (const int i_profile : IndexRange(profile_point_num)) {
mesh_positions[ring_vert_start + i_profile] = point_matrix * profile_positions[i_profile];
mesh_positions[ring_vert_start + i_profile] = math::transform_point(
point_matrix, profile_positions[i_profile]);
}
}
}
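For reference, here is the ring transform being built above, spelled out in standalone C++. The column layout is taken from the (deleted) `float4x4::from_normalized_axis_data` later in this diff -- columns `(forward, -cross(forward, up), up, location)`, with `forward = normal` and `up = tangent` -- and it is an assumption that `math::from_orthonormal_axes` keeps that layout and that `math::scale` scales only the basis columns like the old `apply_scale` did:

struct V3 {
  float x, y, z;
};

static V3 cross(const V3 a, const V3 b)
{
  return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}

/* point = location + radius * (p.x * normal - p.y * cross(normal, tangent) + p.z * tangent). */
static V3 ring_point(const V3 loc, const V3 normal, const V3 tangent, const V3 p, const float radius)
{
  const V3 c = cross(normal, tangent);
  return {loc.x + radius * (normal.x * p.x - c.x * p.y + tangent.x * p.z),
          loc.y + radius * (normal.y * p.x - c.y * p.y + tangent.y * p.z),
          loc.z + radius * (normal.z * p.x - c.z * p.y + tangent.z * p.z)};
}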

View File

@ -17,7 +17,7 @@
#include "BLI_index_range.hh"
#include "BLI_listbase.h"
#include "BLI_math_base.h"
#include "BLI_math_vector.hh"
#include "BLI_math_matrix.hh"
#include "BLI_rand.hh"
#include "BLI_span.hh"
#include "BLI_string.h"
@ -314,6 +314,8 @@ static void curves_evaluate_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (mti->modifyGeometrySet != nullptr) {
mti->modifyGeometrySet(md, &mectx, &geometry_set);
}
@ -417,15 +419,15 @@ void curves_copy_parameters(const Curves &src, Curves &dst)
CurvesSurfaceTransforms::CurvesSurfaceTransforms(const Object &curves_ob, const Object *surface_ob)
{
this->curves_to_world = curves_ob.object_to_world;
this->world_to_curves = this->curves_to_world.inverted();
this->curves_to_world = float4x4_view(curves_ob.object_to_world);
this->world_to_curves = math::invert(this->curves_to_world);
if (surface_ob != nullptr) {
this->surface_to_world = surface_ob->object_to_world;
this->world_to_surface = this->surface_to_world.inverted();
this->surface_to_world = float4x4_view(surface_ob->object_to_world);
this->world_to_surface = math::invert(this->surface_to_world);
this->surface_to_curves = this->world_to_curves * this->surface_to_world;
this->curves_to_surface = this->world_to_surface * this->curves_to_world;
this->surface_to_curves_normal = this->surface_to_curves.inverted().transposed();
this->surface_to_curves_normal = math::transpose(math::invert(this->surface_to_curves));
}
}
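A side note on the last assignment: `transpose(invert(M))` is the standard normal matrix, not anything specific to this refactor. If points and tangents transform as p' = M p and t' = M t, a normal must keep satisfying n'^T t' = 0 whenever n^T t = 0, and n' = (M^-1)^T n does exactly that, since n'^T t' = n^T M^-1 M t = n^T t.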

View File

@ -13,6 +13,7 @@
#include "BLI_bounds.hh"
#include "BLI_index_mask_ops.hh"
#include "BLI_length_parameterize.hh"
#include "BLI_math_matrix.hh"
#include "BLI_math_rotation_legacy.hh"
#include "BLI_task.hh"
@ -997,7 +998,7 @@ static void transform_positions(MutableSpan<float3> positions, const float4x4 &m
{
threading::parallel_for(positions.index_range(), 1024, [&](const IndexRange range) {
for (float3 &position : positions.slice(range)) {
position = matrix * position;
position = math::transform_point(matrix, position);
}
});
}

View File

@ -613,6 +613,8 @@ void BKE_curve_calc_modifiers_pre(Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (!deformedVerts) {
deformedVerts = BKE_curve_nurbs_vert_coords_alloc(source_nurb, &numVerts);
}
@ -733,6 +735,8 @@ static GeometrySet curve_calc_modifiers_post(Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (!geometry_set.has_mesh()) {
geometry_set.replace_mesh(BKE_mesh_new_nomain(0, 0, 0, 0, 0));
}

View File

@ -2,9 +2,9 @@
#include <mutex>
#include "BLI_float4x4.hh"
#include "BLI_index_mask.hh"
#include "BLI_map.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_rand.hh"
#include "BLI_set.hh"
#include "BLI_span.hh"
@ -117,12 +117,12 @@ namespace blender::bke {
static float3 get_transform_position(const float4x4 &transform)
{
return transform.translation();
return transform.location();
}
static void set_transform_position(float4x4 &transform, const float3 position)
{
copy_v3_v3(transform.values[3], position);
transform.location() = position;
}
class InstancePositionAttributeProvider final : public BuiltinAttributeProvider {

View File

@ -78,7 +78,7 @@ static void geometry_set_collect_recursive_collection_instance(
const Collection &collection, const float4x4 &transform, Vector<GeometryInstanceGroup> &r_sets)
{
float4x4 offset_matrix = float4x4::identity();
sub_v3_v3(offset_matrix.values[3], collection.instance_offset);
offset_matrix.location() -= float3(collection.instance_offset);
const float4x4 instance_transform = transform * offset_matrix;
geometry_set_collect_recursive_collection(collection, instance_transform, r_sets);
}
@ -98,7 +98,7 @@ static void geometry_set_collect_recursive_collection(const Collection &collecti
LISTBASE_FOREACH (const CollectionObject *, collection_object, &collection.gobject) {
BLI_assert(collection_object->ob != nullptr);
const Object &object = *collection_object->ob;
const float4x4 object_transform = transform * object.object_to_world;
const float4x4 object_transform = transform * float4x4_view(object.object_to_world);
geometry_set_collect_recursive_object(object, object_transform, r_sets);
}
LISTBASE_FOREACH (const CollectionChild *, collection_child, &collection.children) {
@ -220,9 +220,9 @@ void Instances::ensure_geometry_instances()
Collection &collection = reference.collection();
FOREACH_COLLECTION_OBJECT_RECURSIVE_BEGIN (&collection, object) {
const int handle = instances->add_reference(*object);
instances->add_instance(handle, object->object_to_world);
instances->add_instance(handle, float4x4(object->object_to_world));
float4x4 &transform = instances->transforms().last();
sub_v3_v3(transform.values[3], collection.instance_offset);
transform.location() -= collection.instance_offset;
}
FOREACH_COLLECTION_OBJECT_RECURSIVE_END;
instances->ensure_geometry_instances();

View File

@ -1273,6 +1273,10 @@ bGPDframe *BKE_gpencil_layer_frame_get(bGPDlayer *gpl, int cframe, eGP_GetFrame_
gpl->actframe = gpf;
}
else if (addnew == GP_GETFRAME_ADD_COPY) {
/* The frame_addcopy function copies the active frame of gpl,
 * so we need to set the active frame before copying. */
gpl->actframe = gpf;
gpl->actframe = BKE_gpencil_frame_addcopy(gpl, cframe);
}
else {
@ -1300,6 +1304,10 @@ bGPDframe *BKE_gpencil_layer_frame_get(bGPDlayer *gpl, int cframe, eGP_GetFrame_
gpl->actframe = gpf;
}
else if (addnew == GP_GETFRAME_ADD_COPY) {
/* The frame_addcopy function copies the active frame of gpl,
 * so we need to set the active frame before copying. */
gpl->actframe = gpf;
gpl->actframe = BKE_gpencil_frame_addcopy(gpl, cframe);
}
else {

View File

@ -40,6 +40,15 @@ struct IDRemapper {
source_types |= BKE_idtype_idcode_to_idfilter(GS(old_id->name));
}
void add_overwrite(ID *old_id, ID *new_id)
{
BLI_assert(old_id != nullptr);
BLI_assert(new_id == nullptr || (GS(old_id->name) == GS(new_id->name)));
mappings.add_overwrite(old_id, new_id);
BLI_assert(BKE_idtype_idcode_to_idfilter(GS(old_id->name)) != 0);
source_types |= BKE_idtype_idcode_to_idfilter(GS(old_id->name));
}
bool contains_mappings_for_any(IDTypeFilter filter) const
{
return (source_types & filter) != 0;
@ -159,6 +168,12 @@ void BKE_id_remapper_add(IDRemapper *id_remapper, ID *old_id, ID *new_id)
remapper->add(old_id, new_id);
}
void BKE_id_remapper_add_overwrite(IDRemapper *id_remapper, ID *old_id, ID *new_id)
{
blender::bke::id::remapper::IDRemapper *remapper = unwrap(id_remapper);
remapper->add_overwrite(old_id, new_id);
}
bool BKE_id_remapper_has_mapping_for(const struct IDRemapper *id_remapper, uint64_t type_filter)
{
const blender::bke::id::remapper::IDRemapper *remapper = unwrap(id_remapper);
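The `add` / `add_overwrite` split added here mirrors the usual map-insertion pair; a minimal standalone analogy (`std::unordered_map` standing in for the remapper's internal mapping container, which this sketch does not claim to match):

#include <cassert>
#include <unordered_map>

int main()
{
  std::unordered_map<int, int> remap;
  remap.emplace(1, 10);          /* Like add(): expects key 1 to be new. */
  remap.insert_or_assign(1, 20); /* Like add_overwrite(): replaces 10 with 20. */
  assert(remap.at(1) == 20);
  return 0;
}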

View File

@ -294,8 +294,9 @@ static bool library_foreach_ID_link(Main *bmain,
continue;
}
/* NOTE: ID.lib pointer is purposefully fully ignored here...
* We may want to add it at some point? */
if (flag & IDWALK_DO_LIBRARY_POINTER) {
CALLBACK_INVOKE(id->lib, IDWALK_CB_NEVER_SELF);
}
if (flag & IDWALK_DO_INTERNAL_RUNTIME_POINTERS) {
CALLBACK_INVOKE_ID(id->newid, IDWALK_CB_INTERNAL);

View File

@ -532,13 +532,16 @@ typedef struct LibblockRemapMultipleUserData {
static void libblock_remap_foreach_idpair_cb(ID *old_id, ID *new_id, void *user_data)
{
if (old_id == new_id) {
return;
}
LibBlockRemapMultipleUserData *data = user_data;
Main *bmain = data->bmain;
const short remap_flags = data->remap_flags;
BLI_assert(old_id != NULL);
BLI_assert((new_id == NULL) || GS(old_id->name) == GS(new_id->name));
BLI_assert(old_id != new_id);
if (free_notifier_reference_cb) {
free_notifier_reference_cb(old_id);

View File

@ -186,6 +186,7 @@ void BKE_main_free(Main *mainvar)
BKE_main_idmap_destroy(mainvar->id_map);
}
/* NOTE: `name_map` in libraries are freed together with the library IDs above. */
if (mainvar->name_map) {
BKE_main_namemap_destroy(&mainvar->name_map);
}

View File

@ -195,6 +195,22 @@ void BKE_main_namemap_destroy(struct UniqueName_Map **r_name_map)
*r_name_map = nullptr;
}
void BKE_main_namemap_clear(Main *bmain)
{
for (Main *bmain_iter = bmain; bmain_iter != nullptr; bmain_iter = bmain_iter->next) {
if (bmain_iter->name_map != nullptr) {
BKE_main_namemap_destroy(&bmain_iter->name_map);
}
for (Library *lib_iter = static_cast<Library *>(bmain_iter->libraries.first);
lib_iter != nullptr;
lib_iter = static_cast<Library *>(lib_iter->id.next)) {
if (lib_iter->runtime.name_map != nullptr) {
BKE_main_namemap_destroy(&lib_iter->runtime.name_map);
}
}
}
}
static void main_namemap_populate(UniqueName_Map *name_map, struct Main *bmain, ID *ignore_id)
{
BLI_assert_msg(name_map != nullptr, "name_map should not be null");

View File

@ -107,10 +107,10 @@ static void mesh_copy_data(Main *bmain, ID *id_dst, const ID *id_src, const int
mesh_dst->runtime->wrapper_type_finalize = mesh_src->runtime->wrapper_type_finalize;
mesh_dst->runtime->subsurf_runtime_data = mesh_src->runtime->subsurf_runtime_data;
mesh_dst->runtime->cd_mask_extra = mesh_src->runtime->cd_mask_extra;
/* Copy face dot tags, since meshes may be duplicated after a subsurf modifier
* or node, but we still need to be able to draw face center vertices. */
mesh_dst->runtime->subsurf_face_dot_tags = static_cast<uint32_t *>(
MEM_dupallocN(mesh_src->runtime->subsurf_face_dot_tags));
/* Copy face dot tags, since meshes may be duplicated after a subsurf modifier or node, but we
* still need to be able to draw face center vertices. The tags may be cleared explicitly when
* the topology is changed. */
mesh_dst->runtime->subsurf_face_dot_tags = mesh_src->runtime->subsurf_face_dot_tags;
if ((mesh_src->id.tag & LIB_TAG_NO_MAIN) == 0) {
/* This is a direct copy of a main mesh, so for now it has the same topology. */
mesh_dst->runtime->deformed_only = true;
@ -275,7 +275,6 @@ static void mesh_blend_write(BlendWriter *writer, ID *id, const void *id_address
BKE_mesh_legacy_convert_material_indices_to_mpoly(mesh);
BKE_mesh_legacy_sharp_faces_to_flags(mesh);
BKE_mesh_legacy_bevel_weight_from_layers(mesh);
BKE_mesh_legacy_face_set_from_generic(mesh, poly_layers);
BKE_mesh_legacy_edge_crease_from_layers(mesh);
BKE_mesh_legacy_sharp_edges_to_flags(mesh);
BKE_mesh_legacy_attribute_strings_to_flags(mesh);
@ -295,6 +294,7 @@ static void mesh_blend_write(BlendWriter *writer, ID *id, const void *id_address
if (!BLO_write_is_undo(writer)) {
BKE_mesh_legacy_convert_uvs_to_struct(mesh, temp_arrays_for_legacy_format, loop_layers);
BKE_mesh_legacy_face_set_from_generic(poly_layers);
}
}

View File

@ -17,9 +17,8 @@
#include "BLI_alloca.h"
#include "BLI_array.hh"
#include "BLI_float4x4.hh"
#include "BLI_math.h"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_mesh_boolean.hh"
#include "BLI_mesh_intersect.hh"
#include "BLI_span.hh"
@ -45,7 +44,7 @@ static float4x4 clean_transform(const float4x4 &mat)
const float fuzz = 1e-6f;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
float f = mat.values[i][j];
float f = mat[i][j];
if (fabsf(f) <= fuzz) {
f = 0.0f;
}
@ -55,7 +54,7 @@ static float4x4 clean_transform(const float4x4 &mat)
else if (fabsf(f + 1.0f) <= fuzz) {
f = -1.0f;
}
cleaned.values[i][j] = f;
cleaned[i][j] = f;
}
}
return cleaned;
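The per-entry snapping rule, written out (the snap-to-1 case falls in the elided part of this hunk; the sketch assumes it mirrors the 0 and -1 cases shown):

#include <cmath>

/* Values within `fuzz` of 0, 1, or -1 are replaced exactly, so the
 * exact-arithmetic Boolean code sees near-axis-aligned transforms as
 * truly axis-aligned. */
static float clean_value(const float f, const float fuzz = 1e-6f)
{
  if (std::fabs(f) <= fuzz) {
    return 0.0f;
  }
  if (std::fabs(f - 1.0f) <= fuzz) {
    return 1.0f;
  }
  if (std::fabs(f + 1.0f) <= fuzz) {
    return -1.0f;
  }
  return f;
}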
@ -280,7 +279,7 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
* of the target, multiply each transform by the inverse of the
* target matrix. Exact Boolean works better if these matrices are 'cleaned'
* -- see the comment for the `clean_transform` function, above. */
const float4x4 inv_target_mat = clean_transform(target_transform).inverted();
const float4x4 inv_target_mat = math::invert(clean_transform(target_transform));
/* For each input `Mesh`, make `Vert`s and `Face`s for the corresponding
* vertices and polygons, and keep track of the original indices (using the
@ -297,7 +296,7 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
const float4x4 objn_mat = (obmats[mi] == nullptr) ? float4x4::identity() :
clean_transform(*obmats[mi]);
r_info->to_target_transform[mi] = inv_target_mat * objn_mat;
r_info->has_negative_transform[mi] = objn_mat.is_negative();
r_info->has_negative_transform[mi] = math::is_negative(objn_mat);
/* All meshes 1 and up will be transformed into the local space of operand 0.
* Historical behavior of the modifier has been to flip the faces of any meshes
@ -327,7 +326,7 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
else {
threading::parallel_for(vert_positions.index_range(), 2048, [&](IndexRange range) {
for (int i : range) {
float3 co = r_info->to_target_transform[mi] * vert_positions[i];
float3 co = math::transform_point(r_info->to_target_transform[mi], vert_positions[i]);
mpq3 mco = mpq3(co.x, co.y, co.z);
double3 dco(mco[0].get_d(), mco[1].get_d(), mco[2].get_d());
verts[i] = new Vert(mco, dco, NO_INDEX, i);
@ -560,8 +559,8 @@ static void get_poly2d_cos(const Mesh *me,
axis_dominant_v3_to_m3(r_axis_mat, axis_dominant);
for (const int i : poly_verts.index_range()) {
float3 co = positions[poly_verts[i]];
co = trans_mat * co;
mul_v2_m3v3(cos_2d[i], r_axis_mat, co);
co = math::transform_point(trans_mat, co);
*reinterpret_cast<float2 *>(&cos_2d[i]) = (float3x3(r_axis_mat) * co).xy();
}
}

View File

@ -306,23 +306,22 @@ void BKE_mesh_foreach_mapped_subdiv_face_center(
MeshForeachFlag flag)
{
const float(*positions)[3] = BKE_mesh_vert_positions(mesh);
const OffsetIndices polys = mesh->polys();
const blender::OffsetIndices polys = mesh->polys();
const blender::Span<int> corner_verts = mesh->corner_verts();
const float(*vert_normals)[3] = (flag & MESH_FOREACH_USE_NORMAL) ?
BKE_mesh_vertex_normals_ensure(mesh) :
nullptr;
const int *index = static_cast<const int *>(CustomData_get_layer(&mesh->pdata, CD_ORIGINDEX));
const BLI_bitmap *facedot_tags = mesh->runtime->subsurf_face_dot_tags;
BLI_assert(facedot_tags != nullptr);
const blender::BitVector<> &facedot_tags = mesh->runtime->subsurf_face_dot_tags;
if (index) {
for (int i = 0; i < mesh->totpoly; i++) {
for (const int i : polys.index_range()) {
const int orig = *index++;
if (orig == ORIGINDEX_NONE) {
continue;
}
for (const int vert : corner_verts.slice(polys[i])) {
if (BLI_BITMAP_TEST(facedot_tags, vert)) {
if (facedot_tags[vert]) {
func(userData,
orig,
positions[vert],
@ -332,9 +331,9 @@ void BKE_mesh_foreach_mapped_subdiv_face_center(
}
}
else {
for (int i = 0; i < mesh->totpoly; i++) {
for (const int i : polys.index_range()) {
for (const int vert : corner_verts.slice(polys[i])) {
if (BLI_BITMAP_TEST(facedot_tags, vert)) {
if (facedot_tags[vert]) {
func(userData,
i,
positions[vert],

View File

@ -1282,23 +1282,26 @@ void BKE_mesh_legacy_sharp_faces_from_flags(Mesh *mesh)
/** \name Face Set Conversion
* \{ */
void BKE_mesh_legacy_face_set_from_generic(Mesh *mesh,
blender::MutableSpan<CustomDataLayer> poly_layers)
void BKE_mesh_legacy_face_set_from_generic(blender::MutableSpan<CustomDataLayer> poly_layers)
{
using namespace blender;
void *faceset_data = nullptr;
bool changed = false;
for (CustomDataLayer &layer : poly_layers) {
if (StringRef(layer.name) == ".sculpt_face_set") {
faceset_data = layer.data;
layer.data = nullptr;
CustomData_free_layer_named(&mesh->pdata, ".sculpt_face_set", mesh->totpoly);
layer.type = CD_SCULPT_FACE_SETS;
layer.name[0] = '\0';
changed = true;
break;
}
}
if (faceset_data != nullptr) {
CustomData_add_layer(
&mesh->pdata, CD_SCULPT_FACE_SETS, CD_ASSIGN, faceset_data, mesh->totpoly);
if (!changed) {
return;
}
/* #CustomData expects the layers to be sorted in increasing order based on type. */
std::stable_sort(
poly_layers.begin(),
poly_layers.end(),
[](const CustomDataLayer &a, const CustomDataLayer &b) { return a.type < b.type; });
}
void BKE_mesh_legacy_face_set_to_generic(Mesh *mesh)

View File

@ -202,7 +202,7 @@ Mesh *BKE_mesh_mirror_apply_mirror_on_axis_for_modifier(MirrorModifierData *mmd,
/* Subdivision-surface, for example, won't have mesh data in the custom-data arrays.
* Now add position/#MEdge layers. */
if (BKE_mesh_vert_positions(mesh) != NULL) {
if (BKE_mesh_vert_positions(mesh) != nullptr) {
memcpy(BKE_mesh_vert_positions_for_write(result),
BKE_mesh_vert_positions(mesh),
sizeof(float[3]) * mesh->totvert);

View File

@ -105,7 +105,6 @@ MeshRuntime::~MeshRuntime()
if (this->shrinkwrap_data) {
BKE_shrinkwrap_boundary_data_free(this->shrinkwrap_data);
}
MEM_SAFE_FREE(this->subsurf_face_dot_tags);
}
} // namespace blender::bke
@ -230,10 +229,10 @@ void BKE_mesh_runtime_clear_geometry(Mesh *mesh)
mesh->runtime->bounds_cache.tag_dirty();
mesh->runtime->loose_edges_cache.tag_dirty();
mesh->runtime->looptris_cache.tag_dirty();
mesh->runtime->subsurf_face_dot_tags.clear_and_shrink();
if (mesh->runtime->shrinkwrap_data) {
BKE_shrinkwrap_boundary_data_free(mesh->runtime->shrinkwrap_data);
}
MEM_SAFE_FREE(mesh->runtime->subsurf_face_dot_tags);
}
void BKE_mesh_tag_edges_split(struct Mesh *mesh)
@ -245,10 +244,10 @@ void BKE_mesh_tag_edges_split(struct Mesh *mesh)
free_normals(*mesh->runtime);
free_subdiv_ccg(*mesh->runtime);
mesh->runtime->loose_edges_cache.tag_dirty();
mesh->runtime->subsurf_face_dot_tags.clear_and_shrink();
if (mesh->runtime->shrinkwrap_data) {
BKE_shrinkwrap_boundary_data_free(mesh->runtime->shrinkwrap_data);
}
MEM_SAFE_FREE(mesh->runtime->subsurf_face_dot_tags);
}
void BKE_mesh_tag_coords_changed(Mesh *mesh)

View File

@ -11,6 +11,7 @@
#define DNA_DEPRECATED_ALLOW
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdarg>
#include <cstddef>
@ -1016,6 +1017,9 @@ void BKE_modifier_deform_verts(ModifierData *md,
modwrap_dependsOnNormals(me);
}
mti->deformVerts(md, ctx, me, vertexCos, numVerts);
if (me) {
BKE_mesh_tag_coords_changed(me);
}
}
void BKE_modifier_deform_vertsEM(ModifierData *md,
@ -1514,3 +1518,28 @@ void BKE_modifier_blend_read_lib(BlendLibReader *reader, Object *ob)
}
}
}
namespace blender::bke {
using Clock = std::chrono::high_resolution_clock;
static double get_current_time_in_seconds()
{
return std::chrono::duration<double, std::chrono::seconds::period>(
Clock::now().time_since_epoch())
.count();
}
ScopedModifierTimer::ScopedModifierTimer(ModifierData &md) : md_(md)
{
start_time_ = get_current_time_in_seconds();
}
ScopedModifierTimer::~ScopedModifierTimer()
{
const double end_time = get_current_time_in_seconds();
const double duration = end_time - start_time_;
md_.execution_time = duration;
}
} // namespace blender::bke
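The `ScopedModifierTimer` instances sprinkled through this diff rely on nothing more than RAII: construction samples the clock, destruction writes the elapsed seconds back. A self-contained sketch of the same pattern (hypothetical names, plain C++):

#include <chrono>
#include <cstdio>

class ScopedTimer {
  using Clock = std::chrono::high_resolution_clock;

 public:
  explicit ScopedTimer(double &r_seconds) : r_seconds_(r_seconds), start_(Clock::now()) {}
  ~ScopedTimer()
  {
    r_seconds_ = std::chrono::duration<double>(Clock::now() - start_).count();
  }

 private:
  double &r_seconds_;
  Clock::time_point start_;
};

int main()
{
  double elapsed = 0.0;
  {
    ScopedTimer timer{elapsed};
    /* ... the work being measured ... */
  } /* Destructor runs here and stores the duration. */
  std::printf("%f seconds\n", elapsed);
  return 0;
}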

View File

@ -787,7 +787,6 @@ void BKE_nlastrips_add_strip_unsafe(ListBase *strips, NlaStrip *strip)
}
}
/** NULL and space check before adding an NLA strip. */
bool BKE_nlastrips_add_strip(ListBase *strips, NlaStrip *strip)
{
if (ELEM(NULL, strips, strip)) {

View File

@ -50,7 +50,7 @@ TEST(nla_strip, BKE_nlastrips_add_strip)
NlaStrip strip2{};
strip2.start = 5;
strip2.end = 10;
/* can't add a null NLA strip to an NLA Track. */
EXPECT_FALSE(BKE_nlastrips_add_strip(&strips, NULL));

View File

@ -15,9 +15,9 @@
#include "BLI_string_utf8.h"
#include "BLI_array.hh"
#include "BLI_float4x4.hh"
#include "BLI_math.h"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_math_vector.hh"
#include "BLI_rand.h"
#include "BLI_span.hh"
#include "BLI_vector.hh"
@ -956,11 +956,11 @@ static void make_duplis_geometry_set_impl(const DupliContext *ctx,
case InstanceReference::Type::Object: {
Object &object = reference.object();
float matrix[4][4];
mul_m4_m4m4(matrix, parent_transform, instance_offset_matrices[i].values);
mul_m4_m4m4(matrix, parent_transform, instance_offset_matrices[i].ptr());
make_dupli(ctx_for_instance, &object, matrix, id, &geometry_set, i);
float space_matrix[4][4];
mul_m4_m4m4(space_matrix, instance_offset_matrices[i].values, object.world_to_object);
mul_m4_m4m4(space_matrix, instance_offset_matrices[i].ptr(), object.world_to_object);
mul_m4_m4_pre(space_matrix, parent_transform);
make_recursive_duplis(ctx_for_instance, &object, space_matrix, id, &geometry_set, i);
break;
@ -970,7 +970,7 @@ static void make_duplis_geometry_set_impl(const DupliContext *ctx,
float collection_matrix[4][4];
unit_m4(collection_matrix);
sub_v3_v3(collection_matrix[3], collection.instance_offset);
mul_m4_m4_pre(collection_matrix, instance_offset_matrices[i].values);
mul_m4_m4_pre(collection_matrix, instance_offset_matrices[i].ptr());
mul_m4_m4_pre(collection_matrix, parent_transform);
DupliContext sub_ctx;
@ -1002,7 +1002,7 @@ static void make_duplis_geometry_set_impl(const DupliContext *ctx,
}
case InstanceReference::Type::GeometrySet: {
float new_transform[4][4];
mul_m4_m4m4(new_transform, parent_transform, instance_offset_matrices[i].values);
mul_m4_m4m4(new_transform, parent_transform, instance_offset_matrices[i].ptr());
DupliContext sub_ctx;
if (copy_dupli_context(&sub_ctx,

View File

@ -26,7 +26,7 @@
#include "BKE_pbvh.h"
#include "BKE_subdiv_ccg.h"
#include "DRW_pbvh.h"
#include "DRW_pbvh.hh"
#include "PIL_time.h"
@ -401,8 +401,8 @@ int BKE_pbvh_count_grid_quads(BLI_bitmap **grid_hidden,
/* grid hidden layer is present, so have to check each grid for
* visibility */
int depth1 = int(log2((double)gridsize - 1.0) + DBL_EPSILON);
int depth2 = int(log2((double)display_gridsize - 1.0) + DBL_EPSILON);
int depth1 = int(log2(double(gridsize) - 1.0) + DBL_EPSILON);
int depth2 = int(log2(double(display_gridsize) - 1.0) + DBL_EPSILON);
int skip = depth2 < depth1 ? 1 << (depth1 - depth2 - 1) : 1;
@ -688,8 +688,8 @@ static void pbvh_draw_args_init(PBVH *pbvh, PBVH_GPU_Args *args, PBVHNode *node)
args->face_sets_color_default = pbvh->face_sets_color_default;
args->face_sets_color_seed = pbvh->face_sets_color_seed;
args->vert_positions = pbvh->vert_positions;
args->corner_verts = pbvh->corner_verts;
args->corner_edges = pbvh->mesh ? BKE_mesh_corner_edges(pbvh->mesh) : NULL;
args->corner_verts = {pbvh->corner_verts, pbvh->mesh->totloop};
args->corner_edges = pbvh->mesh ? pbvh->mesh->corner_edges() : blender::Span<int>();
args->mpoly = pbvh->mpoly;
args->mlooptri = pbvh->looptri;
@ -3684,7 +3684,7 @@ static void pbvh_face_iter_step(PBVHFaceIter *fd, bool do_step)
}
BMFace *f = (BMFace *)BLI_gsetIterator_getKey(&fd->bm_faces_iter_);
fd->face.i = (intptr_t)f;
fd->face.i = intptr_t(f);
fd->index = f->head.index;
if (fd->cd_face_set_ != -1) {
@ -3700,7 +3700,7 @@ static void pbvh_face_iter_step(PBVHFaceIter *fd, bool do_step)
BMLoop *l = f->l_first;
do {
fd->verts[vertex_i++].i = (intptr_t)l->v;
fd->verts[vertex_i++].i = intptr_t(l->v);
} while ((l = l->next) != f->l_first);
break;

View File

@ -17,7 +17,7 @@
#include "BKE_ccg.h"
#include "BKE_pbvh.h"
#include "DRW_pbvh.h"
#include "DRW_pbvh.hh"
#include "bmesh.h"
#include "pbvh_intern.hh"
@ -1528,7 +1528,7 @@ bool pbvh_bmesh_node_raycast(PBVHNode *node,
if (j == 0 || len_squared_v3v3(location, cos[j]) <
len_squared_v3v3(location, nearest_vertex_co)) {
copy_v3_v3(nearest_vertex_co, cos[j]);
r_active_vertex->i = (intptr_t)node->bm_orvert[node->bm_ortri[i][j]];
r_active_vertex->i = intptr_t(node->bm_orvert[node->bm_ortri[i][j]]);
}
}
}
@ -1560,7 +1560,7 @@ bool pbvh_bmesh_node_raycast(PBVHNode *node,
if (j == 0 || len_squared_v3v3(location, v_tri[j]->co) <
len_squared_v3v3(location, nearest_vertex_co)) {
copy_v3_v3(nearest_vertex_co, v_tri[j]->co);
r_active_vertex->i = (intptr_t)v_tri[j];
r_active_vertex->i = intptr_t(v_tri[j]);
}
}
}

View File

@ -116,7 +116,7 @@ static void split_pixel_node(
const int axis = BB_widest_axis(&cb);
const float mid = (cb.bmax[axis] + cb.bmin[axis]) * 0.5f;
node->flag = (PBVHNodeFlags)((int)node->flag & (int)~PBVH_TexLeaf);
node->flag = (PBVHNodeFlags)(int(node->flag) & int(~PBVH_TexLeaf));
SplitNodePair *split1 = MEM_new<SplitNodePair>("split_pixel_node split1", split);
SplitNodePair *split2 = MEM_new<SplitNodePair>("split_pixel_node split1", split);
@ -188,7 +188,7 @@ static void split_pixel_node(
float2 delta = uv_prim.delta_barycentric_coord_u;
float2 uv1 = row.start_barycentric_coord;
float2 uv2 = row.start_barycentric_coord + delta * (float)row.num_pixels;
float2 uv2 = row.start_barycentric_coord + delta * float(row.num_pixels);
float co1[3];
float co2[3];
@ -210,7 +210,7 @@ static void split_pixel_node(
t = (mid - co1[axis]) / (co2[axis] - co1[axis]);
}
int num_pixels = (int)floorf((float)row.num_pixels * t);
int num_pixels = int(floorf(float(row.num_pixels) * t));
if (num_pixels) {
row1.num_pixels = num_pixels;
@ -223,7 +223,7 @@ static void split_pixel_node(
row2.num_pixels = row.num_pixels - num_pixels;
row2.start_barycentric_coord = row.start_barycentric_coord +
uv_prim.delta_barycentric_coord_u * (float)num_pixels;
uv_prim.delta_barycentric_coord_u * float(num_pixels);
row2.start_image_coordinate = row.start_image_coordinate;
row2.start_image_coordinate[0] += num_pixels;
@ -731,7 +731,7 @@ static bool update_pixels(PBVH *pbvh, Mesh *mesh, Image *image, ImageUser *image
PBVHNode &node = pbvh->nodes[i];
if (node.flag & PBVH_Leaf) {
node.flag = (PBVHNodeFlags)((int)node.flag | (int)PBVH_TexLeaf);
node.flag = (PBVHNodeFlags)(int(node.flag) | int(PBVH_TexLeaf));
}
}

View File

@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_math_matrix.hh"
#include "pbvh_uv_islands.hh"
#include <optional>
@ -1244,14 +1246,14 @@ UVBorderCorner::UVBorderCorner(UVBorderEdge *first, UVBorderEdge *second, float
float2 UVBorderCorner::uv(float factor, float min_uv_distance)
{
using namespace blender::math;
float2 origin = first->get_uv_vertex(1)->uv;
float angle_between = angle * factor;
float desired_len = max_ff(second->length() * factor + first->length() * (1.0 - factor),
min_uv_distance);
float2 v = first->get_uv_vertex(0)->uv - origin;
normalize_v2(v);
float2 v = normalize(first->get_uv_vertex(0)->uv - origin);
float3x3 rot_mat = float3x3::from_rotation(angle_between);
float2x2 rot_mat = from_rotation<float2x2>(AngleRadian(angle_between));
float2 rotated = rot_mat * v;
float2 result = rotated * desired_len + first->get_uv_vertex(1)->uv;
return result;

View File

@ -24,9 +24,9 @@
#include "BLI_array.hh"
#include "BLI_edgehash.h"
#include "BLI_float3x3.hh"
#include "BLI_map.hh"
#include "BLI_math.h"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_rect.h"
#include "BLI_vector.hh"

View File

@ -380,6 +380,8 @@ static void pointcloud_evaluate_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (mti->modifyGeometrySet) {
mti->modifyGeometrySet(md, &mectx, &geometry_set);
}

View File

@ -161,7 +161,7 @@ static float get_edge_sharpness(const OpenSubdiv_Converter *converter, int manif
return 10.0f;
}
#endif
if (!storage->settings.use_creases || storage->cd_edge_crease == NULL) {
if (!storage->settings.use_creases || storage->cd_edge_crease == nullptr) {
return 0.0f;
}
const int edge_index = storage->manifold_edge_index_reverse[manifold_edge_index];
@ -184,7 +184,7 @@ static bool is_infinite_sharp_vertex(const OpenSubdiv_Converter *converter,
static float get_vertex_sharpness(const OpenSubdiv_Converter *converter, int manifold_vertex_index)
{
ConverterStorage *storage = static_cast<ConverterStorage *>(converter->user_data);
if (!storage->settings.use_creases || storage->cd_vertex_crease == NULL) {
if (!storage->settings.use_creases || storage->cd_vertex_crease == nullptr) {
return 0.0f;
}
const int vertex_index = storage->manifold_vertex_index_reverse[manifold_vertex_index];
@ -208,7 +208,7 @@ static void precalc_uv_layer(const OpenSubdiv_Converter *converter, const int la
const int num_vert = mesh->totvert;
const float limit[2] = {STD_UV_CONNECT_LIMIT, STD_UV_CONNECT_LIMIT};
/* Initialize memory required for the operations. */
if (storage->loop_uv_indices == NULL) {
if (storage->loop_uv_indices == nullptr) {
storage->loop_uv_indices = static_cast<int *>(
MEM_malloc_arrayN(mesh->totloop, sizeof(int), "loop uv vertex index"));
}
@ -227,7 +227,7 @@ static void precalc_uv_layer(const OpenSubdiv_Converter *converter, const int la
storage->num_uv_coordinates = -1;
for (int vertex_index = 0; vertex_index < num_vert; vertex_index++) {
const UvMapVert *uv_vert = BKE_mesh_uv_vert_map_get_vert(uv_vert_map, vertex_index);
while (uv_vert != NULL) {
while (uv_vert != nullptr) {
if (uv_vert->separate) {
storage->num_uv_coordinates++;
}
@ -287,17 +287,17 @@ static void init_functions(OpenSubdiv_Converter *converter)
converter->getNumFaceVertices = get_num_face_vertices;
converter->getFaceVertices = get_face_vertices;
converter->getFaceEdges = NULL;
converter->getFaceEdges = nullptr;
converter->getEdgeVertices = get_edge_vertices;
converter->getNumEdgeFaces = NULL;
converter->getEdgeFaces = NULL;
converter->getNumEdgeFaces = nullptr;
converter->getEdgeFaces = nullptr;
converter->getEdgeSharpness = get_edge_sharpness;
converter->getNumVertexEdges = NULL;
converter->getVertexEdges = NULL;
converter->getNumVertexFaces = NULL;
converter->getVertexFaces = NULL;
converter->getNumVertexEdges = nullptr;
converter->getVertexEdges = nullptr;
converter->getNumVertexFaces = nullptr;
converter->getVertexFaces = nullptr;
converter->isInfiniteSharpVertex = is_infinite_sharp_vertex;
converter->getVertexSharpness = get_vertex_sharpness;
@ -316,36 +316,36 @@ static void initialize_manifold_index_array(const BLI_bitmap *used_map,
int **r_indices_reverse,
int *r_num_manifold_elements)
{
int *indices = NULL;
if (r_indices != NULL) {
int *indices = nullptr;
if (r_indices != nullptr) {
indices = static_cast<int *>(MEM_malloc_arrayN(num_elements, sizeof(int), "manifold indices"));
}
int *indices_reverse = NULL;
if (r_indices_reverse != NULL) {
int *indices_reverse = nullptr;
if (r_indices_reverse != nullptr) {
indices_reverse = static_cast<int *>(
MEM_malloc_arrayN(num_elements, sizeof(int), "manifold indices reverse"));
}
int offset = 0;
for (int i = 0; i < num_elements; i++) {
if (BLI_BITMAP_TEST_BOOL(used_map, i)) {
if (indices != NULL) {
if (indices != nullptr) {
indices[i] = i - offset;
}
if (indices_reverse != NULL) {
if (indices_reverse != nullptr) {
indices_reverse[i - offset] = i;
}
}
else {
if (indices != NULL) {
if (indices != nullptr) {
indices[i] = -1;
}
offset++;
}
}
if (r_indices != NULL) {
if (r_indices != nullptr) {
*r_indices = indices;
}
if (r_indices_reverse != NULL) {
if (r_indices_reverse != nullptr) {
*r_indices_reverse = indices_reverse;
}
*r_num_manifold_elements = num_elements - offset;
@ -372,7 +372,7 @@ static void initialize_manifold_indices(ConverterStorage *storage)
&storage->num_manifold_vertices);
initialize_manifold_index_array(edge_used_map,
mesh->totedge,
NULL,
nullptr,
&storage->manifold_edge_index_reverse,
&storage->num_manifold_edges);
/* Initialize infinite sharp mapping. */
@ -405,7 +405,7 @@ static void init_user_data(OpenSubdiv_Converter *converter,
CustomData_get_layer(&mesh->vdata, CD_CREASE));
user_data->cd_edge_crease = static_cast<const float *>(
CustomData_get_layer(&mesh->edata, CD_CREASE));
user_data->loop_uv_indices = NULL;
user_data->loop_uv_indices = nullptr;
initialize_manifold_indices(user_data);
converter->user_data = user_data;
}

View File

@ -528,9 +528,8 @@ static bool subdiv_mesh_topology_info(const SubdivForeachContext *foreach_contex
subdiv_context->coarse_mesh, num_vertices, num_edges, 0, num_loops, num_polygons, mask);
subdiv_mesh_ctx_cache_custom_data_layers(subdiv_context);
subdiv_mesh_prepare_accumulator(subdiv_context, num_vertices);
MEM_SAFE_FREE(subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags);
subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags = BLI_BITMAP_NEW(num_vertices,
__func__);
subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags.clear();
subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags.resize(num_vertices);
return true;
}
@ -592,7 +591,7 @@ static void evaluate_vertex_and_apply_displacement_copy(const SubdivMeshContext
/* Evaluate undeformed texture coordinate. */
subdiv_vertex_orco_evaluate(ctx, ptex_face_index, u, v, subdiv_vertex_index);
/* Remove face-dot flag. This can happen if there is more than one subsurf modifier. */
BLI_BITMAP_DISABLE(ctx->subdiv_mesh->runtime->subsurf_face_dot_tags, subdiv_vertex_index);
ctx->subdiv_mesh->runtime->subsurf_face_dot_tags[subdiv_vertex_index].reset();
}
static void evaluate_vertex_and_apply_displacement_interpolate(
@ -748,7 +747,7 @@ static void subdiv_mesh_tag_center_vertex(const IndexRange coarse_poly,
Mesh *subdiv_mesh)
{
if (subdiv_mesh_is_center_vertex(coarse_poly, u, v)) {
BLI_BITMAP_ENABLE(subdiv_mesh->runtime->subsurf_face_dot_tags, subdiv_vertex_index);
subdiv_mesh->runtime->subsurf_face_dot_tags[subdiv_vertex_index].set();
}
}
@ -782,10 +781,10 @@ static void subdiv_mesh_vertex_inner(const SubdivForeachContext *foreach_context
static void subdiv_copy_edge_data(SubdivMeshContext *ctx,
MEdge *subdiv_edge,
const MEdge *coarse_edge)
const int coarse_edge_index)
{
const int subdiv_edge_index = subdiv_edge - ctx->subdiv_edges;
if (coarse_edge == nullptr) {
if (coarse_edge_index == ORIGINDEX_NONE) {
subdiv_edge->flag = 0;
if (!ctx->settings->use_optimal_display) {
subdiv_edge->flag |= ME_EDGEDRAW;
@ -795,7 +794,6 @@ static void subdiv_copy_edge_data(SubdivMeshContext *ctx,
}
return;
}
const int coarse_edge_index = coarse_edge - ctx->coarse_edges;
CustomData_copy_data(
&ctx->coarse_mesh->edata, &ctx->subdiv_mesh->edata, coarse_edge_index, subdiv_edge_index, 1);
subdiv_edge->flag |= ME_EDGEDRAW;
@ -812,12 +810,7 @@ static void subdiv_mesh_edge(const SubdivForeachContext *foreach_context,
SubdivMeshContext *ctx = static_cast<SubdivMeshContext *>(foreach_context->user_data);
MEdge *subdiv_medge = ctx->subdiv_edges;
MEdge *subdiv_edge = &subdiv_medge[subdiv_edge_index];
const MEdge *coarse_edge = nullptr;
if (coarse_edge_index != ORIGINDEX_NONE) {
const MEdge *coarse_medge = ctx->coarse_edges;
coarse_edge = &coarse_medge[coarse_edge_index];
}
subdiv_copy_edge_data(ctx, subdiv_edge, coarse_edge);
subdiv_copy_edge_data(ctx, subdiv_edge, coarse_edge_index);
subdiv_edge->v1 = subdiv_v1;
subdiv_edge->v2 = subdiv_v2;
}

View File

@ -14,11 +14,11 @@
#include "BLI_compiler_compat.h"
#include "BLI_fileops.h"
#include "BLI_float4x4.hh"
#include "BLI_ghash.h"
#include "BLI_index_range.hh"
#include "BLI_map.hh"
#include "BLI_math.h"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_path_util.h"
#include "BLI_string.h"
@ -1108,6 +1108,8 @@ static void volume_evaluate_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (mti->modifyGeometrySet) {
mti->modifyGeometrySet(md, &mectx, &geometry_set);
}
@ -1650,7 +1652,7 @@ openvdb::GridBase::ConstPtr BKE_volume_grid_shallow_transform(openvdb::GridBase:
const blender::float4x4 &transform)
{
openvdb::math::Transform::Ptr grid_transform = grid->transform().copy();
grid_transform->postMult(openvdb::Mat4d((float *)transform.values));
grid_transform->postMult(openvdb::Mat4d((float *)transform.ptr()));
/* Create a transformed grid. The underlying tree is shared. */
return grid->copyGridReplacingTransform(grid_transform);

View File

@ -98,8 +98,8 @@ class Any {
private:
/* Makes it possible to use void in the template parameters. */
using RealExtraInfo =
std::conditional_t<std::is_void_v<ExtraInfo>, detail::NoExtraInfo, ExtraInfo>;
using Info = detail::AnyTypeInfo<RealExtraInfo>;
std::conditional_t<std::is_void_v<ExtraInfo>, blender::detail::NoExtraInfo, ExtraInfo>;
using Info = blender::detail::AnyTypeInfo<RealExtraInfo>;
static constexpr size_t RealInlineBufferCapacity = std::max(InlineBufferCapacity,
sizeof(std::unique_ptr<int>));

View File

@ -478,6 +478,28 @@ class BitVector {
this->realloc_to_at_least(new_capacity_in_bits);
}
/**
* Reset the size of the vector to zero elements, but keep the same memory capacity to be
* refilled again.
*/
void clear()
{
size_in_bits_ = 0;
}
/**
* Free memory and reset the vector to zero elements.
*/
void clear_and_shrink()
{
size_in_bits_ = 0;
capacity_in_bits_ = 0;
if (!this->is_inline()) {
allocator_.deallocate(data_);
}
data_ = inline_buffer_;
}
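/* Usage matches how this diff uses the type elsewhere (e.g. the
 * `subsurf_face_dot_tags` changes above); a sketch, assuming the BitVector
 * API shown in this commit:
 *
 *   BitVector<> tags;
 *   tags.resize(100);        // One bit per element, all unset.
 *   tags[42].set();
 *   tags.clear();            // Size 0, capacity kept: cheap to refill.
 *   tags.clear_and_shrink(); // Size 0 and the heap buffer is freed.
 */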
private:
void ensure_space_for_one()
{

View File

@ -1,215 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <cmath>
#include <cstdint>
#include "BLI_assert.h"
#include "BLI_math_base.h"
#include "BLI_math_matrix.h"
#include "BLI_math_vector.h"
#include "BLI_math_vector_types.hh"
namespace blender {
struct float3x3 {
/* A 3x3 matrix in column major order. */
float values[3][3];
float3x3() = default;
float3x3(const float *matrix)
{
memcpy(values, matrix, sizeof(float) * 3 * 3);
}
float3x3(const float matrix[3][3]) : float3x3(static_cast<const float *>(matrix[0]))
{
}
static float3x3 zero()
{
float3x3 result;
zero_m3(result.values);
return result;
}
static float3x3 identity()
{
float3x3 result;
unit_m3(result.values);
return result;
}
static float3x3 from_translation(const float2 translation)
{
float3x3 result = identity();
result.values[2][0] = translation.x;
result.values[2][1] = translation.y;
return result;
}
static float3x3 from_rotation(float rotation)
{
float3x3 result = zero();
const float cosine = std::cos(rotation);
const float sine = std::sin(rotation);
result.values[0][0] = cosine;
result.values[0][1] = sine;
result.values[1][0] = -sine;
result.values[1][1] = cosine;
result.values[2][2] = 1.0f;
return result;
}
static float3x3 from_scale(const float2 scale)
{
float3x3 result = zero();
result.values[0][0] = scale.x;
result.values[1][1] = scale.y;
result.values[2][2] = 1.0f;
return result;
}
static float3x3 from_translation_rotation_scale(const float2 translation,
float rotation,
const float2 scale)
{
float3x3 result;
const float cosine = std::cos(rotation);
const float sine = std::sin(rotation);
result.values[0][0] = scale.x * cosine;
result.values[0][1] = scale.x * sine;
result.values[0][2] = 0.0f;
result.values[1][0] = scale.y * -sine;
result.values[1][1] = scale.y * cosine;
result.values[1][2] = 0.0f;
result.values[2][0] = translation.x;
result.values[2][1] = translation.y;
result.values[2][2] = 1.0f;
return result;
}
static float3x3 from_normalized_axes(const float2 translation,
const float2 horizontal,
const float2 vertical)
{
BLI_ASSERT_UNIT_V2(horizontal);
BLI_ASSERT_UNIT_V2(vertical);
float3x3 result;
result.values[0][0] = horizontal.x;
result.values[0][1] = horizontal.y;
result.values[0][2] = 0.0f;
result.values[1][0] = vertical.x;
result.values[1][1] = vertical.y;
result.values[1][2] = 0.0f;
result.values[2][0] = translation.x;
result.values[2][1] = translation.y;
result.values[2][2] = 1.0f;
return result;
}
/* Construct a transformation that is pivoted around the given origin point. So for instance,
* from_origin_transformation(from_rotation(M_PI_2), float2(0.0f, 2.0f))
* will construct a transformation representing a 90 degree rotation around the point (0, 2). */
static float3x3 from_origin_transformation(const float3x3 &transformation, const float2 origin)
{
return from_translation(origin) * transformation * from_translation(-origin);
}
operator float *()
{
return &values[0][0];
}
operator const float *() const
{
return &values[0][0];
}
float *operator[](const int64_t index)
{
BLI_assert(index >= 0);
BLI_assert(index < 3);
return &values[index][0];
}
const float *operator[](const int64_t index) const
{
BLI_assert(index >= 0);
BLI_assert(index < 3);
return &values[index][0];
}
using c_style_float3x3 = float[3][3];
c_style_float3x3 &ptr()
{
return values;
}
const c_style_float3x3 &ptr() const
{
return values;
}
friend float3x3 operator*(const float3x3 &a, const float3x3 &b)
{
float3x3 result;
mul_m3_m3m3(result.values, a.values, b.values);
return result;
}
friend float3 operator*(const float3x3 &a, const float3 &b)
{
float3 result;
mul_v3_m3v3(result, a.values, b);
return result;
}
void operator*=(const float3x3 &other)
{
mul_m3_m3_post(values, other.values);
}
friend float2 operator*(const float3x3 &transformation, const float2 &vector)
{
float2 result;
mul_v2_m3v2(result, transformation.values, vector);
return result;
}
friend float2 operator*(const float3x3 &transformation, const float (*vector)[2])
{
return transformation * float2(vector);
}
float3x3 transposed() const
{
float3x3 result;
transpose_m3_m3(result.values, values);
return result;
}
float3x3 inverted() const
{
float3x3 result;
invert_m3_m3(result.values, values);
return result;
}
float2 scale_2d() const
{
float2 scale;
mat3_to_size_2d(scale, values);
return scale;
}
friend bool operator==(const float3x3 &a, const float3x3 &b)
{
return equals_m3m3(a.values, b.values);
}
};
} // namespace blender

View File

@ -1,280 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "BLI_math_matrix.h"
#include "BLI_math_vector.h"
#include "BLI_math_vector.hh"
#include "BLI_math_vector_types.hh"
namespace blender {
struct float4x4 {
float values[4][4];
float4x4() = default;
float4x4(const float *matrix)
{
memcpy(values, matrix, sizeof(float) * 16);
}
float4x4(const float matrix[4][4]) : float4x4(static_cast<const float *>(matrix[0]))
{
}
/* Assumes an XYZ euler order. */
static float4x4 from_loc_eul_scale(const float3 location,
const float3 rotation,
const float3 scale)
{
float4x4 mat;
loc_eul_size_to_mat4(mat.values, location, rotation, scale);
return mat;
}
static float4x4 from_location(const float3 location)
{
float4x4 mat = float4x4::identity();
copy_v3_v3(mat.values[3], location);
return mat;
}
static float4x4 from_normalized_axis_data(const float3 location,
const float3 forward,
const float3 up)
{
BLI_ASSERT_UNIT_V3(forward);
BLI_ASSERT_UNIT_V3(up);
/* Negate the cross product so that the resulting matrix has determinant 1 (instead of -1).
* Without the negation, the result would be a so-called improper rotation. That means it
* contains a reflection. Such an improper rotation matrix cannot be converted to another
* representation of a rotation, such as Euler angles. */
const float3 cross = -math::cross(forward, up);
float4x4 matrix;
matrix.values[0][0] = forward.x;
matrix.values[1][0] = cross.x;
matrix.values[2][0] = up.x;
matrix.values[3][0] = location.x;
matrix.values[0][1] = forward.y;
matrix.values[1][1] = cross.y;
matrix.values[2][1] = up.y;
matrix.values[3][1] = location.y;
matrix.values[0][2] = forward.z;
matrix.values[1][2] = cross.z;
matrix.values[2][2] = up.z;
matrix.values[3][2] = location.z;
matrix.values[0][3] = 0.0f;
matrix.values[1][3] = 0.0f;
matrix.values[2][3] = 0.0f;
matrix.values[3][3] = 1.0f;
return matrix;
}
static float4x4 identity()
{
float4x4 mat;
unit_m4(mat.values);
return mat;
}
operator float *()
{
return &values[0][0];
}
operator const float *() const
{
return &values[0][0];
}
float *operator[](const int64_t index)
{
BLI_assert(index >= 0);
BLI_assert(index < 4);
return &values[index][0];
}
const float *operator[](const int64_t index) const
{
BLI_assert(index >= 0);
BLI_assert(index < 4);
return &values[index][0];
}
using c_style_float4x4 = float[4][4];
c_style_float4x4 &ptr()
{
return values;
}
const c_style_float4x4 &ptr() const
{
return values;
}
friend float4x4 operator*(const float4x4 &a, const float4x4 &b)
{
float4x4 result;
mul_m4_m4m4(result.values, a.values, b.values);
return result;
}
void operator*=(const float4x4 &other)
{
mul_m4_m4_post(values, other.values);
}
/**
* This also applies the translation on the vector. Use `m.ref_3x3() * v` if that is not
* intended.
*/
friend float3 operator*(const float4x4 &m, const float3 &v)
{
float3 result;
mul_v3_m4v3(result, m.values, v);
return result;
}
friend float3 operator*(const float4x4 &m, const float (*v)[3])
{
return m * float3(v);
}
friend bool operator==(const float4x4 &a, const float4x4 &b)
{
return equals_m4m4(a.ptr(), b.ptr());
}
friend bool operator!=(const float4x4 &a, const float4x4 &b)
{
return !(a == b);
}
float3 translation() const
{
return float3(values[3]);
}
/* Assumes XYZ rotation order. */
float3 to_euler() const
{
float3 euler;
mat4_to_eul(euler, values);
return euler;
}
float3 scale() const
{
float3 scale;
mat4_to_size(scale, values);
return scale;
}
void apply_scale(const float scale)
{
values[0][0] *= scale;
values[0][1] *= scale;
values[0][2] *= scale;
values[1][0] *= scale;
values[1][1] *= scale;
values[1][2] *= scale;
values[2][0] *= scale;
values[2][1] *= scale;
values[2][2] *= scale;
}
float4x4 inverted() const
{
float4x4 result;
invert_m4_m4(result.values, values);
return result;
}
/**
* Matrix inversion can be implemented more efficiently for affine matrices.
*/
float4x4 inverted_affine() const
{
BLI_assert(values[0][3] == 0.0f && values[1][3] == 0.0f && values[2][3] == 0.0f &&
values[3][3] == 1.0f);
return this->inverted();
}
float4x4 transposed() const
{
float4x4 result;
transpose_m4_m4(result.values, values);
return result;
}
float4x4 inverted_transposed_affine() const
{
return this->inverted_affine().transposed();
}
struct float3x3_ref {
const float4x4 &data;
friend float3 operator*(const float3x3_ref &m, const float3 &v)
{
float3 result;
mul_v3_mat3_m4v3(result, m.data.values, v);
return result;
}
};
float3x3_ref ref_3x3() const
{
return {*this};
}
static float4x4 interpolate(const float4x4 &a, const float4x4 &b, float t)
{
float result[4][4];
interp_m4_m4m4(result, a.values, b.values, t);
return result;
}
bool is_negative() const
{
return is_negative_m4(ptr());
}
uint64_t hash() const
{
uint64_t h = 435109;
for (int i = 0; i < 16; i++) {
float value = (static_cast<const float *>(values[0]))[i];
h = h * 33 + *reinterpret_cast<const uint32_t *>(&value);
}
return h;
}
friend std::ostream &operator<<(std::ostream &stream, const float4x4 &mat)
{
char fchar[16];
stream << "(\n";
for (int i = 0; i < 4; i++) {
stream << "(";
for (int j = 0; j < 4; j++) {
snprintf(fchar, sizeof(fchar), "%11.6f", mat[j][i]);
stream << fchar;
if (j != 3) {
stream << ", ";
}
}
stream << ")\n";
}
stream << ")\n";
return stream;
}
};
} // namespace blender

View File

@ -243,6 +243,16 @@ void BLI_movelisttolist(struct ListBase *dst, struct ListBase *src) ATTR_NONNULL
* Moves the entire contents of \a src at the beginning of \a dst.
*/
void BLI_movelisttolist_reverse(struct ListBase *dst, struct ListBase *src) ATTR_NONNULL(1, 2);
/**
* Split `original_listbase` after the given `vlink`, putting the remainder of the list into the
* given `split_listbase`.
*
* \note If `vlink` is nullptr, it is considered as 'the item before the first item', so the whole
* list is moved from `original_listbase` to `split_listbase`.
*/
void BLI_listbase_split_after(struct ListBase *original_listbase,
struct ListBase *split_listbase,
void *vlink) ATTR_NONNULL(1, 2);
/**
* Sets dst to a duplicate of the entire contents of src. dst may be the same as src.
*/

View File

@ -176,7 +176,7 @@ template<typename T, int Size>
/**
* Create a translation-only matrix. Matrix dimensions should be at least 4 col x 3 row.
*/
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::vec3_type &location);
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::loc_type &location);
/**
* Create a matrix whose diagonal is defined by the given scale vector.
@ -201,14 +201,14 @@ template<typename MatT, typename RotationT, typename VectorT>
* Create a transform matrix with translation and rotation applied in this order.
*/
template<typename MatT, typename RotationT>
[[nodiscard]] MatT from_loc_rot(const typename MatT::vec3_type &location,
[[nodiscard]] MatT from_loc_rot(const typename MatT::loc_type &location,
const RotationT &rotation);
/**
* Create a transform matrix with translation, rotation and scale applied in this order.
*/
template<typename MatT, typename RotationT, int ScaleDim>
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::vec3_type &location,
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::loc_type &location,
const RotationT &rotation,
const VecBase<typename MatT::base_type, ScaleDim> &scale);
@ -229,6 +229,14 @@ template<typename MatT, typename VectorT>
const VectorT forward,
const VectorT up);
/**
* Construct a transformation that is pivoted around the given origin point. So for instance,
* from_origin_transform<MatT>(from_rotation(M_PI_2), float2(0.0f, 2.0f))
* will construct a transformation representing a 90 degree rotation around the point (0, 2).
*/
template<typename MatT, typename VectorT>
[[nodiscard]] MatT from_origin_transform(const MatT &transform, const VectorT origin);
/** \} */
/* -------------------------------------------------------------------- */
@ -259,6 +267,8 @@ template<typename T, bool Normalized = false>
*/
template<bool AllowNegativeScale = false, typename T, int NumCol, int NumRow>
[[nodiscard]] inline VecBase<T, 3> to_scale(const MatBase<T, NumCol, NumRow> &mat);
template<bool AllowNegativeScale = false, typename T>
[[nodiscard]] inline VecBase<T, 2> to_scale(const MatBase<T, 2, 2> &mat);
/**
* Decompose a matrix into location, rotation, and scale components.
@ -464,6 +474,9 @@ inline bool is_zero(const MatBase<T, NumCol, NumRow> &mat)
/* Implementation details. */
namespace detail {
template<typename T, int NumCol, int NumRow>
[[nodiscard]] MatBase<T, NumCol, NumRow> from_rotation(const AngleRadian<T> &rotation);
template<typename T, int NumCol, int NumRow>
[[nodiscard]] MatBase<T, NumCol, NumRow> from_rotation(const EulerXYZ<T> &rotation);
@ -491,7 +504,8 @@ template<typename T, int Size>
[[nodiscard]] MatBase<T, Size, Size> invert(const MatBase<T, Size, Size> &mat)
{
bool success;
return invert(mat, success);
/* Explicit template parameter to please MSVC. */
return invert<T, Size>(mat, success);
}
template<typename T, int NumCol, int NumRow>
@ -798,7 +812,7 @@ MatBase<T, NumCol, NumRow> from_rotation(const EulerXYZ<T> &rotation)
DoublePrecision sc = si * ch;
DoublePrecision ss = si * sh;
MatT mat;
MatT mat = MatT::identity();
mat[0][0] = T(cj * ch);
mat[1][0] = T(sj * sc - cs);
mat[2][0] = T(sj * cc + ss);
@ -833,7 +847,7 @@ MatBase<T, NumCol, NumRow> from_rotation(const Quaternion<T> &rotation)
DoublePrecision qbc = q2 * q3;
DoublePrecision qcc = q3 * q3;
MatT mat;
MatT mat = MatT::identity();
mat[0][0] = T(1.0 - qbb - qcc);
mat[0][1] = T(qdc + qab);
mat[0][2] = T(-qdb + qac);
@ -877,7 +891,25 @@ MatBase<T, NumCol, NumRow> from_rotation(const AxisAngle<T> &rotation)
return mat;
}
template<typename T, int NumCol, int NumRow>
MatBase<T, NumCol, NumRow> from_rotation(const AngleRadian<T> &rotation)
{
using MatT = MatBase<T, NumCol, NumRow>;
T ci = math::cos(rotation.value);
T si = math::sin(rotation.value);
MatT mat = MatT::identity();
mat[0][0] = ci;
mat[1][0] = -si;
mat[0][1] = si;
mat[1][1] = ci;
return mat;
}
/* Using explicit template instantiations in order to reduce compilation time. */
extern template MatBase<float, 2, 2> from_rotation(const AngleRadian<float> &rotation);
extern template MatBase<float, 3, 3> from_rotation(const AngleRadian<float> &rotation);
extern template MatBase<float, 3, 3> from_rotation(const EulerXYZ<float> &rotation);
extern template MatBase<float, 4, 4> from_rotation(const EulerXYZ<float> &rotation);
extern template MatBase<float, 3, 3> from_rotation(const Quaternion<float> &rotation);
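A quick standalone check of the column layout in the new `AngleRadian` overload (plain C++, independent of MatBase): rotating the x-axis by 90 degrees should land on the y-axis.

#include <cmath>
#include <cstdio>

int main()
{
  const float angle = 1.57079632679f; /* 90 degrees in radians. */
  const float ci = std::cos(angle);
  const float si = std::sin(angle);
  /* Column-major, as above: col 0 = (ci, si), col 1 = (-si, ci). */
  const float m[2][2] = {{ci, si}, {-si, ci}};
  const float v[2] = {1.0f, 0.0f};
  const float r[2] = {m[0][0] * v[0] + m[1][0] * v[1],
                      m[0][1] * v[0] + m[1][1] * v[1]};
  std::printf("(%f, %f)\n", r[0], r[1]); /* Prints approximately (0, 1). */
  return 0;
}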
@ -940,6 +972,18 @@ template<bool AllowNegativeScale, typename T, int NumCol, int NumRow>
return result;
}
template<bool AllowNegativeScale, typename T>
[[nodiscard]] inline VecBase<T, 2> to_scale(const MatBase<T, 2, 2> &mat)
{
VecBase<T, 2> result = {length(mat.x), length(mat.y)};
if constexpr (AllowNegativeScale) {
if (UNLIKELY(is_negative(mat))) {
result = -result;
}
}
return result;
}
/* Implementation details. Use `to_euler` and `to_quaternion` instead. */
namespace detail {
@ -982,7 +1026,7 @@ inline void to_loc_rot_scale(const MatBase<T, 4, 4> &mat,
to_rot_scale<AllowNegativeScale>(MatBase<T, 3, 3>(mat), r_rotation, r_scale);
}
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::vec3_type &location)
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::loc_type &location)
{
MatT mat = MatT::identity();
mat.location() = location;
@ -1013,22 +1057,23 @@ template<typename MatT, typename RotationT, typename VectorT>
}
template<typename MatT, typename RotationT, int ScaleDim>
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::vec3_type &location,
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::loc_type &location,
const RotationT &rotation,
const VecBase<typename MatT::base_type, ScaleDim> &scale)
{
using Mat3x3 = MatBase<typename MatT::base_type, 3, 3>;
MatT mat = MatT(from_rot_scale<Mat3x3>(rotation, scale));
using MatRotT =
MatBase<typename MatT::base_type, MatT::loc_type::type_length, MatT::loc_type::type_length>;
MatT mat = MatT(from_rot_scale<MatRotT>(rotation, scale));
mat.location() = location;
return mat;
}
template<typename MatT, typename RotationT>
[[nodiscard]] MatT from_loc_rot(const typename MatT::vec3_type &location,
const RotationT &rotation)
[[nodiscard]] MatT from_loc_rot(const typename MatT::loc_type &location, const RotationT &rotation)
{
using Mat3x3 = MatBase<typename MatT::base_type, 3, 3>;
MatT mat = MatT(from_rotation<Mat3x3>(rotation));
using MatRotT =
MatBase<typename MatT::base_type, MatT::loc_type::type_length, MatT::loc_type::type_length>;
MatT mat = MatT(from_rotation<MatRotT>(rotation));
mat.location() = location;
return mat;
}
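With location() now typed as loc_type, a 3x3 matrix can carry a 2D translation directly; a sketch of the intended call (aliases assumed):
/* Build a 2D transform: translate by (10, 5), rotate by 0.5 rad. */
float3x3 m = math::from_loc_rot<float3x3>(float2(10.0f, 5.0f), AngleRadian(0.5f));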
@ -1059,6 +1104,12 @@ template<typename MatT, typename VectorT>
return matrix;
}
template<typename MatT, typename VectorT>
[[nodiscard]] MatT from_origin_transform(const MatT &transform, const VectorT origin)
{
return from_location<MatT>(origin) * transform * from_location<MatT>(-origin);
}
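A usage sketch for from_origin_transform, which is the usual translate–transform–translate-back conjugation so a transform acts about a pivot instead of the origin (aliases assumed):
/* Rotate 90 degrees around Z, but about the point (1, 2, 0). */
float4x4 rot = math::from_rotation<float4x4>(EulerXYZ(0.0f, 0.0f, M_PI_2));
float4x4 pivoted = math::from_origin_transform(rot, float3(1.0f, 2.0f, 0.0f));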
template<typename T>
VecBase<T, 3> transform_point(const MatBase<T, 3, 3> &mat, const VecBase<T, 3> &point)
{

View File

@ -30,8 +30,6 @@
* defined outside of the class in the `blender::math` namespace.
*/
#define __BLI_MATH_MATRIX_TYPES_HH__
#include <array>
#include <cmath>
#include <iostream>
@ -79,6 +77,7 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
using vec3_type = VecBase<T, 3>;
using col_type = VecBase<T, NumRow>;
using row_type = VecBase<T, NumCol>;
using loc_type = VecBase<T, (NumRow < NumCol) ? NumRow : (NumRow - 1)>;
static constexpr int min_dim = (NumRow < NumCol) ? NumRow : NumCol;
static constexpr int col_len = NumCol;
static constexpr int row_len = NumRow;
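A sketch of what loc_type resolves to for the common sizes, following directly from the definition above (illustrative static_asserts, <type_traits> assumed):
static_assert(std::is_same_v<MatBase<float, 4, 4>::loc_type, VecBase<float, 3>>);
static_assert(std::is_same_v<MatBase<float, 3, 3>::loc_type, VecBase<float, 2>>);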
@ -258,11 +257,11 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
return *reinterpret_cast<vec3_type *>(&(*this)[2]);
}
vec3_type &location()
loc_type &location()
{
BLI_STATIC_ASSERT(NumCol >= 4, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 3, "Wrong Matrix dimension");
return *reinterpret_cast<vec3_type *>(&(*this)[3]);
BLI_STATIC_ASSERT(NumCol >= 3, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 2, "Wrong Matrix dimension");
return *reinterpret_cast<loc_type *>(&(*this)[NumCol - 1]);
}
const vec3_type &x_axis() const
@ -286,11 +285,11 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
return *reinterpret_cast<const vec3_type *>(&(*this)[2]);
}
const vec3_type &location() const
const loc_type &location() const
{
BLI_STATIC_ASSERT(NumCol >= 4, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 3, "Wrong Matrix dimension");
return *reinterpret_cast<const vec3_type *>(&(*this)[3]);
BLI_STATIC_ASSERT(NumCol >= 3, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 2, "Wrong Matrix dimension");
return *reinterpret_cast<const loc_type *>(&(*this)[NumCol - 1]);
}
/** Matrix operators. */
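With the relaxed asserts, the accessor also works for 2D transforms stored in a 3x3 matrix; a brief sketch (aliases assumed):
float3x3 m = float3x3::identity();
m.location() = float2(4.0f, 2.0f); /* writes the first two rows of the last column */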
@ -481,7 +480,7 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
{
uint64_t h = 435109;
unroll<NumCol * NumRow>([&](auto i) {
T value = (static_cast<const T *>(this))[i];
T value = (reinterpret_cast<const T *>(this))[i];
h = h * 33 + *reinterpret_cast<const as_uint_type<T> *>(&value);
});
return h;

View File

@ -24,6 +24,35 @@ namespace detail {
template<typename T> struct AxisAngle;
template<typename T> struct Quaternion;
template<typename T> struct AngleRadian {
T value;
AngleRadian() = default;
AngleRadian(const T &radian) : value(radian){};
/** Static functions. */
static AngleRadian identity()
{
return 0;
}
/** Conversions. */
explicit operator T() const
{
return value;
}
/** Operators. */
friend std::ostream &operator<<(std::ostream &stream, const AngleRadian &rot)
{
return stream << "AngleRadian(" << rot.value << ")";
}
};
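A minimal usage sketch for the new type, assuming the blender::AngleRadian float alias declared further down:
AngleRadian angle(0.25f);        /* implicit construction from raw radians */
float raw = float(angle);        /* explicit conversion back: 0.25f */
std::cout << angle << std::endl; /* prints "AngleRadian(0.25)" */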
template<typename T> struct EulerXYZ {
T x, y, z;
@ -234,6 +263,7 @@ template<typename U> struct AssertUnitEpsilon<detail::Quaternion<U>> {
};
/* Most common used types. */
using AngleRadian = math::detail::AngleRadian<float>;
using EulerXYZ = math::detail::EulerXYZ<float>;
using Quaternion = math::detail::Quaternion<float>;
using AxisAngle = math::detail::AxisAngle<float>;

View File

@ -527,7 +527,7 @@ template<typename T> class VArrayCommon {
* Other virtual array implementations are typically stored as #std::shared_ptr. That works even
* when the implementation itself is not copyable and makes copying #VArrayCommon cheaper.
*/
using Storage = Any<detail::VArrayAnyExtraInfo<T>, 24, 8>;
using Storage = Any<blender::detail::VArrayAnyExtraInfo<T>, 24, 8>;
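For context, a sketch of the kind of name collision the explicit blender:: qualification avoids (the exact conflicting namespace is an assumption):
namespace blender::detail { /* VArrayAnyExtraInfo lives here */ }
namespace blender::math::detail { /* rotation/matrix helpers added in this patch */ }
/* From code that can see both, a bare `detail::` may bind to the wrong
 * namespace, so the storage type spells out blender::detail explicitly. */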
/**
* Pointer to the currently contained virtual array implementation. This is allowed to be null.

View File

@ -218,8 +218,6 @@ set(SRC
BLI_fileops.hh
BLI_fileops_types.h
BLI_filereader.h
BLI_float3x3.hh
BLI_float4x4.hh
BLI_fnmatch.h
BLI_function_ref.hh
BLI_generic_array.hh
@ -469,7 +467,6 @@ if(WITH_GTESTS)
tests/BLI_edgehash_test.cc
tests/BLI_expr_pylike_eval_test.cc
tests/BLI_fileops_test.cc
tests/BLI_float3x3_test.cc
tests/BLI_function_ref_test.cc
tests/BLI_generic_array_test.cc
tests/BLI_generic_span_test.cc

Some files were not shown because too many files have changed in this diff.