Merge branch 'refactor-mesh-corners-generic' into refactor-mesh-face-generic

Hans Goudey 2023-02-06 17:21:34 -05:00
commit 709ec5ea3a
219 changed files with 2877 additions and 2604 deletions

View File

@@ -1722,13 +1722,20 @@ class CyclesPreferences(bpy.types.AddonPreferences):
row.prop(self, "peer_memory")
if compute_device_type == 'METAL':
import platform
# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
if platform.machine() == 'arm64':
import platform, re
isNavi2 = False
for device in devices:
obj = re.search("((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name)
if obj:
isNavi2 = True
# MetalRT only works on Apple Silicon and Navi2
if platform.machine() == 'arm64' or isNavi2:
col = layout.column()
col.use_property_split = True
col.prop(self, "kernel_optimization_level")
# Kernel specialization is only supported on Apple Silicon
if platform.machine() == 'arm64':
col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt")
def draw(self, context):

View File

@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
first_error = true;
cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
int value;
cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
can_map_host = value != 0;
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
# endif
}
void CUDADevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(cuda_mem_map_mutex);
foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
CUDAContextScope scope(this);
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
cuMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
cuMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = cuMemAlloc(&device_pointer, size);
if (mem_alloc_result == CUDA_SUCCESS) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
(mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
}
if (mem_alloc_result == CUDA_SUCCESS) {
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != CUDA_SUCCESS) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(cuda_mem_map_mutex);
CUDAMem *cmem = &cuda_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* CUDA memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void CUDADevice::generic_copy_to(device_memory &mem)
bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
CUDAContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(
cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::generic_free(device_memory &mem)
void CUDADevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
CUDAContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free mapped host memory. */
cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
cuMemFreeHost(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
cuda_assert(cuMemFree(mem.device_pointer));
}
bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
CUDAContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
CUresult mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
return mem_alloc_result == CUDA_SUCCESS;
}
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
void CUDADevice::free_host(void *shared_pointer)
{
CUDAContextScope scope(this);
cuMemFreeHost(shared_pointer);
}
bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
CUDAContextScope scope(this);
cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
return true;
}
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const CUDAContextScope scope(this);
cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}
void CUDADevice::mem_alloc(device_memory &mem)
@@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}
@@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}
CUDAMem *cmem = NULL;
Mem *cmem = NULL;
CUarray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (CUarray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(cmem.array);
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();

View File

@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class CUDADevice : public Device {
class CUDADevice : public GPUDevice {
friend class CUDAContextScope;
@@ -29,36 +29,11 @@ class CUDADevice : public Device {
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
struct CUDAMem {
CUDAMem() : texobject(0), array(0), use_mapped_host(false)
{
}
CUtexObject texobject;
CUarray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
thread_mutex cuda_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
CUDADeviceKernels kernels;
static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;

View File

@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
return nullptr;
}
GPUDevice::~GPUDevice() noexcept(false)
{
}
bool GPUDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
return true;
}
else {
return false;
}
}
void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
size_t preferred_working_headroom)
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower than the texture one so that
* some space is left after all texture memory allocations. */
device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
32 * 1024 * 1024LL; // 32MB
device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
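As a quick sanity check of the limit rule above ("leave at least half or 4 GB free, whichever is smaller"), the following standalone sketch reproduces the computation with worked values; host_map_limit is a hypothetical helper written for illustration, not part of this commit:
#include <cassert>
#include <cstddef>
/* Mapped-host limit rule from GPUDevice::init_host_memory(): leave free the
 * smaller of half the system RAM or 4 GB. */
static size_t host_map_limit(size_t system_ram)
{
  const size_t default_limit = 4ULL * 1024 * 1024 * 1024;
  if (system_ram == 0) {
    return 0; /* Unknown RAM: mapped host memory disabled. */
  }
  return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
}
int main()
{
  const size_t GB = 1024ULL * 1024 * 1024;
  assert(host_map_limit(32 * GB) == 28 * GB); /* Half (16 GB) > 4 GB, so 4 GB stays free. */
  assert(host_map_limit(4 * GB) == 2 * GB);   /* Half (2 GB) <= 4 GB, so half stays free. */
  return 0;
}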
void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(device_mem_map_mutex);
foreach (MemMap::value_type &pair, device_mem_map) {
device_memory &mem = *pair.first;
Mem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple backend devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
void *device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
bool mem_alloc_result = false;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
get_device_memory_info(total, free);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
get_device_memory_info(total, free);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = alloc_device(device_pointer, size);
if (mem_alloc_result) {
device_mem_in_use += size;
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = true;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = alloc_host(shared_pointer, size);
assert((mem_alloc_result && shared_pointer != 0) ||
(!mem_alloc_result && shared_pointer == 0));
}
if (mem_alloc_result) {
assert(transform_host_pointer(device_pointer, shared_pointer));
map_host_used += size;
status = " in host memory";
}
}
if (!mem_alloc_result) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(device_mem_map_mutex);
Mem *cmem = &device_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free mapped host memory. */
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
free_host(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
}
}
void GPUDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
* copy data from mem.host_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
}
}
/* DeviceInfo */
CCL_NAMESPACE_END

View File

@@ -309,6 +309,93 @@ class Device {
static uint devices_initialized_mask;
};
/* A GPU device with common functionality shared between GPU backends. */
class GPUDevice : public Device {
protected:
GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
: Device(info_, stats_, profiler_),
texture_info(this, "texture_info", MEM_GLOBAL),
need_texture_info(false),
can_map_host(false),
map_host_used(0),
map_host_limit(0),
device_texture_headroom(0),
device_working_headroom(0),
device_mem_map(),
device_mem_map_mutex(),
move_texture_to_host(false),
device_mem_in_use(0)
{
}
public:
virtual ~GPUDevice() noexcept(false);
/* For GPUs that can use bindless textures in some way or another. */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */
virtual bool load_texture_info();
protected:
/* Memory allocation, only accessed through device_memory. */
friend class device_memory;
bool can_map_host;
size_t map_host_used;
size_t map_host_limit;
size_t device_texture_headroom;
size_t device_working_headroom;
typedef unsigned long long texMemObject;
typedef unsigned long long arrayMemObject;
struct Mem {
Mem() : texobject(0), array(0), use_mapped_host(false)
{
}
texMemObject texobject;
arrayMemObject array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, Mem> MemMap;
MemMap device_mem_map;
thread_mutex device_mem_map_mutex;
bool move_texture_to_host;
/* Simple counter that tracks the amount of device memory in use. */
size_t device_mem_in_use;
virtual void init_host_memory(size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(size_t size, bool for_texture);
/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */
virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
virtual void generic_free(device_memory &mem);
virtual void generic_copy_to(device_memory &mem);
/* total: amount of device memory; free: amount of available device memory. */
virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
virtual void free_device(void *device_pointer) = 0;
virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
virtual void free_host(void *shared_pointer) = 0;
/* This function should return the device pointer corresponding to the shared pointer,
* which is a host buffer allocated in `alloc_host`. It should return `true` if such
* an address transformation is possible and `false` otherwise. */
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_H__ */
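To make the shape of the new abstraction concrete, here is a hypothetical minimal backend (not part of this commit) showing which hooks a subclass supplies, while generic_alloc()/generic_free()/generic_copy_to() in the base class drive the headroom checks, the mapped-host fallback, and the shared device_mem_map; the remaining Device virtuals (mem_alloc() and friends) are omitted for brevity:
#include <cstdlib>
#include <cstring>
/* Sketch of a unified-memory backend; real subclasses (CUDADevice, HIPDevice)
 * wrap their driver APIs instead of malloc()/free(). */
class ToyDevice : public GPUDevice {
 public:
  ToyDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
      : GPUDevice(info, stats, profiler)
  {
    can_map_host = true;
    init_host_memory(); /* Default 128 MB texture / 32 MB working headroom. */
  }
  void get_device_memory_info(size_t &total, size_t &free) override
  {
    total = free = 1024 * 1024 * 1024; /* Pretend 1 GB is always available. */
  }
  bool alloc_device(void *&device_pointer, size_t size) override
  {
    return (device_pointer = malloc(size)) != nullptr;
  }
  void free_device(void *device_pointer) override
  {
    free(device_pointer);
  }
  bool alloc_host(void *&shared_pointer, size_t size) override
  {
    return (shared_pointer = malloc(size)) != nullptr;
  }
  void free_host(void *shared_pointer) override
  {
    free(shared_pointer);
  }
  bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    device_pointer = shared_pointer; /* Unified memory: same address on both sides. */
    return true;
  }
  void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    memcpy(device_pointer, host_pointer, size);
  }
};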

View File

@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
static_assert(sizeof(arrayMemObject) == sizeof(hArray));
first_error = true;
hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* hipDeviceMapHost for mapping host memory when out of device memory.
* hipDeviceLmemResizeToMax for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
int value;
hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
can_map_host = value != 0;
hip_assert(
hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
# endif
}
void HIPDevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void HIPDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(hip_mem_map_mutex);
foreach (HIPMemMap::value_type &pair, hip_mem_map) {
device_memory &mem = *pair.first;
HIPMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
{
HIPContextScope scope(this);
hipDeviceptr_t device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
hipError_t mem_alloc_result = hipErrorOutOfMemory;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
hipMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
hipMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = hipMalloc(&device_pointer, size);
if (mem_alloc_result == hipSuccess) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != hipSuccess && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = hipSuccess;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
}
if (mem_alloc_result == hipSuccess) {
hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != hipSuccess) {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(hip_mem_map_mutex);
HIPMem *cmem = &hip_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* HIP memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void HIPDevice::generic_copy_to(device_memory &mem)
bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
HIPContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(
hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
return mem_alloc_result == hipSuccess;
}
void HIPDevice::generic_free(device_memory &mem)
void HIPDevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
HIPContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free mapped host memory. */
hip_assert(hipFree((hipDeviceptr_t)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
hipHostFree(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
hip_assert(hipFree(mem.device_pointer));
}
bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
{
HIPContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hipError_t mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
hip_mem_map.erase(hip_mem_map.find(&mem));
}
return mem_alloc_result == hipSuccess;
}
void HIPDevice::free_host(void *shared_pointer)
{
HIPContextScope scope(this);
hipHostFree(shared_pointer);
}
bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
HIPContextScope scope(this);
hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
return true;
}
void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const HIPContextScope scope(this);
hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
}
void HIPDevice::mem_alloc(device_memory &mem)
@@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
}
@@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}
HIPMem *cmem = NULL;
Mem *cmem = NULL;
hArray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (hArray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(cmem.array);
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();

View File

@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class HIPDevice : public Device {
class HIPDevice : public GPUDevice {
friend class HIPContextScope;
@@ -26,36 +26,11 @@ class HIPDevice : public Device {
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;
struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}
hipTextureObject_t texobject;
hArray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
HIPDeviceKernels kernels;
static bool have_precompiled_kernels();
@@ -81,17 +56,13 @@ class HIPDevice : public Device {
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;

View File

@@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:

View File

@@ -247,6 +247,8 @@ class device_memory {
bool is_resident(Device *sub_device) const;
protected:
friend class Device;
friend class GPUDevice;
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;

View File

@@ -21,6 +21,7 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;
bool motion_blur = false;

View File

@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
// Allocate lookup buffer for the worst-case scenario.
uint64_t count = objects.size();
blas_lookup.resize(count);
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;

View File

@@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
devices.push_back(info);
device_index++;
}

View File

@@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
@@ -105,6 +110,8 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool use_local_atomic_sort() const;
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);

View File

@@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// Prepare the BLAS argument encoder.
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@@ -271,6 +289,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@@ -278,6 +301,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@@ -1231,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
// Allocate required buffers for the BLAS array.
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}
count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
memcpy([blas_lookup_buffer contents],
bvhMetalRT->blas_lookup.data(),
blas_lookup_buffer.allocatedSize);
if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

View File

@@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};
/* Pipeline State Object types */
enum MetalPipelineType {

View File

@@ -87,6 +87,9 @@ struct ShaderCache {
break;
}
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();
@@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

View File

@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;
virtual void init_execution() override;

View File

@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all acceleration structure resources as used. */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
default:
break;
}
if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}
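/* Illustrative sizing (hypothetical scene values, not from this patch): with
 * max_shaders = 512, the sort passes request round_up(512 * sizeof(int), 16) =
 * 2048 bytes of threadgroup memory, well under the 32 KB asserted above. */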
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);

View File

@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}
/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
}
# endif

View File

@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
vector<OptixStackSizes> osl_stack_size(osl_groups.size());
/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
0,
&pipelines[PIP_SHADE]));
const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
return !have_error();

View File

@ -112,6 +112,13 @@ class DeviceQueue {
return 65536;
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
* INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.

View File

@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
}
@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
compute_sorted_queued_paths(kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;
int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
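    /* Each pass above launches one GPU_PARALLEL_SORT_BLOCK_SIZE (1024-thread)
     * threadgroup per sort partition. */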
return;
}
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);
@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
&d_prefix_sum,
&d_queued_kernel);
queue_->enqueue(kernel, work_size, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
}
}

View File

@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();
@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;

View File

@ -661,7 +661,8 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
#endif
}
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_private const ShaderClosure *sc)
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
Spectrum albedo = sc->weight;
/* Some closures include additional components such as Fresnel terms that cause their albedo to

View File

@ -519,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
@ -613,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturatef(bsdf->alpha_x);

View File

@ -90,8 +90,10 @@ ccl_device float schlick_fresnel(float u)
}
/* Calculate the fresnel color, which is a blend between white and the F0 color */
ccl_device_forceinline Spectrum
interpolate_fresnel_color(float3 L, float3 H, float ior, Spectrum F0)
ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
float3 H,
float ior,
Spectrum F0)
{
/* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
* The reason why we use this remapping instead of directly doing the

View File

@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,

View File

@ -178,7 +178,7 @@ __device__
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
(threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)
# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \

View File

@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *buckets,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Zero the bucket sizes. */
if (local_id < max_shaders) {
atomic_store_local(&buckets[local_id], 0);
}
ccl_gpu_syncthreads();
/* Determine bucket sizes within the partitions. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
}
}
ccl_gpu_syncthreads();
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
if (local_id == 0) {
int offset = 0;
for (int i = 0; i < max_shaders; i++) {
partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
offset = offset + atomic_load_local(&buckets[i]);
}
/* Store the number of active states in this partition. */
partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
}
}
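/* Worked example (hypothetical numbers): with max_shaders = 4 and bucket sizes
 * {3, 0, 5, 2} in this partition, the stored offsets are {0, 3, 3, 8} and the
 * active-state count written at index max_shaders is 10. */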
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
const int num_states_limit,
ccl_global int *indices,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *local_offset,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Calculate each partition's global offset from the prefix sum of the active state counts per
* partition. */
if (local_id < max_shaders) {
int partition_offset = 0;
for (int i = 0; i < uint(grid_id); i++) {
int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
partition_offset += partition_key_count;
}
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
ccl_gpu_syncthreads();
/* Write the sorted active indices. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
}
}
}
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
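To make the two passes concrete, the following is a minimal single-threaded C++ sketch of the same partitioned counting sort. It is illustrative only: the helper names are hypothetical, and the real kernels additionally run one threadgroup per partition, filter states by queued_kernel, and clamp writes to num_states_limit.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>
/* Bucket pass: per-partition exclusive prefix sums of key counts, with the
 * per-partition active count stored in the final (max_shaders) slot. */
static void bucket_pass(const std::vector<uint32_t> &keys,
                        const int max_shaders,
                        const int partition_size,
                        std::vector<int> &offsets)
{
  const int num_partitions = int(offsets.size()) / (max_shaders + 1);
  for (int p = 0; p < num_partitions; p++) {
    std::vector<int> buckets(max_shaders, 0);
    const int begin = p * partition_size;
    const int end = std::min(int(keys.size()), begin + partition_size);
    for (int i = begin; i < end; i++) {
      buckets[keys[i] % max_shaders]++;
    }
    /* Exclusive prefix sum; the final slot holds the active count. */
    int offset = 0;
    for (int s = 0; s < max_shaders; s++) {
      offsets[p * (max_shaders + 1) + s] = offset;
      offset += buckets[s];
    }
    offsets[p * (max_shaders + 1) + max_shaders] = offset;
  }
}
/* Write pass: convert local offsets to global ones, then scatter state
 * indices so they are grouped by key within each partition. */
static std::vector<int> write_pass(const std::vector<uint32_t> &keys,
                                   const int max_shaders,
                                   const int partition_size,
                                   std::vector<int> offsets)
{
  const int num_partitions = int(offsets.size()) / (max_shaders + 1);
  std::vector<int> indices(keys.size(), -1);
  int partition_base = 0;
  for (int p = 0; p < num_partitions; p++) {
    int *local = offsets.data() + p * (max_shaders + 1);
    const int begin = p * partition_size;
    const int end = std::min(int(keys.size()), begin + partition_size);
    for (int i = begin; i < end; i++) {
      indices[partition_base + local[keys[i] % max_shaders]++] = i;
    }
    partition_base += local[max_shaders]; /* Active count slot is untouched. */
  }
  return indices;
}
int main()
{
  const std::vector<uint32_t> keys = {2, 0, 1, 2, 0, 3, 1, 0};
  const int max_shaders = 4;
  const int partition_size = 4;
  std::vector<int> offsets(2 * (max_shaders + 1), 0);
  bucket_pass(keys, max_shaders, partition_size, offsets);
  for (const int i : write_pass(keys, max_shaders, partition_size, offsets)) {
    std::cout << i << ' '; /* Prints: 1 2 0 3 4 7 6 5 */
  }
  std::cout << '\n';
  return 0;
}
For the sample keys, the printed order groups states by shader key within each 4-state partition, matching what the GPU write pass produces.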
template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,

View File

@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
kernel_assert(!"Invalid ift_local");
return false;
}
# endif
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
metalrt_intersector_type metalrt_intersect;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
if (local_isect) {
local_isect->num_hits = 0;
}
kernel_assert(!"Invalid ift_local_prim");
return false;
}
# endif
MetalRTIntersectionLocalPayload payload;
payload.self = ray->self;
@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
payload.result = false;
typename metalrt_intersector_type::result_type intersection;
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
# if defined(__METALRT_MOTION__)
metalrt_intersector_type metalrt_intersect;
typename metalrt_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
# else
metalrt_blas_intersector_type metalrt_intersect;
typename metalrt_blas_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
/* If we know we are going to get at most one hit, as with a random-walk SSS,
 * we can optimize by accepting the first hit. */
if (max_hits == 1) {
metalrt_intersect.accept_any_intersection(true);
}
int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
/* Transform the ray into the object's local space. */
Transform itfm = kernel_data_fetch(objects, local_object).itfm;
r.origin = transform_point(&itfm, r.origin);
r.direction = transform_direction(&itfm, r.direction);
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
r,
metal_ancillaries->blas_accel_structs[blas_index].blas,
metal_ancillaries->ift_local_prim,
payload);
# endif
if (lcg_state) {

View File

@ -105,10 +105,11 @@ struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -117,22 +118,24 @@ struct kernel_gpu_##name \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
const ushort metal_grid_id [[threadgroup_position_in_grid]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
# if defined(__METALRT_MOTION__)
# define METALRT_TAGS instancing, instance_motion, primitive_motion
# define METALRT_BLAS_TAGS , primitive_motion
# else
# define METALRT_TAGS instancing
# define METALRT_BLAS_TAGS
# endif /* __METALRT_MOTION__ */
typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
# if defined(__METALRT_MOTION__)
typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data, primitive_motion>
metalrt_blas_intersector_type;
# else
typedef acceleration_structure<> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
# endif
#endif /* __METALRT__ */
@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};
#ifdef __METALRT__
struct MetalRTBlasWrapper {
metalrt_blas_as_type blas;
};
#endif
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
@ -291,6 +312,9 @@ struct MetalAncillaries {
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_local;
metalrt_blas_ift_type ift_local_prim;
constant MetalRTBlasWrapper *blas_accel_structs;
constant int *blas_userID_to_index_lookUp;
#endif
};

View File

@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
#endif
}
[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri_prim(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint primitive_id [[primitive_id]],
float2 barycentrics [[barycentric_coord]],
float ray_tmax [[distance]])
{
  /* instance_id, a.k.a. the user_id, has been removed. If this function is hit, the SSS
   * traversal was optimized to start from a primitive acceleration structure instead of the
   * root of the global AS. This means we are always intersecting the correct object, so there
   * is no need to check the user_id. */
return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
}
[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
return result;
}
[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
{
/* Unused function. */
BoundingBoxIntersectionResult result;
result.distance = ray_tmax;
result.accept = false;
result.continue_search = false;
return result;
}
template<uint intersection_type>
bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,

View File

@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
oneapi_call(kg,
cgh,

View File

@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
/* Partition/key offsets used when writing sorted active indices. */
ccl_global int *sort_partition_key_offsets;
/* Divisor used to partition active indices by locality when sorting by material. */
uint sort_partition_divisor;
} IntegratorStateGPU;
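/* Illustrative arithmetic (hypothetical values, not from this patch): with
 * max_num_paths_ = 1000000 and num_sort_partitions_ = 16, the divisor is
 * divide_up(1000000, 16) = 62500, so path state index 200000 belongs to
 * partition 200000 / 62500 = 3. */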

View File

@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
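  /* With local atomic sorting the global sort_key_counter buckets are not
   * allocated, so skip the per-key increment below. */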
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
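  /* With local atomic sorting the global sort_key_counter buckets are not
   * allocated, so skip the per-key increment below. */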
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

View File

@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
/* Beckmann */
else {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device void osl_closure_microfacet_ggx_aniso_setup(
@ -652,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device void osl_closure_microfacet_beckmann_aniso_setup(

View File

@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__
/* TODO: solve internal compiler errors and enable light tree on HIP. */
#ifdef __KERNEL_HIP__
/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
# undef __LIGHT_TREE__
#endif
@ -1508,6 +1509,8 @@ typedef enum DeviceKernel : int {
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,

View File

@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
return new_value.float_value;
}
# define atomic_fetch_and_add_uint32(p, x) \
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_sub_uint32(p, x) \
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_inc_uint32(p) \
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_dec_uint32(p) \
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_or_uint32(p, x) \
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
{
return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
{
return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
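/* A hypothetical usage sketch: with the overloads above, the same helper name
 * now compiles against either address space.
 *
 *   threadgroup int buckets[64];
 *   atomic_fetch_and_add_uint32(&buckets[key], 1);   // threadgroup overload
 *
 *   device int *counters;  // e.g. a global sort counter buffer
 *   atomic_fetch_and_add_uint32(&counters[key], 1);  // device overload
 */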
ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
const float old_val,

View File

@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
{
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
use_local_atomic_sort = (atoi(str) != 0);
}
DebugFlags::OptiX::OptiX()

View File

@ -97,6 +97,9 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not. */
bool adaptive_compile = false;
/* Whether local atomic sorting is enabled or not. */
bool use_local_atomic_sort = true;
};
/* Get instance of debug flags registry. */

View File

@ -82,6 +82,8 @@
#include "CLG_log.h"
#ifdef USE_EVENT_BACKGROUND_THREAD
# include "GHOST_TimerTask.h"
# include <pthread.h>
#endif
@ -768,7 +770,12 @@ struct GWL_Seat {
int32_t rate = 0;
/** Time (milliseconds) after which to start repeating keys. */
int32_t delay = 0;
/** Timer for key repeats. */
/**
* Timer for key repeats.
*
* \note For as long as #USE_EVENT_BACKGROUND_THREAD is defined, any access to this
* (including null checks) must lock `timer_mutex` first.
*/
GHOST_ITimerTask *timer = nullptr;
} key_repeat;
@ -832,6 +839,42 @@ static bool gwl_seat_key_depressed_suppress_warning(const GWL_Seat *seat)
return suppress_warning;
}
/**
* \note Caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_add(GWL_Seat *seat,
GHOST_TimerProcPtr key_repeat_fn,
GHOST_TUserDataPtr payload,
const bool use_delay)
{
GHOST_SystemWayland *system = seat->system;
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerTask *timer = new GHOST_TimerTask(
system->getMilliSeconds() + time_start, time_step, key_repeat_fn, payload);
seat->key_repeat.timer = timer;
system->ghost_timer_manager()->addTimer(timer);
#else
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
#endif
}
/**
* \note The caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_remove(GWL_Seat *seat)
{
GHOST_SystemWayland *system = seat->system;
#ifdef USE_EVENT_BACKGROUND_THREAD
system->ghost_timer_manager()->removeTimer(
static_cast<GHOST_TimerTask *>(seat->key_repeat.timer));
#else
system->removeTimer(seat->key_repeat.timer);
#endif
seat->key_repeat.timer = nullptr;
}
/** \} */
/* -------------------------------------------------------------------- */
@ -906,6 +949,16 @@ struct GWL_Display {
/** Guard against multiple threads accessing `events_pending` at once. */
std::mutex events_pending_mutex;
/**
* A separate timer queue, needed so the WAYLAND thread can lock access.
* Using the system's #GHOST_System::getTimerManager is not thread safe because
* access to the timer outside of WAYLAND-specific logic will not lock.
*
* Needed because #GHOST_System::dispatchEvents fires timers
* outside of WAYLAND (without locking the `timer_mutex`).
*/
GHOST_TimerManager *ghost_timer_manager;
#endif /* USE_EVENT_BACKGROUND_THREAD */
};
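/* The locking pattern this implies, repeated at call sites below (sketch):
 *
 *   {
 * #ifdef USE_EVENT_BACKGROUND_THREAD
 *     std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
 * #endif
 *     if (seat->key_repeat.timer) {
 *       ...
 *     }
 *   }
 */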
@ -922,6 +975,9 @@ static void gwl_display_destroy(GWL_Display *display)
ghost_wl_display_lock_without_input(display->wl_display, display->system->server_mutex);
display->events_pthread_is_active = false;
}
delete display->ghost_timer_manager;
display->ghost_timer_manager = nullptr;
#endif
/* For typical WAYLAND use this will always be set.
@ -3718,9 +3774,14 @@ static void keyboard_handle_leave(void *data,
GWL_Seat *seat = static_cast<GWL_Seat *>(data);
seat->keyboard.wl_surface_window = nullptr;
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@ -3780,36 +3841,32 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
return sym;
}
/**
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_cancel(GWL_Seat *seat)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller must check for timer");
delete static_cast<GWL_KeyRepeatPlayload *>(seat->key_repeat.timer->getUserData());
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
}
/**
* Restart the key-repeat timer.
* \param use_delay: When false, use the interval
* (prevents pause when the setting changes while the key is held).
*
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_reset(GWL_Seat *seat, const bool use_delay)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller must check for timer");
GHOST_SystemWayland *system = seat->system;
GHOST_ITimerTask *timer = seat->key_repeat.timer;
GHOST_TimerProcPtr key_repeat_fn = timer->getTimerProc();
GHOST_TimerProcPtr key_repeat_fn = seat->key_repeat.timer->getTimerProc();
GHOST_TUserDataPtr payload = seat->key_repeat.timer->getUserData();
seat->system->removeTimer(seat->key_repeat.timer);
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
gwl_seat_key_repeat_timer_remove(seat);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, payload, use_delay);
}
static void keyboard_handle_key(void *data,
@ -3848,6 +3905,11 @@ static void keyboard_handle_key(void *data,
break;
}
#ifdef USE_EVENT_BACKGROUND_THREAD
/* Any access to `seat->key_repeat.timer` must lock. */
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
struct GWL_KeyRepeatPlayload *key_repeat_payload = nullptr;
/* Delete previous timer. */
@ -3886,23 +3948,14 @@ static void keyboard_handle_key(void *data,
break;
}
case RESET: {
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* The payload will be added again. */
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
break;
}
case CANCEL: {
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
delete key_repeat_payload;
key_repeat_payload = nullptr;
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
break;
}
}
@ -3956,8 +4009,8 @@ static void keyboard_handle_key(void *data,
utf8_buf));
}
};
seat->key_repeat.timer = seat->system->installTimer(
seat->key_repeat.delay, 1000 / seat->key_repeat.rate, key_repeat_fn, key_repeat_payload);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, key_repeat_payload, true);
}
}
@ -3982,8 +4035,13 @@ static void keyboard_handle_modifiers(void *data,
/* A modifier changed so reset the timer,
* see comment in #keyboard_handle_key regarding this behavior. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, true);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, true);
}
}
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@ -4002,9 +4060,14 @@ static void keyboard_repeat_handle_info(void *data,
seat->key_repeat.rate = rate;
seat->key_repeat.delay = delay;
/* It is unlikely, but possible, that this setting changes while repeating. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, false);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* It is unlikely, but possible, that this setting changes while repeating. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, false);
}
}
}
@ -4275,8 +4338,14 @@ static void gwl_seat_capability_keyboard_disable(GWL_Seat *seat)
if (!seat->wl_keyboard) {
return;
}
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}
wl_keyboard_destroy(seat->wl_keyboard);
seat->wl_keyboard = nullptr;
@ -5411,6 +5480,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)
#ifdef USE_EVENT_BACKGROUND_THREAD
gwl_display_event_thread_create(display_);
display_->ghost_timer_manager = new GHOST_TimerManager();
#endif
}
@ -5491,10 +5562,16 @@ bool GHOST_SystemWayland::processEvents(bool waitForEvent)
#endif /* USE_EVENT_BACKGROUND_THREAD */
{
const uint64_t now = getMilliSeconds();
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
{
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
if (ghost_timer_manager()->fireTimers(now)) {
any_processed = true;
}
}
#endif
if (getTimerManager()->fireTimers(getMilliSeconds())) {
if (getTimerManager()->fireTimers(now)) {
any_processed = true;
}
}
@ -6717,6 +6794,13 @@ struct wl_shm *GHOST_SystemWayland::wl_shm() const
return display_->wl_shm;
}
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerManager *GHOST_SystemWayland::ghost_timer_manager()
{
return display_->ghost_timer_manager;
}
#endif
/** \} */
/* -------------------------------------------------------------------- */

View File

@ -165,6 +165,16 @@ class GHOST_SystemWayland : public GHOST_System {
bool cursor_grab_use_software_display_get(const GHOST_TGrabCursorMode mode);
#ifdef USE_EVENT_BACKGROUND_THREAD
/**
* Return a separate WAYLAND local timer manager to #GHOST_System::getTimerManager
* Manipulation & access must lock with #GHOST_SystemWayland::timer_mutex.
*
* See #GWL_Display::ghost_timer_manager doc-string for details on why this is needed.
*/
GHOST_TimerManager *ghost_timer_manager();
#endif
/* WAYLAND direct-data access. */
struct wl_display *wl_display();
@ -233,7 +243,14 @@ class GHOST_SystemWayland : public GHOST_System {
* from running at the same time. */
std::mutex *server_mutex = nullptr;
/** Threads must lock this before manipulating timers. */
/**
* Threads must lock this before manipulating #GWL_Display::ghost_timer_manager.
*
* \note Using a lock separate from `server_mutex` is necessary because the
* server lock is already held when calling `ghost_wl_display_event_pump`.
* If manipulating the timer used the `server_mutex`, event pump can indirectly
* handle key up/down events which would lock `server_mutex` causing a dead-lock.
*/
std::mutex *timer_mutex = nullptr;
std::thread::id main_thread_id;

View File

@ -6,8 +6,7 @@
#pragma once
#include "BLI_float3x3.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_span.hh"
struct Depsgraph;
@ -38,7 +37,7 @@ struct GeometryDeformation {
return translation;
}
const float3x3 &deform_mat = this->deform_mats[position_i];
return deform_mat.inverted() * translation;
return math::transform_point(math::invert(deform_mat), translation);
}
};

View File

@ -13,10 +13,9 @@
#include "BLI_bounds_types.hh"
#include "BLI_cache_mutex.hh"
#include "BLI_float3x3.hh"
#include "BLI_float4x4.hh"
#include "BLI_generic_virtual_array.hh"
#include "BLI_index_mask.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_offset_indices.hh"
#include "BLI_shared_cache.hh"

View File

@ -2,7 +2,7 @@
#pragma once
#include "BLI_float4x4.hh"
#include "BLI_math_matrix_types.hh"
#include "BKE_geometry_set.hh"

View File

@ -19,7 +19,7 @@
#include <mutex>
#include "BLI_float4x4.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_vector.hh"
#include "BLI_vector_set.hh"

View File

@ -497,6 +497,7 @@ void BKE_lib_id_expand_local(struct Main *bmain, struct ID *id, int flags);
*
* Only for local IDs (linked ones already have a unique ID in their library).
*
* \param name: The new name of the given ID; if NULL, the current name of the given ID is used instead.
* \param do_linked_data: if true, also ensure a unique name in case the given \a id is linked
* (otherwise, just ensure that it is properly sorted).
*

View File

@ -142,6 +142,9 @@ enum {
/** Also process internal ID pointers like `ID.newid` or `ID.orig_id`.
* WARNING: Dangerous, use with caution. */
IDWALK_DO_INTERNAL_RUNTIME_POINTERS = (1 << 9),
/** Also process the ID.lib pointer. It is an option because this pointer can usually be fully
 * ignored. */
IDWALK_DO_LIBRARY_POINTER = (1 << 10),
};
typedef struct LibraryForeachIDData LibraryForeachIDData;

View File

@ -222,8 +222,12 @@ void BKE_id_remapper_clear(struct IDRemapper *id_remapper);
bool BKE_id_remapper_is_empty(const struct IDRemapper *id_remapper);
/** Free the given ID Remapper. */
void BKE_id_remapper_free(struct IDRemapper *id_remapper);
/** Add a new remapping. */
/** Add a new remapping. Does not replace an existing mapping for `old_id`, if any. */
void BKE_id_remapper_add(struct IDRemapper *id_remapper, struct ID *old_id, struct ID *new_id);
/** Add a new remapping, replacing a potential already existing mapping of `old_id`. */
void BKE_id_remapper_add_overwrite(struct IDRemapper *id_remapper,
struct ID *old_id,
struct ID *new_id);
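/* A hypothetical call sequence illustrating the difference:
 *
 *   BKE_id_remapper_add(remapper, old_id, id_a);
 *   BKE_id_remapper_add(remapper, old_id, id_b);            // No effect: old_id already mapped to id_a.
 *   BKE_id_remapper_add_overwrite(remapper, old_id, id_b);  // Replaces the mapping: old_id -> id_b.
 */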
/**
* Apply a remapping.

View File

@ -29,6 +29,15 @@ struct UniqueName_Map;
struct UniqueName_Map *BKE_main_namemap_create(void) ATTR_WARN_UNUSED_RESULT;
void BKE_main_namemap_destroy(struct UniqueName_Map **r_name_map) ATTR_NONNULL();
/**
* Destroy all name_maps in the given bmain:
* - In bmain itself for local IDs.
* - In the split bmains in the list, if any (for linked IDs in some cases, e.g. if called during
*   readfile code).
* - In all of the libraries' IDs (for linked IDs).
*/
void BKE_main_namemap_clear(struct Main *bmain) ATTR_NONNULL();
/**
* Ensures the given name is unique within the given ID type.
*

View File

@ -8,7 +8,7 @@
*/
#include "BLI_array.hh"
#include "BLI_float4x4.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_mesh_boolean.hh"
#include "BLI_span.hh"

View File

@ -36,7 +36,7 @@ void BKE_mesh_legacy_convert_uvs_to_generic(Mesh *mesh);
* Move face sets to the legacy type from a generic type.
*/
void BKE_mesh_legacy_face_set_from_generic(
Mesh *mesh, blender::MutableSpan<CustomDataLayer> poly_layers_to_write);
blender::MutableSpan<CustomDataLayer> poly_layers_to_write);
/**
* Copy face sets to the generic data type from the legacy type.
*/

View File

@ -25,6 +25,7 @@
struct BVHCache;
struct EditMeshData;
struct Mesh;
struct MLoopTri;
struct ShrinkwrapBoundaryData;
struct SubdivCCG;
@ -167,10 +168,11 @@ struct MeshRuntime {
SharedCache<LooseEdgeCache> loose_edges_cache;
/**
* A #BLI_bitmap containing tags for the center vertices of subdivided polygons, set by the
* subdivision surface modifier and used by drawing code instead of polygon center face dots.
* A bit vector the size of the number of vertices, set to true for the center vertices of
* subdivided polygons. The values are set by the subdivision surface modifier and used by
* drawing code instead of polygon center face dots. Otherwise this will be empty.
*/
uint32_t *subsurf_face_dot_tags = nullptr;
BitVector<> subsurf_face_dot_tags;
MeshRuntime() = default;
~MeshRuntime();

View File

@ -612,3 +612,25 @@ void BKE_modifier_blend_read_lib(struct BlendLibReader *reader, struct Object *o
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
namespace blender::bke {
/**
* A convenience class that can be used to set `ModifierData::execution_time` based on the lifetime
* of this class.
*/
class ScopedModifierTimer {
private:
ModifierData &md_;
double start_time_;
public:
ScopedModifierTimer(ModifierData &md);
~ScopedModifierTimer();
};
} // namespace blender::bke
#endif
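/* As used later in this commit (see mesh_calc_modifiers), callers simply
 * declare the timer at the top of a modifier's evaluation scope:
 *
 *   blender::bke::ScopedModifierTimer modifier_timer{*md};
 *
 * Per the doc-string above, construction records the start time and
 * destruction stores the elapsed time in ModifierData::execution_time. */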

View File

@ -133,19 +133,19 @@ void BKE_nlastrips_sort_strips(ListBase *strips);
/**
* Add the given NLA-Strip to the given list of strips, assuming that it
* isn't currently a member of another list, NULL, or conflicting with existing
* strips position.
* isn't currently a member of another list, NULL, or conflicting with existing
* strips position.
*/
void BKE_nlastrips_add_strip_unsafe(ListBase *strips, struct NlaStrip *strip);
/**
* @brief NULL checks incoming strip and verifies no overlap / invalid
* configuration against other strips in NLA Track.
* \brief NULL checks incoming strip and verifies no overlap / invalid
* configuration against other strips in NLA Track.
*
* @param strips
* @param strip
* @return true
* @return false
* \param strips: List of strips to add to.
* \param strip: Strip to add.
* \return True when the strip was added, false when it is NULL or conflicts with existing strips.
*/
bool BKE_nlastrips_add_strip(ListBase *strips, struct NlaStrip *strip);

View File

@ -161,7 +161,7 @@ bool BKE_volume_save(const struct Volume *volume,
* file or copy shared grids to make them writeable. */
#ifdef __cplusplus
# include "BLI_float4x4.hh"
# include "BLI_math_matrix_types.hh"
# include "BLI_math_vector_types.hh"
# include "BLI_string_ref.hh"

View File

@ -676,23 +676,17 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
GeometrySet **r_geometry_set)
{
using namespace blender::bke;
/* Input and final mesh. Final mesh is only created the moment the first
* constructive modifier is executed, or a deform modifier needs normals
* or certain data layers. */
/* Input mesh shouldn't be modified. */
Mesh *mesh_input = (Mesh *)ob->data;
/* The final mesh is the result of calculating all enabled modifiers. */
Mesh *mesh_final = nullptr;
/* The result of calculating all leading deform modifiers. */
Mesh *mesh_deform = nullptr;
/* This geometry set contains the non-mesh data that might be generated by modifiers. */
GeometrySet geometry_set_final;
BLI_assert((mesh_input->id.tag & LIB_TAG_COPIED_ON_WRITE_EVAL_RESULT) == 0);
/* TODO: Remove use of `deformed_verts` in mesh modifier stack
* since mesh positions are now stored in a contiguous array. */
float(*deformed_verts)[3] = nullptr;
int num_deformed_verts = mesh_input->totvert;
bool isPrevDeform = false;
/* Mesh with constructive modifiers but no deformation applied. Tracked
* along with final mesh if undeformed / orco coordinates are requested
* for texturing. */
@ -775,20 +769,16 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
}
if (mti->type == eModifierTypeType_OnlyDeform && !sculpt_dyntopo) {
if (!deformed_verts) {
deformed_verts = BKE_mesh_vert_coords_alloc(mesh_input, &num_deformed_verts);
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (!mesh_final) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
else if (isPrevDeform && mti->dependsOnNormals && mti->dependsOnNormals(md)) {
if (mesh_final == nullptr) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
}
BKE_modifier_deform_verts(md, &mectx, mesh_final, deformed_verts, num_deformed_verts);
isPrevDeform = true;
BKE_modifier_deform_verts(md,
&mectx,
mesh_final,
BKE_mesh_vert_positions_for_write(mesh_final),
mesh_final->totvert);
}
else {
break;
@ -800,10 +790,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
* coordinates (like vertex paint). */
if (r_deform) {
mesh_deform = BKE_mesh_copy_for_eval(mesh_input, true);
if (deformed_verts) {
BKE_mesh_vert_coords_apply(mesh_deform, deformed_verts);
}
}
}
@ -861,6 +847,8 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
/* Add orco mesh as layer if needed by this modifier. */
if (mesh_final && mesh_orco && mti->requiredDataMask) {
CustomData_MeshMasks mask = {0};
@ -870,36 +858,19 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
}
}
/* How to apply modifier depends on (a) what we already have as
* a result of previous modifiers (could be a Mesh or just
* deformed vertices) and (b) what type the modifier is. */
if (mti->type == eModifierTypeType_OnlyDeform) {
/* No existing verts to deform, need to build them. */
if (!deformed_verts) {
if (mesh_final) {
/* Deforming a mesh, read the vertex locations
* out of the mesh and deform them. Once done with this
* run of deformers verts will be written back. */
deformed_verts = BKE_mesh_vert_coords_alloc(mesh_final, &num_deformed_verts);
}
else {
deformed_verts = BKE_mesh_vert_coords_alloc(mesh_input, &num_deformed_verts);
}
if (!mesh_final) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
/* if this is not the last modifier in the stack then recalculate the normals
* to avoid giving bogus normals to the next modifier see: T23673. */
else if (isPrevDeform && mti->dependsOnNormals && mti->dependsOnNormals(md)) {
if (mesh_final == nullptr) {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
ASSERT_IS_VALID_MESH(mesh_final);
}
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
}
BKE_modifier_deform_verts(md, &mectx, mesh_final, deformed_verts, num_deformed_verts);
BKE_modifier_deform_verts(md,
&mectx,
mesh_final,
BKE_mesh_vert_positions_for_write(mesh_final),
mesh_final->totvert);
}
else {
bool check_for_needs_mapping = false;
/* apply vertex coordinates or build a Mesh as necessary */
if (mesh_final != nullptr) {
if (have_non_onlydeform_modifiers_appled == false) {
/* If we only deformed, we won't have initialized #CD_ORIGINDEX.
@ -913,10 +884,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
check_for_needs_mapping = true;
}
if (deformed_verts) {
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
}
have_non_onlydeform_modifiers_appled = true;
/* determine which data layers are needed by following modifiers */
@ -997,11 +964,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
BKE_id_free(nullptr, mesh_final);
}
mesh_final = mesh_next;
if (deformed_verts) {
MEM_freeN(deformed_verts);
deformed_verts = nullptr;
}
}
/* create an orco mesh in parallel */
@ -1072,8 +1034,6 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
mesh_final->runtime->deformed_only = false;
}
isPrevDeform = (mti->type == eModifierTypeType_OnlyDeform);
if (sculpt_mode && md->type == eModifierType_Multires) {
multires_applied = true;
}
@ -1085,22 +1045,14 @@ static void mesh_calc_modifiers(struct Depsgraph *depsgraph,
BKE_modifier_free_temporary_data(md);
}
/* Yay, we are done. If we have a Mesh and deformed vertices,
* we need to apply these back onto the Mesh. If we have no
* Mesh then we need to build one. */
if (mesh_final == nullptr) {
if (deformed_verts == nullptr && allow_shared_mesh) {
if (allow_shared_mesh) {
mesh_final = mesh_input;
}
else {
mesh_final = BKE_mesh_copy_for_eval(mesh_input, true);
}
}
if (deformed_verts) {
BKE_mesh_vert_coords_apply(mesh_final, deformed_verts);
MEM_freeN(deformed_verts);
deformed_verts = nullptr;
}
/* Denotes whether the object the modifier stack came from owns the mesh, or whether the
 * mesh is shared across multiple objects since there are no effective modifiers. */
@ -1338,6 +1290,8 @@ static void editbmesh_calc_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
/* Add an orco mesh as layer if needed by this modifier. */
if (mesh_final && mesh_orco && mti->requiredDataMask) {
CustomData_MeshMasks mask = {0};

View File

@ -214,6 +214,8 @@ static void setup_app_data(bContext *C,
SWAP(ListBase, bmain->wm, bfd->main->wm);
SWAP(ListBase, bmain->workspaces, bfd->main->workspaces);
SWAP(ListBase, bmain->screens, bfd->main->screens);
/* NOTE: UI IDs are assumed to be only local data-blocks, so no need to call
* #BKE_main_namemap_clear here (otherwise, the swapping would fail in many funny ways). */
if (bmain->name_map != nullptr) {
BKE_main_namemap_destroy(&bmain->name_map);
}

View File

@ -138,7 +138,7 @@ static void cdDM_recalc_looptri(DerivedMesh *dm)
BLI_assert(cddm->dm.looptris.array == NULL);
atomic_cas_ptr(
(void **)&cddm->dm.looptris.array, cddm->dm.looptris.array, cddm->dm.looptris.array_wip);
cddm->dm.looptris.array_wip = NULL;
cddm->dm.looptris.array_wip = nullptr;
}
static void cdDM_free_internal(CDDerivedMesh *cddm)
@ -241,7 +241,7 @@ static DerivedMesh *cdDM_from_mesh_ex(Mesh *mesh,
#if 0
cddm->mface = CustomData_get_layer(&dm->faceData, CD_MFACE);
#else
cddm->mface = NULL;
cddm->mface = nullptr;
#endif
/* commented since even when CD_ORIGINDEX was first added this line fails

View File

@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_array.hh"
#include "BLI_math_matrix.hh"
#include "BLI_set.hh"
#include "BLI_task.hh"
@ -175,25 +176,26 @@ static void fill_mesh_positions(const int main_point_num,
{
if (profile_point_num == 1) {
for (const int i_ring : IndexRange(main_point_num)) {
float4x4 point_matrix = float4x4::from_normalized_axis_data(
float4x4 point_matrix = math::from_orthonormal_axes<float4x4>(
main_positions[i_ring], normals[i_ring], tangents[i_ring]);
if (!radii.is_empty()) {
point_matrix.apply_scale(radii[i_ring]);
point_matrix = math::scale(point_matrix, float3(radii[i_ring]));
}
mesh_positions[i_ring] = point_matrix * profile_positions.first();
mesh_positions[i_ring] = math::transform_point(point_matrix, profile_positions.first());
}
}
else {
for (const int i_ring : IndexRange(main_point_num)) {
float4x4 point_matrix = float4x4::from_normalized_axis_data(
float4x4 point_matrix = math::from_orthonormal_axes<float4x4>(
main_positions[i_ring], normals[i_ring], tangents[i_ring]);
if (!radii.is_empty()) {
point_matrix.apply_scale(radii[i_ring]);
point_matrix = math::scale(point_matrix, float3(radii[i_ring]));
}
const int ring_vert_start = i_ring * profile_point_num;
for (const int i_profile : IndexRange(profile_point_num)) {
mesh_positions[ring_vert_start + i_profile] = point_matrix * profile_positions[i_profile];
mesh_positions[ring_vert_start + i_profile] = math::transform_point(
point_matrix, profile_positions[i_profile]);
}
}
}
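For reference, here is the ring transform being built above, spelled out in standalone C++. The column layout is taken from the (deleted) `float4x4::from_normalized_axis_data` later in this diff -- columns `(forward, -cross(forward, up), up, location)`, with `forward = normal` and `up = tangent` -- and it is an assumption that `math::from_orthonormal_axes` keeps that layout and that `math::scale` scales only the basis columns like the old `apply_scale` did:

struct V3 {
  float x, y, z;
};

static V3 cross(const V3 a, const V3 b)
{
  return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}

/* point = location + radius * (p.x * normal - p.y * cross(normal, tangent) + p.z * tangent). */
static V3 ring_point(const V3 loc, const V3 normal, const V3 tangent, const V3 p, const float radius)
{
  const V3 c = cross(normal, tangent);
  return {loc.x + radius * (normal.x * p.x - c.x * p.y + tangent.x * p.z),
          loc.y + radius * (normal.y * p.x - c.y * p.y + tangent.y * p.z),
          loc.z + radius * (normal.z * p.x - c.z * p.y + tangent.z * p.z)};
}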

View File

@ -17,7 +17,7 @@
#include "BLI_index_range.hh"
#include "BLI_listbase.h"
#include "BLI_math_base.h"
#include "BLI_math_vector.hh"
#include "BLI_math_matrix.hh"
#include "BLI_rand.hh"
#include "BLI_span.hh"
#include "BLI_string.h"
@ -314,6 +314,8 @@ static void curves_evaluate_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (mti->modifyGeometrySet != nullptr) {
mti->modifyGeometrySet(md, &mectx, &geometry_set);
}
@ -417,15 +419,15 @@ void curves_copy_parameters(const Curves &src, Curves &dst)
CurvesSurfaceTransforms::CurvesSurfaceTransforms(const Object &curves_ob, const Object *surface_ob)
{
this->curves_to_world = curves_ob.object_to_world;
this->world_to_curves = this->curves_to_world.inverted();
this->curves_to_world = float4x4_view(curves_ob.object_to_world);
this->world_to_curves = math::invert(this->curves_to_world);
if (surface_ob != nullptr) {
this->surface_to_world = surface_ob->object_to_world;
this->world_to_surface = this->surface_to_world.inverted();
this->surface_to_world = float4x4_view(surface_ob->object_to_world);
this->world_to_surface = math::invert(this->surface_to_world);
this->surface_to_curves = this->world_to_curves * this->surface_to_world;
this->curves_to_surface = this->world_to_surface * this->curves_to_world;
this->surface_to_curves_normal = this->surface_to_curves.inverted().transposed();
this->surface_to_curves_normal = math::transpose(math::invert(this->surface_to_curves));
}
}
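A side note on the last assignment: `transpose(invert(M))` is the standard normal matrix, not anything specific to this refactor. If points and tangents transform as p' = M p and t' = M t, a normal must keep satisfying n'^T t' = 0 whenever n^T t = 0, and n' = (M^-1)^T n does exactly that, since n'^T t' = n^T M^-1 M t = n^T t.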

View File

@ -13,6 +13,7 @@
#include "BLI_bounds.hh"
#include "BLI_index_mask_ops.hh"
#include "BLI_length_parameterize.hh"
#include "BLI_math_matrix.hh"
#include "BLI_math_rotation_legacy.hh"
#include "BLI_task.hh"
@ -997,7 +998,7 @@ static void transform_positions(MutableSpan<float3> positions, const float4x4 &m
{
threading::parallel_for(positions.index_range(), 1024, [&](const IndexRange range) {
for (float3 &position : positions.slice(range)) {
position = matrix * position;
position = math::transform_point(matrix, position);
}
});
}

View File

@ -613,6 +613,8 @@ void BKE_curve_calc_modifiers_pre(Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (!deformedVerts) {
deformedVerts = BKE_curve_nurbs_vert_coords_alloc(source_nurb, &numVerts);
}
@ -733,6 +735,8 @@ static GeometrySet curve_calc_modifiers_post(Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (!geometry_set.has_mesh()) {
geometry_set.replace_mesh(BKE_mesh_new_nomain(0, 0, 0, 0, 0));
}

View File

@ -2,9 +2,9 @@
#include <mutex>
#include "BLI_float4x4.hh"
#include "BLI_index_mask.hh"
#include "BLI_map.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_rand.hh"
#include "BLI_set.hh"
#include "BLI_span.hh"
@ -117,12 +117,12 @@ namespace blender::bke {
static float3 get_transform_position(const float4x4 &transform)
{
return transform.translation();
return transform.location();
}
static void set_transform_position(float4x4 &transform, const float3 position)
{
copy_v3_v3(transform.values[3], position);
transform.location() = position;
}
class InstancePositionAttributeProvider final : public BuiltinAttributeProvider {

View File

@ -78,7 +78,7 @@ static void geometry_set_collect_recursive_collection_instance(
const Collection &collection, const float4x4 &transform, Vector<GeometryInstanceGroup> &r_sets)
{
float4x4 offset_matrix = float4x4::identity();
sub_v3_v3(offset_matrix.values[3], collection.instance_offset);
offset_matrix.location() -= float3(collection.instance_offset);
const float4x4 instance_transform = transform * offset_matrix;
geometry_set_collect_recursive_collection(collection, instance_transform, r_sets);
}
@ -98,7 +98,7 @@ static void geometry_set_collect_recursive_collection(const Collection &collecti
LISTBASE_FOREACH (const CollectionObject *, collection_object, &collection.gobject) {
BLI_assert(collection_object->ob != nullptr);
const Object &object = *collection_object->ob;
const float4x4 object_transform = transform * object.object_to_world;
const float4x4 object_transform = transform * float4x4_view(object.object_to_world);
geometry_set_collect_recursive_object(object, object_transform, r_sets);
}
LISTBASE_FOREACH (const CollectionChild *, collection_child, &collection.children) {
@ -220,9 +220,9 @@ void Instances::ensure_geometry_instances()
Collection &collection = reference.collection();
FOREACH_COLLECTION_OBJECT_RECURSIVE_BEGIN (&collection, object) {
const int handle = instances->add_reference(*object);
instances->add_instance(handle, object->object_to_world);
instances->add_instance(handle, float4x4(object->object_to_world));
float4x4 &transform = instances->transforms().last();
sub_v3_v3(transform.values[3], collection.instance_offset);
transform.location() -= collection.instance_offset;
}
FOREACH_COLLECTION_OBJECT_RECURSIVE_END;
instances->ensure_geometry_instances();

View File

@ -1273,6 +1273,10 @@ bGPDframe *BKE_gpencil_layer_frame_get(bGPDlayer *gpl, int cframe, eGP_GetFrame_
gpl->actframe = gpf;
}
else if (addnew == GP_GETFRAME_ADD_COPY) {
/* The frame_addcopy function copies the active frame of gpl,
 * so we need to set the active frame before copying. */
gpl->actframe = gpf;
gpl->actframe = BKE_gpencil_frame_addcopy(gpl, cframe);
}
else {
@ -1300,6 +1304,10 @@ bGPDframe *BKE_gpencil_layer_frame_get(bGPDlayer *gpl, int cframe, eGP_GetFrame_
gpl->actframe = gpf;
}
else if (addnew == GP_GETFRAME_ADD_COPY) {
/* The frame_addcopy function copies the active frame of gpl,
 * so we need to set the active frame before copying. */
gpl->actframe = gpf;
gpl->actframe = BKE_gpencil_frame_addcopy(gpl, cframe);
}
else {

View File

@ -40,6 +40,15 @@ struct IDRemapper {
source_types |= BKE_idtype_idcode_to_idfilter(GS(old_id->name));
}
void add_overwrite(ID *old_id, ID *new_id)
{
BLI_assert(old_id != nullptr);
BLI_assert(new_id == nullptr || (GS(old_id->name) == GS(new_id->name)));
mappings.add_overwrite(old_id, new_id);
BLI_assert(BKE_idtype_idcode_to_idfilter(GS(old_id->name)) != 0);
source_types |= BKE_idtype_idcode_to_idfilter(GS(old_id->name));
}
bool contains_mappings_for_any(IDTypeFilter filter) const
{
return (source_types & filter) != 0;
@ -159,6 +168,12 @@ void BKE_id_remapper_add(IDRemapper *id_remapper, ID *old_id, ID *new_id)
remapper->add(old_id, new_id);
}
void BKE_id_remapper_add_overwrite(IDRemapper *id_remapper, ID *old_id, ID *new_id)
{
blender::bke::id::remapper::IDRemapper *remapper = unwrap(id_remapper);
remapper->add_overwrite(old_id, new_id);
}
bool BKE_id_remapper_has_mapping_for(const struct IDRemapper *id_remapper, uint64_t type_filter)
{
const blender::bke::id::remapper::IDRemapper *remapper = unwrap(id_remapper);
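The `add` / `add_overwrite` split added here mirrors the usual map-insertion pair; a minimal standalone analogy (`std::unordered_map` standing in for the remapper's internal mapping container, which this sketch does not claim to match):

#include <cassert>
#include <unordered_map>

int main()
{
  std::unordered_map<int, int> remap;
  remap.emplace(1, 10);          /* Like add(): expects key 1 to be new. */
  remap.insert_or_assign(1, 20); /* Like add_overwrite(): replaces 10 with 20. */
  assert(remap.at(1) == 20);
  return 0;
}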

View File

@ -294,8 +294,9 @@ static bool library_foreach_ID_link(Main *bmain,
continue;
}
/* NOTE: ID.lib pointer is purposefully fully ignored here...
* We may want to add it at some point? */
if (flag & IDWALK_DO_LIBRARY_POINTER) {
CALLBACK_INVOKE(id->lib, IDWALK_CB_NEVER_SELF);
}
if (flag & IDWALK_DO_INTERNAL_RUNTIME_POINTERS) {
CALLBACK_INVOKE_ID(id->newid, IDWALK_CB_INTERNAL);

View File

@ -532,13 +532,16 @@ typedef struct LibblockRemapMultipleUserData {
static void libblock_remap_foreach_idpair_cb(ID *old_id, ID *new_id, void *user_data)
{
if (old_id == new_id) {
return;
}
LibBlockRemapMultipleUserData *data = user_data;
Main *bmain = data->bmain;
const short remap_flags = data->remap_flags;
BLI_assert(old_id != NULL);
BLI_assert((new_id == NULL) || GS(old_id->name) == GS(new_id->name));
BLI_assert(old_id != new_id);
if (free_notifier_reference_cb) {
free_notifier_reference_cb(old_id);

View File

@ -186,6 +186,7 @@ void BKE_main_free(Main *mainvar)
BKE_main_idmap_destroy(mainvar->id_map);
}
/* NOTE: `name_map` in libraries are freed together with the library IDs above. */
if (mainvar->name_map) {
BKE_main_namemap_destroy(&mainvar->name_map);
}

View File

@ -195,6 +195,22 @@ void BKE_main_namemap_destroy(struct UniqueName_Map **r_name_map)
*r_name_map = nullptr;
}
void BKE_main_namemap_clear(Main *bmain)
{
for (Main *bmain_iter = bmain; bmain_iter != nullptr; bmain_iter = bmain_iter->next) {
if (bmain_iter->name_map != nullptr) {
BKE_main_namemap_destroy(&bmain_iter->name_map);
}
for (Library *lib_iter = static_cast<Library *>(bmain_iter->libraries.first);
lib_iter != nullptr;
lib_iter = static_cast<Library *>(lib_iter->id.next)) {
if (lib_iter->runtime.name_map != nullptr) {
BKE_main_namemap_destroy(&lib_iter->runtime.name_map);
}
}
}
}
static void main_namemap_populate(UniqueName_Map *name_map, struct Main *bmain, ID *ignore_id)
{
BLI_assert_msg(name_map != nullptr, "name_map should not be null");

View File

@ -107,10 +107,10 @@ static void mesh_copy_data(Main *bmain, ID *id_dst, const ID *id_src, const int
mesh_dst->runtime->wrapper_type_finalize = mesh_src->runtime->wrapper_type_finalize;
mesh_dst->runtime->subsurf_runtime_data = mesh_src->runtime->subsurf_runtime_data;
mesh_dst->runtime->cd_mask_extra = mesh_src->runtime->cd_mask_extra;
/* Copy face dot tags, since meshes may be duplicated after a subsurf modifier
* or node, but we still need to be able to draw face center vertices. */
mesh_dst->runtime->subsurf_face_dot_tags = static_cast<uint32_t *>(
MEM_dupallocN(mesh_src->runtime->subsurf_face_dot_tags));
/* Copy face dot tags, since meshes may be duplicated after a subsurf modifier or node, but we
* still need to be able to draw face center vertices. The tags may be cleared explicitly when
* the topology is changed. */
mesh_dst->runtime->subsurf_face_dot_tags = mesh_src->runtime->subsurf_face_dot_tags;
if ((mesh_src->id.tag & LIB_TAG_NO_MAIN) == 0) {
/* This is a direct copy of a main mesh, so for now it has the same topology. */
mesh_dst->runtime->deformed_only = true;
@ -275,7 +275,6 @@ static void mesh_blend_write(BlendWriter *writer, ID *id, const void *id_address
BKE_mesh_legacy_convert_material_indices_to_mpoly(mesh);
BKE_mesh_legacy_sharp_faces_to_flags(mesh);
BKE_mesh_legacy_bevel_weight_from_layers(mesh);
BKE_mesh_legacy_face_set_from_generic(mesh, poly_layers);
BKE_mesh_legacy_edge_crease_from_layers(mesh);
BKE_mesh_legacy_sharp_edges_to_flags(mesh);
BKE_mesh_legacy_attribute_strings_to_flags(mesh);
@ -295,6 +294,7 @@ static void mesh_blend_write(BlendWriter *writer, ID *id, const void *id_address
if (!BLO_write_is_undo(writer)) {
BKE_mesh_legacy_convert_uvs_to_struct(mesh, temp_arrays_for_legacy_format, loop_layers);
BKE_mesh_legacy_face_set_from_generic(poly_layers);
}
}

View File

@ -17,9 +17,8 @@
#include "BLI_alloca.h"
#include "BLI_array.hh"
#include "BLI_float4x4.hh"
#include "BLI_math.h"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_mesh_boolean.hh"
#include "BLI_mesh_intersect.hh"
#include "BLI_span.hh"
@ -45,7 +44,7 @@ static float4x4 clean_transform(const float4x4 &mat)
const float fuzz = 1e-6f;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
float f = mat.values[i][j];
float f = mat[i][j];
if (fabsf(f) <= fuzz) {
f = 0.0f;
}
@ -55,7 +54,7 @@ static float4x4 clean_transform(const float4x4 &mat)
else if (fabsf(f + 1.0f) <= fuzz) {
f = -1.0f;
}
cleaned.values[i][j] = f;
cleaned[i][j] = f;
}
}
return cleaned;
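The per-entry snapping rule, written out (the snap-to-1 case falls in the elided part of this hunk; the sketch assumes it mirrors the 0 and -1 cases shown):

#include <cmath>

/* Values within `fuzz` of 0, 1, or -1 are replaced exactly, so the
 * exact-arithmetic Boolean code sees near-axis-aligned transforms as
 * truly axis-aligned. */
static float clean_value(const float f, const float fuzz = 1e-6f)
{
  if (std::fabs(f) <= fuzz) {
    return 0.0f;
  }
  if (std::fabs(f - 1.0f) <= fuzz) {
    return 1.0f;
  }
  if (std::fabs(f + 1.0f) <= fuzz) {
    return -1.0f;
  }
  return f;
}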
@ -280,7 +279,7 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
* of the target, multiply each transform by the inverse of the
* target matrix. Exact Boolean works better if these matrices are 'cleaned'
* -- see the comment for the `clean_transform` function, above. */
const float4x4 inv_target_mat = clean_transform(target_transform).inverted();
const float4x4 inv_target_mat = math::invert(clean_transform(target_transform));
/* For each input `Mesh`, make `Vert`s and `Face`s for the corresponding
* vertices and polygons, and keep track of the original indices (using the
@ -297,7 +296,7 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
const float4x4 objn_mat = (obmats[mi] == nullptr) ? float4x4::identity() :
clean_transform(*obmats[mi]);
r_info->to_target_transform[mi] = inv_target_mat * objn_mat;
r_info->has_negative_transform[mi] = objn_mat.is_negative();
r_info->has_negative_transform[mi] = math::is_negative(objn_mat);
/* All meshes 1 and up will be transformed into the local space of operand 0.
* Historical behavior of the modifier has been to flip the faces of any meshes
@ -327,7 +326,7 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
else {
threading::parallel_for(vert_positions.index_range(), 2048, [&](IndexRange range) {
for (int i : range) {
float3 co = r_info->to_target_transform[mi] * vert_positions[i];
float3 co = math::transform_point(r_info->to_target_transform[mi], vert_positions[i]);
mpq3 mco = mpq3(co.x, co.y, co.z);
double3 dco(mco[0].get_d(), mco[1].get_d(), mco[2].get_d());
verts[i] = new Vert(mco, dco, NO_INDEX, i);
@ -560,8 +559,8 @@ static void get_poly2d_cos(const Mesh *me,
axis_dominant_v3_to_m3(r_axis_mat, axis_dominant);
for (const int i : poly_verts.index_range()) {
float3 co = positions[poly_verts[i]];
co = trans_mat * co;
mul_v2_m3v3(cos_2d[i], r_axis_mat, co);
co = math::transform_point(trans_mat, co);
*reinterpret_cast<float2 *>(&cos_2d[i]) = (float3x3(r_axis_mat) * co).xy();
}
}

View File

@ -306,23 +306,22 @@ void BKE_mesh_foreach_mapped_subdiv_face_center(
MeshForeachFlag flag)
{
const float(*positions)[3] = BKE_mesh_vert_positions(mesh);
const OffsetIndices polys = mesh->polys();
const blender::OffsetIndices polys = mesh->polys();
const blender::Span<int> corner_verts = mesh->corner_verts();
const float(*vert_normals)[3] = (flag & MESH_FOREACH_USE_NORMAL) ?
BKE_mesh_vertex_normals_ensure(mesh) :
nullptr;
const int *index = static_cast<const int *>(CustomData_get_layer(&mesh->pdata, CD_ORIGINDEX));
const BLI_bitmap *facedot_tags = mesh->runtime->subsurf_face_dot_tags;
BLI_assert(facedot_tags != nullptr);
const blender::BitVector<> &facedot_tags = mesh->runtime->subsurf_face_dot_tags;
if (index) {
for (int i = 0; i < mesh->totpoly; i++) {
for (const int i : polys.index_range()) {
const int orig = *index++;
if (orig == ORIGINDEX_NONE) {
continue;
}
for (const int vert : corner_verts.slice(polys[i])) {
if (BLI_BITMAP_TEST(facedot_tags, vert)) {
if (facedot_tags[vert]) {
func(userData,
orig,
positions[vert],
@ -332,9 +331,9 @@ void BKE_mesh_foreach_mapped_subdiv_face_center(
}
}
else {
for (int i = 0; i < mesh->totpoly; i++) {
for (const int i : polys.index_range()) {
for (const int vert : corner_verts.slice(polys[i])) {
if (BLI_BITMAP_TEST(facedot_tags, vert)) {
if (facedot_tags[vert]) {
func(userData,
i,
positions[vert],

View File

@ -1282,23 +1282,26 @@ void BKE_mesh_legacy_sharp_faces_from_flags(Mesh *mesh)
/** \name Face Set Conversion
* \{ */
void BKE_mesh_legacy_face_set_from_generic(Mesh *mesh,
blender::MutableSpan<CustomDataLayer> poly_layers)
void BKE_mesh_legacy_face_set_from_generic(blender::MutableSpan<CustomDataLayer> poly_layers)
{
using namespace blender;
void *faceset_data = nullptr;
bool changed = false;
for (CustomDataLayer &layer : poly_layers) {
if (StringRef(layer.name) == ".sculpt_face_set") {
faceset_data = layer.data;
layer.data = nullptr;
CustomData_free_layer_named(&mesh->pdata, ".sculpt_face_set", mesh->totpoly);
layer.type = CD_SCULPT_FACE_SETS;
layer.name[0] = '\0';
changed = true;
break;
}
}
if (faceset_data != nullptr) {
CustomData_add_layer(
&mesh->pdata, CD_SCULPT_FACE_SETS, CD_ASSIGN, faceset_data, mesh->totpoly);
if (!changed) {
return;
}
/* #CustomData expects the layers to be sorted in increasing order based on type. */
std::stable_sort(
poly_layers.begin(),
poly_layers.end(),
[](const CustomDataLayer &a, const CustomDataLayer &b) { return a.type < b.type; });
}
void BKE_mesh_legacy_face_set_to_generic(Mesh *mesh)

View File

@ -202,7 +202,7 @@ Mesh *BKE_mesh_mirror_apply_mirror_on_axis_for_modifier(MirrorModifierData *mmd,
/* Subdivision-surface, for example, won't have mesh data in the custom-data arrays.
* Now add position/#MEdge layers. */
if (BKE_mesh_vert_positions(mesh) != NULL) {
if (BKE_mesh_vert_positions(mesh) != nullptr) {
memcpy(BKE_mesh_vert_positions_for_write(result),
BKE_mesh_vert_positions(mesh),
sizeof(float[3]) * mesh->totvert);

View File

@ -105,7 +105,6 @@ MeshRuntime::~MeshRuntime()
if (this->shrinkwrap_data) {
BKE_shrinkwrap_boundary_data_free(this->shrinkwrap_data);
}
MEM_SAFE_FREE(this->subsurf_face_dot_tags);
}
} // namespace blender::bke
@ -230,10 +229,10 @@ void BKE_mesh_runtime_clear_geometry(Mesh *mesh)
mesh->runtime->bounds_cache.tag_dirty();
mesh->runtime->loose_edges_cache.tag_dirty();
mesh->runtime->looptris_cache.tag_dirty();
mesh->runtime->subsurf_face_dot_tags.clear_and_shrink();
if (mesh->runtime->shrinkwrap_data) {
BKE_shrinkwrap_boundary_data_free(mesh->runtime->shrinkwrap_data);
}
MEM_SAFE_FREE(mesh->runtime->subsurf_face_dot_tags);
}
void BKE_mesh_tag_edges_split(struct Mesh *mesh)
@ -245,10 +244,10 @@ void BKE_mesh_tag_edges_split(struct Mesh *mesh)
free_normals(*mesh->runtime);
free_subdiv_ccg(*mesh->runtime);
mesh->runtime->loose_edges_cache.tag_dirty();
mesh->runtime->subsurf_face_dot_tags.clear_and_shrink();
if (mesh->runtime->shrinkwrap_data) {
BKE_shrinkwrap_boundary_data_free(mesh->runtime->shrinkwrap_data);
}
MEM_SAFE_FREE(mesh->runtime->subsurf_face_dot_tags);
}
void BKE_mesh_tag_coords_changed(Mesh *mesh)

View File

@ -11,6 +11,7 @@
#define DNA_DEPRECATED_ALLOW
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdarg>
#include <cstddef>
@ -1016,6 +1017,9 @@ void BKE_modifier_deform_verts(ModifierData *md,
modwrap_dependsOnNormals(me);
}
mti->deformVerts(md, ctx, me, vertexCos, numVerts);
if (me) {
BKE_mesh_tag_coords_changed(me);
}
}
void BKE_modifier_deform_vertsEM(ModifierData *md,
@ -1514,3 +1518,28 @@ void BKE_modifier_blend_read_lib(BlendLibReader *reader, Object *ob)
}
}
}
namespace blender::bke {
using Clock = std::chrono::high_resolution_clock;
static double get_current_time_in_seconds()
{
return std::chrono::duration<double, std::chrono::seconds::period>(
Clock::now().time_since_epoch())
.count();
}
ScopedModifierTimer::ScopedModifierTimer(ModifierData &md) : md_(md)
{
start_time_ = get_current_time_in_seconds();
}
ScopedModifierTimer::~ScopedModifierTimer()
{
const double end_time = get_current_time_in_seconds();
const double duration = end_time - start_time_;
md_.execution_time = duration;
}
} // namespace blender::bke
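The `ScopedModifierTimer` instances sprinkled through this diff rely on nothing more than RAII: construction samples the clock, destruction writes the elapsed seconds back. A self-contained sketch of the same pattern (hypothetical names, plain C++):

#include <chrono>
#include <cstdio>

class ScopedTimer {
  using Clock = std::chrono::high_resolution_clock;

 public:
  explicit ScopedTimer(double &r_seconds) : r_seconds_(r_seconds), start_(Clock::now()) {}
  ~ScopedTimer()
  {
    r_seconds_ = std::chrono::duration<double>(Clock::now() - start_).count();
  }

 private:
  double &r_seconds_;
  Clock::time_point start_;
};

int main()
{
  double elapsed = 0.0;
  {
    ScopedTimer timer{elapsed};
    /* ... the work being measured ... */
  } /* Destructor runs here and stores the duration. */
  std::printf("%f seconds\n", elapsed);
  return 0;
}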

View File

@ -787,7 +787,6 @@ void BKE_nlastrips_add_strip_unsafe(ListBase *strips, NlaStrip *strip)
}
}
/** NULL and space check before adding an NLA strip. */
bool BKE_nlastrips_add_strip(ListBase *strips, NlaStrip *strip)
{
if (ELEM(NULL, strips, strip)) {

View File

@ -50,7 +50,7 @@ TEST(nla_strip, BKE_nlastrips_add_strip)
NlaStrip strip2{};
strip2.start = 5;
strip2.end = 10;
/* can't add a null NLA strip to an NLA Track. */
EXPECT_FALSE(BKE_nlastrips_add_strip(&strips, NULL));

View File

@ -15,9 +15,9 @@
#include "BLI_string_utf8.h"
#include "BLI_array.hh"
#include "BLI_float4x4.hh"
#include "BLI_math.h"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_math_vector.hh"
#include "BLI_rand.h"
#include "BLI_span.hh"
#include "BLI_vector.hh"
@ -956,11 +956,11 @@ static void make_duplis_geometry_set_impl(const DupliContext *ctx,
case InstanceReference::Type::Object: {
Object &object = reference.object();
float matrix[4][4];
mul_m4_m4m4(matrix, parent_transform, instance_offset_matrices[i].values);
mul_m4_m4m4(matrix, parent_transform, instance_offset_matrices[i].ptr());
make_dupli(ctx_for_instance, &object, matrix, id, &geometry_set, i);
float space_matrix[4][4];
mul_m4_m4m4(space_matrix, instance_offset_matrices[i].values, object.world_to_object);
mul_m4_m4m4(space_matrix, instance_offset_matrices[i].ptr(), object.world_to_object);
mul_m4_m4_pre(space_matrix, parent_transform);
make_recursive_duplis(ctx_for_instance, &object, space_matrix, id, &geometry_set, i);
break;
@ -970,7 +970,7 @@ static void make_duplis_geometry_set_impl(const DupliContext *ctx,
float collection_matrix[4][4];
unit_m4(collection_matrix);
sub_v3_v3(collection_matrix[3], collection.instance_offset);
mul_m4_m4_pre(collection_matrix, instance_offset_matrices[i].values);
mul_m4_m4_pre(collection_matrix, instance_offset_matrices[i].ptr());
mul_m4_m4_pre(collection_matrix, parent_transform);
DupliContext sub_ctx;
@ -1002,7 +1002,7 @@ static void make_duplis_geometry_set_impl(const DupliContext *ctx,
}
case InstanceReference::Type::GeometrySet: {
float new_transform[4][4];
mul_m4_m4m4(new_transform, parent_transform, instance_offset_matrices[i].values);
mul_m4_m4m4(new_transform, parent_transform, instance_offset_matrices[i].ptr());
DupliContext sub_ctx;
if (copy_dupli_context(&sub_ctx,

View File

@ -26,7 +26,7 @@
#include "BKE_pbvh.h"
#include "BKE_subdiv_ccg.h"
#include "DRW_pbvh.h"
#include "DRW_pbvh.hh"
#include "PIL_time.h"
@ -401,8 +401,8 @@ int BKE_pbvh_count_grid_quads(BLI_bitmap **grid_hidden,
/* grid hidden layer is present, so have to check each grid for
* visibility */
int depth1 = int(log2((double)gridsize - 1.0) + DBL_EPSILON);
int depth2 = int(log2((double)display_gridsize - 1.0) + DBL_EPSILON);
int depth1 = int(log2(double(gridsize) - 1.0) + DBL_EPSILON);
int depth2 = int(log2(double(display_gridsize) - 1.0) + DBL_EPSILON);
int skip = depth2 < depth1 ? 1 << (depth1 - depth2 - 1) : 1;
@ -688,8 +688,8 @@ static void pbvh_draw_args_init(PBVH *pbvh, PBVH_GPU_Args *args, PBVHNode *node)
args->face_sets_color_default = pbvh->face_sets_color_default;
args->face_sets_color_seed = pbvh->face_sets_color_seed;
args->vert_positions = pbvh->vert_positions;
args->corner_verts = pbvh->corner_verts;
args->corner_edges = pbvh->mesh ? BKE_mesh_corner_edges(pbvh->mesh) : NULL;
args->corner_verts = {pbvh->corner_verts, pbvh->mesh->totloop};
args->corner_edges = pbvh->mesh ? pbvh->mesh->corner_edges() : blender::Span<int>();
args->mpoly = pbvh->mpoly;
args->mlooptri = pbvh->looptri;
@ -3684,7 +3684,7 @@ static void pbvh_face_iter_step(PBVHFaceIter *fd, bool do_step)
}
BMFace *f = (BMFace *)BLI_gsetIterator_getKey(&fd->bm_faces_iter_);
fd->face.i = (intptr_t)f;
fd->face.i = intptr_t(f);
fd->index = f->head.index;
if (fd->cd_face_set_ != -1) {
@ -3700,7 +3700,7 @@ static void pbvh_face_iter_step(PBVHFaceIter *fd, bool do_step)
BMLoop *l = f->l_first;
do {
fd->verts[vertex_i++].i = (intptr_t)l->v;
fd->verts[vertex_i++].i = intptr_t(l->v);
} while ((l = l->next) != f->l_first);
break;

View File

@ -17,7 +17,7 @@
#include "BKE_ccg.h"
#include "BKE_pbvh.h"
#include "DRW_pbvh.h"
#include "DRW_pbvh.hh"
#include "bmesh.h"
#include "pbvh_intern.hh"
@ -1528,7 +1528,7 @@ bool pbvh_bmesh_node_raycast(PBVHNode *node,
if (j == 0 || len_squared_v3v3(location, cos[j]) <
len_squared_v3v3(location, nearest_vertex_co)) {
copy_v3_v3(nearest_vertex_co, cos[j]);
r_active_vertex->i = (intptr_t)node->bm_orvert[node->bm_ortri[i][j]];
r_active_vertex->i = intptr_t(node->bm_orvert[node->bm_ortri[i][j]]);
}
}
}
@ -1560,7 +1560,7 @@ bool pbvh_bmesh_node_raycast(PBVHNode *node,
if (j == 0 || len_squared_v3v3(location, v_tri[j]->co) <
len_squared_v3v3(location, nearest_vertex_co)) {
copy_v3_v3(nearest_vertex_co, v_tri[j]->co);
r_active_vertex->i = (intptr_t)v_tri[j];
r_active_vertex->i = intptr_t(v_tri[j]);
}
}
}

View File

@ -116,7 +116,7 @@ static void split_pixel_node(
const int axis = BB_widest_axis(&cb);
const float mid = (cb.bmax[axis] + cb.bmin[axis]) * 0.5f;
node->flag = (PBVHNodeFlags)((int)node->flag & (int)~PBVH_TexLeaf);
node->flag = (PBVHNodeFlags)(int(node->flag) & int(~PBVH_TexLeaf));
SplitNodePair *split1 = MEM_new<SplitNodePair>("split_pixel_node split1", split);
SplitNodePair *split2 = MEM_new<SplitNodePair>("split_pixel_node split1", split);
@ -188,7 +188,7 @@ static void split_pixel_node(
float2 delta = uv_prim.delta_barycentric_coord_u;
float2 uv1 = row.start_barycentric_coord;
float2 uv2 = row.start_barycentric_coord + delta * (float)row.num_pixels;
float2 uv2 = row.start_barycentric_coord + delta * float(row.num_pixels);
float co1[3];
float co2[3];
@ -210,7 +210,7 @@ static void split_pixel_node(
t = (mid - co1[axis]) / (co2[axis] - co1[axis]);
}
int num_pixels = (int)floorf((float)row.num_pixels * t);
int num_pixels = int(floorf(float(row.num_pixels) * t));
if (num_pixels) {
row1.num_pixels = num_pixels;
@ -223,7 +223,7 @@ static void split_pixel_node(
row2.num_pixels = row.num_pixels - num_pixels;
row2.start_barycentric_coord = row.start_barycentric_coord +
uv_prim.delta_barycentric_coord_u * (float)num_pixels;
uv_prim.delta_barycentric_coord_u * float(num_pixels);
row2.start_image_coordinate = row.start_image_coordinate;
row2.start_image_coordinate[0] += num_pixels;
@ -731,7 +731,7 @@ static bool update_pixels(PBVH *pbvh, Mesh *mesh, Image *image, ImageUser *image
PBVHNode &node = pbvh->nodes[i];
if (node.flag & PBVH_Leaf) {
node.flag = (PBVHNodeFlags)((int)node.flag | (int)PBVH_TexLeaf);
node.flag = (PBVHNodeFlags)(int(node.flag) | int(PBVH_TexLeaf));
}
}

View File

@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_math_matrix.hh"
#include "pbvh_uv_islands.hh"
#include <optional>
@ -1244,14 +1246,14 @@ UVBorderCorner::UVBorderCorner(UVBorderEdge *first, UVBorderEdge *second, float
float2 UVBorderCorner::uv(float factor, float min_uv_distance)
{
using namespace blender::math;
float2 origin = first->get_uv_vertex(1)->uv;
float angle_between = angle * factor;
float desired_len = max_ff(second->length() * factor + first->length() * (1.0 - factor),
min_uv_distance);
float2 v = first->get_uv_vertex(0)->uv - origin;
normalize_v2(v);
float2 v = normalize(first->get_uv_vertex(0)->uv - origin);
float3x3 rot_mat = float3x3::from_rotation(angle_between);
float2x2 rot_mat = from_rotation<float2x2>(AngleRadian(angle_between));
float2 rotated = rot_mat * v;
float2 result = rotated * desired_len + first->get_uv_vertex(1)->uv;
return result;

View File

@ -24,9 +24,9 @@
#include "BLI_array.hh"
#include "BLI_edgehash.h"
#include "BLI_float3x3.hh"
#include "BLI_map.hh"
#include "BLI_math.h"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_rect.h"
#include "BLI_vector.hh"

View File

@ -380,6 +380,8 @@ static void pointcloud_evaluate_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (mti->modifyGeometrySet) {
mti->modifyGeometrySet(md, &mectx, &geometry_set);
}

View File

@ -161,7 +161,7 @@ static float get_edge_sharpness(const OpenSubdiv_Converter *converter, int manif
return 10.0f;
}
#endif
if (!storage->settings.use_creases || storage->cd_edge_crease == NULL) {
if (!storage->settings.use_creases || storage->cd_edge_crease == nullptr) {
return 0.0f;
}
const int edge_index = storage->manifold_edge_index_reverse[manifold_edge_index];
@ -184,7 +184,7 @@ static bool is_infinite_sharp_vertex(const OpenSubdiv_Converter *converter,
static float get_vertex_sharpness(const OpenSubdiv_Converter *converter, int manifold_vertex_index)
{
ConverterStorage *storage = static_cast<ConverterStorage *>(converter->user_data);
if (!storage->settings.use_creases || storage->cd_vertex_crease == NULL) {
if (!storage->settings.use_creases || storage->cd_vertex_crease == nullptr) {
return 0.0f;
}
const int vertex_index = storage->manifold_vertex_index_reverse[manifold_vertex_index];
@ -208,7 +208,7 @@ static void precalc_uv_layer(const OpenSubdiv_Converter *converter, const int la
const int num_vert = mesh->totvert;
const float limit[2] = {STD_UV_CONNECT_LIMIT, STD_UV_CONNECT_LIMIT};
/* Initialize memory required for the operations. */
if (storage->loop_uv_indices == NULL) {
if (storage->loop_uv_indices == nullptr) {
storage->loop_uv_indices = static_cast<int *>(
MEM_malloc_arrayN(mesh->totloop, sizeof(int), "loop uv vertex index"));
}
@ -227,7 +227,7 @@ static void precalc_uv_layer(const OpenSubdiv_Converter *converter, const int la
storage->num_uv_coordinates = -1;
for (int vertex_index = 0; vertex_index < num_vert; vertex_index++) {
const UvMapVert *uv_vert = BKE_mesh_uv_vert_map_get_vert(uv_vert_map, vertex_index);
while (uv_vert != NULL) {
while (uv_vert != nullptr) {
if (uv_vert->separate) {
storage->num_uv_coordinates++;
}
@ -287,17 +287,17 @@ static void init_functions(OpenSubdiv_Converter *converter)
converter->getNumFaceVertices = get_num_face_vertices;
converter->getFaceVertices = get_face_vertices;
converter->getFaceEdges = NULL;
converter->getFaceEdges = nullptr;
converter->getEdgeVertices = get_edge_vertices;
converter->getNumEdgeFaces = NULL;
converter->getEdgeFaces = NULL;
converter->getNumEdgeFaces = nullptr;
converter->getEdgeFaces = nullptr;
converter->getEdgeSharpness = get_edge_sharpness;
converter->getNumVertexEdges = NULL;
converter->getVertexEdges = NULL;
converter->getNumVertexFaces = NULL;
converter->getVertexFaces = NULL;
converter->getNumVertexEdges = nullptr;
converter->getVertexEdges = nullptr;
converter->getNumVertexFaces = nullptr;
converter->getVertexFaces = nullptr;
converter->isInfiniteSharpVertex = is_infinite_sharp_vertex;
converter->getVertexSharpness = get_vertex_sharpness;
@ -316,36 +316,36 @@ static void initialize_manifold_index_array(const BLI_bitmap *used_map,
int **r_indices_reverse,
int *r_num_manifold_elements)
{
int *indices = NULL;
if (r_indices != NULL) {
int *indices = nullptr;
if (r_indices != nullptr) {
indices = static_cast<int *>(MEM_malloc_arrayN(num_elements, sizeof(int), "manifold indices"));
}
int *indices_reverse = NULL;
if (r_indices_reverse != NULL) {
int *indices_reverse = nullptr;
if (r_indices_reverse != nullptr) {
indices_reverse = static_cast<int *>(
MEM_malloc_arrayN(num_elements, sizeof(int), "manifold indices reverse"));
}
int offset = 0;
for (int i = 0; i < num_elements; i++) {
if (BLI_BITMAP_TEST_BOOL(used_map, i)) {
if (indices != NULL) {
if (indices != nullptr) {
indices[i] = i - offset;
}
if (indices_reverse != NULL) {
if (indices_reverse != nullptr) {
indices_reverse[i - offset] = i;
}
}
else {
if (indices != NULL) {
if (indices != nullptr) {
indices[i] = -1;
}
offset++;
}
}
if (r_indices != NULL) {
if (r_indices != nullptr) {
*r_indices = indices;
}
if (r_indices_reverse != NULL) {
if (r_indices_reverse != nullptr) {
*r_indices_reverse = indices_reverse;
}
*r_num_manifold_elements = num_elements - offset;
@ -372,7 +372,7 @@ static void initialize_manifold_indices(ConverterStorage *storage)
&storage->num_manifold_vertices);
initialize_manifold_index_array(edge_used_map,
mesh->totedge,
NULL,
nullptr,
&storage->manifold_edge_index_reverse,
&storage->num_manifold_edges);
/* Initialize infinite sharp mapping. */
@ -405,7 +405,7 @@ static void init_user_data(OpenSubdiv_Converter *converter,
CustomData_get_layer(&mesh->vdata, CD_CREASE));
user_data->cd_edge_crease = static_cast<const float *>(
CustomData_get_layer(&mesh->edata, CD_CREASE));
user_data->loop_uv_indices = NULL;
user_data->loop_uv_indices = nullptr;
initialize_manifold_indices(user_data);
converter->user_data = user_data;
}

View File

@ -528,9 +528,8 @@ static bool subdiv_mesh_topology_info(const SubdivForeachContext *foreach_contex
subdiv_context->coarse_mesh, num_vertices, num_edges, 0, num_loops, num_polygons, mask);
subdiv_mesh_ctx_cache_custom_data_layers(subdiv_context);
subdiv_mesh_prepare_accumulator(subdiv_context, num_vertices);
MEM_SAFE_FREE(subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags);
subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags = BLI_BITMAP_NEW(num_vertices,
__func__);
subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags.clear();
subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags.resize(num_vertices);
return true;
}
@ -592,7 +591,7 @@ static void evaluate_vertex_and_apply_displacement_copy(const SubdivMeshContext
/* Evaluate undeformed texture coordinate. */
subdiv_vertex_orco_evaluate(ctx, ptex_face_index, u, v, subdiv_vertex_index);
/* Remove face-dot flag. This can happen if there is more than one subsurf modifier. */
BLI_BITMAP_DISABLE(ctx->subdiv_mesh->runtime->subsurf_face_dot_tags, subdiv_vertex_index);
ctx->subdiv_mesh->runtime->subsurf_face_dot_tags[subdiv_vertex_index].reset();
}
static void evaluate_vertex_and_apply_displacement_interpolate(
@ -748,7 +747,7 @@ static void subdiv_mesh_tag_center_vertex(const IndexRange coarse_poly,
Mesh *subdiv_mesh)
{
if (subdiv_mesh_is_center_vertex(coarse_poly, u, v)) {
BLI_BITMAP_ENABLE(subdiv_mesh->runtime->subsurf_face_dot_tags, subdiv_vertex_index);
subdiv_mesh->runtime->subsurf_face_dot_tags[subdiv_vertex_index].set();
}
}
@ -782,10 +781,10 @@ static void subdiv_mesh_vertex_inner(const SubdivForeachContext *foreach_context
static void subdiv_copy_edge_data(SubdivMeshContext *ctx,
MEdge *subdiv_edge,
const MEdge *coarse_edge)
const int coarse_edge_index)
{
const int subdiv_edge_index = subdiv_edge - ctx->subdiv_edges;
if (coarse_edge == nullptr) {
if (coarse_edge_index == ORIGINDEX_NONE) {
subdiv_edge->flag = 0;
if (!ctx->settings->use_optimal_display) {
subdiv_edge->flag |= ME_EDGEDRAW;
@ -795,7 +794,6 @@ static void subdiv_copy_edge_data(SubdivMeshContext *ctx,
}
return;
}
const int coarse_edge_index = coarse_edge - ctx->coarse_edges;
CustomData_copy_data(
&ctx->coarse_mesh->edata, &ctx->subdiv_mesh->edata, coarse_edge_index, subdiv_edge_index, 1);
subdiv_edge->flag |= ME_EDGEDRAW;
@ -812,12 +810,7 @@ static void subdiv_mesh_edge(const SubdivForeachContext *foreach_context,
SubdivMeshContext *ctx = static_cast<SubdivMeshContext *>(foreach_context->user_data);
MEdge *subdiv_medge = ctx->subdiv_edges;
MEdge *subdiv_edge = &subdiv_medge[subdiv_edge_index];
const MEdge *coarse_edge = nullptr;
if (coarse_edge_index != ORIGINDEX_NONE) {
const MEdge *coarse_medge = ctx->coarse_edges;
coarse_edge = &coarse_medge[coarse_edge_index];
}
subdiv_copy_edge_data(ctx, subdiv_edge, coarse_edge);
subdiv_copy_edge_data(ctx, subdiv_edge, coarse_edge_index);
subdiv_edge->v1 = subdiv_v1;
subdiv_edge->v2 = subdiv_v2;
}

View File

@ -14,11 +14,11 @@
#include "BLI_compiler_compat.h"
#include "BLI_fileops.h"
#include "BLI_float4x4.hh"
#include "BLI_ghash.h"
#include "BLI_index_range.hh"
#include "BLI_map.hh"
#include "BLI_math.h"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_path_util.h"
#include "BLI_string.h"
@ -1108,6 +1108,8 @@ static void volume_evaluate_modifiers(struct Depsgraph *depsgraph,
continue;
}
blender::bke::ScopedModifierTimer modifier_timer{*md};
if (mti->modifyGeometrySet) {
mti->modifyGeometrySet(md, &mectx, &geometry_set);
}
@ -1650,7 +1652,7 @@ openvdb::GridBase::ConstPtr BKE_volume_grid_shallow_transform(openvdb::GridBase:
const blender::float4x4 &transform)
{
openvdb::math::Transform::Ptr grid_transform = grid->transform().copy();
grid_transform->postMult(openvdb::Mat4d((float *)transform.values));
grid_transform->postMult(openvdb::Mat4d((float *)transform.ptr()));
/* Create a transformed grid. The underlying tree is shared. */
return grid->copyGridReplacingTransform(grid_transform);

View File

@ -98,8 +98,8 @@ class Any {
private:
/* Makes it possible to use void in the template parameters. */
using RealExtraInfo =
std::conditional_t<std::is_void_v<ExtraInfo>, detail::NoExtraInfo, ExtraInfo>;
using Info = detail::AnyTypeInfo<RealExtraInfo>;
std::conditional_t<std::is_void_v<ExtraInfo>, blender::detail::NoExtraInfo, ExtraInfo>;
using Info = blender::detail::AnyTypeInfo<RealExtraInfo>;
static constexpr size_t RealInlineBufferCapacity = std::max(InlineBufferCapacity,
sizeof(std::unique_ptr<int>));

View File

@ -478,6 +478,28 @@ class BitVector {
this->realloc_to_at_least(new_capacity_in_bits);
}
/**
* Reset the size of the vector to zero elements, but keep the same memory capacity to be
* refilled again.
*/
void clear()
{
size_in_bits_ = 0;
}
/**
* Free memory and reset the vector to zero elements.
*/
void clear_and_shrink()
{
size_in_bits_ = 0;
capacity_in_bits_ = 0;
if (!this->is_inline()) {
allocator_.deallocate(data_);
}
data_ = inline_buffer_;
}
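/* Usage matches how this diff uses the type elsewhere (e.g. the
 * `subsurf_face_dot_tags` changes above); a sketch, assuming the BitVector
 * API shown in this commit:
 *
 *   BitVector<> tags;
 *   tags.resize(100);        // One bit per element, all unset.
 *   tags[42].set();
 *   tags.clear();            // Size 0, capacity kept: cheap to refill.
 *   tags.clear_and_shrink(); // Size 0 and the heap buffer is freed.
 */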
private:
void ensure_space_for_one()
{

View File

@ -1,215 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <cmath>
#include <cstdint>
#include "BLI_assert.h"
#include "BLI_math_base.h"
#include "BLI_math_matrix.h"
#include "BLI_math_vector.h"
#include "BLI_math_vector_types.hh"
namespace blender {
struct float3x3 {
/* A 3x3 matrix in column major order. */
float values[3][3];
float3x3() = default;
float3x3(const float *matrix)
{
memcpy(values, matrix, sizeof(float) * 3 * 3);
}
float3x3(const float matrix[3][3]) : float3x3(static_cast<const float *>(matrix[0]))
{
}
static float3x3 zero()
{
float3x3 result;
zero_m3(result.values);
return result;
}
static float3x3 identity()
{
float3x3 result;
unit_m3(result.values);
return result;
}
static float3x3 from_translation(const float2 translation)
{
float3x3 result = identity();
result.values[2][0] = translation.x;
result.values[2][1] = translation.y;
return result;
}
static float3x3 from_rotation(float rotation)
{
float3x3 result = zero();
const float cosine = std::cos(rotation);
const float sine = std::sin(rotation);
result.values[0][0] = cosine;
result.values[0][1] = sine;
result.values[1][0] = -sine;
result.values[1][1] = cosine;
result.values[2][2] = 1.0f;
return result;
}
static float3x3 from_scale(const float2 scale)
{
float3x3 result = zero();
result.values[0][0] = scale.x;
result.values[1][1] = scale.y;
result.values[2][2] = 1.0f;
return result;
}
static float3x3 from_translation_rotation_scale(const float2 translation,
float rotation,
const float2 scale)
{
float3x3 result;
const float cosine = std::cos(rotation);
const float sine = std::sin(rotation);
result.values[0][0] = scale.x * cosine;
result.values[0][1] = scale.x * sine;
result.values[0][2] = 0.0f;
result.values[1][0] = scale.y * -sine;
result.values[1][1] = scale.y * cosine;
result.values[1][2] = 0.0f;
result.values[2][0] = translation.x;
result.values[2][1] = translation.y;
result.values[2][2] = 1.0f;
return result;
}
static float3x3 from_normalized_axes(const float2 translation,
const float2 horizontal,
const float2 vertical)
{
BLI_ASSERT_UNIT_V2(horizontal);
BLI_ASSERT_UNIT_V2(vertical);
float3x3 result;
result.values[0][0] = horizontal.x;
result.values[0][1] = horizontal.y;
result.values[0][2] = 0.0f;
result.values[1][0] = vertical.x;
result.values[1][1] = vertical.y;
result.values[1][2] = 0.0f;
result.values[2][0] = translation.x;
result.values[2][1] = translation.y;
result.values[2][2] = 1.0f;
return result;
}
/* Construct a transformation that is pivoted around the given origin point. So for instance,
* from_origin_transformation(from_rotation(M_PI_2), float2(0.0f, 2.0f))
* will construct a transformation representing a 90 degree rotation around the point (0, 2). */
static float3x3 from_origin_transformation(const float3x3 &transformation, const float2 origin)
{
return from_translation(origin) * transformation * from_translation(-origin);
}
operator float *()
{
return &values[0][0];
}
operator const float *() const
{
return &values[0][0];
}
float *operator[](const int64_t index)
{
BLI_assert(index >= 0);
BLI_assert(index < 3);
return &values[index][0];
}
const float *operator[](const int64_t index) const
{
BLI_assert(index >= 0);
BLI_assert(index < 3);
return &values[index][0];
}
using c_style_float3x3 = float[3][3];
c_style_float3x3 &ptr()
{
return values;
}
const c_style_float3x3 &ptr() const
{
return values;
}
friend float3x3 operator*(const float3x3 &a, const float3x3 &b)
{
float3x3 result;
mul_m3_m3m3(result.values, a.values, b.values);
return result;
}
friend float3 operator*(const float3x3 &a, const float3 &b)
{
float3 result;
mul_v3_m3v3(result, a.values, b);
return result;
}
void operator*=(const float3x3 &other)
{
mul_m3_m3_post(values, other.values);
}
friend float2 operator*(const float3x3 &transformation, const float2 &vector)
{
float2 result;
mul_v2_m3v2(result, transformation.values, vector);
return result;
}
friend float2 operator*(const float3x3 &transformation, const float (*vector)[2])
{
return transformation * float2(vector);
}
float3x3 transposed() const
{
float3x3 result;
transpose_m3_m3(result.values, values);
return result;
}
float3x3 inverted() const
{
float3x3 result;
invert_m3_m3(result.values, values);
return result;
}
float2 scale_2d() const
{
float2 scale;
mat3_to_size_2d(scale, values);
return scale;
}
friend bool operator==(const float3x3 &a, const float3x3 &b)
{
return equals_m3m3(a.values, b.values);
}
};
} // namespace blender

View File

@ -1,280 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "BLI_math_matrix.h"
#include "BLI_math_vector.h"
#include "BLI_math_vector.hh"
#include "BLI_math_vector_types.hh"
namespace blender {
struct float4x4 {
float values[4][4];
float4x4() = default;
float4x4(const float *matrix)
{
memcpy(values, matrix, sizeof(float) * 16);
}
float4x4(const float matrix[4][4]) : float4x4(static_cast<const float *>(matrix[0]))
{
}
/* Assumes an XYZ euler order. */
static float4x4 from_loc_eul_scale(const float3 location,
const float3 rotation,
const float3 scale)
{
float4x4 mat;
loc_eul_size_to_mat4(mat.values, location, rotation, scale);
return mat;
}
static float4x4 from_location(const float3 location)
{
float4x4 mat = float4x4::identity();
copy_v3_v3(mat.values[3], location);
return mat;
}
static float4x4 from_normalized_axis_data(const float3 location,
const float3 forward,
const float3 up)
{
BLI_ASSERT_UNIT_V3(forward);
BLI_ASSERT_UNIT_V3(up);
/* Negate the cross product so that the resulting matrix has determinant 1 (instead of -1).
* Without the negation, the result would be a so-called improper rotation. That means it
* contains a reflection. Such an improper rotation matrix cannot be converted to another
* representation of a rotation, such as Euler angles. */
const float3 cross = -math::cross(forward, up);
float4x4 matrix;
matrix.values[0][0] = forward.x;
matrix.values[1][0] = cross.x;
matrix.values[2][0] = up.x;
matrix.values[3][0] = location.x;
matrix.values[0][1] = forward.y;
matrix.values[1][1] = cross.y;
matrix.values[2][1] = up.y;
matrix.values[3][1] = location.y;
matrix.values[0][2] = forward.z;
matrix.values[1][2] = cross.z;
matrix.values[2][2] = up.z;
matrix.values[3][2] = location.z;
matrix.values[0][3] = 0.0f;
matrix.values[1][3] = 0.0f;
matrix.values[2][3] = 0.0f;
matrix.values[3][3] = 1.0f;
return matrix;
}
static float4x4 identity()
{
float4x4 mat;
unit_m4(mat.values);
return mat;
}
operator float *()
{
return &values[0][0];
}
operator const float *() const
{
return &values[0][0];
}
float *operator[](const int64_t index)
{
BLI_assert(index >= 0);
BLI_assert(index < 4);
return &values[index][0];
}
const float *operator[](const int64_t index) const
{
BLI_assert(index >= 0);
BLI_assert(index < 4);
return &values[index][0];
}
using c_style_float4x4 = float[4][4];
c_style_float4x4 &ptr()
{
return values;
}
const c_style_float4x4 &ptr() const
{
return values;
}
friend float4x4 operator*(const float4x4 &a, const float4x4 &b)
{
float4x4 result;
mul_m4_m4m4(result.values, a.values, b.values);
return result;
}
void operator*=(const float4x4 &other)
{
mul_m4_m4_post(values, other.values);
}
/**
* This also applies the translation on the vector. Use `m.ref_3x3() * v` if that is not
* intended.
*/
friend float3 operator*(const float4x4 &m, const float3 &v)
{
float3 result;
mul_v3_m4v3(result, m.values, v);
return result;
}
friend float3 operator*(const float4x4 &m, const float (*v)[3])
{
return m * float3(v);
}
friend bool operator==(const float4x4 &a, const float4x4 &b)
{
return equals_m4m4(a.ptr(), b.ptr());
}
friend bool operator!=(const float4x4 &a, const float4x4 &b)
{
return !(a == b);
}
float3 translation() const
{
return float3(values[3]);
}
/* Assumes XYZ rotation order. */
float3 to_euler() const
{
float3 euler;
mat4_to_eul(euler, values);
return euler;
}
float3 scale() const
{
float3 scale;
mat4_to_size(scale, values);
return scale;
}
void apply_scale(const float scale)
{
values[0][0] *= scale;
values[0][1] *= scale;
values[0][2] *= scale;
values[1][0] *= scale;
values[1][1] *= scale;
values[1][2] *= scale;
values[2][0] *= scale;
values[2][1] *= scale;
values[2][2] *= scale;
}
float4x4 inverted() const
{
float4x4 result;
invert_m4_m4(result.values, values);
return result;
}
/**
* Matrix inversion can be implemented more efficiently for affine matrices.
*/
float4x4 inverted_affine() const
{
BLI_assert(values[0][3] == 0.0f && values[1][3] == 0.0f && values[2][3] == 0.0f &&
values[3][3] == 1.0f);
return this->inverted();
}
float4x4 transposed() const
{
float4x4 result;
transpose_m4_m4(result.values, values);
return result;
}
float4x4 inverted_transposed_affine() const
{
return this->inverted_affine().transposed();
}
struct float3x3_ref {
const float4x4 &data;
friend float3 operator*(const float3x3_ref &m, const float3 &v)
{
float3 result;
mul_v3_mat3_m4v3(result, m.data.values, v);
return result;
}
};
float3x3_ref ref_3x3() const
{
return {*this};
}
static float4x4 interpolate(const float4x4 &a, const float4x4 &b, float t)
{
float result[4][4];
interp_m4_m4m4(result, a.values, b.values, t);
return result;
}
bool is_negative() const
{
return is_negative_m4(ptr());
}
uint64_t hash() const
{
uint64_t h = 435109;
for (int i = 0; i < 16; i++) {
float value = (static_cast<const float *>(values[0]))[i];
h = h * 33 + *reinterpret_cast<const uint32_t *>(&value);
}
return h;
}
friend std::ostream &operator<<(std::ostream &stream, const float4x4 &mat)
{
char fchar[16];
stream << "(\n";
for (int i = 0; i < 4; i++) {
stream << "(";
for (int j = 0; j < 4; j++) {
snprintf(fchar, sizeof(fchar), "%11.6f", mat[j][i]);
stream << fchar;
if (j != 3) {
stream << ", ";
}
}
stream << ")\n";
}
stream << ")\n";
return stream;
}
};
} // namespace blender

View File

@ -243,6 +243,16 @@ void BLI_movelisttolist(struct ListBase *dst, struct ListBase *src) ATTR_NONNULL
* Moves the entire contents of \a src at the beginning of \a dst.
*/
void BLI_movelisttolist_reverse(struct ListBase *dst, struct ListBase *src) ATTR_NONNULL(1, 2);
/**
* Split `original_listbase` after the given `vlink`, putting the remainder of the list into the
* given `split_listbase`.
*
* \note If `vlink` is nullptr, it is considered as 'the item before the first item', so the whole
* list is moved from `original_listbase` to `split_listbase`.
*/
void BLI_listbase_split_after(struct ListBase *original_listbase,
struct ListBase *split_listbase,
void *vlink) ATTR_NONNULL(1, 2);
/**
* Sets dst to a duplicate of the entire contents of src. dst may be the same as src.
*/

View File

@ -176,7 +176,7 @@ template<typename T, int Size>
/**
* Create a translation-only matrix. Matrix dimensions should be at least 4 col x 3 row.
*/
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::vec3_type &location);
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::loc_type &location);
/**
* Create a matrix whose diagonal is defined by the given scale vector.
@ -201,14 +201,14 @@ template<typename MatT, typename RotationT, typename VectorT>
* Create a transform matrix with translation and rotation applied in this order.
*/
template<typename MatT, typename RotationT>
[[nodiscard]] MatT from_loc_rot(const typename MatT::vec3_type &location,
[[nodiscard]] MatT from_loc_rot(const typename MatT::loc_type &location,
const RotationT &rotation);
/**
* Create a transform matrix with translation, rotation and scale applied in this order.
*/
template<typename MatT, typename RotationT, int ScaleDim>
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::vec3_type &location,
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::loc_type &location,
const RotationT &rotation,
const VecBase<typename MatT::base_type, ScaleDim> &scale);
@ -229,6 +229,14 @@ template<typename MatT, typename VectorT>
const VectorT forward,
const VectorT up);
/**
* Construct a transformation that is pivoted around the given origin point. So for instance,
* from_origin_transform<MatT>(from_rotation(M_PI_2), float2(0.0f, 2.0f))
* will construct a transformation representing a 90 degree rotation around the point (0, 2).
*/
template<typename MatT, typename VectorT>
[[nodiscard]] MatT from_origin_transform(const MatT &transform, const VectorT origin);
/** \} */
/* -------------------------------------------------------------------- */
@ -259,6 +267,8 @@ template<typename T, bool Normalized = false>
*/
template<bool AllowNegativeScale = false, typename T, int NumCol, int NumRow>
[[nodiscard]] inline VecBase<T, 3> to_scale(const MatBase<T, NumCol, NumRow> &mat);
template<bool AllowNegativeScale = false, typename T>
[[nodiscard]] inline VecBase<T, 2> to_scale(const MatBase<T, 2, 2> &mat);
/**
* Decompose a matrix into location, rotation, and scale components.
@ -464,6 +474,9 @@ inline bool is_zero(const MatBase<T, NumCol, NumRow> &mat)
/* Implementation details. */
namespace detail {
template<typename T, int NumCol, int NumRow>
[[nodiscard]] MatBase<T, NumCol, NumRow> from_rotation(const AngleRadian<T> &rotation);
template<typename T, int NumCol, int NumRow>
[[nodiscard]] MatBase<T, NumCol, NumRow> from_rotation(const EulerXYZ<T> &rotation);
@ -491,7 +504,8 @@ template<typename T, int Size>
[[nodiscard]] MatBase<T, Size, Size> invert(const MatBase<T, Size, Size> &mat)
{
bool success;
return invert(mat, success);
/* Explicit template parameter to please MSVC. */
return invert<T, Size>(mat, success);
}
template<typename T, int NumCol, int NumRow>
@ -798,7 +812,7 @@ MatBase<T, NumCol, NumRow> from_rotation(const EulerXYZ<T> &rotation)
DoublePrecision sc = si * ch;
DoublePrecision ss = si * sh;
MatT mat;
MatT mat = MatT::identity();
mat[0][0] = T(cj * ch);
mat[1][0] = T(sj * sc - cs);
mat[2][0] = T(sj * cc + ss);
@ -833,7 +847,7 @@ MatBase<T, NumCol, NumRow> from_rotation(const Quaternion<T> &rotation)
DoublePrecision qbc = q2 * q3;
DoublePrecision qcc = q3 * q3;
MatT mat;
MatT mat = MatT::identity();
mat[0][0] = T(1.0 - qbb - qcc);
mat[0][1] = T(qdc + qab);
mat[0][2] = T(-qdb + qac);
@ -877,7 +891,25 @@ MatBase<T, NumCol, NumRow> from_rotation(const AxisAngle<T> &rotation)
return mat;
}
template<typename T, int NumCol, int NumRow>
MatBase<T, NumCol, NumRow> from_rotation(const AngleRadian<T> &rotation)
{
using MatT = MatBase<T, NumCol, NumRow>;
T ci = math::cos(rotation.value);
T si = math::sin(rotation.value);
MatT mat = MatT::identity();
mat[0][0] = ci;
mat[1][0] = -si;
mat[0][1] = si;
mat[1][1] = ci;
return mat;
}
/* Using explicit template instantiations in order to reduce compilation time. */
extern template MatBase<float, 2, 2> from_rotation(const AngleRadian<float> &rotation);
extern template MatBase<float, 3, 3> from_rotation(const AngleRadian<float> &rotation);
extern template MatBase<float, 3, 3> from_rotation(const EulerXYZ<float> &rotation);
extern template MatBase<float, 4, 4> from_rotation(const EulerXYZ<float> &rotation);
extern template MatBase<float, 3, 3> from_rotation(const Quaternion<float> &rotation);
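A quick standalone check of the column layout in the new `AngleRadian` overload (plain C++, independent of MatBase): rotating the x-axis by 90 degrees should land on the y-axis.

#include <cmath>
#include <cstdio>

int main()
{
  const float angle = 1.57079632679f; /* 90 degrees in radians. */
  const float ci = std::cos(angle);
  const float si = std::sin(angle);
  /* Column-major, as above: col 0 = (ci, si), col 1 = (-si, ci). */
  const float m[2][2] = {{ci, si}, {-si, ci}};
  const float v[2] = {1.0f, 0.0f};
  const float r[2] = {m[0][0] * v[0] + m[1][0] * v[1],
                      m[0][1] * v[0] + m[1][1] * v[1]};
  std::printf("(%f, %f)\n", r[0], r[1]); /* Prints approximately (0, 1). */
  return 0;
}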
@ -940,6 +972,18 @@ template<bool AllowNegativeScale, typename T, int NumCol, int NumRow>
return result;
}
template<bool AllowNegativeScale, typename T>
[[nodiscard]] inline VecBase<T, 2> to_scale(const MatBase<T, 2, 2> &mat)
{
VecBase<T, 2> result = {length(mat.x), length(mat.y)};
if constexpr (AllowNegativeScale) {
if (UNLIKELY(is_negative(mat))) {
result = -result;
}
}
return result;
}
/* Implementation details. Use `to_euler` and `to_quaternion` instead. */
namespace detail {
@ -982,7 +1026,7 @@ inline void to_loc_rot_scale(const MatBase<T, 4, 4> &mat,
to_rot_scale<AllowNegativeScale>(MatBase<T, 3, 3>(mat), r_rotation, r_scale);
}
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::vec3_type &location)
template<typename MatT> [[nodiscard]] MatT from_location(const typename MatT::loc_type &location)
{
MatT mat = MatT::identity();
mat.location() = location;
@ -1013,22 +1057,23 @@ template<typename MatT, typename RotationT, typename VectorT>
}
template<typename MatT, typename RotationT, int ScaleDim>
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::vec3_type &location,
[[nodiscard]] MatT from_loc_rot_scale(const typename MatT::loc_type &location,
const RotationT &rotation,
const VecBase<typename MatT::base_type, ScaleDim> &scale)
{
using Mat3x3 = MatBase<typename MatT::base_type, 3, 3>;
MatT mat = MatT(from_rot_scale<Mat3x3>(rotation, scale));
using MatRotT =
MatBase<typename MatT::base_type, MatT::loc_type::type_length, MatT::loc_type::type_length>;
MatT mat = MatT(from_rot_scale<MatRotT>(rotation, scale));
mat.location() = location;
return mat;
}
template<typename MatT, typename RotationT>
[[nodiscard]] MatT from_loc_rot(const typename MatT::vec3_type &location,
const RotationT &rotation)
[[nodiscard]] MatT from_loc_rot(const typename MatT::loc_type &location, const RotationT &rotation)
{
using Mat3x3 = MatBase<typename MatT::base_type, 3, 3>;
MatT mat = MatT(from_rotation<Mat3x3>(rotation));
using MatRotT =
MatBase<typename MatT::base_type, MatT::loc_type::type_length, MatT::loc_type::type_length>;
MatT mat = MatT(from_rotation<MatRotT>(rotation));
mat.location() = location;
return mat;
}
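With location() now typed as loc_type, a 3x3 matrix can carry a 2D translation directly; a sketch of the intended call (aliases assumed):
/* Build a 2D transform: translate by (10, 5), rotate by 0.5 rad. */
float3x3 m = math::from_loc_rot<float3x3>(float2(10.0f, 5.0f), AngleRadian(0.5f));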
@ -1059,6 +1104,12 @@ template<typename MatT, typename VectorT>
return matrix;
}
template<typename MatT, typename VectorT>
[[nodiscard]] MatT from_origin_transform(const MatT &transform, const VectorT origin)
{
return from_location<MatT>(origin) * transform * from_location<MatT>(-origin);
}
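A usage sketch for from_origin_transform, which is the usual translate–transform–translate-back conjugation so a transform acts about a pivot instead of the origin (aliases assumed):
/* Rotate 90 degrees around Z, but about the point (1, 2, 0). */
float4x4 rot = math::from_rotation<float4x4>(EulerXYZ(0.0f, 0.0f, M_PI_2));
float4x4 pivoted = math::from_origin_transform(rot, float3(1.0f, 2.0f, 0.0f));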
template<typename T>
VecBase<T, 3> transform_point(const MatBase<T, 3, 3> &mat, const VecBase<T, 3> &point)
{

View File

@ -30,8 +30,6 @@
* defined outside of the class in the `blender::math` namespace.
*/
#define __BLI_MATH_MATRIX_TYPES_HH__
#include <array>
#include <cmath>
#include <iostream>
@ -79,6 +77,7 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
using vec3_type = VecBase<T, 3>;
using col_type = VecBase<T, NumRow>;
using row_type = VecBase<T, NumCol>;
using loc_type = VecBase<T, (NumRow < NumCol) ? NumRow : (NumRow - 1)>;
static constexpr int min_dim = (NumRow < NumCol) ? NumRow : NumCol;
static constexpr int col_len = NumCol;
static constexpr int row_len = NumRow;
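A sketch of what loc_type resolves to for the common sizes, following directly from the definition above (illustrative static_asserts, <type_traits> assumed):
static_assert(std::is_same_v<MatBase<float, 4, 4>::loc_type, VecBase<float, 3>>);
static_assert(std::is_same_v<MatBase<float, 3, 3>::loc_type, VecBase<float, 2>>);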
@ -258,11 +257,11 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
return *reinterpret_cast<vec3_type *>(&(*this)[2]);
}
vec3_type &location()
loc_type &location()
{
BLI_STATIC_ASSERT(NumCol >= 4, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 3, "Wrong Matrix dimension");
return *reinterpret_cast<vec3_type *>(&(*this)[3]);
BLI_STATIC_ASSERT(NumCol >= 3, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 2, "Wrong Matrix dimension");
return *reinterpret_cast<loc_type *>(&(*this)[NumCol - 1]);
}
const vec3_type &x_axis() const
@ -286,11 +285,11 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
return *reinterpret_cast<const vec3_type *>(&(*this)[2]);
}
const vec3_type &location() const
const loc_type &location() const
{
BLI_STATIC_ASSERT(NumCol >= 4, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 3, "Wrong Matrix dimension");
return *reinterpret_cast<const vec3_type *>(&(*this)[3]);
BLI_STATIC_ASSERT(NumCol >= 3, "Wrong Matrix dimension");
BLI_STATIC_ASSERT(NumRow >= 2, "Wrong Matrix dimension");
return *reinterpret_cast<const loc_type *>(&(*this)[NumCol - 1]);
}
/** Matrix operators. */
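With the relaxed asserts, the accessor also works for 2D transforms stored in a 3x3 matrix; a brief sketch (aliases assumed):
float3x3 m = float3x3::identity();
m.location() = float2(4.0f, 2.0f); /* writes the first two rows of the last column */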
@ -481,7 +480,7 @@ struct alignas(Alignment) MatBase : public vec_struct_base<VecBase<T, NumRow>, N
{
uint64_t h = 435109;
unroll<NumCol * NumRow>([&](auto i) {
T value = (static_cast<const T *>(this))[i];
T value = (reinterpret_cast<const T *>(this))[i];
h = h * 33 + *reinterpret_cast<const as_uint_type<T> *>(&value);
});
return h;

View File

@ -24,6 +24,35 @@ namespace detail {
template<typename T> struct AxisAngle;
template<typename T> struct Quaternion;
template<typename T> struct AngleRadian {
T value;
AngleRadian() = default;
AngleRadian(const T &radian) : value(radian){};
/** Static functions. */
static AngleRadian identity()
{
return 0;
}
/** Conversions. */
explicit operator T() const
{
return value;
}
/** Operators. */
friend std::ostream &operator<<(std::ostream &stream, const AngleRadian &rot)
{
return stream << "AngleRadian(" << rot.value << ")";
}
};
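A minimal usage sketch for the new type, assuming the blender::AngleRadian float alias declared further down:
AngleRadian angle(0.25f);        /* implicit construction from raw radians */
float raw = float(angle);        /* explicit conversion back: 0.25f */
std::cout << angle << std::endl; /* prints "AngleRadian(0.25)" */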
template<typename T> struct EulerXYZ {
T x, y, z;
@ -234,6 +263,7 @@ template<typename U> struct AssertUnitEpsilon<detail::Quaternion<U>> {
};
/* Most common used types. */
using AngleRadian = math::detail::AngleRadian<float>;
using EulerXYZ = math::detail::EulerXYZ<float>;
using Quaternion = math::detail::Quaternion<float>;
using AxisAngle = math::detail::AxisAngle<float>;

View File

@ -527,7 +527,7 @@ template<typename T> class VArrayCommon {
* Other virtual array implementations are typically stored as #std::shared_ptr. That works even
* when the implementation itself is not copyable and makes copying #VArrayCommon cheaper.
*/
using Storage = Any<detail::VArrayAnyExtraInfo<T>, 24, 8>;
using Storage = Any<blender::detail::VArrayAnyExtraInfo<T>, 24, 8>;
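For context, a sketch of the kind of name collision the explicit blender:: qualification avoids (the exact conflicting namespace is an assumption):
namespace blender::detail { /* VArrayAnyExtraInfo lives here */ }
namespace blender::math::detail { /* rotation/matrix helpers added in this patch */ }
/* From code that can see both, a bare `detail::` may bind to the wrong
 * namespace, so the storage type spells out blender::detail explicitly. */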
/**
* Pointer to the currently contained virtual array implementation. This is allowed to be null.

View File

@ -218,8 +218,6 @@ set(SRC
BLI_fileops.hh
BLI_fileops_types.h
BLI_filereader.h
BLI_float3x3.hh
BLI_float4x4.hh
BLI_fnmatch.h
BLI_function_ref.hh
BLI_generic_array.hh
@ -469,7 +467,6 @@ if(WITH_GTESTS)
tests/BLI_edgehash_test.cc
tests/BLI_expr_pylike_eval_test.cc
tests/BLI_fileops_test.cc
tests/BLI_float3x3_test.cc
tests/BLI_function_ref_test.cc
tests/BLI_generic_array_test.cc
tests/BLI_generic_span_test.cc

Some files were not shown because too many files have changed in this diff.