Sculpt: experiment with lock-free mempools and fine-grained locks.

Joseph Eagar 2021-10-20 14:14:12 -07:00
parent 6ddd95f15a
commit 5de8134abc
15 changed files with 578 additions and 16 deletions

View File

@ -81,6 +81,12 @@ extern short (*MEM_testN)(void *vmemh);
* newly allocated block. */
extern void *(*MEM_dupallocN)(const void *vmemh) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT;
/**
* Duplicates a block of memory, and returns a pointer to the
* newly allocated block. */
extern void *(*MEM_dupallocN_id)(const void *vmemh,
const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT;
/**
* Reallocates a block of memory, and returns pointer to the newly
* allocated block, the old one is freed. this is not as optimized
@ -253,6 +259,8 @@ void MEM_use_lockfree_allocator(void);
* NOTE: The switch between allocator types can only happen before any allocation did happen. */
void MEM_use_guarded_allocator(void);
#define MEM_dupallocN(vmemh) MEM_dupallocN_id(vmemh, __func__)
#ifdef __cplusplus
}
#endif /* __cplusplus */
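
For reference, a minimal usage sketch of the new MEM_dupallocN_id entry point and the MEM_dupallocN convenience macro; the wrapper function and allocation name below are illustrative only, not part of the patch:

/* Illustrative sketch (not part of the commit): duplicate an allocation under
 * an explicit name so it shows up with that label in memory statistics. */
static float *duplicate_weights(const float *weights)
{
  /* Same size and contents as `weights`, tracked under the given string. */
  float *copy = MEM_dupallocN_id(weights, "duplicate_weights");

  /* Equivalent shorthand: the MEM_dupallocN() macro now forwards __func__
   * as the allocation name, i.e. MEM_dupallocN_id(weights, __func__). */
  /* float *copy2 = MEM_dupallocN(weights); */

  return copy;
}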

View File

@ -40,6 +40,7 @@ const char *malloc_conf = "background_thread:true,dirty_decay_ms:4000";
size_t (*MEM_allocN_len)(const void *vmemh) = MEM_lockfree_allocN_len;
void (*MEM_freeN)(void *vmemh) = MEM_lockfree_freeN;
void *(*MEM_dupallocN)(const void *vmemh) = MEM_lockfree_dupallocN;
void *(*MEM_dupallocN_id)(const void *vmemh, const char *str) = MEM_lockfree_dupallocN_id;
void *(*MEM_reallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_reallocN_id;
void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id;
void *(*MEM_callocN)(size_t len, const char *str) = MEM_lockfree_callocN;
@ -121,6 +122,7 @@ void MEM_use_lockfree_allocator(void)
MEM_allocN_len = MEM_lockfree_allocN_len;
MEM_freeN = MEM_lockfree_freeN;
MEM_dupallocN = MEM_lockfree_dupallocN;
MEM_dupallocN_id = MEM_lockfree_dupallocN_id;
MEM_reallocN_id = MEM_lockfree_reallocN_id;
MEM_recallocN_id = MEM_lockfree_recallocN_id;
MEM_callocN = MEM_lockfree_callocN;
@ -152,6 +154,7 @@ void MEM_use_guarded_allocator(void)
MEM_allocN_len = MEM_guarded_allocN_len;
MEM_freeN = MEM_guarded_freeN;
MEM_dupallocN = MEM_guarded_dupallocN;
MEM_dupallocN_id = MEM_guarded_dupallocN_id;
MEM_reallocN_id = MEM_guarded_reallocN_id;
MEM_recallocN_id = MEM_guarded_recallocN_id;
MEM_callocN = MEM_guarded_callocN;

View File

@ -303,6 +303,31 @@ void *MEM_guarded_dupallocN(const void *vmemh)
  return newp;
}

void *MEM_guarded_dupallocN_id(const void *vmemh, const char *str)
{
  void *newp = NULL;

  if (vmemh) {
    const MemHead *memh = vmemh;
    memh--;

    if (LIKELY(memh->alignment == 0)) {
      newp = MEM_guarded_mallocN(memh->len, str);
    }
    else {
      newp = MEM_guarded_mallocN_aligned(memh->len, (size_t)memh->alignment, str);
    }

    if (newp == NULL) {
      return NULL;
    }

    memcpy(newp, vmemh, memh->len);
  }

  return newp;
}
void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *str)
{
void *newp = NULL;
@ -408,7 +433,7 @@ static void print_memhead_backtrace(MemHead *memh)
(void)memh; /* Ignored. */
}
# endif /* defined(__linux__) || defined(__APPLE__) */
#endif /* DEBUG_BACKTRACE */
#endif /* DEBUG_BACKTRACE */
static void make_memhead_header(MemHead *memh, size_t len, const char *str)
{

View File

@ -110,6 +110,8 @@ extern char free_after_leak_detection_message[];
size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
void MEM_lockfree_freeN(void *vmemh);
void *MEM_lockfree_dupallocN(const void *vmemh) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT;
void *MEM_lockfree_dupallocN_id(const void *vmemh,
const char *str) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT;
void *MEM_lockfree_reallocN_id(void *vmemh,
size_t len,
const char *str) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT
@ -153,6 +155,8 @@ const char *MEM_lockfree_name_ptr(void *vmemh);
size_t MEM_guarded_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
void MEM_guarded_freeN(void *vmemh);
void *MEM_guarded_dupallocN(const void *vmemh) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT;
void *MEM_guarded_dupallocN_id(const void *vmemh,
const char *str) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT;
void *MEM_guarded_reallocN_id(void *vmemh,
size_t len,
const char *str) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT

View File

@ -29,8 +29,8 @@
#include "MEM_guardedalloc.h"
/* to ensure strict conversions */
#include "../../source/blender/blenlib/BLI_strict_flags.h"
#include "../../source/blender/blenlib/BLI_asan.h"
#include "../../source/blender/blenlib/BLI_strict_flags.h"
#include "atomic_ops.h"
#include "mallocn_intern.h"
@ -167,6 +167,29 @@ void *MEM_lockfree_dupallocN(const void *vmemh)
  return newp;
}

void *MEM_lockfree_dupallocN_id(const void *vmemh, const char *str)
{
  void *newp = NULL;

  if (vmemh) {
    MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
    const size_t prev_size = MEM_lockfree_allocN_len(vmemh);

    MEM_UNPOISON_MEMHEAD(vmemh);

    if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) {
      MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
      newp = MEM_lockfree_mallocN_aligned(prev_size, (size_t)memh_aligned->alignment, str);
    }
    else {
      newp = MEM_lockfree_mallocN(prev_size, str);
    }

    MEM_POISON_MEMHEAD(vmemh);

    /* Guard against allocation failure before copying. */
    if (newp != NULL) {
      memcpy(newp, vmemh, prev_size);
    }
  }

  return newp;
}
void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *str)
{
void *newp = NULL;

@ -1 +1 @@
Subproject commit 75e46177f36a49ad36b917e641ee1586ddef7092
Subproject commit 80d9e7ee122c626cbbcd1da554683bce79f8d3df

View File

@ -163,6 +163,15 @@ void BKE_curvemapping_copy_data_tag_ex(CurveMapping *target,
}
}
static void *debug_dupalloc_id(const void *mem, const char *id)
{
  const size_t len = MEM_allocN_len(mem);
  void *cpy = MEM_mallocN(len, id);

  memcpy(cpy, mem, len);
  return cpy;
}
void BKE_curvemapping_copy_data(CurveMapping *target, const CurveMapping *cumap)
{
int a;
@ -177,13 +186,14 @@ void BKE_curvemapping_copy_data(CurveMapping *target, const CurveMapping *cumap)
for (a = 0; a < CM_TOT; a++) {
if (cumap->cm[a].curve) {
target->cm[a].curve = MEM_dupallocN(cumap->cm[a].curve);
target->cm[a].curve = debug_dupalloc_id(cumap->cm[a].curve, "curvemapping.curve");
}
if (cumap->cm[a].table) {
target->cm[a].table = MEM_dupallocN(cumap->cm[a].table);
target->cm[a].table = debug_dupalloc_id(cumap->cm[a].table, "curvemapping.table");
}
if (cumap->cm[a].premultable) {
target->cm[a].premultable = MEM_dupallocN(cumap->cm[a].premultable);
target->cm[a].premultable = debug_dupalloc_id(cumap->cm[a].premultable,
"curvemapping.premultable");
}
}
}
@ -191,7 +201,7 @@ void BKE_curvemapping_copy_data(CurveMapping *target, const CurveMapping *cumap)
CurveMapping *BKE_curvemapping_copy(const CurveMapping *cumap)
{
if (cumap) {
CurveMapping *cumapn = MEM_dupallocN(cumap);
CurveMapping *cumapn = debug_dupalloc_id(cumap, "CurveMapping");
BKE_curvemapping_copy_data(cumapn, cumap);
cumapn->flag &= ~CUMA_PART_OF_CACHE;
return cumapn;

View File

@ -34,6 +34,7 @@
#include <stdio.h>
#define DYNTOPO_EDGE_LOCKS
//#define DYNTOPO_REPORT
//#define WITH_ADAPTIVE_CURVATURE
@ -190,6 +191,169 @@ void bmesh_radial_loop_append(BMEdge *e, BMLoop *l);
void bm_kill_only_edge(BMesh *bm, BMEdge *e);
void bm_kill_only_loop(BMesh *bm, BMLoop *l);
void bm_kill_only_face(BMesh *bm, BMFace *f);
static bool bm_elem_is_free(BMElem *elem, int htype);
extern char dyntopop_node_idx_layer_id[];
extern char dyntopop_faces_areas_layer_id[];
#ifdef DYNTOPO_EDGE_LOCKS
char *cdlayer_lock_attr_name = "__bm_lock";
static int cdlayer_lock_begin(PBVH *pbvh, BMesh *bm)
{
  int idx = CustomData_get_named_layer_index(&bm->edata, CD_PROP_INT32, cdlayer_lock_attr_name);

  if (idx == -1) {
    BM_data_layer_add_named(bm, &bm->edata, CD_PROP_INT32, cdlayer_lock_attr_name);
    idx = CustomData_get_named_layer_index(&bm->edata, CD_PROP_INT32, cdlayer_lock_attr_name);

    /* The lock layer lives in edge data, not vertex data. */
    bm->edata.layers[idx].flag |= CD_FLAG_TEMPORARY | CD_FLAG_ELEM_NOCOPY | CD_FLAG_ELEM_NOINTERP;

    /* Adding a layer may shift existing layer offsets; refresh the cached ones. */
    pbvh->cd_vert_node_offset = CustomData_get_named_layer_index(
        &pbvh->bm->vdata, CD_PROP_INT32, dyntopop_node_idx_layer_id);
    pbvh->cd_face_node_offset = CustomData_get_named_layer_index(
        &pbvh->bm->pdata, CD_PROP_INT32, dyntopop_node_idx_layer_id);

    pbvh->cd_vert_node_offset = bm->vdata.layers[pbvh->cd_vert_node_offset].offset;
    pbvh->cd_face_node_offset = bm->pdata.layers[pbvh->cd_face_node_offset].offset;
  }

  return bm->edata.layers[idx].offset;
}
static bool cdlayer_elem_lock(BMElem *elem, int cd_lock, int thread_nr)
{
  thread_nr++;

  int *lock = BM_ELEM_CD_GET_VOID_P(elem, cd_lock);
  int old = *lock;

  if (old == thread_nr) {
    /* This thread already owns the lock. */
    return true;
  }

  int prev;
  while ((prev = atomic_cas_int32(lock, old, thread_nr)) != old) {
    if (elem->head.htype != BM_EDGE) {
      /* Element was freed while spinning. */
      return false;
    }

    /* The lock value changed under us; retry against the value we just read. */
    old = prev;
  }

  return true;
}
static void cdlayer_elem_unlock(BMElem *elem, int cd_lock, int thread_nr)
{
  thread_nr++;

  int *lock = BM_ELEM_CD_GET_VOID_P(elem, cd_lock);
  *lock = 0;
}
static bool cdlayer_lock_edge(BMEdge *e, int cd_lock, int thread_nr)
{
  if (BM_ELEM_CD_GET_INT(e, cd_lock) == thread_nr + 1) {
    return true;
  }

  if (!cdlayer_elem_lock((BMElem *)e, cd_lock, thread_nr)) {
    return false;
  }

  /* Lock the edges of the faces surrounding both endpoints. */
  for (int i = 0; i < 2; i++) {
    BMVert *v = i ? e->v2 : e->v1;
    BMEdge *e2 = v->e;

    do {
      BMLoop *l = e2->l;

      if (!l) {
        /* Wire edge. */
        cdlayer_elem_lock((BMElem *)e2, cd_lock, thread_nr);
        continue;
      }

      do {
        BMLoop *l2 = l;
        do {
          cdlayer_elem_lock((BMElem *)l2->e, cd_lock, thread_nr);
        } while ((l2 = l2->next) != l);
      } while ((l = l->next) != e2->l);
    } while ((e2 = BM_DISK_EDGE_NEXT(e2, v)) != v->e);
  }

  return true;
}
static void cdlayer_unlock_edge(BMEdge *e, int cd_lock, int thread_nr)
{
  /* Nothing to unlock if this thread does not own the edge's lock. */
  if (BM_ELEM_CD_GET_INT(e, cd_lock) != thread_nr + 1) {
    return;
  }

  BMEdge **es = NULL;
  BLI_array_staticdeclare(es, 32);

  const int tag = BM_ELEM_TAG_ALT;

  /* First pass: clear the tag on all edges of the surrounding faces. */
  for (int i = 0; i < 2; i++) {
    BMVert *v = i ? e->v2 : e->v1;
    BMEdge *e2 = v->e;

    do {
      BMLoop *l = e2->l;

      if (!l) {
        BLI_array_append(es, e2);
        continue;
      }

      do {
        BMLoop *l2 = l;
        do {
          l2->e->head.hflag &= ~tag;
        } while ((l2 = l2->next) != l);
      } while ((l = l->next) != e2->l);
    } while ((e2 = BM_DISK_EDGE_NEXT(e2, v)) != v->e);
  }

  /* Second pass: collect each edge once, using the tag to avoid duplicates. */
  for (int i = 0; i < 2; i++) {
    BMVert *v = i ? e->v2 : e->v1;
    BMEdge *e2 = v->e;

    do {
      BMLoop *l = e2->l;

      if (!l) {
        BLI_array_append(es, e2);
        continue;
      }

      do {
        BMLoop *l2 = l;
        do {
          if (!(l2->e->head.hflag & tag)) {
            l2->e->head.hflag |= tag;
            BLI_array_append(es, l2->e);
          }
        } while ((l2 = l2->next) != l);
      } while ((l = l->next) != e2->l);
    } while ((e2 = BM_DISK_EDGE_NEXT(e2, v)) != v->e);
  }

  /* Release every collected edge that is still alive and owned by this thread. */
  for (int i = 0; i < BLI_array_len(es); i++) {
    BMEdge *e2 = es[i];

    if (!bm_elem_is_free((BMElem *)e2, BM_EDGE) &&
        BM_ELEM_CD_GET_INT(e2, cd_lock) == thread_nr + 1) {
      cdlayer_elem_unlock((BMElem *)e2, cd_lock, thread_nr);
    }
  }

  BLI_array_free(es);
}
#endif
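
The functions above form the experimental fine-grained edge locking referenced in the commit title. A hedged sketch of how a worker thread might use them around a topology edit; the wrapper function and its arguments are assumptions for illustration, not part of the patch:

/* Illustrative sketch only: claim the lock region around an edge, edit it,
 * then release. Uses cdlayer_lock_begin()/cdlayer_lock_edge()/
 * cdlayer_unlock_edge() as defined above. */
static void example_locked_edge_edit(PBVH *pbvh, BMesh *bm, BMEdge *e, int thread_nr)
{
  /* Ensure the per-edge integer lock layer exists and get its offset. */
  const int cd_lock = cdlayer_lock_begin(pbvh, bm);

  /* Claim the edge and the edges of its surrounding faces for this thread. */
  if (!cdlayer_lock_edge(e, cd_lock, thread_nr)) {
    return; /* The edge was freed by another thread; skip it. */
  }

  /* ... perform the topology edit on `e` here ... */

  cdlayer_unlock_edge(e, cd_lock, thread_nr);
}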
static void fix_mesh(PBVH *pbvh, BMesh *bm)
{
@ -1749,6 +1913,7 @@ typedef struct EdgeQueueThreadData {
int size;
bool is_collapse;
int seed;
int n;
} EdgeQueueThreadData;
static void edge_thread_data_insert(EdgeQueueThreadData *tdata, BMEdge *e)
@ -3537,8 +3702,10 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
return NULL;
}
#ifndef DYNTOPO_EDGE_LOCKS
pbvh_check_vert_boundary(pbvh, v1);
pbvh_check_vert_boundary(pbvh, v2);
#endif
const int mupdateflag = SCULPTVERT_NEED_VALENCE | SCULPTVERT_NEED_BOUNDARY |
SCULPTVERT_NEED_DISK_SORT;
@ -3546,8 +3713,10 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
validate_edge(pbvh, pbvh->bm, e, true, true);
#ifndef DYNTOPO_EDGE_LOCKS
check_vert_fan_are_tris(pbvh, e->v1);
check_vert_fan_are_tris(pbvh, e->v2);
#endif
MSculptVert *mv1 = BKE_PBVH_SCULPTVERT(pbvh->cd_sculpt_vert, v1);
MSculptVert *mv2 = BKE_PBVH_SCULPTVERT(pbvh->cd_sculpt_vert, v2);
@ -3785,7 +3954,9 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
BMLoop *l = e2->l;
if (e2 != e && !(e2->head.hflag & tag)) {
#ifndef DYNTOPO_EDGE_LOCKS
BM_log_edge_topo_pre(pbvh->bm_log, e2);
#endif
}
e2->head.hflag |= tag;
@ -3797,7 +3968,9 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
do {
if (BM_ELEM_CD_GET_INT(l->f, pbvh->cd_face_node_offset) != DYNTOPO_NODE_NONE) {
pbvh_bmesh_face_remove(pbvh, l->f, false, false, false);
#ifndef DYNTOPO_EDGE_LOCKS
BM_log_face_topo_pre(pbvh->bm_log, l->f);
#endif
}
} while ((l = l->radial_next) != e2->l);
} while ((e2 = BM_DISK_EDGE_NEXT(e2, v_step)) != v_step->e);
@ -3805,8 +3978,10 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
pbvh_bmesh_vert_remove(pbvh, v_del);
#ifndef DYNTOPO_EDGE_LOCKS
BM_log_edge_topo_pre(pbvh->bm_log, e);
BM_log_vert_removed(pbvh->bm_log, v_del, pbvh->cd_vert_mask_offset);
#endif
BLI_ghash_insert(deleted_verts, (void *)v_del, NULL);
@ -3931,7 +4106,9 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
if (e2->head.hflag & tag) {
e2->head.hflag &= ~tag;
#ifndef DYNTOPO_EDGE_LOCKS
BM_log_edge_topo_post(pbvh->bm_log, e2);
#endif
}
BMLoop *lnext;
@ -3962,7 +4139,9 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
if (!fbad && BM_ELEM_CD_GET_INT(l->f, pbvh->cd_face_node_offset) == DYNTOPO_NODE_NONE) {
BKE_pbvh_bmesh_add_face(pbvh, l->f, false, false);
#ifndef DYNTOPO_EDGE_LOCKS
BM_log_face_topo_post(pbvh->bm_log, l->f);
#endif
}
if (!lnext) {
@ -4007,6 +4186,20 @@ static BMVert *pbvh_bmesh_collapse_edge(PBVH *pbvh,
return v_conn;
}
#ifdef DYNTOPO_EDGE_LOCKS
static void pbvh_bmesh_collapse_short_edges_cb(void *__restrict userdata,
                                               const int n,
                                               const TaskParallelTLS *__restrict UNUSED(tls))
{
  EdgeQueueThreadData *tdata = ((EdgeQueueThreadData *)userdata) + n;
  int thread_nr = n;

  for (int i = 0; i < tdata->totedge; i++) {
    BMEdge *e = tdata->edges[i];

    /* Experimental stub: per-thread processing of the queued edges is not
     * implemented yet. */
    (void)e;
    (void)thread_nr;
  }
}
#endif
static bool pbvh_bmesh_collapse_short_edges(EdgeQueueContext *eq_ctx,
PBVH *pbvh,
BLI_Buffer *deleted_faces,
@ -4035,6 +4228,23 @@ static bool pbvh_bmesh_collapse_short_edges(EdgeQueueContext *eq_ctx,
BMVert **checkvs = NULL;
BLI_array_declare(checkvs);
#ifdef DYNTOPO_EDGE_LOCKS
  const int totthread = 8;

  /* One EdgeQueueThreadData per worker thread. */
  EdgeQueueThreadData *tdata = MEM_callocN(sizeof(EdgeQueueThreadData) * totthread,
                                           "EdgeQueueThreadData");
  int totedge = max_steps / totthread + 1;
  int curthread = 0;

  if (totedge * totthread < max_steps) {
    totedge += ((totedge * totthread) % max_steps) + 100;
  }

  for (int i = 0; i < totthread; i++) {
    tdata[i].edges = MEM_mallocN(sizeof(void *) * totedge, "edge queue thread data edges");
  }
#endif
while (!BLI_heapsimple_is_empty(eq_ctx->q->heap)) {
if (step++ > max_steps) {
break;
@ -4085,9 +4295,14 @@ static bool pbvh_bmesh_collapse_short_edges(EdgeQueueContext *eq_ctx,
continue;
}
#ifdef USE_EDGEQUEUE_TAG
#ifdef DYNTOPO_EDGE_LOCKS
tdata[curthread].edges[tdata[curthread].totedge++] = e;
curthread = (curthread + 1) % totthread;
#else
# ifdef USE_EDGEQUEUE_TAG
EDGE_QUEUE_DISABLE(e);
#endif
# endif
if (calc_weighted_edge_collapse(eq_ctx, v1, v2) >= limit_len_squared) {
continue;
@ -4112,13 +4327,25 @@ static bool pbvh_bmesh_collapse_short_edges(EdgeQueueContext *eq_ctx,
BLI_array_append(checkvs, v_conn);
}
#ifdef TEST_COLLAPSE
# ifdef TEST_COLLAPSE
if (_i++ > 10) {
break;
}
# endif
#endif
}
  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);
  BLI_task_parallel_range(0, totthread, tdata, pbvh_bmesh_collapse_short_edges_cb, &settings);

  for (int i = 0; i < totthread; i++) {
    MEM_SAFE_FREE(tdata[i].edges);
  }

  MEM_SAFE_FREE(tdata);
// add log subentry
BM_log_entry_add_ex(pbvh->bm, pbvh->bm_log, true);
@ -5660,9 +5887,6 @@ static void pbvh_split_edges(EdgeQueueContext *eq_ctx,
#endif
}
extern char dyntopop_node_idx_layer_id[];
extern char dyntopop_faces_areas_layer_id[];
typedef struct DynTopoState {
PBVH *pbvh;
bool is_fake_pbvh;

View File

@ -0,0 +1,13 @@
#pragma once
/* Opaque pool handle; the implementation lives in intern/lockfree_mempool.cc. */
typedef struct BLI_lfmempool BLI_lfmempool;

typedef struct BLI_lfmempool_iter {
  void *chunk;
  BLI_lfmempool *pool;
  int i;
} BLI_lfmempool_iter;

BLI_lfmempool *BLI_lfmempool_create(int esize, int psize);
void BLI_lfmempool_destroy(BLI_lfmempool *pool);
void *BLI_lfmempool_alloc(BLI_lfmempool *pool);
void BLI_lfmempool_free(BLI_lfmempool *pool, void *mem);
void BLI_lfmempool_iternew(BLI_lfmempool *_pool, BLI_lfmempool_iter *iter);
void *BLI_lfmempool_iterstep(BLI_lfmempool_iter *iter);
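
A minimal usage sketch of the new lock-free pool API declared above; the element size and chunk size values are illustrative:

/* Illustrative sketch (not part of the commit): create a pool of 3D
 * coordinates, allocate one element, release it, and destroy the pool. */
static void example_lfmempool_usage(void)
{
  BLI_lfmempool *pool = BLI_lfmempool_create((int)sizeof(float[3]), 1024);

  float *co = BLI_lfmempool_alloc(pool);
  co[0] = co[1] = co[2] = 0.0f;

  BLI_lfmempool_free(pool, co);
  BLI_lfmempool_destroy(pool);
}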

View File

@ -90,6 +90,7 @@ set(SRC
intern/kdtree_2d.c
intern/kdtree_3d.c
intern/kdtree_4d.c
intern/lockfree_mempool.cc
intern/lasso_2d.c
intern/listbase.c
intern/math_base.c

View File

@ -0,0 +1,241 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <atomic>
#include "MEM_guardedalloc.h"
/* NOTE: copied from BLO_blend_defs.h, don't use here because we're in BLI. */
#ifdef __BIG_ENDIAN__
/* Big Endian */
# define MAKE_ID(a, b, c, d) ((int)(a) << 24 | (int)(b) << 16 | (c) << 8 | (d))
# define MAKE_ID_8(a, b, c, d, e, f, g, h) \
((int64_t)(a) << 56 | (int64_t)(b) << 48 | (int64_t)(c) << 40 | (int64_t)(d) << 32 | \
(int64_t)(e) << 24 | (int64_t)(f) << 16 | (int64_t)(g) << 8 | (h))
#else
/* Little Endian */
# define MAKE_ID(a, b, c, d) ((int)(d) << 24 | (int)(c) << 16 | (b) << 8 | (a))
# define MAKE_ID_8(a, b, c, d, e, f, g, h) \
((int64_t)(h) << 56 | (int64_t)(g) << 48 | (int64_t)(f) << 40 | (int64_t)(e) << 32 | \
(int64_t)(d) << 24 | (int64_t)(c) << 16 | (int64_t)(b) << 8 | (a))
#endif
/**
* Important that this value is _not_ aligned with `sizeof(void *)`.
* So having a pointer to 2/4/8... aligned memory is enough to ensure
* the `freeword` will never be used.
* To be safe, use a word that's the same in both directions.
*/
#define FREEWORD \
((sizeof(void *) > sizeof(int32_t)) ? MAKE_ID_8('e', 'e', 'r', 'f', 'f', 'r', 'e', 'e') : \
MAKE_ID('e', 'f', 'f', 'e'))
/**
* The 'used' word just needs to be set to something besides FREEWORD.
*/
#define USEDWORD MAKE_ID('u', 's', 'e', 'd')
typedef struct BLI_lfmempool BLI_lfmempool;
namespace blender {
struct LockFreeElem {
  struct LockFreeElem *next;
  uintptr_t freeword;
};

struct LockFreeChunk {
  struct LockFreeChunk *next, *prev;

  /* Element data follows the chunk header; we are conveniently aligned to
   * 16 bytes here. */
};
static void *data_from_chunk(LockFreeChunk *chunk)
{
  return reinterpret_cast<void *>(chunk + 1);
}

static LockFreeElem *elem_from_chunk(LockFreeChunk *chunk)
{
  return reinterpret_cast<LockFreeElem *>(data_from_chunk(chunk));
}

static LockFreeElem *elem_next(LockFreeElem *elem, int esize)
{
  char *ptr = reinterpret_cast<char *>(elem);
  ptr += esize;

  return reinterpret_cast<LockFreeElem *>(ptr);
}

static_assert(sizeof(std::atomic<void *>) == sizeof(void *), "std::atomic has space overhead!");
struct LockFreePool {
  struct {
    std::atomic<LockFreeChunk *> first;
    std::atomic<LockFreeChunk *> last;
  } chunks;

  std::atomic<int> totchunk;
  std::atomic<int> totused;

  std::atomic<LockFreeElem *> free_elem;

  int esize, psize, csize;

  LockFreePool(int esize, int psize) : psize(psize)
  {
    /* Every element must have room for the freelist link and the freeword. */
    this->esize = std::max(esize, (int)(sizeof(void *) * 2));
    csize = this->esize * psize + (int)sizeof(LockFreeChunk);

    chunks.first.store(nullptr);
    chunks.last.store(nullptr);
    free_elem.store(nullptr);
    totchunk.store(0);
    totused.store(0);
  }

  ~LockFreePool()
  {
    LockFreeChunk *chunk, *next;

    for (chunk = chunks.first; chunk; chunk = next) {
      next = chunk->next;
      MEM_freeN(chunk);
    }
  }

  void add_chunk()
  {
    /* Allocate the chunk header plus room for psize elements of esize bytes. */
    LockFreeChunk *chunk = static_cast<LockFreeChunk *>(
        MEM_mallocN((size_t)csize, "BLI_lfmempool chunk"));

    chunk->next = chunk->prev = nullptr;

    /* Chain the new elements together and mark them as free. */
    LockFreeElem *first = elem_from_chunk(chunk);
    LockFreeElem *elem = first, *last = first;

    for (int i = 0; i < psize; i++, elem = elem_next(elem, esize)) {
      elem->next = (i == psize - 1) ? nullptr : elem_next(elem, esize);
      elem->freeword = FREEWORD;

      if (i == psize - 1) {
        last = elem;
      }
    }

    /* Atomically push the whole chain onto the free list:
     *   last->next = free_elem;
     *   free_elem = first;
     */
    while (1) {
      last->next = free_elem.load();
      if (free_elem.compare_exchange_strong(last->next, first)) {
        break;
      }
    }

    /* Append the chunk to the chunk list. */
    while (1) {
      chunk->prev = chunks.last.load();
      if (chunks.last.compare_exchange_strong(chunk->prev, chunk)) {
        if (chunk->prev) {
          /* Forward-link so the destructor can walk from chunks.first. */
          chunk->prev->next = chunk;
        }
        else {
          /* chunks.first is not accessed in threading cases, only when pool
           * is destroyed. */
          chunks.first.store(chunk);
        }
        break;
      }
    }
  }

  void *alloc()
  {
    /* Simple lock-free freelist pop; note this is not protected against the
     * ABA problem under heavy contention. */
    while (1) {
      if (!free_elem.load()) {
        add_chunk();
      }

      LockFreeElem *cur = free_elem.load();

      if (!cur) {
        /* Another thread emptied the list between the check and the load. */
        continue;
      }

      if (free_elem.compare_exchange_strong(cur, cur->next)) {
        cur->freeword = 0;
        return reinterpret_cast<void *>(cur);
      }
    }
  }

  void free(void *mem)
  {
    LockFreeElem *elem = reinterpret_cast<LockFreeElem *>(mem);

    elem->freeword = FREEWORD;
    elem->next = free_elem.load();

    while (!free_elem.compare_exchange_strong(elem->next, elem)) {
      /* elem->next now holds the current head; retry. */
    }
  }
};
static LockFreePool *cast_pool(BLI_lfmempool *pool)
{
  return reinterpret_cast<LockFreePool *>(pool);
}

extern "C" {

BLI_lfmempool *BLI_lfmempool_create(int esize, int psize)
{
  LockFreePool *pool = OBJECT_GUARDED_NEW(LockFreePool, esize, psize);

  return reinterpret_cast<BLI_lfmempool *>(pool);
}

typedef struct BLI_lfmempool_iter {
  void *chunk;
  BLI_lfmempool *pool;
  int i;
} BLI_lfmempool_iter;

void BLI_lfmempool_destroy(BLI_lfmempool *pool)
{
  OBJECT_GUARDED_DELETE(cast_pool(pool), LockFreePool);
}

void *BLI_lfmempool_alloc(BLI_lfmempool *pool)
{
  return cast_pool(pool)->alloc();
}

void BLI_lfmempool_free(BLI_lfmempool *pool, void *mem)
{
  cast_pool(pool)->free(mem);
}

void BLI_lfmempool_iternew(BLI_lfmempool *_pool, BLI_lfmempool_iter *iter)
{
  LockFreePool *pool = cast_pool(_pool);

  iter->pool = _pool;
  iter->chunk = reinterpret_cast<void *>(pool->chunks.first.load());
  iter->i = 0;
}

void *BLI_lfmempool_iterstep(BLI_lfmempool_iter *iter)
{
  if (!iter->chunk) {
    return nullptr;
  }

  LockFreePool *pool = cast_pool(iter->pool);
  LockFreeChunk *chunk = reinterpret_cast<LockFreeChunk *>(iter->chunk);

  char *data = reinterpret_cast<char *>(data_from_chunk(chunk));
  void *ret = reinterpret_cast<void *>(data + pool->esize * iter->i);

  iter->i++;

  if (iter->i >= pool->psize) {
    iter->i = 0;
    iter->chunk = reinterpret_cast<void *>(chunk->next);
  }

  LockFreeElem *elem = reinterpret_cast<LockFreeElem *>(ret);

  if (elem->freeword == FREEWORD) {
    return BLI_lfmempool_iterstep(iter);
  }

  return ret;
}
}
} // namespace blender
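
To round out the API, a hedged sketch of iterating a pool from C; BLI_lfmempool_iterstep() internally skips slots whose freeword marks them as free. The callback parameter is an assumption for illustration, not part of the patch:

/* Illustrative sketch only: visit every live element in a pool. */
static void example_lfmempool_iterate(BLI_lfmempool *pool, void (*visit)(void *elem))
{
  BLI_lfmempool_iter iter;
  BLI_lfmempool_iternew(pool, &iter);

  void *elem;
  while ((elem = BLI_lfmempool_iterstep(&iter))) {
    /* `elem` is a live allocation; freed slots are skipped internally. */
    visit(elem);
  }
}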

View File

@ -32,6 +32,7 @@
* Usage: msgfmt input.po output.po
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

View File

@ -746,7 +746,7 @@ static void sculpt_gesture_face_set_begin(bContext *C, SculptGestureContext *sgc
SCULPT_undo_push_node(sgcontext->vc.obact, NULL, SCULPT_UNDO_FACE_SETS);
}
static void face_set_gesture_apply_task_cb(void *__restrict userdata,
static void (void *__restrict userdata,
const int i,
const TaskParallelTLS *__restrict UNUSED(tls))
{

View File

@ -5059,6 +5059,7 @@ static void sculpt_topology_update(Sculpt *sd,
SCULPT_get_int(ss, dyntopo_disable_smooth, sd, brush));
SCULPT_dyntopo_automasking_end(mask_cb_data);
SCULPT_update_customdata_refs(ss);
/* Update average stroke position. */
copy_v3_v3(location, ss->cache->true_location);

View File

@ -587,6 +587,7 @@ int SCULPT_dyntopo_get_templayer(SculptSession *ss, int type, const char *name)
}
char dyntopop_faces_areas_layer_id[] = "__dyntopo_face_areas";
extern char *cdlayer_lock_attr_name;
void SCULPT_dyntopo_node_layers_add(SculptSession *ss)
{
@ -600,15 +601,22 @@ void SCULPT_dyntopo_node_layers_add(SculptSession *ss)
{CD_DYNTOPO_VERT, NULL, CD_FLAG_TEMPORARY | CD_FLAG_NOCOPY},
{CD_PROP_INT32, dyntopop_node_idx_layer_id, CD_FLAG_TEMPORARY | CD_FLAG_NOCOPY}};
BM_data_layers_ensure(ss->bm, &ss->bm->vdata, vlayers, 3);
BM_data_layers_ensure(ss->bm, &ss->bm->vdata, vlayers, ARRAY_SIZE(vlayers));
ss->cd_vert_mask_offset = CustomData_get_offset(&ss->bm->vdata, CD_PAINT_MASK);
  BMCustomLayerReq elayers[] = {
      {CD_PROP_INT32,
       cdlayer_lock_attr_name,
       CD_FLAG_TEMPORARY | CD_FLAG_ELEM_NOCOPY | CD_FLAG_ELEM_NOINTERP},
  };

  BM_data_layers_ensure(ss->bm, &ss->bm->edata, elayers, 1);
BMCustomLayerReq flayers[] = {
{CD_PROP_INT32, dyntopop_node_idx_layer_id, CD_FLAG_TEMPORARY | CD_FLAG_NOCOPY},
{CD_PROP_FLOAT, dyntopop_faces_areas_layer_id, CD_FLAG_TEMPORARY | CD_FLAG_NOCOPY},
};
BM_data_layers_ensure(ss->bm, &ss->bm->pdata, flayers, 2);
BM_data_layers_ensure(ss->bm, &ss->bm->pdata, flayers, ARRAY_SIZE(flayers));
// get indices again, as they might have changed after adding new layers
cd_node_layer_index = CustomData_get_named_layer_index(