Allocator: improve multi-threaded allocation performance

Both the guarded and the lockfree allocator keep track of the current and peak
memory usage. Even the lockfree allocator used a single global atomic variable
for the memory usage. When multiple threads use the allocator at the same time,
this variable is highly contended, which can result in significant slowdowns,
as shown in D16862.

While specific cases could always be optimized by reducing the number
of allocations, having this synchronization point in functions used by
almost every part of Blender is not great.

The solution is to use thread-local memory counters which are only added
together when the total memory usage is actually requested. For more details,
see the in-code comments and D16862.
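
Below is a minimal, hypothetical sketch of the idea, not the actual
implementation: names like ThreadCounter, track_alloc and total_memory_in_use
are made up for illustration, and unregistering a counter when its thread exits
is omitted (the real code handles that with global fallback counters).

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <mutex>
    #include <vector>

    /* One counter per thread, padded to a cache line to avoid false sharing. */
    struct alignas(64) ThreadCounter {
      std::atomic<int64_t> bytes{0};

      ThreadCounter()
      {
        /* Register this thread's counter so that totals can be computed later. */
        std::lock_guard lock{mutex()};
        all().push_back(this);
      }

      static std::mutex &mutex()
      {
        static std::mutex m;
        return m;
      }

      static std::vector<ThreadCounter *> &all()
      {
        static std::vector<ThreadCounter *> v;
        return v;
      }
    };

    static ThreadCounter &local_counter()
    {
      static thread_local ThreadCounter counter;
      return counter;
    }

    void track_alloc(const size_t size)
    {
      /* Uncontended in the common case: only this thread writes to its counter. */
      local_counter().bytes.fetch_add(int64_t(size), std::memory_order_relaxed);
    }

    void track_free(const size_t size)
    {
      local_counter().bytes.fetch_sub(int64_t(size), std::memory_order_relaxed);
    }

    int64_t total_memory_in_use()
    {
      /* Only this (comparatively rare) query touches every thread's counter. */
      std::lock_guard lock{ThreadCounter::mutex()};
      int64_t total = 0;
      for (const ThreadCounter *counter : ThreadCounter::all()) {
        total += counter->bytes.load(std::memory_order_relaxed);
      }
      return total;
    }

The important property is that an allocation or free only touches a cache line
owned by the current thread, while the comparatively rare query for the total
walks all registered counters.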

Differential Revision: https://developer.blender.org/D16862
Jacques Lucke 2023-01-04 14:55:21 +01:00
parent 14d7cd46ba
commit 78f28b55d3
Notes: blender-bot 2023-02-14 10:29:32 +01:00
Referenced by commit 862d08bb97, Fix: use-after-free when threadlocal is destructed after static variable
7 changed files with 287 additions and 36 deletions


@@ -20,6 +20,7 @@ set(SRC
./intern/mallocn.c
./intern/mallocn_guarded_impl.c
./intern/mallocn_lockfree_impl.c
./intern/memory_usage.cc
MEM_guardedalloc.h
./intern/mallocn_inline.h


@@ -53,6 +53,9 @@ class MemLeakPrinter {
void MEM_init_memleak_detection()
{
/* Calling this ensures that the memory usage counters outlive the memory leak detection. */
memory_usage_init();
/**
* This variable is constructed when this function is first called. This should happen as soon as
* possible when the program starts.


@@ -89,6 +89,14 @@ void aligned_free(void *ptr);
extern bool leak_detector_has_run;
extern char free_after_leak_detection_message[];
void memory_usage_init(void);
void memory_usage_block_alloc(size_t size);
void memory_usage_block_free(size_t size);
size_t memory_usage_block_num(void);
size_t memory_usage_current(void);
size_t memory_usage_peak(void);
void memory_usage_peak_reset(void);
/* Prototypes for counted allocator functions */
size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
void MEM_lockfree_freeN(void *vmemh);


@@ -30,8 +30,6 @@ typedef struct MemHeadAligned {
size_t len;
} MemHeadAligned;
static unsigned int totblock = 0;
static size_t mem_in_use = 0, peak_mem = 0;
static bool malloc_debug_memset = false;
static void (*error_callback)(const char *) = NULL;
@@ -46,18 +44,6 @@ enum {
#define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t)MEMHEAD_ALIGN_FLAG)
#define MEMHEAD_LEN(memhead) ((memhead)->len & ~((size_t)(MEMHEAD_ALIGN_FLAG)))
/* Uncomment this to have proper peak counter. */
#define USE_ATOMIC_MAX
MEM_INLINE void update_maximum(size_t *maximum_value, size_t value)
{
#ifdef USE_ATOMIC_MAX
atomic_fetch_and_update_max_z(maximum_value, value);
#else
*maximum_value = value > *maximum_value ? value : *maximum_value;
#endif
}
#ifdef __GNUC__
__attribute__((format(printf, 1, 2)))
#endif
@@ -103,8 +89,7 @@ void MEM_lockfree_freeN(void *vmemh)
MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
size_t len = MEMHEAD_LEN(memh);
atomic_sub_and_fetch_u(&totblock, 1);
atomic_sub_and_fetch_z(&mem_in_use, len);
memory_usage_block_free(len);
if (UNLIKELY(malloc_debug_memset && len)) {
memset(memh + 1, 255, len);
@@ -224,16 +209,14 @@ void *MEM_lockfree_callocN(size_t len, const char *str)
if (LIKELY(memh)) {
memh->len = len;
atomic_add_and_fetch_u(&totblock, 1);
atomic_add_and_fetch_z(&mem_in_use, len);
update_maximum(&peak_mem, mem_in_use);
memory_usage_block_alloc(len);
return PTR_FROM_MEMHEAD(memh);
}
print_error("Calloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
SIZET_ARG(len),
str,
(uint)mem_in_use);
(uint)memory_usage_current());
return NULL;
}
@@ -247,7 +230,7 @@ void *MEM_lockfree_calloc_arrayN(size_t len, size_t size, const char *str)
SIZET_ARG(len),
SIZET_ARG(size),
str,
(unsigned int)mem_in_use);
(unsigned int)memory_usage_current());
abort();
return NULL;
}
@@ -269,16 +252,14 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
}
memh->len = len;
atomic_add_and_fetch_u(&totblock, 1);
atomic_add_and_fetch_z(&mem_in_use, len);
update_maximum(&peak_mem, mem_in_use);
memory_usage_block_alloc(len);
return PTR_FROM_MEMHEAD(memh);
}
print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
SIZET_ARG(len),
str,
(uint)mem_in_use);
(uint)memory_usage_current());
return NULL;
}
@@ -292,7 +273,7 @@ void *MEM_lockfree_malloc_arrayN(size_t len, size_t size, const char *str)
SIZET_ARG(len),
SIZET_ARG(size),
str,
(uint)mem_in_use);
(uint)memory_usage_current());
abort();
return NULL;
}
@@ -340,16 +321,14 @@ void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str
memh->len = len | (size_t)MEMHEAD_ALIGN_FLAG;
memh->alignment = (short)alignment;
atomic_add_and_fetch_u(&totblock, 1);
atomic_add_and_fetch_z(&mem_in_use, len);
update_maximum(&peak_mem, mem_in_use);
memory_usage_block_alloc(len);
return PTR_FROM_MEMHEAD(memh);
}
print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
SIZET_ARG(len),
str,
(uint)mem_in_use);
(uint)memory_usage_current());
return NULL;
}
@@ -369,8 +348,8 @@ void MEM_lockfree_callbackmemlist(void (*func)(void *))
void MEM_lockfree_printmemlist_stats(void)
{
printf("\ntotal memory len: %.3f MB\n", (double)mem_in_use / (double)(1024 * 1024));
printf("peak memory len: %.3f MB\n", (double)peak_mem / (double)(1024 * 1024));
printf("\ntotal memory len: %.3f MB\n", (double)memory_usage_current() / (double)(1024 * 1024));
printf("peak memory len: %.3f MB\n", (double)memory_usage_peak() / (double)(1024 * 1024));
printf(
"\nFor more detailed per-block statistics run Blender with memory debugging command line "
"argument.\n");
@@ -398,23 +377,23 @@ void MEM_lockfree_set_memory_debug(void)
size_t MEM_lockfree_get_memory_in_use(void)
{
return mem_in_use;
return memory_usage_current();
}
uint MEM_lockfree_get_memory_blocks_in_use(void)
{
return totblock;
return (uint)memory_usage_block_num();
}
/* dummy */
void MEM_lockfree_reset_peak_memory(void)
{
peak_mem = mem_in_use;
memory_usage_peak_reset();
}
size_t MEM_lockfree_get_peak_memory(void)
{
return peak_mem;
return memory_usage_peak();
}
#ifndef NDEBUG


@@ -0,0 +1,258 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <algorithm>
#include <atomic>
#include <cassert>
#include <iostream>
#include <mutex>
#include <vector>
#include "MEM_guardedalloc.h"
#include "mallocn_intern.h"
#include "../../source/blender/blenlib/BLI_strict_flags.h"
namespace {
/**
* This is stored per thread. Align to cache line size to avoid false sharing.
*/
struct alignas(64) Local {
/** Helps to find bugs during program shutdown. */
bool destructed = false;
/**
* This is the first #Local to be created, and it lives on the main thread. When the main
* thread's local data is destructed, we know that Blender is quitting and that we can no
* longer rely on thread locals still being available.
*/
bool is_main = false;
/**
* Number of bytes. This can be negative when e.g. one thread allocates a lot of memory, and
* another frees it. It has to be an atomic, because it may be accessed by other threads when the
* total memory usage is counted.
*/
std::atomic<int64_t> mem_in_use = 0;
/**
* Number of allocated blocks. Can be negative and is atomic for the same reason as above.
*/
std::atomic<int64_t> blocks_num = 0;
/**
* Amount of memory used when the peak was last updated. This is used so that we don't have to
* update the peak memory usage after every memory allocation. Instead it's only updated when "a
* lot" of new memory has been allocated. This makes the peak memory usage a little bit less
* accurate, but it's still good enough for practical purposes.
*/
std::atomic<int64_t> mem_in_use_during_peak_update = 0;
Local();
~Local();
};
/**
* This is a singleton that stores global data.
*/
struct Global {
/**
* Mutex that protects the vector below.
*/
std::mutex locals_mutex;
/**
* All currently constructed #Local. This must only be accessed when the mutex above is
* locked. Individual threads insert and remove themselves here.
*/
std::vector<Local *> locals;
/**
* Number of bytes that are not tracked by #Local. This is necessary because when a thread exits,
* its #Local data is freed. The memory counts stored there would be lost. The memory counts may
* be non-zero during thread destruction, if the thread did an unequal number of allocations and
* frees (which is perfectly valid behavior as long as other threads have the responsibility to
* free any memory that the thread allocated).
*
* To solve this, the memory counts are added to these global counters when the thread
* exits. The global counters are also used when the entire process starts to exit, because the
* #Local data of the main thread is already destructed when the leak detection happens (during
* destruction of static variables which happens after destruction of threadlocals).
*/
std::atomic<int64_t> mem_in_use_outside_locals = 0;
/**
* Number of blocks that are not tracked by #Local, for the same reason as above.
*/
std::atomic<int64_t> blocks_num_outside_locals = 0;
/**
* Peak memory usage since the last reset.
*/
std::atomic<size_t> peak = 0;
};
} // namespace
/**
* This is true for most of the lifetime of the program. Only when the program starts exiting does
* this become false, indicating that the global counters should be used for correctness.
*/
static std::atomic<bool> use_local_counters = true;
/**
* When a thread has allocated this amount of memory, the peak memory usage is updated. An alternative
* would be to update the global peak memory after every allocation, but that would cause much more
* overhead with little benefit.
*/
static constexpr int64_t peak_update_threshold = 1024 * 1024;
static Global &get_global()
{
static Global global;
return global;
}
static Local &get_local_data()
{
static thread_local Local local;
assert(!local.destructed);
return local;
}
Local::Local()
{
Global &global = get_global();
std::lock_guard lock{global.locals_mutex};
if (global.locals.empty()) {
/* This is the first thread creating #Local; it is therefore the main thread, because it's
* created through #memory_usage_init. */
this->is_main = true;
}
/* Register self in the global list. */
global.locals.push_back(this);
}
Local::~Local()
{
Global &global = get_global();
std::lock_guard lock{global.locals_mutex};
/* Unregister self from the global list. */
global.locals.erase(std::find(global.locals.begin(), global.locals.end(), this));
/* Don't forget the memory counts stored locally. */
global.blocks_num_outside_locals.fetch_add(this->blocks_num, std::memory_order_relaxed);
global.mem_in_use_outside_locals.fetch_add(this->mem_in_use, std::memory_order_relaxed);
if (this->is_main) {
/* The main thread started shutting down. Use global counters from now on to avoid accessing
* threadlocals after they have been destructed. */
use_local_counters.store(false, std::memory_order_relaxed);
}
/* Helps to detect when thread locals are accidentally accessed after destruction. */
this->destructed = true;
}
/** Check if the current memory usage is higher than the peak and update it if yes. */
static void update_global_peak()
{
Global &global = get_global();
/* Update peak. */
global.peak = std::max<size_t>(global.peak, memory_usage_current());
std::lock_guard lock{global.locals_mutex};
for (Local *local : global.locals) {
assert(!local->destructed);
/* Updating this makes sure that the peak is not updated too often, which would degrade
* performance. */
local->mem_in_use_during_peak_update = local->mem_in_use.load(std::memory_order_relaxed);
}
}
void memory_usage_init()
{
/* Makes sure that the static and threadlocal variables on the main thread are initialized. */
get_local_data();
}
void memory_usage_block_alloc(const size_t size)
{
if (LIKELY(use_local_counters.load(std::memory_order_relaxed))) {
Local &local = get_local_data();
/* Increase local memory counts. This does not cause thread synchronization in the majority of
* cases, because each thread has these counters on a separate cache line. It may only cause
* synchronization if another thread is computing the total current memory usage at the same
* time, which is very rare compared to doing allocations. */
local.blocks_num.fetch_add(1, std::memory_order_relaxed);
local.mem_in_use.fetch_add(int64_t(size), std::memory_order_relaxed);
/* If a certain amount of new memory has been allocated, update the peak. */
if (local.mem_in_use - local.mem_in_use_during_peak_update > peak_update_threshold) {
update_global_peak();
}
}
else {
Global &global = get_global();
/* Increase global memory counts. */
global.blocks_num_outside_locals.fetch_add(1, std::memory_order_relaxed);
global.mem_in_use_outside_locals.fetch_add(int64_t(size), std::memory_order_relaxed);
}
}
void memory_usage_block_free(const size_t size)
{
if (LIKELY(use_local_counters)) {
/* Decrease local memory counts. See comment in #memory_usage_block_alloc for details regarding
* thread synchronization. */
Local &local = get_local_data();
local.mem_in_use.fetch_sub(int64_t(size), std::memory_order_relaxed);
local.blocks_num.fetch_sub(1, std::memory_order_relaxed);
}
else {
Global &global = get_global();
/* Decrease global memory counts. */
global.blocks_num_outside_locals.fetch_sub(1, std::memory_order_relaxed);
global.mem_in_use_outside_locals.fetch_sub(int64_t(size), std::memory_order_relaxed);
}
}
size_t memory_usage_block_num()
{
Global &global = get_global();
std::lock_guard lock{global.locals_mutex};
/* Count the number of active blocks. */
int64_t blocks_num = global.blocks_num_outside_locals;
for (Local *local : global.locals) {
blocks_num += local->blocks_num;
}
return size_t(blocks_num);
}
size_t memory_usage_current()
{
Global &global = get_global();
std::lock_guard lock{global.locals_mutex};
/* Count the memory that's currently in use. */
int64_t mem_in_use = global.mem_in_use_outside_locals;
for (Local *local : global.locals) {
mem_in_use += local->mem_in_use;
}
return size_t(mem_in_use);
}
/**
* Get the approximate peak memory usage since the last call to #memory_usage_peak_reset.
* This is approximate, because the peak usage is not updated after every allocation (see
* #peak_update_threshold).
*
* In the worst case, the peak memory usage is underestimated by
* `peak_update_threshold * #threads`. After large allocations (larger than the threshold), the
* peak usage is always updated so those allocations will always be taken into account.
*/
size_t memory_usage_peak()
{
update_global_peak();
Global &global = get_global();
return global.peak;
}
void memory_usage_peak_reset()
{
Global &global = get_global();
global.peak = memory_usage_current();
}


@@ -55,6 +55,7 @@ set(SRC
../../../../intern/guardedalloc/intern/mallocn.c
../../../../intern/guardedalloc/intern/mallocn_guarded_impl.c
../../../../intern/guardedalloc/intern/mallocn_lockfree_impl.c
../../../../intern/guardedalloc/intern/memory_usage.cc
${dna_header_include_file}
${dna_header_string_file}
)


@@ -178,6 +178,7 @@ set(SRC
../../../../intern/guardedalloc/intern/mallocn.c
../../../../intern/guardedalloc/intern/mallocn_guarded_impl.c
../../../../intern/guardedalloc/intern/mallocn_lockfree_impl.c
../../../../intern/guardedalloc/intern/memory_usage.cc
# Needed for defaults.
../../../../release/datafiles/userdef/userdef_default.c