Geometry Nodes: avoid using enumerable thread specific on single thread

The geometry nodes evaluator supports "lazy threading": it starts out
single-threaded and only switches to multi-threaded mode once it determines
that multi-threading is likely to be beneficial.

Now the evaluator only creates an EnumerableThreadSpecific when it is actually
using multiple threads. This results in a 6% speedup in my test file with many
node groups and math nodes.
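To illustrate the idea outside of Blender's codebase, below is a minimal, standalone C++ sketch of the same "lazy thread-locals" pattern using only the standard library. The Evaluator and ScratchBuffer types are hypothetical stand-ins for the executor and its LinearAllocator, and a mutex-protected map stands in for threading::EnumerableThreadSpecific (which avoids such locking internally); this is a sketch of the concept, not the actual implementation.

#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>

/* Hypothetical stand-in for a per-thread LinearAllocator<>. */
struct ScratchBuffer {
  std::vector<char> data;
};

class Evaluator {
  /* Always available; used for as long as evaluation stays single-threaded. */
  ScratchBuffer main_buffer_;
  /* Per-thread buffers, only constructed once multi-threading is enabled. */
  std::unique_ptr<std::unordered_map<std::thread::id, ScratchBuffer>> thread_locals_;
  std::mutex thread_locals_mutex_;
  bool use_multi_threading_ = false;

 public:
  /* Analogous to ensure_thread_locals(): create the container on demand, not up front. */
  void enable_multi_threading()
  {
    if (!thread_locals_) {
      thread_locals_ = std::make_unique<std::unordered_map<std::thread::id, ScratchBuffer>>();
    }
    use_multi_threading_ = true;
  }

  /* Analogous to get_main_or_local_allocator(). */
  ScratchBuffer &get_main_or_local_buffer()
  {
    if (use_multi_threading_) {
      /* A real EnumerableThreadSpecific avoids this lock; the map is only for illustration. */
      std::lock_guard<std::mutex> lock(thread_locals_mutex_);
      return (*thread_locals_)[std::this_thread::get_id()];
    }
    /* Single-threaded fast path: no thread-local machinery is ever created. */
    return main_buffer_;
  }
};

The point of deferring construction is that purely single-threaded evaluations, the common case for small node trees, never pay for the per-thread container at all.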
Author: Jacques Lucke 2022-12-29 21:05:41 +01:00
parent c744d5453f
commit dba2d82846
Notes: blender-bot 2023-02-13 23:16:02 +01:00
Referenced by commit 0bc0e3f9f7, Fix: geometry nodes crashes with large trees
1 changed file with 33 additions and 9 deletions


@@ -245,8 +245,11 @@ class Executor {
    * A separate linear allocator for every thread. We could potentially reuse some memory, but that
    * doesn't seem worth it yet.
    */
-  threading::EnumerableThreadSpecific<LinearAllocator<>> local_allocators_;
-  LinearAllocator<> *main_local_allocator_ = nullptr;
+  struct ThreadLocalData {
+    LinearAllocator<> allocator;
+  };
+  std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
+  LinearAllocator<> main_allocator_;
   /**
    * Set to false when the first execution ends.
    */
@@ -259,7 +262,6 @@ class Executor {
   {
     /* The indices are necessary, because they are used as keys in #node_states_. */
     BLI_assert(self_.graph_.node_indices_are_valid());
-    main_local_allocator_ = &local_allocators_.local();
   }
 
   ~Executor()
@@ -338,16 +340,25 @@ class Executor {
     Span<const Node *> nodes = self_.graph_.nodes();
     node_states_.reinitialize(nodes.size());
 
-    /* Construct all node states in parallel. */
-    threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
-      LinearAllocator<> &allocator = local_allocators_.local();
+    auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
       for (const int i : range) {
         const Node &node = *nodes[i];
         NodeState &node_state = *allocator.construct<NodeState>().release();
         node_states_[i] = &node_state;
         this->construct_initial_node_state(allocator, node, node_state);
       }
-    });
+    };
+    if (nodes.size() <= 256) {
+      construct_node_range(nodes.index_range(), main_allocator_);
+    }
+    else {
+      this->ensure_thread_locals();
+      /* Construct all node states in parallel. */
+      threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
+        LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+        construct_node_range(range, allocator);
+      });
+    }
   }
 
   void construct_initial_node_state(LinearAllocator<> &allocator,
@@ -1067,10 +1078,23 @@ class Executor {
     if (BLI_system_thread_count() <= 1) {
       return false;
     }
+    this->ensure_thread_locals();
     task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
     return true;
   }
 
+  void ensure_thread_locals()
+  {
+#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
+    if (current_main_thread_ != std::this_thread::get_id()) {
+      BLI_assert_unreachable();
+    }
+#endif
+    if (!thread_locals_) {
+      thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
+    }
+  }
+
   /**
    * Allow other threads to steal all the nodes that are currently scheduled on this thread.
    */
@@ -1109,9 +1133,9 @@ class Executor {
 
   LinearAllocator<> &get_main_or_local_allocator()
   {
     if (this->use_multi_threading()) {
-      return local_allocators_.local();
+      return thread_locals_->local().allocator;
     }
-    return *main_local_allocator_;
+    return main_allocator_;
   }
 };