Geometry Nodes: avoid using enumerable thread specific on single thread

The geometry nodes evaluator supports "lazy threading": it starts out
single-threaded and only switches to multi-threaded mode once it determines
that multi-threading is likely to be beneficial.

Now the evaluator only creates an EnumerableThreadSpecific when it is actually
using multiple threads. This results in a 6% speedup in my test file with many
node groups and math nodes.
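To illustrate the idea outside of Blender's codebase, below is a minimal, standalone C++ sketch of the same "lazy thread-locals" pattern using only the standard library. The Evaluator and ScratchBuffer types are hypothetical stand-ins for the executor and its LinearAllocator, and a mutex-protected map stands in for threading::EnumerableThreadSpecific (which avoids such locking internally); this is a sketch of the concept, not the actual implementation.

#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>

/* Hypothetical stand-in for a per-thread LinearAllocator<>. */
struct ScratchBuffer {
  std::vector<char> data;
};

class Evaluator {
  /* Always available; used for as long as evaluation stays single-threaded. */
  ScratchBuffer main_buffer_;
  /* Per-thread buffers, only constructed once multi-threading is enabled. */
  std::unique_ptr<std::unordered_map<std::thread::id, ScratchBuffer>> thread_locals_;
  std::mutex thread_locals_mutex_;
  bool use_multi_threading_ = false;

 public:
  /* Analogous to ensure_thread_locals(): create the container on demand, not up front. */
  void enable_multi_threading()
  {
    if (!thread_locals_) {
      thread_locals_ = std::make_unique<std::unordered_map<std::thread::id, ScratchBuffer>>();
    }
    use_multi_threading_ = true;
  }

  /* Analogous to get_main_or_local_allocator(). */
  ScratchBuffer &get_main_or_local_buffer()
  {
    if (use_multi_threading_) {
      /* A real EnumerableThreadSpecific avoids this lock; the map is only for illustration. */
      std::lock_guard<std::mutex> lock(thread_locals_mutex_);
      return (*thread_locals_)[std::this_thread::get_id()];
    }
    /* Single-threaded fast path: no thread-local machinery is ever created. */
    return main_buffer_;
  }
};

The point of deferring construction is that purely single-threaded evaluations, the common case for small node trees, never pay for the per-thread container at all.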
Author: Jacques Lucke 2022-12-29 21:05:41 +01:00
parent c744d5453f
commit dba2d82846
Notes: blender-bot 2023-02-13 23:16:02 +01:00
Referenced by commit 0bc0e3f9f7, Fix: geometry nodes crashes with large trees
1 changed file with 33 additions and 9 deletions


@@ -245,8 +245,11 @@ class Executor {
    * A separate linear allocator for every thread. We could potentially reuse some memory, but that
    * doesn't seem worth it yet.
    */
-  threading::EnumerableThreadSpecific<LinearAllocator<>> local_allocators_;
-  LinearAllocator<> *main_local_allocator_ = nullptr;
+  struct ThreadLocalData {
+    LinearAllocator<> allocator;
+  };
+  std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
+  LinearAllocator<> main_allocator_;
   /**
    * Set to false when the first execution ends.
    */
@@ -259,7 +262,6 @@ class Executor {
   {
     /* The indices are necessary, because they are used as keys in #node_states_. */
     BLI_assert(self_.graph_.node_indices_are_valid());
-    main_local_allocator_ = &local_allocators_.local();
   }
 
   ~Executor()
@@ -338,16 +340,25 @@ class Executor {
     Span<const Node *> nodes = self_.graph_.nodes();
     node_states_.reinitialize(nodes.size());
 
-    /* Construct all node states in parallel. */
-    threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
-      LinearAllocator<> &allocator = local_allocators_.local();
+    auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
       for (const int i : range) {
         const Node &node = *nodes[i];
         NodeState &node_state = *allocator.construct<NodeState>().release();
         node_states_[i] = &node_state;
         this->construct_initial_node_state(allocator, node, node_state);
       }
-    });
+    };
+    if (nodes.size() <= 256) {
+      construct_node_range(nodes.index_range(), main_allocator_);
+    }
+    else {
+      this->ensure_thread_locals();
+      /* Construct all node states in parallel. */
+      threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
+        LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+        construct_node_range(range, allocator);
+      });
+    }
   }
 
   void construct_initial_node_state(LinearAllocator<> &allocator,
@@ -1067,10 +1078,23 @@ class Executor {
     if (BLI_system_thread_count() <= 1) {
       return false;
     }
+    this->ensure_thread_locals();
     task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
     return true;
   }
 
+  void ensure_thread_locals()
+  {
+#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
+    if (current_main_thread_ != std::this_thread::get_id()) {
+      BLI_assert_unreachable();
+    }
+#endif
+    if (!thread_locals_) {
+      thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
+    }
+  }
+
   /**
    * Allow other threads to steal all the nodes that are currently scheduled on this thread.
    */
@@ -1109,9 +1133,9 @@ class Executor {
 
   LinearAllocator<> &get_main_or_local_allocator()
   {
     if (this->use_multi_threading()) {
-      return local_allocators_.local();
+      return thread_locals_->local().allocator;
     }
-    return *main_local_allocator_;
+    return main_allocator_;
   }
 };