BLI: avoid invoking tbb for small workloads
We often call `parallel_for` in places where the workload size varies widely. When many elements are processed, using multi-threading is great, but when processing few elements (possibly many times), using `parallel_for` can result in significant overhead. I measured that this improves performance by >20% in the refactored realize instances code I'm working on separately. The change might also help with debugging sometimes, because the stack trace is smaller and contains fewer irrelevant symbols.
This commit is contained in:
parent
0f89d05848
commit
e130903060
Notes:
blender-bot
2023-10-04 09:42:55 +02:00
Referenced by commit c2737913db, BLI: Avoid invoking tbb for small parallel_reduce calls
Referenced by commit e9334c5df8, Geometry Nodes: Avoid parallel_for_each with a single geometry
|
@ -67,14 +67,19 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
|
|||
return;
|
||||
}
|
||||
#ifdef WITH_TBB
|
||||
tbb::parallel_for(tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
|
||||
[&](const tbb::blocked_range<int64_t> &subrange) {
|
||||
function(IndexRange(subrange.begin(), subrange.size()));
|
||||
});
|
||||
/* Invoking tbb for small workloads has a large overhead. */
|
||||
if (range.size() >= grain_size) {
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
|
||||
[&](const tbb::blocked_range<int64_t> &subrange) {
|
||||
function(IndexRange(subrange.begin(), subrange.size()));
|
||||
});
|
||||
return;
|
||||
}
|
||||
#else
|
||||
UNUSED_VARS(grain_size);
|
||||
function(range);
|
||||
#endif
|
||||
function(range);
|
||||
}
|
||||
|
||||
template<typename Value, typename Function, typename Reduction>
|
||||
|
|
Loading…
Reference in New Issue