Geometry Nodes: refactor multi-threading in field evaluation

Previously, there was a fixed grain size for all multi-functions. That was
not sufficient because some functions could benefit a lot from smaller
grain sizes.

This refactors adds a new `MultiFunction::call_auto` method which has the
same effect as just calling `MultiFunction::call` but additionally figures
out how to execute the specific multi-function efficiently. It determines
a good grain size and decides whether the mask indices should be shifted
or not.

Most multi-function evaluations benefit from this, but medium sized work
loads (1000 - 50000 elements) benefit from it the most. Especially when
expensive multi-functions (e.g. noise) is involved. This is because for
smaller work loads, threading is rarely used and for larger work loads
threading worked fine before already.

With this patch, multi-functions can specify execution hints, that allow
the caller to execute it most efficiently. These execution hints still
have to be added to more functions.

Some performance measurements of a field evaluation involving noise and
math nodes, ordered by the number of elements being evaluated:
```
1,000,000: 133 ms   -> 120 ms
  100,000:  30 ms   ->  18 ms
   10,000:  20 ms   ->   2.7 ms
    1,000:   4 ms   ->   0.5 ms
      100:   0.5 ms ->   0.4 ms
```
This commit is contained in:
Jacques Lucke 2021-11-26 11:05:47 +01:00
parent 004172de38
commit 658fd8df0b
13 changed files with 264 additions and 163 deletions

View File

@ -34,7 +34,7 @@ set(SRC
intern/generic_virtual_vector_array.cc
intern/multi_function.cc
intern/multi_function_builder.cc
intern/multi_function_parallel.cc
intern/multi_function_params.cc
intern/multi_function_procedure.cc
intern/multi_function_procedure_builder.cc
intern/multi_function_procedure_executor.cc
@ -54,7 +54,6 @@ set(SRC
FN_multi_function_builder.hh
FN_multi_function_context.hh
FN_multi_function_data_type.hh
FN_multi_function_parallel.hh
FN_multi_function_param_type.hh
FN_multi_function_params.hh
FN_multi_function_procedure.hh

View File

@ -60,6 +60,7 @@ class MultiFunction {
{
}
void call_auto(IndexMask mask, MFParams params, MFContext context) const;
virtual void call(IndexMask mask, MFParams params, MFContext context) const = 0;
virtual uint64_t hash() const
@ -110,6 +111,31 @@ class MultiFunction {
return *signature_ref_;
}
/**
* Information about how the multi-function behaves that help a caller to execute it efficiently.
*/
struct ExecutionHints {
/**
* Suggested minimum workload under which multi-threading does not really help.
* This should be lowered when the multi-function is doing something computationally expensive.
*/
int64_t min_grain_size = 10000;
/**
* Indicates that the multi-function will allocate an array large enough to hold all indices
* passed in as mask. This tells the caller that it would be preferable to pass in smaller
* indices. Also maybe the full mask should be split up into smaller segments to decrease peak
* memory usage.
*/
bool allocates_array = false;
/**
* Tells the caller that every execution takes about the same time. This helps making a more
* educated guess about a good grain size.
*/
bool uniform_execution_time = true;
};
ExecutionHints execution_hints() const;
protected:
/* Make the function use the given signature. This should be called once in the constructor of
* child classes. No copy of the signature is made, so the caller has to make sure that the
@ -121,6 +147,8 @@ class MultiFunction {
BLI_assert(signature != nullptr);
signature_ref_ = signature;
}
virtual ExecutionHints get_execution_hints() const;
};
inline MFParamsBuilder::MFParamsBuilder(const MultiFunction &fn, int64_t mask_size)

View File

@ -1,39 +0,0 @@
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#pragma once
/** \file
* \ingroup fn
*/
#include "FN_multi_function.hh"
namespace blender::fn {
class ParallelMultiFunction : public MultiFunction {
private:
const MultiFunction &fn_;
const int64_t grain_size_;
bool threading_supported_;
public:
ParallelMultiFunction(const MultiFunction &fn, const int64_t grain_size);
void call(IndexMask mask, MFParams params, MFContext context) const override;
};
} // namespace blender::fn

View File

@ -25,6 +25,8 @@
* the function. `MFParams` is then used inside the called function to access the parameters.
*/
#include <mutex>
#include "BLI_resource_scope.hh"
#include "FN_generic_pointer.hh"
@ -45,6 +47,9 @@ class MFParamsBuilder {
Vector<const GVVectorArray *> virtual_vector_arrays_;
Vector<GVectorArray *> vector_arrays_;
std::mutex mutex_;
Vector<std::pair<int, GMutableSpan>> dummy_output_spans_;
friend class MFParams;
MFParamsBuilder(const MFSignature &signature, const IndexMask mask)
@ -62,8 +67,8 @@ class MFParamsBuilder {
template<typename T> void add_readonly_single_input_value(T value, StringRef expected_name = "")
{
T *value_ptr = &scope_.add_value<T>(std::move(value));
this->add_readonly_single_input(value_ptr, expected_name);
this->add_readonly_single_input(VArray<T>::ForSingle(std::move(value), min_array_size_),
expected_name);
}
template<typename T> void add_readonly_single_input(const T *value, StringRef expected_name = "")
{
@ -254,20 +259,12 @@ class MFParams {
this->assert_correct_param(param_index, name, MFParamType::SingleOutput);
int data_index = builder_->signature_->data_index(param_index);
GMutableSpan span = builder_->mutable_spans_[data_index];
if (span.is_empty()) {
/* The output is ignored by the caller, but the multi-function does not handle this case. So
* create a temporary buffer that the multi-function can write to. */
const CPPType &type = span.type();
void *buffer = builder_->scope_.linear_allocator().allocate(
builder_->min_array_size_ * type.size(), type.alignment());
if (!type.is_trivially_destructible()) {
/* Make sure the temporary elements will be destructed in the end. */
builder_->scope_.add_destruct_call(
[&type, buffer, mask = builder_->mask_]() { type.destruct_indices(buffer, mask); });
}
span = GMutableSpan{type, buffer, builder_->min_array_size_};
if (!span.is_empty()) {
return span;
}
return span;
/* The output is ignored by the caller, but the multi-function does not handle this case. So
* create a temporary buffer that the multi-function can write to. */
return this->ensure_dummy_single_output(data_index);
}
/**
@ -356,6 +353,8 @@ class MFParams {
}
#endif
}
GMutableSpan ensure_dummy_single_output(int data_index);
};
} // namespace blender::fn

View File

@ -34,6 +34,9 @@ class MFProcedureExecutor : public MultiFunction {
MFProcedureExecutor(const MFProcedure &procedure);
void call(IndexMask mask, MFParams params, MFContext context) const override;
private:
ExecutionHints get_execution_hints() const override;
};
} // namespace blender::fn

View File

@ -21,7 +21,6 @@
#include "BLI_vector_set.hh"
#include "FN_field.hh"
#include "FN_multi_function_parallel.hh"
namespace blender::fn {
@ -358,13 +357,8 @@ Vector<GVArray> evaluate_fields(ResourceScope &scope,
build_multi_function_procedure_for_fields(
procedure, scope, field_tree_info, varying_fields_to_evaluate);
MFProcedureExecutor procedure_executor{procedure};
/* Add multi threading capabilities to the field evaluation. */
const int grain_size = 10000;
fn::ParallelMultiFunction parallel_procedure_executor{procedure_executor, grain_size};
/* Utility variable to make easy to switch the executor. */
const MultiFunction &executor_fn = parallel_procedure_executor;
MFParamsBuilder mf_params{executor_fn, &mask};
MFParamsBuilder mf_params{procedure_executor, &mask};
MFContextBuilder mf_context;
/* Provide inputs to the procedure executor. */
@ -405,7 +399,7 @@ Vector<GVArray> evaluate_fields(ResourceScope &scope,
mf_params.add_uninitialized_single_output(span);
}
executor_fn.call(mask, mf_params, mf_context);
procedure_executor.call_auto(mask, mf_params, mf_context);
}
/* Evaluate constant fields if necessary. */

View File

@ -16,8 +16,141 @@
#include "FN_multi_function.hh"
#include "BLI_task.hh"
#include "BLI_threads.h"
namespace blender::fn {
using ExecutionHints = MultiFunction::ExecutionHints;
ExecutionHints MultiFunction::execution_hints() const
{
return this->get_execution_hints();
}
ExecutionHints MultiFunction::get_execution_hints() const
{
return ExecutionHints{};
}
static bool supports_threading_by_slicing_params(const MultiFunction &fn)
{
for (const int i : fn.param_indices()) {
const MFParamType param_type = fn.param_type(i);
if (ELEM(param_type.interface_type(),
MFParamType::InterfaceType::Mutable,
MFParamType::InterfaceType::Output)) {
if (param_type.data_type().is_vector()) {
return false;
}
}
}
return true;
}
static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask mask)
{
int64_t grain_size = hints.min_grain_size;
if (hints.uniform_execution_time) {
const int thread_count = BLI_system_thread_count();
/* Avoid using a small grain size even if it is not necessary. */
const int64_t thread_based_grain_size = mask.size() / thread_count / 4;
grain_size = std::max(grain_size, thread_based_grain_size);
}
if (hints.allocates_array) {
const int64_t max_grain_size = 10000;
/* Avoid allocating many large intermediate arrays. Better process data in smaller chunks to
* keep peak memory usage lower. */
grain_size = std::min(grain_size, max_grain_size);
}
return grain_size;
}
/**
* The result is the same as using #call directly but this method has some additional features.
* - Automatic multi-threading when possible and appropriate.
* - Automatic index mask offsetting to avoid large temporary intermediate arrays that are mostly
* unused.
*/
void MultiFunction::call_auto(IndexMask mask, MFParams params, MFContext context) const
{
if (mask.is_empty()) {
return;
}
const ExecutionHints hints = this->execution_hints();
const int64_t grain_size = compute_grain_size(hints, mask);
if (mask.size() <= grain_size) {
this->call(mask, params, context);
return;
}
const bool supports_threading = supports_threading_by_slicing_params(*this);
if (!supports_threading) {
this->call(mask, params, context);
return;
}
threading::parallel_for(mask.index_range(), grain_size, [&](const IndexRange sub_range) {
const IndexMask sliced_mask = mask.slice(sub_range);
if (!hints.allocates_array) {
/* There is no benefit to changing indices in this case. */
this->call(sliced_mask, params, context);
return;
}
if (sliced_mask[0] < grain_size) {
/* The indices are low, no need to offset them. */
this->call(sliced_mask, params, context);
return;
}
const int64_t input_slice_start = sliced_mask[0];
const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
const IndexRange input_slice_range{input_slice_start, input_slice_size};
Vector<int64_t> offset_mask_indices;
const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);
MFParamsBuilder offset_params{*this, offset_mask.min_array_size()};
/* Slice all parameters so that for the actual function call. */
for (const int param_index : this->param_indices()) {
const MFParamType param_type = this->param_type(param_index);
switch (param_type.category()) {
case MFParamType::SingleInput: {
const GVArray &varray = params.readonly_single_input(param_index);
offset_params.add_readonly_single_input(varray.slice(input_slice_range));
break;
}
case MFParamType::SingleMutable: {
const GMutableSpan span = params.single_mutable(param_index);
const GMutableSpan sliced_span = span.slice(input_slice_range);
offset_params.add_single_mutable(sliced_span);
break;
}
case MFParamType::SingleOutput: {
const GMutableSpan span = params.uninitialized_single_output_if_required(param_index);
if (span.is_empty()) {
offset_params.add_ignored_single_output();
}
else {
const GMutableSpan sliced_span = span.slice(input_slice_range);
offset_params.add_uninitialized_single_output(sliced_span);
}
break;
}
case MFParamType::VectorInput:
case MFParamType::VectorMutable:
case MFParamType::VectorOutput: {
BLI_assert_unreachable();
break;
}
}
}
this->call(offset_mask, offset_params, context);
});
}
std::string MultiFunction::debug_name() const
{
return signature_ref_->function_name;

View File

@ -1,93 +0,0 @@
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include "FN_multi_function_parallel.hh"
#include "BLI_task.hh"
namespace blender::fn {
ParallelMultiFunction::ParallelMultiFunction(const MultiFunction &fn, const int64_t grain_size)
: fn_(fn), grain_size_(grain_size)
{
this->set_signature(&fn.signature());
threading_supported_ = true;
for (const int param_index : fn.param_indices()) {
const MFParamType param_type = fn.param_type(param_index);
if (param_type.data_type().category() == MFDataType::Vector) {
/* Vector parameters do not support threading yet. */
threading_supported_ = false;
break;
}
}
}
void ParallelMultiFunction::call(IndexMask full_mask, MFParams params, MFContext context) const
{
if (full_mask.size() <= grain_size_ || !threading_supported_) {
fn_.call(full_mask, params, context);
return;
}
threading::parallel_for(full_mask.index_range(), grain_size_, [&](const IndexRange mask_slice) {
Vector<int64_t> sub_mask_indices;
const IndexMask sub_mask = full_mask.slice_and_offset(mask_slice, sub_mask_indices);
if (sub_mask.is_empty()) {
return;
}
const int64_t input_slice_start = full_mask[mask_slice.first()];
const int64_t input_slice_size = full_mask[mask_slice.last()] - input_slice_start + 1;
const IndexRange input_slice_range{input_slice_start, input_slice_size};
MFParamsBuilder sub_params{fn_, sub_mask.min_array_size()};
/* All parameters are sliced so that the wrapped multi-function does not have to take care of
* the index offset. */
for (const int param_index : fn_.param_indices()) {
const MFParamType param_type = fn_.param_type(param_index);
switch (param_type.category()) {
case MFParamType::SingleInput: {
const GVArray &varray = params.readonly_single_input(param_index);
sub_params.add_readonly_single_input(varray.slice(input_slice_range));
break;
}
case MFParamType::SingleMutable: {
const GMutableSpan span = params.single_mutable(param_index);
const GMutableSpan sliced_span = span.slice(input_slice_start, input_slice_size);
sub_params.add_single_mutable(sliced_span);
break;
}
case MFParamType::SingleOutput: {
const GMutableSpan span = params.uninitialized_single_output(param_index);
const GMutableSpan sliced_span = span.slice(input_slice_start, input_slice_size);
sub_params.add_uninitialized_single_output(sliced_span);
break;
}
case MFParamType::VectorInput:
case MFParamType::VectorMutable:
case MFParamType::VectorOutput: {
BLI_assert_unreachable();
break;
}
}
}
fn_.call(sub_mask, sub_params, context);
});
}
} // namespace blender::fn

View File

@ -0,0 +1,44 @@
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include "FN_multi_function_params.hh"
namespace blender::fn {
GMutableSpan MFParams::ensure_dummy_single_output(int data_index)
{
/* Lock because we are actually modifying #builder_ and it may be used by multiple threads. */
std::lock_guard lock{builder_->mutex_};
for (const std::pair<int, GMutableSpan> &items : builder_->dummy_output_spans_) {
if (items.first == data_index) {
return items.second;
}
}
const CPPType &type = builder_->mutable_spans_[data_index].type();
void *buffer = builder_->scope_.linear_allocator().allocate(
builder_->min_array_size_ * type.size(), type.alignment());
if (!type.is_trivially_destructible()) {
builder_->scope_.add_destruct_call(
[&type, buffer, mask = builder_->mask_]() { type.destruct_indices(buffer, mask); });
}
const GMutableSpan span{type, buffer, builder_->min_array_size_};
builder_->dummy_output_spans_.append({data_index, span});
return span;
}
} // namespace blender::fn

View File

@ -1045,7 +1045,7 @@ static void execute_call_instruction(const MFCallInstruction &instruction,
}
try {
fn.call(mask, params, context);
fn.call_auto(mask, params, context);
}
catch (...) {
/* Multi-functions must not throw exceptions. */
@ -1236,4 +1236,12 @@ void MFProcedureExecutor::call(IndexMask full_mask, MFParams params, MFContext c
}
}
MultiFunction::ExecutionHints MFProcedureExecutor::get_execution_hints() const
{
ExecutionHints hints;
hints.allocates_array = true;
hints.min_grain_size = 10000;
return hints;
}
} // namespace blender::fn

View File

@ -131,9 +131,9 @@ static void get_closest_in_bvhtree(BVHTreeFromMesh &tree_data,
const MutableSpan<float> r_distances_sq,
const MutableSpan<float3> r_positions)
{
BLI_assert(positions.size() == r_indices.size() || r_indices.is_empty());
BLI_assert(positions.size() == r_distances_sq.size() || r_distances_sq.is_empty());
BLI_assert(positions.size() == r_positions.size() || r_positions.is_empty());
BLI_assert(positions.size() >= r_indices.size());
BLI_assert(positions.size() >= r_distances_sq.size());
BLI_assert(positions.size() >= r_positions.size());
for (const int i : mask) {
BVHTreeNearest nearest;
@ -159,7 +159,7 @@ static void get_closest_pointcloud_points(const PointCloud &pointcloud,
const MutableSpan<int> r_indices,
const MutableSpan<float> r_distances_sq)
{
BLI_assert(positions.size() == r_indices.size());
BLI_assert(positions.size() >= r_indices.size());
BLI_assert(pointcloud.totpoint > 0);
BVHTreeFromPointCloud tree_data;

View File

@ -229,6 +229,14 @@ class NoiseFunction : public fn::MultiFunction {
}
}
}
ExecutionHints get_execution_hints() const override
{
ExecutionHints hints;
hints.allocates_array = false;
hints.min_grain_size = 100;
return hints;
}
};
static void sh_node_noise_build_multi_function(blender::nodes::NodeMultiFunctionBuilder &builder)

View File

@ -167,6 +167,8 @@ static void node_shader_update_tex_voronoi(bNodeTree *ntree, bNode *node)
namespace blender::nodes {
static MultiFunction::ExecutionHints voronoi_execution_hints{50, false};
class VoronoiMinowskiFunction : public fn::MultiFunction {
private:
int dimensions_;
@ -592,6 +594,11 @@ class VoronoiMinowskiFunction : public fn::MultiFunction {
}
}
}
ExecutionHints get_execution_hints() const override
{
return voronoi_execution_hints;
}
};
class VoronoiMetricFunction : public fn::MultiFunction {
@ -1106,6 +1113,11 @@ class VoronoiMetricFunction : public fn::MultiFunction {
}
}
}
ExecutionHints get_execution_hints() const override
{
return voronoi_execution_hints;
}
};
class VoronoiEdgeFunction : public fn::MultiFunction {
@ -1282,7 +1294,12 @@ class VoronoiEdgeFunction : public fn::MultiFunction {
break;
}
}
};
}
ExecutionHints get_execution_hints() const override
{
return voronoi_execution_hints;
}
};
static void sh_node_voronoi_build_multi_function(blender::nodes::NodeMultiFunctionBuilder &builder)