Functions: align chunk sizes in multi-function evaluation

This can improve performance in some circumstances when the processing
loops are vectorized and/or unrolled. I noticed this especially while
working on D16970, where it gave a 10-20% speedup because the
non-vectorized fallback loop is entered much less often.
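
To make the motivation concrete, here is a minimal sketch (not part of this commit) of the kind of kernel that benefits: the main loop is written for 4 values per iteration and a scalar fallback handles the remainder. When the chunk sizes handed to such a kernel are multiples of the width, only the chunks at the very start or end of the overall range can fall into the fallback loop.

#include <cstdint>

/* Hypothetical kernel: processes 4 values per iteration in the main loop and uses a
 * scalar fallback for the remaining 0-3 values. If a chunk size is a multiple of 4,
 * the fallback loop never runs for that chunk. */
static void double_values(const float *src, float *dst, const int64_t size)
{
  int64_t i = 0;
  for (; i + 4 <= size; i += 4) {
    /* Vectorizable / unrolled main loop. */
    dst[i + 0] = src[i + 0] * 2.0f;
    dst[i + 1] = src[i + 1] * 2.0f;
    dst[i + 2] = src[i + 2] * 2.0f;
    dst[i + 3] = src[i + 3] * 2.0f;
  }
  for (; i < size; i++) {
    /* Non-vectorized fallback loop. */
    dst[i] = src[i] * 2.0f;
  }
}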
Jacques Lucke 2023-01-22 00:03:25 +01:00
parent 8d98d5c402
commit 3f1886d0b7
2 changed files with 101 additions and 57 deletions


@@ -71,6 +71,36 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
  function(range);
}

/**
 * Same as #parallel_for but tries to make the sub-range sizes multiples of the given alignment.
 * This can improve performance when the range is processed using vectorized and/or unrolled loops,
 * because the fallback loop that processes remaining values is used less often. A disadvantage of
 * using this instead of #parallel_for is that the size differences between sub-ranges can be
 * larger, which means that work is distributed less evenly.
 */
template<typename Function>
void parallel_for_aligned(const IndexRange range,
                          const int64_t grain_size,
                          const int64_t alignment,
                          const Function &function)
{
  const int64_t global_begin = range.start();
  const int64_t global_end = range.one_after_last();
  const int64_t alignment_mask = ~(alignment - 1);
  parallel_for(range, grain_size, [&](const IndexRange unaligned_range) {
    /* Move the sub-range boundaries down to the next aligned index. The "global" begin and end
     * remain fixed though. */
    const int64_t unaligned_begin = unaligned_range.start();
    const int64_t unaligned_end = unaligned_range.one_after_last();
    const int64_t aligned_begin = std::max(global_begin, unaligned_begin & alignment_mask);
    const int64_t aligned_end = unaligned_end == global_end ?
                                    unaligned_end :
                                    std::max(global_begin, unaligned_end & alignment_mask);
    const IndexRange aligned_range{aligned_begin, aligned_end - aligned_begin};
    function(aligned_range);
  });
}

template<typename Value, typename Function, typename Reduction>
Value parallel_reduce(IndexRange range,
                      int64_t grain_size,
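
For illustration, a hypothetical caller of the new helper could look like the sketch below. The function name, grain size, and alignment are made up, and the includes assume Blender's BLI_task.hh and BLI_index_range.hh headers.

#include <cstdint>

#include "BLI_index_range.hh"
#include "BLI_task.hh"

/* Hypothetical caller: with alignment 8, every sub-range passed to the lambda begins and
 * ends at a multiple of 8, except possibly at the very beginning and end of the whole
 * range, so a kernel that processes 8 values per iteration rarely needs its fallback loop. */
static void double_values_parallel(const float *src, float *dst, const int64_t size)
{
  blender::threading::parallel_for_aligned(
      blender::IndexRange(size), 4096, 8, [&](const blender::IndexRange sub_range) {
        for (const int64_t i : sub_range) {
          dst[i] = src[i] * 2.0f;
        }
      });
}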


@@ -52,6 +52,16 @@ static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask m
  return grain_size;
}

static int64_t compute_alignment(const int64_t grain_size)
{
  if (grain_size <= 512) {
    /* Don't use a number that's too large, or otherwise the work will be split quite unevenly. */
    return 8;
  }
  /* It's not common that more elements are processed in a loop at once. */
  return 32;
}
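
To illustrate the trade-off behind these numbers (the values below are made up): snapping a sub-range boundary down to a multiple of the alignment can shift it by up to alignment - 1 elements, which matters more when the grain size, and therefore the chunk size, is small.

#include <cassert>
#include <cstdint>

/* Round an index down to a multiple of a power-of-two alignment, the same operation
 * parallel_for_aligned performs with its alignment_mask. */
static int64_t align_down(const int64_t value, const int64_t alignment)
{
  return value & ~(alignment - 1);
}

int main()
{
  /* A boundary such as 1003 moves down by at most alignment - 1 elements. */
  assert(align_down(1003, 8) == 1000);  /* Shifted by 3: negligible for any grain size. */
  assert(align_down(1003, 32) == 992);  /* Shifted by 11: a visible imbalance for chunks of a few hundred elements, hence alignment 8 when grain_size <= 512. */
  return 0;
}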
void MultiFunction::call_auto(IndexMask mask, Params params, Context context) const
{
  if (mask.is_empty()) {
@@ -71,71 +81,75 @@ void MultiFunction::call_auto(IndexMask mask, Params params, Context context) co
    return;
  }

  const int64_t alignment = compute_alignment(grain_size);
  threading::parallel_for_aligned(
      mask.index_range(), grain_size, alignment, [&](const IndexRange sub_range) {
        const IndexMask sliced_mask = mask.slice(sub_range);

        if (!hints.allocates_array) {
          /* There is no benefit to changing indices in this case. */
          this->call(sliced_mask, params, context);
          return;
        }

        if (sliced_mask[0] < grain_size) {
          /* The indices are low, no need to offset them. */
          this->call(sliced_mask, params, context);
          return;
        }

        const int64_t input_slice_start = sliced_mask[0];
        const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
        const IndexRange input_slice_range{input_slice_start, input_slice_size};

        Vector<int64_t> offset_mask_indices;
        const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);

        ParamsBuilder offset_params{*this, offset_mask.min_array_size()};

        /* Slice all parameters so that for the actual function call. */
        for (const int param_index : this->param_indices()) {
          const ParamType param_type = this->param_type(param_index);
          switch (param_type.category()) {
            case ParamCategory::SingleInput: {
              const GVArray &varray = params.readonly_single_input(param_index);
              offset_params.add_readonly_single_input(varray.slice(input_slice_range));
              break;
            }
            case ParamCategory::SingleMutable: {
              const GMutableSpan span = params.single_mutable(param_index);
              const GMutableSpan sliced_span = span.slice(input_slice_range);
              offset_params.add_single_mutable(sliced_span);
              break;
            }
            case ParamCategory::SingleOutput: {
              if (bool(signature_ref_->params[param_index].flag &
                       ParamFlag::SupportsUnusedOutput)) {
                const GMutableSpan span = params.uninitialized_single_output_if_required(
                    param_index);
                if (span.is_empty()) {
                  offset_params.add_ignored_single_output();
                }
                else {
                  const GMutableSpan sliced_span = span.slice(input_slice_range);
                  offset_params.add_uninitialized_single_output(sliced_span);
                }
              }
              else {
                const GMutableSpan span = params.uninitialized_single_output(param_index);
                const GMutableSpan sliced_span = span.slice(input_slice_range);
                offset_params.add_uninitialized_single_output(sliced_span);
              }
              break;
            }
            case ParamCategory::VectorInput:
            case ParamCategory::VectorMutable:
            case ParamCategory::VectorOutput: {
              BLI_assert_unreachable();
              break;
            }
          }
        }

        this->call(offset_mask, offset_params, context);
      });
}
std::string MultiFunction::debug_name() const