Functions: align chunk sizes in multi-function evaluation
This can improve performance in some circumstances when there are vectorized and/or unrolled loops. I especially noticed that this helps while working on D16970, where it gave a 10-20% speedup by hitting the non-vectorized fallback loop less often.
This commit is contained in:
parent
8d98d5c402
commit
3f1886d0b7
|
@ -71,6 +71,36 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
|
|||
function(range);
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as #parallel_for but tries to make the sub-range sizes multiples of the given alignment.
|
||||
* This can improve performance when the range is processed using vectorized and/or unrolled loops,
|
||||
* because the fallback loop that processes remaining values is used less often. A disadvantage of
|
||||
* using this instead of #parallel_for is that the size differences between sub-ranges can be
|
||||
* larger, which means that work is distributed less evenly.
|
||||
*/
|
||||
template<typename Function>
|
||||
void parallel_for_aligned(const IndexRange range,
|
||||
const int64_t grain_size,
|
||||
const int64_t alignment,
|
||||
const Function &function)
|
||||
{
|
||||
const int64_t global_begin = range.start();
|
||||
const int64_t global_end = range.one_after_last();
|
||||
const int64_t alignment_mask = ~(alignment - 1);
|
||||
parallel_for(range, grain_size, [&](const IndexRange unaligned_range) {
|
||||
/* Move the sub-range boundaries down to the next aligned index. The "global" begin and end
|
||||
* remain fixed though. */
|
||||
const int64_t unaligned_begin = unaligned_range.start();
|
||||
const int64_t unaligned_end = unaligned_range.one_after_last();
|
||||
const int64_t aligned_begin = std::max(global_begin, unaligned_begin & alignment_mask);
|
||||
const int64_t aligned_end = unaligned_end == global_end ?
|
||||
unaligned_end :
|
||||
std::max(global_begin, unaligned_end & alignment_mask);
|
||||
const IndexRange aligned_range{aligned_begin, aligned_end - aligned_begin};
|
||||
function(aligned_range);
|
||||
});
|
||||
}
|
||||
|
||||
template<typename Value, typename Function, typename Reduction>
|
||||
Value parallel_reduce(IndexRange range,
|
||||
int64_t grain_size,
|
||||
|
|
|
@ -52,6 +52,16 @@ static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask m
|
|||
return grain_size;
|
||||
}
|
||||
|
||||
/**
 * Choose the chunk-size alignment used with #threading::parallel_for_aligned based on the
 * grain size.
 *
 * \param grain_size: Approximate number of elements processed per task.
 * \return The alignment to request: 8 for small grain sizes, 32 otherwise.
 */
static int64_t compute_alignment(const int64_t grain_size)
{
  /* For small grain sizes, don't use a number that's too large, or otherwise the work will be
   * split quite unevenly. Beyond that, 32 is enough: it's not common that more elements are
   * processed in a loop at once. */
  return (grain_size <= 512) ? 8 : 32;
}
|
||||
|
||||
void MultiFunction::call_auto(IndexMask mask, Params params, Context context) const
|
||||
{
|
||||
if (mask.is_empty()) {
|
||||
|
@ -71,71 +81,75 @@ void MultiFunction::call_auto(IndexMask mask, Params params, Context context) co
|
|||
return;
|
||||
}
|
||||
|
||||
threading::parallel_for(mask.index_range(), grain_size, [&](const IndexRange sub_range) {
|
||||
const IndexMask sliced_mask = mask.slice(sub_range);
|
||||
if (!hints.allocates_array) {
|
||||
/* There is no benefit to changing indices in this case. */
|
||||
this->call(sliced_mask, params, context);
|
||||
return;
|
||||
}
|
||||
if (sliced_mask[0] < grain_size) {
|
||||
/* The indices are low, no need to offset them. */
|
||||
this->call(sliced_mask, params, context);
|
||||
return;
|
||||
}
|
||||
const int64_t input_slice_start = sliced_mask[0];
|
||||
const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
|
||||
const IndexRange input_slice_range{input_slice_start, input_slice_size};
|
||||
|
||||
Vector<int64_t> offset_mask_indices;
|
||||
const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);
|
||||
|
||||
ParamsBuilder offset_params{*this, offset_mask.min_array_size()};
|
||||
|
||||
/* Slice all parameters so that for the actual function call. */
|
||||
for (const int param_index : this->param_indices()) {
|
||||
const ParamType param_type = this->param_type(param_index);
|
||||
switch (param_type.category()) {
|
||||
case ParamCategory::SingleInput: {
|
||||
const GVArray &varray = params.readonly_single_input(param_index);
|
||||
offset_params.add_readonly_single_input(varray.slice(input_slice_range));
|
||||
break;
|
||||
const int64_t alignment = compute_alignment(grain_size);
|
||||
threading::parallel_for_aligned(
|
||||
mask.index_range(), grain_size, alignment, [&](const IndexRange sub_range) {
|
||||
const IndexMask sliced_mask = mask.slice(sub_range);
|
||||
if (!hints.allocates_array) {
|
||||
/* There is no benefit to changing indices in this case. */
|
||||
this->call(sliced_mask, params, context);
|
||||
return;
|
||||
}
|
||||
case ParamCategory::SingleMutable: {
|
||||
const GMutableSpan span = params.single_mutable(param_index);
|
||||
const GMutableSpan sliced_span = span.slice(input_slice_range);
|
||||
offset_params.add_single_mutable(sliced_span);
|
||||
break;
|
||||
if (sliced_mask[0] < grain_size) {
|
||||
/* The indices are low, no need to offset them. */
|
||||
this->call(sliced_mask, params, context);
|
||||
return;
|
||||
}
|
||||
case ParamCategory::SingleOutput: {
|
||||
if (bool(signature_ref_->params[param_index].flag & ParamFlag::SupportsUnusedOutput)) {
|
||||
const GMutableSpan span = params.uninitialized_single_output_if_required(param_index);
|
||||
if (span.is_empty()) {
|
||||
offset_params.add_ignored_single_output();
|
||||
const int64_t input_slice_start = sliced_mask[0];
|
||||
const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
|
||||
const IndexRange input_slice_range{input_slice_start, input_slice_size};
|
||||
|
||||
Vector<int64_t> offset_mask_indices;
|
||||
const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);
|
||||
|
||||
ParamsBuilder offset_params{*this, offset_mask.min_array_size()};
|
||||
|
||||
/* Slice all parameters so that for the actual function call. */
|
||||
for (const int param_index : this->param_indices()) {
|
||||
const ParamType param_type = this->param_type(param_index);
|
||||
switch (param_type.category()) {
|
||||
case ParamCategory::SingleInput: {
|
||||
const GVArray &varray = params.readonly_single_input(param_index);
|
||||
offset_params.add_readonly_single_input(varray.slice(input_slice_range));
|
||||
break;
|
||||
}
|
||||
else {
|
||||
case ParamCategory::SingleMutable: {
|
||||
const GMutableSpan span = params.single_mutable(param_index);
|
||||
const GMutableSpan sliced_span = span.slice(input_slice_range);
|
||||
offset_params.add_uninitialized_single_output(sliced_span);
|
||||
offset_params.add_single_mutable(sliced_span);
|
||||
break;
|
||||
}
|
||||
case ParamCategory::SingleOutput: {
|
||||
if (bool(signature_ref_->params[param_index].flag &
|
||||
ParamFlag::SupportsUnusedOutput)) {
|
||||
const GMutableSpan span = params.uninitialized_single_output_if_required(
|
||||
param_index);
|
||||
if (span.is_empty()) {
|
||||
offset_params.add_ignored_single_output();
|
||||
}
|
||||
else {
|
||||
const GMutableSpan sliced_span = span.slice(input_slice_range);
|
||||
offset_params.add_uninitialized_single_output(sliced_span);
|
||||
}
|
||||
}
|
||||
else {
|
||||
const GMutableSpan span = params.uninitialized_single_output(param_index);
|
||||
const GMutableSpan sliced_span = span.slice(input_slice_range);
|
||||
offset_params.add_uninitialized_single_output(sliced_span);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ParamCategory::VectorInput:
|
||||
case ParamCategory::VectorMutable:
|
||||
case ParamCategory::VectorOutput: {
|
||||
BLI_assert_unreachable();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
const GMutableSpan span = params.uninitialized_single_output(param_index);
|
||||
const GMutableSpan sliced_span = span.slice(input_slice_range);
|
||||
offset_params.add_uninitialized_single_output(sliced_span);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ParamCategory::VectorInput:
|
||||
case ParamCategory::VectorMutable:
|
||||
case ParamCategory::VectorOutput: {
|
||||
BLI_assert_unreachable();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this->call(offset_mask, offset_params, context);
|
||||
});
|
||||
this->call(offset_mask, offset_params, context);
|
||||
});
|
||||
}
|
||||
|
||||
std::string MultiFunction::debug_name() const
|
||||
|
|
Loading…
Reference in New Issue