Functions: align chunk sizes in multi-function evaluation

This can improve performance in some circumstances when the processing
loops are vectorized and/or unrolled. I noticed this especially while
working on D16970, where it gave a 10-20% speedup because the
non-vectorized fallback loop is entered much less often.
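
To make the motivation concrete, here is a minimal sketch (not part of this commit) of the kind of kernel that benefits: the main loop is written for 4 values per iteration and a scalar fallback handles the remainder. When the chunk sizes handed to such a kernel are multiples of the width, only the chunks at the very start or end of the overall range can fall into the fallback loop.

#include <cstdint>

/* Hypothetical kernel: processes 4 values per iteration in the main loop and uses a
 * scalar fallback for the remaining 0-3 values. If a chunk size is a multiple of 4,
 * the fallback loop never runs for that chunk. */
static void double_values(const float *src, float *dst, const int64_t size)
{
  int64_t i = 0;
  for (; i + 4 <= size; i += 4) {
    /* Vectorizable / unrolled main loop. */
    dst[i + 0] = src[i + 0] * 2.0f;
    dst[i + 1] = src[i + 1] * 2.0f;
    dst[i + 2] = src[i + 2] * 2.0f;
    dst[i + 3] = src[i + 3] * 2.0f;
  }
  for (; i < size; i++) {
    /* Non-vectorized fallback loop. */
    dst[i] = src[i] * 2.0f;
  }
}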
Jacques Lucke 2023-01-22 00:03:25 +01:00
parent 8d98d5c402
commit 3f1886d0b7
2 changed files with 101 additions and 57 deletions


@@ -71,6 +71,36 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
  function(range);
}

/**
 * Same as #parallel_for but tries to make the sub-range sizes multiples of the given alignment.
 * This can improve performance when the range is processed using vectorized and/or unrolled loops,
 * because the fallback loop that processes remaining values is used less often. A disadvantage of
 * using this instead of #parallel_for is that the size differences between sub-ranges can be
 * larger, which means that work is distributed less evenly.
 */
template<typename Function>
void parallel_for_aligned(const IndexRange range,
                          const int64_t grain_size,
                          const int64_t alignment,
                          const Function &function)
{
  const int64_t global_begin = range.start();
  const int64_t global_end = range.one_after_last();
  const int64_t alignment_mask = ~(alignment - 1);
  parallel_for(range, grain_size, [&](const IndexRange unaligned_range) {
    /* Move the sub-range boundaries down to the next aligned index. The "global" begin and end
     * remain fixed though. */
    const int64_t unaligned_begin = unaligned_range.start();
    const int64_t unaligned_end = unaligned_range.one_after_last();
    const int64_t aligned_begin = std::max(global_begin, unaligned_begin & alignment_mask);
    const int64_t aligned_end = unaligned_end == global_end ?
                                    unaligned_end :
                                    std::max(global_begin, unaligned_end & alignment_mask);
    const IndexRange aligned_range{aligned_begin, aligned_end - aligned_begin};
    function(aligned_range);
  });
}

template<typename Value, typename Function, typename Reduction>
Value parallel_reduce(IndexRange range,
                      int64_t grain_size,
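
For illustration, a hypothetical caller of the new helper could look like the sketch below. The function name, grain size, and alignment are made up, and the includes assume Blender's BLI_task.hh and BLI_index_range.hh headers.

#include <cstdint>

#include "BLI_index_range.hh"
#include "BLI_task.hh"

/* Hypothetical caller: with alignment 8, every sub-range passed to the lambda begins and
 * ends at a multiple of 8, except possibly at the very beginning and end of the whole
 * range, so a kernel that processes 8 values per iteration rarely needs its fallback loop. */
static void double_values_parallel(const float *src, float *dst, const int64_t size)
{
  blender::threading::parallel_for_aligned(
      blender::IndexRange(size), 4096, 8, [&](const blender::IndexRange sub_range) {
        for (const int64_t i : sub_range) {
          dst[i] = src[i] * 2.0f;
        }
      });
}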


@@ -52,6 +52,16 @@ static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask m
  return grain_size;
}

static int64_t compute_alignment(const int64_t grain_size)
{
  if (grain_size <= 512) {
    /* Don't use a number that's too large, or otherwise the work will be split quite unevenly. */
    return 8;
  }
  /* It's not common that more elements are processed in a loop at once. */
  return 32;
}
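
To illustrate the trade-off behind these numbers (the values below are made up): snapping a sub-range boundary down to a multiple of the alignment can shift it by up to alignment - 1 elements, which matters more when the grain size, and therefore the chunk size, is small.

#include <cassert>
#include <cstdint>

/* Round an index down to a multiple of a power-of-two alignment, the same operation
 * parallel_for_aligned performs with its alignment_mask. */
static int64_t align_down(const int64_t value, const int64_t alignment)
{
  return value & ~(alignment - 1);
}

int main()
{
  /* A boundary such as 1003 moves down by at most alignment - 1 elements. */
  assert(align_down(1003, 8) == 1000);  /* Shifted by 3: negligible for any grain size. */
  assert(align_down(1003, 32) == 992);  /* Shifted by 11: a visible imbalance for chunks of a few hundred elements, hence alignment 8 when grain_size <= 512. */
  return 0;
}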
void MultiFunction::call_auto(IndexMask mask, Params params, Context context) const
{
  if (mask.is_empty()) {
@@ -71,71 +81,75 @@ void MultiFunction::call_auto(IndexMask mask, Params params, Context context) co
    return;
  }

  const int64_t alignment = compute_alignment(grain_size);
  threading::parallel_for_aligned(
      mask.index_range(), grain_size, alignment, [&](const IndexRange sub_range) {
        const IndexMask sliced_mask = mask.slice(sub_range);

        if (!hints.allocates_array) {
          /* There is no benefit to changing indices in this case. */
          this->call(sliced_mask, params, context);
          return;
        }

        if (sliced_mask[0] < grain_size) {
          /* The indices are low, no need to offset them. */
          this->call(sliced_mask, params, context);
          return;
        }

        const int64_t input_slice_start = sliced_mask[0];
        const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
        const IndexRange input_slice_range{input_slice_start, input_slice_size};

        Vector<int64_t> offset_mask_indices;
        const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);

        ParamsBuilder offset_params{*this, offset_mask.min_array_size()};

        /* Slice all parameters so that for the actual function call. */
        for (const int param_index : this->param_indices()) {
          const ParamType param_type = this->param_type(param_index);
          switch (param_type.category()) {
            case ParamCategory::SingleInput: {
              const GVArray &varray = params.readonly_single_input(param_index);
              offset_params.add_readonly_single_input(varray.slice(input_slice_range));
              break;
            }
            case ParamCategory::SingleMutable: {
              const GMutableSpan span = params.single_mutable(param_index);
              const GMutableSpan sliced_span = span.slice(input_slice_range);
              offset_params.add_single_mutable(sliced_span);
              break;
            }
            case ParamCategory::SingleOutput: {
              if (bool(signature_ref_->params[param_index].flag &
                       ParamFlag::SupportsUnusedOutput)) {
                const GMutableSpan span = params.uninitialized_single_output_if_required(
                    param_index);
                if (span.is_empty()) {
                  offset_params.add_ignored_single_output();
                }
                else {
                  const GMutableSpan sliced_span = span.slice(input_slice_range);
                  offset_params.add_uninitialized_single_output(sliced_span);
                }
              }
              else {
                const GMutableSpan span = params.uninitialized_single_output(param_index);
                const GMutableSpan sliced_span = span.slice(input_slice_range);
                offset_params.add_uninitialized_single_output(sliced_span);
              }
              break;
            }
            case ParamCategory::VectorInput:
            case ParamCategory::VectorMutable:
            case ParamCategory::VectorOutput: {
              BLI_assert_unreachable();
              break;
            }
          }
        }

        this->call(offset_mask, offset_params, context);
      });
}
std::string MultiFunction::debug_name() const