BLI: optimize constructing IndexMask from bits and bools
This patch optimizes `IndexMask::from_bits` by exploiting the fact that many bits can be processed at once, so in many cases one does not have to look at every bit individually. Bits are stored as an array of `BitInt` (aka `uint64_t`), so at least 64 bits can be processed at a time. On some platforms, SIMD allows processing up to 128 bits at once. This can improve performance significantly when large chunks of bits are all set or all unset. As a byproduct, this patch also optimizes `IndexMask::from_bools`, which is now implemented in terms of `IndexMask::from_bits`. The conversion from bools to bits has also been optimized significantly by using SIMD intrinsics. Pull Request: #126888
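For context, a minimal usage sketch of the affected API (not part of the patch; the function name and setup below are illustrative), based on the entry points touched by this commit:

```cpp
#include "BLI_index_mask.hh"
#include "BLI_vector.hh"

/* Collect the indices of all selected elements into an IndexMask. */
blender::IndexMask mask_from_selection(const blender::Vector<bool> &selection,
                                       blender::IndexMaskMemory &memory)
{
  /* Internally this now converts the bools to bits with SIMD and then extracts
   * ranges of set bits instead of testing every index individually. */
  return blender::IndexMask::from_bools(selection, memory);
}
```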
parent 491df9df6f
commit 66adedbd78
source/blender/blenlib/BLI_bit_bool_conversion.hh (new file, 25 lines)
@@ -0,0 +1,25 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include "BLI_bit_span.hh"
#include "BLI_span.hh"

namespace blender::bits {

/**
 * Converts the bools to bits and `or`s them into the given bits. For pure conversion, the bits
 * should therefore be zero initialized before they are passed into this function.
 *
 * \param allowed_overshoot: How many bools/bits can be read/written after the end of the given
 * spans. This can help with performance because the internal algorithm can process many elements
 * at once.
 *
 * \return True if any of the checked bools were true (this also includes the bools in the
 * overshoot).
 */
bool or_bools_into_bits(Span<bool> bools, MutableBitSpan r_bits, int64_t allowed_overshoot = 0);

}  // namespace blender::bits
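A small usage sketch for the helper declared above (illustrative only; it mirrors the unit test added later in this commit): the destination bits are zero-initialized first, so the OR amounts to a pure conversion.

```cpp
#include "BLI_bit_bool_conversion.hh"
#include "BLI_bit_vector.hh"
#include "BLI_vector.hh"

void convert_example()
{
  blender::Vector<bool> bools(100, false);
  bools[42] = true;

  /* Zero-initialized destination, so the result is a pure bool -> bit conversion. */
  blender::BitVector<> bits(bools.size(), false);
  const bool any_true = blender::bits::or_bools_into_bits(bools, bits);
  /* `any_true` is true and only bit 42 is set. */
  (void)any_true;
}
```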
source/blender/blenlib/BLI_bit_span_to_index_ranges.hh (new file, 136 lines)
@@ -0,0 +1,136 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include <limits>

#include "BLI_bit_span.hh"
#include "BLI_index_ranges_builder.hh"
#include "BLI_math_bits.h"
#include "BLI_simd.hh"

namespace blender::bits {

/**
 * Extracts index ranges from the given bits.
 * For example 00111011 would result in two ranges: [2-4], [6-7].
 *
 * It's especially optimized to handle cases where there are very many or very few set bits.
 */
template<typename IntT>
inline void bits_to_index_ranges(const BitSpan bits, IndexRangesBuilder<IntT> &builder)
{
  if (bits.is_empty()) {
    return;
  }

  /* -1 because we also need to store the end of the last range. */
  constexpr int64_t max_index = std::numeric_limits<IntT>::max() - 1;
  UNUSED_VARS_NDEBUG(max_index);

  auto append_range = [&](const IndexRange range) {
    BLI_assert(range.last() <= max_index);
    builder.add_range(IntT(range.start()), IntT(range.one_after_last()));
  };

  auto process_bit_int = [&](const BitInt value,
                             const int64_t start_bit,
                             const int64_t bits_num,
                             const int64_t start) {
    /* The bits in the mask are the ones we should look at. */
    const BitInt mask = mask_range_bits(start_bit, bits_num);
    const BitInt masked_value = mask & value;
    if (masked_value == 0) {
      /* Do nothing. */
      return;
    }
    if (masked_value == mask) {
      /* All bits are set. */
      append_range(IndexRange::from_begin_size(start, bits_num));
      return;
    }
    const int64_t bit_i_to_output_offset = start - start_bit;

    /* Iterate over ranges of 1s. For example, if the bits are 0b000111110001111000, the loop
     * below requires two iterations. The worst case for this is when there are very many small
     * ranges of 1s (e.g. 0b10101010101). So far it seems like the overhead of detecting such
     * cases is higher than the potential benefit of using another algorithm. */
    BitInt current_value = masked_value;
    while (current_value != 0) {
      /* Find start of next range of 1s. */
      const int64_t first_set_bit_i = int64_t(bitscan_forward_uint64(current_value));
      /* This mask is used to find the end of the 1s range. */
      const BitInt find_unset_value = ~(current_value | mask_first_n_bits(first_set_bit_i) |
                                        ~mask);
      if (find_unset_value == 0) {
        /* In this case, the range of 1s extends to the end of the current integer. */
        const IndexRange range = IndexRange::from_begin_end(first_set_bit_i, start_bit + bits_num);
        append_range(range.shift(bit_i_to_output_offset));
        break;
      }
      /* Find the index of the first 0 after the range of 1s. */
      const int64_t next_unset_bit_i = int64_t(bitscan_forward_uint64(find_unset_value));
      /* Store the range of 1s. */
      const IndexRange range = IndexRange::from_begin_end(first_set_bit_i, next_unset_bit_i);
      append_range(range.shift(bit_i_to_output_offset));
      /* Remove the processed range of 1s so that it is ignored in the next iteration. */
      current_value &= ~mask_first_n_bits(next_unset_bit_i);
    }
    return;
  };

  const BitInt *data = bits.data();
  const IndexRange bit_range = bits.bit_range();

  /* As much as possible we want to process full 64-bit integers at once. However, the bit-span
   * may not be aligned, so it's first split up into aligned and unaligned sections. */
  const AlignedIndexRanges ranges = split_index_range_by_alignment(bit_range, bits::BitsPerInt);

  /* Process the first (partial) integer in the bit-span. */
  if (!ranges.prefix.is_empty()) {
    const BitInt first_int = *int_containing_bit(data, bit_range.start());
    process_bit_int(
        first_int, BitInt(ranges.prefix.start()) & BitIndexMask, ranges.prefix.size(), 0);
  }

  /* Process all the full integers in the bit-span. */
  if (!ranges.aligned.is_empty()) {
    const BitInt *start = int_containing_bit(data, ranges.aligned.start());
    const int64_t ints_to_check = ranges.aligned.size() / BitsPerInt;
    int64_t int_i = 0;

    /* Checking for chunks of 0 bits can be sped up quite significantly using intrinsics. */
#if BLI_HAVE_SSE2
    for (; int_i + 1 < ints_to_check; int_i += 2) {
      /* Loads the next 128 bits. */
      const __m128i group = _mm_loadu_si128(reinterpret_cast<const __m128i *>(start + int_i));
      /* Checks if all the 128 bits are zero. */
      const bool group_is_zero = _mm_testz_si128(group, group);
      if (group_is_zero) {
        continue;
      }
      /* If at least one of them is not zero, process the two integers separately. */
      for (int j = 0; j < 2; j++) {
        process_bit_int(
            start[int_i + j], 0, BitsPerInt, ranges.prefix.size() + (int_i + j) * BitsPerInt);
      }
    }
#endif

    /* Process the remaining integers. */
    for (; int_i < ints_to_check; int_i++) {
      process_bit_int(start[int_i], 0, BitsPerInt, ranges.prefix.size() + int_i * BitsPerInt);
    }
  }

  /* Process the final few bits that don't fill up a full integer. */
  if (!ranges.suffix.is_empty()) {
    const BitInt last_int = *int_containing_bit(data, bit_range.last());
    process_bit_int(
        last_int, 0, ranges.suffix.size(), ranges.prefix.size() + ranges.aligned.size());
  }
}

}  // namespace blender::bits
@@ -38,6 +38,7 @@
 */

#include "BLI_allocator.hh"
#include "BLI_bit_bool_conversion.hh"
#include "BLI_bit_span.hh"
#include "BLI_span.hh"

@@ -158,10 +159,8 @@ class BitVector {
  explicit BitVector(const Span<bool> values, Allocator allocator = {})
      : BitVector(NoExceptConstructor(), allocator)
  {
    this->resize(values.size());
    for (const int64_t i : this->index_range()) {
      (*this)[i].set(values[i]);
    }
    this->resize(values.size(), false);
    or_bools_into_bits(values, *this);
  }

  ~BitVector()

@@ -199,6 +198,14 @@ class BitVector {
    return size_in_bits_;
  }

  /**
   * Number of bits that can be stored before the BitVector has to grow.
   */
  int64_t capacity() const
  {
    return capacity_in_bits_;
  }

  bool is_empty() const
  {
    return size_in_bits_ == 0;
@@ -12,6 +12,7 @@
#include "BLI_bit_span.hh"
#include "BLI_function_ref.hh"
#include "BLI_index_mask_fwd.hh"
#include "BLI_index_ranges_builder_fwd.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_offset_span.hh"
#include "BLI_task.hh"

@@ -255,6 +256,20 @@ class IndexMask : private IndexMaskData {
                                 GrainSize grain_size,
                                 IndexMaskMemory &memory,
                                 Fn &&predicate);
  /**
   * This is a variant of #from_predicate that is more efficient if the predicate for many indices
   * can be evaluated at once.
   *
   * \param batch_predicate: A function that finds indices in a certain segment that should become
   * part of the mask. To efficiently handle ranges, this function uses #IndexRangesBuilder. It
   * returns an index offset that should be applied to each index in the builder.
   */
  static IndexMask from_batch_predicate(
      const IndexMask &universe,
      GrainSize grain_size,
      IndexMaskMemory &memory,
      FunctionRef<int64_t(const IndexMaskSegment &universe_segment,
                          IndexRangesBuilder<int16_t> &builder)> batch_predicate);
  /** Sorts all indices from #universe into the different output masks. */
  template<typename T, typename Fn>
  static void from_groups(const IndexMask &universe,
source/blender/blenlib/BLI_index_ranges_builder.hh (new file, 121 lines)
@@ -0,0 +1,121 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include <array>

#include "BLI_index_range.hh"
#include "BLI_index_ranges_builder_fwd.hh"
#include "BLI_span.hh"
#include "BLI_utility_mixins.hh"

namespace blender {

/**
 * A data structure that is designed to allow building many index ranges efficiently.
 *
 * One first has to add individual indices or ranges in ascending order. Internally, consecutive
 * indices and ranges are automatically joined.
 *
 * \note This data structure has a pre-defined capacity and can not automatically grow once that
 * capacity is reached. Use #IndexRangesBuilderBuffer to control the capacity.
 */
template<typename T> class IndexRangesBuilder : NonCopyable, NonMovable {
 private:
  /** The current pointer into #data_. It's changed whenever a new range starts. */
  T *c_;
  /** Structure: [-1, start, end, start, end, start, end, ...]. */
  MutableSpan<T> data_;

 public:
  IndexRangesBuilder(MutableSpan<T> data) : data_(data)
  {
    static_assert(std::is_signed_v<T>);
    /* Set the first value to -1 so that when the first index is added, it is detected as the
     * start of a new range. */
    data_[0] = -1;
    c_ = data_.data();
  }

  /** Add a new index. It has to be larger than any previously added index. */
  bool add(const T index)
  {
    return this->add_range(index, index + 1);
  }

  /**
   * Add a range of indices. It has to start after any previously added index.
   * By design, this is branchless and requires O(1) time.
   */
  bool add_range(const T start, const T end)
  {
    /* Indices have to be added in ascending order. */
    BLI_assert(start >= *c_);
    BLI_assert(start >= 0);
    BLI_assert(start < end);

    const bool is_new_range = start > *c_;

    /* Check that the capacity is not exceeded. */
    BLI_assert(!is_new_range || this->size() < this->capacity());

    /* This is designed to either append to the last range or start a new range.
     * It is intentionally branchless for more predictable performance on unpredictable data. */
    c_ += is_new_range;
    *c_ = start;
    c_ += is_new_range;
    *c_ = end;

    return is_new_range;
  }

  /** Number of collected ranges. */
  int64_t size() const
  {
    return (c_ - data_.data()) / 2;
  }

  /** How many ranges this container can hold at most. */
  int64_t capacity() const
  {
    return data_.size() / 2;
  }

  /** True if there are no ranges yet. */
  bool is_empty() const
  {
    return c_ == data_.data();
  }

  IndexRange index_range() const
  {
    return IndexRange(this->size());
  }

  /** Get the i-th collected #IndexRange. */
  IndexRange operator[](const int64_t i) const
  {
    const T start = data_[size_t(1) + 2 * size_t(i)];
    const T end = data_[size_t(2) + 2 * size_t(i)];
    return IndexRange::from_begin_end(start, end);
  }

  static constexpr int64_t buffer_size_for_ranges_num(const int64_t ranges_num)
  {
    /* Two values for each range (start, end) and the dummy prefix value. */
    return ranges_num * 2 + 1;
  }
};

template<typename T, int64_t MaxRangesNum> struct IndexRangesBuilderBuffer {
  std::array<T, size_t(IndexRangesBuilder<T>::buffer_size_for_ranges_num(MaxRangesNum))> data;

  operator MutableSpan<T>()
  {
    return this->data;
  }
};

}  // namespace blender
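As a quick illustration of the intended usage (a sketch based on the class above and on the test file added later in this commit, not part of the patch itself): consecutive indices are merged into one range automatically, while a gap starts a new range.

```cpp
#include "BLI_index_ranges_builder.hh"

void ranges_builder_example()
{
  /* Backing storage for up to 10 ranges. */
  blender::IndexRangesBuilderBuffer<int, 10> buffer;
  blender::IndexRangesBuilder<int> builder{buffer};

  builder.add(3);           /* Starts the range [3, 4). */
  builder.add(4);           /* Extends it to [3, 5). */
  builder.add_range(8, 12); /* Starts a new range [8, 12). */

  /* builder.size() == 2: {[3, 5), [8, 12)}. */
}
```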
source/blender/blenlib/BLI_index_ranges_builder_fwd.hh (new file, 13 lines)
@@ -0,0 +1,13 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include "BLI_utildefines.h"

namespace blender {

template<typename T> class IndexRangesBuilder;

}
@@ -8,6 +8,7 @@
 * \ingroup bli
 */

#include "BLI_build_config.h"
#include "BLI_math_inline.h"
#include "BLI_utildefines.h"

@@ -42,10 +43,15 @@ MINLINE unsigned int bitscan_reverse_clear_uint(unsigned int *a);
MINLINE unsigned int highest_order_bit_uint(unsigned int n);
MINLINE unsigned short highest_order_bit_s(unsigned short n);

#ifdef __GNUC__
#if COMPILER_GCC || COMPILER_CLANG
# define count_bits_i(i) __builtin_popcount(i)
# define count_bits_uint64(i) __builtin_popcountll(i)
#elif COMPILER_MSVC
# define count_bits_i(i) __popcnt(i)
# define count_bits_uint64(i) __popcnt64(i)
#else
MINLINE int count_bits_i(unsigned int n);
MINLINE int count_bits_uint64(uint64_t a);
#endif

MINLINE int float_as_int(float f);
@@ -48,6 +48,7 @@ set(SRC
  intern/array_utils.cc
  intern/astar.c
  intern/atomic_disjoint_set.cc
  intern/bit_bool_conversion.cc
  intern/bit_ref.cc
  intern/bit_span.cc
  intern/bitmap.c

@@ -188,10 +189,12 @@ set(SRC
  BLI_astar.h
  BLI_atomic_disjoint_set.hh
  BLI_binary_search.hh
  BLI_bit_bool_conversion.hh
  BLI_bit_group_vector.hh
  BLI_bit_ref.hh
  BLI_bit_span.hh
  BLI_bit_span_ops.hh
  BLI_bit_span_to_index_ranges.hh
  BLI_bit_vector.hh
  BLI_bitmap.h
  BLI_bitmap_draw_2d.h

@@ -264,6 +267,8 @@ set(SRC
  BLI_index_mask_expression.hh
  BLI_index_mask_fwd.hh
  BLI_index_range.hh
  BLI_index_ranges_builder.hh
  BLI_index_ranges_builder_fwd.hh
  BLI_inplace_priority_queue.hh
  BLI_iterator.h
  BLI_jitter_2d.h

@@ -540,6 +545,7 @@ if(WITH_GTESTS)
  tests/BLI_index_mask_expression_test.cc
  tests/BLI_index_mask_test.cc
  tests/BLI_index_range_test.cc
  tests/BLI_index_ranges_builder_test.cc
  tests/BLI_inplace_priority_queue_test.cc
  tests/BLI_kdopbvh_test.cc
  tests/BLI_kdtree_test.cc
source/blender/blenlib/intern/bit_bool_conversion.cc (new file, 69 lines)
@@ -0,0 +1,69 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#include "BLI_bit_bool_conversion.hh"
#include "BLI_simd.hh"
#include "BLI_timeit.hh"

namespace blender::bits {

bool or_bools_into_bits(const Span<bool> bools,
                        MutableBitSpan r_bits,
                        const int64_t allowed_overshoot)
{
  BLI_assert(r_bits.size() >= bools.size());
  if (bools.is_empty()) {
    return false;
  }

  int64_t bool_i = 0;
  const bool *bools_ = bools.data();

  bool any_true = false;

  /* Conversion from bools to bits can be way faster with intrinsics. That's because instead of
   * processing one element at a time, we can process 16 at once. */
#if BLI_HAVE_SSE2
  /* Initialize zeros so that we can compare against them. */
  const __m128i zero_bytes = _mm_set1_epi8(0);
  int64_t iteration_end = bools.size();
  if (iteration_end % 16 > 0) {
    if (allowed_overshoot >= 16) {
      iteration_end = (iteration_end + 16) & ~15;
    }
  }
  /* Iterate over chunks of booleans. */
  for (; bool_i + 16 <= iteration_end; bool_i += 16) {
    /* Load 16 bools at once. */
    const __m128i group = _mm_loadu_si128(reinterpret_cast<const __m128i *>(bools_ + bool_i));
    /* Compare them all against zero. The result is a mask of the form [0x00, 0xff, 0xff, ...]. */
    const __m128i is_false_byte_mask = _mm_cmpeq_epi8(group, zero_bytes);
    /* Compress the byte-mask into a bit mask. This takes one bit from each byte. */
    const uint16_t is_false_mask = _mm_movemask_epi8(is_false_byte_mask);
    /* Now we have a bit mask where each bit corresponds to an input boolean. */
    const uint16_t is_true_mask = ~is_false_mask;
    any_true |= is_true_mask != 0;

    const int start_bit_in_int = (r_bits.bit_range().start() + bool_i) & BitIndexMask;
    BitInt *start_bit_int = int_containing_bit(r_bits.data(), r_bits.bit_range().start() + bool_i);
    *start_bit_int |= BitInt(is_true_mask) << start_bit_in_int;

    if (start_bit_in_int > BitsPerInt - 16) {
      /* It's possible that the bits need to be inserted into two consecutive integers. */
      start_bit_int[1] |= BitInt(is_true_mask) >> (64 - start_bit_in_int);
    }
  }
#endif

  /* Process remaining bools. */
  for (; bool_i < bools.size(); bool_i++) {
    if (bools_[bool_i]) {
      r_bits[bool_i].set();
      any_true = true;
    }
  }
  return any_true;
}

}  // namespace blender::bits
@@ -7,11 +7,17 @@
#include <mutex>

#include "BLI_array.hh"
#include "BLI_array_utils.hh"
#include "BLI_bit_bool_conversion.hh"
#include "BLI_bit_span_ops.hh"
#include "BLI_bit_span_to_index_ranges.hh"
#include "BLI_bit_vector.hh"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_index_mask.hh"
#include "BLI_index_mask_expression.hh"
#include "BLI_index_ranges_builder.hh"
#include "BLI_math_base.hh"
#include "BLI_math_bits.h"
#include "BLI_set.hh"
#include "BLI_sort.hh"
#include "BLI_task.hh"

@@ -442,13 +448,134 @@ IndexMask IndexMask::from_bits(const BitSpan bits, IndexMaskMemory &memory)
  return IndexMask::from_bits(bits.index_range(), bits, memory);
}

static int64_t from_bits_batch_predicate(const IndexMaskSegment universe_segment,
                                         IndexRangesBuilder<int16_t> &builder,
                                         const BitSpan bits_slice)
{
  const int64_t segment_start = universe_segment[0];
  if (unique_sorted_indices::non_empty_is_range(universe_segment.base_span())) {
    bits::bits_to_index_ranges<int16_t>(bits_slice, builder);
  }
  else {
    /* If the universe is not a range, we need to create a new bit span first. In it, bits
     * that are not part of the universe are set to 0. */
    const int64_t segment_end = universe_segment.last() + 1;
    BitVector<max_segment_size> local_bits(segment_end - segment_start, false);
    for (const int64_t i : universe_segment.index_range()) {
      const int64_t global_index = universe_segment[i];
      const int64_t local_index = global_index - segment_start;
      BLI_assert(local_index < max_segment_size);
      /* It's not great to handle each index separately instead of working with bigger
       * chunks, but that works well enough for now. */
      if (bits_slice[local_index]) {
        local_bits[local_index].set();
      }
    }
    bits::bits_to_index_ranges<int16_t>(local_bits, builder);
  }
  return segment_start;
}

IndexMask IndexMask::from_bits(const IndexMask &universe,
                               const BitSpan bits,
                               IndexMaskMemory &memory)
{
  return IndexMask::from_predicate(universe, GrainSize(1024), memory, [bits](const int64_t index) {
    return bits[index].test();
  });
  BLI_assert(bits.size() >= universe.min_array_size());
  /* Use #from_batch_predicate because we can process many bits at once. */
  return IndexMask::from_batch_predicate(
      universe,
      GrainSize(max_segment_size),
      memory,
      [&](const IndexMaskSegment universe_segment, IndexRangesBuilder<int16_t> &builder) {
        const IndexRange slice = IndexRange::from_begin_end_inclusive(universe_segment[0],
                                                                      universe_segment.last());
        return from_bits_batch_predicate(universe_segment, builder, bits.slice(slice));
      });
}

static void segments_from_batch_predicate(
    const IndexMaskSegment universe_segment,
    LinearAllocator<> &allocator,
    const FunctionRef<int64_t(const IndexMaskSegment &universe_segment,
                              IndexRangesBuilder<int16_t> &builder)> batch_predicate,
    Vector<IndexMaskSegment, 16> &r_segments)
{
  IndexRangesBuilderBuffer<int16_t, max_segment_size> builder_buffer;
  IndexRangesBuilder<int16_t> builder{builder_buffer};
  const int64_t segment_shift = batch_predicate(universe_segment, builder);
  if (builder.is_empty()) {
    return;
  }
  const Span<int16_t> static_indices = get_static_indices_array();

  /* This threshold trades off the number of segments and the number of ranges. In some cases,
   * masks with fewer segments can be built more efficiently, but when iterating over a mask it
   * may be beneficial to have more ranges if that means that there are more ranges which can be
   * processed more efficiently. This could be exposed to the caller in the future. */
  constexpr int64_t threshold = 64;
  int64_t next_range_to_process = 0;
  int64_t skipped_indices_num = 0;

  /* Builds an index mask segment from a bunch of smaller ranges (which could be individual
   * indices). */
  auto consolidate_skipped_ranges = [&](int64_t end_range_i) {
    if (skipped_indices_num == 0) {
      return;
    }
    MutableSpan<int16_t> indices = allocator.allocate_array<int16_t>(skipped_indices_num);
    int64_t counter = 0;
    for (const int64_t i : IndexRange::from_begin_end(next_range_to_process, end_range_i)) {
      const IndexRange range = builder[i];
      array_utils::fill_index_range(indices.slice(counter, range.size()), int16_t(range.first()));
      counter += range.size();
    }
    r_segments.append(IndexMaskSegment{segment_shift, indices});
  };

  for (const int64_t i : builder.index_range()) {
    const IndexRange range = builder[i];
    if (range.size() > threshold || builder.size() == 1) {
      consolidate_skipped_ranges(i);
      r_segments.append(IndexMaskSegment{segment_shift, static_indices.slice(range)});
      next_range_to_process = i + 1;
      skipped_indices_num = 0;
    }
    else {
      skipped_indices_num += range.size();
    }
  }
  consolidate_skipped_ranges(builder.size());
}

IndexMask IndexMask::from_batch_predicate(
    const IndexMask &universe,
    GrainSize grain_size,
    IndexMaskMemory &memory,
    const FunctionRef<int64_t(const IndexMaskSegment &universe_segment,
                              IndexRangesBuilder<int16_t> &builder)> batch_predicate)
{
  if (universe.is_empty()) {
    return {};
  }

  Vector<IndexMaskSegment, 16> segments;
  if (universe.size() <= grain_size.value) {
    for (const int64_t segment_i : IndexRange(universe.segments_num())) {
      const IndexMaskSegment universe_segment = universe.segment(segment_i);
      segments_from_batch_predicate(universe_segment, memory, batch_predicate, segments);
    }
  }
  else {
    ParallelSegmentsCollector segments_collector;
    universe.foreach_segment(grain_size, [&](const IndexMaskSegment universe_segment) {
      ParallelSegmentsCollector::LocalData &data = segments_collector.data_by_thread.local();
      segments_from_batch_predicate(
          universe_segment, data.allocator, batch_predicate, data.segments);
    });
    segments_collector.reduce(memory, segments);
  }

  return IndexMask::from_segments(segments, memory);
}

IndexMask IndexMask::from_bools(Span<bool> bools, IndexMaskMemory &memory)

@@ -465,16 +592,38 @@ IndexMask IndexMask::from_bools(const IndexMask &universe,
                                Span<bool> bools,
                                IndexMaskMemory &memory)
{
  return IndexMask::from_predicate(
      universe, GrainSize(1024), memory, [bools](const int64_t index) { return bools[index]; });
  BLI_assert(bools.size() >= universe.min_array_size());
  return IndexMask::from_batch_predicate(
      universe,
      GrainSize(max_segment_size),
      memory,
      [&](const IndexMaskSegment universe_segment,
          IndexRangesBuilder<int16_t> &builder) -> int64_t {
        const IndexRange slice = IndexRange::from_begin_end_inclusive(universe_segment[0],
                                                                      universe_segment.last());
        /* +16 to allow for some overshoot when converting bools to bits. */
        BitVector<max_segment_size + 16> bits;
        bits.resize(slice.size(), false);
        const int64_t allowed_overshoot = std::min<int64_t>(bits.capacity() - slice.size(),
                                                            bools.size() - slice.one_after_last());
        const bool any_true = bits::or_bools_into_bits(
            bools.slice(slice), bits, allowed_overshoot);
        if (!any_true) {
          return 0;
        }
        return from_bits_batch_predicate(universe_segment, builder, bits);
      });
  BitVector bits(bools);
  return IndexMask::from_bits(universe, bits, memory);
}

IndexMask IndexMask::from_bools_inverse(const IndexMask &universe,
                                        Span<bool> bools,
                                        IndexMaskMemory &memory)
{
  return IndexMask::from_predicate(
      universe, GrainSize(1024), memory, [bools](const int64_t index) { return !bools[index]; });
  BitVector bits(bools);
  bits::invert(bits);
  return IndexMask::from_bits(universe, bits, memory);
}

IndexMask IndexMask::from_bools(const IndexMask &universe,
@@ -114,7 +114,7 @@ MINLINE unsigned short highest_order_bit_s(unsigned short n)
  return (unsigned short)(n - (n >> 1));
}

#ifndef __GNUC__
#if !(COMPILER_GCC || COMPILER_CLANG || COMPILER_MSVC)
MINLINE int count_bits_i(unsigned int i)
{
  /* variable-precision SWAR algorithm. */
@@ -122,6 +122,10 @@ MINLINE int count_bits_i(unsigned int i)
  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
  return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
}
MINLINE int count_bits_uint64(const uint64_t a)
{
  return count_bits_i((uint32_t)a) + count_bits_i((uint32_t)(a >> 32));
}
#endif

MINLINE int float_as_int(float f)
@@ -6,6 +6,8 @@
#include "BLI_bit_span.hh"
#include "BLI_bit_span_ops.hh"
#include "BLI_bit_span_to_index_ranges.hh"
#include "BLI_bit_vector.hh"
#include "BLI_timeit.hh"
#include "BLI_vector.hh"

@@ -248,4 +250,61 @@ TEST(bit_span, ForEach1)
  EXPECT_EQ(indices_test.as_span(), Span({24, 33, 82}));
}

TEST(bit_span, or_bools_into_bits)
{
  {
    Vector<bool> bools(5, false);
    bools[2] = true;
    BitVector<> bits(bools.size());
    bits[0].set();
    bits::or_bools_into_bits(bools, bits);
    EXPECT_TRUE(bits[0]);
    EXPECT_FALSE(bits[1]);
    EXPECT_TRUE(bits[2]);
    EXPECT_FALSE(bits[3]);
    EXPECT_FALSE(bits[4]);
  }
  {
    Vector<bool> bools(100, true);
    BitVector<> bits(1000, false);
    bits::or_bools_into_bits(bools,
                             MutableBitSpan(bits).slice(IndexRange::from_begin_size(100, 500)));
    EXPECT_FALSE(bits[99]);
    EXPECT_TRUE(bits[100]);
    EXPECT_TRUE(bits[101]);
    EXPECT_TRUE(bits[199]);
    EXPECT_FALSE(bits[200]);
  }
}

TEST(bit_span, to_index_ranges_small)
{
  BitVector<> bits(10, false);
  bits[2].set();
  bits[3].set();
  bits[4].set();
  bits[6].set();
  bits[7].set();

  IndexRangesBuilderBuffer<int, 10> builder_buffer;
  IndexRangesBuilder<int> builder(builder_buffer);
  bits_to_index_ranges(bits, builder);

  EXPECT_EQ(builder.size(), 2);
  EXPECT_EQ(builder[0], IndexRange::from_begin_end_inclusive(2, 4));
  EXPECT_EQ(builder[1], IndexRange::from_begin_end_inclusive(6, 7));
}

TEST(bit_span, to_index_ranges_all_ones)
{
  BitVector<> bits(10000, true);

  IndexRangesBuilderBuffer<int, 10> builder_buffer;
  IndexRangesBuilder<int> builder(builder_buffer);
  bits_to_index_ranges(BitSpan(bits).take_back(8765), builder);

  EXPECT_EQ(builder.size(), 1);
  EXPECT_EQ(builder[0], IndexRange(8765));
}

}  // namespace blender::bits::tests
@@ -4,8 +4,13 @@
#include "testing/testing.h"

#include <fmt/format.h>

#include "BLI_array.hh"
#include "BLI_bit_span_ops.hh"
#include "BLI_bit_vector.hh"
#include "BLI_index_mask.hh"
#include "BLI_index_ranges_builder.hh"
#include "BLI_rand.hh"
#include "BLI_set.hh"
#include "BLI_timeit.hh"

@@ -27,7 +32,7 @@ TEST(index_mask, IndicesToMask)
  EXPECT_EQ(mask.bounds(), IndexRange::from_begin_end_inclusive(5, 101000));
}

TEST(index_mask, FromBits)
TEST(index_mask, FromBitsManual)
{
  IndexMaskMemory memory;
  const uint64_t bits =

@@ -42,6 +47,269 @@ TEST(index_mask, FromBits)
  EXPECT_EQ(indices[4], 9);
}

TEST(index_mask, FromBitsSimple)
{
  IndexMaskMemory memory;
  BitVector bit_vec(200, true);
  bit_vec[0].reset();
  bit_vec[100].reset();
  const IndexMask mask = IndexMask::from_bits(bit_vec, memory);

  EXPECT_EQ(mask.size(), 198);
  EXPECT_FALSE(mask.contains(0));
  EXPECT_FALSE(mask.contains(100));
}

TEST(index_mask, FromBitsWithUniverse)
{
  IndexMaskMemory memory;
  BitVector bit_vec(200, true);
  bit_vec[6].reset();
  bit_vec[100].reset();

  const IndexMask universe = IndexMask::from_indices<int>({4, 6, 7, 8, 9, 100, 101, 102}, memory);
  const IndexMask mask = IndexMask::from_bits(universe, bit_vec, memory);
  EXPECT_EQ(mask.size(), 6);
  EXPECT_EQ(mask[0], 4);
  EXPECT_EQ(mask[1], 7);
  EXPECT_EQ(mask[2], 8);
  EXPECT_EQ(mask[3], 9);
  EXPECT_EQ(mask[4], 101);
  EXPECT_EQ(mask[5], 102);
}

TEST(index_mask, FromBitsSparse)
{
  BitVector bit_vec(100'000, false);
  bit_vec[5].set();
  bit_vec[100].set();
  bit_vec[200].set();
  bit_vec[500].set();
  bit_vec[800].set();
  bit_vec[10'000].set();
  bit_vec[10'002].set();
  bit_vec[50'000].set();
  bit_vec[70'000].set();
  bit_vec[70'002].set();
  bit_vec[70'004].set();
  bit_vec[70'005].set();

  IndexMaskMemory memory;
  const IndexMask mask = IndexMask::from_bits(bit_vec, memory);
  EXPECT_EQ(mask.size(), 12);
  EXPECT_EQ(mask[0], 5);
  EXPECT_EQ(mask[1], 100);
  EXPECT_EQ(mask[2], 200);
  EXPECT_EQ(mask[3], 500);
  EXPECT_EQ(mask[4], 800);
  EXPECT_EQ(mask[5], 10'000);
  EXPECT_EQ(mask[6], 10'002);
  EXPECT_EQ(mask[7], 50'000);
  EXPECT_EQ(mask[8], 70'000);
  EXPECT_EQ(mask[9], 70'002);
  EXPECT_EQ(mask[10], 70'004);
  EXPECT_EQ(mask[11], 70'005);
}

TEST(index_mask, FromBoolsSparse)
{
  Vector<bool> bools(10'000'000, false);
  bools[5] = true;
  bools[100] = true;
  bools[200] = true;
  bools[500] = true;
  bools[800] = true;
  bools[10'000] = true;
  bools[10'002] = true;
  bools[50'000] = true;
  bools[70'000] = true;
  bools[70'002] = true;
  bools[70'004] = true;
  bools[70'005] = true;

  IndexMaskMemory memory;
  const IndexMask mask = IndexMask::from_bools(bools, memory);
  EXPECT_EQ(mask.size(), 12);
  EXPECT_EQ(mask[0], 5);
  EXPECT_EQ(mask[1], 100);
  EXPECT_EQ(mask[2], 200);
  EXPECT_EQ(mask[3], 500);
  EXPECT_EQ(mask[4], 800);
  EXPECT_EQ(mask[5], 10'000);
  EXPECT_EQ(mask[6], 10'002);
  EXPECT_EQ(mask[7], 50'000);
  EXPECT_EQ(mask[8], 70'000);
  EXPECT_EQ(mask[9], 70'002);
  EXPECT_EQ(mask[10], 70'004);
  EXPECT_EQ(mask[11], 70'005);
}

TEST(index_mask, FromBitsAlternating)
{
  int64_t size = 100'000;
  BitVector<> bits(size, false);
  for (const int64_t i : IndexRange(size / 2)) {
    bits[i * 2].set();
  }

  IndexMaskMemory memory;
  const IndexMask mask = IndexMask::from_bits(bits, memory);
  EXPECT_EQ(mask.size(), size / 2);
  EXPECT_EQ(mask[0], 0);
  EXPECT_EQ(mask[1], 2);
  EXPECT_EQ(mask[2], 4);
  EXPECT_EQ(mask[3], 6);
}

static BitVector<> build_bits_with_uniform_distribution(const int bits_num,
                                                        const int set_bits_num,
                                                        const uint32_t seed = 0)
{
  if (set_bits_num > bits_num / 2) {
    BitVector bit_vec = build_bits_with_uniform_distribution(bits_num, bits_num - set_bits_num);
    bits::invert(bit_vec);
    return bit_vec;
  }
  BitVector bit_vec(bits_num, false);
  RandomNumberGenerator rng(seed);
  int counter = 0;
  while (counter < set_bits_num) {
    const int i = rng.get_int32(int(bits_num));
    MutableBitRef bit = bit_vec[i];
    if (!bit) {
      bit.set();
      counter++;
    }
  }
  return bit_vec;
}

/* The benchmark is too slow to run during normal test runs. */
#if 0

static void benchmark_uniform_bit_distribution(const int bits_num,
                                               const int set_bits_num,
                                               const int iterations)
{
  const bool machine_readable = true;
  BitVector bit_vec = build_bits_with_uniform_distribution(bits_num, set_bits_num);
  std::locale loc("en_US.UTF-8");
  timeit::Nanoseconds min_duration{INT64_MAX};
  for ([[maybe_unused]] const int64_t i : IndexRange(iterations)) {
    IndexMaskMemory memory;
    timeit::TimePoint start = timeit::Clock::now();
    const IndexMask mask = IndexMask::from_bits(bit_vec, memory);
    timeit::TimePoint end = timeit::Clock::now();
    const timeit::Nanoseconds duration = end - start;
    // const double ms = double(duration.count()) / 1'000'000.0;
    // std::cout << fmt::format(loc, "{:15L} / {:L}: {:.4} ms\n", set_bits_num, bits_num, ms);
    min_duration = std::min(min_duration, duration);
    EXPECT_EQ(mask.size(), set_bits_num);
  }
  const double ms = double(min_duration.count()) / 1'000'000.0;
  if (machine_readable) {
    std::cout << fmt::format("{},{:.6}\n", set_bits_num, ms);
  }
  else {
    std::cout << fmt::format(loc, "{:15L} / {:L}: {:.4} ms\n", set_bits_num, bits_num, ms);
  }
}

TEST(index_mask, FromBitsBenchmark)
{
  const int size = 100'000'000;
  const int iterations = 5;
  Vector<int> set_bit_nums;
  set_bit_nums.append(0);
  int current = 100;
  while (current < size / 2) {
    set_bit_nums.append(current);
    set_bit_nums.append(size - current);
    current = int(current * 1.3);
  }
  set_bit_nums.append(size);
  std::sort(set_bit_nums.begin(), set_bit_nums.end());

  for (const int set_bit_num : set_bit_nums) {
    benchmark_uniform_bit_distribution(size, set_bit_num, iterations);
  }
}

/* Benchmark. */
#endif

TEST(index_mask, FromBitsFuzzy)
{
  RandomNumberGenerator rng(0);
  for ([[maybe_unused]] const int64_t iteration : IndexRange(10)) {
    const int size = rng.get_int32(100'000) + 1;
    const int set_bits = rng.get_int32(size);

    /* Remove part of the beginning and end of the bits to test unaligned bit spans. */
    const int64_t slice_inset = rng.get_int32(size / 10);
    const IndexRange slice = IndexRange::from_begin_end(slice_inset, size - slice_inset);

    BitVector<> bits(size, false);
    for ([[maybe_unused]] const int64_t set_bit_i : IndexRange(set_bits)) {
      int index = rng.get_int32(size);
      /* This is like linear probing which results in a somewhat random distribution but also
       * leads to having longer ranges every now and then. */
      while (true) {
        if (!bits[index]) {
          bits[index].set();
          break;
        }
        index = (index + 1) % size;
      }
    }

    IndexMaskMemory memory;

    /* The universe is partially a range and partially only even/odd indices.
     * For example: [1, 2, 3, 4, 6, 8, 10, 12]. */
    const int64_t slice_half_size = slice.size() / 2;
    const IndexMask universe = IndexMask::from_union(
                                   IndexRange(slice_half_size),
                                   IndexMask::from_repeating(
                                       IndexRange(1), slice_half_size, 2, slice_half_size, memory),
                                   memory)
                                   .slice_content(slice.index_range());

    const IndexMask mask = IndexMask::from_bits(universe, BitSpan(bits).slice(slice), memory);

    BitVector<> inverted_bits{bits};
    bits::invert(inverted_bits);
    const IndexMask inverted_mask = IndexMask::from_bits(
        universe, BitSpan(inverted_bits).slice(slice), memory);

    const IndexMask double_inverted_mask = inverted_mask.complement(universe, memory);
    EXPECT_EQ(mask, double_inverted_mask);
  }
}

TEST(index_mask, FromBitsDense)
{
  BitVector bit_vec(1'000, true);
  bit_vec[5].reset();
  bit_vec[200].reset();
  bit_vec[201].reset();
  bit_vec[500].reset();
  bit_vec[502].reset();
  bit_vec[504].reset();
  bit_vec[506].reset();

  IndexMaskMemory memory;
  const IndexMask mask = IndexMask::from_bits(bit_vec, memory);
  EXPECT_EQ(mask.size(), 993);
  EXPECT_FALSE(mask.contains(5));
  EXPECT_FALSE(mask.contains(200));
  EXPECT_FALSE(mask.contains(201));
  EXPECT_FALSE(mask.contains(500));
  EXPECT_FALSE(mask.contains(502));
  EXPECT_FALSE(mask.contains(504));
  EXPECT_FALSE(mask.contains(506));
}

TEST(index_mask, FromSize)
{
  {
source/blender/blenlib/tests/BLI_index_ranges_builder_test.cc (new file, 80 lines)
@@ -0,0 +1,80 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: Apache-2.0 */

#include "testing/testing.h"

#include "BLI_index_ranges_builder.hh"

namespace blender::tests {

TEST(index_ranges_builder, Empty)
{
  IndexRangesBuilderBuffer<int, 10> builder_buffer;
  IndexRangesBuilder<int> builder{builder_buffer};
  EXPECT_EQ(builder.size(), 0);
  EXPECT_TRUE(builder.is_empty());
}

TEST(index_ranges_builder, Single)
{
  {
    IndexRangesBuilderBuffer<int, 10> builder_buffer;
    IndexRangesBuilder<int> builder{builder_buffer};
    builder.add(0);
    EXPECT_EQ(builder.size(), 1);
    EXPECT_EQ(builder[0], IndexRange::from_begin_size(0, 1));
  }
  {
    IndexRangesBuilderBuffer<int, 10> builder_buffer;
    IndexRangesBuilder<int> builder{builder_buffer};
    builder.add(10);
    EXPECT_EQ(builder.size(), 1);
    EXPECT_EQ(builder[0], IndexRange::from_begin_size(10, 1));
  }
}

TEST(index_ranges_builder, Multiple)
{
  IndexRangesBuilderBuffer<int, 10> builder_buffer;
  IndexRangesBuilder<int> builder{builder_buffer};
  builder.add(3);
  builder.add(4);
  builder.add(5);
  builder.add(8);
  builder.add(9);
  builder.add_range(20, 100);
  builder.add_range(100, 130);

  EXPECT_EQ(builder.size(), 3);
  EXPECT_EQ(builder[0], IndexRange::from_begin_end_inclusive(3, 5));
  EXPECT_EQ(builder[1], IndexRange::from_begin_end_inclusive(8, 9));
  EXPECT_EQ(builder[2], IndexRange::from_begin_end(20, 130));
}

TEST(index_ranges_builder, Full)
{
  {
    IndexRangesBuilderBuffer<int, 1> builder_buffer;
    IndexRangesBuilder<int> builder{builder_buffer};
    builder.add(10);
    builder.add(11);
    builder.add(12);

    EXPECT_EQ(builder.size(), 1);
    EXPECT_EQ(builder[0], IndexRange::from_begin_end_inclusive(10, 12));
  }
  {
    IndexRangesBuilderBuffer<int, 3> builder_buffer;
    IndexRangesBuilder<int> builder{builder_buffer};
    builder.add(100);
    builder.add(200);
    builder.add(300);
    EXPECT_EQ(builder.size(), 3);
    EXPECT_EQ(builder[0], IndexRange::from_begin_size(100, 1));
    EXPECT_EQ(builder[1], IndexRange::from_begin_size(200, 1));
    EXPECT_EQ(builder[2], IndexRange::from_begin_size(300, 1));
  }
}

}  // namespace blender::tests
@@ -50,3 +50,33 @@ TEST(math_bits, BitscanReverseClearUint)
  EXPECT_EQ(bitscan_reverse_clear_uint(&a), 31);
  EXPECT_EQ(a, 0);
}

TEST(math_bits, PopCount)
{
  {
    EXPECT_EQ(count_bits_i(0), 0);
    EXPECT_EQ(count_bits_uint64(0), 0);
  }
  {
    const int value = (1 << 0) | (1 << 5) | (1 << 7);
    const int count_32 = count_bits_i(value);
    const int count_64 = count_bits_uint64(value);
    EXPECT_EQ(count_32, 3);
    EXPECT_EQ(count_64, 3);
  }
  {
    const uint64_t value = (uint64_t(1) << 0) | (uint64_t(1) << 50);
    const int count = count_bits_uint64(value);
    EXPECT_EQ(count, 2);
  }
  {
    const int value = -1;
    const int count = count_bits_i(value);
    EXPECT_EQ(count, 32);
  }
  {
    const uint64_t value = -1;
    const int count = count_bits_uint64(value);
    EXPECT_EQ(count, 64);
  }
}