Functions: devirtualize virtual arrays in simple functions

In some multi-functions (such as a simple add function), the virtual method
calls needed to access array elements add significant overhead. For these
simple functions it makes sense to generate optimized versions for different
types of virtual arrays. This is done by giving the compiler all the information
it needs to devirtualize virtual arrays.

In my benchmark, this speeds up processing large amounts of data with small functions by 2-3x.

This devirtualization should not be done for larger functions, because it increases
compile time and binary size, while providing a negligible performance benefit.
This commit is contained in:
Jacques Lucke 2021-03-22 17:06:02 +01:00
parent 6cec782970
commit 21268ad20a
2 changed files with 98 additions and 8 deletions

View File

@ -144,8 +144,11 @@ template<typename T> class VArray {
}
};
/* A virtual array implementation for a span. */
template<typename T> class VArrayForSpan : public VArray<T> {
/**
* A virtual array implementation for a span. This class is final so that it can be devirtualized
* by the compiler in some cases (e.g. when #devirtualize_varray is used).
*/
template<typename T> class VArrayForSpan final : public VArray<T> {
private:
const T *data_;
@ -171,8 +174,12 @@ template<typename T> class VArrayForSpan : public VArray<T> {
}
};
/* A virtual array implementation that returns the same value for every index. */
template<typename T> class VArrayForSingle : public VArray<T> {
/**
* A virtual array implementation that returns the same value for every index. This class is final
* so that it can be devirtualized by the compiler in some cases (e.g. when #devirtualize_varray is
* used).
*/
template<typename T> class VArrayForSingle final : public VArray<T> {
private:
T value_;
@ -208,4 +215,81 @@ template<typename T> class VArrayForSingle : public VArray<T> {
}
};
/**
* Generate multiple versions of the given function optimized for different virtual arrays.
* One has to be careful with nesting multiple devirtualizations, because that results in an
* exponential number of function instantiations (increasing compile time and binary size).
*
* Generally, this function should only be used when the virtual method call overhead to get an
* element from a virtual array is signifant.
*/
template<typename T, typename Func>
inline void devirtualize_varray(const VArray<T> &varray, const Func &func, bool enable = true)
{
/* Support disabling the devirtualization to simplify benchmarking. */
if (enable) {
if (varray.is_single()) {
/* `VArrayForSingle` can be used for devirtualization, because it is declared `final`. */
const VArrayForSingle<T> varray_single{varray.get_single(), varray.size()};
func(varray_single);
return;
}
if (varray.is_span()) {
/* `VArrayForSpan` can be used for devirtualization, because it is declared `final`. */
const VArrayForSpan<T> varray_span{varray.get_span()};
func(varray_span);
return;
}
}
func(varray);
}
/**
* Same as `devirtualize_varray`, but devirtualizes two virtual arrays at the same time.
* This is better than nesting two calls to `devirtualize_varray`, because it instantiates fewer
* cases.
*/
template<typename T1, typename T2, typename Func>
inline void devirtualize_varray2(const VArray<T1> &varray1,
const VArray<T2> &varray2,
const Func &func,
bool enable = true)
{
/* Support disabling the devirtualization to simplify benchmarking. */
if (enable) {
const bool is_span1 = varray1.is_span();
const bool is_span2 = varray2.is_span();
const bool is_single1 = varray1.is_single();
const bool is_single2 = varray2.is_single();
if (is_span1 && is_span2) {
const VArrayForSpan<T1> varray1_span{varray1.get_span()};
const VArrayForSpan<T2> varray2_span{varray2.get_span()};
func(varray1_span, varray2_span);
return;
}
if (is_span1 && is_single2) {
const VArrayForSpan<T1> varray1_span{varray1.get_span()};
const VArrayForSingle<T2> varray2_single{varray2.get_single(), varray2.size()};
func(varray1_span, varray2_single);
return;
}
if (is_single1 && is_span2) {
const VArrayForSingle<T1> varray1_single{varray1.get_single(), varray1.size()};
const VArrayForSpan<T2> varray2_span{varray2.get_span()};
func(varray1_single, varray2_span);
return;
}
if (is_single1 && is_single2) {
const VArrayForSingle<T1> varray1_single{varray1.get_single(), varray1.size()};
const VArrayForSingle<T2> varray2_single{varray2.get_single(), varray2.size()};
func(varray1_single, varray2_single);
return;
}
}
/* This fallback is used even when one of the inputs could be optimized. It's probably not worth
* it to optimize just one of the inputs, because then the compiler still has to call into
* unknown code, which inhibits many compiler optimizations. */
func(varray1, varray2);
}
} // namespace blender

View File

@ -61,8 +61,11 @@ template<typename In1, typename Out1> class CustomMF_SI_SO : public MultiFunctio
template<typename ElementFuncT> static FunctionT create_function(ElementFuncT element_fn)
{
return [=](IndexMask mask, const VArray<In1> &in1, MutableSpan<Out1> out1) {
mask.foreach_index(
[&](int i) { new (static_cast<void *>(&out1[i])) Out1(element_fn(in1[i])); });
/* Devirtualization results in a 2-3x speedup for some simple functions. */
devirtualize_varray(in1, [&](const auto &in1) {
mask.foreach_index(
[&](int i) { new (static_cast<void *>(&out1[i])) Out1(element_fn(in1[i])); });
});
};
}
@ -111,8 +114,11 @@ class CustomMF_SI_SI_SO : public MultiFunction {
const VArray<In1> &in1,
const VArray<In2> &in2,
MutableSpan<Out1> out1) {
mask.foreach_index(
[&](int i) { new (static_cast<void *>(&out1[i])) Out1(element_fn(in1[i], in2[i])); });
/* Devirtualization results in a 2-3x speedup for some simple functions. */
devirtualize_varray2(in1, in2, [&](const auto &in1, const auto &in2) {
mask.foreach_index(
[&](int i) { new (static_cast<void *>(&out1[i])) Out1(element_fn(in1[i], in2[i])); });
});
};
}