LatticeDeform: Performance

This patch improves the single core performance of the lattice deform.

1. Prefetch the deform verts during initialization. This data is constant for
   each inner loop, so hoisting it reduces the complexity of the inner loop and
   frees up CPU resources for other optimizations.
2. Prefetch the Lattice instance, which is constant during evaluation. Although
   the performance gain isn't noticeable, it is always good to free some space
   in the branch prediction tables.
3. Remove branching in all loops by no longer skipping a loop when it has no
   effect. The checks in the inner loops detected whether a loop had no effect
   on the final result and then continued to the next loop. This made the
   branches unpredictable and caused a lot of mispredictions. For smaller
   inner loops it is always better to replace unpredictable if statements
   with branchless code patterns.
4. Use SSE2 instructions when available.

This gives a 50% performance increase, measured on an
Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz with GCC 9.3.
Also check other compilers.

Before:
```
performance_no_dvert_10000 (4 ms)
performance_no_dvert_100000 (30 ms)
performance_no_dvert_1000000 (268 ms)
performance_no_dvert_10000000 (2637 ms)
```

After:
```
performance_no_dvert_10000 (3 ms)
performance_no_dvert_100000 (21 ms)
performance_no_dvert_1000000 (180 ms)
performance_no_dvert_10000000 (1756 ms)
```

Reviewed By: Campbell Barton

Differential Revision: https://developer.blender.org/D9087
This commit is contained in:
Jeroen Bakker 2020-10-26 11:01:18 +01:00 committed by Jeroen Bakker
parent 2ddecfffc3
commit 042143440d
Notes: blender-bot 2023-02-14 11:21:43 +01:00
Referenced by commit bb5c4de009, Fix T82962: Crash changing lattice resolution with Vertex Group
Referenced by commit 8b836f6894, Fix (unreported) buffer-overflow in new lattice code.
Referenced by commit c0beeeb5de, Fix buffer-overflow in lattice deform evaluation
Referenced by issue #82962, Crash when changing lattice resolution with Vertex Group
3 changed files with 214 additions and 76 deletions

View File

@ -718,6 +718,7 @@ if(WITH_GTESTS)
set(TEST_SRC
intern/armature_test.cc
intern/fcurve_test.cc
intern/lattice_deform_test.cc
)
set(TEST_INC
../editors/include

View File

@ -49,14 +49,24 @@
#include "BKE_deform.h"
#ifdef __SSE2__
# include <emmintrin.h>
#endif
/* -------------------------------------------------------------------- */
/** \name Lattice Deform API
* \{ */
typedef struct LatticeDeformData {
const Object *object;
float *latticedata;
/* Convert from object space to deform space */
float latmat[4][4];
/* Cached reference to the lattice to use for evaluation. When in edit mode this attribute
* is set to the edit mode lattice. */
const Lattice *lt;
/* Preprocessed lattice points (converted to deform space). */
float *latticedata;
/* Prefetched DeformWeights of the lattice. */
float *lattice_weights;
} LatticeDeformData;
LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Object *ob)
@ -72,6 +82,7 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
float fu, fv, fw;
int u, v, w;
float *latticedata;
float *lattice_weights = NULL;
float latmat[4][4];
LatticeDeformData *lattice_deform_data;
@ -80,8 +91,10 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
}
bp = lt->def;
fp = latticedata = MEM_mallocN(sizeof(float[3]) * lt->pntsu * lt->pntsv * lt->pntsw,
"latticedata");
const int32_t num_points = lt->pntsu * lt->pntsv * lt->pntsw;
/* We allocate one additional float for SSE2 optimizations. Without this
* the SSE2 instructions for the last item would read in unallocated memory. */
fp = latticedata = MEM_mallocN(sizeof(float[3]) * num_points + sizeof(float), "latticedata");
/* for example with a particle system: (ob == NULL) */
if (ob == NULL) {
@ -100,6 +113,20 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
invert_m4_m4(imat, latmat);
}
/* Prefetch lattice deform group weights.
 *
 * `lt->vgroup` names a vertex group of the lattice, so the group index has to be looked up on
 * the lattice object `oblatt`. The deformed object `ob` must not be used here: it does not own
 * that group and it may be NULL (for example with a particle system). */
int defgrp_index = -1;
const MDeformVert *dvert = BKE_lattice_deform_verts_get(oblatt);
if (lt->vgroup[0] && dvert) {
  defgrp_index = BKE_object_defgroup_name_index(oblatt, lt->vgroup);
  if (defgrp_index != -1) {
    /* One weight per lattice point, stored in the same order as the deform verts. */
    lattice_weights = MEM_malloc_arrayN(sizeof(float), num_points, "lattice_weights");
    for (int index = 0; index < num_points; index++) {
      lattice_weights[index] = BKE_defvert_find_weight(dvert + index, defgrp_index);
    }
  }
}
for (w = 0, fw = lt->fw; w < lt->pntsw; w++, fw += lt->dw) {
for (v = 0, fv = lt->fv; v < lt->pntsv; v++, fv += lt->dv) {
for (u = 0, fu = lt->fu; u < lt->pntsu; u++, bp++, co += 3, fp += 3, fu += lt->du) {
@ -121,7 +148,8 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
lattice_deform_data = MEM_mallocN(sizeof(LatticeDeformData), "Lattice Deform Data");
lattice_deform_data->latticedata = latticedata;
lattice_deform_data->object = oblatt;
lattice_deform_data->lattice_weights = lattice_weights;
lattice_deform_data->lt = lt;
copy_m4_m4(lattice_deform_data->latmat, latmat);
return lattice_deform_data;
@ -131,30 +159,21 @@ void BKE_lattice_deform_data_eval_co(LatticeDeformData *lattice_deform_data,
float co[3],
float weight)
{
const Object *ob = lattice_deform_data->object;
Lattice *lt = ob->data;
float *latticedata = lattice_deform_data->latticedata;
float *lattice_weights = lattice_deform_data->lattice_weights;
BLI_assert(latticedata);
const Lattice *lt = lattice_deform_data->lt;
float u, v, w, tu[4], tv[4], tw[4];
float vec[3];
int idx_w, idx_v, idx_u;
int ui, vi, wi, uu, vv, ww;
/* vgroup influence */
int defgrp_index = -1;
float co_prev[3], weight_blend = 0.0f;
const MDeformVert *dvert = BKE_lattice_deform_verts_get(ob);
float *__restrict latticedata = lattice_deform_data->latticedata;
if (lt->editlatt) {
lt = lt->editlatt->latt;
}
if (latticedata == NULL) {
return;
}
if (lt->vgroup[0] && dvert) {
defgrp_index = BKE_object_defgroup_name_index(ob, lt->vgroup);
copy_v3_v3(co_prev, co);
}
copy_v3_v3(co_prev, co);
#ifdef __SSE2__
/* Fill the register lane by lane: `co_prev` only holds three floats, so a 4-wide
 * `_mm_loadu_ps` would read one float past the end of the array. The fourth lane is
 * never copied back into `co`, so it is simply set to zero. */
__m128 co_vec = _mm_setr_ps(co_prev[0], co_prev[1], co_prev[2], 0.0f);
#endif
/* co is in local coords, treat with latmat */
mul_v3_m4v3(vec, lattice_deform_data->latmat, co);
@ -197,67 +216,47 @@ void BKE_lattice_deform_data_eval_co(LatticeDeformData *lattice_deform_data,
wi = 0;
}
const int w_stride = lt->pntsu * lt->pntsv;
const int idx_w_max = (lt->pntsw - 1) * lt->pntsu * lt->pntsv;
const int v_stride = lt->pntsu;
const int idx_v_max = (lt->pntsv - 1) * lt->pntsu;
const int idx_u_max = (lt->pntsu - 1);
for (ww = wi - 1; ww <= wi + 2; ww++) {
w = tw[ww - wi + 1];
if (w != 0.0f) {
if (ww > 0) {
if (ww < lt->pntsw) {
idx_w = ww * lt->pntsu * lt->pntsv;
w = weight * tw[ww - wi + 1];
idx_w = CLAMPIS(ww * w_stride, 0, idx_w_max);
for (vv = vi - 1; vv <= vi + 2; vv++) {
v = w * tv[vv - vi + 1];
idx_v = CLAMPIS(vv * v_stride, 0, idx_v_max);
for (uu = ui - 1; uu <= ui + 2; uu++) {
u = v * tu[uu - ui + 1];
idx_u = CLAMPIS(uu, 0, idx_u_max);
const int idx = idx_w + idx_v + idx_u;
#ifdef __SSE2__
{
__m128 weight_vec = _mm_set1_ps(u);
/* This will load one extra element, this is ok because
* we ignore that part of register anyway.
*/
__m128 lattice_vec = _mm_loadu_ps(&latticedata[idx * 3]);
co_vec = _mm_add_ps(co_vec, _mm_mul_ps(lattice_vec, weight_vec));
}
else {
idx_w = (lt->pntsw - 1) * lt->pntsu * lt->pntsv;
}
}
else {
idx_w = 0;
}
for (vv = vi - 1; vv <= vi + 2; vv++) {
v = w * tv[vv - vi + 1];
if (v != 0.0f) {
if (vv > 0) {
if (vv < lt->pntsv) {
idx_v = idx_w + vv * lt->pntsu;
}
else {
idx_v = idx_w + (lt->pntsv - 1) * lt->pntsu;
}
}
else {
idx_v = idx_w;
}
for (uu = ui - 1; uu <= ui + 2; uu++) {
u = weight * v * tu[uu - ui + 1];
if (u != 0.0f) {
if (uu > 0) {
if (uu < lt->pntsu) {
idx_u = idx_v + uu;
}
else {
idx_u = idx_v + (lt->pntsu - 1);
}
}
else {
idx_u = idx_v;
}
madd_v3_v3fl(co, &latticedata[idx_u * 3], u);
if (defgrp_index != -1) {
weight_blend += (u * BKE_defvert_find_weight(dvert + idx_u, defgrp_index));
}
}
}
#else
madd_v3_v3fl(co, &latticedata[idx * 3], u);
#endif
if (lattice_weights) {
weight_blend += (u * lattice_weights[idx]);
}
}
}
}
#ifdef __SSE2__
{
copy_v3_v3(co, (float *)&co_vec);
}
#endif
if (defgrp_index != -1) {
if (lattice_weights) {
interp_v3_v3v3(co, co_prev, co, weight_blend);
}
}

View File

@ -0,0 +1,138 @@
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* The Original Code is Copyright (C) 2020 by Blender Foundation.
*/
#include "testing/testing.h"
#include "BKE_idtype.h"
#include "BKE_lattice.h"
#include "MEM_guardedalloc.h"
#include "DNA_lattice_types.h"
#include "DNA_mesh_types.h"
#include "DNA_object_types.h"
#include "BLI_rand.hh"
namespace blender::bke::tests {
/** State shared by the init, deform and free helpers of the lattice deform tests. */
struct LatticeDeformTestContext {
/* Lattice data-block; assigned as the data of `ob_lattice` during init. */
Lattice lattice;
/* Object of type OB_LATTICE that is passed as the lattice object when creating `ldd`. */
Object ob_lattice;
/* Mesh data-block; assigned as the data of `ob_mesh` during init. */
Mesh mesh;
/* Object of type OB_MESH that is passed as the second object when creating `ldd`. */
Object ob_mesh;
/* Array of coordinates that get deformed in place; allocated during init. */
float (*coords)[3];
/* Lattice deform data created during init and destroyed by the free helper. */
LatticeDeformData *ldd;
};
/**
 * Prepare \a ctx for a deform run over \a num_items coordinates.
 *
 * Allocates `ctx->coords` and fills it with values drawn from \a rng, initializes the lattice
 * and mesh data-blocks together with the objects that reference them, and creates `ctx->ldd`.
 * Everything set up here is released again by #test_lattice_deform_free.
 */
static void test_lattice_deform_init(LatticeDeformTestContext *ctx,
                                     RandomNumberGenerator *rng,
                                     int32_t num_items)
{
  /* Generate random input data between -5 and 5. */
  ctx->coords = (float(*)[3])MEM_malloc_arrayN(sizeof(float[3]), num_items, __func__);
  /* The index is signed like `num_items`: comparing an unsigned index against a signed count
   * would turn a negative count into a huge bound and write far past the allocation. */
  for (int32_t index = 0; index < num_items; index++) {
    ctx->coords[index][0] = (rng->get_float() - 0.5f) * 10;
    ctx->coords[index][1] = (rng->get_float() - 0.5f) * 10;
    ctx->coords[index][2] = (rng->get_float() - 0.5f) * 10;
  }

  /* Lattice object that owns the lattice data. */
  IDType_ID_LT.init_data(&ctx->lattice.id);
  IDType_ID_OB.init_data(&ctx->ob_lattice.id);
  ctx->ob_lattice.type = OB_LATTICE;
  ctx->ob_lattice.data = &ctx->lattice;

  /* Mesh object that owns the mesh data. */
  IDType_ID_OB.init_data(&ctx->ob_mesh.id);
  IDType_ID_ME.init_data(&ctx->mesh.id);
  ctx->ob_mesh.type = OB_MESH;
  ctx->ob_mesh.data = &ctx->mesh;

  ctx->ldd = BKE_lattice_deform_data_create(&ctx->ob_lattice, &ctx->ob_mesh);
}
/**
 * Evaluate the lattice deform for the first \a num_items coordinates of `ctx->coords`,
 * using a weight of 1.0 for every coordinate. The coordinates are passed by pointer, so the
 * evaluation operates directly on the stored array.
 */
static void test_lattice_deform(LatticeDeformTestContext *ctx, int32_t num_items)
{
  float(*coords)[3] = ctx->coords;
  int item = 0;
  while (item < num_items) {
    BKE_lattice_deform_data_eval_co(ctx->ldd, coords[item], 1.0f);
    item++;
  }
}
/**
 * Release everything that #test_lattice_deform_init set up in \a ctx: the lattice deform data,
 * the coordinate array and the data owned by the four data-blocks.
 */
static void test_lattice_deform_free(LatticeDeformTestContext *ctx)
{
  /* Evaluation data and input buffer. */
  BKE_lattice_deform_data_destroy(ctx->ldd);
  MEM_freeN(ctx->coords);

  /* Lattice data-block and the object referencing it. */
  ID *lattice_id = &ctx->lattice.id;
  ID *ob_lattice_id = &ctx->ob_lattice.id;
  IDType_ID_LT.free_data(lattice_id);
  IDType_ID_OB.free_data(ob_lattice_id);

  /* Mesh object and its mesh data-block. */
  ID *ob_mesh_id = &ctx->ob_mesh.id;
  ID *mesh_id = &ctx->mesh.id;
  IDType_ID_OB.free_data(ob_mesh_id);
  IDType_ID_ME.free_data(mesh_id);
}
TEST(lattice_deform_performance, performance_no_dvert_1)
{
const int32_t num_items = 1;
LatticeDeformTestContext ctx = {0};
RandomNumberGenerator rng;
test_lattice_deform_init(&ctx, &rng, num_items);
test_lattice_deform(&ctx, num_items);
test_lattice_deform_free(&ctx);
}
TEST(lattice_deform_performance, performance_no_dvert_1000)
{
const int32_t num_items = 1000;
LatticeDeformTestContext ctx = {0};
RandomNumberGenerator rng;
test_lattice_deform_init(&ctx, &rng, num_items);
test_lattice_deform(&ctx, num_items);
test_lattice_deform_free(&ctx);
}
TEST(lattice_deform_performance, performance_no_dvert_10000)
{
const int32_t num_items = 10000;
LatticeDeformTestContext ctx = {0};
RandomNumberGenerator rng;
test_lattice_deform_init(&ctx, &rng, num_items);
test_lattice_deform(&ctx, num_items);
test_lattice_deform_free(&ctx);
}
TEST(lattice_deform_performance, performance_no_dvert_100000)
{
const int32_t num_items = 100000;
LatticeDeformTestContext ctx = {0};
RandomNumberGenerator rng;
test_lattice_deform_init(&ctx, &rng, num_items);
test_lattice_deform(&ctx, num_items);
test_lattice_deform_free(&ctx);
}
TEST(lattice_deform_performance, performance_no_dvert_1000000)
{
const int32_t num_items = 1000000;
LatticeDeformTestContext ctx = {0};
RandomNumberGenerator rng;
test_lattice_deform_init(&ctx, &rng, num_items);
test_lattice_deform(&ctx, num_items);
test_lattice_deform_free(&ctx);
}
TEST(lattice_deform_performance, performance_no_dvert_10000000)
{
const int32_t num_items = 10000000;
LatticeDeformTestContext ctx = {0};
RandomNumberGenerator rng;
test_lattice_deform_init(&ctx, &rng, num_items);
test_lattice_deform(&ctx, num_items);
test_lattice_deform_free(&ctx);
}
} // namespace blender::bke::tests