Refactor: use 'BLI_task_parallel_range' in Draw Cache

One drawback of trying to predict the number of threads that will be
used in the `task_graph` is that the actual number is only known once
the threads are already running.

Using `BLI_task_parallel_range` instead lets the task scheduler choose
the best thread distribution, with per-thread results merged through
`parallel_reduce`.
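
For reference, this is the shape of the reduction pattern behind the change, as a minimal self-contained sketch of the `BLI_task.h` API. The summing task, the function names, and the 1024 threshold are illustrative only, not taken from this commit:

```c
#include "BLI_task.h"

/* Each worker thread gets its own copy of `settings.userdata_chunk`;
 * `func_reduce` folds pairs of those copies back together at the end. */
static void accumulate_fn(void *__restrict userdata,
                          const int i,
                          const TaskParallelTLS *__restrict tls)
{
  const float *values = (const float *)userdata;
  float *partial = (float *)tls->userdata_chunk; /* Thread-local copy. */
  *partial += values[i];
}

static void reduce_fn(const void *__restrict userdata,
                      void *__restrict chunk_join,
                      void *__restrict chunk)
{
  (void)userdata;
  /* Merge one thread's partial result into another's. */
  *(float *)chunk_join += *(float *)chunk;
}

static float sum_parallel(const float *values, const int len)
{
  float sum = 0.0f; /* Template for thread-local copies, and final destination. */

  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);
  settings.userdata_chunk = &sum;
  settings.userdata_chunk_size = sizeof(sum);
  settings.func_reduce = reduce_fn;
  settings.min_iter_per_thread = 1024; /* Skip threading for tiny ranges. */

  BLI_task_parallel_range(0, len, (void *)values, accumulate_fn, &settings);
  return sum;
}
```

The draw-cache code below follows the same shape: the extractor scratch data is the `userdata_chunk`, and `extract_task_reduce` plays the role of `reduce_fn`.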

The benefit is most evident on hardware with fewer cores.

This is the result on a 4-core laptop:
| | before | after |
|---|---|---|
| large_mesh_editing | Average: 5.203638 FPS | Average: 5.398925 FPS |
| | rdata 15ms iter 43ms (frame 193ms) | rdata 14ms iter 36ms (frame 187ms) |

Differential Revision: https://developer.blender.org/D11558
Author: Germano Cavalcante
Date: 2021-06-10 11:13:01 -03:00
Commit: 0eb9351296 (parent: 2330cec2c6)

9 changed files with 32 additions and 35 deletions


@@ -50,7 +50,7 @@
 # include "PIL_time_utildefines.h"
 #endif
-#define CHUNK_SIZE 1024
+#define MIM_RANGE_LEN 1024
 namespace blender::draw {
@@ -439,18 +439,18 @@ static void extract_task_range_run(void *__restrict taskdata)
   const eMRIterType iter_type = data->iter_type;
   const bool is_mesh = data->mr->extract_type != MR_EXTRACT_BMESH;
+  size_t userdata_chunk_size = data->extractors->data_size_total();
+  char *userdata_chunk = new char[userdata_chunk_size];
   TaskParallelSettings settings;
   BLI_parallel_range_settings_defaults(&settings);
-  settings.func_reduce = extract_task_reduce;
-  settings.min_iter_per_thread = CHUNK_SIZE;
   settings.use_threading = data->use_threading;
+  settings.userdata_chunk = userdata_chunk;
+  settings.userdata_chunk_size = userdata_chunk_size;
+  settings.func_reduce = extract_task_reduce;
+  settings.min_iter_per_thread = MIM_RANGE_LEN;
-  size_t chunk_size = data->extractors->data_size_total();
-  char *chunk = new char[chunk_size];
-  extract_init(data->mr, data->cache, *data->extractors, data->mbc, (void *)chunk);
-  settings.userdata_chunk = chunk;
-  settings.userdata_chunk_size = chunk_size;
+  extract_init(data->mr, data->cache, *data->extractors, data->mbc, (void *)userdata_chunk);
   if (iter_type & MR_ITER_LOOPTRI) {
     extract_task_range_run_iter(data->mr, data->extractors, MR_ITER_LOOPTRI, is_mesh, &settings);
@@ -465,14 +465,14 @@ static void extract_task_range_run(void *__restrict taskdata)
     extract_task_range_run_iter(data->mr, data->extractors, MR_ITER_LVERT, is_mesh, &settings);
   }
-  extract_finish(data->mr, data->cache, *data->extractors, (void *)chunk);
-  delete[] chunk;
+  extract_finish(data->mr, data->cache, *data->extractors, (void *)userdata_chunk);
+  delete[] userdata_chunk;
 }
 /** \} */
 /* ---------------------------------------------------------------------- */
-/** \name Extract Single Thread
+/** \name Extract In Parallel Ranges
  * \{ */
 static struct TaskNode *extract_task_node_create(struct TaskGraph *task_graph,
@@ -705,7 +705,7 @@ static void mesh_buffer_cache_create_requested(struct TaskGraph *task_graph,
       task_graph, mr, iter_type, data_flag);
   /* Simple heuristic. */
-  const bool use_thread = (mr->loop_len + mr->loop_loose_len) > CHUNK_SIZE;
+  const bool use_thread = (mr->loop_len + mr->loop_loose_len) > MIM_RANGE_LEN;
   if (use_thread) {
     /* First run the requested extractors that do not support asynchronous ranges. */
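
Pieced together, the hunks above leave `extract_task_range_run` in roughly the following shape. This is a reassembly from the diff for readability; the cast of `taskdata` and the elided iterator branches are assumptions from context, not shown in the hunks:

```cpp
static void extract_task_range_run(void *__restrict taskdata)
{
  ExtractTaskData *data = (ExtractTaskData *)taskdata; /* Assumed from context. */
  const eMRIterType iter_type = data->iter_type;
  const bool is_mesh = data->mr->extract_type != MR_EXTRACT_BMESH;

  /* One scratch block per worker thread, merged by `extract_task_reduce`. */
  size_t userdata_chunk_size = data->extractors->data_size_total();
  char *userdata_chunk = new char[userdata_chunk_size];

  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);
  settings.use_threading = data->use_threading;
  settings.userdata_chunk = userdata_chunk;
  settings.userdata_chunk_size = userdata_chunk_size;
  settings.func_reduce = extract_task_reduce;
  settings.min_iter_per_thread = MIM_RANGE_LEN;

  extract_init(data->mr, data->cache, *data->extractors, data->mbc, (void *)userdata_chunk);

  if (iter_type & MR_ITER_LOOPTRI) {
    extract_task_range_run_iter(data->mr, data->extractors, MR_ITER_LOOPTRI, is_mesh, &settings);
  }
  /* ... likewise for the remaining iterator types, ending with MR_ITER_LVERT ... */

  extract_finish(data->mr, data->cache, *data->extractors, (void *)userdata_chunk);
  delete[] userdata_chunk;
}
```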


@@ -194,8 +194,7 @@ typedef void(ExtractFinishFn)(const MeshRenderData *mr,
                               struct MeshBatchCache *cache,
                               void *buffer,
                               void *data);
-typedef void(ExtractTaskInitFn)(void *userdata, void *r_task_userdata);
-typedef void(ExtractTaskFinishFn)(void *userdata, void *task_userdata);
+typedef void(ExtractTaskReduceFn)(void *userdata, void *task_userdata);
 typedef struct MeshExtract {
   /** Executed on main thread and return user data for iteration functions. */
@@ -210,7 +209,7 @@ typedef struct MeshExtract {
   ExtractLVertBMeshFn *iter_lvert_bm;
   ExtractLVertMeshFn *iter_lvert_mesh;
   /** Executed on one worker thread after all elements iterations. */
-  ExtractTaskFinishFn *task_reduce;
+  ExtractTaskReduceFn *task_reduce;
   ExtractFinishFn *finish;
   /** Used to request common data. */
   eMRDataType data_type;


@@ -41,7 +41,7 @@
 #define MAX_TIMER_NAME 32
 #define MAX_NESTED_TIMER 8
-#define CHUNK_SIZE 8
+#define MIM_RANGE_LEN 8
 #define GPU_TIMER_FALLOFF 0.1
 typedef struct DRWTimer {
@@ -82,7 +82,7 @@ void DRW_stats_begin(void)
   if (DTP.is_recording && DTP.timers == NULL) {
     DTP.chunk_count = 1;
-    DTP.timer_count = DTP.chunk_count * CHUNK_SIZE;
+    DTP.timer_count = DTP.chunk_count * MIM_RANGE_LEN;
     DTP.timers = MEM_callocN(sizeof(DRWTimer) * DTP.timer_count, "DRWTimer stack");
   }
   else if (!DTP.is_recording && DTP.timers != NULL) {
@@ -99,7 +99,7 @@ static DRWTimer *drw_stats_timer_get(void)
   if (UNLIKELY(DTP.timer_increment >= DTP.timer_count)) {
     /* Resize the stack. */
     DTP.chunk_count++;
-    DTP.timer_count = DTP.chunk_count * CHUNK_SIZE;
+    DTP.timer_count = DTP.chunk_count * MIM_RANGE_LEN;
     DTP.timers = MEM_recallocN(DTP.timers, sizeof(DRWTimer) * DTP.timer_count);
   }


@@ -138,11 +138,11 @@ static void extract_lines_iter_ledge_mesh(const MeshRenderData *mr,
   GPU_indexbuf_set_line_restart(elb, e_index);
 }
-static void extract_lines_task_finish(void *_userdata_to, void *_userdata_from)
+static void extract_lines_task_reduce(void *_userdata_to, void *_userdata_from)
 {
   GPUIndexBufBuilder *elb_to = static_cast<GPUIndexBufBuilder *>(_userdata_to);
   GPUIndexBufBuilder *elb_from = static_cast<GPUIndexBufBuilder *>(_userdata_from);
-  GPU_indexbuf_join_copies(elb_to, elb_from);
+  GPU_indexbuf_join(elb_to, elb_from);
 }
 static void extract_lines_finish(const MeshRenderData *UNUSED(mr),
@@ -163,7 +163,7 @@ constexpr MeshExtract create_extractor_lines()
   extractor.iter_poly_mesh = extract_lines_iter_poly_mesh;
   extractor.iter_ledge_bm = extract_lines_iter_ledge_bm;
   extractor.iter_ledge_mesh = extract_lines_iter_ledge_mesh;
-  extractor.task_reduce = extract_lines_task_finish;
+  extractor.task_reduce = extract_lines_task_reduce;
   extractor.finish = extract_lines_finish;
   extractor.data_type = MR_DATA_NONE;
   extractor.data_size = sizeof(GPUIndexBufBuilder);
@@ -208,7 +208,7 @@ constexpr MeshExtract create_extractor_lines_with_lines_loose()
   extractor.iter_poly_mesh = extract_lines_iter_poly_mesh;
   extractor.iter_ledge_bm = extract_lines_iter_ledge_bm;
   extractor.iter_ledge_mesh = extract_lines_iter_ledge_mesh;
-  extractor.task_reduce = extract_lines_task_finish;
+  extractor.task_reduce = extract_lines_task_reduce;
   extractor.finish = extract_lines_with_lines_loose_finish;
   extractor.data_type = MR_DATA_NONE;
   extractor.data_size = sizeof(GPUIndexBufBuilder);


@@ -137,11 +137,11 @@ static void extract_points_iter_lvert_mesh(const MeshRenderData *mr,
   vert_set_mesh(elb, mr, mr->lverts[lvert_index], offset + lvert_index);
 }
-static void extract_points_task_finish(void *_userdata_to, void *_userdata_from)
+static void extract_points_task_reduce(void *_userdata_to, void *_userdata_from)
 {
   GPUIndexBufBuilder *elb_to = static_cast<GPUIndexBufBuilder *>(_userdata_to);
   GPUIndexBufBuilder *elb_from = static_cast<GPUIndexBufBuilder *>(_userdata_from);
-  GPU_indexbuf_join_copies(elb_to, elb_from);
+  GPU_indexbuf_join(elb_to, elb_from);
 }
 static void extract_points_finish(const MeshRenderData *UNUSED(mr),
@@ -164,7 +164,7 @@ constexpr MeshExtract create_extractor_points()
   extractor.iter_ledge_mesh = extract_points_iter_ledge_mesh;
   extractor.iter_lvert_bm = extract_points_iter_lvert_bm;
   extractor.iter_lvert_mesh = extract_points_iter_lvert_mesh;
-  extractor.task_reduce = extract_points_task_finish;
+  extractor.task_reduce = extract_points_task_reduce;
   extractor.finish = extract_points_finish;
   extractor.use_threading = true;
   extractor.data_type = MR_DATA_NONE;


@@ -211,11 +211,11 @@ static void extract_tris_single_mat_iter_looptri_mesh(const MeshRenderData *mr,
   }
 }
-static void extract_tris_single_mat_task_finish(void *_userdata_to, void *_userdata_from)
+static void extract_tris_single_mat_task_reduce(void *_userdata_to, void *_userdata_from)
 {
   GPUIndexBufBuilder *elb_to = static_cast<GPUIndexBufBuilder *>(_userdata_to);
   GPUIndexBufBuilder *elb_from = static_cast<GPUIndexBufBuilder *>(_userdata_from);
-  GPU_indexbuf_join_copies(elb_to, elb_from);
+  GPU_indexbuf_join(elb_to, elb_from);
 }
 static void extract_tris_single_mat_finish(const MeshRenderData *mr,
@@ -250,7 +250,7 @@ constexpr MeshExtract create_extractor_tris_single_mat()
   extractor.init = extract_tris_single_mat_init;
   extractor.iter_looptri_bm = extract_tris_single_mat_iter_looptri_bm;
   extractor.iter_looptri_mesh = extract_tris_single_mat_iter_looptri_mesh;
-  extractor.task_reduce = extract_tris_single_mat_task_finish;
+  extractor.task_reduce = extract_tris_single_mat_task_reduce;
   extractor.finish = extract_tris_single_mat_finish;
   extractor.data_type = MR_DATA_NONE;
   extractor.data_size = sizeof(GPUIndexBufBuilder);


@@ -58,8 +58,7 @@ GPUIndexBuf *GPU_indexbuf_build_on_device(uint index_len);
  *
  * Function inspired by the reduction directives of multithread work APIs..
  */
-void GPU_indexbuf_join_copies(GPUIndexBufBuilder *builder,
-                              const GPUIndexBufBuilder *parent_builder);
+void GPU_indexbuf_join(GPUIndexBufBuilder *builder, const GPUIndexBufBuilder *parent_builder);
 void GPU_indexbuf_add_generic_vert(GPUIndexBufBuilder *, uint v);
 void GPU_indexbuf_add_primitive_restart(GPUIndexBufBuilder *);


@@ -79,8 +79,7 @@ GPUIndexBuf *GPU_indexbuf_build_on_device(uint index_len)
   return elem_;
 }
-void GPU_indexbuf_join_copies(GPUIndexBufBuilder *builder_to,
-                              const GPUIndexBufBuilder *builder_from)
+void GPU_indexbuf_join(GPUIndexBufBuilder *builder_to, const GPUIndexBufBuilder *builder_from)
 {
   BLI_assert(builder_to->data == builder_from->data);
   builder_to->index_len = max_uu(builder_to->index_len, builder_from->index_len);


@@ -21,7 +21,7 @@ TEST_F(GPUTest, gpu_index_buffer_subbuilders)
   GPUIndexBufBuilder subbuilders[num_subbuilders];
   for (int subbuilder_index = 0; subbuilder_index < num_subbuilders; subbuilder_index++) {
-    GPU_indexbuf_subbuilder_init(&builder, &subbuilders[subbuilder_index]);
+    memcpy(&subbuilders[subbuilder_index], &builder, sizeof(builder));
   }
   for (int subbuilder_index = 0; subbuilder_index < num_subbuilders; subbuilder_index++) {
@@ -35,7 +35,7 @@ TEST_F(GPUTest, gpu_index_buffer_subbuilders)
   for (int subbuilder_index = 0; subbuilder_index < num_subbuilders; subbuilder_index++) {
     EXPECT_EQ(builder.index_len, subbuilder_index * verts_per_subbuilders);
-    GPU_indexbuf_subbuilder_finish(&builder, &subbuilders[subbuilder_index]);
+    GPU_indexbuf_join(&builder, &subbuilders[subbuilder_index]);
     EXPECT_EQ(builder.index_len, (subbuilder_index + 1) * verts_per_subbuilders);
   }