Refactor: use 'BLI_task_parallel_range' in Draw Cache
One drawback to trying to predict the number of threads that will be used in the `task_graph` is that we are only sure of the number when the threads are running. Using `BLI_task_parallel_range` allows the driver to choose the best thread distribution through `parallel_reduce`. The benefit is most evident on hardware with fewer cores. This is the result on a 4-core laptop:

| | before | after |
|---|---|---|
| large_mesh_editing | Average: 5.203638 FPS | Average: 5.398925 FPS |
| | rdata 15ms iter 43ms (frame 193ms) | rdata 14ms iter 36ms (frame 187ms) |

Differential Revision: https://developer.blender.org/D11558
This commit is contained in:
parent
2330cec2c6
commit
0eb9351296
|
@ -50,7 +50,7 @@
|
|||
# include "PIL_time_utildefines.h"
|
||||
#endif
|
||||
|
||||
#define CHUNK_SIZE 1024
|
||||
#define MIM_RANGE_LEN 1024
|
||||
|
||||
namespace blender::draw {
|
||||
|
||||
|
@ -439,18 +439,18 @@ static void extract_task_range_run(void *__restrict taskdata)
|
|||
const eMRIterType iter_type = data->iter_type;
|
||||
const bool is_mesh = data->mr->extract_type != MR_EXTRACT_BMESH;
|
||||
|
||||
size_t userdata_chunk_size = data->extractors->data_size_total();
|
||||
char *userdata_chunk = new char[userdata_chunk_size];
|
||||
|
||||
TaskParallelSettings settings;
|
||||
BLI_parallel_range_settings_defaults(&settings);
|
||||
settings.func_reduce = extract_task_reduce;
|
||||
settings.min_iter_per_thread = CHUNK_SIZE;
|
||||
settings.use_threading = data->use_threading;
|
||||
settings.userdata_chunk = userdata_chunk;
|
||||
settings.userdata_chunk_size = userdata_chunk_size;
|
||||
settings.func_reduce = extract_task_reduce;
|
||||
settings.min_iter_per_thread = MIM_RANGE_LEN;
|
||||
|
||||
size_t chunk_size = data->extractors->data_size_total();
|
||||
char *chunk = new char[chunk_size];
|
||||
extract_init(data->mr, data->cache, *data->extractors, data->mbc, (void *)chunk);
|
||||
|
||||
settings.userdata_chunk = chunk;
|
||||
settings.userdata_chunk_size = chunk_size;
|
||||
extract_init(data->mr, data->cache, *data->extractors, data->mbc, (void *)userdata_chunk);
|
||||
|
||||
if (iter_type & MR_ITER_LOOPTRI) {
|
||||
extract_task_range_run_iter(data->mr, data->extractors, MR_ITER_LOOPTRI, is_mesh, &settings);
|
||||
|
@ -465,14 +465,14 @@ static void extract_task_range_run(void *__restrict taskdata)
|
|||
extract_task_range_run_iter(data->mr, data->extractors, MR_ITER_LVERT, is_mesh, &settings);
|
||||
}
|
||||
|
||||
extract_finish(data->mr, data->cache, *data->extractors, (void *)chunk);
|
||||
delete[] chunk;
|
||||
extract_finish(data->mr, data->cache, *data->extractors, (void *)userdata_chunk);
|
||||
delete[] userdata_chunk;
|
||||
}
|
||||
|
||||
/** \} */
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
/** \name Extract Single Thread
|
||||
/** \name Extract In Parallel Ranges
|
||||
* \{ */
|
||||
|
||||
static struct TaskNode *extract_task_node_create(struct TaskGraph *task_graph,
|
||||
|
@ -705,7 +705,7 @@ static void mesh_buffer_cache_create_requested(struct TaskGraph *task_graph,
|
|||
task_graph, mr, iter_type, data_flag);
|
||||
|
||||
/* Simple heuristic. */
|
||||
const bool use_thread = (mr->loop_len + mr->loop_loose_len) > CHUNK_SIZE;
|
||||
const bool use_thread = (mr->loop_len + mr->loop_loose_len) > MIM_RANGE_LEN;
|
||||
|
||||
if (use_thread) {
|
||||
/* First run the requested extractors that do not support asynchronous ranges. */
|
||||
|
|
|
@ -194,8 +194,7 @@ typedef void(ExtractFinishFn)(const MeshRenderData *mr,
|
|||
struct MeshBatchCache *cache,
|
||||
void *buffer,
|
||||
void *data);
|
||||
typedef void(ExtractTaskInitFn)(void *userdata, void *r_task_userdata);
|
||||
typedef void(ExtractTaskFinishFn)(void *userdata, void *task_userdata);
|
||||
typedef void(ExtractTaskReduceFn)(void *userdata, void *task_userdata);
|
||||
|
||||
typedef struct MeshExtract {
|
||||
/** Executed on main thread and return user data for iteration functions. */
|
||||
|
@ -210,7 +209,7 @@ typedef struct MeshExtract {
|
|||
ExtractLVertBMeshFn *iter_lvert_bm;
|
||||
ExtractLVertMeshFn *iter_lvert_mesh;
|
||||
/** Executed on one worker thread after all elements iterations. */
|
||||
ExtractTaskFinishFn *task_reduce;
|
||||
ExtractTaskReduceFn *task_reduce;
|
||||
ExtractFinishFn *finish;
|
||||
/** Used to request common data. */
|
||||
eMRDataType data_type;
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
|
||||
#define MAX_TIMER_NAME 32
|
||||
#define MAX_NESTED_TIMER 8
|
||||
#define CHUNK_SIZE 8
|
||||
#define MIM_RANGE_LEN 8
|
||||
#define GPU_TIMER_FALLOFF 0.1
|
||||
|
||||
typedef struct DRWTimer {
|
||||
|
@ -82,7 +82,7 @@ void DRW_stats_begin(void)
|
|||
|
||||
if (DTP.is_recording && DTP.timers == NULL) {
|
||||
DTP.chunk_count = 1;
|
||||
DTP.timer_count = DTP.chunk_count * CHUNK_SIZE;
|
||||
DTP.timer_count = DTP.chunk_count * MIM_RANGE_LEN;
|
||||
DTP.timers = MEM_callocN(sizeof(DRWTimer) * DTP.timer_count, "DRWTimer stack");
|
||||
}
|
||||
else if (!DTP.is_recording && DTP.timers != NULL) {
|
||||
|
@ -99,7 +99,7 @@ static DRWTimer *drw_stats_timer_get(void)
|
|||
if (UNLIKELY(DTP.timer_increment >= DTP.timer_count)) {
|
||||
/* Resize the stack. */
|
||||
DTP.chunk_count++;
|
||||
DTP.timer_count = DTP.chunk_count * CHUNK_SIZE;
|
||||
DTP.timer_count = DTP.chunk_count * MIM_RANGE_LEN;
|
||||
DTP.timers = MEM_recallocN(DTP.timers, sizeof(DRWTimer) * DTP.timer_count);
|
||||
}
|
||||
|
||||
|
|
|
@ -138,11 +138,11 @@ static void extract_lines_iter_ledge_mesh(const MeshRenderData *mr,
|
|||
GPU_indexbuf_set_line_restart(elb, e_index);
|
||||
}
|
||||
|
||||
static void extract_lines_task_finish(void *_userdata_to, void *_userdata_from)
|
||||
static void extract_lines_task_reduce(void *_userdata_to, void *_userdata_from)
|
||||
{
|
||||
GPUIndexBufBuilder *elb_to = static_cast<GPUIndexBufBuilder *>(_userdata_to);
|
||||
GPUIndexBufBuilder *elb_from = static_cast<GPUIndexBufBuilder *>(_userdata_from);
|
||||
GPU_indexbuf_join_copies(elb_to, elb_from);
|
||||
GPU_indexbuf_join(elb_to, elb_from);
|
||||
}
|
||||
|
||||
static void extract_lines_finish(const MeshRenderData *UNUSED(mr),
|
||||
|
@ -163,7 +163,7 @@ constexpr MeshExtract create_extractor_lines()
|
|||
extractor.iter_poly_mesh = extract_lines_iter_poly_mesh;
|
||||
extractor.iter_ledge_bm = extract_lines_iter_ledge_bm;
|
||||
extractor.iter_ledge_mesh = extract_lines_iter_ledge_mesh;
|
||||
extractor.task_reduce = extract_lines_task_finish;
|
||||
extractor.task_reduce = extract_lines_task_reduce;
|
||||
extractor.finish = extract_lines_finish;
|
||||
extractor.data_type = MR_DATA_NONE;
|
||||
extractor.data_size = sizeof(GPUIndexBufBuilder);
|
||||
|
@ -208,7 +208,7 @@ constexpr MeshExtract create_extractor_lines_with_lines_loose()
|
|||
extractor.iter_poly_mesh = extract_lines_iter_poly_mesh;
|
||||
extractor.iter_ledge_bm = extract_lines_iter_ledge_bm;
|
||||
extractor.iter_ledge_mesh = extract_lines_iter_ledge_mesh;
|
||||
extractor.task_reduce = extract_lines_task_finish;
|
||||
extractor.task_reduce = extract_lines_task_reduce;
|
||||
extractor.finish = extract_lines_with_lines_loose_finish;
|
||||
extractor.data_type = MR_DATA_NONE;
|
||||
extractor.data_size = sizeof(GPUIndexBufBuilder);
|
||||
|
|
|
@ -137,11 +137,11 @@ static void extract_points_iter_lvert_mesh(const MeshRenderData *mr,
|
|||
vert_set_mesh(elb, mr, mr->lverts[lvert_index], offset + lvert_index);
|
||||
}
|
||||
|
||||
static void extract_points_task_finish(void *_userdata_to, void *_userdata_from)
|
||||
static void extract_points_task_reduce(void *_userdata_to, void *_userdata_from)
|
||||
{
|
||||
GPUIndexBufBuilder *elb_to = static_cast<GPUIndexBufBuilder *>(_userdata_to);
|
||||
GPUIndexBufBuilder *elb_from = static_cast<GPUIndexBufBuilder *>(_userdata_from);
|
||||
GPU_indexbuf_join_copies(elb_to, elb_from);
|
||||
GPU_indexbuf_join(elb_to, elb_from);
|
||||
}
|
||||
|
||||
static void extract_points_finish(const MeshRenderData *UNUSED(mr),
|
||||
|
@ -164,7 +164,7 @@ constexpr MeshExtract create_extractor_points()
|
|||
extractor.iter_ledge_mesh = extract_points_iter_ledge_mesh;
|
||||
extractor.iter_lvert_bm = extract_points_iter_lvert_bm;
|
||||
extractor.iter_lvert_mesh = extract_points_iter_lvert_mesh;
|
||||
extractor.task_reduce = extract_points_task_finish;
|
||||
extractor.task_reduce = extract_points_task_reduce;
|
||||
extractor.finish = extract_points_finish;
|
||||
extractor.use_threading = true;
|
||||
extractor.data_type = MR_DATA_NONE;
|
||||
|
|
|
@ -211,11 +211,11 @@ static void extract_tris_single_mat_iter_looptri_mesh(const MeshRenderData *mr,
|
|||
}
|
||||
}
|
||||
|
||||
static void extract_tris_single_mat_task_finish(void *_userdata_to, void *_userdata_from)
|
||||
static void extract_tris_single_mat_task_reduce(void *_userdata_to, void *_userdata_from)
|
||||
{
|
||||
GPUIndexBufBuilder *elb_to = static_cast<GPUIndexBufBuilder *>(_userdata_to);
|
||||
GPUIndexBufBuilder *elb_from = static_cast<GPUIndexBufBuilder *>(_userdata_from);
|
||||
GPU_indexbuf_join_copies(elb_to, elb_from);
|
||||
GPU_indexbuf_join(elb_to, elb_from);
|
||||
}
|
||||
|
||||
static void extract_tris_single_mat_finish(const MeshRenderData *mr,
|
||||
|
@ -250,7 +250,7 @@ constexpr MeshExtract create_extractor_tris_single_mat()
|
|||
extractor.init = extract_tris_single_mat_init;
|
||||
extractor.iter_looptri_bm = extract_tris_single_mat_iter_looptri_bm;
|
||||
extractor.iter_looptri_mesh = extract_tris_single_mat_iter_looptri_mesh;
|
||||
extractor.task_reduce = extract_tris_single_mat_task_finish;
|
||||
extractor.task_reduce = extract_tris_single_mat_task_reduce;
|
||||
extractor.finish = extract_tris_single_mat_finish;
|
||||
extractor.data_type = MR_DATA_NONE;
|
||||
extractor.data_size = sizeof(GPUIndexBufBuilder);
|
||||
|
|
|
@ -58,8 +58,7 @@ GPUIndexBuf *GPU_indexbuf_build_on_device(uint index_len);
|
|||
*
|
||||
* Function inspired by the reduction directives of multithread work APIs..
|
||||
*/
|
||||
void GPU_indexbuf_join_copies(GPUIndexBufBuilder *builder,
|
||||
const GPUIndexBufBuilder *parent_builder);
|
||||
void GPU_indexbuf_join(GPUIndexBufBuilder *builder, const GPUIndexBufBuilder *parent_builder);
|
||||
|
||||
void GPU_indexbuf_add_generic_vert(GPUIndexBufBuilder *, uint v);
|
||||
void GPU_indexbuf_add_primitive_restart(GPUIndexBufBuilder *);
|
||||
|
|
|
@ -79,8 +79,7 @@ GPUIndexBuf *GPU_indexbuf_build_on_device(uint index_len)
|
|||
return elem_;
|
||||
}
|
||||
|
||||
void GPU_indexbuf_join_copies(GPUIndexBufBuilder *builder_to,
|
||||
const GPUIndexBufBuilder *builder_from)
|
||||
void GPU_indexbuf_join(GPUIndexBufBuilder *builder_to, const GPUIndexBufBuilder *builder_from)
|
||||
{
|
||||
BLI_assert(builder_to->data == builder_from->data);
|
||||
builder_to->index_len = max_uu(builder_to->index_len, builder_from->index_len);
|
||||
|
|
|
@ -21,7 +21,7 @@ TEST_F(GPUTest, gpu_index_buffer_subbuilders)
|
|||
|
||||
GPUIndexBufBuilder subbuilders[num_subbuilders];
|
||||
for (int subbuilder_index = 0; subbuilder_index < num_subbuilders; subbuilder_index++) {
|
||||
GPU_indexbuf_subbuilder_init(&builder, &subbuilders[subbuilder_index]);
|
||||
memcpy(&subbuilders[subbuilder_index], &builder, sizeof(builder));
|
||||
}
|
||||
|
||||
for (int subbuilder_index = 0; subbuilder_index < num_subbuilders; subbuilder_index++) {
|
||||
|
@ -35,7 +35,7 @@ TEST_F(GPUTest, gpu_index_buffer_subbuilders)
|
|||
|
||||
for (int subbuilder_index = 0; subbuilder_index < num_subbuilders; subbuilder_index++) {
|
||||
EXPECT_EQ(builder.index_len, subbuilder_index * verts_per_subbuilders);
|
||||
GPU_indexbuf_subbuilder_finish(&builder, &subbuilders[subbuilder_index]);
|
||||
GPU_indexbuf_join(&builder, &subbuilders[subbuilder_index]);
|
||||
EXPECT_EQ(builder.index_len, (subbuilder_index + 1) * verts_per_subbuilders);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue