BLI_task: tweak default chunk size for `BLI_task_parallel_range()`.

Previously it was set to 1 (i.e. no 'chunking'), to match the old
behavior. However, this is far from optimal, especially on modern CPUs
that can have tens of threads.

Now we take a heuristic approach (inspired by the one already used for
`BLI_task_parallel_listbase()`), which tries to guesstimate the best
chunk size based on several factors (number of threads/parallel tasks,
total number of items, ...).

I think this is a reasonable baseline; more optimization here is of
course possible.

Note that code that already explicitly sets a value here is not
affected at all by this change.
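
For illustration, a minimal caller-side sketch of both paths. The callback `do_item`, its exact TLS parameter type, and `tot_items` are assumptions for this example; the settings names come from the diff below:

static void do_item(void *__restrict userdata,
                    const int iter,
                    const ParallelRangeTLS *__restrict tls)
{
  /* Process item `iter`; hypothetical body. */
  (void)userdata;
  (void)tls;
}

void example(void *userdata, const int tot_items)
{
  ParallelRangeSettings settings;
  BLI_parallel_range_settings_defaults(&settings);

  /* New default: min_iter_per_thread == 0, so the heuristic picks the
   * chunk size. */
  BLI_task_parallel_range(0, tot_items, userdata, do_item, &settings);

  /* An explicit value still bypasses the heuristic, exactly as before. */
  settings.min_iter_per_thread = 64;
  BLI_task_parallel_range(0, tot_items, userdata, do_item, &settings);
}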
Commit b9c257019f by Bastien Montagne, 2019-07-30 14:36:59 +02:00 (parent ae7db53744).
2 changed files with 46 additions and 14 deletions.

@@ -216,11 +216,8 @@ BLI_INLINE void BLI_parallel_range_settings_defaults(ParallelRangeSettings *sett
   memset(settings, 0, sizeof(*settings));
   settings->use_threading = true;
   settings->scheduling_mode = TASK_SCHEDULING_STATIC;
-  /* NOTE: Current value mimics old behavior, but it's not ideal by any
-   * means. Would be cool to find a common value which will work good enough
-   * for both static and dynamic scheduling.
-   */
-  settings->min_iter_per_thread = 1;
+  /* Use default heuristic to define actual chunk size. */
+  settings->min_iter_per_thread = 0;
 }
 
 #ifdef __cplusplus

@@ -1054,6 +1054,49 @@ typedef struct ParallelRangeState {
   int chunk_size;
 } ParallelRangeState;
 
+BLI_INLINE void task_parallel_range_calc_chunk_size(const ParallelRangeSettings *settings,
+                                                    const int num_tasks,
+                                                    ParallelRangeState *state)
+{
+  const int tot_items = state->stop - state->start;
+  int chunk_size = 0;
+
+  if (settings->min_iter_per_thread > 0) {
+    /* Already set by user, no need to do anything here. */
+    chunk_size = settings->min_iter_per_thread;
+  }
+  else {
+    /* Basic heuristic to avoid threading on low amount of items. We could make that limit
+     * configurable in settings too... */
+    if (tot_items > 0 && tot_items < 256) {
+      chunk_size = tot_items;
+    }
+    /* NOTE: The idea here is to compensate for rather measurable threading
+     * overhead caused by fetching tasks. With too many CPU threads we are starting
+     * to spend too much time in those overheads. */
+    else if (num_tasks > 32) {
+      chunk_size = 128;
+    }
+    else if (num_tasks > 16) {
+      chunk_size = 64;
+    }
+    else {
+      chunk_size = 32;
+    }
+  }
+
+  BLI_assert(chunk_size > 0);
+
+  switch (settings->scheduling_mode) {
+    case TASK_SCHEDULING_STATIC:
+      state->chunk_size = max_ii(chunk_size, tot_items / num_tasks);
+      break;
+    case TASK_SCHEDULING_DYNAMIC:
+      state->chunk_size = chunk_size;
+      break;
+  }
+}
+
 BLI_INLINE bool parallel_range_next_iter_get(ParallelRangeState *__restrict state,
                                              int *__restrict iter,
                                              int *__restrict count)
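
To make the heuristic concrete, a few worked examples, with values derived directly from the branches above:

- tot_items = 100 (any num_tasks): 100 < 256, so chunk_size = 100; static scheduling takes max_ii(100, 100 / num_tasks) = 100, and the caller below then clamps num_tasks to 1, i.e. no threading.
- tot_items = 100000, num_tasks = 8: chunk_size = 32; static scheduling takes max_ii(32, 100000 / 8) = 12500.
- tot_items = 100000, num_tasks = 48: chunk_size = 128; static scheduling takes max_ii(128, 100000 / 48) = 2083.
- Under dynamic scheduling the base value (32/64/128) is kept as-is, so threads that finish early can keep fetching small chunks.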
@@ -1162,16 +1205,8 @@ void BLI_task_parallel_range(const int start,
   state.userdata = userdata;
   state.func = func;
   state.iter = start;
-  switch (settings->scheduling_mode) {
-    case TASK_SCHEDULING_STATIC:
-      state.chunk_size = max_ii(settings->min_iter_per_thread, (stop - start) / (num_tasks));
-      break;
-    case TASK_SCHEDULING_DYNAMIC:
-      /* TODO(sergey): Make it configurable from min_iter_per_thread. */
-      state.chunk_size = 32;
-      break;
-  }
+  task_parallel_range_calc_chunk_size(settings, num_tasks, &state);
 
   num_tasks = min_ii(num_tasks, max_ii(1, (stop - start) / state.chunk_size));
 
   if (num_tasks == 1) {
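
A quick sanity check of the clamping above: with the defaults, a 200-item range gets chunk_size = 200 from the heuristic, so num_tasks becomes min_ii(num_tasks, max_ii(1, 200 / 200)) = 1, and the range runs without spawning tasks. This is exactly how the low-item cutoff in the heuristic avoids threading overhead.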