Fix performance decrease with Scrambling Distance on

With the current code in master, devices without hardware accelerated ray tracing see a measurable performance decrease when scrambling distance is enabled, compared with when it is disabled. From testing, this performance decrease comes from the large tile sizes scheduled in `tile.cpp`. This patch attempts to address the performance decrease by using different algorithms to calculate the tile size for devices with hardware accelerated ray traversal and devices without: large tile sizes for hardware accelerated devices and small tile sizes for others. Most of this code is based on proposals from @brecht and @leesonw. Reviewed By: brecht, leesonw. Differential Revision: https://developer.blender.org/D13042
2021-11-25 09:20:28 +01:00 · 2021-11-25 09:20:28 +01:00 · b41c72b710
parent 8f2db94627
commit b41c72b710
7 changed files with 41 additions and 24 deletions
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@ -257,7 +257,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
   * become busy after adding new tiles). This is especially important for the shadow catcher which
   * schedules work in halves of available number of paths. */
  work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
-
+  work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
+                                          0);
  work_tile_scheduler_.reset(effective_buffer_params_,
                             start_sample,
                             samples_num,
--- a/intern/cycles/integrator/tile.cpp
+++ b/intern/cycles/integrator/tile.cpp
@ -46,7 +46,8 @@ ccl_device_inline uint round_up_to_power_of_two(uint x)
  return next_power_of_two(x);
 }

-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                  const int num_samples,
                                  const int max_num_path_states,
                                  const float scrambling_distance)
@ -73,7 +74,7 @@ TileSize tile_calculate_best_size(const int2 &image_size,

  TileSize tile_size;
  const int num_path_states_per_sample = max_num_path_states / num_samples;
-  if (scrambling_distance < 0.9f) {
+  if (scrambling_distance < 0.9f && accel_rt) {
    /* Prefer large tiles for scrambling distance, bounded by max num path states. */
    tile_size.width = min(image_size.x, max_num_path_states);
    tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1));
--- a/intern/cycles/integrator/tile.h
+++ b/intern/cycles/integrator/tile.h
@ -49,7 +49,8 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
 * of active path states.
 * Will attempt to provide best guess to keep path tracing threads of a device as localized as
 * possible, and have as many threads active for every tile as possible. */
-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                  const int num_samples,
                                  const int max_num_path_states,
                                  const float scrambling_distance);
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@ -28,6 +28,11 @@ WorkTileScheduler::WorkTileScheduler()
 {
 }

+void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt)
+{
+  accelerated_rt_ = accelerated_rt;
+}
+
 void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 {
  max_num_path_states_ = max_num_path_states;
@ -59,7 +64,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
 void WorkTileScheduler::reset_scheduler_state()
 {
  tile_size_ = tile_calculate_best_size(
-      image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
+      accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);

  VLOG(3) << "Will schedule tiles of size " << tile_size_;

--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@ -31,6 +31,9 @@ class WorkTileScheduler {
 public:
  WorkTileScheduler();

+  /* To indicate if there is accelerated RT support. */
+  void set_accelerated_rt(bool state);
+
  /* MAximum path states which are allowed to be used by a single scheduled work tile.
   *
   * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
@ -54,6 +57,9 @@ class WorkTileScheduler {
 protected:
  void reset_scheduler_state();

+  /* Used to indicate if there is accelerated ray tracing. */
+  bool accelerated_rt_ = false;
+
  /* Maximum allowed path states to be used.
   *
   * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
--- a/intern/cycles/kernel/device/gpu/work_stealing.h
+++ b/intern/cycles/kernel/device/gpu/work_stealing.h
@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                      ccl_private uint *y,
                                      ccl_private uint *sample)
 {
-#if 0
-  /* Keep threads for the same sample together. */
-  uint tile_pixels = tile->w * tile->h;
-  uint sample_offset = global_work_index / tile_pixels;
-  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#else
-  /* Keeping threads for the same pixel together.
-   * Appears to improve performance by a few % on CUDA and OptiX. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#endif
+  uint sample_offset, pixel_offset;
+
+  if (kernel_data.integrator.scrambling_distance < 0.9f) {
+    /* Keep threads for the same sample together. */
+    uint tile_pixels = tile->w * tile->h;
+    sample_offset = global_work_index / tile_pixels;
+    pixel_offset = global_work_index - sample_offset * tile_pixels;
+  }
+  else {
+    /* Keeping threads for the same pixel together.
+     * Appears to improve performance by a few % on CUDA and OptiX. */
+    sample_offset = global_work_index % tile->num_samples;
+    pixel_offset = global_work_index / tile->num_samples;
+  }

  uint y_offset = pixel_offset / tile->w;
  uint x_offset = pixel_offset - y_offset * tile->w;
--- a/intern/cycles/test/integrator_tile_test.cpp
+++ b/intern/cycles/test/integrator_tile_test.cpp
@ -24,26 +24,26 @@ CCL_NAMESPACE_BEGIN
 TEST(tile_calculate_best_size, Basic)
 {
  /* Make sure CPU-like case is handled properly. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));

  /* Enough path states to fit an entire image with all samples. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
            TileSize(1920, 1080, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
            TileSize(1920, 1080, 100));
 }

 TEST(tile_calculate_best_size, Extreme)
 {
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 262144, 131072, 1.0f),
            TileSize(1, 1, 512));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 1048576, 131072, 1.0f),
            TileSize(1, 1, 1024));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 10485760, 131072, 1.0f),
            TileSize(1, 1, 4096));

-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
            TileSize(1, 1, 1024));
 }