Merge branch 'blender-v3.0-release' to bring in D13042:

Fix performance decrease with Scrambling Distance on
2021-11-25 09:41:03 +01:00 · 2021-11-25 09:41:03 +01:00 · c49d2cbe92
parent 827c5b399e b41c72b710
commit c49d2cbe92
7 changed files with 41 additions and 24 deletions
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@ -258,7 +258,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
   * become busy after adding new tiles). This is especially important for the shadow catcher which
   * schedules work in halves of available number of paths. */
  work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
-
+  work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
+                                          0);
  work_tile_scheduler_.reset(effective_buffer_params_,
                             start_sample,
                             samples_num,
--- a/intern/cycles/integrator/tile.cpp
+++ b/intern/cycles/integrator/tile.cpp
@ -46,7 +46,8 @@ ccl_device_inline uint round_up_to_power_of_two(uint x)
  return next_power_of_two(x);
 }

-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                  const int num_samples,
                                  const int max_num_path_states,
                                  const float scrambling_distance)
@ -73,7 +74,7 @@ TileSize tile_calculate_best_size(const int2 &image_size,

  TileSize tile_size;
  const int num_path_states_per_sample = max_num_path_states / num_samples;
-  if (scrambling_distance < 0.9f) {
+  if (scrambling_distance < 0.9f && accel_rt) {
    /* Prefer large tiles for scrambling distance, bounded by max num path states. */
    tile_size.width = min(image_size.x, max_num_path_states);
    tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1));
--- a/intern/cycles/integrator/tile.h
+++ b/intern/cycles/integrator/tile.h
@ -49,7 +49,8 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
 * of active path states.
 * Will attempt to provide best guess to keep path tracing threads of a device as localized as
 * possible, and have as many threads active for every tile as possible. */
-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                  const int num_samples,
                                  const int max_num_path_states,
                                  const float scrambling_distance);
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@ -28,6 +28,11 @@ WorkTileScheduler::WorkTileScheduler()
 {
 }

+void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt)
+{
+  accelerated_rt_ = accelerated_rt;
+}
+
 void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 {
  max_num_path_states_ = max_num_path_states;
@ -61,7 +66,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
 void WorkTileScheduler::reset_scheduler_state()
 {
  tile_size_ = tile_calculate_best_size(
-      image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
+      accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);

  VLOG(3) << "Will schedule tiles of size " << tile_size_;

--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@ -31,6 +31,9 @@ class WorkTileScheduler {
 public:
  WorkTileScheduler();

+  /* To indicate if there is accelerated RT support. */
+  void set_accelerated_rt(bool state);
+
  /* MAximum path states which are allowed to be used by a single scheduled work tile.
   *
   * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
@ -55,6 +58,9 @@ class WorkTileScheduler {
 protected:
  void reset_scheduler_state();

+  /* Used to indicate if there is accelerated ray tracing. */
+  bool accelerated_rt_ = false;
+
  /* Maximum allowed path states to be used.
   *
   * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
--- a/intern/cycles/kernel/device/gpu/work_stealing.h
+++ b/intern/cycles/kernel/device/gpu/work_stealing.h
@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                      ccl_private uint *y,
                                      ccl_private uint *sample)
 {
-#if 0
-  /* Keep threads for the same sample together. */
-  uint tile_pixels = tile->w * tile->h;
-  uint sample_offset = global_work_index / tile_pixels;
-  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#else
-  /* Keeping threads for the same pixel together.
-   * Appears to improve performance by a few % on CUDA and OptiX. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#endif
+  uint sample_offset, pixel_offset;
+
+  if (kernel_data.integrator.scrambling_distance < 0.9f) {
+    /* Keep threads for the same sample together. */
+    uint tile_pixels = tile->w * tile->h;
+    sample_offset = global_work_index / tile_pixels;
+    pixel_offset = global_work_index - sample_offset * tile_pixels;
+  }
+  else {
+    /* Keeping threads for the same pixel together.
+     * Appears to improve performance by a few % on CUDA and OptiX. */
+    sample_offset = global_work_index % tile->num_samples;
+    pixel_offset = global_work_index / tile->num_samples;
+  }

  uint y_offset = pixel_offset / tile->w;
  uint x_offset = pixel_offset - y_offset * tile->w;
--- a/intern/cycles/test/integrator_tile_test.cpp
+++ b/intern/cycles/test/integrator_tile_test.cpp
@ -24,26 +24,26 @@ CCL_NAMESPACE_BEGIN
 TEST(tile_calculate_best_size, Basic)
 {
  /* Make sure CPU-like case is handled properly. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));

  /* Enough path states to fit an entire image with all samples. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
            TileSize(1920, 1080, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
            TileSize(1920, 1080, 100));
 }

 TEST(tile_calculate_best_size, Extreme)
 {
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 262144, 131072, 1.0f),
            TileSize(1, 1, 512));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 1048576, 131072, 1.0f),
            TileSize(1, 1, 1024));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 10485760, 131072, 1.0f),
            TileSize(1, 1, 4096));

-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
            TileSize(1, 1, 1024));
 }