Merge branch 'blender-v3.0-release' to bring in D13042:

Fix performance decrease with Scrambling Distance on
This commit is contained in:
William Leeson 2021-11-25 09:41:03 +01:00
commit c49d2cbe92
7 changed files with 41 additions and 24 deletions

View File

@ -258,7 +258,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
* become busy after adding new tiles). This is especially important for the shadow catcher which
* schedules work in halves of available number of paths. */
work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
0);
work_tile_scheduler_.reset(effective_buffer_params_,
start_sample,
samples_num,

View File

@ -46,7 +46,8 @@ ccl_device_inline uint round_up_to_power_of_two(uint x)
return next_power_of_two(x);
}
TileSize tile_calculate_best_size(const int2 &image_size,
TileSize tile_calculate_best_size(const bool accel_rt,
const int2 &image_size,
const int num_samples,
const int max_num_path_states,
const float scrambling_distance)
@ -73,7 +74,7 @@ TileSize tile_calculate_best_size(const int2 &image_size,
TileSize tile_size;
const int num_path_states_per_sample = max_num_path_states / num_samples;
if (scrambling_distance < 0.9f) {
if (scrambling_distance < 0.9f && accel_rt) {
/* Prefer large tiles for scrambling distance, bounded by max num path states. */
tile_size.width = min(image_size.x, max_num_path_states);
tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1));

View File

@ -49,7 +49,8 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
* of active path states.
* Will attempt to provide best guess to keep path tracing threads of a device as localized as
* possible, and have as many threads active for every tile as possible. */
TileSize tile_calculate_best_size(const int2 &image_size,
TileSize tile_calculate_best_size(const bool accel_rt,
const int2 &image_size,
const int num_samples,
const int max_num_path_states,
const float scrambling_distance);

View File

@ -28,6 +28,11 @@ WorkTileScheduler::WorkTileScheduler()
{
}
void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt)
{
accelerated_rt_ = accelerated_rt;
}
void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
{
max_num_path_states_ = max_num_path_states;
@ -61,7 +66,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
void WorkTileScheduler::reset_scheduler_state()
{
tile_size_ = tile_calculate_best_size(
image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
VLOG(3) << "Will schedule tiles of size " << tile_size_;

View File

@ -31,6 +31,9 @@ class WorkTileScheduler {
public:
WorkTileScheduler();
/* To indicate if there is accelerated RT support. */
void set_accelerated_rt(bool state);
/* MAximum path states which are allowed to be used by a single scheduled work tile.
*
* Affects the scheduled work size: the work size will be as big as possible, but will not exceed
@ -55,6 +58,9 @@ class WorkTileScheduler {
protected:
void reset_scheduler_state();
/* Used to indicate if there is accelerated ray tracing. */
bool accelerated_rt_ = false;
/* Maximum allowed path states to be used.
*
* TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the

View File

@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
ccl_private uint *y,
ccl_private uint *sample)
{
#if 0
/* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
#else
/* Keeping threads for the same pixel together.
* Appears to improve performance by a few % on CUDA and OptiX. */
uint sample_offset = global_work_index % tile->num_samples;
uint pixel_offset = global_work_index / tile->num_samples;
#endif
uint sample_offset, pixel_offset;
if (kernel_data.integrator.scrambling_distance < 0.9f) {
/* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
sample_offset = global_work_index / tile_pixels;
pixel_offset = global_work_index - sample_offset * tile_pixels;
}
else {
/* Keeping threads for the same pixel together.
* Appears to improve performance by a few % on CUDA and OptiX. */
sample_offset = global_work_index % tile->num_samples;
pixel_offset = global_work_index / tile->num_samples;
}
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;

View File

@ -24,26 +24,26 @@ CCL_NAMESPACE_BEGIN
TEST(tile_calculate_best_size, Basic)
{
/* Make sure CPU-like case is handled properly. */
EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
/* Enough path states to fit an entire image with all samples. */
EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
TileSize(1920, 1080, 1));
EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
TileSize(1920, 1080, 100));
}
TEST(tile_calculate_best_size, Extreme)
{
EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072, 1.0f),
EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 262144, 131072, 1.0f),
TileSize(1, 1, 512));
EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072, 1.0f),
EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 1048576, 131072, 1.0f),
TileSize(1, 1, 1024));
EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072, 1.0f),
EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 10485760, 131072, 1.0f),
TileSize(1, 1, 4096));
EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
TileSize(1, 1, 1024));
}