Cycles: Improve denoiser update performance when rendering with multiple GPUs

This patch causes the render buffers to be copied to the denoiser
device only once before denoising and output/display is then fed
from that single buffer on the denoiser device. That way usually all
but one copy (from all the render devices to the denoiser device)
can be eliminated, provided that the denoiser device is also the
display device (in which case interop is used to update the display).
As such this patch also adds some logic that tries to ensure the
chosen denoiser device is the same as the display device.

Differential Revision: https://developer.blender.org/D15657
This commit is contained in:
Patrick Mours 2022-08-12 15:49:30 +02:00
parent 27105af938
commit 79787bf8e1
Notes: blender-bot 2023-04-07 15:58:45 +02:00
Referenced by issue #95836, Viewport performance slows down when using OptiX Denoiser on Windows machine with dual GPUs.
Referenced by issue #106667, Cycles: Multi-device denoise runs denoising data passes
8 changed files with 76 additions and 37 deletions

View File

@ -1202,11 +1202,11 @@ bool CUDADevice::should_use_graphics_interop()
}
vector<CUdevice> gl_devices(num_all_devices);
uint num_gl_devices;
uint num_gl_devices = 0;
cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
for (CUdevice gl_device : gl_devices) {
if (gl_device == cuDevice) {
for (uint i = 0; i < num_gl_devices; ++i) {
if (gl_devices[i] == cuDevice) {
return true;
}
}

View File

@ -39,6 +39,9 @@ CCL_NAMESPACE_BEGIN
// The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
namespace {
# if OPTIX_ABI_VERSION >= 60
using ::optixUtilDenoiserInvokeTiled;
# else
static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
const OptixImage2D &output,
unsigned int overlapWindowSizeInPixels,
@ -215,6 +218,7 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
}
return OPTIX_SUCCESS;
}
# endif
# if OPTIX_ABI_VERSION >= 55
static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)

View File

@ -101,10 +101,17 @@ static Device *find_best_device(Device *device, DenoiserType type)
if ((sub_device->info.denoisers & type) == 0) {
return;
}
if (!best_device) {
best_device = sub_device;
}
else {
/* Prefer a device that can use graphics interop for faster display update. */
if (sub_device->should_use_graphics_interop() &&
!best_device->should_use_graphics_interop()) {
best_device = sub_device;
}
/* TODO(sergey): Choose fastest device from available ones. Taking into account performance
* of the device and data transfer cost. */
}

View File

@ -26,6 +26,7 @@ PathTrace::PathTrace(Device *device,
RenderScheduler &render_scheduler,
TileManager &tile_manager)
: device_(device),
film_(film),
device_scene_(device_scene),
render_scheduler_(render_scheduler),
tile_manager_(tile_manager)
@ -60,7 +61,17 @@ PathTrace::~PathTrace()
void PathTrace::load_kernels()
{
  /* Nothing to load when no denoiser is configured. */
  if (!denoiser_) {
    return;
  }

  /* With more than one path trace work, keep graphics interop active for the duration of the
   * denoiser kernel load. The denoiser device is created during that call, and having interop
   * active lets it pick a device which supports interop for faster display updates. */
  if (display_ && path_trace_works_.size() > 1) {
    display_->graphics_interop_activate();
  }

  denoiser_->load_kernels(progress_);

  /* Mirror the activation above once the denoiser kernels are loaded. */
  if (display_ && path_trace_works_.size() > 1) {
    display_->graphics_interop_deactivate();
  }
}
@ -506,28 +517,30 @@ void PathTrace::denoise(const RenderWork &render_work)
const double start_time = time_dt();
RenderBuffers *buffer_to_denoise = nullptr;
unique_ptr<RenderBuffers> multi_device_buffers;
bool allow_inplace_modification = false;
if (path_trace_works_.size() == 1) {
buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
Device *denoiser_device = denoiser_->get_denoiser_device();
if (path_trace_works_.size() > 1 && denoiser_device && !big_tile_denoise_work_) {
big_tile_denoise_work_ = PathTraceWork::create(denoiser_device, film_, device_scene_, nullptr);
}
else {
Device *denoiser_device = denoiser_->get_denoiser_device();
if (!denoiser_device) {
return;
}
multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
multi_device_buffers->reset(render_state_.effective_big_tile_params);
if (big_tile_denoise_work_) {
big_tile_denoise_work_->set_effective_buffer_params(render_state_.effective_big_tile_params,
render_state_.effective_big_tile_params,
render_state_.effective_big_tile_params);
buffer_to_denoise = multi_device_buffers.get();
buffer_to_denoise = big_tile_denoise_work_->get_render_buffers();
buffer_to_denoise->reset(render_state_.effective_big_tile_params);
copy_to_render_buffers(multi_device_buffers.get());
copy_to_render_buffers(buffer_to_denoise);
allow_inplace_modification = true;
}
else {
DCHECK_EQ(path_trace_works_.size(), 1);
buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
}
if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
buffer_to_denoise,
@ -536,14 +549,6 @@ void PathTrace::denoise(const RenderWork &render_work)
render_state_.has_denoised_result = true;
}
if (multi_device_buffers) {
multi_device_buffers->copy_from_device();
parallel_for_each(
path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
});
}
render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
}
@ -635,8 +640,13 @@ void PathTrace::update_display(const RenderWork &render_work)
/* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
* all works in parallel. */
const int num_samples = get_num_samples_in_buffer();
for (auto &&path_trace_work : path_trace_works_) {
path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
big_tile_denoise_work_->copy_to_display(display_.get(), pass_mode, num_samples);
}
else {
for (auto &&path_trace_work : path_trace_works_) {
path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
}
}
display_->update_end();
@ -721,11 +731,10 @@ void PathTrace::write_tile_buffer(const RenderWork &render_work)
VLOG_WORK << "Write tile result via buffer write callback.";
tile_buffer_write();
}
/* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
*/
if (has_multiple_tiles) {
VLOG_WORK << "Write tile result into .";
else {
VLOG_WORK << "Write tile result to disk.";
tile_buffer_write_to_disk();
}
}
@ -901,6 +910,10 @@ bool PathTrace::copy_render_tile_from_device()
return true;
}
if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
return big_tile_denoise_work_->copy_render_buffers_from_device();
}
bool success = true;
parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@ -1002,6 +1015,10 @@ bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
}
if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
return big_tile_denoise_work_->get_render_tile_pixels(pass_accessor, destination);
}
bool success = true;
parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@ -1082,6 +1099,10 @@ void PathTrace::destroy_gpu_resources()
for (auto &&path_trace_work : path_trace_works_) {
path_trace_work->destroy_gpu_resources(display_.get());
}
if (big_tile_denoise_work_) {
big_tile_denoise_work_->destroy_gpu_resources(display_.get());
}
}
}

View File

@ -236,6 +236,7 @@ class PathTrace {
/* CPU device for creating temporary render buffers on the CPU side. */
unique_ptr<Device> cpu_device_;
Film *film_;
DeviceScene *device_scene_;
RenderScheduler &render_scheduler_;
@ -261,6 +262,9 @@ class PathTrace {
/* Denoiser which takes care of denoising the big tile. */
unique_ptr<Denoiser> denoiser_;
/* Path trace work created on the denoiser device when rendering with more than one path trace
 * work. Its render buffers receive a copy of the big tile, are denoised, and then (once a
 * denoised result exists) serve display updates and tile pixel access instead of the per-device
 * works. */
unique_ptr<PathTraceWork> big_tile_denoise_work_;
/* State which is common for all the steps of the render work.
* Is brought up to date in the `render()` call and is accessed from all the steps involved into
* rendering the work. */

View File

@ -33,7 +33,7 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
if (!copied_from_device_) {
/* Copy from device on demand. */
path_trace_.copy_render_tile_from_device();
const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
copied_from_device_ = true;
}
const BufferParams &buffer_params = path_trace_.get_render_tile_params();

View File

@ -24,7 +24,7 @@ class PathTraceTile : public OutputDriver::Tile {
private:
PathTrace &path_trace_;
bool copied_from_device_;
mutable bool copied_from_device_;
};
CCL_NAMESPACE_END

View File

@ -370,6 +370,14 @@ RenderWork Session::run_update_for_next_iteration()
if (update_scene(width, height)) {
profiler.reset(scene->shaders.size(), scene->objects.size());
}
/* Unlock scene mutex before loading denoiser kernels, since that may attempt to activate
* graphics interop, which can deadlock when the scene mutex is still being held. */
scene_lock.unlock();
path_trace_->load_kernels();
path_trace_->alloc_work_memory();
progress.add_skip_time(update_timer, params.background);
}
@ -618,12 +626,7 @@ bool Session::update_scene(int width, int height)
Camera *cam = scene->camera;
cam->set_screen_size(width, height);
const bool scene_update_result = scene->update(progress);
path_trace_->load_kernels();
path_trace_->alloc_work_memory();
return scene_update_result;
return scene->update(progress);
}
static string status_append(const string &status, const string &suffix)