Cycles: Improve denoiser update performance when rendering with multiple GPUs
This patch causes the render buffers to be copied to the denoiser device only once before denoising; output and display are then fed from that single buffer on the denoiser device. That way, usually all but one copy (the one from all the render devices to the denoiser device) can be eliminated, provided that the denoiser device is also the display device (in which case interop is used to update the display). As such, this patch also adds some logic that tries to ensure the chosen denoiser device is the same as the display device. Differential Revision: https://developer.blender.org/D15657
This commit is contained in:
parent
27105af938
commit
79787bf8e1
Notes:
blender-bot
2023-04-07 15:58:45 +02:00
Referenced by issue #95836, Viewport performance slows down when using OptiX Denoiser on Windows machine with dual GPUs. Referenced by issue #106667, Cycles: Multi-device denoise runs denoising data passes
|
@ -1202,11 +1202,11 @@ bool CUDADevice::should_use_graphics_interop()
|
|||
}
|
||||
|
||||
vector<CUdevice> gl_devices(num_all_devices);
|
||||
uint num_gl_devices;
|
||||
uint num_gl_devices = 0;
|
||||
cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
|
||||
|
||||
for (CUdevice gl_device : gl_devices) {
|
||||
if (gl_device == cuDevice) {
|
||||
for (uint i = 0; i < num_gl_devices; ++i) {
|
||||
if (gl_devices[i] == cuDevice) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,6 +39,9 @@ CCL_NAMESPACE_BEGIN
|
|||
// The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
|
||||
namespace {
|
||||
|
||||
# if OPTIX_ABI_VERSION >= 60
|
||||
using ::optixUtilDenoiserInvokeTiled;
|
||||
# else
|
||||
static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
|
||||
const OptixImage2D &output,
|
||||
unsigned int overlapWindowSizeInPixels,
|
||||
|
@ -215,6 +218,7 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
|
|||
}
|
||||
return OPTIX_SUCCESS;
|
||||
}
|
||||
# endif
|
||||
|
||||
# if OPTIX_ABI_VERSION >= 55
|
||||
static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
|
||||
|
|
|
@ -101,10 +101,17 @@ static Device *find_best_device(Device *device, DenoiserType type)
|
|||
if ((sub_device->info.denoisers & type) == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!best_device) {
|
||||
best_device = sub_device;
|
||||
}
|
||||
else {
|
||||
/* Prefer a device that can use graphics interop for faster display update. */
|
||||
if (sub_device->should_use_graphics_interop() &&
|
||||
!best_device->should_use_graphics_interop()) {
|
||||
best_device = sub_device;
|
||||
}
|
||||
|
||||
/* TODO(sergey): Choose fastest device from available ones. Taking into account performance
|
||||
* of the device and data transfer cost. */
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ PathTrace::PathTrace(Device *device,
|
|||
RenderScheduler &render_scheduler,
|
||||
TileManager &tile_manager)
|
||||
: device_(device),
|
||||
film_(film),
|
||||
device_scene_(device_scene),
|
||||
render_scheduler_(render_scheduler),
|
||||
tile_manager_(tile_manager)
|
||||
|
@ -60,7 +61,17 @@ PathTrace::~PathTrace()
|
|||
/* Load the denoiser kernels, if a denoiser exists.
 *
 * When a display is present and there is more than one path trace work, graphics interop is
 * activated on the display before the denoiser's `load_kernels()` call and deactivated again
 * right after it. Does nothing when there is no denoiser. */
void PathTrace::load_kernels()
|
||||
{
|
||||
if (denoiser_) {
|
||||
/* Activate graphics interop while denoiser device is created, so that it can choose a device
|
||||
* that supports interop for faster display updates. */
|
||||
if (display_ && path_trace_works_.size() > 1) {
|
||||
display_->graphics_interop_activate();
|
||||
}
|
||||
|
||||
denoiser_->load_kernels(progress_);
|
||||
|
||||
/* Undo the activation done before loading the denoiser kernels (same condition as above). */
if (display_ && path_trace_works_.size() > 1) {
|
||||
display_->graphics_interop_deactivate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -506,28 +517,30 @@ void PathTrace::denoise(const RenderWork &render_work)
|
|||
const double start_time = time_dt();
|
||||
|
||||
RenderBuffers *buffer_to_denoise = nullptr;
|
||||
|
||||
unique_ptr<RenderBuffers> multi_device_buffers;
|
||||
bool allow_inplace_modification = false;
|
||||
|
||||
if (path_trace_works_.size() == 1) {
|
||||
buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
|
||||
Device *denoiser_device = denoiser_->get_denoiser_device();
|
||||
if (path_trace_works_.size() > 1 && denoiser_device && !big_tile_denoise_work_) {
|
||||
big_tile_denoise_work_ = PathTraceWork::create(denoiser_device, film_, device_scene_, nullptr);
|
||||
}
|
||||
else {
|
||||
Device *denoiser_device = denoiser_->get_denoiser_device();
|
||||
if (!denoiser_device) {
|
||||
return;
|
||||
}
|
||||
|
||||
multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
|
||||
multi_device_buffers->reset(render_state_.effective_big_tile_params);
|
||||
if (big_tile_denoise_work_) {
|
||||
big_tile_denoise_work_->set_effective_buffer_params(render_state_.effective_big_tile_params,
|
||||
render_state_.effective_big_tile_params,
|
||||
render_state_.effective_big_tile_params);
|
||||
|
||||
buffer_to_denoise = multi_device_buffers.get();
|
||||
buffer_to_denoise = big_tile_denoise_work_->get_render_buffers();
|
||||
buffer_to_denoise->reset(render_state_.effective_big_tile_params);
|
||||
|
||||
copy_to_render_buffers(multi_device_buffers.get());
|
||||
copy_to_render_buffers(buffer_to_denoise);
|
||||
|
||||
allow_inplace_modification = true;
|
||||
}
|
||||
else {
|
||||
DCHECK_EQ(path_trace_works_.size(), 1);
|
||||
|
||||
buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
|
||||
}
|
||||
|
||||
if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
|
||||
buffer_to_denoise,
|
||||
|
@ -536,14 +549,6 @@ void PathTrace::denoise(const RenderWork &render_work)
|
|||
render_state_.has_denoised_result = true;
|
||||
}
|
||||
|
||||
if (multi_device_buffers) {
|
||||
multi_device_buffers->copy_from_device();
|
||||
parallel_for_each(
|
||||
path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
|
||||
path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
|
||||
});
|
||||
}
|
||||
|
||||
render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
|
||||
}
|
||||
|
||||
|
@ -635,8 +640,13 @@ void PathTrace::update_display(const RenderWork &render_work)
|
|||
/* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
|
||||
* all works in parallel. */
|
||||
const int num_samples = get_num_samples_in_buffer();
|
||||
for (auto &&path_trace_work : path_trace_works_) {
|
||||
path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
|
||||
if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
|
||||
big_tile_denoise_work_->copy_to_display(display_.get(), pass_mode, num_samples);
|
||||
}
|
||||
else {
|
||||
for (auto &&path_trace_work : path_trace_works_) {
|
||||
path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
|
||||
}
|
||||
}
|
||||
|
||||
display_->update_end();
|
||||
|
@ -721,11 +731,10 @@ void PathTrace::write_tile_buffer(const RenderWork &render_work)
|
|||
VLOG_WORK << "Write tile result via buffer write callback.";
|
||||
tile_buffer_write();
|
||||
}
|
||||
|
||||
/* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
|
||||
*/
|
||||
if (has_multiple_tiles) {
|
||||
VLOG_WORK << "Write tile result into .";
|
||||
else {
|
||||
VLOG_WORK << "Write tile result to disk.";
|
||||
tile_buffer_write_to_disk();
|
||||
}
|
||||
}
|
||||
|
@ -901,6 +910,10 @@ bool PathTrace::copy_render_tile_from_device()
|
|||
return true;
|
||||
}
|
||||
|
||||
if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
|
||||
return big_tile_denoise_work_->copy_render_buffers_from_device();
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
|
||||
parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
|
||||
|
@ -1002,6 +1015,10 @@ bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
|
|||
return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
|
||||
}
|
||||
|
||||
if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
|
||||
return big_tile_denoise_work_->get_render_tile_pixels(pass_accessor, destination);
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
|
||||
parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
|
||||
|
@ -1082,6 +1099,10 @@ void PathTrace::destroy_gpu_resources()
|
|||
for (auto &&path_trace_work : path_trace_works_) {
|
||||
path_trace_work->destroy_gpu_resources(display_.get());
|
||||
}
|
||||
|
||||
if (big_tile_denoise_work_) {
|
||||
big_tile_denoise_work_->destroy_gpu_resources(display_.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -236,6 +236,7 @@ class PathTrace {
|
|||
/* CPU device for creating temporary render buffers on the CPU side. */
|
||||
unique_ptr<Device> cpu_device_;
|
||||
|
||||
Film *film_;
|
||||
DeviceScene *device_scene_;
|
||||
|
||||
RenderScheduler &render_scheduler_;
|
||||
|
@ -261,6 +262,9 @@ class PathTrace {
|
|||
/* Denoiser which takes care of denoising the big tile. */
|
||||
unique_ptr<Denoiser> denoiser_;
|
||||
|
||||
/* Denoiser device descriptor which holds the denoised big tile for multi-device workloads. */
|
||||
/* Created lazily in `denoise()` when there is more than one path trace work and a denoiser
 * device is available. Its render buffers receive the big tile that is denoised, and are then
 * used for display updates and tile output once a denoised result exists.
 * The member name matches every use in the `PathTrace` implementation. */
unique_ptr<PathTraceWork> big_tile_denoise_work_;
|
||||
|
||||
/* State which is common for all the steps of the render work.
|
||||
* Is brought up to date in the `render()` call and is accessed from all the steps involved into
|
||||
* rendering the work. */
|
||||
|
|
|
@ -33,7 +33,7 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
|
|||
if (!copied_from_device_) {
|
||||
/* Copy from device on demand. */
|
||||
path_trace_.copy_render_tile_from_device();
|
||||
const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
|
||||
copied_from_device_ = true;
|
||||
}
|
||||
|
||||
const BufferParams &buffer_params = path_trace_.get_render_tile_params();
|
||||
|
|
|
@ -24,7 +24,7 @@ class PathTraceTile : public OutputDriver::Tile {
|
|||
|
||||
private:
|
||||
PathTrace &path_trace_;
|
||||
bool copied_from_device_;
|
||||
mutable bool copied_from_device_;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -370,6 +370,14 @@ RenderWork Session::run_update_for_next_iteration()
|
|||
if (update_scene(width, height)) {
|
||||
profiler.reset(scene->shaders.size(), scene->objects.size());
|
||||
}
|
||||
|
||||
/* Unlock scene mutex before loading denoiser kernels, since that may attempt to activate
|
||||
* graphics interop, which can deadlock when the scene mutex is still being held. */
|
||||
scene_lock.unlock();
|
||||
|
||||
path_trace_->load_kernels();
|
||||
path_trace_->alloc_work_memory();
|
||||
|
||||
progress.add_skip_time(update_timer, params.background);
|
||||
}
|
||||
|
||||
|
@ -618,12 +626,7 @@ bool Session::update_scene(int width, int height)
|
|||
Camera *cam = scene->camera;
|
||||
cam->set_screen_size(width, height);
|
||||
|
||||
const bool scene_update_result = scene->update(progress);
|
||||
|
||||
path_trace_->load_kernels();
|
||||
path_trace_->alloc_work_memory();
|
||||
|
||||
return scene_update_result;
|
||||
return scene->update(progress);
|
||||
}
|
||||
|
||||
static string status_append(const string &status, const string &suffix)
|
||||
|
|
Loading…
Reference in New Issue