Cycles: Implement tile stealing to improve CPU+GPU rendering performance

While Cycles already supports using both CPU and GPU at the same time, there
currently is a large problem with it: Since the CPU grabs one tile per thread,
at the end of the render the GPU runs out of new work but the CPU still needs
quite some time to finish its current tiles.

Having smaller tiles helps somewhat, but especially OpenCL rendering tends to
lose performance with smaller tiles.

Therefore, this commit adds support for tile stealing: When a GPU device runs
out of new tiles, it can signal the CPU to release one of its tiles.
This way, at the end of the render, the GPU quickly finishes the remaining
tiles instead of having to wait for the CPU.

Thanks to AMD for sponsoring this work!

Differential Revision: https://developer.blender.org/D9324
This commit is contained in:
Lukas Stockner 2020-09-24 00:37:23 +02:00
parent 523414dda2
commit 517ff40b12
Notes: blender-bot 2023-02-14 07:17:43 +01:00
Referenced by issue #82351, Tile stealing glitches with adaptive sampling
7 changed files with 124 additions and 5 deletions

View File

@ -932,6 +932,11 @@ class CPUDevice : public Device {
break;
}
if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
tile.stealing_state = RenderTile::WAS_STOLEN;
break;
}
if (tile.task == RenderTile::PATH_TRACE) {
for (int y = tile.y; y < tile.y + tile.h; y++) {
for (int x = tile.x; x < tile.x + tile.w; x++) {

View File

@ -450,6 +450,14 @@ template<typename T> class device_vector : public device_memory {
device_zero();
}
/* Re-home this vector on `new_device`.
 *
 * The sequence is order-dependent: the contents are fetched from the current
 * device and its allocation is released before `device` is switched, so that
 * the final upload targets `new_device`.
 * NOTE(review): presumably copy_from_device() reads back into the host-side
 * copy and copy_to_device() (re)allocates on `device` - confirm against
 * device_memory. */
void move_device(Device *new_device)
{
/* Fetch the current contents while the old device still owns the memory. */
copy_from_device();
/* Release the allocation on the old device. */
device_free();
/* From here on, device operations target the new device. */
device = new_device;
/* Upload the contents to the new device. */
copy_to_device();
}
protected:
size_t size(size_t width, size_t height, size_t depth)
{

View File

@ -159,6 +159,7 @@ class DeviceTask {
function<void(RenderTile &)> update_tile_sample;
function<void(RenderTile &)> release_tile;
function<bool()> get_cancel;
function<bool()> get_tile_stolen;
function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;

View File

@ -125,6 +125,7 @@ RenderTile::RenderTile()
buffer = 0;
buffers = NULL;
stealing_state = NO_STEALING;
}
/* Render Buffers */

View File

@ -146,6 +146,9 @@ class RenderTile {
device_ptr buffer;
int device_size;
typedef enum { NO_STEALING = 0, CAN_BE_STOLEN = 1, WAS_STOLEN = 2 } StealingState;
StealingState stealing_state;
RenderBuffers *buffers;
RenderTile();

View File

@ -382,6 +382,63 @@ bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
return false;
}
/* Try to take over a tile that is currently in flight on another device.
 *
 * rtile:       receives the stolen tile on success, already moved to `tile_device`
 *              and adjusted so rendering continues at the sample where the previous
 *              device stopped.
 * tile_device: the device requesting work. CPU devices never steal.
 * tile_lock:   lock held by the caller; it is handed to the condition variable and is
 *              therefore released while this function waits.
 *
 * Returns true if a tile was stolen, false if there is nothing (left) to steal.
 *
 * Only one steal request is in flight at a time: the request is published through
 * `tile_stealing_state`, picked up by a rendering device via get_tile_stolen(), and
 * completed in release_tile(), which stores the tile in `stolen_tile`. */
bool Session::steal_tile(RenderTile &rtile, Device *tile_device, thread_scoped_lock &tile_lock)
{
  /* Devices that can get their tiles stolen don't steal tiles themselves.
   * Additionally, if there are no stealable tiles in flight, give up here. */
  if (tile_device->info.type == DEVICE_CPU || stealable_tiles == 0) {
    return false;
  }

  /* Wait until no other thread is trying to steal a tile. */
  while (tile_stealing_state != NOT_STEALING && stealable_tiles > 0) {
    /* Someone else is currently trying to get a tile.
     * Wait on the condition variable and try later. */
    tile_steal_cond.wait(tile_lock);
  }
  /* If another thread stole the last stealable tile in the meantime, give up. */
  if (stealable_tiles == 0) {
    return false;
  }

  /* There are stealable tiles in flight, so signal that one should be released.
   * The loop above only exits with stealable tiles left when the state is NOT_STEALING,
   * so no other request is overwritten here. */
  tile_stealing_state = WAITING_FOR_TILE;

  /* Wait until a device notices the signal and releases its tile. */
  while (tile_stealing_state != GOT_TILE && stealable_tiles > 0) {
    tile_steal_cond.wait(tile_lock);
  }
  /* If the last stealable tile finished on its own, give up and withdraw the request. */
  if (tile_stealing_state != GOT_TILE) {
    tile_stealing_state = NOT_STEALING;
    return false;
  }

  /* Successfully stole a tile, now move it to the new device. */
  rtile = stolen_tile;
  rtile.buffers->buffer.move_device(tile_device);
  rtile.buffer = rtile.buffers->buffer.device_pointer;
  rtile.stealing_state = RenderTile::NO_STEALING;

  /* Only the samples that the previous device did not render yet remain to be done. */
  rtile.num_samples -= (rtile.sample - rtile.start_sample);
  rtile.start_sample = rtile.sample;

  tile_stealing_state = NOT_STEALING;

  /* Poke any threads which might be waiting for NOT_STEALING above. */
  tile_steal_cond.notify_one();

  return true;
}
/* Polled by a rendering device to find out whether it should give up its tile.
 *
 * If a steal request is pending (state WAITING_FOR_TILE), atomically claim it by
 * switching the state to RELEASING_TILE and return true; otherwise leave the state
 * untouched and return false. At most one caller can claim a given request. */
bool Session::get_tile_stolen()
{
  /* The strong variant is used because this is a single attempt rather than a retry
   * loop: the weak variant is allowed to fail spuriously, which would report "not
   * stolen" even though a request is pending. */
  TileStealingState expected = WAITING_FOR_TILE;
  return tile_stealing_state.compare_exchange_strong(expected, RELEASING_TILE);
}
bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types)
{
if (progress.get_cancel()) {
@ -403,7 +460,8 @@ bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_typ
denoising_cond.wait(tile_lock);
continue;
}
return false;
return steal_tile(rtile, tile_device, tile_lock);
}
/* fill render tile */
@ -419,11 +477,18 @@ bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_typ
if (tile->state == Tile::DENOISE) {
rtile.task = RenderTile::DENOISE;
}
else if (read_bake_tile_cb) {
rtile.task = RenderTile::BAKE;
}
else {
rtile.task = RenderTile::PATH_TRACE;
if (tile_device->info.type == DEVICE_CPU) {
stealable_tiles++;
rtile.stealing_state = RenderTile::CAN_BE_STOLEN;
}
if (read_bake_tile_cb) {
rtile.task = RenderTile::BAKE;
}
else {
rtile.task = RenderTile::PATH_TRACE;
}
}
tile_lock.unlock();
@ -508,6 +573,26 @@ void Session::release_tile(RenderTile &rtile, const bool need_denoise)
{
thread_scoped_lock tile_lock(tile_mutex);
if (rtile.stealing_state != RenderTile::NO_STEALING) {
stealable_tiles--;
if (rtile.stealing_state == RenderTile::WAS_STOLEN) {
/* If the tile is being stolen, don't release it here - the new device will pick up where
* the old one left off. */
assert(tile_stealing_state == RELEASING_TILE);
assert(rtile.sample < rtile.start_sample + rtile.num_samples);
tile_stealing_state = GOT_TILE;
stolen_tile = rtile;
tile_steal_cond.notify_all();
return;
}
else if (stealable_tiles == 0) {
/* If this was the last stealable tile, wake up any threads still waiting for one. */
tile_steal_cond.notify_all();
}
}
progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
bool delete_tile;
@ -815,6 +900,8 @@ void Session::reset_(BufferParams &buffer_params, int samples)
}
tile_manager.reset(buffer_params, samples);
stealable_tiles = 0;
tile_stealing_state = NOT_STEALING;
progress.reset_sample();
bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX;
@ -1075,6 +1162,7 @@ void Session::render(bool need_denoise)
task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
task.get_tile_stolen = function_bind(&Session::get_tile_stolen, this);
task.need_finish_queue = params.progressive_refine;
task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;

View File

@ -193,6 +193,8 @@ class Session {
bool render_need_denoise(bool &delayed);
bool steal_tile(RenderTile &tile, Device *tile_device, thread_scoped_lock &tile_lock);
bool get_tile_stolen();
bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types);
void update_tile_sample(RenderTile &tile);
void release_tile(RenderTile &tile, const bool need_denoise);
@ -217,11 +219,22 @@ class Session {
thread_mutex buffers_mutex;
thread_mutex display_mutex;
thread_condition_variable denoising_cond;
thread_condition_variable tile_steal_cond;
double reset_time;
double last_update_time;
double last_display_time;
RenderTile stolen_tile;
/* Progress of the (single) tile steal request that may be in flight.
 * A request moves through the states in the order listed and then returns to
 * NOT_STEALING: steal_tile() publishes WAITING_FOR_TILE, get_tile_stolen() switches it
 * to RELEASING_TILE, release_tile() switches it to GOT_TILE, and steal_tile() resets it
 * once it has taken over the tile. */
typedef enum {
NOT_STEALING, /* There currently is no tile stealing in progress. */
WAITING_FOR_TILE, /* A device is waiting for another device to release a tile. */
RELEASING_TILE, /* A device has noticed the request and is releasing a stealable tile. */
GOT_TILE /* A device has released a stealable tile, which is now stored in stolen_tile. */
} TileStealingState;
std::atomic<TileStealingState> tile_stealing_state;
int stealable_tiles;
/* progressive refine */
bool update_progressive_refine(bool cancel);
};