Cycles: Metal integrator state size tuning
This patch tunes the integrator state sizing for Metal (`num_concurrent_states` and `num_concurrent_busy_states`). On all GPU architectures, we adjust the busy:total state ratio to 1:4, which gives better rendering performance than the previous 1:16 ratio (independent of total state count). This gives a small performance uplift (e.g. 2-3% on M1 Ultra). Additionally, for M2 architectures, we double the overall state size if there is available headroom. Inclusive of the first change, we can expect an uplift of close to 10% in future, as this results in larger dispatch sizes and minimises work submission overheads. In order to make an accurate determination of available headroom, we defer the calculation of `num_concurrent_states` and `num_concurrent_busy_states` until the time of integrator state allocation (i.e. after all of the scene data has been allocated). We also refactor `alloc_integrator_soa` to calculate an *exact* single-state-size in a first pass, right before allocating the integrator SoA buffers in a second pass. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16313
This commit is contained in:
parent
4c6e07230f
commit
8dd7b5b26b
|
@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
|
|||
return num_states;
|
||||
}
|
||||
|
||||
int CUDADeviceQueue::num_concurrent_busy_states() const
|
||||
int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
|
||||
{
|
||||
const int max_num_threads = cuda_device_->get_num_multiprocessors() *
|
||||
cuda_device_->get_max_num_threads_per_multiprocessor();
|
||||
|
|
|
@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
|
|||
~CUDADeviceQueue();
|
||||
|
||||
virtual int num_concurrent_states(const size_t state_size) const override;
|
||||
virtual int num_concurrent_busy_states() const override;
|
||||
virtual int num_concurrent_busy_states(const size_t state_size) const override;
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
|
|||
return num_states;
|
||||
}
|
||||
|
||||
int HIPDeviceQueue::num_concurrent_busy_states() const
|
||||
int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
|
||||
{
|
||||
const int max_num_threads = hip_device_->get_num_multiprocessors() *
|
||||
hip_device_->get_max_num_threads_per_multiprocessor();
|
||||
|
|
|
@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
|
|||
~HIPDeviceQueue();
|
||||
|
||||
virtual int num_concurrent_states(const size_t state_size) const override;
|
||||
virtual int num_concurrent_busy_states() const override;
|
||||
virtual int num_concurrent_busy_states(const size_t state_size) const override;
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
||||
|
|
|
@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
|
|||
}
|
||||
|
||||
source = global_defines + source;
|
||||
# if 0
|
||||
metal_printf("================\n%s================\n\%s================\n",
|
||||
global_defines.c_str(),
|
||||
baked_constants.c_str());
|
||||
# endif
|
||||
|
||||
/* Generate an MD5 from the source and include any baked constants. This is used when caching
|
||||
* PSOs. */
|
||||
|
|
|
@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
|
|||
}
|
||||
}
|
||||
|
||||
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
|
||||
if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
|
||||
/* Skip shade_surface_mnee kernel if the scene doesn't require it. */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (pso_type != PSO_GENERIC) {
|
||||
/* Only specialize kernels where it can make an impact. */
|
||||
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
|
||||
|
|
|
@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
|
|||
~MetalDeviceQueue();
|
||||
|
||||
virtual int num_concurrent_states(const size_t) const override;
|
||||
virtual int num_concurrent_busy_states() const override;
|
||||
virtual int num_concurrent_busy_states(const size_t) const override;
|
||||
virtual int num_sort_partition_elements() const override;
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
|
|
@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
|
|||
}
|
||||
}
|
||||
|
||||
int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
|
||||
int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
|
||||
{
|
||||
/* METAL_WIP */
|
||||
/* TODO: compute automatically. */
|
||||
/* TODO: must have at least num_threads_per_block. */
|
||||
int result = 1048576;
|
||||
if (metal_device_->device_vendor == METAL_GPU_AMD) {
|
||||
result *= 2;
|
||||
static int result = 0;
|
||||
if (result) {
|
||||
return result;
|
||||
}
|
||||
else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
|
||||
|
||||
result = 1048576;
|
||||
if (metal_device_->device_vendor == METAL_GPU_APPLE) {
|
||||
result *= 4;
|
||||
|
||||
if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
|
||||
size_t system_ram = system_physical_ram();
|
||||
size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
|
||||
size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
|
||||
|
||||
/* Determine whether we can double the state count, and leave enough GPU-available memory
|
||||
* (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
|
||||
* keep dispatch sizes high and minimize work submission overheads. */
|
||||
size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
|
||||
size_t total_state_size = result * state_size;
|
||||
if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
|
||||
result *= 2;
|
||||
metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (metal_device_->device_vendor == METAL_GPU_AMD) {
|
||||
/* METAL_WIP */
|
||||
/* TODO: compute automatically. */
|
||||
/* TODO: must have at least num_threads_per_block. */
|
||||
result *= 2;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int MetalDeviceQueue::num_concurrent_busy_states() const
|
||||
int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
|
||||
{
|
||||
/* METAL_WIP */
|
||||
/* TODO: compute automatically. */
|
||||
int result = 65536;
|
||||
if (metal_device_->device_vendor == METAL_GPU_AMD) {
|
||||
result *= 2;
|
||||
}
|
||||
else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
|
||||
result *= 4;
|
||||
}
|
||||
return result;
|
||||
/* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
|
||||
return num_concurrent_states(state_size) / 4;
|
||||
}
|
||||
|
||||
int MetalDeviceQueue::num_sort_partition_elements() const
|
||||
|
|
|
@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
|
|||
return num_states;
|
||||
}
|
||||
|
||||
int OneapiDeviceQueue::num_concurrent_busy_states() const
|
||||
int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
|
||||
{
|
||||
const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
|
||||
oneapi_device_->get_max_num_threads_per_multiprocessor();
|
||||
|
|
|
@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
|
|||
|
||||
virtual int num_concurrent_states(const size_t state_size) const override;
|
||||
|
||||
virtual int num_concurrent_busy_states() const override;
|
||||
virtual int num_concurrent_busy_states(const size_t state_size) const override;
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
||||
|
|
|
@ -103,7 +103,7 @@ class DeviceQueue {
|
|||
/* Number of states which keeps the device occupied with work without losing performance.
|
||||
* The renderer will add more work (when available) when number of active paths falls below this
|
||||
* value. */
|
||||
virtual int num_concurrent_busy_states() const = 0;
|
||||
virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
|
||||
|
||||
/* Number of elements in a partition of sorted shaders, that improves memory locality of
|
||||
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */
|
||||
|
|
|
@ -18,13 +18,15 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
static size_t estimate_single_state_size()
|
||||
static size_t estimate_single_state_size(const uint kernel_features)
|
||||
{
|
||||
size_t state_size = 0;
|
||||
|
||||
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
|
||||
state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
break; \
|
||||
}
|
||||
|
@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
|
|||
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
|
||||
work_tiles_(device, "work_tiles", MEM_READ_WRITE),
|
||||
display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
|
||||
max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
|
||||
min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
|
||||
max_num_paths_(0),
|
||||
min_num_active_main_paths_(0),
|
||||
max_active_main_path_index_(0)
|
||||
{
|
||||
memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
|
||||
|
||||
/* Limit number of active paths to the half of the overall state. This is due to the logic in the
|
||||
* path compaction which relies on the fact that regeneration does not happen sooner than half of
|
||||
* the states are available again. */
|
||||
min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
|
||||
}
|
||||
|
||||
void PathTraceWorkGPU::alloc_integrator_soa()
|
||||
|
@ -103,6 +100,20 @@ void PathTraceWorkGPU::alloc_integrator_soa()
|
|||
integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
|
||||
requested_volume_stack_size);
|
||||
|
||||
/* Determine the number of path states. Deferring this for as long as possible allows the backend
|
||||
* to make better decisions about memory availability. */
|
||||
if (max_num_paths_ == 0) {
|
||||
size_t single_state_size = estimate_single_state_size(kernel_features);
|
||||
|
||||
max_num_paths_ = queue_->num_concurrent_states(single_state_size);
|
||||
min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
|
||||
|
||||
/* Limit number of active paths to the half of the overall state. This is due to the logic in
|
||||
* the path compaction which relies on the fact that regeneration does not happen sooner than
|
||||
* half of the states are available again. */
|
||||
min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
|
||||
}
|
||||
|
||||
/* Allocate a device only memory buffer before for each struct member, and then
|
||||
* write the pointers into a struct that resides in constant memory.
|
||||
*
|
||||
|
|
Loading…
Reference in New Issue