Cycles: MetalRT optimisation for subsurface intersection queries

This patch optimises subsurface intersection queries on MetalRT. Currently intersect_local traverses from the scene root, retrospectively discarding all non-local hits. Using a lookup of bottom level acceleration structures, we can explicitly query only the relevant instance. On M1 Max, with MetalRT selected, this can give a render speedup of 15-20% for scenes like Monster which make heavy use of subsurface scattering.

Patch authored by Marco Giordano.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D17153
This commit is contained in:
Michael Jones 2023-02-06 19:09:51 +00:00 committed by Michael Jones
parent 961d99d3a4
commit 2d994de77c
Notes: blender-bot 2023-02-21 12:05:03 +01:00
Referenced by commit 6583acb880, Fix Cycles MetalRT access of macOS 11 features when unavailable
10 changed files with 177 additions and 15 deletions

View File

@ -21,6 +21,7 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;
bool motion_blur = false;

View File

@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
// allocate look up buffer for wost case scenario
uint64_t count = objects.size();
blas_lookup.resize(count);
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;

View File

@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;

View File

@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// preparing the blas arg encoder
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@ -1240,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
// allocate required buffers for BLAS array
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}
count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
memcpy([blas_lookup_buffer contents],
bvhMetalRT -> blas_lookup.data(),
blas_lookup_buffer.allocatedSize);
if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

View File

@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};
/* Pipeline State Object types */
enum MetalPipelineType {

View File

@ -524,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@ -614,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

View File

@ -482,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@ -532,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all Accelerations resources as used */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];

View File

@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
kernel_assert(!"Invalid ift_local");
return false;
}
# endif
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
metalrt_intersector_type metalrt_intersect;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
if (local_isect) {
local_isect->num_hits = 0;
}
kernel_assert(!"Invalid ift_local_prim");
return false;
}
# endif
MetalRTIntersectionLocalPayload payload;
payload.self = ray->self;
@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
payload.result = false;
typename metalrt_intersector_type::result_type intersection;
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
# if defined(__METALRT_MOTION__)
metalrt_intersector_type metalrt_intersect;
typename metalrt_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
# else
metalrt_blas_intersector_type metalrt_intersect;
typename metalrt_blas_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
// if we know we are going to get max one hit, like for random-sss-walk we can
// optimize and accept the first hit
if (max_hits == 1) {
metalrt_intersect.accept_any_intersection(true);
}
int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
// transform the ray into object's local space
Transform itfm = kernel_data_fetch(objects, local_object).itfm;
r.origin = transform_point(&itfm, r.origin);
r.direction = transform_direction(&itfm, r.direction);
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
r,
metal_ancillaries->blas_accel_structs[blas_index].blas,
metal_ancillaries->ift_local_prim,
payload);
# endif
if (lcg_state) {

View File

@ -266,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
# if defined(__METALRT_MOTION__)
# define METALRT_TAGS instancing, instance_motion, primitive_motion
# define METALRT_BLAS_TAGS , primitive_motion
# else
# define METALRT_TAGS instancing
# define METALRT_BLAS_TAGS
# endif /* __METALRT_MOTION__ */
typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
# if defined(__METALRT_MOTION__)
typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data, primitive_motion>
metalrt_blas_intersector_type;
# else
typedef acceleration_structure<> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
# endif
#endif /* __METALRT__ */
@ -285,6 +297,12 @@ struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};
#ifdef __METALRT__
struct MetalRTBlasWrapper {
metalrt_blas_as_type blas;
};
#endif
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
@ -294,6 +312,9 @@ struct MetalAncillaries {
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_local;
metalrt_blas_ift_type ift_local_prim;
constant MetalRTBlasWrapper *blas_accel_structs;
constant int *blas_userID_to_index_lookUp;
#endif
};

View File

@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
#endif
}
[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri_prim(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint primitive_id [[primitive_id]],
float2 barycentrics [[barycentric_coord]],
float ray_tmax [[distance]])
{
//instance_id, aka the user_id has been removed. If we take this function we optimized the
//SSS for starting traversal from a primitive acceleration structure instead of the root of the global AS.
//this means we will always be intersecting the correct object no need for the userid to check
return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
}
[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
return result;
}
[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
result.distance = ray_tmax;
result.accept = false;
result.continue_search = false;
return result;
}
template<uint intersection_type>
bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,