Metal: Optimise shader texture cache usage and branch reduction via point sampling.

Implement texelFetch calls using a texture point-sample rather than a texture read call. This increases texture cache utilisation when a shader mixes sampled calls and reads. Bounds checking can also be removed from these functions, reducing instruction count and branch divergence, as the sampler routine handles range clamping.

Authored by Apple: Michael Parkin-White
Ref T96261

Depends on D16923

Reviewed By: fclem

Maniphest Tasks: T96261

Differential Revision: https://developer.blender.org/D17021
This commit is contained in:
Jason Fielder 2023-01-31 10:56:13 +01:00 committed by Clément Foucault
parent 9f866a92dc
commit f3bd5458a3
Notes: blender-bot 2023-02-14 05:50:03 +01:00
Referenced by issue #96261, Metal Viewport
2 changed files with 90 additions and 3 deletions

View File

@ -1752,8 +1752,9 @@ void MSLGeneratorInterface::prepare_from_createinfo(const shader::ShaderCreateIn
/**
 * Decide whether texture samplers should be passed via an argument buffer.
 *
 * \return true when the shader declares 15 or more texture samplers and the device reports
 * support for more than 16 samplers.
 */
bool MSLGeneratorInterface::use_argument_buffer_for_samplers() const
{
  /* We can only use argument buffers IF sampler count exceeds static limit of 16,
   * AND we can support more samplers with an argument buffer.
   * NOTE: We reserve one constant sampler within the shader for fast read via point-sampling,
   * which is why the threshold below is 15 rather than 16. */
  return texture_samplers.size() >= 15 && GPU_max_samplers() > 16;
}
uint32_t MSLGeneratorInterface::num_samplers_for_stage(ShaderStage stage) const

View File

@ -291,7 +291,93 @@ union _msl_return_float {
/* Add custom texture sampling/reading routines for each type to account for special return cases,
 * e.g. returning a float with an r parameter. NOTE: Cannot use template specialization for input
 * type, as return types are specific to the signature of 'tex'. */
/* Texture Read. */
/* Use point sampler instead of texture read to benefit from texture caching and reduce branching
 * through removal of bounds tests, as these are handled by the sample operation.
 * The sampler is configured with nearest filtering and pixel (non-normalized) coordinates, and
 * uses the `clamp_to_zero` address mode for coordinates outside the texture. */
constexpr sampler _point_sample_(address::clamp_to_zero, filter::nearest, coord::pixel);
/* Texture Read via point sampling.
* NOTE: These templates will evaluate first for texture resources bound with sample. */
/* 1D texel fetch performed as a point sample.
 * The integer texel index is converted to a float pixel coordinate.
 * `lod` is not used by this overload. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, access::sample> tex,
                                      T texel,
                                      uint lod = 0)
{
  const float coord = float(texel);
  return tex.texture->sample(_point_sample_, coord);
}
/* 1D texel fetch with a texel offset, performed as a point sample.
 * The offset is added to the texel index before conversion to a float pixel coordinate.
 * `lod` is not used by this overload. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, access::sample> tex,
                                      T texel,
                                      uint lod,
                                      T offset)
{
  const float coord = float(texel + offset);
  return tex.texture->sample(_point_sample_, coord);
}
/* 1D-array texel fetch performed as a point sample.
 * `texel.x` (plus `offset.x`) is the pixel coordinate, `texel.y` (plus `offset.y`) selects the
 * array layer. `lod` is not used by this overload. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
    thread _mtl_combined_image_sampler_1d_array<S, access::sample> tex,
    vec<T, 2> texel,
    uint lod,
    vec<T, 2> offset = vec<T, 2>(0, 0))
{
  const float coord = float(texel.x + offset.x);
  const uint layer = uint(texel.y + offset.y);
  return tex.texture->sample(_point_sample_, coord, layer);
}
/* 2D texel fetch performed as a point sample at mip level `lod`.
 * The offset texel position is converted to a float2 pixel coordinate. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, access::sample> tex,
                                      vec<T, 2> texel,
                                      uint lod,
                                      vec<T, 2> offset = vec<T, 2>(0))
{
  const float2 coord = float2(texel.xy + offset.xy);
  return tex.texture->sample(_point_sample_, coord, level(lod));
}
/* 2D-array texel fetch performed as a point sample at mip level `lod`.
 * The `xy` components (plus offset) form the pixel coordinate, the `z` component (plus offset)
 * selects the array layer. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
    thread _mtl_combined_image_sampler_2d_array<S, access::sample> tex,
    vec<T, 3> texel,
    uint lod,
    vec<T, 3> offset = vec<T, 3>(0))
{
  const float2 coord = float2(texel.xy + offset.xy);
  const uint layer = uint(texel.z + offset.z);
  return tex.texture->sample(_point_sample_, coord, layer, level(lod));
}
/* 3D texel fetch performed as a point sample at mip level `lod`.
 * The offset texel position is converted to a float3 pixel coordinate. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, access::sample> tex,
                                      vec<T, 3> texel,
                                      uint lod,
                                      vec<T, 3> offset = vec<T, 3>(0))
{
  const float3 coord = float3(texel.xyz + offset.xyz);
  return tex.texture->sample(_point_sample_, coord, level(lod));
}
/* 2D depth texel fetch performed as a point sample at mip level `lod`.
 * The sampled value is wrapped in `_msl_return_float` before being returned. */
template<typename T>
inline _msl_return_float _texelFetch_internal(
    thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
    vec<T, 2> texel,
    uint lod,
    vec<T, 2> offset = vec<T, 2>(0))
{
  const float2 coord = float2(texel.xy + offset.xy);
  _msl_return_float result = {tex.texture->sample(_point_sample_, coord, level(lod))};
  return result;
}
/* 2D-array point sample at mip level `lod` using integer texel coordinates.
 * The `xy` components (plus offset) form the pixel coordinate, the `z` component (plus offset)
 * selects the array layer.
 * NOTE(review): The body matches the 2D-array `_texelFetch_internal` overload but the name
 * differs -- confirm whether `_texture_internal_samp` is the intended name. */
template<typename S, typename T>
inline vec<S, 4> _texture_internal_samp(
    thread _mtl_combined_image_sampler_2d_array<S, access::sample> tex,
    vec<T, 3> texel,
    uint lod,
    vec<T, 3> offset = vec<T, 3>(0))
{
  const float2 coord = float2(texel.xy + offset.xy);
  const uint layer = uint(texel.z + offset.z);
  return tex.texture->sample(_point_sample_, coord, layer, level(lod));
}
/* Texture Read via read operation. Required by compute/image-bindings. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
T texel,