Metal: Optimize local shader memory usage.

Global scope arrays can incur suboptimal per-shader-thread memory allocations, resulting in excessive usage of limited local memory resources. These changes ensure that any arrays are limited to the closest scope in which they are required and thus will get correctly optimized by the compiler.

A number of constants have also been replaced with macros, as these can result in better runtime performance for complex shader code.

Authored by Apple: Michael Parkin-White

Ref T96261

Reviewed By: fclem

Maniphest Tasks: T96261

Differential Revision: https://developer.blender.org/D16825
This commit is contained in:
Jason Fielder 2022-12-21 14:11:20 +01:00 committed by Clément Foucault
parent 7ff47f7a94
commit 3535670ff1
Notes: blender-bot 2023-02-14 09:43:37 +01:00
Referenced by issue #96261, Metal Viewport
13 changed files with 101 additions and 87 deletions

View File

@ -23,8 +23,8 @@ uniform sampler2DArray utilTex;
#define LTC_DISK_LAYER 3 /* UNUSED */
/* Layers 4 to 20 are for BTDF Lut. */
const float lut_btdf_layer_first = 4.0;
const float lut_btdf_layer_count = 16.0;
#define lut_btdf_layer_first 4.0
#define lut_btdf_layer_count 16.0
/**
* Reminder: The 4 noise values are based on 3 uncorrelated blue noises:

View File

@ -6,10 +6,10 @@
#pragma BLENDER_REQUIRE(effect_dof_lib.glsl)
const float tile_to_fullres_factor = float(DOF_TILE_DIVISOR);
#define tile_to_fullres_factor float(DOF_TILE_DIVISOR)
/* Error introduced by the random offset of the gathering kernel's center. */
const float bluring_radius_error = 1.0 + 1.0 / (gather_ring_count + 0.5);
#define bluring_radius_error (1.0 + 1.0 / (gather_ring_count + 0.5))
void main()
{

View File

@ -10,6 +10,7 @@
void main()
{
DEFINE_DOF_QUAD_OFFSETS
vec2 halfres_texel_size = 1.0 / vec2(textureSize(colorBuffer, 0).xy);
/* Center uv around the 4 halfres pixels. */
vec2 quad_center = (floor(gl_FragCoord.xy) * 2.0 + 1.0) * halfres_texel_size;

View File

@ -9,7 +9,7 @@
#pragma BLENDER_REQUIRE(effect_dof_lib.glsl)
const int halfres_tile_divisor = DOF_TILE_DIVISOR / 2;
#define halfres_tile_divisor (DOF_TILE_DIVISOR / 2)
void main()
{

View File

@ -18,9 +18,9 @@ vec2 outOcclusion;
#endif
#ifdef DOF_FOREGROUND_PASS
const bool is_foreground = true;
# define is_foreground true
#else /* DOF_BACKGROUND_PASS */
const bool is_foreground = false;
# define is_foreground false
#endif
const float unit_ring_radius = 1.0 / float(gather_ring_count);

View File

@ -12,38 +12,41 @@
// #define DOF_DEBUG_GATHER_PERF
// #define DOF_DEBUG_SCATTER_PERF
const bool no_smooth_intersection = false;
const bool no_gather_occlusion = false;
const bool no_gather_mipmaps = false;
const bool no_gather_random = false;
const bool no_gather_filtering = false;
const bool no_scatter_occlusion = false;
const bool no_scatter_pass = false;
const bool no_foreground_pass = false;
const bool no_background_pass = false;
const bool no_slight_focus_pass = false;
const bool no_focus_pass = false;
const bool no_holefill_pass = false;
#define no_smooth_intersection false
#define no_gather_occlusion false
#define no_gather_mipmaps false
#define no_gather_random false
#define no_gather_filtering false
#define no_scatter_occlusion false
#define no_scatter_pass false
#define no_foreground_pass false
#define no_background_pass false
#define no_slight_focus_pass false
#define no_focus_pass false
#define no_holefill_pass false
/* -------------- Quality Defines ------------- */
#ifdef DOF_HOLEFILL_PASS
/* No need for very high density for holefill. */
const int gather_ring_count = 3;
const int gather_ring_density = 3;
const int gather_max_density_change = 0;
const int gather_density_change_ring = 1;
# define gather_ring_count 3
# define gather_ring_density 3
# define gather_max_density_change 0
# define gather_density_change_ring 1
#else
const int gather_ring_count = DOF_GATHER_RING_COUNT;
const int gather_ring_density = 3;
const int gather_max_density_change = 50; /* Dictates the maximum good quality blur. */
const int gather_density_change_ring = 1;
# define gather_ring_count DOF_GATHER_RING_COUNT
# define gather_ring_density 3
# define gather_max_density_change 50 /* Dictates the maximum good quality blur. */
# define gather_density_change_ring 1
#endif
/* -------------- Utils ------------- */
const vec2 quad_offsets[4] = vec2[4](
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(0.5, -0.5), vec2(-0.5, -0.5));
/* Declares the constant `quad_offsets[4]` array (the four half-texel corner offsets of a 2x2 quad)
 * in the scope where the macro is expanded.
 *
 * For performance on macOS, constants declared within function scope utilize constant uniform
 * register space rather than per-thread, reducing spill and increasing
 * thread execution width - and thus performance.
 *
 * Usage: expand this macro at the top of every function body that indexes `quad_offsets`.
 * The expansion already ends with a semicolon, so none is needed at the call site. */
#define DEFINE_DOF_QUAD_OFFSETS \
const vec2 quad_offsets[4] = vec2[4]( \
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(0.5, -0.5), vec2(-0.5, -0.5));
/* Divide by sensor size to get the normalized size. */
#define calculate_coc_persp(zdepth) (cocMul / zdepth - cocBias)
@ -128,11 +131,11 @@ float dof_load_gather_coc(sampler2D gather_input_coc_buffer, vec2 uv, float lod)
}
/* Distribute weights between near/slightfocus/far fields (slide 117). */
const float layer_threshold = 4.0;
#define layer_threshold 4.0
/* Make sure it overlaps. */
const float layer_offset_fg = 0.5 + 1.0;
#define layer_offset_fg (0.5 + 1.0)
/* Extra offset for convolution layers to avoid light leaking from background. */
const float layer_offset = 0.5 + 0.5;
#define layer_offset (0.5 + 0.5)
#define DOF_MAX_SLIGHT_FOCUS_RADIUS 16

View File

@ -11,6 +11,7 @@
/* NOTE: Do not compare alpha as it is not scattered by the scatter pass. */
float dof_scatter_neighborhood_rejection(vec3 color)
{
DEFINE_DOF_QUAD_OFFSETS;
color = min(vec3(scatterColorNeighborMax), color);
float validity = 0.0;
@ -132,6 +133,7 @@ void main()
/* Downsample pass done for each mip starting from mip1. */
void main()
{
DEFINE_DOF_QUAD_OFFSETS
vec2 input_texel_size = 1.0 / vec2(textureSize(colorBuffer, 0).xy);
/* Center uv around the 4 pixels of the previous mip. */
vec2 quad_center = (floor(gl_FragCoord.xy) * 2.0 + 1.0) * input_texel_size;

View File

@ -29,6 +29,7 @@ float bokeh_shape(vec2 center)
void main(void)
{
DEFINE_DOF_QUAD_OFFSETS
vec4 shapes;
for (int i = 0; i < 4; i++) {
shapes[i] = bokeh_shape(spritepos + quad_offsets[i]);

View File

@ -37,6 +37,7 @@ void vertex_discard()
void main()
{
DEFINE_DOF_QUAD_OFFSETS
ivec2 tex_size = textureSize(cocBuffer, 0);
int t_id = gl_VertexID / 3; /* Triangle Id */

View File

@ -10,6 +10,7 @@
void main()
{
DEFINE_DOF_QUAD_OFFSETS
vec2 fullres_texel_size = 1.0 / vec2(textureSize(colorBuffer, 0).xy);
/* Center uv around the 4 fullres pixels. */
vec2 quad_center = (floor(gl_FragCoord.xy) * 2.0 + 1.0) * fullres_texel_size;

View File

@ -47,53 +47,4 @@ HitData decode_hit_data(vec4 hit_data, float hit_depth)
/* Blue noise categorized into 4 sets of samples.
* See "Stochastic all the things" presentation slide 32-37. */
const int resolve_samples_count = 9;
const vec2 resolve_sample_offsets[36] = vec2[36](
/* Set 1. */
/* First Ring (2x2). */
vec2(0, 0),
/* Second Ring (6x6). */
vec2(-1, 3),
vec2(1, 3),
vec2(-1, 1),
vec2(3, 1),
vec2(-2, 0),
vec2(3, 0),
vec2(2, -1),
vec2(1, -2),
/* Set 2. */
/* First Ring (2x2). */
vec2(1, 1),
/* Second Ring (6x6). */
vec2(-2, 3),
vec2(3, 3),
vec2(0, 2),
vec2(2, 2),
vec2(-2, -1),
vec2(1, -1),
vec2(0, -2),
vec2(3, -2),
/* Set 3. */
/* First Ring (2x2). */
vec2(0, 1),
/* Second Ring (6x6). */
vec2(0, 3),
vec2(3, 2),
vec2(-2, 1),
vec2(2, 1),
vec2(-1, 0),
vec2(-2, -2),
vec2(0, -1),
vec2(2, -2),
/* Set 4. */
/* First Ring (2x2). */
vec2(1, 0),
/* Second Ring (6x6). */
vec2(2, 3),
vec2(-2, 2),
vec2(-1, 2),
vec2(1, 2),
vec2(2, 0),
vec2(-1, -1),
vec2(3, -1),
vec2(-1, -2));
#define resolve_samples_count 9

View File

@ -102,6 +102,58 @@ void raytrace_resolve(ClosureInputGlossy cl_in,
inout ClosureEvalCommon cl_common,
inout ClosureOutputGlossy cl_out)
{
/* Note: Reflection samples declared in function scope to avoid per-thread memory pressure on
* tile-based GPUs e.g. Apple Silicon. */
const vec2 resolve_sample_offsets[36] = vec2[36](
/* Set 1. */
/* First Ring (2x2). */
vec2(0, 0),
/* Second Ring (6x6). */
vec2(-1, 3),
vec2(1, 3),
vec2(-1, 1),
vec2(3, 1),
vec2(-2, 0),
vec2(3, 0),
vec2(2, -1),
vec2(1, -2),
/* Set 2. */
/* First Ring (2x2). */
vec2(1, 1),
/* Second Ring (6x6). */
vec2(-2, 3),
vec2(3, 3),
vec2(0, 2),
vec2(2, 2),
vec2(-2, -1),
vec2(1, -1),
vec2(0, -2),
vec2(3, -2),
/* Set 3. */
/* First Ring (2x2). */
vec2(0, 1),
/* Second Ring (6x6). */
vec2(0, 3),
vec2(3, 2),
vec2(-2, 1),
vec2(2, 1),
vec2(-1, 0),
vec2(-2, -2),
vec2(0, -1),
vec2(2, -2),
/* Set 4. */
/* First Ring (2x2). */
vec2(1, 0),
/* Second Ring (6x6). */
vec2(2, 3),
vec2(-2, 2),
vec2(-1, 2),
vec2(1, 2),
vec2(2, 0),
vec2(-1, -1),
vec2(3, -1),
vec2(-1, -2));
float roughness = cl_in.roughness;
vec4 ssr_accum = vec4(0.0);

View File

@ -182,13 +182,15 @@ void eval_volume_step(inout vec3 Lscat, float extinction, float step_len, out fl
}
#define P(x) ((x + 0.5) * (1.0 / 16.0))
const vec4 dither_mat[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
vec4 volume_integration(vec3 ray_ori, vec3 ray_dir, float ray_inc, float ray_max, float step_len)
{
/* Note: Constant array declared inside function scope to reduce shader core thread memory
* pressure on Apple Silicon. */
const vec4 dither_mat[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
/* Start with full transmittance and no scattered light. */
vec3 final_scattering = vec3(0.0);
float final_transmittance = 1.0;