Metal: Optimize local shader memory usage.
Global scope arrays can incur suboptimal per-shader-thread memory allocations, resulting in excessive usage of limited local memory resources. These changes ensure that arrays are limited to the closest scope in which they are required, so that they are correctly optimized by the compiler. A number of constants have also been replaced with macros, as these can result in better runtime performance for complex shader code. Authored by Apple: Michael Parkin-White Ref T96261 Reviewed By: fclem Maniphest Tasks: T96261 Differential Revision: https://developer.blender.org/D16825
This commit is contained in:
parent
7ff47f7a94
commit
3535670ff1
Notes:
blender-bot
2023-02-14 09:43:37 +01:00
Referenced by issue #96261, Metal Viewport
|
@ -23,8 +23,8 @@ uniform sampler2DArray utilTex;
|
|||
#define LTC_DISK_LAYER 3 /* UNUSED */
|
||||
|
||||
/* Layers 4 to 20 are for BTDF Lut. */
|
||||
const float lut_btdf_layer_first = 4.0;
|
||||
const float lut_btdf_layer_count = 16.0;
|
||||
#define lut_btdf_layer_first 4.0
|
||||
#define lut_btdf_layer_count 16.0
|
||||
|
||||
/**
|
||||
* Reminder: The 4 noise values are based on 3 uncorrelated blue noises:
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
|
||||
#pragma BLENDER_REQUIRE(effect_dof_lib.glsl)
|
||||
|
||||
const float tile_to_fullres_factor = float(DOF_TILE_DIVISOR);
|
||||
#define tile_to_fullres_factor float(DOF_TILE_DIVISOR)
|
||||
|
||||
/* Error introduced by the random offset of the gathering kernel's center. */
|
||||
const float bluring_radius_error = 1.0 + 1.0 / (gather_ring_count + 0.5);
|
||||
#define bluring_radius_error (1.0 + 1.0 / (gather_ring_count + 0.5))
|
||||
|
||||
void main()
|
||||
{
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
void main()
|
||||
{
|
||||
DEFINE_DOF_QUAD_OFFSETS
|
||||
vec2 halfres_texel_size = 1.0 / vec2(textureSize(colorBuffer, 0).xy);
|
||||
/* Center uv around the 4 halfres pixels. */
|
||||
vec2 quad_center = (floor(gl_FragCoord.xy) * 2.0 + 1.0) * halfres_texel_size;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
|
||||
#pragma BLENDER_REQUIRE(effect_dof_lib.glsl)
|
||||
|
||||
const int halfres_tile_divisor = DOF_TILE_DIVISOR / 2;
|
||||
#define halfres_tile_divisor (DOF_TILE_DIVISOR / 2)
|
||||
|
||||
void main()
|
||||
{
|
||||
|
|
|
@ -18,9 +18,9 @@ vec2 outOcclusion;
|
|||
#endif
|
||||
|
||||
#ifdef DOF_FOREGROUND_PASS
|
||||
const bool is_foreground = true;
|
||||
# define is_foreground true
|
||||
#else /* DOF_BACKGROUND_PASS */
|
||||
const bool is_foreground = false;
|
||||
# define is_foreground false
|
||||
#endif
|
||||
|
||||
const float unit_ring_radius = 1.0 / float(gather_ring_count);
|
||||
|
|
|
@ -12,38 +12,41 @@
|
|||
// #define DOF_DEBUG_GATHER_PERF
|
||||
// #define DOF_DEBUG_SCATTER_PERF
|
||||
|
||||
const bool no_smooth_intersection = false;
|
||||
const bool no_gather_occlusion = false;
|
||||
const bool no_gather_mipmaps = false;
|
||||
const bool no_gather_random = false;
|
||||
const bool no_gather_filtering = false;
|
||||
const bool no_scatter_occlusion = false;
|
||||
const bool no_scatter_pass = false;
|
||||
const bool no_foreground_pass = false;
|
||||
const bool no_background_pass = false;
|
||||
const bool no_slight_focus_pass = false;
|
||||
const bool no_focus_pass = false;
|
||||
const bool no_holefill_pass = false;
|
||||
#define no_smooth_intersection false
|
||||
#define no_gather_occlusion false
|
||||
#define no_gather_mipmaps false
|
||||
#define no_gather_random false
|
||||
#define no_gather_filtering false
|
||||
#define no_scatter_occlusion false
|
||||
#define no_scatter_pass false
|
||||
#define no_foreground_pass false
|
||||
#define no_background_pass false
|
||||
#define no_slight_focus_pass false
|
||||
#define no_focus_pass false
|
||||
#define no_holefill_pass false
|
||||
|
||||
/* -------------- Quality Defines ------------- */
|
||||
|
||||
#ifdef DOF_HOLEFILL_PASS
|
||||
/* No need for very high density for holefill. */
|
||||
const int gather_ring_count = 3;
|
||||
const int gather_ring_density = 3;
|
||||
const int gather_max_density_change = 0;
|
||||
const int gather_density_change_ring = 1;
|
||||
# define gather_ring_count 3
|
||||
# define gather_ring_density 3
|
||||
# define gather_max_density_change 0
|
||||
# define gather_density_change_ring 1
|
||||
#else
|
||||
const int gather_ring_count = DOF_GATHER_RING_COUNT;
|
||||
const int gather_ring_density = 3;
|
||||
const int gather_max_density_change = 50; /* Dictates the maximum good quality blur. */
|
||||
const int gather_density_change_ring = 1;
|
||||
# define gather_ring_count DOF_GATHER_RING_COUNT
|
||||
# define gather_ring_density 3
|
||||
# define gather_max_density_change 50 /* Dictates the maximum good quality blur. */
|
||||
# define gather_density_change_ring 1
|
||||
#endif
|
||||
|
||||
/* -------------- Utils ------------- */
|
||||
|
||||
const vec2 quad_offsets[4] = vec2[4](
|
||||
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(0.5, -0.5), vec2(-0.5, -0.5));
|
||||
/* For performance on macOS, constants declared within function scope utilize constant uniform
|
||||
register space rather than per-thread, reducing spill and increasing
|
||||
thread execution width - and thus performance */
|
||||
#define DEFINE_DOF_QUAD_OFFSETS \
|
||||
const vec2 quad_offsets[4] = vec2[4]( \
|
||||
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(0.5, -0.5), vec2(-0.5, -0.5));
|
||||
|
||||
/* Divide by sensor size to get the normalized size. */
|
||||
#define calculate_coc_persp(zdepth) (cocMul / zdepth - cocBias)
|
||||
|
@ -128,11 +131,11 @@ float dof_load_gather_coc(sampler2D gather_input_coc_buffer, vec2 uv, float lod)
|
|||
}
|
||||
|
||||
/* Distribute weights between near/slightfocus/far fields (slide 117). */
|
||||
const float layer_threshold = 4.0;
|
||||
#define layer_threshold 4.0
|
||||
/* Make sure it overlaps. */
|
||||
const float layer_offset_fg = 0.5 + 1.0;
|
||||
#define layer_offset_fg (0.5 + 1.0)
|
||||
/* Extra offset for convolution layers to avoid light leaking from background. */
|
||||
const float layer_offset = 0.5 + 0.5;
|
||||
#define layer_offset (0.5 + 0.5)
|
||||
|
||||
#define DOF_MAX_SLIGHT_FOCUS_RADIUS 16
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
/* NOTE: Do not compare alpha as it is not scattered by the scatter pass. */
|
||||
float dof_scatter_neighborhood_rejection(vec3 color)
|
||||
{
|
||||
DEFINE_DOF_QUAD_OFFSETS;
|
||||
color = min(vec3(scatterColorNeighborMax), color);
|
||||
|
||||
float validity = 0.0;
|
||||
|
@ -132,6 +133,7 @@ void main()
|
|||
/* Downsample pass done for each mip starting from mip1. */
|
||||
void main()
|
||||
{
|
||||
DEFINE_DOF_QUAD_OFFSETS
|
||||
vec2 input_texel_size = 1.0 / vec2(textureSize(colorBuffer, 0).xy);
|
||||
/* Center uv around the 4 pixels of the previous mip. */
|
||||
vec2 quad_center = (floor(gl_FragCoord.xy) * 2.0 + 1.0) * input_texel_size;
|
||||
|
|
|
@ -29,6 +29,7 @@ float bokeh_shape(vec2 center)
|
|||
|
||||
void main(void)
|
||||
{
|
||||
DEFINE_DOF_QUAD_OFFSETS
|
||||
vec4 shapes;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
shapes[i] = bokeh_shape(spritepos + quad_offsets[i]);
|
||||
|
|
|
@ -37,6 +37,7 @@ void vertex_discard()
|
|||
|
||||
void main()
|
||||
{
|
||||
DEFINE_DOF_QUAD_OFFSETS
|
||||
ivec2 tex_size = textureSize(cocBuffer, 0);
|
||||
|
||||
int t_id = gl_VertexID / 3; /* Triangle Id */
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
void main()
|
||||
{
|
||||
DEFINE_DOF_QUAD_OFFSETS
|
||||
vec2 fullres_texel_size = 1.0 / vec2(textureSize(colorBuffer, 0).xy);
|
||||
/* Center uv around the 4 fullres pixels. */
|
||||
vec2 quad_center = (floor(gl_FragCoord.xy) * 2.0 + 1.0) * fullres_texel_size;
|
||||
|
|
|
@ -47,53 +47,4 @@ HitData decode_hit_data(vec4 hit_data, float hit_depth)
|
|||
|
||||
/* Blue noise categorized into 4 sets of samples.
|
||||
* See "Stochastic all the things" presentation slide 32-37. */
|
||||
const int resolve_samples_count = 9;
|
||||
const vec2 resolve_sample_offsets[36] = vec2[36](
|
||||
/* Set 1. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(0, 0),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(-1, 3),
|
||||
vec2(1, 3),
|
||||
vec2(-1, 1),
|
||||
vec2(3, 1),
|
||||
vec2(-2, 0),
|
||||
vec2(3, 0),
|
||||
vec2(2, -1),
|
||||
vec2(1, -2),
|
||||
/* Set 2. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(1, 1),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(-2, 3),
|
||||
vec2(3, 3),
|
||||
vec2(0, 2),
|
||||
vec2(2, 2),
|
||||
vec2(-2, -1),
|
||||
vec2(1, -1),
|
||||
vec2(0, -2),
|
||||
vec2(3, -2),
|
||||
/* Set 3. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(0, 1),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(0, 3),
|
||||
vec2(3, 2),
|
||||
vec2(-2, 1),
|
||||
vec2(2, 1),
|
||||
vec2(-1, 0),
|
||||
vec2(-2, -2),
|
||||
vec2(0, -1),
|
||||
vec2(2, -2),
|
||||
/* Set 4. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(1, 0),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(2, 3),
|
||||
vec2(-2, 2),
|
||||
vec2(-1, 2),
|
||||
vec2(1, 2),
|
||||
vec2(2, 0),
|
||||
vec2(-1, -1),
|
||||
vec2(3, -1),
|
||||
vec2(-1, -2));
|
||||
#define resolve_samples_count 9
|
|
@ -102,6 +102,58 @@ void raytrace_resolve(ClosureInputGlossy cl_in,
|
|||
inout ClosureEvalCommon cl_common,
|
||||
inout ClosureOutputGlossy cl_out)
|
||||
{
|
||||
/* Note: Reflection samples declared in function scope to avoid per-thread memory pressure on
|
||||
* tile-based GPUs e.g. Apple Silicon. */
|
||||
const vec2 resolve_sample_offsets[36] = vec2[36](
|
||||
/* Set 1. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(0, 0),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(-1, 3),
|
||||
vec2(1, 3),
|
||||
vec2(-1, 1),
|
||||
vec2(3, 1),
|
||||
vec2(-2, 0),
|
||||
vec2(3, 0),
|
||||
vec2(2, -1),
|
||||
vec2(1, -2),
|
||||
/* Set 2. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(1, 1),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(-2, 3),
|
||||
vec2(3, 3),
|
||||
vec2(0, 2),
|
||||
vec2(2, 2),
|
||||
vec2(-2, -1),
|
||||
vec2(1, -1),
|
||||
vec2(0, -2),
|
||||
vec2(3, -2),
|
||||
/* Set 3. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(0, 1),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(0, 3),
|
||||
vec2(3, 2),
|
||||
vec2(-2, 1),
|
||||
vec2(2, 1),
|
||||
vec2(-1, 0),
|
||||
vec2(-2, -2),
|
||||
vec2(0, -1),
|
||||
vec2(2, -2),
|
||||
/* Set 4. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(1, 0),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(2, 3),
|
||||
vec2(-2, 2),
|
||||
vec2(-1, 2),
|
||||
vec2(1, 2),
|
||||
vec2(2, 0),
|
||||
vec2(-1, -1),
|
||||
vec2(3, -1),
|
||||
vec2(-1, -2));
|
||||
|
||||
float roughness = cl_in.roughness;
|
||||
|
||||
vec4 ssr_accum = vec4(0.0);
|
||||
|
|
|
@ -182,13 +182,15 @@ void eval_volume_step(inout vec3 Lscat, float extinction, float step_len, out fl
|
|||
}
|
||||
|
||||
#define P(x) ((x + 0.5) * (1.0 / 16.0))
|
||||
const vec4 dither_mat[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
|
||||
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
|
||||
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
|
||||
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
|
||||
|
||||
vec4 volume_integration(vec3 ray_ori, vec3 ray_dir, float ray_inc, float ray_max, float step_len)
|
||||
{
|
||||
/* Note: Constant array declared inside function scope to reduce shader core thread memory
|
||||
* pressure on Apple Silicon. */
|
||||
const vec4 dither_mat[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
|
||||
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
|
||||
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
|
||||
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
|
||||
/* Start with full transmittance and no scattered light. */
|
||||
vec3 final_scattering = vec3(0.0);
|
||||
float final_transmittance = 1.0;
|
||||
|
|
Loading…
Reference in New Issue