Metal: Optimize shader local memory usage.

Because the Metal backend emulates shader global scope through a class interface, global constant arrays in shaders are allocated in per-thread shader local memory. To reduce memory pressure, these constant arrays are moved into function scope, which ensures they reside only in device constant memory. This results in a tangible 1.5-2x performance uplift for the specific shaders affected.

Authored by Apple: Michael Parkin-White

Ref T96261

Reviewed By: fclem

Maniphest Tasks: T96261

Differential Revision: https://developer.blender.org/D17089
This commit is contained in:
Jason Fielder 2023-01-30 13:44:46 +01:00 committed by Clément Foucault
parent dea924a91f
commit 596ee79a9f
Notes: blender-bot 2023-02-14 06:42:54 +01:00
Referenced by issue #96261, Metal Viewport
7 changed files with 101 additions and 87 deletions

View File

@ -6,16 +6,17 @@
#define M_4PI 12.5663706143591729
const mat3 CUBE_ROTATIONS[6] = mat3[](
mat3(vec3(0.0, 0.0, -1.0), vec3(0.0, -1.0, 0.0), vec3(-1.0, 0.0, 0.0)),
mat3(vec3(0.0, 0.0, 1.0), vec3(0.0, -1.0, 0.0), vec3(1.0, 0.0, 0.0)),
mat3(vec3(1.0, 0.0, 0.0), vec3(0.0, 0.0, 1.0), vec3(0.0, -1.0, 0.0)),
mat3(vec3(1.0, 0.0, 0.0), vec3(0.0, 0.0, -1.0), vec3(0.0, 1.0, 0.0)),
mat3(vec3(1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 0.0, -1.0)),
mat3(vec3(-1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 0.0, 1.0)));
vec3 get_cubemap_vector(vec2 co, int face)
{
  /* NOTE(Metal): The per-face rotation table is declared in function scope (rather than at global
   * scope) to avoid increasing local shader memory pressure. */
  const mat3 CUBE_ROTATIONS[6] = mat3[](
      mat3(vec3(0.0, 0.0, -1.0), vec3(0.0, -1.0, 0.0), vec3(-1.0, 0.0, 0.0)),
      mat3(vec3(0.0, 0.0, 1.0), vec3(0.0, -1.0, 0.0), vec3(1.0, 0.0, 0.0)),
      mat3(vec3(1.0, 0.0, 0.0), vec3(0.0, 0.0, 1.0), vec3(0.0, -1.0, 0.0)),
      mat3(vec3(1.0, 0.0, 0.0), vec3(0.0, 0.0, -1.0), vec3(0.0, 1.0, 0.0)),
      mat3(vec3(1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 0.0, -1.0)),
      mat3(vec3(-1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 0.0, 1.0)));

  /* Scale and bias `co` by `* 2.0 - 1.0` and use 1.0 as the third component.
   * Presumably `co` is a [0..1] face coordinate -- confirm against callers. */
  vec3 unrotated_dir = vec3(co * 2.0 - 1.0, 1.0);

  /* Rotate by the matrix selected with `face` and return a unit-length vector. */
  return normalize(CUBE_ROTATIONS[face] * unrotated_dir);
}

View File

@ -4,13 +4,16 @@
/* 4x4 bayer matrix prepared for 8bit UNORM precision error. */
#define P(x) (((x + 0.5) * (1.0 / 16.0) - 0.5) * (1.0 / 255.0))
const vec4 dither_mat4x4[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
float dither(void)
{
  /* NOTE(Metal): The 4x4 bayer matrix is declared in function scope (rather than at global scope)
   * to avoid increasing local shader memory pressure. */
  const vec4 dither_mat4x4[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
                                        vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
                                        vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
                                        vec4(P(15.0), P(7.0), P(13.0), P(5.0)));

  /* Wrap the integer fragment coordinate into the 4x4 tile. */
  ivec2 tile_co = ivec2(gl_FragCoord.xy) % 4;

  /* First index picks the vec4, second index picks the component inside it. */
  vec4 selected = dither_mat4x4[tile_co.x];
  return selected[tile_co.y];
}

View File

@ -1,10 +1,10 @@
#pragma BLENDER_REQUIRE(common_view_lib.glsl)
/* TODO: Theme? */
const vec4 pinned_col = vec4(1.0, 0.0, 0.0, 1.0);
void main()
{
/* TODO: Theme? */
const vec4 pinned_col = vec4(1.0, 0.0, 0.0, 1.0);
bool is_selected = (flag & (VERT_UV_SELECT | FACE_UV_SELECT)) != 0u;
bool is_pinned = (flag & VERT_UV_PINNED) != 0u;
vec4 deselect_col = (is_pinned) ? pinned_col : vec4(color.rgb, 1.0);

View File

@ -1,15 +1,6 @@
#pragma BLENDER_REQUIRE(common_view_lib.glsl)
/* Corners for cell outlines. 0.45 is arbitrary. Any value below 0.5 can be used to avoid
* overlapping of the outlines. */
const vec3 corners[4] = vec3[4](vec3(-0.45, 0.45, 0.0),
vec3(0.45, 0.45, 0.0),
vec3(0.45, -0.45, 0.0),
vec3(-0.45, -0.45, 0.0));
const int indices[8] = int[8](0, 1, 1, 2, 2, 3, 3, 0);
vec4 flag_to_color(uint flag)
{
/* Color mapping for flags */
@ -88,6 +79,16 @@ void main()
}
}
#endif
/* NOTE(Metal): Declaring constant arrays in function scope to avoid increasing local shader
* memory pressure. */
const int indices[8] = int[8](0, 1, 1, 2, 2, 3, 3, 0);
/* Corners for cell outlines. 0.45 is arbitrary. Any value below 0.5 can be used to avoid
* overlapping of the outlines. */
const vec3 corners[4] = vec3[4](vec3(-0.45, 0.45, 0.0),
vec3(0.45, 0.45, 0.0),
vec3(0.45, -0.45, 0.0),
vec3(-0.45, -0.45, 0.0));
vec3 pos = domainOriginOffset + cellSize * (vec3(cell_co + adaptiveCellOffset) + cell_offset);
vec3 rotated_pos = rot_mat * corners[indices[gl_VertexID % 8]];

View File

@ -1,13 +1,6 @@
#pragma BLENDER_REQUIRE(common_view_lib.glsl)
const vec3 corners[4] = vec3[4](vec3(0.0, 0.2, -0.5),
vec3(-0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.0, 0.0, 0.5));
const int indices[12] = int[12](0, 1, 1, 2, 2, 0, 0, 3, 1, 3, 2, 3);
/* Straight Port from BKE_defvert_weight_to_rgb()
* TODO: port this to a color ramp. */
vec3 weight_to_color(float weight)
@ -177,6 +170,15 @@ void main()
mat3 rot_mat = rotation_from_vector(vector);
# ifdef USE_NEEDLE
/* NOTE(Metal): Declaring constant arrays in function scope to avoid increasing local shader
* memory pressure. */
const vec3 corners[4] = vec3[4](vec3(0.0, 0.2, -0.5),
vec3(-0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.0, 0.0, 0.5));
const int indices[12] = int[12](0, 1, 1, 2, 2, 0, 0, 3, 1, 3, 2, 3);
vec3 rotated_pos = rot_mat * corners[indices[gl_VertexID % 12]];
pos += rotated_pos * vector_length * displaySize * cellSize;
# else

View File

@ -12,42 +12,6 @@
/* 4bits for corner id */
#define CORNER_VEC_OFS 2u
#define CORNER_VEC_RANGE BIT_RANGE(4)
const vec2 cornervec[36] = vec2[36](vec2(0.0, 1.0),
vec2(0.02, 0.805),
vec2(0.067, 0.617),
vec2(0.169, 0.45),
vec2(0.293, 0.293),
vec2(0.45, 0.169),
vec2(0.617, 0.076),
vec2(0.805, 0.02),
vec2(1.0, 0.0),
vec2(-1.0, 0.0),
vec2(-0.805, 0.02),
vec2(-0.617, 0.067),
vec2(-0.45, 0.169),
vec2(-0.293, 0.293),
vec2(-0.169, 0.45),
vec2(-0.076, 0.617),
vec2(-0.02, 0.805),
vec2(0.0, 1.0),
vec2(0.0, -1.0),
vec2(-0.02, -0.805),
vec2(-0.067, -0.617),
vec2(-0.169, -0.45),
vec2(-0.293, -0.293),
vec2(-0.45, -0.169),
vec2(-0.617, -0.076),
vec2(-0.805, -0.02),
vec2(-1.0, 0.0),
vec2(1.0, 0.0),
vec2(0.805, -0.02),
vec2(0.617, -0.067),
vec2(0.45, -0.169),
vec2(0.293, -0.293),
vec2(0.169, -0.45),
vec2(0.076, -0.617),
vec2(0.02, -0.805),
vec2(0.0, -1.0));
#define INNER_FLAG uint(1 << 10) /* is inner vert */
@ -60,6 +24,45 @@ const vec2 cornervec[36] = vec2[36](vec2(0.0, 1.0),
void main()
{
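/* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
* memory pressure.
* NOTE(review): the 0.067/0.076 components are not mirrored consistently across quadrants
* (e.g. vec2(0.067, 0.617) vs. vec2(-0.076, 0.617) below) -- confirm which value is intended. */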
const vec2 cornervec[36] = vec2[36](vec2(0.0, 1.0),
vec2(0.02, 0.805),
vec2(0.067, 0.617),
vec2(0.169, 0.45),
vec2(0.293, 0.293),
vec2(0.45, 0.169),
vec2(0.617, 0.076),
vec2(0.805, 0.02),
vec2(1.0, 0.0),
vec2(-1.0, 0.0),
vec2(-0.805, 0.02),
vec2(-0.617, 0.067),
vec2(-0.45, 0.169),
vec2(-0.293, 0.293),
vec2(-0.169, 0.45),
vec2(-0.076, 0.617),
vec2(-0.02, 0.805),
vec2(0.0, 1.0),
vec2(0.0, -1.0),
vec2(-0.02, -0.805),
vec2(-0.067, -0.617),
vec2(-0.169, -0.45),
vec2(-0.293, -0.293),
vec2(-0.45, -0.169),
vec2(-0.617, -0.076),
vec2(-0.805, -0.02),
vec2(-1.0, 0.0),
vec2(1.0, 0.0),
vec2(0.805, -0.02),
vec2(0.617, -0.067),
vec2(0.45, -0.169),
vec2(0.293, -0.293),
vec2(0.169, -0.45),
vec2(0.076, -0.617),
vec2(0.02, -0.805),
vec2(0.0, -1.0));
uint cflag = vflag & CNR_FLAG_RANGE;
uint vofs = (vflag >> CORNER_VEC_OFS) & CORNER_VEC_RANGE;

View File

@ -1,25 +1,5 @@
#pragma BLENDER_REQUIRE(gpu_shader_colorspace_lib.glsl)
const vec2 offsets4[4] = vec2[4](
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(-0.5, -0.5), vec2(-0.5, -0.5));
const vec2 offsets16[16] = vec2[16](vec2(-1.5, 1.5),
vec2(-0.5, 1.5),
vec2(0.5, 1.5),
vec2(1.5, 1.5),
vec2(-1.5, 0.5),
vec2(-0.5, 0.5),
vec2(0.5, 0.5),
vec2(1.5, 0.5),
vec2(-1.5, -0.5),
vec2(-0.5, -0.5),
vec2(0.5, -0.5),
vec2(1.5, -0.5),
vec2(-1.5, -1.5),
vec2(-0.5, -1.5),
vec2(0.5, -1.5),
vec2(1.5, -1.5));
//#define GPU_NEAREST
#define sample_glyph_offset(texel, ofs) \
texture_1D_custom_bilinear_filter(texCoord_interp + ofs * texel)
@ -92,6 +72,11 @@ void main()
fragColor.a = 0.0;
if (interp_size == 1) {
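/* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
* memory pressure.
* NOTE(review): the last two offsets below are identical; the fourth was presumably meant to be
* vec2(0.5, -0.5) -- confirm. */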
const vec2 offsets4[4] = vec2[4](
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(-0.5, -0.5), vec2(-0.5, -0.5));
/* 3x3 blur */
/* Manual unroll for perf. (stupid glsl compiler) */
fragColor.a += sample_glyph_offset(texel, offsets4[0]);
@ -101,6 +86,25 @@ void main()
fragColor.a *= (1.0 / 4.0);
}
else {
/* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
* memory pressure. */
const vec2 offsets16[16] = vec2[16](vec2(-1.5, 1.5),
vec2(-0.5, 1.5),
vec2(0.5, 1.5),
vec2(1.5, 1.5),
vec2(-1.5, 0.5),
vec2(-0.5, 0.5),
vec2(0.5, 0.5),
vec2(1.5, 0.5),
vec2(-1.5, -0.5),
vec2(-0.5, -0.5),
vec2(0.5, -0.5),
vec2(1.5, -0.5),
vec2(-1.5, -1.5),
vec2(-0.5, -1.5),
vec2(0.5, -1.5),
vec2(1.5, -1.5));
/* 5x5 blur */
/* Manual unroll for perf. (stupid glsl compiler) */
fragColor.a += sample_glyph_offset(texel, offsets16[0]);