Metal: Realtime compositor enablement with addition of GPU Compute.

This patch adds support for compilation and execution of GLSL compute shaders. This, along with a few systematic changes and fixes, enables realtime compositor functionality with the Metal backend on macOS. A number of GLSL source modifications have been made to add the required level of type explicitness, allowing all compilations to succeed.
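
To illustrate the kind of change involved, the sketch below shows the implicit-conversion pattern next to its explicitly constructed replacement. This is a standalone example rather than code from the patch: the #version and layout boilerplate, the input_tx sampler and the output_img image are placeholder assumptions, since Blender normally injects that interface through its shader create-info system.

/* Illustrative sketch only -- not part of this patch. */
#version 430
layout(local_size_x = 16, local_size_y = 16) in;

layout(binding = 0) uniform sampler2D input_tx;
layout(binding = 0, rgba16f) writeonly uniform image2D output_img;

void main()
{
  ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
  ivec2 input_size = textureSize(input_tx, 0);

  /* Relies on implicit int-to-float and scalar-to-vector conversions, which no longer
   * compile once translated for Metal:
   *   vec2 uv = (texel + 0.5) / input_size; */

  /* Explicitly constructed equivalent that compiles on both backends. */
  vec2 uv = (vec2(texel) + vec2(0.5)) / vec2(input_size);

  imageStore(output_img, texel, texture(input_tx, uv));
}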

GLSL Compute shader compilation follows a similar path to Vertex/Fragment translation, with added support for shader atomics, shared memory blocks and barriers.
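
As a rough picture of that feature set, here is a minimal standalone GLSL compute shader (not part of the patch; the workgroup size, the input_tx sampler and the histogram_img image are arbitrary assumptions) that exercises a shared memory block, barriers and shader atomics of the kind the Metal translation now handles.

/* Illustrative sketch only -- not part of this patch. */
#version 430
layout(local_size_x = 64) in;

layout(binding = 0) uniform usampler2D input_tx;
layout(binding = 0, r32ui) writeonly uniform uimage2D histogram_img;

/* Workgroup-shared histogram bins. */
shared uint bins[64];

void main()
{
  uint local_id = gl_LocalInvocationID.x;

  /* Clear the shared block and synchronize before accumulation. */
  bins[local_id] = 0u;
  barrier();

  /* Each invocation atomically accumulates one sample into a shared bin. */
  uint value = texelFetch(input_tx, ivec2(int(gl_GlobalInvocationID.x), 0), 0).r;
  atomicAdd(bins[value % 64u], 1u);
  barrier();

  /* Write one row of bins per workgroup back to the output image. */
  imageStore(histogram_img, ivec2(int(local_id), int(gl_WorkGroupID.x)), uvec4(bins[local_id]));
}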

Texture flags have also been updated to ensure correct read/write specification for textures used within the compositor pipeline. GPU command submission in the high-level path has also been changed when Metal is used, to address command buffer time-outs caused by certain expensive compute shaders.

Authored by Apple: Michael Parkin-White

Ref T96261
Ref T99210

Reviewed By: fclem

Maniphest Tasks: T99210, T96261

Differential Revision: https://developer.blender.org/D16990
Jason Fielder 2023-01-30 11:00:26 +01:00 committed by Clément Foucault
parent d0f55aa671
commit 57552f52b2
Notes: blender-bot 2023-03-17 17:44:27 +01:00
Referenced by issue #104236, MacOS Metal & real time compositor: Options are greyed out on Mac even though they are supported
Referenced by issue #99210, Viewport Compositor
Referenced by issue #96261, Metal Viewport
49 changed files with 1926 additions and 303 deletions

View File

@ -10,6 +10,7 @@
#include "DNA_customdata_types.h"
#include "GPU_context.h"
#include "GPU_material.h"
#include "GPU_shader.h"
#include "GPU_texture.h"
@ -329,8 +330,11 @@ void ShaderOperation::generate_code(void *thunk,
shader_create_info.compute_source("gpu_shader_compositor_main.glsl");
/* The main function is emitted in the shader before the evaluate function, so the evaluate
* function needs to be forward declared here. */
shader_create_info.typedef_source_generated += "void evaluate();\n";
* function needs to be forward declared here.
* NOTE(Metal): Metal does not require forward declarations. */
if (GPU_backend_get_type() != GPU_BACKEND_METAL) {
shader_create_info.typedef_source_generated += "void evaluate();\n";
}
operation->generate_code_for_outputs(shader_create_info);
@ -383,10 +387,13 @@ void ShaderOperation::generate_code_for_outputs(ShaderCreateInfo &shader_create_
/* The store functions are used by the node_compositor_store_output_[float|vector|color]
* functions but are only defined later as part of the compute source, so they need to be forward
* declared. */
shader_create_info.typedef_source_generated += store_float_function_header + ";\n";
shader_create_info.typedef_source_generated += store_vector_function_header + ";\n";
shader_create_info.typedef_source_generated += store_color_function_header + ";\n";
* declared.
* NOTE(Metal): Metal does not require forward declarations. */
if (GPU_backend_get_type() != GPU_BACKEND_METAL) {
shader_create_info.typedef_source_generated += store_float_function_header + ";\n";
shader_create_info.typedef_source_generated += store_vector_function_header + ";\n";
shader_create_info.typedef_source_generated += store_color_function_header + ";\n";
}
/* Each of the store functions is essentially a single switch case on the given ID, so start by
* opening the function with a curly bracket followed by opening a switch statement in each of

View File

@ -35,7 +35,7 @@ vec4 load_weight(ivec2 texel)
* the texel into the normalized range [0, 1] needed to sample the weights sampler. Finally,
 * invert the texture coordinates by subtracting from 1 to maintain the shape of the weights as
* mentioned in the function description. */
return texture(weights_tx, 1.0 - ((texel + vec2(radius + 0.5)) / (radius * 2 + 1)));
return texture(weights_tx, 1.0 - ((vec2(texel) + vec2(radius + 0.5)) / (radius * 2.0 + 1.0)));
}
void main()

View File

@ -17,7 +17,7 @@ vec4 load_weight(ivec2 texel, float radius)
/* The center zero texel is always assigned a unit weight regardless of the corresponding weight
 * in the weights texture. That's to guarantee that at least the center pixel will be accumulated
* even if the weights texture is zero at its center. */
if (texel == ivec2(0)) {
if (texel.x == 0 && texel.y == 0) {
return vec4(1.0);
}
@ -26,7 +26,7 @@ vec4 load_weight(ivec2 texel, float radius)
* the texel into the normalized range [0, 1] needed to sample the weights sampler. Finally,
 * invert the texture coordinates by subtracting from 1 to maintain the shape of the weights as
* mentioned in the function description. */
return texture(weights_tx, 1.0 - ((texel + vec2(radius + 0.5)) / (radius * 2 + 1)));
return texture(weights_tx, 1.0 - ((vec2(texel) + vec2(radius + 0.5)) / (radius * 2.0 + 1.0)));
}
void main()

View File

@ -12,7 +12,7 @@ void main()
* input size, then transform the coordinates for the next iteration. */
vec4 accumulated_color = vec4(0.0);
for (int i = 0; i < iterations; i++) {
accumulated_color += texture(input_tx, coordinates / input_size);
accumulated_color += texture(input_tx, coordinates / vec2(input_size));
coordinates = (mat3(inverse_transformation) * vec3(coordinates, 1.0)).xy;
}

View File

@ -12,8 +12,8 @@ void main()
for (int j = 0; j < 3; j++) {
for (int i = 0; i < 3; i++) {
vec3 color = texture_load(input_tx, texel + ivec2(i - 1, j - 1)).rgb;
color_x += color * kernel[j][i];
color_y += color * kernel[i][j];
color_x += color * ukernel[j][i];
color_y += color * ukernel[i][j];
}
}

View File

@ -8,7 +8,7 @@ void main()
vec4 color = vec4(0);
for (int j = 0; j < 3; j++) {
for (int i = 0; i < 3; i++) {
color += texture_load(input_tx, texel + ivec2(i - 1, j - 1)) * kernel[j][i];
color += texture_load(input_tx, texel + ivec2(i - 1, j - 1)) * ukernel[j][i];
}
}

View File

@ -7,7 +7,7 @@ void main()
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the image size
* to get the coordinates into the sampler's expected [0, 1] range. */
vec2 coordinates = (vec2(texel) + vec2(0.5)) / input_size;
vec2 coordinates = (vec2(texel) + vec2(0.5)) / vec2(input_size);
/* We accumulate four variants of the input ghost texture, each is scaled by some amount and
* possibly multiplied by some color as a form of color modulation. */

View File

@ -7,7 +7,7 @@ void main()
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the image size
* to get the coordinates into the sampler's expected [0, 1] range. */
vec2 coordinates = (vec2(texel) + vec2(0.5)) / input_size;
vec2 coordinates = (vec2(texel) + vec2(0.5)) / vec2(input_size);
/* The small ghost is scaled down with the origin as the center of the image by a factor of 2.13,
* while the big ghost is flipped and scaled up with the origin as the center of the image by a

View File

@ -12,11 +12,11 @@ void main()
* the number of input pixels that covers a single output pixel. In case the input and output
* have the same size, this will be 0.5, which is the offset required to evaluate the sampler at
* the center of the pixel. */
vec2 offset = (texture_size(input_tx) / imageSize(output_img)) / 2.0;
vec2 offset = vec2(texture_size(input_tx) / imageSize(output_img)) / 2.0;
/* Add the aforementioned offset and divide by the output image size to get the coordinates into
* the sampler's expected [0, 1] range. */
vec2 normalized_coordinates = (vec2(texel) + offset) / imageSize(output_img);
vec2 normalized_coordinates = (vec2(texel) + offset) / vec2(imageSize(output_img));
vec4 input_color = texture(input_tx, normalized_coordinates);
float luminance = dot(input_color.rgb, luminance_coefficients);

View File

@ -7,7 +7,7 @@ void main()
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the input image
* size to get the relevant coordinates into the sampler's expected [0, 1] range. Make sure the
* input color is not negative to avoid a subtractive effect when mixing the glare. */
vec2 normalized_coordinates = (vec2(texel) + vec2(0.5)) / texture_size(input_tx);
vec2 normalized_coordinates = (vec2(texel) + vec2(0.5)) / vec2(texture_size(input_tx));
vec4 glare_color = texture(glare_tx, normalized_coordinates);
vec4 input_color = max(vec4(0.0), texture_load(input_tx, texel));

View File

@ -8,8 +8,8 @@ void main()
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the image size
* to get the coordinates into the sampler's expected [0, 1] range. Similarly, transform the
* vector into the sampler's space by dividing by the input size. */
vec2 coordinates = (vec2(texel) + vec2(0.5)) / input_size;
vec2 vector = streak_vector / input_size;
vec2 coordinates = (vec2(texel) + vec2(0.5)) / vec2(input_size);
vec2 vector = streak_vector / vec2(input_size);
/* Load three equally spaced neighbours to the current pixel in the direction of the streak
* vector. */

View File

@ -78,7 +78,8 @@ void main()
/* Compute the actual distance from the squared distance and assign it an appropriate sign
* depending on whether it lies in a masked region or not. */
float signed_minimum_distance = sqrt(minimum_squared_distance) * (is_center_masked ? 1.0 : -1.0);
float signed_minimum_distance = sqrt(float(minimum_squared_distance)) *
(is_center_masked ? 1.0 : -1.0);
/* Add the erode/dilate distance and divide by the inset amount as described in the discussion,
* then clamp to the [0, 1] range. */

View File

@ -44,7 +44,7 @@
* for reduction, so we just load the data in a 1D array to simplify reduction. The developer is
* expected to define the TYPE macro to be a float or a vec4, depending on the type of data being
* reduced. */
const uint reduction_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y;
#define reduction_size (gl_WorkGroupSize.x * gl_WorkGroupSize.y)
shared TYPE reduction_data[reduction_size];
void main()
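
For context, a workgroup-shared array like this is normally consumed by a tree reduction roughly like the standalone sketch below. This is not the body of the Blender shader above: the bindings, the vec4 TYPE and writing one result per workgroup are illustrative assumptions; only the TYPE macro and the reduction_size define mirror the names used there.

/* Illustrative sketch only -- not the actual Blender reduction shader. */
#version 430
layout(local_size_x = 16, local_size_y = 16) in;

#define TYPE vec4
#define reduction_size (gl_WorkGroupSize.x * gl_WorkGroupSize.y)

layout(binding = 0) uniform sampler2D input_tx;
layout(binding = 0, rgba16f) writeonly uniform image2D output_img;

shared TYPE reduction_data[reduction_size];

void main()
{
  uint index = gl_LocalInvocationIndex;
  reduction_data[index] = texelFetch(input_tx, ivec2(gl_GlobalInvocationID.xy), 0);

  /* Halve the number of active invocations each step until one value remains. */
  for (uint stride = reduction_size / 2u; stride > 0u; stride /= 2u) {
    barrier();
    if (index < stride) {
      reduction_data[index] += reduction_data[index + stride];
    }
  }

  /* The first invocation holds the reduced result for this workgroup. */
  if (index == 0u) {
    imageStore(output_img, ivec2(gl_WorkGroupID.xy), reduction_data[0]);
  }
}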

View File

@ -19,11 +19,11 @@ void main()
* case the difference in sizes was odd. */
ivec2 domain_size = imageSize(domain_img);
ivec2 input_size = texture_size(input_tx);
vec2 offset = floor((domain_size - input_size) / 2.0);
vec2 offset = floor(vec2(domain_size - input_size) / 2.0);
/* Subtract the offset and divide by the input image size to get the relevant coordinates into
* the sampler's expected [0, 1] range. */
vec2 normalized_coordinates = (coordinates - offset) / input_size;
vec2 normalized_coordinates = (coordinates - offset) / vec2(input_size);
imageStore(domain_img, texel, texture(input_tx, normalized_coordinates));
}

View File

@ -22,7 +22,7 @@ vec3 compute_chromatic_distortion_scale(float distance_squared)
* coordinates but outputs non-centered image coordinates. */
vec2 compute_distorted_uv(vec2 uv, float uv_scale)
{
return (uv * uv_scale + 0.5) * texture_size(input_tx) - 0.5;
return (uv * uv_scale + 0.5) * vec2(texture_size(input_tx)) - 0.5;
}
/* Compute the number of integration steps that should be used to approximate the distorted pixel
@ -102,7 +102,7 @@ vec3 integrate_distortion(int start, int end, float distance_squared, vec2 uv, i
/* Sample the color at the distorted coordinates and accumulate it weighted by the increment
* value for both the start and end channels. */
vec2 distorted_uv = compute_distorted_uv(uv, distortion_scale);
vec4 color = texture(input_tx, distorted_uv / texture_size(input_tx));
vec4 color = texture(input_tx, distorted_uv / vec2(texture_size(input_tx)));
accumulated_color[start] += (1.0 - increment) * color[start];
accumulated_color[end] += increment * color[end];
}
@ -115,8 +115,8 @@ void main()
/* Compute the UV image coordinates in the range [-1, 1] as well as the squared distance to the
* center of the image, which is at (0, 0) in the UV coordinates. */
vec2 center = texture_size(input_tx) / 2.0;
vec2 uv = scale * (texel + 0.5 - center) / center;
vec2 center = vec2(texture_size(input_tx)) / 2.0;
vec2 uv = scale * (vec2(texel) + vec2(0.5) - center) / center;
float distance_squared = dot(uv, uv);
/* If any of the color channels will get distorted outside of the screen beyond what is possible,

View File

@ -113,7 +113,7 @@ void main()
/* Upper right quadrant. */
float upper_right_size = load_size(texel + ivec2(x, y));
vec2 upper_right_blur_radius = upper_right_size * weights_size;
vec2 upper_right_blur_radius = upper_right_size * vec2(weights_size);
if (x < upper_right_blur_radius.x && y < upper_right_blur_radius.y) {
accumulated_color += load_input(texel + ivec2(x, y)) * weight;
accumulated_weight += weight;
@ -121,7 +121,7 @@ void main()
/* Upper left quadrant. */
float upper_left_size = load_size(texel + ivec2(-x, y));
vec2 upper_left_blur_radius = upper_left_size * weights_size;
vec2 upper_left_blur_radius = upper_left_size * vec2(weights_size);
if (x < upper_left_blur_radius.x && y < upper_left_blur_radius.y) {
accumulated_color += load_input(texel + ivec2(-x, y)) * weight;
accumulated_weight += weight;
@ -129,7 +129,7 @@ void main()
/* Bottom right quadrant. */
float bottom_right_size = load_size(texel + ivec2(x, -y));
vec2 bottom_right_blur_radius = bottom_right_size * weights_size;
vec2 bottom_right_blur_radius = bottom_right_size * vec2(weights_size);
if (x < bottom_right_blur_radius.x && y < bottom_right_blur_radius.y) {
accumulated_color += load_input(texel + ivec2(x, -y)) * weight;
accumulated_weight += weight;
@ -137,7 +137,7 @@ void main()
/* Bottom left quadrant. */
float bottom_left_size = load_size(texel + ivec2(-x, -y));
vec2 bottom_left_blur_radius = bottom_left_size * weights_size;
vec2 bottom_left_blur_radius = bottom_left_size * vec2(weights_size);
if (x < bottom_left_blur_radius.x && y < bottom_left_blur_radius.y) {
accumulated_color += load_input(texel + ivec2(-x, -y)) * weight;
accumulated_weight += weight;

View File

@ -4,7 +4,7 @@
GPU_SHADER_CREATE_INFO(compositor_edge_filter)
.local_group_size(16, 16)
.push_constant(Type::MAT4, "kernel")
.push_constant(Type::MAT4, "ukernel")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "factor_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")

View File

@ -4,7 +4,7 @@
GPU_SHADER_CREATE_INFO(compositor_filter)
.local_group_size(16, 16)
.push_constant(Type::MAT4, "kernel")
.push_constant(Type::MAT4, "ukernel")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "factor_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")

View File

@ -27,6 +27,7 @@
#include "COM_evaluator.hh"
#include "COM_texture_pool.hh"
#include "GPU_context.h"
#include "GPU_texture.h"
#include "compositor_engine.h" /* Own include. */
@ -226,12 +227,36 @@ static void compositor_engine_draw(void *data)
COMPOSITOR_Data *compositor_data = static_cast<COMPOSITOR_Data *>(data);
#if defined(__APPLE__)
blender::StringRef("Viewport compositor not supported on MacOS")
.copy(compositor_data->info, GPU_INFO_SIZE);
return;
if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
/* NOTE(Metal): Isolate Compositor compute work in individual command buffer to improve
* workload scheduling. When expensive compositor nodes are in the graph, these can stall out
* the GPU for extended periods of time and suboptimally schedule work for execution. */
GPU_flush();
}
else {
/* Realtime Compositor is not supported on macOS with the OpenGL backend. */
blender::StringRef("Viewport compositor is only supported on MacOS with the Metal Backend.")
.copy(compositor_data->info, GPU_INFO_SIZE);
return;
}
#endif
/* Execute Compositor render commands. */
compositor_data->instance_data->draw();
#if defined(__APPLE__)
/* NOTE(Metal): Following previous flush to break command stream, with compositor command
* buffers potentially being heavy, we avoid issuing subsequent commands until compositor work
* has completed. If subsequent work is prematurely queued up, the subsequent command buffers
* will be blocked behind compositor work and may trigger a command buffer time-out error. As a
* result, we should wait for compositor work to complete.
*
* This is not an efficient approach for peak performance, but a catch-all to prevent command
* buffer failure, until the offending cases can be resolved. */
if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
GPU_finish();
}
#endif
}
static void compositor_engine_update(void *data)

View File

@ -285,7 +285,7 @@ void DepthOfField::stabilize_pass_sync()
stabilize_ps_.bind_texture("in_history_tx", &stabilize_input_, with_filter);
stabilize_ps_.bind_texture("depth_tx", &render_buffers.depth_tx, no_filter);
stabilize_ps_.bind_ubo("dof_buf", data_);
stabilize_ps_.push_constant("use_history", &stabilize_valid_history_, 1);
stabilize_ps_.push_constant("u_use_history", &stabilize_valid_history_, 1);
stabilize_ps_.bind_image("out_coc_img", reduced_coc_tx_.mip_view(0));
stabilize_ps_.bind_image("out_color_img", reduced_color_tx_.mip_view(0));
stabilize_ps_.bind_image("out_history_img", &stabilize_output_tx_);

View File

@ -120,9 +120,9 @@ struct SamplingData {
BLI_STATIC_ASSERT_ALIGN(SamplingData, 16)
/* Returns total sample count in a web pattern of the given size. */
static inline int sampling_web_sample_count_get(int web_density, int ring_count)
static inline int sampling_web_sample_count_get(int web_density, int in_ring_count)
{
return ((ring_count * ring_count + ring_count) / 2) * web_density + 1;
return ((in_ring_count * in_ring_count + in_ring_count) / 2) * web_density + 1;
}
/* Returns lowest possible ring count that contains at least sample_count samples. */

View File

@ -60,6 +60,27 @@ struct DofGatherData {
float transparency;
float layer_opacity;
#ifdef GPU_METAL
/* Explicit constructors -- To support GLSL syntax. */
inline DofGatherData() = default;
inline DofGatherData(vec4 in_color,
float in_weight,
float in_dist,
float in_coc,
float in_coc_sqr,
float in_transparency,
float in_layer_opacity)
: color(in_color),
weight(in_weight),
dist(in_dist),
coc(in_coc),
coc_sqr(in_coc_sqr),
transparency(in_transparency),
layer_opacity(in_layer_opacity)
{
}
#endif
};
#define GATHER_DATA_INIT DofGatherData(vec4(0.0), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

View File

@ -12,7 +12,7 @@ void main()
{
vec2 halfres_texel_size = 1.0 / vec2(textureSize(color_tx, 0).xy);
/* Center uv around the 4 halfres pixels. */
vec2 quad_center = vec2(gl_GlobalInvocationID * 2 + 1) * halfres_texel_size;
vec2 quad_center = vec2(gl_GlobalInvocationID.xy * 2 + 1) * halfres_texel_size;
vec4 colors[4];
vec4 cocs;

View File

@ -9,13 +9,19 @@
struct FilterSample {
vec4 color;
float weight;
#ifdef GPU_METAL
inline FilterSample() = default;
inline FilterSample(vec4 in_color, float in_weight) : color(in_color), weight(in_weight)
{
}
#endif
};
/* -------------------------------------------------------------------- */
/** \name Pixel cache.
* \{ */
const uint cache_size = gl_WorkGroupSize.x + 2;
#define cache_size (gl_WorkGroupSize.x + 2)
shared vec4 color_cache[cache_size][cache_size];
shared float weight_cache[cache_size][cache_size];

View File

@ -73,7 +73,7 @@ float fast_luma(vec3 color)
return (2.0 * color.g) + color.r + color.b;
}
const uint cache_size = gl_WorkGroupSize.x;
#define cache_size (gl_WorkGroupSize.x)
shared vec4 color_cache[cache_size][cache_size];
shared float coc_cache[cache_size][cache_size];
shared float do_scatter[cache_size][cache_size];

View File

@ -22,17 +22,24 @@
struct DofSample {
vec4 color;
float coc;
#ifdef GPU_METAL
/* Explicit constructors -- To support GLSL syntax. */
inline DofSample() = default;
inline DofSample(vec4 in_color, float in_coc) : color(in_color), coc(in_coc)
{
}
#endif
};
/* -------------------------------------------------------------------- */
/** \name LDS Cache
* \{ */
const uint cache_size = gl_WorkGroupSize.x + 2;
#define cache_size (gl_WorkGroupSize.x + 2)
shared vec4 color_cache[cache_size][cache_size];
shared float coc_cache[cache_size][cache_size];
/* Need 2 pixel border for depth. */
const uint cache_depth_size = gl_WorkGroupSize.x + 4;
#define cache_depth_size (gl_WorkGroupSize.x + 4)
shared float depth_cache[cache_depth_size][cache_depth_size];
void dof_cache_init()
@ -146,6 +153,14 @@ DofSample dof_spatial_filtering()
struct DofNeighborhoodMinMax {
DofSample min;
DofSample max;
#ifdef GPU_METAL
/* Explicit constructors -- To support GLSL syntax. */
inline DofNeighborhoodMinMax() = default;
inline DofNeighborhoodMinMax(DofSample in_min, DofSample in_max) : min(in_min), max(in_max)
{
}
#endif
};
/* Return history clipping bounding box in YCoCg color space. */
@ -216,7 +231,7 @@ vec2 dof_pixel_history_motion_vector(ivec2 texel_sample)
DofSample dof_sample_history(vec2 input_texel)
{
#if 1 /* Bilinear. */
vec2 uv = vec2(input_texel + 0.5) / textureSize(in_history_tx, 0);
vec2 uv = vec2(input_texel + 0.5) / vec2(textureSize(in_history_tx, 0));
vec4 color = textureLod(in_history_tx, uv, 0.0);
#else /* Catmull Rom interpolation. 5 Bilinear Taps. */
@ -308,7 +323,7 @@ float dof_history_blend_factor(
blend = 1.0;
}
/* Discard history if invalid. */
if (use_history == false) {
if (u_use_history == false) {
blend = 1.0;
}
return blend;

View File

@ -45,7 +45,7 @@ void cryptomatte_normalize_weight(float total_weight, inout vec2 samples[CRYPTOM
}
}
void cryptomatte_store_samples(ivec2 texel, int layer, in vec2 samples[CRYPTOMATTE_LEVELS_MAX])
void cryptomatte_store_samples(ivec2 texel, int layer, vec2 samples[CRYPTOMATTE_LEVELS_MAX])
{
int pass_len = divide_ceil(cryptomatte_samples_per_layer, 2);
int layer_id = layer * pass_len;

View File

@ -38,7 +38,7 @@ GPU_SHADER_CREATE_INFO(eevee_depth_of_field_stabilize)
.sampler(2, ImageType::FLOAT_2D, "velocity_tx")
.sampler(3, ImageType::FLOAT_2D, "in_history_tx")
.sampler(4, ImageType::DEPTH_2D, "depth_tx")
.push_constant(Type::BOOL, "use_history")
.push_constant(Type::BOOL, "u_use_history")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "out_color_img")
.image(1, GPU_R16F, Qualifier::WRITE, ImageType::FLOAT_2D, "out_coc_img")
.image(2, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "out_history_img")

View File

@ -490,7 +490,7 @@ void DRW_curves_update()
GPUFrameBuffer *temp_fb = nullptr;
GPUFrameBuffer *prev_fb = nullptr;
if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL)) {
if (!GPU_compute_shader_support()) {
if (!(GPU_compute_shader_support() && GPU_shader_storage_buffer_objects_support())) {
prev_fb = GPU_framebuffer_active_get();
char errorOut[256];
/* if the frame-buffer is invalid we need a dummy frame-buffer to be bound. */

View File

@ -386,7 +386,7 @@ void DRW_hair_update()
GPUFrameBuffer *temp_fb = nullptr;
GPUFrameBuffer *prev_fb = nullptr;
if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL)) {
if (!GPU_compute_shader_support()) {
if (!(GPU_compute_shader_support() && GPU_shader_storage_buffer_objects_support())) {
prev_fb = GPU_framebuffer_active_get();
char errorOut[256];
/* if the frame-buffer is invalid we need a dummy frame-buffer to be bound. */

View File

@ -400,7 +400,7 @@ void gpu_shader_create_info_init()
}
/* TEST */
// gpu_shader_create_info_compile_all();
gpu_shader_create_info_compile_all();
}
void gpu_shader_create_info_exit()

View File

@ -124,8 +124,10 @@ static void gpu_viewport_textures_create(GPUViewport *viewport)
if (viewport->color_render_tx[0] == NULL) {
/* NOTE: dtxl_color texture requires write support as it may be written to by the realtime
* compositor. */
viewport->color_render_tx[0] = GPU_texture_create_2d_ex(
"dtxl_color", UNPACK2(size), 1, GPU_RGBA16F, usage, NULL);
"dtxl_color", UNPACK2(size), 1, GPU_RGBA16F, usage | GPU_TEXTURE_USAGE_SHADER_WRITE, NULL);
viewport->color_overlay_tx[0] = GPU_texture_create_2d_ex(
"dtxl_color_overlay", UNPACK2(size), 1, GPU_SRGB8_A8, usage, NULL);
@ -136,8 +138,12 @@ static void gpu_viewport_textures_create(GPUViewport *viewport)
}
if ((viewport->flag & GPU_VIEWPORT_STEREO) != 0 && viewport->color_render_tx[1] == NULL) {
viewport->color_render_tx[1] = GPU_texture_create_2d_ex(
"dtxl_color_stereo", UNPACK2(size), 1, GPU_RGBA16F, usage, NULL);
viewport->color_render_tx[1] = GPU_texture_create_2d_ex("dtxl_color_stereo",
UNPACK2(size),
1,
GPU_RGBA16F,
usage | GPU_TEXTURE_USAGE_SHADER_WRITE,
NULL);
viewport->color_overlay_tx[1] = GPU_texture_create_2d_ex(
"dtxl_color_overlay_stereo", UNPACK2(size), 1, GPU_SRGB8_A8, usage, NULL);

View File

@ -51,10 +51,7 @@ class MTLBackend : public GPUBackend {
}
void samplers_update() override;
void compute_dispatch(int groups_x_len, int groups_y_len, int groups_z_len) override
{
/* Placeholder */
}
void compute_dispatch(int groups_x_len, int groups_y_len, int groups_z_len) override;
void compute_dispatch_indirect(StorageBuf *indirect_buf) override
{

View File

@ -398,7 +398,7 @@ void MTLBackend::capabilities_init(MTLContext *ctx)
MTLBackend::capabilities.supports_family_mac2);
/* TODO(Metal): Add support? */
GCaps.shader_draw_parameters_support = false;
GCaps.compute_shader_support = false; /* TODO(Metal): Add compute support. */
GCaps.compute_shader_support = true;
GCaps.geometry_shader_support = false;
GCaps.shader_storage_buffer_objects_support =
false; /* TODO(Metal): implement Storage Buffer support. */
@ -442,4 +442,22 @@ void MTLBackend::capabilities_init(MTLContext *ctx)
/** \} */
/* -------------------------------------------------------------------- */
/** \name Compute dispatch.
* \{ */
void MTLBackend::compute_dispatch(int groups_x_len, int groups_y_len, int groups_z_len)
{
/* Fetch Context.
 * With Metal, workload submission and resource management occur within the context.
* Call compute dispatch on valid context. */
MTLContext *ctx = MTLContext::get();
BLI_assert(ctx != nullptr);
if (ctx) {
ctx->compute_dispatch(groups_x_len, groups_y_len, groups_z_len);
}
}
/** \} */
} // blender::gpu

View File

@ -16,10 +16,18 @@ namespace gpu {
#define MTL_MAX_SAMPLER_SLOTS MTL_MAX_TEXTURE_SLOTS
/* Max limit without using bind-less for samplers. */
#define MTL_MAX_DEFAULT_SAMPLERS 16
#define MTL_MAX_UNIFORM_BUFFER_BINDINGS 31
/* Total maximum buffers which can be bound to an encoder, for use within a shader.
* MTL_MAX_UNIFORM_BUFFER_BINDINGS + MTL_MAX_STORAGE_BUFFER_BINDINGS must be <=
 * MTL_MAX_BUFFER_BINDINGS. */
#define MTL_MAX_BUFFER_BINDINGS 31
#define MTL_MAX_UNIFORM_BUFFER_BINDINGS 16
#define MTL_MAX_STORAGE_BUFFER_BINDINGS 12
#define MTL_MAX_VERTEX_INPUT_ATTRIBUTES 31
#define MTL_MAX_UNIFORMS_PER_BLOCK 64
static_assert((MTL_MAX_UNIFORM_BUFFER_BINDINGS + MTL_MAX_STORAGE_BUFFER_BINDINGS) <=
MTL_MAX_BUFFER_BINDINGS);
/* Context-specific limits -- populated in 'MTLBackend::platform_init' */
struct MTLCapabilities {

View File

@ -416,6 +416,9 @@ id<MTLComputeCommandEncoder> MTLCommandBufferManager::ensure_begin_compute_encod
/* Update command buffer encoder heuristics. */
this->register_encoder_counters();
/* Reset ComputeState to ensure resource bindings are re-applied. */
compute_state_.reset_state();
}
BLI_assert(active_compute_command_encoder_ != nil);
return active_compute_command_encoder_;
@ -496,11 +499,21 @@ bool MTLCommandBufferManager::insert_memory_barrier(eGPUBarrier barrier_bits,
/* Only supporting Metal on 10.14 onward anyway - Check required for warnings. */
if (@available(macOS 10.14, *)) {
/* Apple Silicon does not support memory barriers for RenderCommandEncoders.
* We do not currently need these due to implicit API guarantees.
* NOTE(Metal): MTLFence/MTLEvent may be required to synchronize work if
* untracked resources are ever used. */
if ([context_.device hasUnifiedMemory] &&
(active_command_encoder_type_ != MTL_COMPUTE_COMMAND_ENCODER)) {
return false;
}
/* Resolve scope. */
MTLBarrierScope scope = 0;
if (barrier_bits & GPU_BARRIER_SHADER_IMAGE_ACCESS ||
barrier_bits & GPU_BARRIER_TEXTURE_FETCH) {
scope = scope | MTLBarrierScopeTextures | MTLBarrierScopeRenderTargets;
bool is_compute = (active_command_encoder_type_ != MTL_RENDER_COMMAND_ENCODER);
scope |= (is_compute ? 0 : MTLBarrierScopeRenderTargets) | MTLBarrierScopeTextures;
}
if (barrier_bits & GPU_BARRIER_SHADER_STORAGE ||
barrier_bits & GPU_BARRIER_VERTEX_ATTRIB_ARRAY ||
@ -604,7 +617,7 @@ void MTLRenderPassState::reset_state()
(uint)((fb != nullptr) ? fb->get_height() : 0)};
/* Reset cached resource binding state */
for (int ubo = 0; ubo < MTL_MAX_UNIFORM_BUFFER_BINDINGS; ubo++) {
for (int ubo = 0; ubo < MTL_MAX_BUFFER_BINDINGS; ubo++) {
this->cached_vertex_buffer_bindings[ubo].is_bytes = false;
this->cached_vertex_buffer_bindings[ubo].metal_buffer = nil;
this->cached_vertex_buffer_bindings[ubo].offset = -1;
@ -626,6 +639,26 @@ void MTLRenderPassState::reset_state()
}
}
void MTLComputeState::reset_state()
{
/* Reset Cached pipeline state. */
this->bound_pso = nil;
/* Reset cached resource binding state */
for (int ubo = 0; ubo < MTL_MAX_BUFFER_BINDINGS; ubo++) {
this->cached_compute_buffer_bindings[ubo].is_bytes = false;
this->cached_compute_buffer_bindings[ubo].metal_buffer = nil;
this->cached_compute_buffer_bindings[ubo].offset = -1;
}
/* Reset cached texture and sampler state binding state. */
for (int tex = 0; tex < MTL_MAX_TEXTURE_SLOTS; tex++) {
this->cached_compute_texture_bindings[tex].metal_texture = nil;
this->cached_compute_sampler_state_bindings[tex].sampler_state = nil;
this->cached_compute_sampler_state_bindings[tex].is_arg_buffer_binding = false;
}
}
/* Bind Texture to current RenderCommandEncoder. */
void MTLRenderPassState::bind_vertex_texture(id<MTLTexture> tex, uint slot)
{
@ -647,6 +680,19 @@ void MTLRenderPassState::bind_fragment_texture(id<MTLTexture> tex, uint slot)
}
}
void MTLComputeState::bind_compute_texture(id<MTLTexture> tex, uint slot)
{
if (this->cached_compute_texture_bindings[slot].metal_texture != tex) {
id<MTLComputeCommandEncoder> rec = this->cmd.get_active_compute_command_encoder();
BLI_assert(rec != nil);
[rec setTexture:tex atIndex:slot];
[rec useResource:tex
usage:MTLResourceUsageRead | MTLResourceUsageWrite | MTLResourceUsageSample];
this->cached_compute_texture_bindings[slot].metal_texture = tex;
}
}
void MTLRenderPassState::bind_vertex_sampler(MTLSamplerBinding &sampler_binding,
bool use_argument_buffer_for_samplers,
uint slot)
@ -726,9 +772,49 @@ void MTLRenderPassState::bind_fragment_sampler(MTLSamplerBinding &sampler_bindin
}
}
void MTLComputeState::bind_compute_sampler(MTLSamplerBinding &sampler_binding,
bool use_argument_buffer_for_samplers,
uint slot)
{
/* Range check. */
const MTLShaderInterface *shader_interface = ctx.pipeline_state.active_shader->get_interface();
BLI_assert(slot >= 0);
BLI_assert(slot <= shader_interface->get_max_texture_index());
BLI_assert(slot < MTL_MAX_TEXTURE_SLOTS);
UNUSED_VARS_NDEBUG(shader_interface);
/* If sampler state has not changed for the given slot, we do not need to fetch. */
if (this->cached_compute_sampler_state_bindings[slot].sampler_state == nil ||
!(this->cached_compute_sampler_state_bindings[slot].binding_state ==
sampler_binding.state) ||
use_argument_buffer_for_samplers) {
id<MTLSamplerState> sampler_state = (sampler_binding.state == DEFAULT_SAMPLER_STATE) ?
ctx.get_default_sampler_state() :
ctx.get_sampler_from_state(sampler_binding.state);
if (!use_argument_buffer_for_samplers) {
/* Update binding and cached state. */
id<MTLComputeCommandEncoder> rec = this->cmd.get_active_compute_command_encoder();
BLI_assert(rec != nil);
[rec setSamplerState:sampler_state atIndex:slot];
this->cached_compute_sampler_state_bindings[slot].binding_state = sampler_binding.state;
this->cached_compute_sampler_state_bindings[slot].sampler_state = sampler_state;
}
/* Flag last binding type */
this->cached_compute_sampler_state_bindings[slot].is_arg_buffer_binding =
use_argument_buffer_for_samplers;
/* Always assign to argument buffer samplers binding array - Efficiently ensures the value in
* the samplers array is always up to date. */
ctx.samplers_.mtl_sampler[slot] = sampler_state;
ctx.samplers_.mtl_sampler_flags[slot] = sampler_binding.state;
}
}
void MTLRenderPassState::bind_vertex_buffer(id<MTLBuffer> buffer, uint buffer_offset, uint index)
{
BLI_assert(index >= 0);
BLI_assert(index >= 0 && index < MTL_MAX_BUFFER_BINDINGS);
BLI_assert(buffer_offset >= 0);
BLI_assert(buffer != nil);
@ -757,7 +843,7 @@ void MTLRenderPassState::bind_vertex_buffer(id<MTLBuffer> buffer, uint buffer_of
void MTLRenderPassState::bind_fragment_buffer(id<MTLBuffer> buffer, uint buffer_offset, uint index)
{
BLI_assert(index >= 0);
BLI_assert(index >= 0 && index < MTL_MAX_BUFFER_BINDINGS);
BLI_assert(buffer_offset >= 0);
BLI_assert(buffer != nil);
@ -784,10 +870,45 @@ void MTLRenderPassState::bind_fragment_buffer(id<MTLBuffer> buffer, uint buffer_
}
}
void MTLComputeState::bind_compute_buffer(id<MTLBuffer> buffer,
uint buffer_offset,
uint index,
bool writeable)
{
BLI_assert(index >= 0 && index < MTL_MAX_BUFFER_BINDINGS);
BLI_assert(buffer_offset >= 0);
BLI_assert(buffer != nil);
BufferBindingCached &current_comp_ubo_binding = this->cached_compute_buffer_bindings[index];
if (current_comp_ubo_binding.offset != buffer_offset ||
current_comp_ubo_binding.metal_buffer != buffer || current_comp_ubo_binding.is_bytes) {
id<MTLComputeCommandEncoder> rec = this->cmd.get_active_compute_command_encoder();
BLI_assert(rec != nil);
if (current_comp_ubo_binding.metal_buffer == buffer) {
/* If buffer is the same, but offset has changed. */
[rec setBufferOffset:buffer_offset atIndex:index];
}
else {
/* Bind Compute Buffer */
[rec setBuffer:buffer offset:buffer_offset atIndex:index];
}
[rec useResource:buffer
usage:((writeable) ? (MTLResourceUsageRead | MTLResourceUsageWrite) :
MTLResourceUsageRead)];
/* Update Bind-state cache */
this->cached_compute_buffer_bindings[index].is_bytes = false;
this->cached_compute_buffer_bindings[index].metal_buffer = buffer;
this->cached_compute_buffer_bindings[index].offset = buffer_offset;
}
}
void MTLRenderPassState::bind_vertex_bytes(void *bytes, uint length, uint index)
{
/* Bytes always updated as source data may have changed. */
BLI_assert(index >= 0 && index < MTL_MAX_UNIFORM_BUFFER_BINDINGS);
BLI_assert(index >= 0 && index < MTL_MAX_BUFFER_BINDINGS);
BLI_assert(length > 0);
BLI_assert(bytes != nullptr);
@ -812,7 +933,7 @@ void MTLRenderPassState::bind_vertex_bytes(void *bytes, uint length, uint index)
void MTLRenderPassState::bind_fragment_bytes(void *bytes, uint length, uint index)
{
/* Bytes always updated as source data may have changed. */
BLI_assert(index >= 0 && index < MTL_MAX_UNIFORM_BUFFER_BINDINGS);
BLI_assert(index >= 0 && index < MTL_MAX_BUFFER_BINDINGS);
BLI_assert(length > 0);
BLI_assert(bytes != nullptr);
@ -834,6 +955,40 @@ void MTLRenderPassState::bind_fragment_bytes(void *bytes, uint length, uint inde
this->cached_fragment_buffer_bindings[index].offset = -1;
}
void MTLComputeState::bind_compute_bytes(void *bytes, uint length, uint index)
{
/* Bytes always updated as source data may have changed. */
BLI_assert(index >= 0 && index < MTL_MAX_BUFFER_BINDINGS);
BLI_assert(length > 0);
BLI_assert(bytes != nullptr);
if (length < MTL_MAX_SET_BYTES_SIZE) {
id<MTLComputeCommandEncoder> rec = this->cmd.get_active_compute_command_encoder();
[rec setBytes:bytes length:length atIndex:index];
}
else {
/* We have run over the setBytes limit, bind buffer instead. */
MTLTemporaryBuffer range =
ctx.get_scratchbuffer_manager().scratch_buffer_allocate_range_aligned(length, 256);
memcpy(range.data, bytes, length);
this->bind_compute_buffer(range.metal_buffer, range.buffer_offset, index);
}
/* Update Bind-state cache. */
this->cached_compute_buffer_bindings[index].is_bytes = true;
this->cached_compute_buffer_bindings[index].metal_buffer = nil;
this->cached_compute_buffer_bindings[index].offset = -1;
}
void MTLComputeState::bind_pso(id<MTLComputePipelineState> pso)
{
if (this->bound_pso != pso) {
id<MTLComputeCommandEncoder> rec = this->cmd.get_active_compute_command_encoder();
[rec setComputePipelineState:pso];
this->bound_pso = pso;
}
}
/** \} */
} // blender::gpu

View File

@ -66,6 +66,40 @@ struct MTLSamplerBinding {
}
};
/* Caching of resource bindings for active MTLRenderCommandEncoder.
* In Metal, resource bindings are local to the MTLCommandEncoder,
* not globally to the whole pipeline/cmd buffer. */
struct MTLBoundShaderState {
MTLShader *shader_ = nullptr;
uint pso_index_;
void set(MTLShader *shader, uint pso_index)
{
shader_ = shader;
pso_index_ = pso_index;
}
};
/* Caching of CommandEncoder Vertex/Fragment buffer bindings. */
struct BufferBindingCached {
/* Whether the given binding slot uses byte data (Push Constant equivalent)
* or an MTLBuffer. */
bool is_bytes;
id<MTLBuffer> metal_buffer;
int offset;
};
/* Caching of CommandEncoder textures bindings. */
struct TextureBindingCached {
id<MTLTexture> metal_texture;
};
/* Caching of CommandEncoder sampler states. */
struct SamplerStateBindingCached {
MTLSamplerState binding_state;
id<MTLSamplerState> sampler_state;
bool is_arg_buffer_binding;
};
/* Metal Context Render Pass State -- Used to track active RenderCommandEncoder state based on
 * bound MTLFrameBuffer's. Owned by MTLContext. */
class MTLRenderPassState {
@ -80,52 +114,16 @@ class MTLRenderPassState {
MTLContext &ctx;
MTLCommandBufferManager &cmd;
/* Caching of resource bindings for active MTLRenderCommandEncoder.
* In Metal, resource bindings are local to the MTLCommandEncoder,
* not globally to the whole pipeline/cmd buffer. */
struct MTLBoundShaderState {
MTLShader *shader_ = nullptr;
uint pso_index_;
void set(MTLShader *shader, uint pso_index)
{
shader_ = shader;
pso_index_ = pso_index;
}
};
MTLBoundShaderState last_bound_shader_state;
id<MTLRenderPipelineState> bound_pso = nil;
id<MTLDepthStencilState> bound_ds_state = nil;
uint last_used_stencil_ref_value = 0;
MTLScissorRect last_scissor_rect;
/* Caching of CommandEncoder Vertex/Fragment buffer bindings. */
struct BufferBindingCached {
/* Whether the given binding slot uses byte data (Push Constant equivalent)
* or an MTLBuffer. */
bool is_bytes;
id<MTLBuffer> metal_buffer;
int offset;
};
BufferBindingCached cached_vertex_buffer_bindings[MTL_MAX_UNIFORM_BUFFER_BINDINGS];
BufferBindingCached cached_fragment_buffer_bindings[MTL_MAX_UNIFORM_BUFFER_BINDINGS];
/* Caching of CommandEncoder textures bindings. */
struct TextureBindingCached {
id<MTLTexture> metal_texture;
};
BufferBindingCached cached_vertex_buffer_bindings[MTL_MAX_BUFFER_BINDINGS];
BufferBindingCached cached_fragment_buffer_bindings[MTL_MAX_BUFFER_BINDINGS];
TextureBindingCached cached_vertex_texture_bindings[MTL_MAX_TEXTURE_SLOTS];
TextureBindingCached cached_fragment_texture_bindings[MTL_MAX_TEXTURE_SLOTS];
/* Cached of CommandEncoder sampler states. */
struct SamplerStateBindingCached {
MTLSamplerState binding_state;
id<MTLSamplerState> sampler_state;
bool is_arg_buffer_binding;
};
SamplerStateBindingCached cached_vertex_sampler_state_bindings[MTL_MAX_TEXTURE_SLOTS];
SamplerStateBindingCached cached_fragment_sampler_state_bindings[MTL_MAX_TEXTURE_SLOTS];
@ -151,6 +149,44 @@ class MTLRenderPassState {
void bind_fragment_bytes(void *bytes, uint length, uint index);
};
/* Metal Context Compute Pass State -- Used to track active ComputeCommandEncoder state. */
class MTLComputeState {
friend class MTLContext;
public:
MTLComputeState(MTLContext &context, MTLCommandBufferManager &command_buffer_manager)
: ctx(context), cmd(command_buffer_manager){};
/* Given a ComputePassState is associated with a live ComputeCommandEncoder,
* this state sits within the MTLCommandBufferManager. */
MTLContext &ctx;
MTLCommandBufferManager &cmd;
id<MTLComputePipelineState> bound_pso = nil;
BufferBindingCached cached_compute_buffer_bindings[MTL_MAX_BUFFER_BINDINGS];
TextureBindingCached cached_compute_texture_bindings[MTL_MAX_TEXTURE_SLOTS];
SamplerStateBindingCached cached_compute_sampler_state_bindings[MTL_MAX_TEXTURE_SLOTS];
/* Reset ComputeCommandEncoder binding state. */
void reset_state();
/* PSO Binding. */
void bind_pso(id<MTLComputePipelineState> pso);
/* Texture Binding (ComputeCommandEncoder). */
void bind_compute_texture(id<MTLTexture> tex, uint slot);
/* Sampler Binding (ComputeCommandEncoder). */
void bind_compute_sampler(MTLSamplerBinding &sampler_binding,
bool use_argument_buffer_for_samplers,
uint slot);
/* Buffer binding (ComputeCommandEncoder). */
void bind_compute_buffer(id<MTLBuffer> buffer,
uint buffer_offset,
uint index,
bool writeable = false);
void bind_compute_bytes(void *bytes, uint length, uint index);
};
/* Depth Stencil State */
struct MTLContextDepthStencilState {
@ -521,6 +557,9 @@ class MTLCommandBufferManager {
MTLFrameBuffer *active_frame_buffer_ = nullptr;
MTLRenderPassDescriptor *active_pass_descriptor_ = nullptr;
/* State associated with active ComputeCommandEncoder. */
MTLComputeState compute_state_;
/* Workload heuristics - We may need to split command buffers to optimize workload and balancing.
*/
int current_draw_call_count_ = 0;
@ -530,7 +569,7 @@ class MTLCommandBufferManager {
public:
MTLCommandBufferManager(MTLContext &context)
: context_(context), render_pass_state_(context, *this){};
: context_(context), render_pass_state_(context, *this), compute_state_(context, *this){};
void prepare(bool supports_render = true);
/* If wait is true, CPU will stall until GPU work has completed. */
@ -553,6 +592,14 @@ class MTLCommandBufferManager {
return render_pass_state_;
}
/* ComputeState for ComputeCommandEncoder. */
MTLComputeState &get_compute_state()
{
/* Compute state should only be valid if we are inside a compute encoder. */
BLI_assert(this->is_inside_compute());
return compute_state_;
}
/* Rendering Heuristics. */
void register_draw_counters(int vertex_submission);
void reset_counters();
@ -593,6 +640,7 @@ class MTLCommandBufferManager {
class MTLContext : public Context {
friend class MTLBackend;
friend class MTLRenderPassState;
friend class MTLComputeState;
public:
/* Swap-chain and latency management. */
@ -745,9 +793,16 @@ class MTLContext : public Context {
id<MTLRenderCommandEncoder> rec,
const MTLShaderInterface *shader_interface,
const MTLRenderPipelineStateInstance *pipeline_state_instance);
bool ensure_uniform_buffer_bindings(
id<MTLComputeCommandEncoder> rec,
const MTLShaderInterface *shader_interface,
const MTLComputePipelineStateInstance &pipeline_state_instance);
void ensure_texture_bindings(id<MTLRenderCommandEncoder> rec,
MTLShaderInterface *shader_interface,
const MTLRenderPipelineStateInstance *pipeline_state_instance);
void ensure_texture_bindings(id<MTLComputeCommandEncoder> rec,
MTLShaderInterface *shader_interface,
const MTLComputePipelineStateInstance &pipeline_state_instance);
void ensure_depth_stencil_state(MTLPrimitiveType prim_type);
id<MTLBuffer> get_null_buffer();
@ -755,6 +810,10 @@ class MTLContext : public Context {
gpu::MTLTexture *get_dummy_texture(eGPUTextureType type, eGPUSamplerFormat sampler_format);
void free_dummy_resources();
/* Compute. */
bool ensure_compute_pipeline_state();
void compute_dispatch(int groups_x_len, int groups_y_len, int groups_z_len);
/* State assignment. */
void set_viewport(int origin_x, int origin_y, int width, int height);
void set_scissor(int scissor_x, int scissor_y, int scissor_width, int scissor_height);

View File

@ -1034,6 +1034,7 @@ bool MTLContext::ensure_uniform_buffer_bindings(
uint32_t block_size = push_constant_block.size;
uint32_t buffer_index = pipeline_state_instance->base_uniform_buffer_index +
push_constant_block.buffer_index;
BLI_assert(buffer_index >= 0 && buffer_index < MTL_MAX_BUFFER_BINDINGS);
/* Only need to rebind block if push constants have been modified -- or if no data is bound for
* the current RenderCommandEncoder. */
@ -1156,15 +1157,13 @@ bool MTLContext::ensure_uniform_buffer_bindings(
/* Bind Vertex UBO. */
if (bool(ubo.stage_mask & ShaderStage::VERTEX)) {
BLI_assert(buffer_bind_index >= 0 &&
buffer_bind_index < MTL_MAX_UNIFORM_BUFFER_BINDINGS);
BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS);
rps.bind_vertex_buffer(ubo_buffer, ubo_offset, buffer_bind_index);
}
/* Bind Fragment UBOs. */
if (bool(ubo.stage_mask & ShaderStage::FRAGMENT)) {
BLI_assert(buffer_bind_index >= 0 &&
buffer_bind_index < MTL_MAX_UNIFORM_BUFFER_BINDINGS);
BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS);
rps.bind_fragment_buffer(ubo_buffer, ubo_offset, buffer_bind_index);
}
}
@ -1181,6 +1180,115 @@ bool MTLContext::ensure_uniform_buffer_bindings(
return true;
}
/* Variant for compute. Bind uniform buffers to an active compute command encoder using the
 * rendering state of the current context (active shader, bound UBOs). */
bool MTLContext::ensure_uniform_buffer_bindings(
id<MTLComputeCommandEncoder> rec,
const MTLShaderInterface *shader_interface,
const MTLComputePipelineStateInstance &pipeline_state_instance)
{
/* Fetch Compute Pass state. */
MTLComputeState &cs = this->main_command_buffer.get_compute_state();
/* Fetch push constant block and bind. */
const MTLShaderUniformBlock &push_constant_block = shader_interface->get_push_constant_block();
if (push_constant_block.size > 0) {
/* Fetch uniform buffer base binding index from pipeline_state_instance - The buffer index
* will be offset by the number of bound VBOs. */
uint32_t block_size = push_constant_block.size;
uint32_t buffer_index = pipeline_state_instance.base_uniform_buffer_index +
push_constant_block.buffer_index;
BLI_assert(buffer_index >= 0 && buffer_index < MTL_MAX_BUFFER_BINDINGS);
/* For compute, we must always re-bind the push constant block as other compute
 * operations may have assigned resources over the top, outside of the compiled
* compute shader path. */
/* Bind push constant data. */
BLI_assert(this->pipeline_state.active_shader->get_push_constant_data() != nullptr);
cs.bind_compute_bytes(
this->pipeline_state.active_shader->get_push_constant_data(), block_size, buffer_index);
/* Only need to rebind block if it has been modified. */
this->pipeline_state.active_shader->push_constant_bindstate_mark_dirty(false);
}
/* Bind Global GPUUniformBuffers */
/* Iterate through expected UBOs in the shader interface, and check if the globally bound ones
* match. This is used to support the gpu_uniformbuffer module, where the uniform data is global,
* and not owned by the shader instance. */
for (const uint ubo_index : IndexRange(shader_interface->get_total_uniform_blocks())) {
const MTLShaderUniformBlock &ubo = shader_interface->get_uniform_block(ubo_index);
if (ubo.buffer_index >= 0) {
/* Uniform Buffer index offset by 1 as the first shader buffer binding slot is reserved for
* the uniform PushConstantBlock. */
const uint32_t buffer_index = ubo.buffer_index + 1;
int ubo_offset = 0;
id<MTLBuffer> ubo_buffer = nil;
int ubo_size = 0;
bool bind_dummy_buffer = false;
if (this->pipeline_state.ubo_bindings[ubo_index].bound) {
/* Fetch UBO global-binding properties from slot. */
ubo_offset = 0;
ubo_buffer = this->pipeline_state.ubo_bindings[ubo_index].ubo->get_metal_buffer(
&ubo_offset);
ubo_size = this->pipeline_state.ubo_bindings[ubo_index].ubo->get_size();
UNUSED_VARS_NDEBUG(ubo_size);
/* Use dummy zero buffer if no buffer assigned -- this is an optimization to avoid
* allocating zero buffers. */
if (ubo_buffer == nil) {
bind_dummy_buffer = true;
}
else {
BLI_assert(ubo_buffer != nil);
BLI_assert(ubo_size > 0);
}
}
else {
MTL_LOG_INFO(
"[Warning][UBO] Shader '%s' expected UBO '%s' to be bound at buffer index: %d -- but "
"nothing was bound -- binding dummy buffer\n",
shader_interface->get_name(),
shader_interface->get_name_at_offset(ubo.name_offset),
buffer_index);
bind_dummy_buffer = true;
}
if (bind_dummy_buffer) {
/* Perform Dummy binding. */
ubo_offset = 0;
ubo_buffer = this->get_null_buffer();
ubo_size = [ubo_buffer length];
}
if (ubo_buffer != nil) {
uint32_t buffer_bind_index = pipeline_state_instance.base_uniform_buffer_index +
buffer_index;
/* Bind Compute UBO. */
if (bool(ubo.stage_mask & ShaderStage::COMPUTE)) {
BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS);
cs.bind_compute_buffer(ubo_buffer, ubo_offset, buffer_bind_index);
}
}
else {
MTL_LOG_WARNING(
"[UBO] Compute Shader '%s' has UBO '%s' bound at buffer index: %d -- but MTLBuffer "
"is NULL!\n",
shader_interface->get_name(),
shader_interface->get_name_at_offset(ubo.name_offset),
buffer_index);
}
}
}
return true;
}
/* Ensure texture bindings are correct and up to date for current draw call. */
void MTLContext::ensure_texture_bindings(
id<MTLRenderCommandEncoder> rec,
@ -1198,8 +1306,11 @@ void MTLContext::ensure_texture_bindings(
int fragment_arg_buffer_bind_index = -1;
/* Argument buffers are used for samplers, when the limit of 16 is exceeded. */
bool use_argument_buffer_for_samplers = shader_interface->get_use_argument_buffer_for_samplers(
&vertex_arg_buffer_bind_index, &fragment_arg_buffer_bind_index);
bool use_argument_buffer_for_samplers = shader_interface->uses_argument_buffer_for_samplers();
vertex_arg_buffer_bind_index = shader_interface->get_argument_buffer_bind_index(
ShaderStage::VERTEX);
fragment_arg_buffer_bind_index = shader_interface->get_argument_buffer_bind_index(
ShaderStage::FRAGMENT);
/* Loop through expected textures in shader interface and resolve bindings with currently
* bound textures.. */
@ -1396,6 +1507,200 @@ void MTLContext::ensure_texture_bindings(
}
}
/* Texture binding variant for compute command encoder.
* Ensure bound texture resources are bound to the active MTLComputeCommandEncoder. */
void MTLContext::ensure_texture_bindings(
id<MTLComputeCommandEncoder> rec,
MTLShaderInterface *shader_interface,
const MTLComputePipelineStateInstance &pipeline_state_instance)
{
BLI_assert(shader_interface != nil);
BLI_assert(rec != nil);
/* Fetch Compute Pass state. */
MTLComputeState &cs = this->main_command_buffer.get_compute_state();
@autoreleasepool {
int compute_arg_buffer_bind_index = -1;
int null_index = -1;
/* Argument buffers are used for samplers, when the limit of 16 is exceeded.
 * NOTE: Compute uses vertex argument for arg buffer bind index. */
bool use_argument_buffer_for_samplers = shader_interface->uses_argument_buffer_for_samplers();
compute_arg_buffer_bind_index = shader_interface->get_argument_buffer_bind_index(
ShaderStage::COMPUTE);
/* Loop through expected textures in shader interface and resolve bindings with currently
* bound textures.. */
for (const uint t : IndexRange(shader_interface->get_max_texture_index() + 1)) {
/* Ensure the bound texture is compatible with the shader interface. If the
* shader does not expect a texture to be bound for the current slot, we skip
* binding.
* NOTE: Global texture bindings may be left over from prior draw calls. */
const MTLShaderTexture &shader_texture_info = shader_interface->get_texture(t);
if (!shader_texture_info.used) {
/* Skip unused binding points if explicit indices are specified. */
continue;
}
int slot = shader_texture_info.slot_index;
if (slot >= 0 && slot < GPU_max_textures()) {
bool bind_dummy_texture = true;
if (this->pipeline_state.texture_bindings[slot].used) {
gpu::MTLTexture *bound_texture =
this->pipeline_state.texture_bindings[slot].texture_resource;
MTLSamplerBinding &bound_sampler = this->pipeline_state.sampler_bindings[slot];
BLI_assert(bound_texture);
BLI_assert(bound_sampler.used);
if (shader_texture_info.type == bound_texture->type_) {
/* Bind texture and sampler if the bound texture matches the type expected by the
* shader. */
id<MTLTexture> tex = bound_texture->get_metal_handle();
if (bool(shader_texture_info.stage_mask & ShaderStage::COMPUTE)) {
cs.bind_compute_texture(tex, slot);
cs.bind_compute_sampler(bound_sampler, use_argument_buffer_for_samplers, slot);
}
/* Texture state resolved, no need to bind dummy texture */
bind_dummy_texture = false;
}
else {
/* Texture type for bound texture (e.g. Texture2DArray) does not match what was
* expected in the shader interface. This is a problem and we will need to bind
* a dummy texture to ensure correct API usage. */
MTL_LOG_WARNING(
"(Shader '%s') Texture %p bound to slot %d is incompatible -- Wrong "
"texture target type. (Expecting type %d, actual type %d) (binding "
"name:'%s')(texture name:'%s')\n",
shader_interface->get_name(),
bound_texture,
slot,
shader_texture_info.type,
bound_texture->type_,
shader_interface->get_name_at_offset(shader_texture_info.name_offset),
bound_texture->get_name());
}
}
else {
MTL_LOG_WARNING(
"Shader '%s' expected texture to be bound to slot %d -- No texture was "
"bound. (name:'%s')\n",
shader_interface->get_name(),
slot,
shader_interface->get_name_at_offset(shader_texture_info.name_offset));
}
/* Bind Dummy texture -- will temporarily resolve validation issues while incorrect formats
* are provided -- as certain configurations may not need any binding. These issues should
* be fixed in the high-level, if problems crop up. */
if (bind_dummy_texture) {
if (bool(shader_texture_info.stage_mask & ShaderStage::COMPUTE)) {
cs.bind_compute_texture(
get_dummy_texture(shader_texture_info.type, shader_texture_info.sampler_format)
->get_metal_handle(),
slot);
/* Bind default sampler state. */
MTLSamplerBinding default_binding = {true, DEFAULT_SAMPLER_STATE};
cs.bind_compute_sampler(default_binding, use_argument_buffer_for_samplers, slot);
}
}
}
else {
MTL_LOG_WARNING(
"Shader %p expected texture to be bound to slot %d -- Slot exceeds the "
"hardware/API limit of '%d'. (name:'%s')\n",
this->pipeline_state.active_shader,
slot,
GPU_max_textures(),
shader_interface->get_name_at_offset(shader_texture_info.name_offset));
}
}
/* Construct and Bind argument buffer.
* NOTE(Metal): Samplers use an argument buffer when the limit of 16 samplers is exceeded. */
if (use_argument_buffer_for_samplers) {
#ifndef NDEBUG
/* Debug check to validate each expected texture in the shader interface has a valid
* sampler object bound to the context. We will need all of these to be valid
* when constructing the sampler argument buffer. */
for (const uint i : IndexRange(shader_interface->get_max_texture_index() + 1)) {
const MTLShaderTexture &texture = shader_interface->get_texture(i);
if (texture.used) {
BLI_assert(this->samplers_.mtl_sampler[i] != nil);
}
}
#endif
/* Check to ensure the buffer binding index for the argument buffer has been assigned.
* This PSO property will be set if we expect to use argument buffers, and the shader
* uses any amount of textures. */
BLI_assert(compute_arg_buffer_bind_index >= 0);
if (compute_arg_buffer_bind_index >= 0) {
/* Offset binding index to be relative to the start of static uniform buffer binding slots.
* The first N slots, prior to `pipeline_state_instance->base_uniform_buffer_index` are
* used by vertex and index buffer bindings, and the number of buffers present will vary
* between PSOs. */
int arg_buffer_idx = (pipeline_state_instance.base_uniform_buffer_index +
compute_arg_buffer_bind_index);
assert(arg_buffer_idx < 32);
id<MTLArgumentEncoder> argument_encoder = shader_interface->find_argument_encoder(
arg_buffer_idx);
if (argument_encoder == nil) {
argument_encoder = [pipeline_state_instance.compute
newArgumentEncoderWithBufferIndex:arg_buffer_idx];
shader_interface->insert_argument_encoder(arg_buffer_idx, argument_encoder);
}
/* Generate or Fetch argument buffer sampler configuration.
* NOTE(Metal): we need to base sampler counts off of the maximal texture
* index. This is not the most optimal, but in practice, not a use-case
* when argument buffers are required.
* This is because with explicit texture indices, the binding indices
* should match across draws, to allow the high-level to optimize bind-points. */
gpu::MTLBuffer *encoder_buffer = nullptr;
this->samplers_.num_samplers = shader_interface->get_max_texture_index() + 1;
gpu::MTLBuffer **cached_smp_buffer_search = this->cached_sampler_buffers_.lookup_ptr(
this->samplers_);
if (cached_smp_buffer_search != nullptr) {
encoder_buffer = *cached_smp_buffer_search;
}
else {
/* Populate argument buffer with current global sampler bindings. */
int size = [argument_encoder encodedLength];
int alignment = max_uu([argument_encoder alignment], 256);
int size_align_delta = (size % alignment);
int aligned_alloc_size = ((alignment > 1) && (size_align_delta > 0)) ?
size + (alignment - (size % alignment)) :
size;
/* Allocate buffer to store encoded sampler arguments. */
encoder_buffer = MTLContext::get_global_memory_manager()->allocate(aligned_alloc_size,
true);
BLI_assert(encoder_buffer);
BLI_assert(encoder_buffer->get_metal_buffer());
[argument_encoder setArgumentBuffer:encoder_buffer->get_metal_buffer() offset:0];
[argument_encoder
setSamplerStates:this->samplers_.mtl_sampler
withRange:NSMakeRange(0, shader_interface->get_max_texture_index() + 1)];
encoder_buffer->flush();
/* Insert into cache. */
this->cached_sampler_buffers_.add_new(this->samplers_, encoder_buffer);
}
BLI_assert(encoder_buffer != nullptr);
int compute_buffer_index = (pipeline_state_instance.base_uniform_buffer_index +
compute_arg_buffer_bind_index);
cs.bind_compute_buffer(encoder_buffer->get_metal_buffer(), 0, compute_buffer_index);
}
}
}
}
/* Encode latest depth-stencil state. */
void MTLContext::ensure_depth_stencil_state(MTLPrimitiveType prim_type)
{
@ -1531,6 +1836,81 @@ void MTLContext::ensure_depth_stencil_state(MTLPrimitiveType prim_type)
/** \} */
/* -------------------------------------------------------------------- */
/** \name Compute dispatch.
* \{ */
bool MTLContext::ensure_compute_pipeline_state()
{
/* Verify if bound shader is valid and fetch MTLComputePipelineStateInstance. */
/* Check if an active shader is bound. */
if (!this->pipeline_state.active_shader) {
MTL_LOG_WARNING("No Metal shader bound!\n");
return false;
}
/* Also ensure active shader is valid. */
if (!this->pipeline_state.active_shader->is_valid()) {
MTL_LOG_WARNING(
"Bound active shader is not valid (Missing/invalid implementation for Metal).\n", );
return false;
}
/* Verify this is a compute shader. */
/* Fetch shader interface. */
MTLShaderInterface *shader_interface = this->pipeline_state.active_shader->get_interface();
if (shader_interface == nullptr) {
MTL_LOG_WARNING("Bound active shader does not have a valid shader interface!\n", );
return false;
}
bool success = this->pipeline_state.active_shader->bake_compute_pipeline_state(this);
const MTLComputePipelineStateInstance &compute_pso_inst =
this->pipeline_state.active_shader->get_compute_pipeline_state();
if (!success || compute_pso_inst.pso == nil) {
MTL_LOG_WARNING("No valid compute PSO for compute dispatch!\n", );
return false;
}
return true;
}
void MTLContext::compute_dispatch(int groups_x_len, int groups_y_len, int groups_z_len)
{
/* Ensure all resources required by upcoming compute submission are correctly bound. */
if (this->ensure_compute_pipeline_state()) {
/* Shader instance. */
MTLShaderInterface *shader_interface = this->pipeline_state.active_shader->get_interface();
const MTLComputePipelineStateInstance &compute_pso_inst =
this->pipeline_state.active_shader->get_compute_pipeline_state();
/* Begin compute encoder. */
id<MTLComputeCommandEncoder> compute_encoder =
this->main_command_buffer.ensure_begin_compute_encoder();
BLI_assert(compute_encoder != nil);
/* Bind PSO. */
MTLComputeState &cs = this->main_command_buffer.get_compute_state();
cs.bind_pso(compute_pso_inst.pso);
/* Bind buffers. */
this->ensure_uniform_buffer_bindings(compute_encoder, shader_interface, compute_pso_inst);
/** Ensure resource bindings. */
/* Texture Bindings. */
/* We will iterate through all texture bindings on the context and determine if any of the
* active slots match those in our shader interface. If so, textures will be bound. */
if (shader_interface->get_total_textures() > 0) {
this->ensure_texture_bindings(compute_encoder, shader_interface, compute_pso_inst);
}
/* Dispatch compute. */
[compute_encoder dispatchThreadgroups:MTLSizeMake(groups_x_len, groups_y_len, groups_z_len)
threadsPerThreadgroup:MTLSizeMake(compute_pso_inst.threadgroup_x_len,
compute_pso_inst.threadgroup_y_len,
compute_pso_inst.threadgroup_z_len)];
}
}
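
/* Editor's sketch (not part of the patch): the dispatch above follows GL-style semantics, where
 * the caller supplies workgroup counts and the per-group thread count comes from the shader,
 * baked into the compute PSO. The helper below makes that relationship explicit; it is purely
 * illustrative and not used by the backend. */
static MTLSize mtl_total_thread_count_example(const MTLComputePipelineStateInstance &pso_inst,
                                              int groups_x, int groups_y, int groups_z)
{
  /* Total invocations = workgroup count * workgroup size, per dimension. */
  return MTLSizeMake(groups_x * pso_inst.threadgroup_x_len,
                     groups_y * pso_inst.threadgroup_y_len,
                     groups_z * pso_inst.threadgroup_z_len);
}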
/** \} */
/* -------------------------------------------------------------------- */
/** \name Visibility buffer control for MTLQueryPool.
* \{ */

View File

@ -70,6 +70,8 @@ struct MTLRenderPipelineStateInstance {
/* Base bind index for binding uniform buffers, offset based on other
* bound buffers such as vertex buffers, as the count can vary. */
int base_uniform_buffer_index;
/* Base bind index for binding storage buffers. */
int base_ssbo_buffer_index;
/* buffer bind slot used for null attributes (-1 if not needed). */
int null_attribute_buffer_index;
/* buffer bind used for transform feedback output buffer. */
@ -86,14 +88,43 @@ struct MTLRenderPipelineStateInstance {
blender::Vector<MTLBufferArgumentData> buffer_bindings_reflection_data_frag;
};
/* Metal Compute Pipeline State instance. */
struct MTLComputePipelineStateInstance {
/* Function instances with specialization.
* Required for argument encoder construction. */
id<MTLFunction> compute = nil;
/* PSO handle. */
id<MTLComputePipelineState> pso = nil;
/* Base bind index for binding uniform buffers, offset based on other
* bound buffers such as vertex buffers, as the count can vary. */
int base_uniform_buffer_index = -1;
/* Base bind index for binding storage buffers. */
int base_ssbo_buffer_index = -1;
int threadgroup_x_len = 1;
int threadgroup_y_len = 1;
int threadgroup_z_len = 1;
inline void set_compute_workgroup_size(int workgroup_size_x,
int workgroup_size_y,
int workgroup_size_z)
{
this->threadgroup_x_len = workgroup_size_x;
this->threadgroup_y_len = workgroup_size_y;
this->threadgroup_z_len = workgroup_size_z;
}
};
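
/* Usage sketch (editorial, illustrative only): the workgroup size is presumably derived from the
 * shader's `layout(local_size_x, ...)` declaration during translation and recorded on the PSO
 * instance, so `compute_dispatch()` can pass it as `threadsPerThreadgroup`, e.g.:
 *
 *   MTLComputePipelineStateInstance pso_inst;
 *   pso_inst.set_compute_workgroup_size(16, 16, 1);
 */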
/* #MTLShaderBuilder source wrapper used during initial compilation. */
struct MTLShaderBuilder {
NSString *msl_source_vert_ = @"";
NSString *msl_source_frag_ = @"";
NSString *msl_source_compute_ = @"";
/* Generated GLSL source used during compilation. */
std::string glsl_vertex_source_ = "";
std::string glsl_fragment_source_ = "";
std::string glsl_compute_source_ = "";
/* Indicates whether source code has been provided via MSL directly. */
bool source_from_msl_ = false;
@ -141,10 +172,12 @@ class MTLShader : public Shader {
MTLShaderBuilder *shd_builder_ = nullptr;
NSString *vertex_function_name_ = @"";
NSString *fragment_function_name_ = @"";
NSString *compute_function_name_ = @"";
/** Compiled shader resources. */
id<MTLLibrary> shader_library_vert_ = nil;
id<MTLLibrary> shader_library_frag_ = nil;
id<MTLLibrary> shader_library_compute_ = nil;
bool valid_ = false;
/** Render pipeline state and PSO caching. */
@ -156,6 +189,9 @@ class MTLShader : public Shader {
/* Cache of compiled PipelineStateObjects. */
blender::Map<MTLRenderPipelineStateDescriptor, MTLRenderPipelineStateInstance *> pso_cache_;
/** Compute pipeline state and Compute PSO caching. */
MTLComputePipelineStateInstance compute_pso_instance_;
/* True to enable multi-layered rendering support. */
bool uses_mtl_array_index_ = false;
@ -219,6 +255,7 @@ class MTLShader : public Shader {
/* Compile and build - Return true if successful. */
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
bool finalize_compute(const shader::ShaderCreateInfo *info);
/* Utility. */
bool is_valid()
@ -289,11 +326,15 @@ class MTLShader : public Shader {
/* Metal shader properties and source mapping. */
  void set_vertex_function_name(NSString *vertex_function_name);
void set_fragment_function_name(NSString *fragment_function_name_);
void set_fragment_function_name(NSString *fragment_function_name);
void set_compute_function_name(NSString *compute_function_name);
void shader_source_from_msl(NSString *input_vertex_source, NSString *input_fragment_source);
void shader_compute_source_from_msl(NSString *input_compute_source);
void set_interface(MTLShaderInterface *interface);
MTLRenderPipelineStateInstance *bake_current_pipeline_state(MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type);
bool bake_compute_pipeline_state(MTLContext *ctx);
const MTLComputePipelineStateInstance &get_compute_pipeline_state();
/* Transform Feedback. */
GPUVertBuf *get_transform_feedback_active_buffer();
@ -302,6 +343,7 @@ class MTLShader : public Shader {
private:
/* Generate MSL shader from GLSL source. */
bool generate_msl_from_glsl(const shader::ShaderCreateInfo *info);
bool generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *info);
MEM_CXX_CLASS_ALLOC_FUNCS("MTLShader");
};

View File

@ -124,6 +124,15 @@ MTLShader::~MTLShader()
}
pso_cache_.clear();
/* Free Compute pipeline state object. */
if (compute_pso_instance_.compute) {
[compute_pso_instance_.compute release];
compute_pso_instance_.compute = nil;
}
if (compute_pso_instance_.pso) {
[compute_pso_instance_.pso release];
compute_pso_instance_.pso = nil;
}
/* NOTE(Metal): #ShaderInterface deletion is handled in the super destructor `~Shader()`. */
}
valid_ = false;
@ -181,12 +190,19 @@ void MTLShader::fragment_shader_from_glsl(MutableSpan<const char *> sources)
void MTLShader::compute_shader_from_glsl(MutableSpan<const char *> sources)
{
/* Flag source as not being compiled from native MSL. */
BLI_assert(shd_builder_ != nullptr);
shd_builder_->source_from_msl_ = false;
/* Remove #version tag entry. */
sources[0] = "";
/* TODO(Metal): Support compute shaders in Metal. */
MTL_LOG_WARNING(
"MTLShader::compute_shader_from_glsl - Compute shaders currently unsupported!\n");
/* Consolidate GLSL compute sources. */
std::stringstream ss;
for (int i = 0; i < sources.size(); i++) {
ss << sources[i] << std::endl;
}
shd_builder_->glsl_compute_source_ = ss.str();
}
bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
@ -196,6 +212,14 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
MTL_LOG_ERROR("Shader (%p) '%s' has already been finalized!\n", this, this->name_get());
}
/* Compute shaders. */
bool is_compute = false;
if (shd_builder_->glsl_compute_source_.size() > 0) {
BLI_assert_msg(info != nullptr, "Compute shaders must use CreateInfo.\n");
BLI_assert_msg(!shd_builder_->source_from_msl_, "Compute shaders must compile from GLSL.");
is_compute = true;
}
/* Perform GLSL to MSL source translation. */
BLI_assert(shd_builder_ != nullptr);
if (!shd_builder_->source_from_msl_) {
@ -226,12 +250,20 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
BLI_assert(device != nil);
/* Ensure source and stage entry-point names are set. */
BLI_assert([vertex_function_name_ length] > 0);
if (transform_feedback_type_ == GPU_SHADER_TFB_NONE) {
BLI_assert([fragment_function_name_ length] > 0);
}
BLI_assert(shd_builder_ != nullptr);
BLI_assert([shd_builder_->msl_source_vert_ length] > 0);
if (is_compute) {
/* Compute path. */
BLI_assert([compute_function_name_ length] > 0);
BLI_assert([shd_builder_->msl_source_compute_ length] > 0);
}
else {
/* Vertex/Fragment path. */
BLI_assert([vertex_function_name_ length] > 0);
if (transform_feedback_type_ == GPU_SHADER_TFB_NONE) {
BLI_assert([fragment_function_name_ length] > 0);
}
BLI_assert([shd_builder_->msl_source_vert_ length] > 0);
}
@autoreleasepool {
MTLCompileOptions *options = [[[MTLCompileOptions alloc] init] autorelease];
@ -239,13 +271,24 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
options.fastMathEnabled = YES;
NSString *source_to_compile = shd_builder_->msl_source_vert_;
for (int src_stage = 0; src_stage <= 1; src_stage++) {
source_to_compile = (src_stage == 0) ? shd_builder_->msl_source_vert_ :
shd_builder_->msl_source_frag_;
/* Vertex/Fragment compile stages 0 and/or 1.
* Compute shaders compile as stage 2. */
ShaderStage initial_stage = (is_compute) ? ShaderStage::COMPUTE : ShaderStage::VERTEX;
ShaderStage src_stage = initial_stage;
uint8_t total_stages = (is_compute) ? 1 : 2;
for (int stage_count = 0; stage_count < total_stages; stage_count++) {
source_to_compile = (src_stage == ShaderStage::VERTEX) ?
shd_builder_->msl_source_vert_ :
((src_stage == ShaderStage::COMPUTE) ?
shd_builder_->msl_source_compute_ :
shd_builder_->msl_source_frag_);
/* Transform feedback, skip compilation. */
if (src_stage == 1 && (transform_feedback_type_ != GPU_SHADER_TFB_NONE)) {
if (src_stage == ShaderStage::FRAGMENT &&
(transform_feedback_type_ != GPU_SHADER_TFB_NONE)) {
shader_library_frag_ = nil;
break;
}
@ -276,8 +319,9 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
/* Only exit out if genuine error and not warning. */
if ([[error localizedDescription] rangeOfString:@"Compilation succeeded"].location ==
NSNotFound) {
NSLog(
@"Compile Error - Metal Shader Library (Stage: %d), error %@ \n", src_stage, error);
NSLog(@"Compile Error - Metal Shader Library (Stage: %hhu), error %@ \n",
src_stage,
error);
BLI_assert(false);
/* Release temporary compilation resources. */
@ -287,30 +331,52 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
}
}
MTL_LOG_INFO("Successfully compiled Metal Shader Library (Stage: %d) for shader; %s\n",
src_stage,
name);
BLI_assert(library != nil);
if (src_stage == 0) {
/* Retain generated library and assign debug name. */
shader_library_vert_ = library;
[shader_library_vert_ retain];
shader_library_vert_.label = [NSString stringWithUTF8String:this->name];
}
else {
/* Retain generated library for fragment shader and assign debug name. */
shader_library_frag_ = library;
[shader_library_frag_ retain];
shader_library_frag_.label = [NSString stringWithUTF8String:this->name];
switch (src_stage) {
case ShaderStage::VERTEX: {
/* Retain generated library and assign debug name. */
shader_library_vert_ = library;
[shader_library_vert_ retain];
shader_library_vert_.label = [NSString stringWithUTF8String:this->name];
} break;
case ShaderStage::FRAGMENT: {
/* Retain generated library for fragment shader and assign debug name. */
shader_library_frag_ = library;
[shader_library_frag_ retain];
shader_library_frag_.label = [NSString stringWithUTF8String:this->name];
} break;
case ShaderStage::COMPUTE: {
        /* Retain generated library for the compute shader and assign debug name. */
shader_library_compute_ = library;
[shader_library_compute_ retain];
shader_library_compute_.label = [NSString stringWithUTF8String:this->name];
} break;
case ShaderStage::ANY: {
          /* Suppress warnings. */
BLI_assert_unreachable();
} break;
}
[source_with_header autorelease];
}
pso_descriptor_.label = [NSString stringWithUTF8String:this->name];
/* Prepare descriptor. */
pso_descriptor_ = [[MTLRenderPipelineDescriptor alloc] init];
[pso_descriptor_ retain];
/* Move onto next compilation stage. */
if (!is_compute) {
src_stage = ShaderStage::FRAGMENT;
}
else {
break;
}
}
/* Create descriptors.
* Each shader type requires a differing descriptor. */
if (!is_compute) {
/* Prepare Render pipeline descriptor. */
pso_descriptor_ = [[MTLRenderPipelineDescriptor alloc] init];
[pso_descriptor_ retain];
pso_descriptor_.label = [NSString stringWithUTF8String:this->name];
}
/* Shader has successfully been created. */
valid_ = true;
@ -324,6 +390,11 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
else {
push_constant_data_ = nullptr;
}
/* If this is a compute shader, bake PSO for compute straight-away. */
if (is_compute) {
this->bake_compute_pipeline_state(context_);
}
}
/* Release temporary compilation resources. */
@ -332,6 +403,11 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
return true;
}
const MTLComputePipelineStateInstance &MTLShader::get_compute_pipeline_state()
{
return this->compute_pso_instance_;
}
void MTLShader::transform_feedback_names_set(Span<const char *> name_list,
const eGPUShaderTFBType geom_type)
{
@ -556,6 +632,11 @@ void MTLShader::set_fragment_function_name(NSString *frag_function_name)
fragment_function_name_ = frag_function_name;
}
void MTLShader::set_compute_function_name(NSString *compute_function_name)
{
compute_function_name_ = compute_function_name;
}
void MTLShader::shader_source_from_msl(NSString *input_vertex_source,
NSString *input_fragment_source)
{
@ -565,6 +646,13 @@ void MTLShader::shader_source_from_msl(NSString *input_vertex_source,
shd_builder_->source_from_msl_ = true;
}
void MTLShader::shader_compute_source_from_msl(NSString *input_compute_source)
{
BLI_assert(shd_builder_ != nullptr);
shd_builder_->msl_source_compute_ = input_compute_source;
shd_builder_->source_from_msl_ = true;
}
void MTLShader::set_interface(MTLShaderInterface *interface)
{
/* Assign gpu::Shader super-class interface. */
@ -985,7 +1073,6 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
desc.stencilAttachmentPixelFormat = current_state.stencil_attachment_format;
/* Compile PSO */
MTLAutoreleasedRenderPipelineReflection reflection_data;
id<MTLRenderPipelineState> pso = [ctx->device
newRenderPipelineStateWithDescriptor:desc
@ -1003,7 +1090,9 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
return nullptr;
}
else {
#ifndef NDEBUG
NSLog(@"Successfully compiled PSO for shader: %s (Metal Context: %p)\n", this->name, ctx);
#endif
}
/* Prepare pipeline state instance. */
@ -1106,6 +1195,83 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
return pso_inst;
}
}
bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx)
{
/* NOTE(Metal): Bakes and caches a PSO for compute. */
BLI_assert(this);
BLI_assert(this->is_valid());
BLI_assert(shader_library_compute_ != nil);
if (compute_pso_instance_.pso == nil) {
/* Prepare Compute Pipeline Descriptor. */
    /* Set up function specialization constants, used to modify and optimize
     * generated code based on the current pipeline configuration. */
MTLFunctionConstantValues *values = [[MTLFunctionConstantValues new] autorelease];
/* Offset the bind index for Uniform buffers such that they begin after the VBO
* buffer bind slots. `MTL_uniform_buffer_base_index` is passed as a function
* specialization constant, customized per unique pipeline state permutation.
*
* For Compute shaders, this offset is always zero, but this needs setting as
     * it is expected as part of the common Metal shader header. */
int MTL_uniform_buffer_base_index = 0;
[values setConstantValue:&MTL_uniform_buffer_base_index
type:MTLDataTypeInt
withName:@"MTL_uniform_buffer_base_index"];
/* TODO: SSBO binding base index. */
/* Compile compute function. */
NSError *error = nullptr;
id<MTLFunction> compute_function = [shader_library_compute_
newFunctionWithName:compute_function_name_
constantValues:values
error:&error];
if (error) {
NSLog(@"Compile Error - Metal Shader compute function, error %@", error);
/* Only exit out if genuine error and not warning */
if ([[error localizedDescription] rangeOfString:@"Compilation succeeded"].location ==
NSNotFound) {
BLI_assert(false);
return false;
}
}
/* Compile PSO. */
id<MTLComputePipelineState> pso = [ctx->device
newComputePipelineStateWithFunction:compute_function
error:&error];
if (error) {
NSLog(@"Failed to create PSO for compute shader: %s error %@\n", this->name, error);
BLI_assert(false);
return false;
}
else if (!pso) {
NSLog(@"Failed to create PSO for compute shader: %s, but no error was provided!\n",
this->name);
BLI_assert(false);
return false;
}
else {
#ifndef NDEBUG
NSLog(@"Successfully compiled compute PSO for shader: %s (Metal Context: %p)\n",
this->name,
ctx);
#endif
}
/* Gather reflection data and create MTLComputePipelineStateInstance to store results. */
compute_pso_instance_.compute = [compute_function retain];
compute_pso_instance_.pso = [pso retain];
compute_pso_instance_.base_uniform_buffer_index = MTL_uniform_buffer_base_index;
/* TODO: Add SSBO base buffer index support. */
compute_pso_instance_.base_ssbo_buffer_index = -1;
}
return true;
}
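
/* Editor's note: `MTL_uniform_buffer_base_index` above is supplied as a function specialization
 * constant, following the same pattern as the shader-side constants shown later in this patch
 * (e.g. `MTL_storage_buffer_base_index [[function_constant(26)]]`). A minimal shader-side sketch,
 * with an illustrative (not actual) constant index:
 *
 *   constant int MTL_uniform_buffer_base_index [[function_constant(0)]];
 */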
/** \} */
/* -------------------------------------------------------------------- */

View File

@ -355,6 +355,14 @@ struct MSLFragmentOutputAttribute {
}
};
struct MSLSharedMemoryBlock {
/* e.g. shared vec4 color_cache[cache_size][cache_size]; */
std::string type_name;
std::string varname;
bool is_array;
  std::string array_decl; /* String containing the array declaration, e.g. [cache_size][cache_size]. */
};
class MSLGeneratorInterface {
static char *msl_patch_default;
@ -375,6 +383,8 @@ class MSLGeneratorInterface {
blender::Vector<MSLVertexOutputAttribute> vertex_output_varyings_tf;
/* Clip Distances. */
blender::Vector<std::string> clip_distances;
/* Shared Memory Blocks. */
blender::Vector<MSLSharedMemoryBlock> shared_memory_blocks;
/** GL Global usage. */
/* Whether GL position is used, or an alternative vertex output should be the default. */
@ -397,12 +407,20 @@ class MSLGeneratorInterface {
bool uses_mtl_array_index_;
bool uses_transform_feedback;
bool uses_barycentrics;
/* Compute shader global variables. */
bool uses_gl_GlobalInvocationID;
bool uses_gl_WorkGroupSize;
bool uses_gl_WorkGroupID;
bool uses_gl_NumWorkGroups;
bool uses_gl_LocalInvocationIndex;
bool uses_gl_LocalInvocationID;
/* Parameters. */
shader::DepthWrite depth_write;
/* Shader buffer bind indices for argument buffers. */
int sampler_argument_buffer_bind_index[2] = {-1, -1};
/* Shader buffer bind indices for argument buffers per shader stage.
* NOTE: Compute stage will re-use index 0. */
int sampler_argument_buffer_bind_index[3] = {-1, -1, -1};
/*** SSBO Vertex fetch mode. ***/
/* Indicates whether to pass in Vertex Buffer's as a regular buffers instead of using vertex
@ -453,8 +471,10 @@ class MSLGeneratorInterface {
std::string generate_msl_fragment_out_struct();
std::string generate_msl_vertex_inputs_string();
std::string generate_msl_fragment_inputs_string();
std::string generate_msl_compute_inputs_string();
std::string generate_msl_vertex_entry_stub();
std::string generate_msl_fragment_entry_stub();
std::string generate_msl_compute_entry_stub();
std::string generate_msl_global_uniform_population(ShaderStage stage);
std::string generate_ubo_block_macro_chain(MSLUniformBlock block);
std::string generate_msl_uniform_block_population(ShaderStage stage);
@ -482,13 +502,31 @@ class MSLGeneratorInterface {
MEM_CXX_CLASS_ALLOC_FUNCS("MSLGeneratorInterface");
};
inline std::string get_stage_class_name(ShaderStage stage)
inline const char *get_stage_class_name(ShaderStage stage)
{
switch (stage) {
case ShaderStage::VERTEX:
return "MTLShaderVertexImpl";
case ShaderStage::FRAGMENT:
return "MTLShaderFragmentImpl";
case ShaderStage::COMPUTE:
return "MTLShaderComputeImpl";
default:
BLI_assert_unreachable();
return "";
}
return "";
}
inline const char *get_shader_stage_instance_name(ShaderStage stage)
{
switch (stage) {
case ShaderStage::VERTEX:
return "vertex_shader_instance";
case ShaderStage::FRAGMENT:
return "fragment_shader_instance";
case ShaderStage::COMPUTE:
return "compute_shader_instance";
default:
BLI_assert_unreachable();
return "";
@ -726,4 +764,26 @@ inline const char *to_string(const shader::Type &type)
}
}
inline char *next_symbol_in_range(char *begin, char *end, char symbol)
{
for (char *a = begin; a < end; a++) {
if (*a == symbol) {
return a;
}
}
return nullptr;
}
inline char *next_word_in_range(char *begin, char *end)
{
for (char *a = begin; a < end; a++) {
char chr = *a;
if ((chr >= 'a' && chr <= 'z') || (chr >= 'A' && chr <= 'Z') || (chr >= '0' && chr <= '9') ||
(chr == '_')) {
return a;
}
}
return nullptr;
}
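
/* Usage sketch (editorial, not the actual shared-memory-block parser): given a span of GLSL
 * source positioned at a `shared` declaration, the helpers above can step past the keyword and
 * land on the following type token. */
inline char *skip_keyword_to_type_example(char *begin, char *end)
{
  /* Find the whitespace terminating the keyword, then the start of the next word. */
  char *space = next_symbol_in_range(begin, end, ' ');
  return (space != nullptr) ? next_word_in_range(space, end) : nullptr;
}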
} // namespace blender::gpu

File diff suppressed because it is too large.

View File

@ -65,12 +65,13 @@ namespace blender::gpu {
* information to a specified buffer, and is unique to the shader's resource interface.
*/
enum class ShaderStage : uint32_t {
enum class ShaderStage : uint8_t {
VERTEX = 1 << 0,
FRAGMENT = 1 << 1,
BOTH = (ShaderStage::VERTEX | ShaderStage::FRAGMENT),
  COMPUTE = 1 << 2,
ANY = (ShaderStage::VERTEX | ShaderStage::FRAGMENT | ShaderStage::COMPUTE),
};
ENUM_OPERATORS(ShaderStage, ShaderStage::BOTH);
ENUM_OPERATORS(ShaderStage, ShaderStage::ANY);
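
/* Usage sketch (editorial): with `ENUM_OPERATORS`, stage masks combine and test as bit-flags,
 * e.g.:
 *
 *   ShaderStage mask = ShaderStage::VERTEX | ShaderStage::COMPUTE;
 *   bool has_compute = bool(mask & ShaderStage::COMPUTE);
 */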
inline uint get_shader_stage_index(ShaderStage stage)
{
@ -79,6 +80,8 @@ inline uint get_shader_stage_index(ShaderStage stage)
return 0;
case ShaderStage::FRAGMENT:
return 1;
case ShaderStage::COMPUTE:
return 2;
default:
BLI_assert_unreachable();
return 0;
@ -182,8 +185,7 @@ class MTLShaderInterface : public ShaderInterface {
/* Whether argument buffers are used for sampler bindings. */
bool sampler_use_argument_buffer_;
int sampler_argument_buffer_bind_index_vert_;
int sampler_argument_buffer_bind_index_frag_;
int sampler_argument_buffer_bind_index_[3];
/* Attribute Mask. */
uint32_t enabled_attribute_mask_;
@ -206,7 +208,7 @@ class MTLShaderInterface : public ShaderInterface {
uint32_t add_uniform_block(uint32_t name_offset,
uint32_t buffer_index,
uint32_t size,
ShaderStage stage_mask = ShaderStage::BOTH);
ShaderStage stage_mask = ShaderStage::ANY);
void add_uniform(uint32_t name_offset, eMTLDataType type, int array_len = 1);
void add_texture(uint32_t name_offset,
uint32_t texture_slot,
@ -219,7 +221,8 @@ class MTLShaderInterface : public ShaderInterface {
void map_builtins();
void set_sampler_properties(bool use_argument_buffer,
uint32_t argument_buffer_bind_index_vert,
uint32_t argument_buffer_bind_index_frag);
uint32_t argument_buffer_bind_index_frag,
uint32_t argument_buffer_bind_index_compute);
/* Prepare #ShaderInput interface for binding resolution. */
void prepare_common_shader_inputs();
@ -242,8 +245,8 @@ class MTLShaderInterface : public ShaderInterface {
const MTLShaderTexture &get_texture(uint index) const;
uint32_t get_total_textures() const;
uint32_t get_max_texture_index() const;
bool get_use_argument_buffer_for_samplers(int *vertex_arg_buffer_bind_index,
int *fragment_arg_buffer_bind_index) const;
bool uses_argument_buffer_for_samplers() const;
int get_argument_buffer_bind_index(ShaderStage stage) const;
/* Fetch Attributes. */
const MTLShaderInputAttribute &get_attribute(uint index) const;

View File

@ -62,8 +62,9 @@ void MTLShaderInterface::init()
enabled_attribute_mask_ = 0;
total_vert_stride_ = 0;
sampler_use_argument_buffer_ = false;
sampler_argument_buffer_bind_index_vert_ = -1;
sampler_argument_buffer_bind_index_frag_ = -1;
for (int i = 0; i < ARRAY_SIZE(sampler_argument_buffer_bind_index_); i++) {
sampler_argument_buffer_bind_index_[i] = -1;
}
/* NULL initialize uniform location markers for builtins. */
for (const int u : IndexRange(GPU_NUM_UNIFORMS)) {
@ -121,7 +122,7 @@ uint32_t MTLShaderInterface::add_uniform_block(uint32_t name_offset,
uni_block.buffer_index = buffer_index;
uni_block.size = size;
uni_block.current_offset = 0;
uni_block.stage_mask = ShaderStage::BOTH;
uni_block.stage_mask = ShaderStage::ANY;
max_uniformbuf_index_ = max_ii(max_uniformbuf_index_, buffer_index);
return (total_uniform_blocks_++);
}
@ -135,7 +136,7 @@ void MTLShaderInterface::add_push_constant_block(uint32_t name_offset)
push_constant_block_.size = 0;
push_constant_block_.current_offset = 0;
push_constant_block_.stage_mask = ShaderStage::BOTH;
push_constant_block_.stage_mask = ShaderStage::ANY;
}
void MTLShaderInterface::add_uniform(uint32_t name_offset, eMTLDataType type, int array_len)
@ -367,11 +368,16 @@ void MTLShaderInterface::prepare_common_shader_inputs()
void MTLShaderInterface::set_sampler_properties(bool use_argument_buffer,
uint32_t argument_buffer_bind_index_vert,
uint32_t argument_buffer_bind_index_frag)
uint32_t argument_buffer_bind_index_frag,
uint32_t argument_buffer_bind_index_compute)
{
sampler_use_argument_buffer_ = use_argument_buffer;
sampler_argument_buffer_bind_index_vert_ = argument_buffer_bind_index_vert;
sampler_argument_buffer_bind_index_frag_ = argument_buffer_bind_index_frag;
sampler_argument_buffer_bind_index_[get_shader_stage_index(ShaderStage::VERTEX)] =
argument_buffer_bind_index_vert;
sampler_argument_buffer_bind_index_[get_shader_stage_index(ShaderStage::FRAGMENT)] =
argument_buffer_bind_index_frag;
sampler_argument_buffer_bind_index_[get_shader_stage_index(ShaderStage::COMPUTE)] =
argument_buffer_bind_index_compute;
}
/* Attributes. */
@ -461,16 +467,16 @@ uint32_t MTLShaderInterface::get_max_texture_index() const
return max_texture_index_;
}
bool MTLShaderInterface::get_use_argument_buffer_for_samplers(
int *vertex_arg_buffer_bind_index, int *fragment_arg_buffer_bind_index) const
bool MTLShaderInterface::uses_argument_buffer_for_samplers() const
{
/* Returns argument buffer binding slot for each shader stage.
* The exact bind slot may be different, as each stage has different buffer inputs. */
*vertex_arg_buffer_bind_index = sampler_argument_buffer_bind_index_vert_;
*fragment_arg_buffer_bind_index = sampler_argument_buffer_bind_index_frag_;
return sampler_use_argument_buffer_;
}
int MTLShaderInterface::get_argument_buffer_bind_index(ShaderStage stage) const
{
return sampler_argument_buffer_bind_index_[get_shader_stage_index(stage)];
}
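
/* Usage sketch (editorial): callers query the flag and the per-stage bind slot separately,
 * mirroring the compute binding path earlier in this patch, e.g.:
 *
 *   if (shader_interface->uses_argument_buffer_for_samplers()) {
 *     int arg_buffer_bind_index = shader_interface->get_argument_buffer_bind_index(
 *         ShaderStage::COMPUTE);
 *     ...
 *   }
 */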
id<MTLArgumentEncoder> MTLShaderInterface::find_argument_encoder(int buffer_index) const
{
id encoder = nil;

View File

@ -575,14 +575,6 @@ void MTLStateManager::issue_barrier(eGPUBarrier barrier_bits)
MTLContext *ctx = reinterpret_cast<MTLContext *>(GPU_context_active_get());
BLI_assert(ctx);
/* Apple Silicon does not support memory barriers.
* We do not currently need these due to implicit API guarantees.
* NOTE(Metal): MTLFence/MTLEvent may be required to synchronize work if
* untracked resources are ever used. */
if ([ctx->device hasUnifiedMemory]) {
return;
}
ctx->main_command_buffer.insert_memory_barrier(barrier_bits, before_stages, after_stages);
}

View File

@ -722,10 +722,13 @@ void gpu::MTLTexture::update_sub(
((ctx->pipeline_state.unpack_row_length == 0) ?
extent[0] :
ctx->pipeline_state.unpack_row_length)};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:staging_buffer offset:staging_buffer_offset atIndex:1];
[compute_encoder setTexture:texture_handle atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(staging_buffer, staging_buffer_offset, 1);
cs.bind_compute_texture(texture_handle, 0);
[compute_encoder
dispatchThreads:MTLSizeMake(extent[0], 1, 1) /* Width, Height, Layer */
threadsPerThreadgroup:MTLSizeMake(64, 1, 1)];
@ -739,10 +742,13 @@ void gpu::MTLTexture::update_sub(
((ctx->pipeline_state.unpack_row_length == 0) ?
extent[0] :
ctx->pipeline_state.unpack_row_length)};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:staging_buffer offset:staging_buffer_offset atIndex:1];
[compute_encoder setTexture:texture_handle atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(staging_buffer, staging_buffer_offset, 1);
cs.bind_compute_texture(texture_handle, 0);
[compute_encoder
dispatchThreads:MTLSizeMake(extent[0], extent[1], 1) /* Width, layers, nil */
threadsPerThreadgroup:MTLSizeMake(8, 8, 1)];
@ -796,10 +802,13 @@ void gpu::MTLTexture::update_sub(
((ctx->pipeline_state.unpack_row_length == 0) ?
extent[0] :
ctx->pipeline_state.unpack_row_length)};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:staging_buffer offset:staging_buffer_offset atIndex:1];
[compute_encoder setTexture:texture_handle atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(staging_buffer, staging_buffer_offset, 1);
cs.bind_compute_texture(texture_handle, 0);
[compute_encoder
dispatchThreads:MTLSizeMake(
extent[0], extent[1], 1) /* Width, Height, Layer */
@ -814,10 +823,13 @@ void gpu::MTLTexture::update_sub(
((ctx->pipeline_state.unpack_row_length == 0) ?
extent[0] :
ctx->pipeline_state.unpack_row_length)};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:staging_buffer offset:staging_buffer_offset atIndex:1];
[compute_encoder setTexture:texture_handle atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(staging_buffer, staging_buffer_offset, 1);
cs.bind_compute_texture(texture_handle, 0);
[compute_encoder dispatchThreads:MTLSizeMake(extent[0],
extent[1],
extent[2]) /* Width, Height, Layer */
@ -854,10 +866,13 @@ void gpu::MTLTexture::update_sub(
((ctx->pipeline_state.unpack_row_length == 0) ?
extent[0] :
ctx->pipeline_state.unpack_row_length)};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:staging_buffer offset:staging_buffer_offset atIndex:1];
[compute_encoder setTexture:texture_handle atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(staging_buffer, staging_buffer_offset, 1);
cs.bind_compute_texture(texture_handle, 0);
[compute_encoder
dispatchThreads:MTLSizeMake(
extent[0], extent[1], extent[2]) /* Width, Height, Depth */
@ -1521,10 +1536,13 @@ void gpu::MTLTexture::read_internal(int mip,
{width, height, 1},
{x_off, y_off, 0},
};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:destination_buffer offset:0 atIndex:1];
[compute_encoder setTexture:read_texture atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(destination_buffer, 0, 1);
cs.bind_compute_texture(read_texture, 0);
[compute_encoder dispatchThreads:MTLSizeMake(width, height, 1) /* Width, Height, Layer */
threadsPerThreadgroup:MTLSizeMake(8, 8, 1)];
copy_successful = true;
@ -1568,10 +1586,13 @@ void gpu::MTLTexture::read_internal(int mip,
{width, height, depth},
{x_off, y_off, z_off},
};
[compute_encoder setComputePipelineState:pso];
[compute_encoder setBytes:&params length:sizeof(params) atIndex:0];
[compute_encoder setBuffer:destination_buffer offset:0 atIndex:1];
[compute_encoder setTexture:read_texture atIndex:0];
/* Bind resources via compute state for optimal state caching performance. */
MTLComputeState &cs = ctx->main_command_buffer.get_compute_state();
cs.bind_pso(pso);
cs.bind_compute_bytes(&params, sizeof(params), 0);
cs.bind_compute_buffer(destination_buffer, 0, 1);
cs.bind_compute_texture(read_texture, 0);
[compute_encoder
dispatchThreads:MTLSizeMake(width, height, depth) /* Width, Height, Layer */
threadsPerThreadgroup:MTLSizeMake(8, 8, 1)];
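
/* Editor's sketch (not part of the patch): `MTLComputeState` binding calls like the ones above
 * are assumed to cache the currently bound resources so redundant re-binds on the compute
 * encoder can be skipped. A minimal illustration of that idea, with hypothetical member names
 * that do not mirror the real MTLComputeState implementation: */
struct ExampleComputeBindingCache {
  id<MTLBuffer> bound_buffers[32] = {nil};
  NSUInteger bound_offsets[32] = {0};

  void bind_buffer(id<MTLComputeCommandEncoder> enc,
                   id<MTLBuffer> buf,
                   NSUInteger offset,
                   int slot)
  {
    /* Only issue the Metal bind call when the slot contents actually change. */
    if (bound_buffers[slot] != buf || bound_offsets[slot] != offset) {
      [enc setBuffer:buf offset:offset atIndex:slot];
      bound_buffers[slot] = buf;
      bound_offsets[slot] = offset;
    }
  }
};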

View File

@ -58,6 +58,9 @@ constant int MTL_clip_distance_enabled3 [[function_constant(23)]];
constant int MTL_clip_distance_enabled4 [[function_constant(24)]];
constant int MTL_clip_distance_enabled5 [[function_constant(25)]];
/* Compute and SSBOs. */
constant int MTL_storage_buffer_base_index [[function_constant(26)]];
/** Internal attribute conversion functionality. */
/* Following descriptions in mtl_shader.hh, Metal only supports some implicit
* attribute type conversions. These conversions occur when there is a difference

View File

@ -59,9 +59,122 @@ using uvec4 = uint4;
# define uniform
#endif
/* Compute decorators. */
#define TG threadgroup
#define barrier() threadgroup_barrier(mem_flags::mem_threadgroup)
#ifdef MTL_USE_WORKGROUP_SIZE
/* Compute workgroup size. */
struct constexp_uvec3 {
  /* Type union to cover all syntax accessors:
   * .x, .y, .z, .xy, .xyz
   * Other swizzle orderings are not supported. */
union {
struct {
uint x, y, z;
};
struct {
uint2 xy;
};
uint3 xyz;
};
constexpr constexp_uvec3(uint _x, uint _y, uint _z) : x(_x), y(_y), z(_z)
{
}
constexpr uint operator[](int i)
{
      /* NOTE: Need to switch on each element value, as an array accessor triggers a
       * non-constant sizing error. This will be statically evaluated at compile time. */
switch (i) {
case 0:
return x;
case 1:
return y;
case 2:
return z;
default:
return 0;
}
}
inline operator uint3() const
{
return xyz;
}
};
constexpr constexp_uvec3 __internal_workgroupsize_get()
{
return constexp_uvec3(MTL_WORKGROUP_SIZE_X, MTL_WORKGROUP_SIZE_Y, MTL_WORKGROUP_SIZE_Z);
}
# define gl_WorkGroupSize __internal_workgroupsize_get()
#endif
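
/* Editor's sketch (not part of the patch): with the emulated `gl_WorkGroupSize` above, a
 * flattened local index can be computed the same way GLSL defines `gl_LocalInvocationIndex`.
 * Guarded so it only exists when the workgroup-size macros are defined; illustrative only. */
#ifdef MTL_USE_WORKGROUP_SIZE
inline uint example_flatten_local_index(uint3 local_id)
{
  return local_id.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + local_id.y * gl_WorkGroupSize.x +
         local_id.x;
}
#endif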
/** Shader atomics:
 * To emulate GLSL-style atomic operations, where variables that are not explicitly declared
 * atomic can still be used in atomic operations, the pointer is cast to an atomic type so that
 * loads and stores follow the atomic_load/atomic_store idioms.
 *
 * NOTE: The address space cannot be hoisted into the template declaration, so these wrappers
 * must be declared once per relevant address space. */
/* Threadgroup memory. */
template<typename T> T atomicMax(threadgroup T &mem, T data)
{
return atomic_fetch_max_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicMin(threadgroup T &mem, T data)
{
return atomic_fetch_min_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAdd(threadgroup T &mem, T data)
{
return atomic_fetch_add_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicSub(threadgroup T &mem, T data)
{
return atomic_fetch_sub_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicOr(threadgroup T &mem, T data)
{
return atomic_fetch_or_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicXor(threadgroup T &mem, T data)
{
return atomic_fetch_xor_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
/* Device memory. */
template<typename T> T atomicMax(device T &mem, T data)
{
  return atomic_fetch_max_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicMin(device T &mem, T data)
{
  return atomic_fetch_min_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAdd(device T &mem, T data)
{
  return atomic_fetch_add_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicSub(device T &mem, T data)
{
  return atomic_fetch_sub_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicOr(device T &mem, T data)
{
  return atomic_fetch_or_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicXor(device T &mem, T data)
{
  return atomic_fetch_xor_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
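
/* Editor's sketch (not part of the patch): with the wrappers above, a GLSL-style atomic on a
 * plain (non-atomic) `shared` variable translates directly. Illustrative kernel only. */
kernel void example_threadgroup_count(device uint *out [[buffer(0)]],
                                      uint lid [[thread_index_in_threadgroup]])
{
  /* Mirrors GLSL `shared uint counter;` - not declared atomic. */
  threadgroup uint counter;
  if (lid == 0u) {
    counter = 0u;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  atomicAdd(counter, 1u); /* Resolves to the threadgroup overload above. */
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (lid == 0u) {
    out[0] = counter;
  }
}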
/* Used to replace `out` in function parameters with a thread-local reference.
 * Shortened to avoid expanding the GLSL source string. */
#define THD thread
#define OUT(type, name, array) thread type(&name)[array]
/* Generate wrapper structs for combined texture and sampler type. */
#ifdef USE_ARGUMENT_BUFFER_FOR_SAMPLERS

View File

@ -49,7 +49,7 @@ class FilterOperation : public NodeOperation {
GPUShader *shader = shader_manager().get(get_shader_name());
GPU_shader_bind(shader);
GPU_shader_uniform_mat3_as_mat4(shader, "kernel", get_filter_kernel().ptr());
GPU_shader_uniform_mat3_as_mat4(shader, "ukernel", get_filter_kernel().ptr());
const Result &input_image = get_input("Image");
input_image.bind_as_texture(shader, "input_tx");