Metal: MTLBatch and MTLDrawList implementation.

The MTLBatch and MTLDrawList implementation enables use of the Metal Viewport for the UI and Workbench. It includes vertex descriptor caching and SSBO Vertex Fetch mode draw-call submission.

Authored by Apple: Michael Parkin-White

Ref T96261

Reviewed By: fclem

Maniphest Tasks: T96261

Differential Revision: https://developer.blender.org/D16101
This commit is contained in:
Jason Fielder 2022-10-18 12:18:25 +02:00 committed by Clément Foucault
parent e47bf05e85
commit aed301704a
Notes: blender-bot 2023-08-22 05:12:20 +02:00
Referenced by issue #96261, Metal Viewport
Referenced by issue #111370, Metal: Assert is triggered when drawing a Batch with an attribute format different from the type used in the Shader
16 changed files with 1567 additions and 87 deletions
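For context, a minimal caller-side sketch (illustrative only, not part of this diff; vbo, ibo and shader are assumed to already exist) of the generic GPU API path that now dispatches into MTLBatch on the Metal backend:

GPUBatch *batch = GPU_batch_create(GPU_PRIM_TRIS, vbo, ibo);
GPU_batch_set_shader(batch, shader);
GPU_batch_draw(batch); /* Resolves to MTLBatch::draw() -> bind() -> draw_advanced() below. */
GPU_batch_discard(batch);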

View File

@@ -108,8 +108,8 @@ void main()
vec3 in_pos0 = vertex_fetch_attribute(base_vertex_id, pos, vec3);
vec3 in_pos1 = vertex_fetch_attribute(base_vertex_id + 1, pos, vec3);
vec4 out_pos0 = ProjectionMatrix * (ViewMatrix * vec4(in_pos0, 1.0));
vec4 out_pos1 = ProjectionMatrix * (ViewMatrix * vec4(in_pos1, 1.0));
vec4 out_pos0 = drw_view.winmat * (drw_view.viewmat * vec4(in_pos0, 1.0));
vec4 out_pos1 = drw_view.winmat * (drw_view.viewmat * vec4(in_pos1, 1.0));
/* Final calculations required for Geometry Shader alternative.
* We need to calculate values for each vertex position to correctly determine the final output
@@ -130,28 +130,28 @@ void main()
float line_size = float(lineThickness) * sizePixel;
if (quad_vertex_id == 0) {
view_clipping_distances(out_pos0);
view_clipping_distances(out_pos0.xyz);
interp.color = finalColor_geom[0];
t = edge_dir * (line_size * (is_persp ? out_pos0.w : 1.0));
gl_Position = out_pos0 + vec4(t, 0.0, 0.0);
}
else if (quad_vertex_id == 1 || quad_vertex_id == 3) {
view_clipping_distances(out_pos0);
view_clipping_distances(out_pos0.xyz);
interp.color = finalColor_geom[0];
t = edge_dir * (line_size * (is_persp ? out_pos0.w : 1.0));
gl_Position = out_pos0 - vec4(t, 0.0, 0.0);
}
else if (quad_vertex_id == 2 || quad_vertex_id == 5) {
view_clipping_distances(out_pos1);
view_clipping_distances(out_pos1.xyz);
interp.color = finalColor_geom[1];
t = edge_dir * (line_size * (is_persp ? out_pos1.w : 1.0));
gl_Position = out_pos1 + vec4(t, 0.0, 0.0);
}
else if (quad_vertex_id == 4) {
view_clipping_distances(out_pos1);
view_clipping_distances(out_pos1.xyz);
interp.color = finalColor_geom[1];
t = edge_dir * (line_size * (is_persp ? out_pos1.w : 1.0));

View File

@@ -1714,23 +1714,32 @@ static void drw_shgroup_init(DRWShadingGroup *shgroup, GPUShader *shader)
}
#ifdef DEBUG
int debug_print_location = GPU_shader_get_builtin_ssbo(shader, GPU_STORAGE_BUFFER_DEBUG_PRINT);
if (debug_print_location != -1) {
GPUStorageBuf *buf = drw_debug_gpu_print_buf_get();
drw_shgroup_uniform_create_ex(
shgroup, debug_print_location, DRW_UNIFORM_STORAGE_BLOCK, buf, GPU_SAMPLER_DEFAULT, 0, 1);
/* TODO(Metal): Support Shader debug print.
* This is not currently supported by Metal Backend. */
if (GPU_backend_get_type() != GPU_BACKEND_METAL) {
int debug_print_location = GPU_shader_get_builtin_ssbo(shader, GPU_STORAGE_BUFFER_DEBUG_PRINT);
if (debug_print_location != -1) {
GPUStorageBuf *buf = drw_debug_gpu_print_buf_get();
drw_shgroup_uniform_create_ex(shgroup,
debug_print_location,
DRW_UNIFORM_STORAGE_BLOCK,
buf,
GPU_SAMPLER_DEFAULT,
0,
1);
# ifndef DISABLE_DEBUG_SHADER_PRINT_BARRIER
/* Add a barrier to allow multiple shader writing to the same buffer. */
DRW_shgroup_barrier(shgroup, GPU_BARRIER_SHADER_STORAGE);
/* Add a barrier to allow multiple shader writing to the same buffer. */
DRW_shgroup_barrier(shgroup, GPU_BARRIER_SHADER_STORAGE);
# endif
}
}
int debug_draw_location = GPU_shader_get_builtin_ssbo(shader, GPU_STORAGE_BUFFER_DEBUG_VERTS);
if (debug_draw_location != -1) {
GPUStorageBuf *buf = drw_debug_gpu_draw_buf_get();
drw_shgroup_uniform_create_ex(
shgroup, debug_draw_location, DRW_UNIFORM_STORAGE_BLOCK, buf, GPU_SAMPLER_DEFAULT, 0, 1);
/* NOTE(fclem): No barrier as ordering is not important. */
int debug_draw_location = GPU_shader_get_builtin_ssbo(shader, GPU_STORAGE_BUFFER_DEBUG_VERTS);
if (debug_draw_location != -1) {
GPUStorageBuf *buf = drw_debug_gpu_draw_buf_get();
drw_shgroup_uniform_create_ex(
shgroup, debug_draw_location, DRW_UNIFORM_STORAGE_BLOCK, buf, GPU_SAMPLER_DEFAULT, 0, 1);
/* NOTE(fclem): No barrier as ordering is not important. */
}
}
#endif

View File

@@ -186,9 +186,11 @@ set(OPENGL_SRC
set(METAL_SRC
metal/mtl_backend.mm
metal/mtl_batch.mm
metal/mtl_command_buffer.mm
metal/mtl_context.mm
metal/mtl_debug.mm
metal/mtl_drawlist.mm
metal/mtl_framebuffer.mm
metal/mtl_immediate.mm
metal/mtl_index_buffer.mm

View File

@@ -431,15 +431,16 @@ inline bool validate_data_format(eGPUTextureFormat tex_format, eGPUDataFormat da
case GPU_DEPTH_COMPONENT24:
case GPU_DEPTH_COMPONENT16:
case GPU_DEPTH_COMPONENT32F:
return data_format == GPU_DATA_FLOAT;
return ELEM(data_format, GPU_DATA_FLOAT, GPU_DATA_UINT);
case GPU_DEPTH24_STENCIL8:
case GPU_DEPTH32F_STENCIL8:
return data_format == GPU_DATA_UINT_24_8;
return ELEM(data_format, GPU_DATA_UINT_24_8, GPU_DATA_UINT);
case GPU_R8UI:
case GPU_R16UI:
case GPU_RG16UI:
case GPU_R32UI:
return data_format == GPU_DATA_UINT;
case GPU_R32I:
case GPU_RG16I:
case GPU_R16I:
return data_format == GPU_DATA_INT;
@@ -453,6 +454,8 @@ inline bool validate_data_format(eGPUTextureFormat tex_format, eGPUDataFormat da
return ELEM(data_format, GPU_DATA_2_10_10_10_REV, GPU_DATA_FLOAT);
case GPU_R11F_G11F_B10F:
return ELEM(data_format, GPU_DATA_10_11_11_REV, GPU_DATA_FLOAT);
case GPU_RGBA16F:
return ELEM(data_format, GPU_DATA_HALF_FLOAT, GPU_DATA_FLOAT);
default:
return data_format == GPU_DATA_FLOAT;
}
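The relaxed validation above means depth and depth-stencil textures can now also be read back or cleared as plain unsigned integer data. An illustrative usage sketch (depth_tx is assumed to be an existing GPU_DEPTH24_STENCIL8 texture):

/* Now accepted by validate_data_format(). */
uint *pixels = static_cast<uint *>(GPU_texture_read(depth_tx, GPU_DATA_UINT, 0));
MEM_freeN(pixels);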

View File

@@ -147,6 +147,10 @@ static void gpu_viewport_textures_create(GPUViewport *viewport)
if (viewport->depth_tx == NULL) {
viewport->depth_tx = GPU_texture_create_2d(
"dtxl_depth", UNPACK2(size), 1, GPU_DEPTH24_STENCIL8, NULL);
if (GPU_clear_viewport_workaround()) {
static int depth_clear = 0;
GPU_texture_clear(viewport->depth_tx, GPU_DATA_UINT_24_8, &depth_clear);
}
}
if (!viewport->depth_tx || !viewport->color_render_tx[0] || !viewport->color_overlay_tx[0]) {

View File

@@ -47,13 +47,11 @@ Context *MTLBackend::context_alloc(void *ghost_window, void *ghost_context)
Batch *MTLBackend::batch_alloc()
{
/* TODO(Metal): Full MTLBatch implementation. */
return new MTLBatch();
};
DrawList *MTLBackend::drawlist_alloc(int list_length)
{
/* TODO(Metal): Full MTLDrawList implementation. */
return new MTLDrawList(list_length);
};
@@ -420,6 +418,7 @@ void MTLBackend::capabilities_init(MTLContext *ctx)
GCaps.depth_blitting_workaround = false;
GCaps.use_main_context_workaround = false;
GCaps.broken_amd_driver = false;
GCaps.clear_viewport_workaround = true;
/* Metal related workarounds. */
/* Minimum per-vertex stride is 4 bytes in Metal.

View File

@@ -10,31 +10,126 @@
#pragma once
#include "MEM_guardedalloc.h"
#include "gpu_batch_private.hh"
#include "mtl_index_buffer.hh"
#include "mtl_primitive.hh"
#include "mtl_shader.hh"
#include "mtl_vertex_buffer.hh"
namespace blender {
namespace gpu {
namespace blender::gpu {
class MTLContext;
class MTLShaderInterface;
#define GPU_VAO_STATIC_LEN 64
struct VertexBufferID {
uint32_t id : 16;
uint32_t is_instance : 15;
uint32_t used : 1;
};
/* Pass-through MTLBatch. TODO(Metal): Implement. */
class MTLBatch : public Batch {
public:
void draw(int v_first, int v_count, int i_first, int i_count) override
{
}
/* Vertex Bind-state Caching for a given shader interface used with the Batch. */
struct VertexDescriptorShaderInterfacePair {
MTLVertexDescriptor vertex_descriptor{};
const ShaderInterface *interface = nullptr;
uint16_t attr_mask{};
int num_buffers{};
VertexBufferID bufferIds[GPU_BATCH_VBO_MAX_LEN] = {};
/* Cache life index compares a cache entry with the active MTLBatch state.
* This is initially set to the cache life index of MTLBatch. If the batch has been modified,
* this index is incremented to cheaply invalidate existing cache entries. */
uint32_t cache_life_index = 0;
};
class MTLVertexDescriptorCache {
private:
MTLBatch *batch_;
VertexDescriptorShaderInterfacePair cache_[GPU_VAO_STATIC_LEN] = {};
MTLContext *cache_context_ = nullptr;
uint32_t cache_life_index_ = 0;
public:
MTLVertexDescriptorCache(MTLBatch *batch) : batch_(batch){};
VertexDescriptorShaderInterfacePair *find(const ShaderInterface *interface);
bool insert(VertexDescriptorShaderInterfacePair &data);
private:
void vertex_descriptor_cache_init(MTLContext *ctx);
void vertex_descriptor_cache_clear();
void vertex_descriptor_cache_ensure();
};
private:
MTLShader *active_shader_ = nullptr;
bool shader_in_use_ = false;
MTLVertexDescriptorCache vao_cache = {this};
/* Topology emulation. */
gpu::MTLBuffer *emulated_topology_buffer_ = nullptr;
GPUPrimType emulated_topology_type_;
uint32_t topology_buffer_input_v_count_ = 0;
uint32_t topology_buffer_output_v_count_ = 0;
public:
MTLBatch(){};
~MTLBatch(){};
void draw(int v_first, int v_count, int i_first, int i_count) override;
void draw_indirect(GPUStorageBuf *indirect_buf, intptr_t offset) override
{
/* TODO(Metal): Support indirect draw commands. */
}
void multi_draw_indirect(GPUStorageBuf *indirect_buf,
int count,
intptr_t offset,
intptr_t stride) override
{
/* TODO(Metal): Support indirect draw commands. */
}
/* Returns an initialized RenderCommandEncoder for drawing if all is good.
* Otherwise, nil. */
id<MTLRenderCommandEncoder> bind(uint v_first, uint v_count, uint i_first, uint i_count);
void unbind();
/* Convenience getters. */
MTLIndexBuf *elem_() const
{
return static_cast<MTLIndexBuf *>(unwrap(elem));
}
MTLVertBuf *verts_(const int index) const
{
return static_cast<MTLVertBuf *>(unwrap(verts[index]));
}
MTLVertBuf *inst_(const int index) const
{
return static_cast<MTLVertBuf *>(unwrap(inst[index]));
}
MTLShader *active_shader_get() const
{
return active_shader_;
}
private:
void shader_bind();
void draw_advanced(int v_first, int v_count, int i_first, int i_count);
int prepare_vertex_binding(MTLVertBuf *verts,
MTLRenderPipelineStateDescriptor &desc,
const MTLShaderInterface *interface,
uint16_t &attr_mask,
bool instanced);
id<MTLBuffer> get_emulated_topology_buffer(GPUPrimType &in_out_prim_type, uint32_t &v_count);
void prepare_vertex_descriptor_and_bindings(
MTLVertBuf **buffers, int &num_buffers, int v_first, int v_count, int i_first, int i_count);
MEM_CXX_CLASS_ALLOC_FUNCS("MTLBatch");
};
} // namespace gpu
} // namespace blender
} // namespace blender::gpu

View File

@@ -0,0 +1,995 @@
/** \file
* \ingroup gpu
*
* Metal implementation of GPUBatch.
*/
#include "BLI_assert.h"
#include "BLI_span.hh"
#include "BKE_global.h"
#include "GPU_common.h"
#include "gpu_batch_private.hh"
#include "gpu_shader_private.hh"
#include "mtl_batch.hh"
#include "mtl_context.hh"
#include "mtl_debug.hh"
#include "mtl_index_buffer.hh"
#include "mtl_shader.hh"
#include "mtl_vertex_buffer.hh"
#include <string>
namespace blender::gpu {
/* -------------------------------------------------------------------- */
/** \name Creation & Deletion
* \{ */
void MTLBatch::draw(int v_first, int v_count, int i_first, int i_count)
{
if (this->flag & GPU_BATCH_INVALID) {
this->shader_in_use_ = false;
}
this->draw_advanced(v_first, v_count, i_first, i_count);
}
void MTLBatch::shader_bind()
{
if (active_shader_ && active_shader_->is_valid()) {
active_shader_->bind();
shader_in_use_ = true;
}
}
void MTLBatch::MTLVertexDescriptorCache::vertex_descriptor_cache_init(MTLContext *ctx)
{
BLI_assert(ctx != nullptr);
this->vertex_descriptor_cache_clear();
cache_context_ = ctx;
}
void MTLBatch::MTLVertexDescriptorCache::vertex_descriptor_cache_clear()
{
cache_life_index_++;
cache_context_ = nullptr;
}
void MTLBatch::MTLVertexDescriptorCache::vertex_descriptor_cache_ensure()
{
if (this->cache_context_ != nullptr) {
/* Invalidate vertex descriptor bindings cache if batch has changed. */
if (batch_->flag & GPU_BATCH_DIRTY) {
batch_->flag &= ~GPU_BATCH_DIRTY;
this->vertex_descriptor_cache_clear();
}
}
/* Initialise cache if not ready. */
if (cache_context_ == nullptr) {
this->vertex_descriptor_cache_init(MTLContext::get());
}
}
MTLBatch::VertexDescriptorShaderInterfacePair *MTLBatch::MTLVertexDescriptorCache::find(
const ShaderInterface *interface)
{
this->vertex_descriptor_cache_ensure();
for (int i = 0; i < GPU_VAO_STATIC_LEN; ++i) {
if (cache_[i].interface == interface && cache_[i].cache_life_index == cache_life_index_) {
return &cache_[i];
}
}
return nullptr;
}
bool MTLBatch::MTLVertexDescriptorCache::insert(
MTLBatch::VertexDescriptorShaderInterfacePair &data)
{
vertex_descriptor_cache_ensure();
for (int i = 0; i < GPU_VAO_STATIC_LEN; ++i) {
if (cache_[i].interface == nullptr || cache_[i].cache_life_index != cache_life_index_) {
cache_[i] = data;
cache_[i].cache_life_index = cache_life_index_;
return true;
}
}
return false;
}
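For reference, a simplified sketch of how this find/insert pair is consumed (condensed from prepare_vertex_descriptor_and_bindings() further below; desc and interface come from the pipeline descriptor and the active shader):

VertexDescriptorShaderInterfacePair *cached = this->vao_cache.find(interface);
if (cached) {
  /* Cache hit: reuse the previously resolved vertex descriptor and buffer bindings. */
  desc.vertex_descriptor = cached->vertex_descriptor;
}
else {
  /* Cache miss: resolve bindings via prepare_vertex_binding(), then store the result. */
  VertexDescriptorShaderInterfacePair pair{};
  pair.interface = interface;
  /* ... resolve per-buffer bindings ... */
  this->vao_cache.insert(pair);
}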
int MTLBatch::prepare_vertex_binding(MTLVertBuf *verts,
MTLRenderPipelineStateDescriptor &desc,
const MTLShaderInterface *interface,
uint16_t &attr_mask,
bool instanced)
{
const GPUVertFormat *format = &verts->format;
/* Whether the current vertex buffer has been added to the buffer layout descriptor. */
bool buffer_added = false;
/* Per-vertex stride of current vertex buffer. */
int buffer_stride = format->stride;
/* Buffer binding index of the vertex buffer once added to the buffer layout descriptor. */
int buffer_index = -1;
int attribute_offset = 0;
if (!active_shader_->get_uses_ssbo_vertex_fetch()) {
BLI_assert(
buffer_stride >= 4 &&
"In Metal, Vertex buffer stride should be 4. SSBO Vertex fetch is not affected by this");
}
/* Iterate over GPUVertBuf vertex format and find attributes matching those in the active
* shader's interface. */
for (uint32_t a_idx = 0; a_idx < format->attr_len; a_idx++) {
const GPUVertAttr *a = &format->attrs[a_idx];
if (format->deinterleaved) {
attribute_offset += ((a_idx == 0) ? 0 : format->attrs[a_idx - 1].size) * verts->vertex_len;
buffer_stride = a->size;
}
else {
attribute_offset = a->offset;
}
/* Find attribute with the matching name. Attributes may have multiple compatible
* name aliases. */
for (uint32_t n_idx = 0; n_idx < a->name_len; n_idx++) {
const char *name = GPU_vertformat_attr_name_get(format, a, n_idx);
const ShaderInput *input = interface->attr_get(name);
if (input == nullptr || input->location == -1) {
/* The provided vertex/instance buffers have attribute data for attributes which are not needed
* by this particular shader. The shader only needs binding information for the attributes
* present in its shader interface. */
MTL_LOG_WARNING(
"MTLBatch: Could not find attribute with name '%s' (defined in active vertex format) "
"in the shader interface for shader '%s'\n",
name,
interface->get_name());
continue;
}
/* Fetch metal attribute information. */
const MTLShaderInputAttribute &mtl_attr = interface->get_attribute(input->location);
BLI_assert(mtl_attr.location >= 0);
/* Verify that the attribute location from the shader interface
* matches the attribute location returned. */
BLI_assert(mtl_attr.location == input->location);
/* Check if attribute is already present in the given slot. */
if ((~attr_mask) & (1 << mtl_attr.location)) {
MTL_LOG_INFO(
" -- [Batch] Skipping attribute with input location %d (As one is already bound)\n",
mtl_attr.location);
}
else {
/* Update attribute used-slot mask. */
attr_mask &= ~(1 << mtl_attr.location);
/* Add buffer layout entry in descriptor if it has not yet been added
* for current vertex buffer. */
if (!buffer_added) {
buffer_index = desc.vertex_descriptor.num_vert_buffers;
desc.vertex_descriptor.buffer_layouts[buffer_index].step_function =
(instanced) ? MTLVertexStepFunctionPerInstance : MTLVertexStepFunctionPerVertex;
desc.vertex_descriptor.buffer_layouts[buffer_index].step_rate = 1;
desc.vertex_descriptor.buffer_layouts[buffer_index].stride = buffer_stride;
desc.vertex_descriptor.num_vert_buffers++;
buffer_added = true;
MTL_LOG_INFO(" -- [Batch] Adding source %s buffer (Index: %d, Stride: %d)\n",
(instanced) ? "instance" : "vertex",
buffer_index,
buffer_stride);
}
else {
/* Ensure stride is correct for de-interleaved attributes. */
desc.vertex_descriptor.buffer_layouts[buffer_index].stride = buffer_stride;
}
/* Handle Matrix/Array vertex attribute types.
* Metal does not natively support these as attribute types, so we handle these cases
* by stacking together compatible types (e.g. 4xVec4 for Mat4) and combining
* the data in the shader.
* The generated Metal shader will contain a generated input binding, which reads
* in individual attributes and merges them into the desired type after vertex
* assembly. e.g. a Mat4 (Float4x4) will generate 4 Float4 attributes. */
if (a->comp_len == 16 || a->comp_len == 12 || a->comp_len == 8) {
BLI_assert_msg(
a->comp_len == 16,
"only mat4 attributes currently supported -- Not ready to handle other long "
"component length attributes yet");
/* SSBO Vertex Fetch Attribute safety checks. */
if (active_shader_->get_uses_ssbo_vertex_fetch()) {
/* When using SSBO vertex fetch, we do not need to expose split attributes,
* A matrix can be read directly as a whole block of contiguous data. */
MTLSSBOAttribute ssbo_attr(mtl_attr.index,
buffer_index,
attribute_offset,
buffer_stride,
GPU_SHADER_ATTR_TYPE_MAT4,
instanced);
active_shader_->ssbo_vertex_fetch_bind_attribute(ssbo_attr);
desc.vertex_descriptor.ssbo_attributes[desc.vertex_descriptor.num_ssbo_attributes] =
ssbo_attr;
desc.vertex_descriptor.num_ssbo_attributes++;
}
else {
/* Handle Mat4 attributes. */
if (a->comp_len == 16) {
/* Debug safety checks. */
BLI_assert_msg(mtl_attr.matrix_element_count == 4,
"mat4 type expected but there are fewer components");
BLI_assert_msg(mtl_attr.size == 16, "Expecting subtype 'vec4' with 16 bytes");
BLI_assert_msg(
mtl_attr.format == MTLVertexFormatFloat4,
"Per-attribute vertex format MUST be float4 for an input type of 'mat4'");
/* We have found the 'ROOT' attribute. A mat4 contains 4 consecutive float4 attribute
* locations we must map to. */
for (int i = 0; i < a->comp_len / 4; i++) {
desc.vertex_descriptor.attributes[mtl_attr.location + i].format =
MTLVertexFormatFloat4;
/* Data is consecutive in the buffer for the whole matrix, each float4 will shift
* the offset by 16 bytes. */
desc.vertex_descriptor.attributes[mtl_attr.location + i].offset =
attribute_offset + i * 16;
/* All source data for a matrix is in the same singular buffer. */
desc.vertex_descriptor.attributes[mtl_attr.location + i].buffer_index =
buffer_index;
/* Update total attribute count. */
desc.vertex_descriptor.num_attributes = max_ii(
mtl_attr.location + i + 1, desc.vertex_descriptor.num_attributes);
MTL_LOG_INFO("-- Sub-Attrib Location: %d, offset: %d, buffer index: %d\n",
mtl_attr.location + i,
attribute_offset + i * 16,
buffer_index);
}
MTL_LOG_INFO(
"Float4x4 attribute type added for '%s' at attribute locations: %d to %d\n",
name,
mtl_attr.location,
mtl_attr.location + 3);
}
/* Ensure we are not exceeding the attribute limit. */
BLI_assert(desc.vertex_descriptor.num_attributes <= MTL_MAX_VERTEX_INPUT_ATTRIBUTES);
}
}
else {
/* Handle Any required format conversions.
* NOTE(Metal): If there is a mis-match between the format of an attribute
* in the shader interface, and the specified format in the VertexBuffer VertexFormat,
* we need to perform a format conversion.
*
* The Metal API can perform certain conversions internally during vertex assembly:
* - Type Normalization, e.g. short2 to float2 normalized to the 0.0 to 1.0 range.
* - Type Truncation, e.g. Float4 to Float2.
* - Type Expansion, e.g. Float3 to Float4 (empty elements are filled following 0,0,0,1).
*
* Certain conversions cannot be performed however, and in these cases, we need to
* instruct the shader to generate a specialised version with a conversion routine upon
* attribute read.
* - This handles cases such as conversion between types, e.g. Integer to float without
* normalization.
*
* For more information on the supported and unsupported conversions, see:
* https://developer.apple.com/documentation/metal/mtlvertexattributedescriptor/1516081-format?language=objc
*/
MTLVertexFormat converted_format;
bool can_use_internal_conversion = mtl_convert_vertex_format(
mtl_attr.format,
(GPUVertCompType)a->comp_type,
a->comp_len,
(GPUVertFetchMode)a->fetch_mode,
&converted_format);
bool is_floating_point_format = (a->comp_type == GPU_COMP_F32);
if (can_use_internal_conversion) {
desc.vertex_descriptor.attributes[mtl_attr.location].format = converted_format;
desc.vertex_descriptor.attributes[mtl_attr.location].format_conversion_mode =
is_floating_point_format ? (GPUVertFetchMode)GPU_FETCH_FLOAT :
(GPUVertFetchMode)GPU_FETCH_INT;
BLI_assert(converted_format != MTLVertexFormatInvalid);
}
else {
/* The internal implicit conversion is not supported.
* In this case, we need to handle conversion inside the shader.
* This is handled using `format_conversion_mode`.
* `format_conversion_mode` is assigned the blender-specified fetch mode (GPU_FETCH_*).
* This then controls how a given attribute is interpreted. The data will be read
* as specified and then converted appropriately to the correct form.
*
* e.g. if `GPU_FETCH_INT_TO_FLOAT` is specified, the specialised read-routine
* in the shader will read the data as an int, and cast this to floating point
* representation. (Rather than reading the source data as float).
*
* NOTE: Even if full conversion is not supported, we may still partially perform an
* implicit conversion where possible, such as vector truncation or expansion. */
MTLVertexFormat converted_format;
bool can_convert = mtl_vertex_format_resize(
mtl_attr.format, a->comp_len, &converted_format);
desc.vertex_descriptor.attributes[mtl_attr.location].format = can_convert ?
converted_format :
mtl_attr.format;
desc.vertex_descriptor.attributes[mtl_attr.location].format_conversion_mode =
(GPUVertFetchMode)a->fetch_mode;
BLI_assert(desc.vertex_descriptor.attributes[mtl_attr.location].format !=
MTLVertexFormatInvalid);
}
desc.vertex_descriptor.attributes[mtl_attr.location].offset = attribute_offset;
desc.vertex_descriptor.attributes[mtl_attr.location].buffer_index = buffer_index;
desc.vertex_descriptor.num_attributes = ((mtl_attr.location + 1) >
desc.vertex_descriptor.num_attributes) ?
(mtl_attr.location + 1) :
desc.vertex_descriptor.num_attributes;
/* SSBO Vertex Fetch attribute bind. */
if (active_shader_->get_uses_ssbo_vertex_fetch()) {
BLI_assert_msg(desc.vertex_descriptor.attributes[mtl_attr.location].format ==
mtl_attr.format,
"SSBO Vertex Fetch does not support attribute conversion.");
MTLSSBOAttribute ssbo_attr(
mtl_attr.index,
buffer_index,
attribute_offset,
buffer_stride,
MTLShader::ssbo_vertex_type_to_attr_type(
desc.vertex_descriptor.attributes[mtl_attr.location].format),
instanced);
active_shader_->ssbo_vertex_fetch_bind_attribute(ssbo_attr);
desc.vertex_descriptor.ssbo_attributes[desc.vertex_descriptor.num_ssbo_attributes] =
ssbo_attr;
desc.vertex_descriptor.num_ssbo_attributes++;
}
/* NOTE: We set num_attributes to the maximum attribute index found; because of this,
* it is possible that we may skip over certain attributes if they were not present in the
* source GPUVertFormat. */
MTL_LOG_INFO(
" -- Batch Attribute(%d): ORIG Shader Format: %d, ORIG Vert format: %d, Vert "
"components: %d, Fetch Mode %d --> FINAL FORMAT: %d\n",
mtl_attr.location,
(int)mtl_attr.format,
(int)a->comp_type,
(int)a->comp_len,
(int)a->fetch_mode,
(int)desc.vertex_descriptor.attributes[mtl_attr.location].format);
MTL_LOG_INFO(
" -- [Batch] matching %s attribute '%s' (Attribute Index: %d, Buffer index: %d, "
"offset: %d)\n",
(instanced) ? "instance" : "vertex",
name,
mtl_attr.location,
buffer_index,
attribute_offset);
}
}
}
}
if (buffer_added) {
return buffer_index;
}
return -1;
}
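As an illustration of the conversion handling above (hypothetical attributes, not taken from this diff): a normalized byte attribute can be converted by Metal during vertex assembly, whereas a plain integer-to-float fetch has no implicit Metal equivalent and falls back to format_conversion_mode:

GPUVertFormat format{};
/* Internally convertible: maps to a normalized Metal format (e.g. MTLVertexFormatUChar4Normalized). */
GPU_vertformat_attr_add(&format, "color", GPU_COMP_U8, 4, GPU_FETCH_INT_TO_FLOAT_UNIT);
/* No implicit conversion: the specialised read routine in the shader reads the int and casts it to float. */
GPU_vertformat_attr_add(&format, "weight", GPU_COMP_I32, 1, GPU_FETCH_INT_TO_FLOAT);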
id<MTLRenderCommandEncoder> MTLBatch::bind(uint v_first, uint v_count, uint i_first, uint i_count)
{
/* Set up the draw call and render pipeline state here. This is called by every draw, but doing
* the setup here means MTLDrawList only needs to perform it a single time. */
BLI_assert(this);
/* Fetch Metal device. */
MTLContext *ctx = MTLContext::get();
if (!ctx) {
BLI_assert_msg(false, "No context available for rendering.");
return nil;
}
/* Verify Shader. */
active_shader_ = (shader) ? static_cast<MTLShader *>(unwrap(shader)) : nullptr;
if (active_shader_ == nullptr || !active_shader_->is_valid()) {
/* Skip drawing if there is no valid Metal shader.
* This will occur if the path through which the shader is prepared
* is invalid (e.g. Python without create-info), or the source shader uses a geometry pass. */
BLI_assert_msg(false, "No valid Metal shader!");
return nil;
}
/* Check if using SSBO Fetch Mode.
* This is an alternative drawing mode to geometry shaders, wherein vertex buffers
* are bound as readable (random-access) GPU buffers and certain descriptor properties
* are passed using Shader uniforms. */
bool uses_ssbo_fetch = active_shader_->get_uses_ssbo_vertex_fetch();
/* Prepare Vertex Descriptor and extract VertexBuffers to bind. */
MTLVertBuf *buffers[GPU_BATCH_VBO_MAX_LEN] = {nullptr};
int num_buffers = 0;
/* Ensure Index Buffer is ready. */
MTLIndexBuf *mtl_elem = static_cast<MTLIndexBuf *>(reinterpret_cast<IndexBuf *>(this->elem));
if (mtl_elem != NULL) {
mtl_elem->upload_data();
}
/* Populate vertex descriptor with attribute binding information.
* The vertex descriptor and buffer layout descriptors describe
* how vertex data from bound vertex buffers maps to the
* shader's input.
* A unique vertex descriptor will result in a new PipelineStateObject
* being generated for the currently bound shader. */
prepare_vertex_descriptor_and_bindings(buffers, num_buffers, v_first, v_count, i_first, i_count);
/* Prepare Vertex Buffers - Run before RenderCommandEncoder in case BlitCommandEncoder buffer
* data operations are required. */
for (int i = 0; i < num_buffers; i++) {
MTLVertBuf *buf_at_index = buffers[i];
if (buf_at_index == NULL) {
BLI_assert_msg(
false,
"Total buffer count does not match highest buffer index, could be gaps in bindings");
continue;
}
MTLVertBuf *mtlvbo = static_cast<MTLVertBuf *>(reinterpret_cast<VertBuf *>(buf_at_index));
mtlvbo->bind();
}
/* Ensure render pass is active and fetch active RenderCommandEncoder. */
id<MTLRenderCommandEncoder> rec = ctx->ensure_begin_render_pass();
/* Fetch RenderPassState to enable resource binding for active pass. */
MTLRenderPassState &rps = ctx->main_command_buffer.get_render_pass_state();
/* Debug Check: Ensure Framebuffer instance is not dirty. */
BLI_assert(!ctx->main_command_buffer.get_active_framebuffer()->get_dirty());
/* Bind Shader. */
this->shader_bind();
/* GPU debug markers. */
if (G.debug & G_DEBUG_GPU) {
[rec pushDebugGroup:[NSString stringWithFormat:@"batch_bind%@(shader: %s)",
this->elem ? @"(indexed)" : @"",
active_shader_->get_interface()->get_name()]];
[rec insertDebugSignpost:[NSString
stringWithFormat:@"batch_bind%@(shader: %s)",
this->elem ? @"(indexed)" : @"",
active_shader_->get_interface()->get_name()]];
}
/* Ensure Context Render Pipeline State is fully setup and ready to execute the draw. */
MTLPrimitiveType mtl_prim_type = gpu_prim_type_to_metal(this->prim_type);
if (!ctx->ensure_render_pipeline_state(mtl_prim_type)) {
printf("FAILED TO ENSURE RENDER PIPELINE STATE");
BLI_assert(false);
if (G.debug & G_DEBUG_GPU) {
[rec popDebugGroup];
}
return nil;
}
/*** Bind Vertex Buffers and Index Buffers **/
/* SSBO Vertex Fetch Buffer bindings. */
if (uses_ssbo_fetch) {
/* SSBO Vertex Fetch - Bind Index Buffer to appropriate slot -- if used. */
id<MTLBuffer> idx_buffer = nil;
GPUPrimType final_prim_type = this->prim_type;
if (mtl_elem != nullptr) {
/* Fetch index buffer. This function can situationally return an optimised
* index buffer of a different primitive type. If this is the case, `final_prim_type`
* and `v_count` will be updated with the new format.
* NOTE: For indexed rendering, v_count represents the number of indices. */
idx_buffer = mtl_elem->get_index_buffer(final_prim_type, v_count);
BLI_assert(idx_buffer != nil);
/* Update uniforms for SSBO-vertex-fetch-mode indexed rendering to flag usage. */
int &uniform_ssbo_index_mode_u16 = active_shader_->uni_ssbo_uses_index_mode_u16;
BLI_assert(uniform_ssbo_index_mode_u16 != -1);
int uses_index_mode_u16 = (mtl_elem->index_type_ == GPU_INDEX_U16) ? 1 : 0;
active_shader_->uniform_int(uniform_ssbo_index_mode_u16, 1, 1, &uses_index_mode_u16);
}
else {
idx_buffer = ctx->get_null_buffer();
}
rps.bind_vertex_buffer(idx_buffer, 0, MTL_SSBO_VERTEX_FETCH_IBO_INDEX);
/* Ensure all attributes are set */
active_shader_->ssbo_vertex_fetch_bind_attributes_end(rec);
/* Bind NULL Buffers for unused vertex data slots. */
id<MTLBuffer> null_buffer = ctx->get_null_buffer();
BLI_assert(null_buffer != nil);
for (int i = num_buffers; i < MTL_SSBO_VERTEX_FETCH_MAX_VBOS; i++) {
if (rps.cached_vertex_buffer_bindings[i].metal_buffer == nil) {
rps.bind_vertex_buffer(null_buffer, 0, i);
}
}
/* Flag whether Indexed rendering is used or not. */
int &uniform_ssbo_use_indexed = active_shader_->uni_ssbo_uses_indexed_rendering;
BLI_assert(uniform_ssbo_use_indexed != -1);
int uses_indexed_rendering = (mtl_elem != NULL) ? 1 : 0;
active_shader_->uniform_int(uniform_ssbo_use_indexed, 1, 1, &uses_indexed_rendering);
/* Set SSBO-fetch-mode status uniforms. */
BLI_assert(active_shader_->uni_ssbo_input_prim_type_loc != -1);
BLI_assert(active_shader_->uni_ssbo_input_vert_count_loc != -1);
GPU_shader_uniform_vector_int(reinterpret_cast<GPUShader *>(wrap(active_shader_)),
active_shader_->uni_ssbo_input_prim_type_loc,
1,
1,
(const int *)(&final_prim_type));
GPU_shader_uniform_vector_int(reinterpret_cast<GPUShader *>(wrap(active_shader_)),
active_shader_->uni_ssbo_input_vert_count_loc,
1,
1,
(const int *)(&v_count));
}
/* Bind Vertex Buffers. */
for (int i = 0; i < num_buffers; i++) {
MTLVertBuf *buf_at_index = buffers[i];
if (buf_at_index == NULL) {
BLI_assert_msg(
false,
"Total buffer count does not match highest buffer index, could be gaps in bindings");
continue;
}
/* Buffer handle. */
MTLVertBuf *mtlvbo = static_cast<MTLVertBuf *>(reinterpret_cast<VertBuf *>(buf_at_index));
mtlvbo->flag_used();
/* Fetch buffer from MTLVertexBuffer and bind. */
id<MTLBuffer> mtl_buffer = mtlvbo->get_metal_buffer();
BLI_assert(mtl_buffer != nil);
rps.bind_vertex_buffer(mtl_buffer, 0, i);
}
if (G.debug & G_DEBUG_GPU) {
[rec popDebugGroup];
}
/* Return Render Command Encoder used with setup. */
return rec;
}
void MTLBatch::unbind()
{
}
void MTLBatch::prepare_vertex_descriptor_and_bindings(
MTLVertBuf **buffers, int &num_buffers, int v_first, int v_count, int i_first, int i_count)
{
/* Here we populate the MTLContext vertex descriptor and resolve which buffers need to be bound.
*/
MTLStateManager *state_manager = static_cast<MTLStateManager *>(
MTLContext::get()->state_manager);
MTLRenderPipelineStateDescriptor &desc = state_manager->get_pipeline_descriptor();
const MTLShaderInterface *interface = active_shader_->get_interface();
uint16_t attr_mask = interface->get_enabled_attribute_mask();
/* Reset vertex descriptor to default state. */
desc.reset_vertex_descriptor();
/* Fetch Vertex and Instance Buffers. */
Span<MTLVertBuf *> mtl_verts(reinterpret_cast<MTLVertBuf **>(this->verts), GPU_BATCH_VBO_MAX_LEN);
Span<MTLVertBuf *> mtl_inst(reinterpret_cast<MTLVertBuf **>(this->inst), GPU_BATCH_INST_VBO_MAX_LEN);
/* SSBO Vertex fetch also passes vertex descriptor information into the shader. */
if (active_shader_->get_uses_ssbo_vertex_fetch()) {
active_shader_->ssbo_vertex_fetch_bind_attributes_begin();
}
/* Resolve Metal vertex buffer bindings. */
/* Vertex Descriptors
* ------------------
* Vertex Descriptors are required to generate a pipeline state, based on the current Batch's
* buffer bindings. These bindings are a unique matching, depending on what input attributes a
* batch has in its buffers, and those which are supported by the shader interface.
* We iterate through the buffers and resolve which attributes satisfy the requirements of the
* currently bound shader. We cache this data, for a given Batch<->ShaderInterface pairing, in a
* VAO cache to avoid the need to recalculate this data. */
bool buffer_is_instanced[GPU_BATCH_VBO_MAX_LEN] = {false};
VertexDescriptorShaderInterfacePair *descriptor = this->vao_cache.find(interface);
if (descriptor) {
desc.vertex_descriptor = descriptor->vertex_descriptor;
attr_mask = descriptor->attr_mask;
num_buffers = descriptor->num_buffers;
for (int bid = 0; bid < GPU_BATCH_VBO_MAX_LEN; ++bid) {
if (descriptor->bufferIds[bid].used) {
if (descriptor->bufferIds[bid].is_instance) {
buffers[bid] = mtl_inst[descriptor->bufferIds[bid].id];
buffer_is_instanced[bid] = true;
}
else {
buffers[bid] = mtl_verts[descriptor->bufferIds[bid].id];
buffer_is_instanced[bid] = false;
}
}
}
/* Use cached ssbo attribute binding data. */
if (active_shader_->get_uses_ssbo_vertex_fetch()) {
BLI_assert(desc.vertex_descriptor.uses_ssbo_vertex_fetch);
for (int attr_id = 0; attr_id < desc.vertex_descriptor.num_ssbo_attributes; attr_id++) {
active_shader_->ssbo_vertex_fetch_bind_attribute(
desc.vertex_descriptor.ssbo_attributes[attr_id]);
}
}
}
else {
VertexDescriptorShaderInterfacePair pair{};
pair.interface = interface;
for (int i = 0; i < GPU_BATCH_VBO_MAX_LEN; ++i) {
pair.bufferIds[i].id = -1;
pair.bufferIds[i].is_instance = 0;
pair.bufferIds[i].used = 0;
}
/* NOTE: Attribute extraction order from the buffers is the reverse of OpenGL's, as we flag an
* attribute once it is found, rather than pre-setting the mask. */
/* Extract Instance attributes (These take highest priority). */
for (int v = 0; v < GPU_BATCH_INST_VBO_MAX_LEN; v++) {
if (mtl_inst[v]) {
MTL_LOG_INFO(" -- [Batch] Checking bindings for bound instance buffer %p\n", mtl_inst[v]);
int buffer_ind = this->prepare_vertex_binding(
mtl_inst[v], desc, interface, attr_mask, true);
if (buffer_ind >= 0) {
buffers[buffer_ind] = mtl_inst[v];
buffer_is_instanced[buffer_ind] = true;
pair.bufferIds[buffer_ind].id = v;
pair.bufferIds[buffer_ind].used = 1;
pair.bufferIds[buffer_ind].is_instance = 1;
num_buffers = ((buffer_ind + 1) > num_buffers) ? (buffer_ind + 1) : num_buffers;
}
}
}
/* Extract vertex attributes (the first-bound vertex buffer takes priority). */
for (int v = 0; v < GPU_BATCH_VBO_MAX_LEN; v++) {
if (mtl_verts[v] != NULL) {
MTL_LOG_INFO(" -- [Batch] Checking bindings for bound vertex buffer %p\n", mtl_verts[v]);
int buffer_ind = this->prepare_vertex_binding(
mtl_verts[v], desc, interface, attr_mask, false);
if (buffer_ind >= 0) {
buffers[buffer_ind] = mtl_verts[v];
buffer_is_instanced[buffer_ind] = false;
pair.bufferIds[buffer_ind].id = v;
pair.bufferIds[buffer_ind].used = 1;
pair.bufferIds[buffer_ind].is_instance = 0;
num_buffers = ((buffer_ind + 1) > num_buffers) ? (buffer_ind + 1) : num_buffers;
}
}
}
/* Add to VertexDescriptor cache */
desc.vertex_descriptor.uses_ssbo_vertex_fetch = active_shader_->get_uses_ssbo_vertex_fetch();
pair.attr_mask = attr_mask;
pair.vertex_descriptor = desc.vertex_descriptor;
pair.num_buffers = num_buffers;
if (!this->vao_cache.insert(pair)) {
printf(
"[Performance Warning] cache is full (Size: %d), vertex descriptor will not be cached\n",
GPU_VAO_STATIC_LEN);
}
}
/* DEBUG: verify if our attribute bindings have been fully provided as expected. */
#if MTL_DEBUG_SHADER_ATTRIBUTES == 1
if (attr_mask != 0) {
for (uint16_t mask = 1, a = 0; a < 16; a++, mask <<= 1) {
if (attr_mask & mask) {
/* Fallback for setting default attributes, for missed slots. Attributes flagged with
* 'MTLVertexFormatInvalid' in the vertex descriptor are bound to a NULL buffer during PSO
* creation. */
MTL_LOG_WARNING("MTLBatch: Missing expected attribute '%s' at index '%d' for shader: %s\n",
this->active_shader->interface->attributes[a].name,
a,
interface->name);
/* Ensure any assigned attribute has not been given an invalid format. This should not
* occur and may be the result of an unsupported attribute type conversion. */
BLI_assert(desc.attributes[a].format == MTLVertexFormatInvalid);
}
}
}
#endif
}
void MTLBatch::draw_advanced(int v_first, int v_count, int i_first, int i_count)
{
#if TRUST_NO_ONE
BLI_assert(v_count > 0 && i_count > 0);
#endif
/* Setup RenderPipelineState for batch. */
MTLContext *ctx = reinterpret_cast<MTLContext *>(GPU_context_active_get());
id<MTLRenderCommandEncoder> rec = this->bind(v_first, v_count, i_first, i_count);
if (rec == nil) {
return;
}
/* Fetch IndexBuffer and resolve primitive type. */
MTLIndexBuf *mtl_elem = static_cast<MTLIndexBuf *>(reinterpret_cast<IndexBuf *>(this->elem));
MTLPrimitiveType mtl_prim_type = gpu_prim_type_to_metal(this->prim_type);
/* Render using SSBO Vertex Fetch. */
if (active_shader_->get_uses_ssbo_vertex_fetch()) {
/* Submit draw call with modified vertex count, which reflects vertices per primitive defined
* in the USE_SSBO_VERTEX_FETCH pragma. */
int num_input_primitives = gpu_get_prim_count_from_type(v_count, this->prim_type);
int output_num_verts = num_input_primitives *
active_shader_->get_ssbo_vertex_fetch_output_num_verts();
BLI_assert_msg(
mtl_vertex_count_fits_primitive_type(
output_num_verts, active_shader_->get_ssbo_vertex_fetch_output_prim_type()),
"Output Vertex count is not compatible with the requested output vertex primitive type");
[rec drawPrimitives:active_shader_->get_ssbo_vertex_fetch_output_prim_type()
vertexStart:0
vertexCount:output_num_verts
instanceCount:i_count
baseInstance:i_first];
ctx->main_command_buffer.register_draw_counters(output_num_verts * i_count);
}
/* Perform regular draw. */
else if (mtl_elem == NULL) {
/* Primitive type topology emulation. */
if (mtl_needs_topology_emulation(this->prim_type)) {
/* Generate index buffer for primitive types requiring emulation. */
GPUPrimType emulated_prim_type = this->prim_type;
uint32_t emulated_v_count = v_count;
id<MTLBuffer> generated_index_buffer = this->get_emulated_topology_buffer(emulated_prim_type,
emulated_v_count);
BLI_assert(generated_index_buffer != nil);
MTLPrimitiveType emulated_mtl_prim_type = gpu_prim_type_to_metal(emulated_prim_type);
/* Temp: Disable culling for emulated primitive types.
* TODO(Metal): Support face winding in topology buffer. */
[rec setCullMode:MTLCullModeNone];
if (generated_index_buffer != nil) {
BLI_assert(emulated_mtl_prim_type == MTLPrimitiveTypeTriangle ||
emulated_mtl_prim_type == MTLPrimitiveTypeLine);
if (emulated_mtl_prim_type == MTLPrimitiveTypeTriangle) {
BLI_assert(emulated_v_count % 3 == 0);
}
if (emulated_mtl_prim_type == MTLPrimitiveTypeLine) {
BLI_assert(emulated_v_count % 2 == 0);
}
/* Set depth stencil state (requires knowledge of primitive type). */
ctx->ensure_depth_stencil_state(emulated_mtl_prim_type);
[rec drawIndexedPrimitives:emulated_mtl_prim_type
indexCount:emulated_v_count
indexType:MTLIndexTypeUInt32
indexBuffer:generated_index_buffer
indexBufferOffset:0
instanceCount:i_count
baseVertex:v_first
baseInstance:i_first];
}
else {
printf("[Note] Cannot draw batch -- Emulated Topology mode: %u not yet supported\n",
this->prim_type);
}
}
else {
/* Set depth stencil state (requires knowledge of primitive type). */
ctx->ensure_depth_stencil_state(mtl_prim_type);
/* Issue draw call. */
[rec drawPrimitives:mtl_prim_type
vertexStart:v_first
vertexCount:v_count
instanceCount:i_count
baseInstance:i_first];
}
ctx->main_command_buffer.register_draw_counters(v_count * i_count);
}
/* Perform indexed draw. */
else {
MTLIndexType index_type = MTLIndexBuf::gpu_index_type_to_metal(mtl_elem->index_type_);
uint32_t base_index = mtl_elem->index_base_;
uint32_t index_size = (mtl_elem->index_type_ == GPU_INDEX_U16) ? 2 : 4;
uint32_t v_first_ofs = ((v_first + mtl_elem->index_start_) * index_size);
BLI_assert_msg((v_first_ofs % index_size) == 0,
"Index offset is not 2/4-byte aligned as per METAL spec");
/* Fetch index buffer. May return an index buffer of a differing format,
* if index buffer optimisation is used. In these cases, final_prim_type and
* index_count get updated with the new properties. */
GPUPrimType final_prim_type = this->prim_type;
uint index_count = v_count;
id<MTLBuffer> index_buffer = mtl_elem->get_index_buffer(final_prim_type, index_count);
mtl_prim_type = gpu_prim_type_to_metal(final_prim_type);
BLI_assert(index_buffer != nil);
if (index_buffer != nil) {
/* Set depth stencil state (requires knowledge of primitive type). */
ctx->ensure_depth_stencil_state(mtl_prim_type);
/* Issue draw call. */
[rec drawIndexedPrimitives:mtl_prim_type
indexCount:index_count
indexType:index_type
indexBuffer:index_buffer
indexBufferOffset:v_first_ofs
instanceCount:i_count
baseVertex:base_index
baseInstance:i_first];
ctx->main_command_buffer.register_draw_counters(index_count * i_count);
}
else {
BLI_assert_msg(false, "Index buffer does not have backing Metal buffer");
}
}
/* End of draw. */
this->unbind();
}
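A worked example of the indexed-offset computation above (hypothetical values):

/* GPU_INDEX_U16 -> index_size = 2. With v_first = 10 and mtl_elem->index_start_ = 4:
 * v_first_ofs = (10 + 4) * 2 = 28 bytes, which satisfies the 2/4-byte alignment assert. */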
/** \} */
/* -------------------------------------------------------------------- */
/** \name Topology emulation and optimization
* \{ */
id<MTLBuffer> MTLBatch::get_emulated_topology_buffer(GPUPrimType &in_out_prim_type,
uint32_t &in_out_v_count)
{
BLI_assert(in_out_v_count > 0);
/* Determine emulated primitive types. */
GPUPrimType input_prim_type = in_out_prim_type;
uint32_t v_count = in_out_v_count;
GPUPrimType output_prim_type;
switch (input_prim_type) {
case GPU_PRIM_POINTS:
case GPU_PRIM_LINES:
case GPU_PRIM_TRIS:
BLI_assert_msg(false, "Optimal primitive types should not reach here.");
return nil;
break;
case GPU_PRIM_LINES_ADJ:
case GPU_PRIM_TRIS_ADJ:
BLI_assert_msg(false, "Adjacency primitive types should not reach here.");
return nil;
break;
case GPU_PRIM_LINE_STRIP:
case GPU_PRIM_LINE_LOOP:
case GPU_PRIM_LINE_STRIP_ADJ:
output_prim_type = GPU_PRIM_LINES;
break;
case GPU_PRIM_TRI_STRIP:
case GPU_PRIM_TRI_FAN:
output_prim_type = GPU_PRIM_TRIS;
break;
default:
BLI_assert_msg(false, "Invalid primitive type.");
return nil;
}
/* Check if topology buffer exists and is valid. */
if (this->emulated_topology_buffer_ != nullptr &&
(emulated_topology_type_ != input_prim_type || topology_buffer_input_v_count_ != v_count)) {
/* Release existing topology buffer. */
emulated_topology_buffer_->free();
emulated_topology_buffer_ = nullptr;
}
/* Generate new topology index buffer. */
if (this->emulated_topology_buffer_ == nullptr) {
/* Calculate IB len. */
uint32_t output_prim_count = 0;
switch (input_prim_type) {
case GPU_PRIM_LINE_STRIP:
case GPU_PRIM_LINE_STRIP_ADJ:
output_prim_count = v_count - 1;
break;
case GPU_PRIM_LINE_LOOP:
output_prim_count = v_count;
break;
case GPU_PRIM_TRI_STRIP:
case GPU_PRIM_TRI_FAN:
output_prim_count = v_count - 2;
break;
default:
BLI_assert_msg(false, "Cannot generate optimized topology buffer for other types.");
break;
}
uint32_t output_IB_elems = output_prim_count * ((output_prim_type == GPU_PRIM_TRIS) ? 3 : 2);
/* Allocate buffer. */
uint32_t buffer_bytes = output_IB_elems * 4;
BLI_assert(buffer_bytes > 0);
this->emulated_topology_buffer_ = MTLContext::get_global_memory_manager().allocate(
buffer_bytes, true);
/* Populate. */
uint32_t *data = (uint32_t *)this->emulated_topology_buffer_->get_host_ptr();
BLI_assert(data != nullptr);
/* TODO(Metal): Support inverse winding modes. */
bool winding_clockwise = false;
UNUSED_VARS(winding_clockwise);
switch (input_prim_type) {
/* Line Loop. */
case GPU_PRIM_LINE_LOOP: {
int line = 0;
for (line = 0; line < output_prim_count - 1; line++) {
data[line * 2 + 0] = line + 0;
data[line * 2 + 1] = line + 1;
}
/* Closing line. */
data[line * 2 + 0] = line + 0;
data[line * 2 + 1] = 0;
} break;
/* Triangle Fan. */
case GPU_PRIM_TRI_FAN: {
for (int triangle = 0; triangle < output_prim_count; triangle++) {
data[triangle * 3 + 0] = 0; /* Always 0 */
data[triangle * 3 + 1] = triangle + 1;
data[triangle * 3 + 2] = triangle + 2;
}
} break;
default:
BLI_assert_msg(false, "Other primitive types do not require emulation.");
return nil;
}
/* Flush. */
this->emulated_topology_buffer_->flush();
/* Assign members relating to current cached IB. */
topology_buffer_input_v_count_ = v_count;
topology_buffer_output_v_count_ = output_IB_elems;
emulated_topology_type_ = input_prim_type;
}
/* Return. */
in_out_v_count = topology_buffer_output_v_count_;
in_out_prim_type = output_prim_type;
return (emulated_topology_buffer_) ? emulated_topology_buffer_->get_metal_buffer() : nil;
}
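A worked example of the emulation above (hypothetical counts): a GPU_PRIM_TRI_FAN batch with v_count = 5 produces output_prim_count = 5 - 2 = 3 triangles:

/* Generated indices: {0, 1, 2}, {0, 2, 3}, {0, 3, 4}.
 * output_IB_elems = 3 * 3 = 9, buffer_bytes = 9 * 4 = 36. */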
/** \} */
} // namespace blender::gpu

View File

@@ -995,19 +995,21 @@ bool MTLContext::ensure_uniform_buffer_bindings(
if (ubo.buffer_index >= 0) {
const uint32_t buffer_index = ubo.buffer_index;
/* Uniform Buffer index offset by 1 as the first shader buffer binding slot is reserved for
* the uniform PushConstantBlock. */
const uint32_t buffer_index = ubo.buffer_index + 1;
int ubo_offset = 0;
id<MTLBuffer> ubo_buffer = nil;
int ubo_size = 0;
bool bind_dummy_buffer = false;
if (this->pipeline_state.ubo_bindings[buffer_index].bound) {
if (this->pipeline_state.ubo_bindings[ubo_index].bound) {
/* Fetch UBO global-binding properties from slot. */
ubo_offset = 0;
ubo_buffer = this->pipeline_state.ubo_bindings[buffer_index].ubo->get_metal_buffer(
ubo_buffer = this->pipeline_state.ubo_bindings[ubo_index].ubo->get_metal_buffer(
&ubo_offset);
ubo_size = this->pipeline_state.ubo_bindings[buffer_index].ubo->get_size();
ubo_size = this->pipeline_state.ubo_bindings[ubo_index].ubo->get_size();
/* Use dummy zero buffer if no buffer assigned -- this is an optimization to avoid
* allocating zero buffers. */
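A worked example of the offset above (values illustrative): a uniform block whose interface buffer_index is 0 is bound at shader buffer slot 1 within the uniform-buffer range, since slot 0 is reserved for the PushConstantBlock, while pipeline_state.ubo_bindings[] is still indexed by the un-offset ubo_index:

/* ubo_index = 0, ubo.buffer_index = 0 -> bound at buffer_index = 0 + 1 = 1. */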

View File

@@ -9,34 +9,50 @@
#pragma once
#pragma once
#include "BLI_sys_types.h"
#include "GPU_batch.h"
#include "MEM_guardedalloc.h"
#include "gpu_drawlist_private.hh"
namespace blender {
namespace gpu {
#include "mtl_batch.hh"
#include "mtl_context.hh"
namespace blender::gpu {
/**
* TODO(Metal): MTLDrawList Implementation. Included as temporary stub.
*/
* Implementation of Multi Draw Indirect for Metal.
**/
class MTLDrawList : public DrawList {
public:
MTLDrawList(int length)
{
}
~MTLDrawList()
{
}
void append(GPUBatch *batch, int i_first, int i_count) override
{
}
void submit() override
{
}
private:
/** Batch for which we are recording commands. */
MTLBatch *batch_;
/** Mapped memory bounds. */
void *data_;
/** Length of the mapped buffer (in bytes). */
size_t data_size_;
/** Current offset inside the mapped buffer (in bytes). */
size_t command_offset_;
/** Current number of commands recorded inside the mapped buffer. */
uint32_t command_len_;
/** Is UINT_MAX if not drawing indexed geometry. Cached to avoid dereferencing batch. */
uint32_t base_index_;
/** Cached to avoid dereferencing batch. */
uint32_t v_first_, v_count_;
/** Length of the whole buffer (in bytes). */
uint32_t buffer_size_;
public:
MTLDrawList(int length);
~MTLDrawList();
void append(GPUBatch *batch, int i_first, int i_count) override;
void submit() override;
private:
void init();
MEM_CXX_CLASS_ALLOC_FUNCS("MTLDrawList");
};
} // namespace gpu
} // namespace blender
} // namespace blender::gpu
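For context, a minimal caller-side sketch (illustrative only; batch and the instance ranges are assumed to already exist) of how this draw list is exercised through the generic GPU_draw_list API:

GPUDrawList *list = GPU_draw_list_create(128);
GPU_draw_list_append(list, batch, 0, 10);  /* Instances [0, 10). */
GPU_draw_list_append(list, batch, 10, 32); /* Instances [10, 42). */
GPU_draw_list_submit(list); /* Flushes recorded commands via MTLDrawList::submit(). */
GPU_draw_list_discard(list);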

View File

@@ -0,0 +1,282 @@
/** \file
* \ingroup gpu
*
* Implementation of Multi Draw Indirect for Metal.
* Emulated via individual draw calls, as host-side MDI is not supported by Metal.
*/
#include "BLI_assert.h"
#include "GPU_batch.h"
#include "mtl_common.hh"
#include "mtl_drawlist.hh"
#include "mtl_primitive.hh"
using namespace blender::gpu;
namespace blender::gpu {
/* Indirect draw call structure for reference. */
/* MTLDrawPrimitivesIndirectArguments --
* https://developer.apple.com/documentation/metal/mtldrawprimitivesindirectarguments?language=objc
*/
/* struct MTLDrawPrimitivesIndirectArguments {
* uint32_t vertexCount;
* uint32_t instanceCount;
* uint32_t vertexStart;
* uint32_t baseInstance;
};*/
/* MTLDrawIndexedPrimitivesIndirectArguments --
* https://developer.apple.com/documentation/metal/mtldrawindexedprimitivesindirectarguments?language=objc
*/
/* struct MTLDrawIndexedPrimitivesIndirectArguments {
* uint32_t indexCount;
* uint32_t instanceCount;
* uint32_t indexStart;
* uint32_t baseVertex;
* uint32_t baseInstance;
};*/
#define MDI_ENABLED (buffer_size_ != 0)
#define MDI_DISABLED (buffer_size_ == 0)
#define MDI_INDEXED (base_index_ != UINT_MAX)
MTLDrawList::MTLDrawList(int length)
{
BLI_assert(length > 0);
batch_ = nullptr;
command_len_ = 0;
base_index_ = 0;
command_offset_ = 0;
data_size_ = 0;
buffer_size_ = sizeof(MTLDrawIndexedPrimitivesIndirectArguments) * length;
data_ = (void *)MEM_mallocN(buffer_size_, __func__);
}
MTLDrawList::~MTLDrawList()
{
if (data_) {
MEM_freeN(data_);
data_ = nullptr;
}
}
void MTLDrawList::init()
{
MTLContext *ctx = reinterpret_cast<MTLContext *>(GPU_context_active_get());
BLI_assert(ctx);
BLI_assert(MDI_ENABLED);
BLI_assert(data_ == nullptr);
UNUSED_VARS_NDEBUG(ctx);
batch_ = nullptr;
command_len_ = 0;
BLI_assert(data_);
command_offset_ = 0;
}
void MTLDrawList::append(GPUBatch *gpu_batch, int i_first, int i_count)
{
/* Fallback when MultiDrawIndirect is not supported/enabled. */
MTLShader *shader = static_cast<MTLShader *>(unwrap(gpu_batch->shader));
bool requires_ssbo = (shader->get_uses_ssbo_vertex_fetch());
bool requires_emulation = mtl_needs_topology_emulation(gpu_batch->prim_type);
if (MDI_DISABLED || requires_ssbo || requires_emulation) {
GPU_batch_draw_advanced(gpu_batch, 0, 0, i_first, i_count);
return;
}
if (data_ == nullptr) {
this->init();
}
BLI_assert(data_);
MTLBatch *mtl_batch = static_cast<MTLBatch *>(gpu_batch);
BLI_assert(mtl_batch);
if (mtl_batch != batch_) {
/* Submit existing calls. */
this->submit();
/* Begin new batch. */
batch_ = mtl_batch;
/* Cached for faster access. */
MTLIndexBuf *el = batch_->elem_();
base_index_ = el ? el->index_base_ : UINT_MAX;
v_first_ = el ? el->index_start_ : 0;
v_count_ = el ? el->index_len_ : batch_->verts_(0)->vertex_len;
}
if (v_count_ == 0) {
/* Nothing to draw. */
return;
}
if (MDI_INDEXED) {
MTLDrawIndexedPrimitivesIndirectArguments *cmd =
reinterpret_cast<MTLDrawIndexedPrimitivesIndirectArguments *>((char *)data_ +
command_offset_);
cmd->indexStart = v_first_;
cmd->indexCount = v_count_;
cmd->instanceCount = i_count;
cmd->baseVertex = base_index_;
cmd->baseInstance = i_first;
}
else {
MTLDrawPrimitivesIndirectArguments *cmd =
reinterpret_cast<MTLDrawPrimitivesIndirectArguments *>((char *)data_ + command_offset_);
cmd->vertexStart = v_first_;
cmd->vertexCount = v_count_;
cmd->instanceCount = i_count;
cmd->baseInstance = i_first;
}
size_t command_size = MDI_INDEXED ? sizeof(MTLDrawIndexedPrimitivesIndirectArguments) :
sizeof(MTLDrawPrimitivesIndirectArguments);
command_offset_ += command_size;
command_len_++;
/* Check if we can fit at least one other command. */
if (command_offset_ + command_size > buffer_size_) {
this->submit();
}
return;
}
void MTLDrawList::submit()
{
/* Metal does not support MDI from the host side, but we still benefit from only executing the
* batch bind a single time, rather than per-draw.
* NOTE(Metal): Consider using MTLIndirectCommandBuffer to achieve similar behaviour. */
if (command_len_ == 0) {
return;
}
/* Something's wrong if we get here without MDI support. */
BLI_assert(MDI_ENABLED);
BLI_assert(data_);
/* Host-side MDI Currently unsupported on Metal. */
bool can_use_MDI = false;
/* Verify context. */
MTLContext *ctx = reinterpret_cast<MTLContext *>(GPU_context_active_get());
BLI_assert(ctx);
/* Execute indirect draw calls. */
MTLShader *shader = static_cast<MTLShader *>(unwrap(batch_->shader));
bool SSBO_MODE = (shader->get_uses_ssbo_vertex_fetch());
if (SSBO_MODE) {
can_use_MDI = false;
BLI_assert(false);
return;
}
/* Heuristic to determine whether using indirect drawing is more efficient. */
size_t command_size = MDI_INDEXED ? sizeof(MTLDrawIndexedPrimitivesIndirectArguments) :
sizeof(MTLDrawPrimitivesIndirectArguments);
const bool is_finishing_a_buffer = (command_offset_ + command_size > buffer_size_);
can_use_MDI = can_use_MDI && (is_finishing_a_buffer || command_len_ > 2);
/* Bind Batch to setup render pipeline state. */
id<MTLRenderCommandEncoder> rec = batch_->bind(0, 0, 0, 0);
if (!rec) {
BLI_assert_msg(false, "A RenderCommandEncoder should always be available!\n");
return;
}
/* Common properties. */
MTLPrimitiveType mtl_prim_type = gpu_prim_type_to_metal(batch_->prim_type);
/* Execute multidraw indirect. */
if (can_use_MDI && false) {
/* Metal doesn't support MDI -- singular indirect draw calls are supported,
* but Multidraw is not.
* TODO(Metal): Consider using IndirectCommandBuffers to provide similar
* behaviour. */
}
else {
/* Execute draws manually. */
if (MDI_INDEXED) {
MTLDrawIndexedPrimitivesIndirectArguments *cmd =
(MTLDrawIndexedPrimitivesIndirectArguments *)data_;
MTLIndexBuf *mtl_elem = static_cast<MTLIndexBuf *>(
reinterpret_cast<IndexBuf *>(batch_->elem));
BLI_assert(mtl_elem);
MTLIndexType index_type = MTLIndexBuf::gpu_index_type_to_metal(mtl_elem->index_type_);
uint32_t index_size = (mtl_elem->index_type_ == GPU_INDEX_U16) ? 2 : 4;
uint32_t v_first_ofs = (mtl_elem->index_start_ * index_size);
uint32_t index_count = cmd->indexCount;
/* Fetch index buffer. May return an index buffer of a differing format,
* if index buffer optimisation is used. In these cases, mtl_prim_type and
* index_count get updated with the new properties. */
GPUPrimType final_prim_type = batch_->prim_type;
id<MTLBuffer> index_buffer = mtl_elem->get_index_buffer(final_prim_type, index_count);
BLI_assert(index_buffer != nil);
/* Final primitive type. */
mtl_prim_type = gpu_prim_type_to_metal(final_prim_type);
if (index_buffer != nil) {
/* Set depth stencil state (requires knowledge of primitive type). */
ctx->ensure_depth_stencil_state(mtl_prim_type);
for (int i = 0; i < command_len_; i++, cmd++) {
[rec drawIndexedPrimitives:mtl_prim_type
indexCount:index_count
indexType:index_type
indexBuffer:index_buffer
indexBufferOffset:v_first_ofs
instanceCount:cmd->instanceCount
baseVertex:cmd->baseVertex
baseInstance:cmd->baseInstance];
ctx->main_command_buffer.register_draw_counters(cmd->indexCount * cmd->instanceCount);
}
}
else {
BLI_assert_msg(false, "Index buffer does not have backing Metal buffer");
}
}
else {
MTLDrawPrimitivesIndirectArguments *cmd = (MTLDrawPrimitivesIndirectArguments *)data_;
/* Verify if topology emulation is required. */
if (mtl_needs_topology_emulation(batch_->prim_type)) {
BLI_assert_msg(false, "topology emulation cases should use fallback.");
}
else {
/* Set depth stencil state (requires knowledge of primitive type). */
ctx->ensure_depth_stencil_state(mtl_prim_type);
for (int i = 0; i < command_len_; i++, cmd++) {
[rec drawPrimitives:mtl_prim_type
vertexStart:cmd->vertexStart
vertexCount:cmd->vertexCount
instanceCount:cmd->instanceCount
baseInstance:cmd->baseInstance];
ctx->main_command_buffer.register_draw_counters(cmd->vertexCount * cmd->instanceCount);
}
}
}
}
/* Unbind batch. */
batch_->unbind();
/* Reset command offsets. */
command_len_ = 0;
command_offset_ = 0;
/* Avoid keeping reference to the batch. */
batch_ = nullptr;
}
} // namespace blender::gpu

View File

@@ -99,6 +99,9 @@ void MTLImmediate::end()
MTLRenderPipelineStateDescriptor &desc = state_manager->get_pipeline_descriptor();
const MTLShaderInterface *interface = active_mtl_shader->get_interface();
/* Reset vertex descriptor to default state. */
desc.reset_vertex_descriptor();
desc.vertex_descriptor.num_attributes = interface->get_total_attributes();
desc.vertex_descriptor.num_vert_buffers = 1;

View File

@@ -243,6 +243,19 @@ struct MTLRenderPipelineStateDescriptor {
return hash;
}
/* Reset the Vertex Descriptor to default. */
void reset_vertex_descriptor()
{
vertex_descriptor.num_attributes = 0;
vertex_descriptor.num_vert_buffers = 0;
for (int i = 0; i < GPU_VERT_ATTR_MAX_LEN; i++) {
vertex_descriptor.attributes[i].format = MTLVertexFormatInvalid;
vertex_descriptor.attributes[i].offset = 0;
}
vertex_descriptor.uses_ssbo_vertex_fetch = false;
vertex_descriptor.num_ssbo_attributes = 0;
}
};
} // namespace blender::gpu

View File

@@ -117,9 +117,7 @@ uint32_t MTLShaderInterface::add_uniform_block(uint32_t name_offset,
MTLShaderUniformBlock &uni_block = ubos_[total_uniform_blocks_];
uni_block.name_offset = name_offset;
/* We offset the buffer binding index by one, as the first slot is reserved for push constant
* data. */
uni_block.buffer_index = buffer_index + 1;
uni_block.buffer_index = buffer_index;
uni_block.size = size;
uni_block.current_offset = 0;
uni_block.stage_mask = ShaderStage::BOTH;
@@ -297,8 +295,10 @@ void MTLShaderInterface::prepare_common_shader_inputs()
current_input->name_hash = BLI_hash_string(this->get_name_at_offset(shd_ubo.name_offset));
/* Location refers to the index in the ubos_ array. */
current_input->location = ubo_index;
/* Final binding location refers to the buffer binding index within the shader (Relative to
* MTL_uniform_buffer_base_index). */
/* Binding location refers to the UBO bind slot in
* #MTLContextGlobalShaderPipelineState::ubo_bindings. The buffer bind index [[buffer(N)]]
* within the shader will apply an offset for bound vertex buffers and the default uniform
* PushConstantBlock. */
current_input->binding = shd_ubo.buffer_index;
current_input++;
}

View File

@@ -51,9 +51,9 @@ struct TextureUpdateRoutineSpecialisation {
uint64_t hash() const
{
blender::DefaultHash<std::string> string_hasher;
return uint64_t(string_hasher(
return (uint64_t)string_hasher(
this->input_data_type + this->output_data_type +
std::to_string((this->component_count_input << 8) + this->component_count_output)));
std::to_string((this->component_count_input << 8) + this->component_count_output));
}
};

View File

@@ -337,20 +337,6 @@ void gpu::MTLTexture::blit(gpu::MTLTexture *dst,
GPU_batch_draw(quad);
/* TMP draw with IMM TODO(Metal): Remove this once GPUBatch is supported. */
GPUVertFormat *imm_format = immVertexFormat();
uint pos = GPU_vertformat_attr_add(imm_format, "pos", GPU_COMP_F32, 2, GPU_FETCH_FLOAT);
immBindShader(shader);
immBegin(GPU_PRIM_TRI_STRIP, 4);
immVertex2f(pos, 1, 0);
immVertex2f(pos, 0, 0);
immVertex2f(pos, 1, 1);
immVertex2f(pos, 0, 1);
immEnd();
immUnbindProgram();
/**********************/
/* restoring old pipeline state. */
GPU_depth_mask(depth_write_prev);
GPU_stencil_write_mask_set(stencil_mask_prev);
@@ -1472,10 +1458,82 @@ bool gpu::MTLTexture::init_internal()
bool gpu::MTLTexture::init_internal(GPUVertBuf *vbo)
{
/* Not a valid vertex buffer format; verify the texture is not set to this format,
* as it is not supported on Apple Silicon. */
BLI_assert_msg(this->format_ != GPU_DEPTH24_STENCIL8,
"Apple silicon does not support GPU_DEPTH24_S8");
if (this->format_ == GPU_DEPTH24_STENCIL8) {
/* Apple Silicon requires GPU_DEPTH32F_STENCIL8 instead of GPU_DEPTH24_STENCIL8. */
this->format_ = GPU_DEPTH32F_STENCIL8;
}
MTLPixelFormat mtl_format = gpu_texture_format_to_metal(this->format_);
mtl_max_mips_ = 1;
mipmaps_ = 0;
this->mip_range_set(0, 0);
/* Create texture from GPUVertBuf's buffer. */
MTLVertBuf *mtl_vbo = static_cast<MTLVertBuf *>(unwrap(vbo));
mtl_vbo->bind();
mtl_vbo->flag_used();
/* Get Metal Buffer. */
id<MTLBuffer> source_buffer = mtl_vbo->get_metal_buffer();
BLI_assert(source_buffer);
/* Verify size. */
if (w_ <= 0) {
MTL_LOG_WARNING("Allocating texture buffer of width 0!\n");
w_ = 1;
}
/* Verify Texture and vertex buffer alignment. */
int bytes_per_pixel = get_mtl_format_bytesize(mtl_format);
int bytes_per_row = bytes_per_pixel * w_;
MTLContext *mtl_ctx = MTLContext::get();
uint32_t align_requirement = static_cast<uint32_t>(
[mtl_ctx->device minimumLinearTextureAlignmentForPixelFormat:mtl_format]);
/* Verify per-vertex size aligns with texture size. */
const GPUVertFormat *format = GPU_vertbuf_get_format(vbo);
BLI_assert(bytes_per_pixel == format->stride &&
"Pixel format stride MUST match the texture format stride -- These being different "
"is likely caused by Metal's VBO padding to a minimum of 4-bytes per-vertex");
UNUSED_VARS_NDEBUG(format);
/* Create texture descriptor. */
BLI_assert(type_ == GPU_TEXTURE_BUFFER);
texture_descriptor_ = [[MTLTextureDescriptor alloc] init];
texture_descriptor_.pixelFormat = mtl_format;
texture_descriptor_.textureType = MTLTextureTypeTextureBuffer;
texture_descriptor_.width = w_;
texture_descriptor_.height = 1;
texture_descriptor_.depth = 1;
texture_descriptor_.arrayLength = 1;
texture_descriptor_.mipmapLevelCount = mtl_max_mips_;
texture_descriptor_.usage =
MTLTextureUsageShaderRead | MTLTextureUsageShaderWrite |
MTLTextureUsagePixelFormatView; /* TODO(Metal): Optimise usage flags. */
texture_descriptor_.storageMode = [source_buffer storageMode];
texture_descriptor_.sampleCount = 1;
texture_descriptor_.cpuCacheMode = [source_buffer cpuCacheMode];
texture_descriptor_.hazardTrackingMode = [source_buffer hazardTrackingMode];
texture_ = [source_buffer
newTextureWithDescriptor:texture_descriptor_
offset:0
bytesPerRow:ceil_to_multiple_u(bytes_per_row, align_requirement)];
aligned_w_ = bytes_per_row / bytes_per_pixel;
BLI_assert(texture_);
texture_.label = [NSString stringWithUTF8String:this->get_name()];
is_baked_ = true;
is_dirty_ = false;
resource_mode_ = MTL_TEXTURE_MODE_VBO;
/* Track Status. */
vert_buffer_ = mtl_vbo;
vert_buffer_mtl_ = source_buffer;
/* Cleanup. */
[texture_descriptor_ release];
texture_descriptor_ = nullptr;
return true;
}
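A worked example of the row alignment above (hypothetical values):

/* GPU_RGBA32F -> 16 bytes per pixel. With w_ = 37: bytes_per_row = 16 * 37 = 592.
 * If minimumLinearTextureAlignmentForPixelFormat returns 256, then
 * ceil_to_multiple_u(592, 256) = 768, so the texture view uses bytesPerRow = 768. */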
@@ -1522,7 +1580,6 @@ bool gpu::MTLTexture::texture_is_baked()
/* Prepare texture parameters after initialization, but before baking. */
void gpu::MTLTexture::prepare_internal()
{
/* Derive implicit usage flags for Depth/Stencil attachments. */
if (format_flag_ & GPU_FORMAT_DEPTH || format_flag_ & GPU_FORMAT_STENCIL) {
gpu_image_usage_flags_ |= GPU_TEXTURE_USAGE_ATTACHMENT;
@@ -1687,7 +1744,7 @@ void gpu::MTLTexture::ensure_baked()
/* Determine Resource Mode. */
resource_mode_ = MTL_TEXTURE_MODE_DEFAULT;
/* Create texture. */
/* Standard texture allocation. */
texture_ = [ctx->device newTextureWithDescriptor:texture_descriptor_];
[texture_descriptor_ release];