Gawain: Refactor: VAO caching AND use of the new VAO manager.

A major bottleneck of the current implementation is the call to create_bindings() for basically every draw call.
This is due to the VAO being tagged dirty when a new shader is assigned to the Batch, defeating the purpose of the Batch (reusing it for drawing).

Since managing hundreds of batches in the DrawManager and DrawCache did not seem like much fun to me, I preferred rewriting the batches themselves.

--- Batch changes ---
For this to happen I needed to make instancing part of the Batch itself rather than another batch supplied at draw time.
The Gwn_VertBufs are copied from the batch to be instanced (the VBOs are shared), and a new Gwn_VertBuf is supplied for the instancing attribs.
This means a VAO can be generated and cached for this instancing case.
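
For instance, this is roughly how the new API is meant to be used (a sketch only; `geom`, `format` and the ownership choice are placeholders):

Gwn_Batch* batch = GWN_batch_duplicate(geom);                                /* shares geom's VBOs and index buffer */
Gwn_VertBuf* inst_attribs = GWN_vertbuf_create_dynamic_with_format(format); /* per-instance data */
GWN_batch_instbuf_set(batch, inst_attribs, true);                            /* batch now owns the instancing VBO */
/* From here on, one VAO per shader interface can be created and cached for this batch. */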

A Batch can be rendered with instancing, without instancing attribs, and without the need for a new VAO, using GWN_batch_draw_range_ex with the force_instance parameter set to true.
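
A minimal usage sketch (the program, shader interface and `inst_count` are placeholders; GWN_batch_draw_range_ex itself does not bind the shader or matrices):

GWN_batch_program_set(batch, program, shaderface);    /* finds or creates the cached VAO, binds the program */
gpuBindMatrices(shaderface);                          /* draw_range_ex does not call this itself */
GWN_batch_draw_range_ex(batch, 0, 0, false);          /* normal draw, vertex count inferred */
GWN_batch_draw_range_ex(batch, 0, inst_count, true);  /* draw inst_count instances, no instancing attribs, no new VAO */
GWN_batch_program_unset(batch);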

--- Draw manager changes ---
The downside of this approach is that we must track the validity of the instanced batch (the original one). The only way to do this (that I could think of) is to set a callback for when the batch is being freed.
This means a bit of refactoring in the DrawManager, separating batching and instancing Batches.
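
Concretely, the DrawManager registers it like this (simplified from instance_batch_free() / DRW_instancing_buffer_request() in the diff below; the cleanup body is elided here):

static void instance_batch_free(Gwn_Batch *batch, void *UNUSED(user_data))
{
	/* The original batch is being freed: discard every DRWInstancingBuffer
	 * that was built on top of it so no stale bindings survive. */
}

/* When an instancing buffer is requested for `instance`: */
GWN_batch_callback_free_set(instance, &instance_batch_free, NULL);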

--- VAO cache ---
Each VAO is generated for a given ShaderInterface. This means we can keep it alive as long as the shader interface lives.
If a ShaderInterface is discarded, it needs to destroy every VAO associated with it. Otherwise, a new ShaderInterface could be allocated at the same address and reuse a stale VAO with incorrect bindings.
The VAO cache itself uses a mix of a static array of VAOs and a dynamic array when there is not enough space in the static one.
This hybrid approach is a bit more performant than a dynamic array alone.
The arrays never shrink, but empty entries are reused, so it is unlikely they grow out of control. Shrinking could be done on the next allocation if needed.
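
Condensed from GWN_batch_program_set() in the diff below, the cache lookup looks roughly like this (the static-to-dynamic switch and error handling are omitted):

GLuint vao_id = 0;
if (batch->is_dynamic_vao_count) {
	for (int i = 0; i < batch->dynamic_vaos.count && vao_id == 0; ++i)
		if (batch->dynamic_vaos.interfaces[i] == shaderface)
			vao_id = batch->dynamic_vaos.vao_ids[i];
}
else {
	for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN && vao_id == 0; ++i)
		if (batch->static_vaos.interfaces[i] == shaderface)
			vao_id = batch->static_vaos.vao_ids[i];
}
/* On a miss, the real code allocates a VAO with GWN_vao_alloc(), stores it in the
 * first free slot (switching to the dynamic arrays when the static slots are full),
 * calls GWN_shaderinterface_add_batch_ref() so the interface can free it on discard,
 * and rebuilds the attrib bindings once for that (batch, interface) pair. */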

--- Results ---
Using cached VAOs means we no longer query each vertex attrib of each VBO for every draw call on every redraw!
In a CPU-limited test scene (10000 cubes in the Clay engine) I get a reduction of CPU drawing time from ~20ms to 13ms.

The only area not caching VAOs is particle instancing (see the comment in DRW_shgroup_instance_batch).
Clément Foucault 2018-02-20 01:55:19 +01:00
parent 1b3f9ecd0d
commit c5eba46d7f
15 changed files with 665 additions and 374 deletions

View File

@ -16,6 +16,7 @@ set(SRC
src/gwn_imm_util.c
src/gwn_primitive.c
src/gwn_shader_interface.c
src/gwn_vertex_array_id.cpp
src/gwn_vertex_buffer.c
src/gwn_vertex_format.c
@ -30,6 +31,7 @@ set(SRC
gawain/gwn_primitive.h
gawain/gwn_primitive_private.h
gawain/gwn_shader_interface.h
gawain/gwn_vertex_array_id.h
gawain/gwn_vertex_buffer.h
gawain/gwn_vertex_format.h
gawain/gwn_vertex_format_private.h

View File

@ -23,34 +23,61 @@ typedef enum {
} Gwn_BatchPhase;
#define GWN_BATCH_VBO_MAX_LEN 3
#define GWN_BATCH_VAO_STATIC_LEN 3
#define GWN_BATCH_VAO_DYN_ALLOC_COUNT 16
typedef struct Gwn_Batch {
// geometry
Gwn_VertBuf* verts[GWN_BATCH_VBO_MAX_LEN]; // verts[0] is required, others can be NULL
Gwn_VertBuf* inst; // instance attribs
Gwn_IndexBuf* elem; // NULL if element list not needed
Gwn_PrimType prim_type;
GLenum gl_prim_type;
// book-keeping
GLuint vao_id; // remembers all geometry state (vertex attrib bindings & element buffer)
Gwn_BatchPhase phase;
bool program_dirty;
bool program_in_use;
unsigned owns_flag;
// state
// cached values (avoid dereferencing later)
GLuint vao_id;
GLuint program;
const Gwn_ShaderInterface* interface;
const struct Gwn_ShaderInterface* interface;
// book-keeping
unsigned owns_flag;
struct Gwn_Context *context; // used to free all vaos. this implies all vaos were created under the same context.
Gwn_BatchPhase phase;
bool program_in_use;
// Vao management: remembers all geometry state (vertex attrib bindings & element buffer)
// for each shader interface. Start with a static number of vaos and fallback to dynamic count
// if necessary. Once a batch goes dynamic it does not go back.
bool is_dynamic_vao_count;
union {
// Static handle count
struct {
const struct Gwn_ShaderInterface* interfaces[GWN_BATCH_VAO_STATIC_LEN];
GLuint vao_ids[GWN_BATCH_VAO_STATIC_LEN];
} static_vaos;
// Dynamic handle count
struct {
unsigned count;
const struct Gwn_ShaderInterface** interfaces;
GLuint* vao_ids;
} dynamic_vaos;
};
// XXX This is the only solution if we want to have some data structure using
// batches as key to identify nodes. We must destroy these nodes with this callback.
void (*free_callback)(struct Gwn_Batch*, void*);
void* callback_data;
} Gwn_Batch;
enum {
GWN_BATCH_OWNS_VBO = (1 << 0),
/* each vbo index gets bit-shifted */
GWN_BATCH_OWNS_INSTANCES = (1 << 30),
GWN_BATCH_OWNS_INDEX = (1 << 31),
};
Gwn_Batch* GWN_batch_create_ex(Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src);
#define GWN_batch_create(prim, verts, elem) \
GWN_batch_create_ex(prim, verts, elem, 0)
@ -59,11 +86,18 @@ void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, un
void GWN_batch_discard(Gwn_Batch*); // verts & elem are not discarded
void GWN_batch_callback_free_set(Gwn_Batch*, void (*callback)(Gwn_Batch*, void*), void*);
void GWN_batch_instbuf_set(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo); // Instancing
int GWN_batch_vertbuf_add_ex(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo);
#define GWN_batch_vertbuf_add(batch, verts) \
GWN_batch_vertbuf_add_ex(batch, verts, false)
// This is a private function
void GWN_batch_remove_interface_ref(Gwn_Batch*, const Gwn_ShaderInterface*);
void GWN_batch_program_set(Gwn_Batch*, GLuint program, const Gwn_ShaderInterface*);
void GWN_batch_program_unset(Gwn_Batch*);
// Entire batch draws with one shader program, but can be redrawn later with another program.
@ -84,11 +118,14 @@ void GWN_batch_uniform_4fv(Gwn_Batch*, const char* name, const float data[4]);
void GWN_batch_draw(Gwn_Batch*);
// This does not bind/unbind shader and does not call gpuBindMatrices()
void GWN_batch_draw_range_ex(Gwn_Batch*, int v_first, int v_count, bool force_instance);
void GWN_batch_draw_stupid(Gwn_Batch*, int v_first, int v_count);
void GWN_batch_draw_stupid_instanced(Gwn_Batch*, Gwn_Batch*, int instance_first, int instance_count);
void GWN_batch_draw_procedural(Gwn_Batch*, Gwn_PrimType, int v_count);
#define GWN_batch_draw_range(batch, first, count) \
GWN_batch_draw_range_ex(batch, first, count, false)
// Does not even need batch
void GWN_draw_primitive(Gwn_PrimType, int v_count);
#if 0 // future plans

View File

@ -25,10 +25,6 @@ extern "C" {
GLuint GWN_buf_id_alloc(void);
void GWN_buf_id_free(GLuint buffer_id);
GLuint GWN_vao_alloc(void);
void GWN_vao_free(GLuint vao_id);
#ifdef __cplusplus
}
#endif

View File

@ -54,6 +54,7 @@ typedef struct Gwn_ShaderInput {
} Gwn_ShaderInput;
#define GWN_NUM_SHADERINTERFACE_BUCKETS 257
#define GWN_SHADERINTERFACE_REF_ALLOC_COUNT 16
typedef struct Gwn_ShaderInterface {
GLint program;
@ -63,6 +64,8 @@ typedef struct Gwn_ShaderInterface {
Gwn_ShaderInput* ubo_buckets[GWN_NUM_SHADERINTERFACE_BUCKETS];
Gwn_ShaderInput* builtin_uniforms[GWN_NUM_UNIFORMS];
char* name_buffer;
struct Gwn_Batch** batches; // references to batches using this interface
unsigned batches_ct;
} Gwn_ShaderInterface;
Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program_id);
@ -72,3 +75,7 @@ const Gwn_ShaderInput* GWN_shaderinterface_uniform(const Gwn_ShaderInterface*, c
const Gwn_ShaderInput* GWN_shaderinterface_uniform_builtin(const Gwn_ShaderInterface*, Gwn_UniformBuiltin);
const Gwn_ShaderInput* GWN_shaderinterface_ubo(const Gwn_ShaderInterface*, const char* name);
const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface*, const char* name);
// keep track of batches using this interface
void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);

View File

@ -26,8 +26,8 @@ extern "C" {
#include "gwn_context.h"
GLuint GWN_vao_default(void);
GLuint GWN_vao_alloc_new(void);
void GWN_vao_free_new(GLuint vao_id, Gwn_Context*);
GLuint GWN_vao_alloc(void);
void GWN_vao_free(GLuint vao_id, Gwn_Context*);
#ifdef __cplusplus
}

View File

@ -11,12 +11,48 @@
#include "gwn_batch.h"
#include "gwn_buffer_id.h"
#include "gwn_vertex_array_id.h"
#include "gwn_primitive_private.h"
#include <stdlib.h>
#include <string.h>
// necessary functions from matrix API
extern void gpuBindMatrices(const Gwn_ShaderInterface* shaderface);
extern bool gpuMatricesDirty(void); // how best to use this here?
static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first);
static void Batch_vao_cache_clear(Gwn_Batch* batch)
{
if (batch->is_dynamic_vao_count)
{
for (int i = 0; i < batch->dynamic_vaos.count; ++i)
{
if (batch->dynamic_vaos.vao_ids[i])
GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
if (batch->dynamic_vaos.interfaces[i])
GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->dynamic_vaos.interfaces[i], batch);
}
free(batch->dynamic_vaos.interfaces);
free(batch->dynamic_vaos.vao_ids);
}
else
{
for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
{
if (batch->static_vaos.vao_ids[i])
GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
if (batch->static_vaos.interfaces[i])
GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->static_vaos.interfaces[i], batch);
}
}
batch->is_dynamic_vao_count = false;
for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
{
batch->static_vaos.vao_ids[i] = 0;
batch->static_vaos.interfaces[i] = NULL;
}
}
Gwn_Batch* GWN_batch_create_ex(
Gwn_PrimType prim_type, Gwn_VertBuf* verts, Gwn_IndexBuf* elem,
@ -40,11 +76,25 @@ void GWN_batch_init_ex(
batch->verts[0] = verts;
for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
batch->verts[v] = NULL;
batch->inst = NULL;
batch->elem = elem;
batch->prim_type = prim_type;
batch->gl_prim_type = convert_prim_type_to_gl(prim_type);
batch->phase = GWN_BATCH_READY_TO_DRAW;
batch->is_dynamic_vao_count = false;
batch->owns_flag = owns_flag;
batch->free_callback = NULL;
}
// This will share the VBOs with the new batch
Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src)
{
Gwn_Batch* batch = GWN_batch_create_ex(GWN_PRIM_POINTS, batch_src->verts[0], batch_src->elem, 0);
batch->gl_prim_type = batch_src->gl_prim_type;
for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
batch->verts[v] = batch_src->verts[v];
return batch;
}
void GWN_batch_discard(Gwn_Batch* batch)
@ -52,6 +102,9 @@ void GWN_batch_discard(Gwn_Batch* batch)
if (batch->owns_flag & GWN_BATCH_OWNS_INDEX)
GWN_indexbuf_discard(batch->elem);
if (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES)
GWN_vertbuf_discard(batch->inst);
if ((batch->owns_flag & ~GWN_BATCH_OWNS_INDEX) != 0)
{
for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
@ -63,12 +116,39 @@ void GWN_batch_discard(Gwn_Batch* batch)
}
}
if (batch->vao_id)
GWN_vao_free(batch->vao_id);
Batch_vao_cache_clear(batch);
if (batch->free_callback)
batch->free_callback(batch, batch->callback_data);
free(batch);
}
void GWN_batch_callback_free_set(Gwn_Batch* batch, void (*callback)(Gwn_Batch*, void*), void* user_data)
{
batch->free_callback = callback;
batch->callback_data = user_data;
}
void GWN_batch_instbuf_set(Gwn_Batch* batch, Gwn_VertBuf* inst, bool own_vbo)
{
#if TRUST_NO_ONE
assert(inst != NULL);
#endif
// redo the bindings
Batch_vao_cache_clear(batch);
if (batch->inst != NULL && (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES))
GWN_vertbuf_discard(batch->inst);
batch->inst = inst;
if (own_vbo)
batch->owns_flag |= GWN_BATCH_OWNS_INSTANCES;
else
batch->owns_flag &= ~GWN_BATCH_OWNS_INSTANCES;
}
int GWN_batch_vertbuf_add_ex(
Gwn_Batch* batch, Gwn_VertBuf* verts,
bool own_vbo)
@ -100,12 +180,96 @@ int GWN_batch_vertbuf_add_ex(
void GWN_batch_program_set(Gwn_Batch* batch, GLuint program, const Gwn_ShaderInterface* shaderface)
{
#if TRUST_NO_ONE
assert(glIsProgram(program));
assert(glIsProgram(shaderface->program));
assert(batch->program_in_use == 0);
#endif
batch->vao_id = 0;
batch->program = program;
batch->interface = shaderface;
batch->program_dirty = true;
// Search through cache
if (batch->is_dynamic_vao_count)
{
for (int i = 0; i < batch->dynamic_vaos.count && batch->vao_id == 0; ++i)
if (batch->dynamic_vaos.interfaces[i] == shaderface)
batch->vao_id = batch->dynamic_vaos.vao_ids[i];
}
else
{
for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN && batch->vao_id == 0; ++i)
if (batch->static_vaos.interfaces[i] == shaderface)
batch->vao_id = batch->static_vaos.vao_ids[i];
}
if (batch->vao_id == 0)
{
if (batch->context == NULL)
batch->context = GWN_context_active_get();
#if TRUST_NO_ONE && 0 // disabled until we use a separate single context for UI.
else // Make sure you are not trying to draw this batch in another context.
assert(batch->context == GWN_context_active_get());
#endif
// Cache miss, time to add a new entry!
if (!batch->is_dynamic_vao_count)
{
int i; // find first unused slot
for (i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
if (batch->static_vaos.vao_ids[i] == 0)
break;
if (i < GWN_BATCH_VAO_STATIC_LEN)
{
batch->static_vaos.interfaces[i] = shaderface;
batch->static_vaos.vao_ids[i] = batch->vao_id = GWN_vao_alloc();
}
else
{
// Not enough space, switch to dynamic.
batch->is_dynamic_vao_count = true;
// Erase previous entries, they will be added back if drawn again.
for (int j = 0; j < GWN_BATCH_VAO_STATIC_LEN; ++j)
{
GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface*)batch->static_vaos.interfaces[j], batch);
GWN_vao_free(batch->static_vaos.vao_ids[j], batch->context);
}
// Init dynamic arrays and let the branch below set the values.
batch->dynamic_vaos.count = GWN_BATCH_VAO_DYN_ALLOC_COUNT;
batch->dynamic_vaos.interfaces = calloc(batch->dynamic_vaos.count, sizeof(Gwn_ShaderInterface*));
batch->dynamic_vaos.vao_ids = calloc(batch->dynamic_vaos.count, sizeof(GLuint));
}
}
if (batch->is_dynamic_vao_count)
{
int i; // find first unused slot
for (i = 0; i < batch->dynamic_vaos.count; ++i)
if (batch->dynamic_vaos.vao_ids[i] == 0)
break;
if (i == batch->dynamic_vaos.count)
{
// Not enough space, realloc the array.
i = batch->dynamic_vaos.count;
batch->dynamic_vaos.count += GWN_BATCH_VAO_DYN_ALLOC_COUNT;
batch->dynamic_vaos.interfaces = realloc(batch->dynamic_vaos.interfaces, sizeof(Gwn_ShaderInterface*) * batch->dynamic_vaos.count);
batch->dynamic_vaos.vao_ids = realloc(batch->dynamic_vaos.vao_ids, sizeof(GLuint) * batch->dynamic_vaos.count);
memset(batch->dynamic_vaos.interfaces + i, 0, sizeof(Gwn_ShaderInterface*) * GWN_BATCH_VAO_DYN_ALLOC_COUNT);
memset(batch->dynamic_vaos.vao_ids + i, 0, sizeof(GLuint) * GWN_BATCH_VAO_DYN_ALLOC_COUNT);
}
batch->dynamic_vaos.interfaces[i] = shaderface;
batch->dynamic_vaos.vao_ids[i] = batch->vao_id = GWN_vao_alloc();
}
GWN_shaderinterface_add_batch_ref((Gwn_ShaderInterface*)shaderface, batch);
// We just got a fresh VAO we need to initialize it.
glBindVertexArray(batch->vao_id);
batch_update_program_bindings(batch, 0);
glBindVertexArray(0);
}
GWN_batch_program_use_begin(batch); // hack! to make Batch_Uniform* simpler
}
@ -118,94 +282,104 @@ void GWN_batch_program_unset(Gwn_Batch* batch)
batch->program_in_use = false;
}
static void create_bindings(Gwn_Batch* batch, const Gwn_ShaderInterface* interface, unsigned int v_first, const bool use_instancing)
void GWN_batch_remove_interface_ref(Gwn_Batch* batch, const Gwn_ShaderInterface* interface)
{
for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
if (batch->is_dynamic_vao_count)
{
Gwn_VertBuf* verts = batch->verts[v];
if (verts == NULL)
break;
const Gwn_VertFormat* format = &verts->format;
const unsigned attrib_ct = format->attrib_ct;
const unsigned stride = format->stride;
GWN_vertbuf_use(verts);
for (unsigned a_idx = 0; a_idx < attrib_ct; ++a_idx)
for (int i = 0; i < batch->dynamic_vaos.count; ++i)
{
const Gwn_VertAttr* a = format->attribs + a_idx;
const GLvoid* pointer = (const GLubyte*)0 + a->offset + v_first * stride;
for (unsigned n_idx = 0; n_idx < a->name_ct; ++n_idx)
if (batch->dynamic_vaos.interfaces[i] == interface)
{
const Gwn_ShaderInput* input = GWN_shaderinterface_attr(interface, a->name[n_idx]);
GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
batch->dynamic_vaos.vao_ids[i] = 0;
batch->dynamic_vaos.interfaces[i] = NULL;
break; // cannot have duplicates
}
}
}
else
{
int i;
for (i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
{
if (batch->static_vaos.interfaces[i] == interface)
{
GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
batch->static_vaos.vao_ids[i] = 0;
batch->static_vaos.interfaces[i] = NULL;
break; // cannot have duplicates
}
}
}
}
if (input == NULL) continue;
static void create_bindings(Gwn_VertBuf* verts, const Gwn_ShaderInterface* interface, unsigned int v_first, const bool use_instancing)
{
const Gwn_VertFormat* format = &verts->format;
if (a->comp_ct == 16 || a->comp_ct == 12 || a->comp_ct == 8)
{
const unsigned attrib_ct = format->attrib_ct;
const unsigned stride = format->stride;
GWN_vertbuf_use(verts);
for (unsigned a_idx = 0; a_idx < attrib_ct; ++a_idx)
{
const Gwn_VertAttr* a = format->attribs + a_idx;
const GLvoid* pointer = (const GLubyte*)0 + a->offset + v_first * stride;
for (unsigned n_idx = 0; n_idx < a->name_ct; ++n_idx)
{
const Gwn_ShaderInput* input = GWN_shaderinterface_attr(interface, a->name[n_idx]);
if (input == NULL) continue;
if (a->comp_ct == 16 || a->comp_ct == 12 || a->comp_ct == 8)
{
#if TRUST_NO_ONE
assert(a->fetch_mode == GWN_FETCH_FLOAT);
assert(a->gl_comp_type == GL_FLOAT);
assert(a->fetch_mode == GWN_FETCH_FLOAT);
assert(a->gl_comp_type == GL_FLOAT);
#endif
for (int i = 0; i < a->comp_ct / 4; ++i)
{
glEnableVertexAttribArray(input->location + i);
glVertexAttribDivisor(input->location + i, (use_instancing) ? 1 : 0);
glVertexAttribPointer(input->location + i, 4, a->gl_comp_type, GL_FALSE, stride,
(const GLubyte*)pointer + i * 16);
}
}
else
for (int i = 0; i < a->comp_ct / 4; ++i)
{
glEnableVertexAttribArray(input->location);
glVertexAttribDivisor(input->location, (use_instancing) ? 1 : 0);
glEnableVertexAttribArray(input->location + i);
glVertexAttribDivisor(input->location + i, (use_instancing) ? 1 : 0);
glVertexAttribPointer(input->location + i, 4, a->gl_comp_type, GL_FALSE, stride,
(const GLubyte*)pointer + i * 16);
}
}
else
{
glEnableVertexAttribArray(input->location);
glVertexAttribDivisor(input->location, (use_instancing) ? 1 : 0);
switch (a->fetch_mode)
{
case GWN_FETCH_FLOAT:
case GWN_FETCH_INT_TO_FLOAT:
glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_FALSE, stride, pointer);
break;
case GWN_FETCH_INT_TO_FLOAT_UNIT:
glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_TRUE, stride, pointer);
break;
case GWN_FETCH_INT:
glVertexAttribIPointer(input->location, a->comp_ct, a->gl_comp_type, stride, pointer);
}
switch (a->fetch_mode)
{
case GWN_FETCH_FLOAT:
case GWN_FETCH_INT_TO_FLOAT:
glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_FALSE, stride, pointer);
break;
case GWN_FETCH_INT_TO_FLOAT_UNIT:
glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_TRUE, stride, pointer);
break;
case GWN_FETCH_INT:
glVertexAttribIPointer(input->location, a->comp_ct, a->gl_comp_type, stride, pointer);
}
}
}
}
}
static void Batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first)
static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first)
{
// disable all as a precaution
// why are we not using prev_attrib_enabled_bits?? see immediate.c
for (unsigned a_idx = 0; a_idx < GWN_VERT_ATTR_MAX_LEN; ++a_idx)
glDisableVertexAttribArray(a_idx);
for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN && batch->verts[v] != NULL; ++v)
create_bindings(batch->verts[v], batch->interface, (batch->inst) ? 0 : v_first, false);
create_bindings(batch, batch->interface, v_first, false);
if (batch->inst)
create_bindings(batch->inst, batch->interface, v_first, true);
batch->program_dirty = false;
}
static void Batch_update_program_bindings_instancing(Gwn_Batch* batch, Gwn_Batch* batch_instancing, unsigned int instance_first)
{
// disable all as a precaution
// why are we not using prev_attrib_enabled_bits?? see immediate.c
for (unsigned a_idx = 0; a_idx < GWN_VERT_ATTR_MAX_LEN; ++a_idx)
glDisableVertexAttribArray(a_idx);
create_bindings(batch, batch->interface, 0, false);
if (batch_instancing)
create_bindings(batch_instancing, batch->interface, instance_first, true);
batch->program_dirty = false;
if (batch->elem)
GWN_indexbuf_use(batch->elem);
}
void GWN_batch_program_use_begin(Gwn_Batch* batch)
@ -290,142 +464,86 @@ void GWN_batch_uniform_4fv(Gwn_Batch* batch, const char* name, const float data[
glUniform4fv(uniform->location, 1, data);
}
static void Batch_prime(Gwn_Batch* batch)
{
batch->vao_id = GWN_vao_alloc();
glBindVertexArray(batch->vao_id);
for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
{
if (batch->verts[v] == NULL)
break;
GWN_vertbuf_use(batch->verts[v]);
}
if (batch->elem)
GWN_indexbuf_use(batch->elem);
// vertex attribs and element list remain bound to this VAO
}
void GWN_batch_draw(Gwn_Batch* batch)
{
#if TRUST_NO_ONE
assert(batch->phase == GWN_BATCH_READY_TO_DRAW);
assert(glIsProgram(batch->program));
assert(batch->verts[0]->vbo_id != 0);
#endif
if (batch->vao_id)
glBindVertexArray(batch->vao_id);
else
Batch_prime(batch);
if (batch->program_dirty)
Batch_update_program_bindings(batch, 0);
GWN_batch_program_use_begin(batch);
gpuBindMatrices(batch->interface); // external call.
gpuBindMatrices(batch->interface);
if (batch->elem)
{
const Gwn_IndexBuf* el = batch->elem;
#if GWN_TRACK_INDEX_RANGE
if (el->base_index)
glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, el->index_ct, el->gl_index_type, 0, el->base_index);
else
glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, el->index_ct, el->gl_index_type, 0);
#else
glDrawElements(batch->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0);
#endif
}
else
glDrawArrays(batch->gl_prim_type, 0, batch->verts[0]->vertex_ct);
GWN_batch_draw_range_ex(batch, 0, 0, false);
GWN_batch_program_use_end(batch);
glBindVertexArray(0);
}
void GWN_batch_draw_stupid(Gwn_Batch* batch, int v_first, int v_count)
{
if (batch->vao_id)
glBindVertexArray(batch->vao_id);
else
Batch_prime(batch);
if (batch->program_dirty)
Batch_update_program_bindings(batch, v_first);
// GWN_batch_program_use_begin(batch);
//gpuBindMatrices(batch->program);
// Infer length if vertex count is not given
if (v_count == 0)
v_count = (batch->elem) ? batch->elem->index_ct : batch->verts[0]->vertex_ct;
if (batch->elem)
{
const Gwn_IndexBuf* el = batch->elem;
#if GWN_TRACK_INDEX_RANGE
if (el->base_index)
glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0, el->base_index);
else
glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0);
#else
glDrawElements(batch->gl_prim_type, v_count, GL_UNSIGNED_INT, 0);
#endif
}
else
glDrawArrays(batch->gl_prim_type, 0, v_count);
// GWN_batch_program_use_end(batch);
glBindVertexArray(0);
}
void GWN_batch_draw_stupid_instanced(Gwn_Batch* batch_instanced, Gwn_Batch* batch_instancing, int instance_first, int instance_count)
void GWN_batch_draw_range_ex(Gwn_Batch* batch, int v_first, int v_count, bool force_instance)
{
#if TRUST_NO_ONE
// batch_instancing can be null if the number of instances is specified.
assert(batch_instancing != NULL || instance_count != 0);
assert(!(force_instance && (batch->inst == NULL)) || v_count > 0); // we cannot infer length if force_instance
#endif
if (batch_instanced->vao_id)
glBindVertexArray(batch_instanced->vao_id);
else
Batch_prime(batch_instanced);
if (batch_instanced->program_dirty)
Batch_update_program_bindings_instancing(batch_instanced, batch_instancing, instance_first);
if (instance_count == 0)
instance_count = batch_instancing->verts[0]->vertex_ct;
if (batch_instanced->elem)
// If using offset drawing, use the default VAO and redo bindings.
if (v_first != 0)
{
const Gwn_IndexBuf* el = batch_instanced->elem;
#if GWN_TRACK_INDEX_RANGE
glDrawElementsInstancedBaseVertex(batch_instanced->gl_prim_type, el->index_ct, el->gl_index_type, 0, instance_count, el->base_index);
#else
glDrawElementsInstanced(batch_instanced->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0, instance_count);
#endif
glBindVertexArray(GWN_vao_default());
batch_update_program_bindings(batch, v_first);
}
else
glDrawArraysInstanced(batch_instanced->gl_prim_type, 0, batch_instanced->verts[0]->vertex_ct, instance_count);
glBindVertexArray(batch->vao_id);
if (force_instance || batch->inst)
{
// Infer length if vertex count is not given
if (v_count == 0)
v_count = batch->inst->vertex_ct;
if (batch->elem)
{
const Gwn_IndexBuf* el = batch->elem;
#if GWN_TRACK_INDEX_RANGE
glDrawElementsInstancedBaseVertex(batch->gl_prim_type, el->index_ct, el->gl_index_type, 0, v_count, el->base_index);
#else
glDrawElementsInstanced(batch->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0, v_count);
#endif
}
else
glDrawArraysInstanced(batch->gl_prim_type, 0, batch->verts[0]->vertex_ct, v_count);
}
else
{
// Infer length if vertex count is not given
if (v_count == 0)
v_count = (batch->elem) ? batch->elem->index_ct : batch->verts[0]->vertex_ct;
if (batch->elem)
{
const Gwn_IndexBuf* el = batch->elem;
#if GWN_TRACK_INDEX_RANGE
if (el->base_index)
glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0, el->base_index);
else
glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0);
#else
glDrawElements(batch->gl_prim_type, v_count, GL_UNSIGNED_INT, 0);
#endif
}
else
glDrawArrays(batch->gl_prim_type, 0, v_count);
}
glBindVertexArray(0);
}
// just draw some vertices and let shader place them where we want.
void GWN_batch_draw_procedural(Gwn_Batch* batch, Gwn_PrimType prim_type, int v_count)
void GWN_draw_primitive(Gwn_PrimType prim_type, int v_count)
{
// we cannot draw without vao ... annoying ...
if (batch->vao_id)
glBindVertexArray(batch->vao_id);
else
Batch_prime(batch);
glBindVertexArray(GWN_vao_default());
GLenum type = convert_prim_type_to_gl(prim_type);
glDrawArrays(type, 0, v_count);

View File

@ -20,7 +20,6 @@
#endif
static std::vector<GLuint> orphaned_buffer_ids;
static std::vector<GLuint> orphaned_vao_ids;
static std::mutex orphan_mutex;
@ -36,10 +35,6 @@ static bool thread_is_main()
GLuint GWN_buf_id_alloc()
{
#if TRUST_NO_ONE
assert(thread_is_main());
#endif
// delete orphaned IDs
orphan_mutex.lock();
if (!orphaned_buffer_ids.empty())
@ -73,43 +68,3 @@ void GWN_buf_id_free(GLuint buffer_id)
orphan_mutex.unlock();
}
}
GLuint GWN_vao_alloc()
{
#if TRUST_NO_ONE
assert(thread_is_main());
#endif
// delete orphaned IDs
orphan_mutex.lock();
if (!orphaned_vao_ids.empty())
{
const auto orphaned_vao_ct = (unsigned)orphaned_vao_ids.size();
#if ORPHAN_DEBUG
printf("deleting %u orphaned VAO%s\n", orphaned_vao_ct, orphaned_vao_ct == 1 ? "" : "s");
#endif
glDeleteVertexArrays(orphaned_vao_ct, orphaned_vao_ids.data());
orphaned_vao_ids.clear();
}
orphan_mutex.unlock();
GLuint new_vao_id = 0;
glGenVertexArrays(1, &new_vao_id);
return new_vao_id;
}
void GWN_vao_free(GLuint vao_id)
{
if (thread_is_main())
glDeleteVertexArrays(1, &vao_id);
else
{
// add this ID to the orphaned list
orphan_mutex.lock();
#if ORPHAN_DEBUG
printf("orphaning VAO %u\n", vao_id);
#endif
orphaned_vao_ids.emplace_back(vao_id);
orphan_mutex.unlock();
}
}

View File

@ -14,6 +14,7 @@
#include "gwn_attr_binding.h"
#include "gwn_attr_binding_private.h"
#include "gwn_vertex_format_private.h"
#include "gwn_vertex_array_id.h"
#include "gwn_primitive_private.h"
#include <string.h>
@ -27,6 +28,7 @@ typedef struct {
#if IMM_BATCH_COMBO
Gwn_Batch* batch;
#endif
Gwn_Context* context;
// current draw call
GLubyte* buffer_data;
@ -86,8 +88,8 @@ void immActivate(void)
assert(imm.prim_type == GWN_PRIM_NONE); // make sure we're not between a Begin/End pair
assert(imm.vao_id == 0);
#endif
imm.vao_id = GWN_vao_alloc();
imm.context = GWN_context_active_get();
}
void immDeactivate(void)
@ -97,8 +99,7 @@ void immDeactivate(void)
assert(imm.prim_type == GWN_PRIM_NONE); // make sure we're not between a Begin/End pair
assert(imm.vao_id != 0);
#endif
GWN_vao_free(imm.vao_id);
GWN_vao_free(imm.vao_id, imm.context);
imm.vao_id = 0;
imm.prev_enabled_attrib_bits = 0;
}

View File

@ -10,6 +10,7 @@
// the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
#include "gwn_shader_interface.h"
#include "gwn_vertex_array_id.h"
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
@ -263,6 +264,10 @@ Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program)
#endif
}
// Batches ref buffer
shaderface->batches_ct = GWN_SHADERINTERFACE_REF_ALLOC_COUNT;
shaderface->batches = calloc(shaderface->batches_ct, sizeof(Gwn_Batch*));
return shaderface;
}
@ -274,6 +279,12 @@ void GWN_shaderinterface_discard(Gwn_ShaderInterface* shaderface)
buckets_free(shaderface->ubo_buckets);
// Free memory used by name_buffer.
free(shaderface->name_buffer);
// Remove this interface from all linked Batches vao cache.
for (int i = 0; i < shaderface->batches_ct; ++i)
if (shaderface->batches[i] != NULL)
GWN_batch_remove_interface_ref(shaderface->batches[i], shaderface);
free(shaderface->batches);
// Free memory used by the shader interface itself.
free(shaderface);
}
@ -316,3 +327,34 @@ const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface* shade
{
return buckets_lookup(shaderface->attrib_buckets, shaderface->name_buffer, name);
}
void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface* shaderface, Gwn_Batch* batch)
{
int i; // find first unused slot
for (i = 0; i < shaderface->batches_ct; ++i)
if (shaderface->batches[i] == NULL)
break;
if (i == shaderface->batches_ct)
{
// Not enough space, realloc the array.
i = shaderface->batches_ct;
shaderface->batches_ct += GWN_SHADERINTERFACE_REF_ALLOC_COUNT;
shaderface->batches = realloc(shaderface->batches, sizeof(Gwn_Batch*) * shaderface->batches_ct);
memset(shaderface->batches + i, 0, sizeof(Gwn_Batch*) * GWN_SHADERINTERFACE_REF_ALLOC_COUNT);
}
shaderface->batches[i] = batch;
}
void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface* shaderface, Gwn_Batch* batch)
{
for (int i = 0; i < shaderface->batches_ct; ++i)
{
if (shaderface->batches[i] == batch)
{
shaderface->batches[i] = NULL;
break; // cannot have duplicates
}
}
}

View File

@ -109,7 +109,7 @@ GLuint GWN_vao_default(void)
return active_ctx->default_vao;
}
GLuint GWN_vao_alloc_new(void)
GLuint GWN_vao_alloc(void)
{
#if TRUST_NO_ONE
assert(active_ctx); // need at least an active context
@ -123,7 +123,7 @@ GLuint GWN_vao_alloc_new(void)
}
// this can be called from multiple thread
void GWN_vao_free_new(GLuint vao_id, Gwn_Context* ctx)
void GWN_vao_free(GLuint vao_id, Gwn_Context* ctx)
{
if (ctx == active_ctx)
glDeleteVertexArrays(1, &vao_id);

View File

@ -341,7 +341,7 @@ typedef void (DRWCallGenerateFn)(
void (*draw_fn)(DRWShadingGroup *shgroup, struct Gwn_Batch *geom),
void *user_data);
void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances);
void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch);
void DRW_shgroup_free(struct DRWShadingGroup *shgroup);
void DRW_shgroup_call_add(DRWShadingGroup *shgroup, struct Gwn_Batch *geom, float (*obmat)[4]);

View File

@ -42,12 +42,29 @@
#define BUFFER_CHUNK_SIZE 32
#define BUFFER_VERTS_CHUNK 32
typedef struct DRWInstanceBuffer {
typedef struct DRWBatchingBuffer {
struct DRWShadingGroup *shgroup; /* Link back to the owning shGroup. Also tells if it's used */
Gwn_VertFormat *format; /* Identifier. */
Gwn_VertBuf *vert; /* Gwn_VertBuf contained in the Gwn_Batch. */
Gwn_Batch *batch; /* Gwn_Batch containing the Gwn_VertBuf. */
} DRWInstanceBuffer;
} DRWBatchingBuffer;
typedef struct DRWInstancingBuffer {
struct DRWShadingGroup *shgroup; /* Link back to the owning shGroup. Also tells if it's used */
Gwn_VertFormat *format; /* Identifier. */
Gwn_Batch *instance; /* Identifier. */
Gwn_VertBuf *vert; /* Gwn_VertBuf contained in the Gwn_Batch. */
Gwn_Batch *batch; /* Gwn_Batch containing the Gwn_VertBuf. */
} DRWInstancingBuffer;
typedef struct DRWInstanceChunk {
size_t cursor; /* Offset to the next instance data. */
size_t alloc_size; /* Number of DRWBatchingBuffer/Batches alloc'd in ibufs/btchs. */
union {
DRWBatchingBuffer *bbufs;
DRWInstancingBuffer *ibufs;
};
} DRWInstanceChunk;
struct DRWInstanceData {
struct DRWInstanceData *next;
@ -60,19 +77,19 @@ struct DRWInstanceData {
};
struct DRWInstanceDataList {
struct DRWInstanceDataList *next, *prev;
/* Linked lists for all possible data pool size */
/* Not entirely sure if we should separate them in the first place.
* This is done to minimize the reattribution misses. */
DRWInstanceData *idata_head[MAX_INSTANCE_DATA_SIZE];
DRWInstanceData *idata_tail[MAX_INSTANCE_DATA_SIZE];
struct {
size_t cursor; /* Offset to the next instance data. */
size_t alloc_size; /* Number of DRWInstanceBuffer alloc'd in ibufs. */
DRWInstanceBuffer *ibufs;
} ibuffers;
DRWInstanceChunk instancing;
DRWInstanceChunk batching;
};
static ListBase g_idatalists = {NULL, NULL};
/* -------------------------------------------------------------------- */
/** \name Instance Buffer Management
@ -87,89 +104,174 @@ struct DRWInstanceDataList {
* that would be too slow]).
**/
void DRW_instance_buffer_request(
DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type)
static void instance_batch_free(Gwn_Batch *batch, void *UNUSED(user_data))
{
BLI_assert(format);
DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
int first_non_alloced = -1;
/* Search for an unused batch. */
for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
if (ibuf->shgroup == NULL) {
if (ibuf->format == format) {
ibuf->shgroup = shgroup;
*r_batch = ibuf->batch;
*r_vert = ibuf->vert;
return;
}
else if (ibuf->format == NULL && first_non_alloced == -1) {
first_non_alloced = i;
/* Free all batches that have the same key before they are reused. */
/* TODO: Make it thread safe! Batch freeing can happen from another thread. */
/* XXX we need to iterate over all idatalists unless we make some smart
* data structure to store the locations to update. */
for (DRWInstanceDataList *idatalist = g_idatalists.first; idatalist; idatalist = idatalist->next) {
DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
if (ibuf->instance == batch) {
BLI_assert(ibuf->shgroup == NULL); /* Make sure it has no other users. */
GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
/* Tag as non alloced. */
ibuf->format = NULL;
}
}
}
}
if (first_non_alloced == -1) {
/* There is no batch left. Allocate more. */
first_non_alloced = idatalist->ibuffers.alloc_size;
idatalist->ibuffers.alloc_size += BUFFER_CHUNK_SIZE;
idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
idatalist->ibuffers.alloc_size * sizeof(DRWInstanceBuffer));
/* Clear new part of the memory. */
memset(idatalist->ibuffers.ibufs + first_non_alloced, 0, sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE);
void DRW_batching_buffer_request(
DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
{
DRWInstanceChunk *chunk = &idatalist->batching;
DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
BLI_assert(format);
/* Search for an unused batch. */
for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
if (bbuf->shgroup == NULL) {
if (bbuf->format == format) {
bbuf->shgroup = shgroup;
*r_batch = bbuf->batch;
*r_vert = bbuf->vert;
return;
}
}
}
int new_id = 0; /* Find insertion point. */
for (; new_id < chunk->alloc_size; ++new_id) {
if (chunk->bbufs[new_id].format == NULL)
break;
}
/* If there is no batch left. Allocate more. */
if (new_id == chunk->alloc_size) {
new_id = chunk->alloc_size;
chunk->alloc_size += BUFFER_CHUNK_SIZE;
chunk->bbufs = MEM_reallocN(chunk->bbufs, chunk->alloc_size * sizeof(DRWBatchingBuffer));
memset(chunk->bbufs + new_id, 0, sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE);
}
/* Create the batch. */
ibuf = idatalist->ibuffers.ibufs + first_non_alloced;
bbuf = chunk->bbufs + new_id;
bbuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
bbuf->batch = *r_batch = GWN_batch_create_ex(type, bbuf->vert, NULL, 0);
bbuf->format = format;
bbuf->shgroup = shgroup;
GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
}
void DRW_instancing_buffer_request(
DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
{
DRWInstanceChunk *chunk = &idatalist->instancing;
DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
BLI_assert(format);
/* Search for an unused batch. */
for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
if (ibuf->shgroup == NULL) {
if (ibuf->format == format) {
if (ibuf->instance == instance) {
ibuf->shgroup = shgroup;
*r_batch = ibuf->batch;
*r_vert = ibuf->vert;
return;
}
}
}
}
int new_id = 0; /* Find insertion point. */
for (; new_id < chunk->alloc_size; ++new_id) {
if (chunk->ibufs[new_id].format == NULL)
break;
}
/* If there is no batch left. Allocate more. */
if (new_id == chunk->alloc_size) {
new_id = chunk->alloc_size;
chunk->alloc_size += BUFFER_CHUNK_SIZE;
chunk->ibufs = MEM_reallocN(chunk->ibufs, chunk->alloc_size * sizeof(DRWInstancingBuffer));
memset(chunk->ibufs + new_id, 0, sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE);
}
/* Create the batch. */
ibuf = chunk->ibufs + new_id;
ibuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
ibuf->batch = *r_batch = GWN_batch_create_ex(type, ibuf->vert, NULL, GWN_BATCH_OWNS_VBO);
ibuf->batch = *r_batch = GWN_batch_duplicate(instance);
ibuf->format = format;
ibuf->shgroup = shgroup;
ibuf->instance = instance;
GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
GWN_batch_instbuf_set(ibuf->batch, ibuf->vert, false);
/* Make sure to free this ibuf if the instance batch gets freed. */
GWN_batch_callback_free_set(instance, &instance_batch_free, NULL);
}
void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
{
DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
size_t minimum_alloc_size = 1; /* Avoid 0 size realloc. */
size_t realloc_size = 1; /* Avoid 0 size realloc. */
/* Resize down buffers in use and send data to GPU & free unused buffers. */
for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
DRWInstanceChunk *batching = &idatalist->batching;
DRWBatchingBuffer *bbuf = batching->bbufs;
for (int i = 0; i < batching->alloc_size; i++, bbuf++) {
if (bbuf->shgroup != NULL) {
realloc_size = i + 1;
unsigned int vert_ct = DRW_shgroup_get_instance_count(bbuf->shgroup);
vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
if (vert_ct + BUFFER_VERTS_CHUNK <= bbuf->vert->vertex_ct) {
unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
size = size - size % BUFFER_VERTS_CHUNK;
GWN_vertbuf_data_resize(bbuf->vert, size);
}
GWN_vertbuf_use(bbuf->vert); /* Send data. */
bbuf->shgroup = NULL; /* Set as non used for the next round. */
}
else {
GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
GWN_BATCH_DISCARD_SAFE(bbuf->batch);
bbuf->format = NULL; /* Tag as non alloced. */
}
}
/* Rounding up to nearest chunk size. */
realloc_size += BUFFER_CHUNK_SIZE - 1;
realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
/* Resize down if necessary. */
if (realloc_size < batching->alloc_size) {
batching->alloc_size = realloc_size;
batching->ibufs = MEM_reallocN(batching->ibufs, realloc_size * sizeof(DRWBatchingBuffer));
}
realloc_size = 1;
/* Resize down buffers in use and send data to GPU & free unused buffers. */
DRWInstanceChunk *instancing = &idatalist->instancing;
DRWInstancingBuffer *ibuf = instancing->ibufs;
for (int i = 0; i < instancing->alloc_size; i++, ibuf++) {
if (ibuf->shgroup != NULL) {
minimum_alloc_size = i + 1;
realloc_size = i + 1;
unsigned int vert_ct = DRW_shgroup_get_instance_count(ibuf->shgroup);
/* Do not realloc to 0 size buffer */
vert_ct += (vert_ct == 0) ? 1 : 0;
/* Resize buffer to reclaim space. */
vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
if (vert_ct + BUFFER_VERTS_CHUNK <= ibuf->vert->vertex_ct) {
unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
size = size - size % BUFFER_VERTS_CHUNK;
GWN_vertbuf_data_resize(ibuf->vert, size);
}
/* Send data. */
GWN_vertbuf_use(ibuf->vert);
/* Set as non used for the next round. */
ibuf->shgroup = NULL;
GWN_vertbuf_use(ibuf->vert); /* Send data. */
ibuf->shgroup = NULL; /* Set as non used for the next round. */
}
else {
GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
/* Tag as non alloced. */
ibuf->format = NULL;
ibuf->format = NULL; /* Tag as non alloced. */
}
}
/* Resize down the handle buffer (ibuffers). */
/* Rounding up to nearest chunk size. */
minimum_alloc_size += BUFFER_CHUNK_SIZE - 1;
minimum_alloc_size -= minimum_alloc_size % BUFFER_CHUNK_SIZE;
realloc_size += BUFFER_CHUNK_SIZE - 1;
realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
/* Resize down if necessary. */
if (minimum_alloc_size < idatalist->ibuffers.alloc_size) {
idatalist->ibuffers.alloc_size = minimum_alloc_size;
idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
minimum_alloc_size * sizeof(DRWInstanceBuffer));
if (realloc_size < instancing->alloc_size) {
instancing->alloc_size = realloc_size;
instancing->ibufs = MEM_reallocN(instancing->ibufs, realloc_size * sizeof(DRWInstancingBuffer));
}
}
@ -183,7 +285,7 @@ void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
static DRWInstanceData *drw_instance_data_create(
DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group)
{
DRWInstanceData *idata = MEM_mallocN(sizeof(DRWInstanceData), "DRWInstanceData");
DRWInstanceData *idata = MEM_callocN(sizeof(DRWInstanceData), "DRWInstanceData");
idata->next = NULL;
idata->used = true;
idata->data_size = attrib_size;
@ -263,15 +365,18 @@ DRWInstanceData *DRW_instance_data_request(
DRWInstanceDataList *DRW_instance_data_list_create(void)
{
DRWInstanceDataList *idatalist = MEM_callocN(sizeof(DRWInstanceDataList), "DRWInstanceDataList");
idatalist->ibuffers.ibufs = MEM_callocN(sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE, "DRWInstanceBuffers");
idatalist->ibuffers.alloc_size = BUFFER_CHUNK_SIZE;
idatalist->batching.bbufs = MEM_callocN(sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE, "DRWBatchingBuffers");
idatalist->batching.alloc_size = BUFFER_CHUNK_SIZE;
idatalist->instancing.ibufs = MEM_callocN(sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE, "DRWInstancingBuffers");
idatalist->instancing.alloc_size = BUFFER_CHUNK_SIZE;
BLI_addtail(&g_idatalists, idatalist);
return idatalist;
}
void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
{
DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
DRWInstanceData *idata, *next_idata;
for (int i = 0; i < MAX_INSTANCE_DATA_SIZE; ++i) {
@ -284,10 +389,21 @@ void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
idatalist->idata_tail[i] = NULL;
}
for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
GWN_BATCH_DISCARD_SAFE(bbuf->batch);
}
MEM_freeN(idatalist->batching.bbufs);
DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
}
MEM_freeN(idatalist->ibuffers.ibufs);
MEM_freeN(idatalist->instancing.ibufs);
BLI_remlink(&g_idatalists, idatalist);
}
void DRW_instance_data_list_reset(DRWInstanceDataList *idatalist)

View File

@ -43,9 +43,12 @@ void *DRW_instance_data_get(DRWInstanceData *idata);
DRWInstanceData *DRW_instance_data_request(
DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group);
void DRW_instance_buffer_request(
DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type);
void DRW_batching_buffer_request(
DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
void DRW_instancing_buffer_request(
DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
/* Upload all instance data to the GPU as soon as possible. */
void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist);

View File

@ -665,6 +665,24 @@ static void drw_interface_init(DRWInterface *interface, GPUShader *shader)
}
static void drw_interface_instance_init(
DRWShadingGroup *shgroup, GPUShader *shader, Gwn_Batch *batch, Gwn_VertFormat *format)
{
DRWInterface *interface = &shgroup->interface;
drw_interface_init(interface, shader);
#ifndef NDEBUG
interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
#endif
BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
BLI_assert(shgroup->instance_geom != NULL);
if (format != NULL) {
DRW_instancing_buffer_request(DST.idatalist, format, batch, shgroup,
&shgroup->instancing_geom, &interface->instance_vbo);
}
}
static void drw_interface_batching_init(
DRWShadingGroup *shgroup, GPUShader *shader, Gwn_VertFormat *format)
{
DRWInterface *interface = &shgroup->interface;
@ -673,36 +691,19 @@ static void drw_interface_instance_init(
#ifndef NDEBUG
interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
#endif
BLI_assert(format != NULL);
Gwn_PrimType type;
Gwn_Batch **r_batch = NULL;
switch (shgroup->type) {
case DRW_SHG_INSTANCE:
r_batch = &shgroup->instancing_geom;
type = GWN_PRIM_POINTS;
break;
case DRW_SHG_POINT_BATCH:
r_batch = &shgroup->batch_geom;
type = GWN_PRIM_POINTS;
break;
case DRW_SHG_LINE_BATCH:
r_batch = &shgroup->batch_geom;
type = GWN_PRIM_LINES;
break;
case DRW_SHG_TRIANGLE_BATCH:
r_batch = &shgroup->batch_geom;
type = GWN_PRIM_TRIS;
break;
case DRW_SHG_POINT_BATCH: type = GWN_PRIM_POINTS; break;
case DRW_SHG_LINE_BATCH: type = GWN_PRIM_LINES; break;
case DRW_SHG_TRIANGLE_BATCH: type = GWN_PRIM_TRIS; break;
default:
BLI_assert(0);
}
if (format != NULL) {
DRW_instance_buffer_request(DST.idatalist, format, shgroup, r_batch, &interface->instance_vbo, type);
}
else {
*r_batch = NULL;
}
DRW_batching_buffer_request(DST.idatalist, format, type, shgroup,
&shgroup->batch_geom, &interface->instance_vbo);
}
static void drw_interface_uniform(DRWShadingGroup *shgroup, const char *name,
@ -882,7 +883,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
shgroup->type = DRW_SHG_INSTANCE;
shgroup->instance_geom = geom;
shgroup->instance_data = ob->data;
drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), format);
drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), geom, format);
drw_shgroup_material_inputs(shgroup, material, gpupass);
}
@ -890,7 +891,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
}
DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
struct GPUMaterial *material, DRWPass *pass, int size)
struct GPUMaterial *material, DRWPass *pass, int tri_count)
{
#ifdef USE_GPU_SELECT
BLI_assert((G.f & G_PICKSEL) == 0);
@ -899,10 +900,10 @@ DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
DRWShadingGroup *shgroup = drw_shgroup_material_create_ex(gpupass, pass);
if (shgroup) {
shgroup->type = DRW_SHG_TRIANGLE_BATCH;
shgroup->interface.instance_count = size * 3;
/* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
/* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
drw_interface_init(&shgroup->interface, GPU_pass_shader(gpupass));
shgroup->type = DRW_SHG_TRIANGLE_BATCH;
shgroup->interface.instance_count = tri_count * 3;
drw_shgroup_material_inputs(shgroup, material, gpupass);
}
@ -923,7 +924,7 @@ DRWShadingGroup *DRW_shgroup_instance_create(
shgroup->type = DRW_SHG_INSTANCE;
shgroup->instance_geom = geom;
drw_interface_instance_init(shgroup, shader, format);
drw_interface_instance_init(shgroup, shader, geom, format);
return shgroup;
}
@ -937,7 +938,7 @@ DRWShadingGroup *DRW_shgroup_point_batch_create(struct GPUShader *shader, DRWPas
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
shgroup->type = DRW_SHG_POINT_BATCH;
drw_interface_instance_init(shgroup, shader, g_pos_format);
drw_interface_batching_init(shgroup, shader, g_pos_format);
return shgroup;
}
@ -949,7 +950,7 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
shgroup->type = DRW_SHG_LINE_BATCH;
drw_interface_instance_init(shgroup, shader, g_pos_format);
drw_interface_batching_init(shgroup, shader, g_pos_format);
return shgroup;
}
@ -957,18 +958,18 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
/* Very special batch. Use this if you position
* your vertices with the vertex shader
* and dont need any VBO attrib */
DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int size)
DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int tri_count)
{
#ifdef USE_GPU_SELECT
BLI_assert((G.f & G_PICKSEL) == 0);
#endif
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
/* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
/* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
drw_interface_init(&shgroup->interface, shader);
shgroup->type = DRW_SHG_TRIANGLE_BATCH;
shgroup->interface.instance_count = size * 3;
shgroup->interface.instance_count = tri_count * 3;
return shgroup;
}
@ -991,13 +992,19 @@ void DRW_shgroup_free(struct DRWShadingGroup *UNUSED(shgroup))
} ((void)0)
/* Specify an external batch instead of adding each attrib one by one. */
void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances)
void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch)
{
BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
BLI_assert(shgroup->instancing_geom == NULL);
BLI_assert(shgroup->interface.instance_count == 0);
/* You cannot use external instancing batch without a dummy format. */
BLI_assert(shgroup->instancing_geom != NULL);
shgroup->type = DRW_SHG_INSTANCE_EXTERNAL;
shgroup->instancing_geom = instances;
/* PERF: This destroys the VAO cache, so better check whether it's necessary. */
/* Note: This WILL break if batch->verts[0] is destroyed and reallocated
* at the same address. Bindings/VAOs would remain obsolete. */
//if (shgroup->instancing_geom->inst != batch->verts[0])
GWN_batch_instbuf_set(shgroup->instancing_geom, batch->verts[0], false);
#ifdef USE_GPU_SELECT
DRWCall *call = BLI_mempool_alloc(DST.vmempool->calls);
@ -1140,8 +1147,6 @@ void DRW_shgroup_set_instance_count(DRWShadingGroup *shgroup, unsigned int count
unsigned int DRW_shgroup_get_instance_count(const DRWShadingGroup *shgroup)
{
BLI_assert(shgroup->type != DRW_SHG_NORMAL && shgroup->type != DRW_SHG_INSTANCE_EXTERNAL);
return shgroup->interface.instance_count;
}
@ -1765,18 +1770,17 @@ static void draw_geometry_execute_ex(
if (geom == NULL) {
BLI_assert(shgroup->type == DRW_SHG_TRIANGLE_BATCH); /* Add other type if needed. */
/* Shader is already bound. */
Gwn_Batch *batch = DRW_cache_fullscreen_quad_get();
GWN_batch_draw_procedural(batch, GWN_PRIM_TRIS, count);
GWN_draw_primitive(GWN_PRIM_TRIS, count);
return;
}
/* step 2 : bind vertex array & draw */
GWN_batch_program_set(geom, GPU_shader_get_program(shgroup->shader), GPU_shader_get_interface(shgroup->shader));
if (ELEM(shgroup->type, DRW_SHG_INSTANCE, DRW_SHG_INSTANCE_EXTERNAL)) {
GWN_batch_draw_stupid_instanced(geom, shgroup->instancing_geom, start, count);
GWN_batch_draw_range_ex(geom, start, count, true);
}
else {
GWN_batch_draw_stupid(geom, start, count);
GWN_batch_draw_range(geom, start, count);
}
/* XXX this just tells gawain we are done with the shader.
* This does not unbind the shader. */
@ -1998,7 +2002,7 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
if (shgroup->type == DRW_SHG_INSTANCE_EXTERNAL) {
if (shgroup->instancing_geom != NULL) {
GPU_SELECT_LOAD_IF_PICKSEL((DRWCall *)shgroup->calls_first);
draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, 0, 0);
draw_geometry(shgroup, shgroup->instancing_geom, obmat, shgroup->instance_data, 0, 0);
}
}
else {
@ -2006,13 +2010,15 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
unsigned int count, start;
GPU_SELECT_LOAD_IF_PICKSEL_LIST(shgroup, start, count)
{
draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, start, count);
draw_geometry(shgroup,
(shgroup->instancing_geom) ? shgroup->instancing_geom : shgroup->instance_geom,
obmat, shgroup->instance_data, start, count);
}
GPU_SELECT_LOAD_IF_PICKSEL_LIST_END(start, count)
}
}
}
else {
else { /* DRW_SHG_***_BATCH */
/* Some dynamic batch can have no geom (no call to aggregate) */
if (shgroup->interface.instance_count > 0) {
unsigned int count, start;

View File

@ -218,6 +218,7 @@ typedef struct OBJECT_PrivateData {
static struct {
/* Instance Data format */
struct Gwn_VertFormat *particle_format;
struct Gwn_VertFormat *empty_image_format;
struct Gwn_VertFormat *empty_image_wire_format;
@ -537,6 +538,7 @@ static void OBJECT_engine_init(void *vedata)
static void OBJECT_engine_free(void)
{
MEM_SAFE_FREE(e_data.particle_format);
MEM_SAFE_FREE(e_data.empty_image_format);
MEM_SAFE_FREE(e_data.empty_image_wire_format);
DRW_SHADER_FREE_SAFE(e_data.outline_resolve_sh);
@ -1752,6 +1754,9 @@ static void OBJECT_cache_populate_particles(Object *ob,
static float def_prim_col[3] = {0.5f, 0.5f, 0.5f};
static float def_sec_col[3] = {1.0f, 1.0f, 1.0f};
/* Dummy particle format for instancing to work. */
DRW_shgroup_instance_format(e_data.particle_format, {{"dummy", DRW_ATTRIB_FLOAT, 1}});
Material *ma = give_current_material(ob, part->omat);
switch (draw_as) {
@ -1766,21 +1771,24 @@ static void OBJECT_cache_populate_particles(Object *ob,
break;
case PART_DRAW_CROSS:
shgrp = DRW_shgroup_instance_create(
e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS), NULL);
e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS),
e_data.particle_format);
DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
break;
case PART_DRAW_CIRC:
shgrp = DRW_shgroup_instance_create(
e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC), NULL);
e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC),
e_data.particle_format);
DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[1], 1);
break;
case PART_DRAW_AXIS:
shgrp = DRW_shgroup_instance_create(
e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS), NULL);
e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS),
e_data.particle_format);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
break;
default: