LineArt: Use CAS for add_triangles.

Using the atomic "compare and swap" (CAS) method in the add_triangles stage dramatically speeds up the add_triangles call and significantly reduces the overall calculation time. This is the fastest method we have experimented with so far. Reviewed By: Sebastian Parborg (zeddb) Differential Revision: https://developer.blender.org/D14953
Referenced by commit 2655f47ca3, Revert "LineArt: Use CAS for add_triangles." Referenced by issue #87739, Line Art further improvement list
2022-05-19 18:56:50 +08:00 · 2022-05-19 18:56:50 +08:00 · 14a5a91e0e · 2023-02-14 09:36:46 +01:00
parent 54f357ed2a
commit 14a5a91e0e
4 changed files with 629 additions and 363 deletions
--- a/source/blender/gpencil_modifiers/CMakeLists.txt
+++ b/source/blender/gpencil_modifiers/CMakeLists.txt
@ -17,6 +17,7 @@ set(INC
  ../windowmanager
  ../../../intern/eigen
  ../../../intern/guardedalloc
+  ../../../intern/atomic

  # dna_type_offsets.h in BLO_read_write.h
  ${CMAKE_BINARY_DIR}/source/blender/makesdna/intern
--- a/source/blender/gpencil_modifiers/intern/lineart/MOD_lineart.h
+++ b/source/blender/gpencil_modifiers/intern/lineart/MOD_lineart.h
@ -236,6 +236,9 @@ typedef struct LineartRenderBuffer {
  ListBase line_buffer_pointers;
  ListBase triangle_buffer_pointers;

+  LineartElementLinkNode *isect_scheduled_up_to;
+  int isect_scheduled_up_to_index;
+
  /** This one's memory is not from main pool and is free()ed after culling stage. */
  ListBase triangle_adjacent_pointers;

@ -429,15 +432,19 @@ typedef struct LineartBoundingArea {
  /** 1,2,3,4 quadrant */
  struct LineartBoundingArea *child;

+  SpinLock lock;
+
  ListBase lp;
  ListBase rp;
  ListBase up;
  ListBase bp;

-  uint16_t triangle_count;
-  uint16_t max_triangle_count;
-  uint16_t line_count;
-  uint16_t max_line_count;
+  /* Need uint32 for the atomic cas instruction. */
+  uint32_t triangle_count;
+  uint32_t max_triangle_count;
+  uint32_t line_count;
+  uint32_t max_line_count;
+  uint32_t user_count;

  /* Use array for speeding up multiple accesses. */
  struct LineartTriangle **linked_triangles;
--- a/source/blender/gpencil_modifiers/intern/lineart/lineart_cpu.c
+++ b/source/blender/gpencil_modifiers/intern/lineart/lineart_cpu.c
--- a/source/blender/gpencil_modifiers/intern/lineart/lineart_intern.h
+++ b/source/blender/gpencil_modifiers/intern/lineart/lineart_intern.h
@ -82,7 +82,7 @@ void lineart_count_and_print_render_buffer_memory(struct LineartRenderBuffer *rb

 /* Initial bounding area row/column count, setting 4 is the simplest way algorithm could function
 * efficiently. */
-#define LRT_BA_ROWS 4
+#define LRT_BA_ROWS 10

 #ifdef __cplusplus
 extern "C" {
@ -93,3 +93,32 @@ void lineart_sort_adjacent_items(LineartAdjacentEdge *ai, int length);
 #ifdef __cplusplus
 }
 #endif
+
+#ifndef __cplusplus /* Atomic load/store compatibility macros, only defined for C. */
+
+#  if defined __has_include /* Use C11 atomics when <stdatomic.h> can be detected. */
+#    if __has_include(<stdatomic.h>)
+#      include <stdatomic.h>
+#      define lineart_atomic_load(p) atomic_load((volatile size_t *)p)
+#      define lineart_atomic_store(p, d) atomic_store((volatile size_t *)p, (size_t)d)
+#    endif
+#  endif
+
+#  ifdef _MSC_VER /* Atomics workaround for MSVC. NOTE(review): also runs if the C11 branch matched. */
+#    define WIN32_LEAN_AND_MEAN
+#    include <windows.h>
+#    define lineart_atomic_load(p) (MemoryBarrier(), *(p)) /* Barrier, then a plain read. */
+#    define lineart_atomic_store(p, d) \
+      do { \
+        *(p) = (d); \
+        MemoryBarrier(); \
+      } while (0)
+#  endif
+
+#  if !defined lineart_atomic_load /* Fallback when no branch above defined the macros. */
+#    include "atomic_ops.h"
+#    define lineart_atomic_load(p) atomic_add_and_fetch_z((size_t *)p, 0) /* Add 0 to read. */
+#    define lineart_atomic_store(p, d) atomic_add_and_fetch_z((size_t *)p, (size_t)d) /* NOTE(review): adds `d`; confirm this is a store. */
+#  endif
+
+#endif /* !__cplusplus */