LineArt: Use CAS for add_triangles.

Using the atomic "compare and swap" method in the add_triangles stage
dramatically speeds up the add_triangles call and significantly reduces
the overall calculation time. This is the fastest method we have
experimented with so far.

Reviewed By: Sebastian Parborg (zeddb)

Differential Revision: https://developer.blender.org/D14953
This commit is contained in:
YimingWu 2022-05-19 18:56:50 +08:00
parent 54f357ed2a
commit 14a5a91e0e
Notes: blender-bot 2023-02-14 09:36:46 +01:00
Referenced by commit 2655f47ca3, Revert "LineArt: Use CAS for add_triangles."
Referenced by issue #87739, Line Art further improvement list
4 changed files with 629 additions and 363 deletions

View File

@ -17,6 +17,7 @@ set(INC
../windowmanager
../../../intern/eigen
../../../intern/guardedalloc
../../../intern/atomic
# dna_type_offsets.h in BLO_read_write.h
${CMAKE_BINARY_DIR}/source/blender/makesdna/intern

View File

@ -236,6 +236,9 @@ typedef struct LineartRenderBuffer {
ListBase line_buffer_pointers;
ListBase triangle_buffer_pointers;
LineartElementLinkNode *isect_scheduled_up_to;
int isect_scheduled_up_to_index;
/** This one's memory is not from main pool and is free()ed after culling stage. */
ListBase triangle_adjacent_pointers;
@ -429,15 +432,19 @@ typedef struct LineartBoundingArea {
/** 1,2,3,4 quadrant */
struct LineartBoundingArea *child;
SpinLock lock;
ListBase lp;
ListBase rp;
ListBase up;
ListBase bp;
uint16_t triangle_count;
uint16_t max_triangle_count;
uint16_t line_count;
uint16_t max_line_count;
/* Need uint32 for the atomic cas instruction. */
uint32_t triangle_count;
uint32_t max_triangle_count;
uint32_t line_count;
uint32_t max_line_count;
uint32_t user_count;
/* Use array for speeding up multiple accesses. */
struct LineartTriangle **linked_triangles;

File diff suppressed because it is too large Load Diff

View File

@ -82,7 +82,7 @@ void lineart_count_and_print_render_buffer_memory(struct LineartRenderBuffer *rb
/* Initial bounding area row/column count; setting it to 4 is the simplest way for the algorithm
* to function efficiently. */
#define LRT_BA_ROWS 4
#define LRT_BA_ROWS 10
#ifdef __cplusplus
extern "C" {
@ -93,3 +93,32 @@ void lineart_sort_adjacent_items(LineartAdjacentEdge *ai, int length);
#ifdef __cplusplus
}
#endif
#ifndef __cplusplus /* Compatibility code for atomics, only for C. */

/**
 * Portable load/store helpers for values shared between line art worker threads.
 *
 * - `lineart_atomic_load(p)`: expression that yields the value currently held in `*p`.
 * - `lineart_atomic_store(p, d)`: writes `d` into `*p`. Depending on the backend this is either a
 *   void expression or a `do { ... } while (0)` statement, so only use it as a stand-alone
 *   statement and never inside a larger expression.
 *
 * The C11 and fallback backends access the target through a `size_t *` cast, so `p` must point to
 * an object that has the size and alignment of `size_t` (a pointer-sized integer or a pointer).
 *
 * Exactly one backend is selected, in this order: MSVC, C11 `<stdatomic.h>`, `atomic_ops.h`.
 */

#  if defined(_MSC_VER) /* Atomics workaround for Windows. */
#    ifndef WIN32_LEAN_AND_MEAN
#      define WIN32_LEAN_AND_MEAN
#    endif
#    include <windows.h>
/* Plain access through `p` using its own type, ordered by a full memory barrier: the barrier is
 * issued before the read and after the write. */
#    define lineart_atomic_load(p) (MemoryBarrier(), *(p))
#    define lineart_atomic_store(p, d) \
      do { \
        *(p) = (d); \
        MemoryBarrier(); \
      } while (0)
#  elif defined(__has_include) /* Try to use C11 atomics support. */
#    if __has_include(<stdatomic.h>)
#      include <stdatomic.h>
/* NOTE(review): the C11 generic functions are specified for pointers to `_Atomic` objects; this
 * relies on the compiler accepting a plain `size_t` pointer. Confirm on every supported
 * compiler. */
#      define lineart_atomic_load(p) atomic_load((volatile size_t *)(p))
#      define lineart_atomic_store(p, d) atomic_store((volatile size_t *)(p), (size_t)(d))
#    endif
#  endif

#  if !defined(lineart_atomic_load) /* Fallback */
#    include "atomic_ops.h"
/* Adding zero returns the current value without modifying it. */
#    define lineart_atomic_load(p) atomic_add_and_fetch_z((size_t *)(p), 0)
/* A store must replace the old value rather than add to it, so retry a compare-and-swap until
 * the value observed just before the swap is the one that actually got replaced. */
#    define lineart_atomic_store(p, d) \
      do { \
        size_t *lrt_store_target_ = (size_t *)(p); \
        const size_t lrt_store_value_ = (size_t)(d); \
        size_t lrt_store_seen_; \
        do { \
          lrt_store_seen_ = atomic_add_and_fetch_z(lrt_store_target_, 0); \
        } while (atomic_cas_z(lrt_store_target_, lrt_store_seen_, lrt_store_value_) != \
                 lrt_store_seen_); \
      } while (0)
#  endif

#endif /* !__cplusplus */