Image viewer scopes update: OMP->BLI_task.

Gives over 50% faster scope update (from 4.5ms to 2.2ms here with SD shot)!
Most likely this is mainly due to smarter use of thread-local data, which avoids any locking,
whereas the OMP code had a rather crude critical section for minmax.
This commit is contained in:
Bastien Montagne 2016-05-26 14:30:14 +02:00
parent d5e0e681ce
commit f87842a73a
1 changed files with 163 additions and 136 deletions

View File

@ -43,6 +43,7 @@
#include "BLI_blenlib.h"
#include "BLI_math.h"
#include "BLI_utildefines.h"
#include "BLI_task.h"
#include "BLI_threads.h"
#include "BKE_colortools.h"
@ -53,10 +54,6 @@
#include "IMB_colormanagement.h"
#include "IMB_imbuf_types.h"
#ifdef _OPENMP
# include <omp.h>
#endif
/* ********************************* color curve ********************* */
/* ***************** operations on full struct ************* */
@ -1089,31 +1086,170 @@ void BKE_histogram_update_sample_line(Histogram *hist, ImBuf *ibuf, const ColorM
}
/* if view_settings, it also applies this to byte buffers */
/* Data shared by every invocation of the scopes update callbacks. */
typedef struct ScopesUpdateData {
    /* Scopes being updated: sample lines are stored through it by the row callback,
     * and its minmax is updated by the finalize callback. */
    Scopes *scopes;
    /* Image being scanned. */
    const ImBuf *ibuf;
    /* Applied to 3 and 4 channel float pixels by the row callback. */
    struct ColormanageProcessor *cm_processor;
    /* Byte pixels, read by the row callback when ibuf has no float buffer. */
    const unsigned char *display_buffer;
    /* -1: min/max are taken on RGB, otherwise the value is passed on to rgb_to_ycc()
     * and min/max are taken on the converted values. */
    const int ycc_mode;

    /* Histogram totals the per-chunk bins get added to by the finalize callback. */
    unsigned int *bin_lum;
    unsigned int *bin_r;
    unsigned int *bin_g;
    unsigned int *bin_b;
    unsigned int *bin_a;
} ScopesUpdateData;
/* Accumulators filled by the row callback and merged by the finalize callback. */
typedef struct ScopesUpdateDataChunk {
    /* Histogram counts: luminance, then one array per RGBA channel. */
    unsigned int bin_lum[256];
    unsigned int bin_r[256];
    unsigned int bin_g[256];
    unsigned int bin_b[256];
    unsigned int bin_a[256];

    /* Running per-channel minimum and maximum (RGB or YCC, depending on the mode). */
    float min[3];
    float max[3];
} ScopesUpdateDataChunk;
/* Row callback of the scopes update: scans image row `y`.
 *
 * userdata is the shared ScopesUpdateData, userdata_chunk the ScopesUpdateDataChunk
 * that receives this row's histogram counts and min/max. The only data written
 * outside of the chunk is what save_sample_line() stores into the scopes.
 * The thread id is not used. */
static void scopes_update_cb(void *userdata, void *userdata_chunk, const int y, const int UNUSED(threadid))
{
    const ScopesUpdateData *data = userdata;
    ScopesUpdateDataChunk *data_chunk = userdata_chunk;

    Scopes *scopes = data->scopes;
    const ImBuf *ibuf = data->ibuf;
    struct ColormanageProcessor *cm_processor = data->cm_processor;
    const int ycc_mode = data->ycc_mode;

    unsigned int *bin_lum = data_chunk->bin_lum;
    unsigned int *bin_r = data_chunk->bin_r;
    unsigned int *bin_g = data_chunk->bin_g;
    unsigned int *bin_b = data_chunk->bin_b;
    unsigned int *bin_a = data_chunk->bin_a;
    float *min = data_chunk->min;
    float *max = data_chunk->max;

    /* A row is saved as sample line when it is the first row of its group of
     * rows_per_sample_line rows, as long as that group index is a valid sample line. */
    const int rows_per_sample_line = ibuf->y / scopes->sample_lines;
    const int savedlines = y / rows_per_sample_line;
    const bool do_sample_line = (savedlines < scopes->sample_lines) && (y % rows_per_sample_line) == 0;

    const bool is_float = (ibuf->rect_float != NULL);
    const size_t row_offset = ((size_t)y) * ibuf->x * ibuf->channels;

    /* Only the pointer matching the buffer type is set; the other one stays NULL
     * and must never be used, not even for pointer arithmetic. */
    const float *rf = is_float ? ibuf->rect_float + row_offset : NULL;
    const unsigned char *rc = is_float ? NULL : data->display_buffer + row_offset;

    for (int x = 0; x < ibuf->x; x++) {
        /* NOTE(review): ycc is only filled when ycc_mode != -1, yet it is always handed
         * to save_sample_line(); presumably it is ignored in that case - confirm. */
        float rgba[4], ycc[3], luma;

        if (is_float) {
            switch (ibuf->channels) {
                case 4:
                    copy_v4_v4(rgba, rf);
                    IMB_colormanagement_processor_apply_v4(cm_processor, rgba);
                    break;
                case 3:
                    copy_v3_v3(rgba, rf);
                    IMB_colormanagement_processor_apply_v3(cm_processor, rgba);
                    rgba[3] = 1.0f;
                    break;
                case 2:
                    /* first channel replicated to RGB, second one used as alpha */
                    copy_v3_fl(rgba, rf[0]);
                    rgba[3] = rf[1];
                    break;
                case 1:
                    /* single channel replicated to RGB, opaque */
                    copy_v3_fl(rgba, rf[0]);
                    rgba[3] = 1.0f;
                    break;
                default:
                    BLI_assert(0);
                    /* Do not read indeterminate values below when asserts are compiled out. */
                    rgba[0] = rgba[1] = rgba[2] = rgba[3] = 0.0f;
                    break;
            }
        }
        else {
            for (int c = 4; c--;) {
                rgba[c] = rc[c] * INV_255;
            }
        }

        /* we still need luma for histogram */
        luma = IMB_colormanagement_get_luminance(rgba);

        /* check for min max */
        if (ycc_mode == -1) {
            minmax_v3v3_v3(min, max, rgba);
        }
        else {
            rgb_to_ycc(rgba[0], rgba[1], rgba[2], &ycc[0], &ycc[1], &ycc[2], ycc_mode);
            mul_v3_fl(ycc, INV_255);
            minmax_v3v3_v3(min, max, ycc);
        }

        /* increment count for histogram */
        bin_lum[get_bin_float(luma)]++;
        bin_r[get_bin_float(rgba[0])]++;
        bin_g[get_bin_float(rgba[1])]++;
        bin_b[get_bin_float(rgba[2])]++;
        bin_a[get_bin_float(rgba[3])]++;

        /* save sample if needed */
        if (do_sample_line) {
            const float fx = (float)x / (float)ibuf->x;
            const int idx = 2 * (ibuf->x * savedlines + x);
            save_sample_line(scopes, idx, fx, rgba, ycc);
        }

        /* Advance only the pointer in use: arithmetic on the NULL one is undefined behavior. */
        if (is_float) {
            rf += ibuf->channels;
        }
        else {
            rc += ibuf->channels;
        }
    }
}
/* Finalize callback of the scopes update: folds the results of one chunk
 * (userdata_chunk, a ScopesUpdateDataChunk) into the shared totals reachable from
 * userdata (a ScopesUpdateData): histogram bins are summed, and the scopes' minmax
 * is widened to include the chunk's min/max.
 *
 * NOTE(review): nothing here is locked, so this presumably relies on the task code
 * never running two finalize calls at once - confirm against BLI_task. */
static void scopes_update_finalize(void *userdata, void *userdata_chunk)
{
    const ScopesUpdateData *shared = userdata;
    const ScopesUpdateDataChunk *chunk = userdata_chunk;

    /* minmax[channel][0] is the minimum, minmax[channel][1] the maximum. */
    float (*minmax)[2] = shared->scopes->minmax;

    for (int bin = 0; bin < 256; bin++) {
        shared->bin_lum[bin] += chunk->bin_lum[bin];
        shared->bin_r[bin] += chunk->bin_r[bin];
        shared->bin_g[bin] += chunk->bin_g[bin];
        shared->bin_b[bin] += chunk->bin_b[bin];
        shared->bin_a[bin] += chunk->bin_a[bin];
    }

    for (int channel = 0; channel < 3; channel++) {
        const float chunk_min = chunk->min[channel];
        const float chunk_max = chunk->max[channel];

        if (chunk_min < minmax[channel][0]) {
            minmax[channel][0] = chunk_min;
        }
        if (chunk_max > minmax[channel][1]) {
            minmax[channel][1] = chunk_max;
        }
    }
}
void scopes_update(Scopes *scopes, ImBuf *ibuf, const ColorManagedViewSettings *view_settings,
const ColorManagedDisplaySettings *display_settings)
{
#ifdef _OPENMP
const int num_threads = BLI_system_thread_count();
#endif
int a, y;
int a;
unsigned int nl, na, nr, ng, nb;
double divl, diva, divr, divg, divb;
unsigned char *display_buffer;
const unsigned char *display_buffer = NULL;
unsigned int bin_lum[256] = {0},
bin_r[256] = {0},
bin_g[256] = {0},
bin_b[256] = {0},
bin_a[256] = {0};
unsigned int bin_lum_t[BLENDER_MAX_THREADS][256] = {{0}},
bin_r_t[BLENDER_MAX_THREADS][256] = {{0}},
bin_g_t[BLENDER_MAX_THREADS][256] = {{0}},
bin_b_t[BLENDER_MAX_THREADS][256] = {{0}},
bin_a_t[BLENDER_MAX_THREADS][256] = {{0}};
int ycc_mode = -1;
const bool is_float = (ibuf->rect_float != NULL);
void *cache_handle = NULL;
struct ColormanageProcessor *cm_processor = NULL;
int rows_per_sample_line;
if (ibuf->rect == NULL && ibuf->rect_float == NULL) return;
@ -1151,7 +1287,6 @@ void scopes_update(Scopes *scopes, ImBuf *ibuf, const ColorManagedViewSettings *
scopes->sample_lines = ibuf->y;
/* scan the image */
rows_per_sample_line = ibuf->y / scopes->sample_lines;
for (a = 0; a < 3; a++) {
scopes->minmax[a][0] = 25500.0f;
scopes->minmax[a][1] = -25500.0f;
@ -1177,129 +1312,21 @@ void scopes_update(Scopes *scopes, ImBuf *ibuf, const ColorManagedViewSettings *
cm_processor = IMB_colormanagement_display_processor_new(view_settings, display_settings);
}
else {
display_buffer = (unsigned char *)IMB_display_buffer_acquire(ibuf,
view_settings,
display_settings,
&cache_handle);
display_buffer = (const unsigned char *)IMB_display_buffer_acquire(
ibuf, view_settings, display_settings, &cache_handle);
}
/* Keep number of threads in sync with the merge parts below. */
#pragma omp parallel for private(y) schedule(static) num_threads(num_threads) if (ibuf->y > 256)
for (y = 0; y < ibuf->y; y++) {
#ifdef _OPENMP
const int thread_idx = omp_get_thread_num();
#else
const int thread_idx = 0;
#endif
const float *rf = NULL;
const unsigned char *rc = NULL;
const int savedlines = y / rows_per_sample_line;
const bool do_sample_line = (savedlines < scopes->sample_lines) && (y % rows_per_sample_line) == 0;
float min[3] = { FLT_MAX, FLT_MAX, FLT_MAX},
max[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
int x, c;
if (is_float)
rf = ibuf->rect_float + ((size_t)y) * ibuf->x * ibuf->channels;
else {
rc = display_buffer + ((size_t)y) * ibuf->x * ibuf->channels;
}
for (x = 0; x < ibuf->x; x++) {
float rgba[4], ycc[3], luma;
if (is_float) {
ScopesUpdateData data = {
.scopes = scopes, . ibuf = ibuf,
.cm_processor = cm_processor, .display_buffer = display_buffer, .ycc_mode = ycc_mode,
.bin_lum = bin_lum, .bin_r = bin_r, .bin_g = bin_g, .bin_b = bin_b, .bin_a = bin_a,
};
ScopesUpdateDataChunk data_chunk = {0};
INIT_MINMAX(data_chunk.min, data_chunk.max);
switch (ibuf->channels) {
case 4:
copy_v4_v4(rgba, rf);
IMB_colormanagement_processor_apply_v4(cm_processor, rgba);
break;
case 3:
copy_v3_v3(rgba, rf);
IMB_colormanagement_processor_apply_v3(cm_processor, rgba);
rgba[3] = 1.0f;
break;
case 2:
copy_v3_fl(rgba, rf[0]);
rgba[3] = rf[1];
break;
case 1:
copy_v3_fl(rgba, rf[0]);
rgba[3] = 1.0f;
break;
default:
BLI_assert(0);
}
}
else {
for (c = 0; c < 4; c++)
rgba[c] = rc[c] * INV_255;
}
/* we still need luma for histogram */
luma = IMB_colormanagement_get_luminance(rgba);
/* check for min max */
if (ycc_mode == -1) {
for (c = 0; c < 3; c++) {
if (rgba[c] < min[c]) min[c] = rgba[c];
if (rgba[c] > max[c]) max[c] = rgba[c];
}
}
else {
rgb_to_ycc(rgba[0], rgba[1], rgba[2], &ycc[0], &ycc[1], &ycc[2], ycc_mode);
for (c = 0; c < 3; c++) {
ycc[c] *= INV_255;
if (ycc[c] < min[c]) min[c] = ycc[c];
if (ycc[c] > max[c]) max[c] = ycc[c];
}
}
/* increment count for histo*/
bin_lum_t[thread_idx][get_bin_float(luma)] += 1;
bin_r_t[thread_idx][get_bin_float(rgba[0])] += 1;
bin_g_t[thread_idx][get_bin_float(rgba[1])] += 1;
bin_b_t[thread_idx][get_bin_float(rgba[2])] += 1;
bin_a_t[thread_idx][get_bin_float(rgba[3])] += 1;
/* save sample if needed */
if (do_sample_line) {
const float fx = (float)x / (float)ibuf->x;
const int idx = 2 * (ibuf->x * savedlines + x);
save_sample_line(scopes, idx, fx, rgba, ycc);
}
rf += ibuf->channels;
rc += ibuf->channels;
}
#pragma omp critical
{
for (c = 0; c < 3; c++) {
if (min[c] < scopes->minmax[c][0]) scopes->minmax[c][0] = min[c];
if (max[c] > scopes->minmax[c][1]) scopes->minmax[c][1] = max[c];
}
}
}
#ifdef _OPENMP
if (ibuf->y > 256) {
for (a = 0; a < num_threads; a++) {
int b;
for (b = 0; b < 256; b++) {
bin_lum[b] += bin_lum_t[a][b];
bin_r[b] += bin_r_t[a][b];
bin_g[b] += bin_g_t[a][b];
bin_b[b] += bin_b_t[a][b];
bin_a[b] += bin_a_t[a][b];
}
}
}
else
#endif
{
memcpy(bin_lum, bin_lum_t[0], sizeof(bin_lum));
memcpy(bin_r, bin_r_t[0], sizeof(bin_r));
memcpy(bin_g, bin_g_t[0], sizeof(bin_g));
memcpy(bin_b, bin_b_t[0], sizeof(bin_b));
memcpy(bin_a, bin_a_t[0], sizeof(bin_a));
}
BLI_task_parallel_range_finalize(0, ibuf->y, &data, &data_chunk, sizeof(data_chunk),
scopes_update_cb, scopes_update_finalize, ibuf->y > 256, false);
/* test for nicer distribution even - non standard, leave it out for a while */
#if 0