BLI_string_utf8: remove unnecessary utf8 decoding functions

Remove BLI_str_utf8_as_unicode_and_size and
BLI_str_utf8_as_unicode_and_size_safe.

Use BLI_str_utf8_as_unicode_step instead since it takes
a buffer bounds argument to prevent buffer over-reading.
This commit is contained in:
Campbell Barton 2021-08-25 15:19:00 +10:00
parent be906f44c6
commit 38630711a0
Notes: blender-bot 2024-03-25 12:30:38 +01:00
Referenced by commit 9df063df19, Fix assert caused by 38630711a0
Referenced by commit 820d50d3cb, Correct error in 38630711a0
7 changed files with 51 additions and 70 deletions

View File

@ -1888,8 +1888,9 @@ void txt_delete_char(Text *text)
}
}
else { /* Just deleting a char */
size_t c_len = 0;
c = BLI_str_utf8_as_unicode_and_size(text->curl->line + text->curc, &c_len);
size_t c_len = text->curc;
c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len);
c_len -= text->curc;
UNUSED_VARS(c);
memmove(text->curl->line + text->curc,
@ -1937,9 +1938,11 @@ void txt_backspace_char(Text *text)
txt_pop_sel(text);
}
else { /* Just backspacing a char */
size_t c_len = 0;
const char *prev = BLI_str_prev_char_utf8(text->curl->line + text->curc);
c = BLI_str_utf8_as_unicode_and_size(prev, &c_len);
size_t c_len = prev - text->curl->line;
c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len);
c_len -= prev - text->curl->line;
UNUSED_VARS(c);
/* source and destination overlap, don't use memcpy() */
@ -2053,7 +2056,9 @@ bool txt_replace_char(Text *text, unsigned int add)
return txt_add_char(text, add);
}
del = BLI_str_utf8_as_unicode_and_size(text->curl->line + text->curc, &del_size);
del_size = text->curc;
del = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &del_size);
del_size -= text->curc;
UNUSED_VARS(del);
add_size = BLI_str_utf8_from_unicode(add, ch);

View File

@ -39,10 +39,6 @@ int BLI_str_utf8_size(const char *p) ATTR_NONNULL();
int BLI_str_utf8_size_safe(const char *p) ATTR_NONNULL();
/* copied from glib */
unsigned int BLI_str_utf8_as_unicode(const char *p) ATTR_NONNULL();
unsigned int BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
ATTR_NONNULL();
unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p,
size_t *__restrict index) ATTR_NONNULL();
unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p,
size_t p_len,
size_t *__restrict index) ATTR_NONNULL(1, 3);

View File

@ -101,11 +101,14 @@ static eStrCursorDelimType cursor_delim_type_unicode(const uint uch)
return STRCUR_DELIM_ALPHANUMERIC; /* Not quite true, but ok for now */
}
static eStrCursorDelimType cursor_delim_type_utf8(const char *ch_utf8)
static eStrCursorDelimType cursor_delim_type_utf8(const char *ch_utf8,
const size_t ch_utf8_len,
const int pos)
{
/* for full unicode support we really need to have large lookup tables to figure
* out what's what in every possible char set - and python, glib both have these. */
uint uch = BLI_str_utf8_as_unicode(ch_utf8);
size_t index = (size_t)pos;
uint uch = BLI_str_utf8_as_unicode_step_or_error(ch_utf8, ch_utf8_len, &index);
return cursor_delim_type_unicode(uch);
}
@ -157,14 +160,16 @@ void BLI_str_cursor_step_utf8(const char *str,
}
if (jump != STRCUR_JUMP_NONE) {
const eStrCursorDelimType delim_type = (*pos) < maxlen ? cursor_delim_type_utf8(&str[*pos]) :
STRCUR_DELIM_NONE;
const eStrCursorDelimType delim_type = (*pos) < maxlen ?
cursor_delim_type_utf8(str, maxlen, *pos) :
STRCUR_DELIM_NONE;
/* jump between special characters (/,\,_,-, etc.),
* look at function cursor_delim_type() for complete
* list of special character, ctr -> */
while ((*pos) < maxlen) {
if (BLI_str_cursor_step_next_utf8(str, maxlen, pos)) {
if ((jump != STRCUR_JUMP_ALL) && (delim_type != cursor_delim_type_utf8(&str[*pos]))) {
if ((jump != STRCUR_JUMP_ALL) &&
(delim_type != cursor_delim_type_utf8(str, maxlen, *pos))) {
break;
}
}
@ -184,7 +189,7 @@ void BLI_str_cursor_step_utf8(const char *str,
if (jump != STRCUR_JUMP_NONE) {
const eStrCursorDelimType delim_type = (*pos) > 0 ?
cursor_delim_type_utf8(&str[(*pos) - 1]) :
cursor_delim_type_utf8(str, maxlen, *pos - 1) :
STRCUR_DELIM_NONE;
/* jump between special characters (/,\,_,-, etc.),
* look at function cursor_delim_type() for complete
@ -192,7 +197,8 @@ void BLI_str_cursor_step_utf8(const char *str,
while ((*pos) > 0) {
const int pos_prev = *pos;
if (BLI_str_cursor_step_prev_utf8(str, maxlen, pos)) {
if ((jump != STRCUR_JUMP_ALL) && (delim_type != cursor_delim_type_utf8(&str[*pos]))) {
if ((jump != STRCUR_JUMP_ALL) &&
(delim_type != cursor_delim_type_utf8(str, maxlen, (size_t)*pos))) {
/* left only: compensate for index/change in direction */
if ((pos_orig - (*pos)) >= 1) {
*pos = pos_prev;

View File

@ -71,12 +71,12 @@ int damerau_levenshtein_distance(StringRef a, StringRef b)
for (const int i : IndexRange(size_a)) {
v2[0] = (i + 1) * deletion_cost;
const uint32_t unicode_a = BLI_str_utf8_as_unicode_and_size(a.data() + offset_a, &offset_a);
const uint32_t unicode_a = BLI_str_utf8_as_unicode_step(a.data(), a.size(), &offset_a);
uint32_t prev_unicode_b;
size_t offset_b = 0;
for (const int j : IndexRange(size_b)) {
const uint32_t unicode_b = BLI_str_utf8_as_unicode_and_size(b.data() + offset_b, &offset_b);
const uint32_t unicode_b = BLI_str_utf8_as_unicode_step(b.data(), b.size(), &offset_b);
/* Check how costly the different operations would be and pick the cheapest - the one with
* minimal cost. */
@ -202,8 +202,8 @@ static bool match_word_initials(StringRef query,
int first_found_word_index = -1;
while (query_index < query.size()) {
const uint query_unicode = BLI_str_utf8_as_unicode_and_size(query.data() + query_index,
&query_index);
const uint query_unicode = BLI_str_utf8_as_unicode_step(
query.data(), query.size(), &query_index);
while (true) {
/* We are at the end of words, no complete match has been found yet. */
if (word_index >= words.size()) {
@ -226,8 +226,8 @@ static bool match_word_initials(StringRef query,
StringRef word = words[word_index];
/* Try to match the current character with the current word. */
if (static_cast<int>(char_index) < word.size()) {
const uint32_t char_unicode = BLI_str_utf8_as_unicode_and_size(word.data() + char_index,
&char_index);
const uint32_t char_unicode = BLI_str_utf8_as_unicode_step(
word.data(), word.size(), &char_index);
if (query_unicode == char_unicode) {
r_word_is_matched[word_index] = true;
if (first_found_word_index == -1) {
@ -368,8 +368,9 @@ void extract_normalized_words(StringRef str,
size_t word_start = 0;
size_t offset = 0;
while (offset < str_size_in_bytes) {
size_t size = 0;
uint32_t unicode = BLI_str_utf8_as_unicode_and_size(str.data() + offset, &size);
size_t size = offset;
uint32_t unicode = BLI_str_utf8_as_unicode_step(str.data(), str.size(), &size);
size -= offset;
if (is_separator(unicode)) {
if (is_in_word) {
r_words.append(

View File

@ -546,40 +546,6 @@ uint BLI_str_utf8_as_unicode(const char *p)
return result;
}
/**
 * UTF8 decoding variant that also adds the byte length of the decoded sequence to `*index`
 * (the value is accumulated with `+=`, it is not overwritten).
 *
 * `p` points at the first byte of the sequence to decode. No buffer length is passed in,
 * so this function has no way to bound how far it reads past `p`.
 *
 * Returns #BLI_UTF8_ERR without touching `*index` when `len` comes back as -1,
 * otherwise returns `result` as filled in by UTF8_GET.
 */
uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
{
int i, len;
uint mask = 0;
uint result;
const unsigned char c = (unsigned char)*p;
/* NOTE(review): macro defined outside this view; presumably derives `mask` and `len`
 * from the lead byte `c`, using -1 as the invalid-lead-byte marker - confirm. */
UTF8_COMPUTE(c, mask, len, -1);
if (UNLIKELY(len == -1)) {
/* Early exit: `*index` is left unchanged on this path. */
return BLI_UTF8_ERR;
}
/* NOTE(review): macro defined outside this view; presumably decodes `len` bytes from `p`
 * into `result`, storing #BLI_UTF8_ERR on a malformed sequence - confirm. */
UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
/* Accumulate rather than assign, so callers can pass a running byte offset. */
*index += (size_t)len;
return result;
}
/**
 * UTF8 decoding variant that adds the number of bytes consumed to `*index` and always
 * advances, even when the lead byte is rejected.
 *
 * `p` points at the first byte of the sequence to decode. No buffer length is passed in,
 * so this function has no way to bound how far it reads past `p`.
 *
 * When `len` comes back as -1, `*index` is advanced by one and the raw byte value is
 * returned instead of an error code. Otherwise `*index` is advanced by `len` and `result`
 * as filled in by UTF8_GET is returned.
 */
uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index)
{
int i, len;
uint mask = 0;
uint result;
const unsigned char c = (unsigned char)*p;
/* NOTE(review): macro defined outside this view; presumably derives `mask` and `len`
 * from the lead byte `c`, using -1 as the invalid-lead-byte marker - confirm. */
UTF8_COMPUTE(c, mask, len, -1);
if (UNLIKELY(len == -1)) {
/* Step over the single offending byte and hand it back as-is. */
*index += 1;
return c;
}
/* NOTE(review): macro defined outside this view; presumably decodes `len` bytes from `p`
 * into `result`, storing #BLI_UTF8_ERR on a malformed sequence - confirm. */
UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
/* Accumulate rather than assign, so callers can pass a running byte offset. */
*index += (size_t)len;
return result;
}
/**
* UTF8 decoding that steps over the index (unless an error is encountered).
*
@ -709,16 +675,23 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
#endif
const size_t src_c_len = strlen(src_c);
const char *src_c_end = src_c + src_c_len;
size_t index = 0;
while (*src_c && len != maxlen) {
size_t step = 0;
uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index);
if (unicode != BLI_UTF8_ERR) {
*dst_w = unicode;
src_c += step;
}
else {
*dst_w = '?';
src_c = BLI_str_find_next_char_utf8(src_c, NULL);
const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
if (src_c_next != NULL) {
index = (size_t)(src_c_next - src_c);
}
else {
index += 1;
}
}
dst_w++;
len++;
@ -898,7 +871,9 @@ size_t BLI_str_partition_ex_utf8(const char *str,
index = 0;
*sep >= str && (!end || *sep < end) && **sep != '\0';
*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index);
size_t index_ofs = 0;
const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs);
index += index_ofs;
if (c == BLI_UTF8_ERR) {
*suf = *sep = NULL;

View File

@ -177,13 +177,12 @@ static GHash *text_autocomplete_build(Text *text)
i_pos = i_start;
while ((i_start < linep->len) &&
(!text_check_identifier_nodigit_unicode(
BLI_str_utf8_as_unicode_and_size_safe(&linep->line[i_start], &i_pos)))) {
BLI_str_utf8_as_unicode_step(linep->line, linep->len, &i_pos)))) {
i_start = i_pos;
}
i_pos = i_end = i_start;
while ((i_end < linep->len) &&
(text_check_identifier_unicode(
BLI_str_utf8_as_unicode_and_size_safe(&linep->line[i_end], &i_pos)))) {
while ((i_end < linep->len) && (text_check_identifier_unicode(BLI_str_utf8_as_unicode_step(
linep->line, linep->len, &i_pos)))) {
i_end = i_pos;
}

View File

@ -649,10 +649,9 @@ static void rna_Event_unicode_get(PointerRNA *ptr, char *value)
size_t len = 0;
if (event->utf8_buf[0]) {
BLI_str_utf8_as_unicode_and_size(event->utf8_buf, &len);
if (len > 0) {
if (BLI_str_utf8_as_unicode_step_or_error(event->utf8_buf, sizeof(event->utf8_buf), &len) !=
BLI_UTF8_ERR)
memcpy(value, event->utf8_buf, len);
}
}
value[len] = '\0';