From: Vsevolod Stakhov
Date: Fri, 15 Nov 2019 17:27:42 +0000 (+0000)
Subject: [Rework] Rewrite rspamd_str_make_utf_valid function
X-Git-Tag: 2.2~33
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=c234e5bc9c8b19625009d3925f37875e5fa820d4;p=rspamd.git

[Rework] Rewrite rspamd_str_make_utf_valid function
---

Review notes (ignored by patch tools, not part of the commit message):
 * The validator result is now assigned before it is compared with 0.
   Previously `err_offset = validate (...) > 0` stored the boolean result
   of the comparison, so err_offset was only ever 0 or 1.
 * 0 is reserved for "input is valid", hence a non-zero result is treated
   as a 1-based offset and decremented to get the valid prefix length.
 * Both passes now advance past an invalid run that reaches the end of
   the input; before, p/remain were left untouched in that case and the
   outer loop never terminated.
 * The final assertion accepts dlen == output length, which is what a
   fully valid input produces (dlen + 1 bytes are allocated).
 * The size estimate no longer counts the valid prefix twice.
 * The blob id after ".." on the index line of str_util.c/str_util.h is
   the one of the original commit and does not describe this revision.

diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 90924f8d1..dd1b139d8 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2935,7 +2935,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 
 	if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
 		if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
-			tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
+			tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL, NULL);
 		}
 	}
 
@@ -3052,61 +3052,114 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
 
 
 gchar *
-rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen)
+rspamd_str_make_utf_valid (const guchar *src, gsize slen,
+		gsize *dstlen,
+		rspamd_mempool_t *pool)
 {
-	GString *dst;
-	const gchar *last;
-	gchar *dchar;
-	gsize valid, prev;
 	UChar32 uc;
-	gint32 i;
+	goffset err_offset;
+	const guchar *p;
+	gchar *dst, *d;
+	gsize remain = slen, dlen = 0;
 
 	if (src == NULL) {
 		return NULL;
 	}
 
 	if (slen == 0) {
-		slen = strlen (src);
+		return NULL;
 	}
 
-	dst = g_string_sized_new (slen);
-	i = 0;
-	last = src;
-	valid = 0;
-	prev = 0;
+	p = src;
+	dlen = slen;
 
-	while (i < slen) {
-		U8_NEXT (src, i, slen, uc);
+	/* Check space required */
+	while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain)) > 0) {
+		gint i = 0;
 
-		if (uc <= 0) {
-			if (valid > 0) {
-				g_string_append_len (dst, last, valid);
+		/* 0 means valid, so the offset is 1-based: turn it into a prefix length */
+		err_offset --;
+		p += err_offset;
+		remain -= err_offset;
+
+		/* Each invalid character of input requires 3 bytes of output */
+		while (i < remain) {
+			gint old_i = i;
+			U8_NEXT (p, i, remain, uc);
+
+			if (uc < 0) {
+				dlen += 3;
+			}
+			else {
+				/* Valid character: leave it for the next validation round */
+				i = old_i;
+				break;
 			}
-			/* 0xFFFD in UTF8 */
-			g_string_append_len (dst, "\357\277\275", 3);
-			valid = 0;
-			last = &src[i];
-		}
-		else {
-			valid += i - prev;
 		}
 
-		prev = i;
+		/* Skip the bad run, including one that reaches the end of input */
+		p += i;
+		remain -= i;
 	}
 
-	if (valid > 0) {
-		g_string_append_len (dst, last, valid);
+	if (pool) {
+		dst = rspamd_mempool_alloc (pool, dlen + 1);
+	}
+	else {
+		dst = g_malloc (dlen + 1);
 	}
 
-	dchar = dst->str;
+	p = src;
+	d = dst;
+	remain = slen;
+
+	while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain)) > 0) {
+		gint i = 0;
+
+		/* Copy valid prefix; the reported offset is 1-based */
+		err_offset --;
+		memcpy (d, p, err_offset);
+		d += err_offset;
+		p += err_offset;
+		remain -= err_offset;
+
+		/* Append 0xFFFD for each bad character */
+		while (i < remain) {
+			gint old_i = i;
+			U8_NEXT (p, i, remain, uc);
+
+			if (uc < 0) {
+				*d++ = '\357';
+				*d++ = '\277';
+				*d++ = '\275';
+			}
+			else {
+				/* Valid character: leave it for the next validation round */
+				i = old_i;
+				break;
+			}
+		}
+
+		/* p now points past the bad run; the rest is validated again */
+		p += i;
+		remain -= i;
+	}
+
+	if (err_offset == 0 && remain > 0) {
+		/* Last piece */
+		memcpy (d, p, remain);
+		d += remain;
+	}
+
+	/* Last '\0': dlen + 1 bytes were allocated, so dlen payload bytes fit */
+	g_assert (dlen >= (gsize)(d - dst));
+	*d = '\0';
 
 	if (dstlen) {
-		*dstlen = dst->len;
+		*dstlen = d - dst;
 	}
 
-	g_string_free (dst, FALSE);
-
-	return dchar;
+	return dst;
 }
 
 gsize
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 7891a8e54..77bb96249 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -527,7 +527,8 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
  * @param dstelen
+ * @param pool if not NULL the result is allocated from this pool, otherwise with g_malloc
  * @return
  */
-gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen);
+gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen, rspamd_mempool_t *pool);
 
 /**
  * Strips characters in `strip_chars` from start and end of the GString