diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-11-15 17:27:42 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-11-15 17:27:42 +0000 |
commit | c234e5bc9c8b19625009d3925f37875e5fa820d4 (patch) | |
tree | d39c2a3ed923bd0879eafcd4eb49845d1ade636d /src/libutil/str_util.c | |
parent | 23208fb40ca7c0b18d26c739e2861491f0f42abb (diff) | |
download | rspamd-c234e5bc9c8b19625009d3925f37875e5fa820d4.tar.gz rspamd-c234e5bc9c8b19625009d3925f37875e5fa820d4.zip |
[Rework] Rewrite rspamd_str_make_utf_valid function
Diffstat (limited to 'src/libutil/str_util.c')
-rw-r--r-- | src/libutil/str_util.c | 115 |
1 files changed, 82 insertions, 33 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 90924f8d1..dd1b139d8 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2935,7 +2935,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { if (rspamd_fast_utf8_validate (pattern, slen) != 0) { - tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL); + tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL, NULL); } } @@ -3052,61 +3052,110 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, gchar * -rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen) +rspamd_str_make_utf_valid (const guchar *src, gsize slen, + gsize *dstlen, + rspamd_mempool_t *pool) { - GString *dst; - const gchar *last; - gchar *dchar; - gsize valid, prev; UChar32 uc; - gint32 i; + goffset err_offset; + const guchar *p; + gchar *dst, *d; + gsize remain = slen, dlen = 0; if (src == NULL) { return NULL; } if (slen == 0) { - slen = strlen (src); + return NULL; } - dst = g_string_sized_new (slen); - i = 0; - last = src; - valid = 0; - prev = 0; + p = src; + dlen = slen; - while (i < slen) { - U8_NEXT (src, i, slen, uc); + /* Check space required */ + while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) { + gint i = 0; - if (uc <= 0) { - if (valid > 0) { - g_string_append_len (dst, last, valid); + p += err_offset; + remain -= err_offset; + dlen += err_offset; + + /* Each invalid character of input requires 3 bytes of output */ + while (i < remain) { + gint old_i = i; + U8_NEXT (p, i, remain, uc); + + if (uc < 0) { + dlen += 3; + } + else { + p += old_i; + remain -= old_i; + break; } - /* 0xFFFD in UTF8 */ - g_string_append_len (dst, "\357\277\275", 3); - valid = 0; - last = &src[i]; - } - else { - valid += i - prev; } + } - prev = i; + if (pool) { + dst = rspamd_mempool_alloc (pool, dlen + 1); + } + else { + dst = g_malloc (dlen + 1); } - if (valid > 0) { - g_string_append_len (dst, last, valid); + p = src; + d = dst; + remain = slen; + + while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) { + /* Copy valid */ + memcpy (d, p, err_offset); + d += err_offset; + + /* Append 0xFFFD for each bad character */ + gint i = 0; + + p += err_offset; + remain -= err_offset; + + while (i < remain) { + gint old_i = i; + U8_NEXT (p, i, remain, uc); + + if (uc < 0) { + *d++ = '\357'; + *d++ = '\277'; + *d++ = '\275'; + } + else { + /* Adjust p and remaining stuff and go to the outer cycle */ + p += old_i; + remain -= old_i; + break; + } + } + /* + * Now p is the first valid utf8 character and remain is the rest of the string + * so we can continue our loop + */ } - dchar = dst->str; + if (err_offset == 0 && remain > 0) { + /* Last piece */ + memcpy (d, p, remain); + d += remain; + } + + /* Last '\0' */ + g_assert (dlen > d - dst); + *d = '\0'; if (dstlen) { - *dstlen = dst->len; + *dstlen = d - dst; } - g_string_free (dst, FALSE); - - return dchar; + return dst; } gsize |