From 803a9062065ccf7a3dde90db5adb872e86d4be5b Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 15 Nov 2019 16:40:54 +0000 Subject: [PATCH] [Project] Use own utf8 validation instead of glib --- src/libmime/mime_encoding.c | 42 ++++++++++++++++++++----------------- src/libserver/protocol.c | 10 ++++----- src/libserver/re_cache.c | 4 +++- src/libutil/map_helpers.c | 5 +++-- src/libutil/str_util.c | 4 +++- src/lua/lua_util.c | 28 +++++++++++++++++++++++-- 6 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index 0fbba54b2..942358d11 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -22,6 +22,7 @@ #include "libserver/task.h" #include "mime_encoding.h" #include "message.h" +#include "contrib/fastutf8/fastutf8.h" #include #include #if U_ICU_VERSION_MAJOR_NUM >= 44 @@ -468,36 +469,39 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, void rspamd_mime_charset_utf_enforce (gchar *in, gsize len) { - const gchar *end, *p; - gsize remain = len; + gchar *p, *end; + goffset err_offset; + UChar32 uc = 0; /* Now we validate input and replace bad characters with '?' 
symbol */ p = in; + end = in + len; - while (remain > 0 && !g_utf8_validate (p, remain, &end)) { - gchar *valid; + while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) { /* parenthesised so err_offset holds the offset, not the comparison result */ + goffset cur_offset = --err_offset; /* 0 means "valid", so the reported error offset is 1-based */ - if (end >= in + len) { - if (p < in + len) { - memset ((gchar *)p, '?', (in + len) - p); - } - break; - } + while (cur_offset < len) { + goffset tmp = cur_offset; - valid = g_utf8_find_next_char (end, in + len); + U8_NEXT (p, cur_offset, len, uc); /* offsets are relative to p, the start of the unvalidated tail */ - if (!valid) { - valid = in + len; + if (uc > 0) { + /* Fill string between err_offset and tmp with `?` character */ + memset (p + err_offset, '?', + tmp - err_offset); + break; + } } - if (valid > end) { - memset ((gchar *)end, '?', valid - end); - p = valid; - remain = (in + len) - p; - } - else { + if (uc < 0) { + /* Fill till the end */ + memset (p + err_offset, '?', + len - err_offset); break; } + + p += cur_offset; /* resume right after the first valid character found */ + len = end - p; } } diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index 0786f4860..c457fc455 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -26,6 +26,7 @@ #include "unix-std.h" #include "protocol_internal.h" #include "libserver/mempool_vars_internal.h" +#include "contrib/fastutf8/fastutf8.h" #include "task.h" #include @@ -922,16 +923,13 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) return; } - const gchar *end = NULL; + goffset err_offset; - if (g_utf8_validate (url->host, url->hostlen, &end)) { + if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) { /* compare the returned offset, do not store the comparison */ obj = ucl_object_fromlstring (url->host, url->hostlen); } - else if (end - url->host > 0) { - obj = ucl_object_fromlstring (url->host, end - url->host); - } else { - return; + obj = ucl_object_fromlstring (url->host, err_offset - 1); /* keep only the valid prefix; offset is 1-based */ } } else { diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index a9fc2270b..a495dfdd5 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -41,6 +41,8 @@ #include #endif 
+#include "contrib/fastutf8/fastutf8.h" + #ifdef HAVE_SYS_WAIT_H #include #endif @@ -988,7 +990,7 @@ rspamd_re_cache_process_headers_list (struct rspamd_task *task, in = (const guchar *)cur->value; lenvec[i] = strlen (cur->value); - if (!g_utf8_validate (in, lenvec[i], NULL)) { + if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) { raw = TRUE; } } diff --git a/src/libutil/map_helpers.c b/src/libutil/map_helpers.c index a9bd8d70e..d67e2fc4d 100644 --- a/src/libutil/map_helpers.c +++ b/src/libutil/map_helpers.c @@ -20,6 +20,7 @@ #include "radix.h" #include "rspamd.h" #include "cryptobox.h" +#include "contrib/fastutf8/fastutf8.h" #ifdef WITH_HYPERSCAN #include "hs.h" @@ -1189,7 +1190,7 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map, } if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { - if (g_utf8_validate (in, len, NULL)) { + if (rspamd_fast_utf8_validate (in, len) == 0) { validated = TRUE; } } @@ -1280,7 +1281,7 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map, g_assert (in != NULL); if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { - if (g_utf8_validate (in, len, NULL)) { + if (rspamd_fast_utf8_validate (in, len) == 0) { validated = TRUE; } } diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 866ef52d8..90924f8d1 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -27,6 +27,8 @@ #endif #include +#include "contrib/fastutf8/fastutf8.h" + const guchar lc_map[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, @@ -2932,7 +2934,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, } if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { - if (!g_utf8_validate (pattern, slen, NULL)) { + if (rspamd_fast_utf8_validate (pattern, slen) != 0) { tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL); } } diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 1ea8d380c..ef9c3105e 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -34,6 
+34,7 @@ #include "unicode/uspoof.h" #include "unicode/uscript.h" +#include "contrib/fastutf8/fastutf8.h" /*** * @module rspamd_util @@ -2855,10 +2856,33 @@ lua_util_is_valid_utf8 (lua_State *L) const gchar *str; gsize len; - str = lua_tolstring (L, 1, &len); + if (lua_isstring (L, 1)) { + str = lua_tolstring (L, 1, &len); + } + else { + struct rspamd_lua_text *t = lua_check_text (L, 1); + + if (t) { + str = t->start; + len = t->len; + } + else { + return luaL_error (L, "invalid arguments (text expected)"); + } + } if (str) { - lua_pushboolean (L, g_utf8_validate (str, len, NULL)); + goffset error_offset = rspamd_fast_utf8_validate (str, len); + + if (error_offset == 0) { + lua_pushboolean (L, true); + } + else { + lua_pushboolean (L, false); + lua_pushnumber (L, error_offset); + + return 2; + } } else { return luaL_error (L, "invalid arguments"); -- 2.39.5