source.dussan.org Git - rspamd.git/commitdiff
[Project] Use own utf8 validation instead of glib
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
src/libmime/mime_encoding.c
src/libserver/protocol.c
src/libserver/re_cache.c
src/libutil/map_helpers.c
src/libutil/str_util.c
src/lua/lua_util.c

index 0fbba54b29445841aa2b8caeab38967bc7ab643d..942358d11bf3d464bfd8dea291cf0592d6feadba 100644 (file)
@@ -22,6 +22,7 @@
 #include "libserver/task.h"
 #include "mime_encoding.h"
 #include "message.h"
+#include "contrib/fastutf8/fastutf8.h"
 #include <unicode/ucnv.h>
 #include <unicode/ucsdet.h>
 #if U_ICU_VERSION_MAJOR_NUM >= 44
@@ -468,36 +469,58 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
 void
 rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
 {
-       const gchar *end, *p;
-       gsize remain = len;
+       gchar *p, *end;
+       goffset err_offset;
+       UChar32 uc = 0;
 
        /* Now we validate input and replace bad characters with '?' symbol */
        p = in;
+       end = in + len;
 
-       while (remain > 0 && !g_utf8_validate (p, remain, &end)) {
-               gchar *valid;
+       /*
+        * Each pass validates the unprocessed tail [p, end); len is its size.
+        * The assignment is parenthesised on purpose: `>` binds tighter than
+        * `=`, so otherwise err_offset would only ever receive 0 or 1.
+        */
+       while (p < end && len > 0 &&
+                       (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
+               goffset cur_offset;
 
-               if (end >= in + len) {
-                       if (p < in + len) {
-                               memset ((gchar *)p, '?', (in + len) - p);
-                       }
-                       break;
-               }
+               /*
+                * 0 means "valid", so a non-zero result is presumably the 1-based
+                * position of the first bad byte (TODO confirm); make it an index.
+                */
+               err_offset--;
+               cur_offset = err_offset;
+
+               /* Skip forward to the next code point that decodes to uc > 0 */
+               while (cur_offset < len) {
+                       goffset tmp = cur_offset;
 
-               valid = g_utf8_find_next_char (end, in + len);
+                       U8_NEXT (p, cur_offset, len, uc);
 
-               if (!valid) {
-                       valid = in + len;
+                       if (uc > 0) {
+                               /* Fill string between err_offset and tmp with `?` character */
+                               memset (p + err_offset, '?',
+                                       tmp - err_offset);
+                               break;
+                       }
                }
 
-               if (valid > end) {
-                       memset ((gchar *)end, '?', valid - end);
-                       p = valid;
-                       remain = (in + len) - p;
-               }
-               else {
+               /*
+                * NOTE(review): if the scan ran to the end and its last code point
+                * was U+0000 (uc == 0), neither fill runs; confirm NUL cannot occur.
+                */
+               if (uc < 0) {
+                       /* Fill till the end */
+                       memset (p + err_offset, '?',
+                                       len - err_offset);
                        break;
                }
+
+               /* Offsets are relative to p, so advance p itself */
+               p += cur_offset;
+               len = end - p;
        }
 }
 
index 0786f4860c7a89a31a98fbe4a385913b68a0b3f6..c457fc4555e27c24fe99ad17c9b7e75bd519df5f 100644 (file)
@@ -26,6 +26,7 @@
 #include "unix-std.h"
 #include "protocol_internal.h"
 #include "libserver/mempool_vars_internal.h"
+#include "contrib/fastutf8/fastutf8.h"
 #include "task.h"
 #include <math.h>
 
@@ -922,16 +923,13 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
                                return;
                        }
 
-                       const gchar *end = NULL;
+                       goffset err_offset;
 
-                       if (g_utf8_validate (url->host, url->hostlen, &end)) {
+                       if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen) == 0)) {
                                obj = ucl_object_fromlstring (url->host, url->hostlen);
                        }
-                       else if (end - url->host > 0) {
-                               obj = ucl_object_fromlstring (url->host, end - url->host);
-                       }
                        else {
-                               return;
+                               obj = ucl_object_fromlstring (url->host, err_offset);
                        }
                }
                else {
index a9fc2270b711ad47cece6c33a969fd8174307ee7..a495dfdd58d2199337b2405dbe02b164ca61d353 100644 (file)
@@ -41,6 +41,8 @@
 #include <pcre2.h>
 #endif
 
+#include "contrib/fastutf8/fastutf8.h"
+
 #ifdef HAVE_SYS_WAIT_H
 #include <sys/wait.h>
 #endif
@@ -988,7 +990,7 @@ rspamd_re_cache_process_headers_list (struct rspamd_task *task,
                        in = (const guchar *)cur->value;
                        lenvec[i] = strlen (cur->value);
 
-                       if (!g_utf8_validate (in, lenvec[i], NULL)) {
+                       if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) {
                                raw = TRUE;
                        }
                }
index a9bd8d70eaf978869b745b39c46707c786c1849c..d67e2fc4dda228ab91127eebb3c33ed5ded08089 100644 (file)
@@ -20,6 +20,7 @@
 #include "radix.h"
 #include "rspamd.h"
 #include "cryptobox.h"
+#include "contrib/fastutf8/fastutf8.h"
 
 #ifdef WITH_HYPERSCAN
 #include "hs.h"
@@ -1189,7 +1190,7 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map,
        }
 
        if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
-               if (g_utf8_validate (in, len, NULL)) {
+               if (rspamd_fast_utf8_validate (in, len) == 0) {
                        validated = TRUE;
                }
        }
@@ -1280,7 +1281,7 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map,
        g_assert (in != NULL);
 
        if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
-               if (g_utf8_validate (in, len, NULL)) {
+               if (rspamd_fast_utf8_validate (in, len) == 0) {
                        validated = TRUE;
                }
        }
index 866ef52d859d2ed777987bc1b57ad15f80917a8a..90924f8d17124318bc186efa81599a791bdb49ce 100644 (file)
@@ -27,6 +27,8 @@
 #endif
 #include <math.h>
 
+#include "contrib/fastutf8/fastutf8.h"
+
 const guchar lc_map[256] = {
                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
@@ -2932,7 +2934,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
        }
 
        if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
-               if (!g_utf8_validate (pattern, slen, NULL)) {
+               if (rspamd_fast_utf8_validate (pattern, slen) != 0) {
                        tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
                }
        }
index 1ea8d380cee750cb2bbe9fac1695f5a8ba65609f..ef9c3105e021c78f534e9237a84f4024f9fae7b0 100644 (file)
@@ -34,6 +34,7 @@
 
 #include "unicode/uspoof.h"
 #include "unicode/uscript.h"
+#include "contrib/fastutf8/fastutf8.h"
 
 /***
  * @module rspamd_util
@@ -2855,10 +2856,33 @@ lua_util_is_valid_utf8 (lua_State *L)
        const gchar *str;
        gsize len;
 
-       str = lua_tolstring (L, 1, &len);
+       if (lua_isstring (L, 1)) {
+               str = lua_tolstring (L, 1, &len);
+       }
+       else {
+               struct rspamd_lua_text *t = lua_check_text (L, 1);
+
+               if (t) {
+                       str = t->start;
+                       len = t->len;
+               }
+               else {
+                       return luaL_error (L, "invalid arguments (text expected)");
+               }
+       }
 
        if (str) {
-               lua_pushboolean (L, g_utf8_validate (str, len, NULL));
+               goffset error_offset = rspamd_fast_utf8_validate (str, len);
+
+               if (error_offset == 0) {
+                       lua_pushboolean (L, true);
+               }
+               else {
+                       lua_pushboolean (L, false);
+                       lua_pushnumber (L, error_offset);
+
+                       return 2;
+               }
        }
        else {
                return luaL_error (L, "invalid arguments");