[Project] Use own utf8 validation instead of glib

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c

index 0fbba54b29445841aa2b8caeab38967bc7ab643d..942358d11bf3d464bfd8dea291cf0592d6feadba 100644 (file)
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -22,6 +22,7 @@
  #include "libserver/task.h"
  #include "mime_encoding.h"
  #include "message.h"
+#include "contrib/fastutf8/fastutf8.h"
  #include <unicode/ucnv.h>
  #include <unicode/ucsdet.h>
  #if U_ICU_VERSION_MAJOR_NUM >= 44
@@ -468,36 +469,47 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
  void
  rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
  {
-       const gchar *end, *p;
-       gsize remain = len;
+       gchar *p, *end;
+       goffset err_offset;
+       UChar32 uc = 0;
  
         /* Now we validate input and replace bad characters with '?' symbol */
         p = in;
+       end = in + len;
  
-       while (remain > 0 && !g_utf8_validate (p, remain, &end)) {
-               gchar *valid;
+       /*
+        * rspamd_fast_utf8_validate returns 0 for valid input, otherwise the
+        * 1-based position of the first bad byte counted from `p`. The assignment
+        * is parenthesised so err_offset holds that position, not the `> 0` result.
+        */
+       while (p < end && len > 0 &&
+                       (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
+               /* Turn the 1-based position into an index into `p` */
+               goffset cur_offset = --err_offset;
  
-               if (end >= in + len) {
-                       if (p < in + len) {
-                               memset ((gchar *)p, '?', (in + len) - p);
-                       }
-                       break;
-               }
+               while (cur_offset < len) {
+                       goffset tmp = cur_offset;
  
-               valid = g_utf8_find_next_char (end, in + len);
+                       U8_NEXT (p, cur_offset, len, uc);
  
-               if (!valid) {
-                       valid = in + len;
+                       if (uc > 0) {
+                               /* Fill string between err_offset and tmp with `?` character */
+                               memset (p + err_offset, '?',
+                                       tmp - err_offset);
+                               break;
+                       }
                 }
  
-               if (valid > end) {
-                       memset ((gchar *)end, '?', valid - end);
-                       p = valid;
-                       remain = (in + len) - p;
-               }
-               else {
+               if (uc < 0) {
+                       /* Fill till the end */
+                       memset (p + err_offset, '?',
+                                       len - err_offset);
                         break;
                 }
+
+               /* Offsets and len are relative to `p`, so advance from `p`, not `in` */
+               p += cur_offset;
+               len = end - p;
         }
  }
  
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c

index 0786f4860c7a89a31a98fbe4a385913b68a0b3f6..c457fc4555e27c24fe99ad17c9b7e75bd519df5f 100644 (file)
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -26,6 +26,7 @@
  #include "unix-std.h"
  #include "protocol_internal.h"
  #include "libserver/mempool_vars_internal.h"
+#include "contrib/fastutf8/fastutf8.h"
  #include "task.h"
  #include <math.h>
  
@@ -922,16 +923,22 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
                                 return;
                         }
  
-                       const gchar *end = NULL;
+                       goffset err_offset;
  
-                       if (g_utf8_validate (url->host, url->hostlen, &end)) {
+                       /*
+                        * The validator returns 0 for valid input, otherwise the 1-based
+                        * position of the first bad byte; parenthesise the assignment so
+                        * err_offset is that position rather than the `== 0` result.
+                        */
+                       if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) {
                                 obj = ucl_object_fromlstring (url->host, url->hostlen);
                         }
-                       else if (end - url->host > 0) {
-                               obj = ucl_object_fromlstring (url->host, end - url->host);
-                       }
+                       else if (err_offset > 1) {
+                               /* Emit only the valid prefix that precedes the bad byte */
+                               obj = ucl_object_fromlstring (url->host, err_offset - 1);
+                       }
                         else {
+                               /* No valid prefix: a zero length would make ucl fall back to strlen() */
                                 return;
                         }
                 }
                 else {
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c

index a9fc2270b711ad47cece6c33a969fd8174307ee7..a495dfdd58d2199337b2405dbe02b164ca61d353 100644 (file)
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -41,6 +41,8 @@
  #include <pcre2.h>
  #endif
  
+#include "contrib/fastutf8/fastutf8.h"
+
  #ifdef HAVE_SYS_WAIT_H
  #include <sys/wait.h>
  #endif
@@ -988,7 +990,7 @@ rspamd_re_cache_process_headers_list (struct rspamd_task *task,
                         in = (const guchar *)cur->value;
                         lenvec[i] = strlen (cur->value);
  
-                       if (!g_utf8_validate (in, lenvec[i], NULL)) {
+                       if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) { /* non-zero result: value failed UTF-8 validation */
                                 raw = TRUE;
                         }
                 }
diff --git a/src/libutil/map_helpers.c b/src/libutil/map_helpers.c

index a9bd8d70eaf978869b745b39c46707c786c1849c..d67e2fc4dda228ab91127eebb3c33ed5ded08089 100644 (file)
--- a/src/libutil/map_helpers.c
+++ b/src/libutil/map_helpers.c
@@ -20,6 +20,7 @@
  #include "radix.h"
  #include "rspamd.h"
  #include "cryptobox.h"
+#include "contrib/fastutf8/fastutf8.h"
  
  #ifdef WITH_HYPERSCAN
  #include "hs.h"
@@ -1189,7 +1190,7 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map,
         }
  
         if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
-               if (g_utf8_validate (in, len, NULL)) {
+               if (rspamd_fast_utf8_validate (in, len) == 0) { /* 0 result: the whole input validated as UTF-8 */
                         validated = TRUE;
                 }
         }
@@ -1280,7 +1281,7 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map,
         g_assert (in != NULL);
  
         if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) {
-               if (g_utf8_validate (in, len, NULL)) {
+               if (rspamd_fast_utf8_validate (in, len) == 0) { /* 0 result: the whole input validated as UTF-8 */
                         validated = TRUE;
                 }
         }
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c

index 866ef52d859d2ed777987bc1b57ad15f80917a8a..90924f8d17124318bc186efa81599a791bdb49ce 100644 (file)
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -27,6 +27,8 @@
  #endif
  #include <math.h>
  
+#include "contrib/fastutf8/fastutf8.h"
+
  const guchar lc_map[256] = {
                 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
@@ -2932,7 +2934,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
         }
  
         if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
-               if (!g_utf8_validate (pattern, slen, NULL)) {
+               if (rspamd_fast_utf8_validate (pattern, slen) != 0) { /* non-zero result: pattern failed UTF-8 validation */
                         tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
                 }
         }
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c

index 1ea8d380cee750cb2bbe9fac1695f5a8ba65609f..ef9c3105e021c78f534e9237a84f4024f9fae7b0 100644 (file)
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -34,6 +34,7 @@
  
  #include "unicode/uspoof.h"
  #include "unicode/uscript.h"
+#include "contrib/fastutf8/fastutf8.h"
  
  /***
   * @module rspamd_util
@@ -2855,10 +2856,33 @@ lua_util_is_valid_utf8 (lua_State *L)
         const gchar *str;
         gsize len;
  
-       str = lua_tolstring (L, 1, &len);
+       if (lua_isstring (L, 1)) {
+               str = lua_tolstring (L, 1, &len);
+       }
+       else {
+               struct rspamd_lua_text *t = lua_check_text (L, 1); /* not a Lua string: try an rspamd_lua_text argument */
+
+               if (t) {
+                       str = t->start;
+                       len = t->len;
+               }
+               else {
+                       return luaL_error (L, "invalid arguments (text expected)");
+               }
+       }
  
         if (str) {
-               lua_pushboolean (L, g_utf8_validate (str, len, NULL));
+               goffset error_offset = rspamd_fast_utf8_validate (str, len); /* 0 is treated as "valid" below */
+
+               if (error_offset == 0) {
+                       lua_pushboolean (L, true);
+               }
+               else {
+                       lua_pushboolean (L, false);
+                       lua_pushnumber (L, error_offset);
+
+                       return 2; /* two results: false and the reported error offset */
+               }
         }
         else {
                 return luaL_error (L, "invalid arguments");
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Fri, 15 Nov 2019 16:40:54 +0000 (16:40 +0000)
src/libmime/mime_encoding.c		patch \| blob \| history
src/libserver/protocol.c		patch \| blob \| history
src/libserver/re_cache.c		patch \| blob \| history
src/libutil/map_helpers.c		patch \| blob \| history
src/libutil/str_util.c		patch \| blob \| history
src/lua/lua_util.c		patch \| blob \| history