@@ -22,6 +22,7 @@ | |||
#include "libserver/task.h" | |||
#include "mime_encoding.h" | |||
#include "message.h" | |||
#include "contrib/fastutf8/fastutf8.h" | |||
#include <unicode/ucnv.h> | |||
#include <unicode/ucsdet.h> | |||
#if U_ICU_VERSION_MAJOR_NUM >= 44 | |||
@@ -468,36 +469,39 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, | |||
void | |||
rspamd_mime_charset_utf_enforce (gchar *in, gsize len) | |||
{ | |||
const gchar *end, *p; | |||
gsize remain = len; | |||
gchar *p, *end; | |||
goffset err_offset; | |||
UChar32 uc = 0; | |||
/* Now we validate input and replace bad characters with '?' symbol */ | |||
p = in; | |||
end = in + len; | |||
while (remain > 0 && !g_utf8_validate (p, remain, &end)) { | |||
gchar *valid; | |||
while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len) > 0)) { | |||
goffset cur_offset = err_offset; | |||
if (end >= in + len) { | |||
if (p < in + len) { | |||
memset ((gchar *)p, '?', (in + len) - p); | |||
} | |||
break; | |||
} | |||
while (cur_offset < len) { | |||
goffset tmp = cur_offset; | |||
valid = g_utf8_find_next_char (end, in + len); | |||
U8_NEXT (in, cur_offset, len, uc); | |||
if (!valid) { | |||
valid = in + len; | |||
if (uc > 0) { | |||
/* Fill string between err_offset and tmp with `?` character */ | |||
memset (in + err_offset, '?', | |||
tmp - err_offset); | |||
break; | |||
} | |||
} | |||
if (valid > end) { | |||
memset ((gchar *)end, '?', valid - end); | |||
p = valid; | |||
remain = (in + len) - p; | |||
} | |||
else { | |||
if (uc < 0) { | |||
/* Fill till the end */ | |||
memset (p + err_offset, '?', | |||
len - err_offset); | |||
break; | |||
} | |||
p = in + cur_offset; | |||
len = end - p; | |||
} | |||
} | |||
@@ -26,6 +26,7 @@ | |||
#include "unix-std.h" | |||
#include "protocol_internal.h" | |||
#include "libserver/mempool_vars_internal.h" | |||
#include "contrib/fastutf8/fastutf8.h" | |||
#include "task.h" | |||
#include <math.h> | |||
@@ -922,16 +923,13 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) | |||
return; | |||
} | |||
const gchar *end = NULL; | |||
goffset err_offset; | |||
if (g_utf8_validate (url->host, url->hostlen, &end)) { | |||
if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen) == 0)) { | |||
obj = ucl_object_fromlstring (url->host, url->hostlen); | |||
} | |||
else if (end - url->host > 0) { | |||
obj = ucl_object_fromlstring (url->host, end - url->host); | |||
} | |||
else { | |||
return; | |||
obj = ucl_object_fromlstring (url->host, err_offset); | |||
} | |||
} | |||
else { |
@@ -41,6 +41,8 @@ | |||
#include <pcre2.h> | |||
#endif | |||
#include "contrib/fastutf8/fastutf8.h" | |||
#ifdef HAVE_SYS_WAIT_H | |||
#include <sys/wait.h> | |||
#endif | |||
@@ -988,7 +990,7 @@ rspamd_re_cache_process_headers_list (struct rspamd_task *task, | |||
in = (const guchar *)cur->value; | |||
lenvec[i] = strlen (cur->value); | |||
if (!g_utf8_validate (in, lenvec[i], NULL)) { | |||
if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) { | |||
raw = TRUE; | |||
} | |||
} |
@@ -20,6 +20,7 @@ | |||
#include "radix.h" | |||
#include "rspamd.h" | |||
#include "cryptobox.h" | |||
#include "contrib/fastutf8/fastutf8.h" | |||
#ifdef WITH_HYPERSCAN | |||
#include "hs.h" | |||
@@ -1189,7 +1190,7 @@ rspamd_match_regexp_map_single (struct rspamd_regexp_map_helper *map, | |||
} | |||
if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { | |||
if (g_utf8_validate (in, len, NULL)) { | |||
if (rspamd_fast_utf8_validate (in, len) == 0) { | |||
validated = TRUE; | |||
} | |||
} | |||
@@ -1280,7 +1281,7 @@ rspamd_match_regexp_map_all (struct rspamd_regexp_map_helper *map, | |||
g_assert (in != NULL); | |||
if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { | |||
if (g_utf8_validate (in, len, NULL)) { | |||
if (rspamd_fast_utf8_validate (in, len) == 0) { | |||
validated = TRUE; | |||
} | |||
} |
@@ -27,6 +27,8 @@ | |||
#endif | |||
#include <math.h> | |||
#include "contrib/fastutf8/fastutf8.h" | |||
const guchar lc_map[256] = { | |||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |||
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, | |||
@@ -2932,7 +2934,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, | |||
} | |||
if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { | |||
if (!g_utf8_validate (pattern, slen, NULL)) { | |||
if (rspamd_fast_utf8_validate (pattern, slen) != 0) { | |||
tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL); | |||
} | |||
} |
@@ -34,6 +34,7 @@ | |||
#include "unicode/uspoof.h" | |||
#include "unicode/uscript.h" | |||
#include "contrib/fastutf8/fastutf8.h" | |||
/*** | |||
* @module rspamd_util | |||
@@ -2855,10 +2856,33 @@ lua_util_is_valid_utf8 (lua_State *L) | |||
const gchar *str; | |||
gsize len; | |||
str = lua_tolstring (L, 1, &len); | |||
if (lua_isstring (L, 1)) { | |||
str = lua_tolstring (L, 1, &len); | |||
} | |||
else { | |||
struct rspamd_lua_text *t = lua_check_text (L, 1); | |||
if (t) { | |||
str = t->start; | |||
len = t->len; | |||
} | |||
else { | |||
return luaL_error (L, "invalid arguments (text expected)"); | |||
} | |||
} | |||
if (str) { | |||
lua_pushboolean (L, g_utf8_validate (str, len, NULL)); | |||
goffset error_offset = rspamd_fast_utf8_validate (str, len); | |||
if (error_offset == 0) { | |||
lua_pushboolean (L, true); | |||
} | |||
else { | |||
lua_pushboolean (L, false); | |||
lua_pushnumber (L, error_offset); | |||
return 2; | |||
} | |||
} | |||
else { | |||
return luaL_error (L, "invalid arguments"); |