about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-20 20:44:49 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-20 20:45:22 +0000
commit: a45141c003e065341474de0ec0b4310b2f4437c8 (patch)
tree: 2e9851e15290df805793cf6b51d0a0ae0753c195 /src
parent: dc506fc54b60f4bcc7390447a0d80bfd6f799e54 (diff)
download: rspamd-a45141c003e065341474de0ec0b4310b2f4437c8.tar.gz
          rspamd-a45141c003e065341474de0ec0b4310b2f4437c8.zip
[Fix] Properly escape utf8 regexps in hyperscan mode
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/lang_detection.c 5
-rw-r--r-- src/libutil/map_helpers.c 2
-rw-r--r-- src/libutil/multipattern.c 11
-rw-r--r-- src/libutil/str_util.c 46
-rw-r--r-- src/libutil/str_util.h 7
-rw-r--r-- src/lua/lua_regexp.c 6
6 files changed, 61 insertions, 16 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 49e788438..102117b21 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -458,8 +458,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
const char *word = ucl_object_tolstring (w, &wlen);
const char *saved;
- rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
- word, wlen);
+ rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp,
+ word, wlen,
+ RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
nelt->stop_words ++;
nstop ++;
diff --git a/src/libutil/map_helpers.c b/src/libutil/map_helpers.c
index e6b940f23..4c34cba15 100644
--- a/src/libutil/map_helpers.c
+++ b/src/libutil/map_helpers.c
@@ -540,7 +540,7 @@ rspamd_map_helper_insert_re (gpointer st, gconstpointer key, gconstpointer value
if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) {
escaped = rspamd_str_regexp_escape (key, strlen (key), &escaped_len,
- TRUE);
+ RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
re = rspamd_regexp_new (escaped, NULL, &err);
g_free (escaped);
}
diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c
index 94b5398b3..e4a39d5fe 100644
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -193,6 +193,12 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
gchar *ret = NULL;
#ifdef WITH_HYPERSCAN
if (rspamd_hs_check ()) {
+ gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII;
+
+ if (flags & RSPAMD_MULTIPATTERN_UTF8) {
+ gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
+ }
+
if (flags & RSPAMD_MULTIPATTERN_TLD) {
ret = rspamd_multipattern_escape_tld_hyperscan (pattern, len, dst_len);
}
@@ -201,10 +207,11 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
*dst_len = rspamd_strlcpy (ret, pattern, len + 1);
}
else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
- ret = rspamd_str_regexp_escape (pattern, len, dst_len, TRUE);
+ ret = rspamd_str_regexp_escape (pattern, len, dst_len,
+ gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB);
}
else {
- ret = rspamd_str_regexp_escape (pattern, len, dst_len, FALSE);
+ ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags);
}
return ret;
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index f798d9eeb..be7323df3 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2327,10 +2327,10 @@ out:
gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
- gsize *dst_len, gboolean allow_glob)
+ gsize *dst_len, enum rspamd_regexp_escape_flags flags)
{
const gchar *p, *end = pattern + slen;
- gchar *res, *d, t;
+ gchar *res, *d, t, *tmp_utf = NULL;
gsize len;
static const gchar hexdigests[16] = "0123456789abcdef";
@@ -2365,20 +2365,46 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
if (g_ascii_isspace (t)) {
len ++;
}
- else if (!g_ascii_isprint (t)) {
- /* \\xHH -> 4 symbols */
- len += 3;
+ else {
+ if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
+ if (!g_ascii_isprint (t)) {
+ /* \\xHH -> 4 symbols */
+ len += 3;
+ }
+ }
}
break;
}
}
+ if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
+ if (!g_utf8_validate (pattern, slen, NULL)) {
+ tmp_utf = g_utf8_make_valid (pattern, slen);
+ }
+ }
+
if (slen == len) {
if (dst_len) {
+
+ if (tmp_utf) {
+ slen = strlen (tmp_utf);
+ }
+
*dst_len = slen;
}
- return g_strdup (pattern);
+
+
+ if (tmp_utf) {
+ return tmp_utf;
+ }
+ else {
+ return g_strdup (pattern);
+ }
+ }
+
+ if (tmp_utf) {
+ pattern = tmp_utf;
}
res = g_malloc (len + 1);
@@ -2408,7 +2434,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
case '*':
case '?':
case '+':
- if (allow_glob) {
+ if (flags & RSPAMD_REGEXP_ESCAPE_GLOB) {
/* Treat * as .* and ? as .? */
*d++ = '.';
}
@@ -2420,7 +2446,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
if (g_ascii_isspace (t)) {
*d++ = '\\';
}
- else if (!g_ascii_isgraph (t)) {
+ else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) {
*d++ = '\\';
*d++ = 'x';
*d++ = hexdigests[((t >> 4) & 0xF)];
@@ -2439,5 +2465,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
*dst_len = d - res;
}
+ if (tmp_utf) {
+ g_free (tmp_utf);
+ }
+
return res;
}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 100b64b88..ffcc69197 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -396,6 +396,11 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
gchar *start, guint *len);
+enum rspamd_regexp_escape_flags {
+ RSPAMD_REGEXP_ESCAPE_ASCII = 0,
+ RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
+ RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
+};
/**
* Escapes special characters when reading plain data to be processed in pcre
* @param pattern pattern to process
@@ -406,6 +411,6 @@ gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
*/
gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
- gsize *dst_len, gboolean allow_glob);
+ gsize *dst_len, enum rspamd_regexp_escape_flags flags);
#endif /* SRC_LIBUTIL_STR_UTIL_H_ */
diff --git a/src/lua/lua_regexp.c b/src/lua/lua_regexp.c
index 584f7d8c1..4e233448b 100644
--- a/src/lua/lua_regexp.c
+++ b/src/lua/lua_regexp.c
@@ -191,7 +191,8 @@ lua_regexp_import_glob (lua_State *L)
}
if (string) {
- escaped = rspamd_str_regexp_escape (string, pat_len, NULL, TRUE);
+ escaped = rspamd_str_regexp_escape (string, pat_len, NULL,
+ RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF);
re = rspamd_regexp_new (escaped, flags_str, &err);
@@ -249,7 +250,8 @@ lua_regexp_import_plain (lua_State *L)
}
if (string) {
- escaped = rspamd_str_regexp_escape (string, pat_len, NULL, FALSE);
+ escaped = rspamd_str_regexp_escape (string, pat_len, NULL,
+ RSPAMD_REGEXP_ESCAPE_ASCII);
re = rspamd_regexp_new (escaped, flags_str, &err);