diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-20 20:44:49 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-20 20:45:22 +0000 |
commit | a45141c003e065341474de0ec0b4310b2f4437c8 (patch) | |
tree | 2e9851e15290df805793cf6b51d0a0ae0753c195 /src | |
parent | dc506fc54b60f4bcc7390447a0d80bfd6f799e54 (diff) | |
download | rspamd-a45141c003e065341474de0ec0b4310b2f4437c8.tar.gz rspamd-a45141c003e065341474de0ec0b4310b2f4437c8.zip |
[Fix] Properly escape utf8 regexps in hyperscan mode
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/lang_detection.c | 5 |
-rw-r--r-- | src/libutil/map_helpers.c | 2 |
-rw-r--r-- | src/libutil/multipattern.c | 11 |
-rw-r--r-- | src/libutil/str_util.c | 46 |
-rw-r--r-- | src/libutil/str_util.h | 7 |
-rw-r--r-- | src/lua/lua_regexp.c | 6 |
6 files changed, 61 insertions, 16 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 49e788438..102117b21 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -458,8 +458,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, const char *word = ucl_object_tolstring (w, &wlen); const char *saved; - rspamd_multipattern_add_pattern (d->stop_words[cat].mp, - word, wlen); + rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp, + word, wlen, + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); nelt->stop_words ++; nstop ++; diff --git a/src/libutil/map_helpers.c b/src/libutil/map_helpers.c index e6b940f23..4c34cba15 100644 --- a/src/libutil/map_helpers.c +++ b/src/libutil/map_helpers.c @@ -540,7 +540,7 @@ rspamd_map_helper_insert_re (gpointer st, gconstpointer key, gconstpointer value if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) { escaped = rspamd_str_regexp_escape (key, strlen (key), &escaped_len, - TRUE); + RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF); re = rspamd_regexp_new (escaped, NULL, &err); g_free (escaped); } diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c index 94b5398b3..e4a39d5fe 100644 --- a/src/libutil/multipattern.c +++ b/src/libutil/multipattern.c @@ -193,6 +193,12 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len, gchar *ret = NULL; #ifdef WITH_HYPERSCAN if (rspamd_hs_check ()) { + gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII; + + if (flags & RSPAMD_MULTIPATTERN_UTF8) { + gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF; + } + if (flags & RSPAMD_MULTIPATTERN_TLD) { ret = rspamd_multipattern_escape_tld_hyperscan (pattern, len, dst_len); } @@ -201,10 +207,11 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len, *dst_len = rspamd_strlcpy (ret, pattern, len + 1); } else if (flags & RSPAMD_MULTIPATTERN_GLOB) { - ret = rspamd_str_regexp_escape (pattern, len, dst_len, TRUE); + ret = rspamd_str_regexp_escape (pattern, len, dst_len, + gl_flags | 
RSPAMD_REGEXP_ESCAPE_GLOB); } else { - ret = rspamd_str_regexp_escape (pattern, len, dst_len, FALSE); + ret = rspamd_str_regexp_escape (pattern, len, dst_len, gl_flags); } return ret; diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index f798d9eeb..be7323df3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2327,10 +2327,10 @@ out: gchar * rspamd_str_regexp_escape (const gchar *pattern, gsize slen, - gsize *dst_len, gboolean allow_glob) + gsize *dst_len, enum rspamd_regexp_escape_flags flags) { const gchar *p, *end = pattern + slen; - gchar *res, *d, t; + gchar *res, *d, t, *tmp_utf = NULL; gsize len; static const gchar hexdigests[16] = "0123456789abcdef"; @@ -2365,20 +2365,46 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, if (g_ascii_isspace (t)) { len ++; } - else if (!g_ascii_isprint (t)) { - /* \\xHH -> 4 symbols */ - len += 3; + else { + if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) { + if (!g_ascii_isprint (t)) { + /* \\xHH -> 4 symbols */ + len += 3; + } + } } break; } } + if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { + if (!g_utf8_validate (pattern, slen, NULL)) { + tmp_utf = g_utf8_make_valid (pattern, slen); + } + } + if (slen == len) { if (dst_len) { + + if (tmp_utf) { + slen = strlen (tmp_utf); + } + *dst_len = slen; } - return g_strdup (pattern); + + + if (tmp_utf) { + return tmp_utf; + } + else { + return g_strdup (pattern); + } + } + + if (tmp_utf) { + pattern = tmp_utf; } res = g_malloc (len + 1); @@ -2408,7 +2434,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, case '*': case '?': case '+': - if (allow_glob) { + if (flags & RSPAMD_REGEXP_ESCAPE_GLOB) { /* Treat * as .* and ? as .? 
*/ *d++ = '.'; } @@ -2420,7 +2446,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, if (g_ascii_isspace (t)) { *d++ = '\\'; } - else if (!g_ascii_isgraph (t)) { + else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) { *d++ = '\\'; *d++ = 'x'; *d++ = hexdigests[((t >> 4) & 0xF)]; @@ -2439,5 +2465,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, *dst_len = d - res; } + if (tmp_utf) { + g_free (tmp_utf); + } + return res; } diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 100b64b88..ffcc69197 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -396,6 +396,11 @@ rspamd_str_has_8bit (const guchar *beg, gsize len) gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, guint *len); +enum rspamd_regexp_escape_flags { + RSPAMD_REGEXP_ESCAPE_ASCII = 0, + RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, + RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, +}; /** * Escapes special characters when reading plain data to be processed in pcre * @param pattern pattern to process @@ -406,6 +411,6 @@ gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, */ gchar * rspamd_str_regexp_escape (const gchar *pattern, gsize slen, - gsize *dst_len, gboolean allow_glob); + gsize *dst_len, enum rspamd_regexp_escape_flags flags); #endif /* SRC_LIBUTIL_STR_UTIL_H_ */ diff --git a/src/lua/lua_regexp.c b/src/lua/lua_regexp.c index 584f7d8c1..4e233448b 100644 --- a/src/lua/lua_regexp.c +++ b/src/lua/lua_regexp.c @@ -191,7 +191,8 @@ lua_regexp_import_glob (lua_State *L) } if (string) { - escaped = rspamd_str_regexp_escape (string, pat_len, NULL, TRUE); + escaped = rspamd_str_regexp_escape (string, pat_len, NULL, + RSPAMD_REGEXP_ESCAPE_GLOB|RSPAMD_REGEXP_ESCAPE_UTF); re = rspamd_regexp_new (escaped, flags_str, &err); @@ -249,7 +250,8 @@ lua_regexp_import_plain (lua_State *L) } if (string) { - escaped = rspamd_str_regexp_escape (string, pat_len, NULL, FALSE); + escaped = rspamd_str_regexp_escape (string, 
pat_len, NULL, + RSPAMD_REGEXP_ESCAPE_ASCII); re = rspamd_regexp_new (escaped, flags_str, &err); |