From 79f15b27c647f6a7028b6167f9c243ca3f6fa96a Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 23 Apr 2018 10:32:48 +0100
Subject: [PATCH] [Minor] Move regexp escape function to the public space

---
 src/libutil/multipattern.c | 115 +-----------------------------------
 src/libutil/str_util.c     | 117 +++++++++++++++++++++++++++++++++++++
 src/libutil/str_util.h     |  12 ++++
 3 files changed, 131 insertions(+), 113 deletions(-)

diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c
index e55b5d0b5..94b5398b3 100644
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -133,117 +133,6 @@ rspamd_multipattern_escape_tld_hyperscan (const gchar *pattern, gsize slen,
 	return res;
 }
 
-static gchar *
-rspamd_multipattern_escape_hyperscan (const gchar *pattern, gsize slen,
-		gsize *dst_len, gboolean allow_glob)
-{
-	const gchar *p, *end = pattern + slen;
-	gchar *res, *d, t;
-	gsize len;
-	static const gchar hexdigests[16] = "0123456789abcdef";
-
-	len = slen;
-	p = pattern;
-
-	/* [-[\]{}()*+?.,\\^$|#\s] need to be escaped */
-	while (p < end) {
-		t = *p ++;
-
-		switch (t) {
-		case '[':
-		case ']':
-		case '-':
-		case '\\':
-		case '{':
-		case '}':
-		case '(':
-		case ')':
-		case '*':
-		case '+':
-		case '?':
-		case '.':
-		case ',':
-		case '^':
-		case '$':
-		case '|':
-		case '#':
-			len ++;
-			break;
-		default:
-			if (g_ascii_isspace (t)) {
-				len ++;
-			}
-			else if (!g_ascii_isprint (t)) {
-				/* \\xHH -> 4 symbols */
-				len += 3;
-			}
-			break;
-		}
-	}
-
-	if (slen == len) {
-		*dst_len = slen;
-		return g_strdup (pattern);
-	}
-
-	res = g_malloc (len + 1);
-	p = pattern;
-	d = res;
-
-	while (p < end) {
-		t = *p ++;
-
-		switch (t) {
-		case '[':
-		case ']':
-		case '-':
-		case '\\':
-		case '{':
-		case '}':
-		case '(':
-		case ')':
-		case '.':
-		case ',':
-		case '^':
-		case '$':
-		case '|':
-		case '#':
-			*d++ = '\\';
-			break;
-		case '*':
-		case '?':
-		case '+':
-			if (allow_glob) {
-				/* Treat * as .* and ? as .? */
-				*d++ = '.';
-			}
-			else {
-				*d++ = '\\';
-			}
-			break;
-		default:
-			if (g_ascii_isspace (t)) {
-				*d++ = '\\';
-			}
-			else if (!g_ascii_isgraph (t)) {
-				*d++ = '\\';
-				*d++ = 'x';
-				*d++ = hexdigests[((t >> 4) & 0xF)];
-				*d++ = hexdigests[((t) & 0xF)];
-				continue; /* To avoid *d++ = t; */
-			}
-			break;
-		}
-
-		*d++ = t;
-	}
-
-	*d = '\0';
-	*dst_len = d - res;
-
-	return res;
-}
-
 #endif
 static gchar *
 rspamd_multipattern_escape_tld_acism (const gchar *pattern, gsize len,
@@ -312,10 +201,10 @@ rspamd_multipattern_pattern_filter (const gchar *pattern, gsize len,
 		*dst_len = rspamd_strlcpy (ret, pattern, len + 1);
 	}
 	else if (flags & RSPAMD_MULTIPATTERN_GLOB) {
-		ret = rspamd_multipattern_escape_hyperscan (pattern, len, dst_len, TRUE);
+		ret = rspamd_str_regexp_escape (pattern, len, dst_len, TRUE);
 	}
 	else {
-		ret = rspamd_multipattern_escape_hyperscan (pattern, len, dst_len, FALSE);
+		ret = rspamd_str_regexp_escape (pattern, len, dst_len, FALSE);
 	}
 
 	return ret;
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 3b1f3c1e3..186ce5d38 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2093,3 +2093,120 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 	return FALSE;
 #endif
 }
+
+gchar *
+rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
+		gsize *dst_len, gboolean allow_glob)
+{
+	const gchar *p, *end = pattern + slen;
+	gchar *res, *d, t;
+	gsize len;
+	static const gchar hexdigests[16] = "0123456789abcdef";
+
+	len = slen;
+	p = pattern;
+
+	/* First pass: compute the output size; [-[\]{}()*+?.,\\^$|#\s] need to be escaped */
+	while (p < end) {
+		t = *p ++;
+
+		switch (t) {
+		case '[':
+		case ']':
+		case '-':
+		case '\\':
+		case '{':
+		case '}':
+		case '(':
+		case ')':
+		case '*':
+		case '+':
+		case '?':
+		case '.':
+		case ',':
+		case '^':
+		case '$':
+		case '|':
+		case '#':
+			len ++;
+			break;
+		default:
+			if (g_ascii_isspace (t)) {
+				len ++;
+			}
+			else if (!g_ascii_isprint (t)) {
+				/* Written as \xHH: 4 bytes instead of 1, so 3 extra */
+				len += 3;
+			}
+			break;
+		}
+	}
+
+	if (slen == len) {
+		if (dst_len) {
+			*dst_len = slen;
+		}
+
+		return g_strndup (pattern, slen); /* Nothing to escape; honour slen, input may lack a trailing NUL */
+	}
+
+	res = g_malloc (len + 1);
+	p = pattern;
+	d = res;
+
+	while (p < end) {
+		t = *p ++;
+
+		switch (t) {
+		case '[':
+		case ']':
+		case '-':
+		case '\\':
+		case '{':
+		case '}':
+		case '(':
+		case ')':
+		case '.':
+		case ',':
+		case '^':
+		case '$':
+		case '|':
+		case '#':
+			*d++ = '\\';
+			break;
+		case '*':
+		case '?':
+		case '+':
+			if (allow_glob) {
+				/* Glob mode: emit '.' before the character, so * -> .*, ? -> .?, + -> .+ */
+				*d++ = '.';
+			}
+			else {
+				*d++ = '\\';
+			}
+			break;
+		default:
+			if (g_ascii_isspace (t)) {
+				*d++ = '\\';
+			}
+			else if (!g_ascii_isgraph (t)) {
+				*d++ = '\\';
+				*d++ = 'x';
+				*d++ = hexdigests[((t >> 4) & 0xF)];
+				*d++ = hexdigests[((t) & 0xF)];
+				continue; /* To avoid *d++ = t; */
+			}
+			break;
+		}
+
+		*d++ = t;
+	}
+
+	*d = '\0';
+
+	if (dst_len) {
+		*dst_len = d - res;
+	}
+
+	return res;
+}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 5f0695c2a..45507e2be 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -375,4 +375,16 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
 gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
 		guint *len);
 
+/**
+ * Escapes special characters when reading plain data to be processed in pcre
+ * @param pattern pattern to process (only the first slen bytes are read)
+ * @param slen source length in bytes
+ * @param dst_len destination length pointer (can be NULL)
+ * @param allow_glob if TRUE, '*', '?' and '+' get a leading '.' instead of a backslash
+ * @return newly allocated zero terminated escaped pattern (release with g_free)
+ */
+gchar *
+rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
+		gsize *dst_len, gboolean allow_glob);
+
 #endif /* SRC_LIBUTIL_STR_UTIL_H_ */
-- 
2.39.5