diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-02-14 17:27:01 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-02-14 17:28:19 +0000 |
commit | 9b2e2d70a83c5c679f917253bcdb733d4bbbe705 (patch) | |
tree | cf8508b8311b72a9564241873660cccbaa904862 | |
parent | 5f2aad306e13c1688a0e0d48be65aa5bd6070ebb (diff) | |
download | rspamd-9b2e2d70a83c5c679f917253bcdb733d4bbbe705.tar.gz rspamd-9b2e2d70a83c5c679f917253bcdb733d4bbbe705.zip |
[Feature] Better escaping of unicode
-rw-r--r-- | src/libutil/str_util.c | 60 | ||||
-rw-r--r-- | src/libutil/str_util.h | 1 |
2 files changed, 46 insertions, 15 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 06d7a6cc7..0defa2acf 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2605,7 +2605,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, gsize *dst_len, enum rspamd_regexp_escape_flags flags) { const gchar *p, *end = pattern + slen; - gchar *res, *d, t, *tmp_utf = NULL; + gchar *res, *d, t, *tmp_utf = NULL, *dend; gsize len; static const gchar hexdigests[16] = "0123456789abcdef"; @@ -2634,15 +2634,22 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, case '$': case '|': case '#': - len ++; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + len++; + } break; default: if (g_ascii_isspace (t)) { len ++; } else { - if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) { - if (!g_ascii_isprint (t)) { + if (!g_ascii_isprint (t) || (t & 0x80)) { + + if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { + /* \x{code}, where code can be up to 5 digits */ + len += 4; + } + else { /* \\xHH -> 4 symbols */ len += 3; } @@ -2668,8 +2675,6 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, *dst_len = slen; } - - if (tmp_utf) { return tmp_utf; } @@ -2685,8 +2690,10 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, res = g_malloc (len + 1); p = pattern; d = res; + dend = d + len; while (p < end) { + g_assert (d < dend); t = *p ++; switch (t) { @@ -2704,7 +2711,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, case '$': case '|': case '#': - *d++ = '\\'; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + *d++ = '\\'; + } break; case '*': case '?': @@ -2714,19 +2723,40 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, *d++ = '.'; } else { - *d++ = '\\'; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + *d++ = '\\'; + } } break; default: if (g_ascii_isspace (t)) { - *d++ = '\\'; + if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) { + *d++ = '\\'; + } } - else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) { - *d++ = '\\'; - *d++ = 'x'; - *d++ = hexdigests[((t >> 4) & 0xF)]; - *d++ = hexdigests[((t) & 0xF)]; - continue; /* To avoid *d++ = t; */ + else if (t & 0x80 || !g_ascii_isprint (t)) { + if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) { + *d++ = '\\'; + *d++ = 'x'; + *d++ = hexdigests[((t >> 4) & 0xF)]; + *d++ = hexdigests[((t) & 0xF)]; + continue; /* To avoid *d++ = t; */ + } + else { + if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) { + UChar32 uc; + gint32 off = p - pattern - 1; + U8_NEXT (pattern, off, slen, uc); + + if (uc > 0) { + d += rspamd_snprintf (d, dend - d, + "\\x{%xd}", uc); + p = pattern + off; + } + + continue; /* To avoid *d++ = t; */ + } + } } break; } diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 46b74001b..34c1271d4 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -436,6 +436,7 @@ enum rspamd_regexp_escape_flags { RSPAMD_REGEXP_ESCAPE_ASCII = 0, RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, + RSPAMD_REGEXP_ESCAPE_RE = 1u << 2, }; /** * Escapes special characters when reading plain data to be processed in pcre |