source.dussan.org Git - rspamd.git/commitdiff
[Feature] Better escaping of unicode
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 14 Feb 2019 17:27:01 +0000 (17:27 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 14 Feb 2019 17:28:19 +0000 (17:28 +0000)
src/libutil/str_util.c
src/libutil/str_util.h

index 06d7a6cc7927603c4740c3065bd218e89196a2e3..0defa2acf1db2885635773d532116e6a2c5c8c45 100644 (file)
@@ -2605,7 +2605,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                gsize *dst_len, enum rspamd_regexp_escape_flags flags)
 {
        const gchar *p, *end = pattern + slen;
-       gchar *res, *d, t, *tmp_utf = NULL;
+       gchar *res, *d, t, *tmp_utf = NULL, *dend;
        gsize len;
        static const gchar hexdigests[16] = "0123456789abcdef";
 
@@ -2634,15 +2634,22 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                case '$':
                case '|':
                case '#':
-                       len ++;
+                       if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+                               len++;
+                       }
                        break;
                default:
                        if (g_ascii_isspace (t)) {
                                len ++;
                        }
                        else {
-                               if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
-                                       if (!g_ascii_isprint (t)) {
+                               if (!g_ascii_isprint (t) || (t & 0x80)) {
+
+                                       if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
+                                               /* \x{code}, where code can be up to 5 digits */
+                                               len += 4;
+                                       }
+                                       else {
                                                /* \\xHH -> 4 symbols */
                                                len += 3;
                                        }
@@ -2668,8 +2675,6 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                        *dst_len = slen;
                }
 
-
-
                if (tmp_utf) {
                        return tmp_utf;
                }
@@ -2685,8 +2690,10 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
        res = g_malloc (len + 1);
        p = pattern;
        d = res;
+       dend = d + len;
 
        while (p < end) {
+               g_assert (d < dend);
                t = *p ++;
 
                switch (t) {
@@ -2704,7 +2711,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                case '$':
                case '|':
                case '#':
-                       *d++ = '\\';
+                       if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+                               *d++ = '\\';
+                       }
                        break;
                case '*':
                case '?':
@@ -2714,19 +2723,40 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
                                *d++ = '.';
                        }
                        else {
-                               *d++ = '\\';
+                               if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+                                       *d++ = '\\';
+                               }
                        }
                        break;
                default:
                        if (g_ascii_isspace (t)) {
-                               *d++ = '\\';
+                               if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
+                                       *d++ = '\\';
+                               }
                        }
-                       else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) {
-                               *d++ = '\\';
-                               *d++ = 'x';
-                               *d++ = hexdigests[((t >> 4) & 0xF)];
-                               *d++ = hexdigests[((t) & 0xF)];
-                               continue; /* To avoid *d++ = t; */
+                       else if (t & 0x80 || !g_ascii_isprint (t)) {
+                               if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
+                                       *d++ = '\\';
+                                       *d++ = 'x';
+                                       *d++ = hexdigests[((t >> 4) & 0xF)];
+                                       *d++ = hexdigests[((t) & 0xF)];
+                                       continue; /* To avoid *d++ = t; */
+                               }
+                               else {
+                                       if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) {
+                                               UChar32 uc;
+                                               gint32 off = p - pattern - 1;
+                                               U8_NEXT (pattern, off, slen, uc);
+
+                                               if (uc > 0) {
+                                                       d += rspamd_snprintf (d, dend - d,
+                                                                       "\\x{%xd}", uc);
+                                                       p = pattern + off;
+                                               }
+
+                                               continue; /* To avoid *d++ = t; */
+                                       }
+                               }
                        }
                        break;
                }
index 46b74001bf338f53f7b639abf8eda09deb65dc20..34c1271d4fbe9b11382eed8f4cc2a56a17dc39e1 100644 (file)
@@ -436,6 +436,7 @@ enum rspamd_regexp_escape_flags {
        RSPAMD_REGEXP_ESCAPE_ASCII = 0,
        RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
        RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
+       RSPAMD_REGEXP_ESCAPE_RE = 1u << 2,
 };
 /**
  * Escapes special characters when reading plain data to be processed in pcre