Quellcode durchsuchen

[Feature] Better escaping of unicode

tags/1.9.0
Vsevolod Stakhov vor 5 Jahren
Ursprung
Commit
9b2e2d70a8
2 geänderte Dateien mit 46 neuen und 15 gelöschten Zeilen
  1. src/libutil/str_util.c (+45, −15)
  2. src/libutil/str_util.h (+1, −0)

src/libutil/str_util.c (+45, −15) Datei anzeigen

@@ -2605,7 +2605,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
gsize *dst_len, enum rspamd_regexp_escape_flags flags)
{
const gchar *p, *end = pattern + slen;
gchar *res, *d, t, *tmp_utf = NULL;
gchar *res, *d, t, *tmp_utf = NULL, *dend;
gsize len;
static const gchar hexdigests[16] = "0123456789abcdef";

@@ -2634,15 +2634,22 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
case '$':
case '|':
case '#':
len ++;
if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
len++;
}
break;
default:
if (g_ascii_isspace (t)) {
len ++;
}
else {
if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
if (!g_ascii_isprint (t)) {
if (!g_ascii_isprint (t) || (t & 0x80)) {

if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
/* \x{code}, where code can be up to 5 digits */
len += 4;
}
else {
/* \\xHH -> 4 symbols */
len += 3;
}
@@ -2668,8 +2675,6 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
*dst_len = slen;
}



if (tmp_utf) {
return tmp_utf;
}
@@ -2685,8 +2690,10 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
res = g_malloc (len + 1);
p = pattern;
d = res;
dend = d + len;

while (p < end) {
g_assert (d < dend);
t = *p ++;

switch (t) {
@@ -2704,7 +2711,9 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
case '$':
case '|':
case '#':
*d++ = '\\';
if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
*d++ = '\\';
}
break;
case '*':
case '?':
@@ -2714,19 +2723,40 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
*d++ = '.';
}
else {
*d++ = '\\';
if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
*d++ = '\\';
}
}
break;
default:
if (g_ascii_isspace (t)) {
*d++ = '\\';
if (!(flags & RSPAMD_REGEXP_ESCAPE_RE)) {
*d++ = '\\';
}
}
else if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF) && !g_ascii_isgraph (t)) {
*d++ = '\\';
*d++ = 'x';
*d++ = hexdigests[((t >> 4) & 0xF)];
*d++ = hexdigests[((t) & 0xF)];
continue; /* To avoid *d++ = t; */
else if (t & 0x80 || !g_ascii_isprint (t)) {
if (!(flags & RSPAMD_REGEXP_ESCAPE_UTF)) {
*d++ = '\\';
*d++ = 'x';
*d++ = hexdigests[((t >> 4) & 0xF)];
*d++ = hexdigests[((t) & 0xF)];
continue; /* To avoid *d++ = t; */
}
else {
if (flags & (RSPAMD_REGEXP_ESCAPE_RE|RSPAMD_REGEXP_ESCAPE_GLOB)) {
UChar32 uc;
gint32 off = p - pattern - 1;
U8_NEXT (pattern, off, slen, uc);

if (uc > 0) {
d += rspamd_snprintf (d, dend - d,
"\\x{%xd}", uc);
p = pattern + off;
}

continue; /* To avoid *d++ = t; */
}
}
}
break;
}

src/libutil/str_util.h (+1, −0) Datei anzeigen

@@ -436,6 +436,7 @@ enum rspamd_regexp_escape_flags {
RSPAMD_REGEXP_ESCAPE_ASCII = 0,
RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
RSPAMD_REGEXP_ESCAPE_RE = 1u << 2,
};
/**
* Escapes special characters when reading plain data to be processed in pcre

Laden…
Abbrechen
Speichern