Kaynağa Gözat

[Feature] Core: Normalise zero-width spaces in urls

tags/1.9.0
Vsevolod Stakhov 5 yıl önce
ebeveyn
işleme
8a4c5ca57a
2 değiştirilmiş dosya ile 73 ekleme ve 18 silme
  1. 64
    17
      src/libutil/str_util.c
  2. 9
    1
      src/libutil/str_util.h

+ 64
- 17
src/libutil/str_util.c Dosyayı Görüntüle

@@ -2420,7 +2420,7 @@ rspamd_get_unicode_normalizer (void)
}


gboolean
enum rspamd_normalise_result
rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
guint *len)
{
@@ -2430,7 +2430,8 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
gint32 nsym, end;
UChar *src = NULL, *dest = NULL;
gboolean ret = FALSE;
enum rspamd_normalise_result ret = 0;
gboolean has_invisible = FALSE;

/* We first need to convert data to UChars :( */
src = g_malloc ((*len + 1) * sizeof (*src));
@@ -2440,6 +2441,7 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
if (!U_SUCCESS (uc_err)) {
msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
u_errorName (uc_err));
ret |= RSPAMD_UNICODE_NORM_ERROR;
goto out;
}

@@ -2449,36 +2451,81 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
if (!U_SUCCESS (uc_err)) {
msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
u_errorName (uc_err));
ret |= RSPAMD_UNICODE_NORM_ERROR;
goto out;
}

if (end == nsym) {
/* No normalisation needed */
for (gint32 i = 0; i < nsym; i ++) {
if (IS_ZERO_WIDTH_SPACE (src[i])) {
has_invisible = TRUE;
break;
}
}

uc_err = U_ZERO_ERROR;

if (end != nsym) {
/* Normalisation is needed, and we may still have invisible spaces */
/* We copy sub(src, 0, end) to dest and normalise the rest */
ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
dest = g_malloc (nsym * sizeof (*dest));
memcpy (dest, src, end * sizeof (*dest));
nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
src + end, nsym - end, &uc_err);

if (!U_SUCCESS (uc_err)) {
if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
msg_warn_pool_check ("cannot normalise URL: %s",
u_errorName (uc_err));
ret |= RSPAMD_UNICODE_NORM_ERROR;
}

goto out;
}
}
else if (!has_invisible) {
goto out;
}
else {
dest = src;
src = NULL;
}

/* We copy sub(src, 0, end) to dest and normalise the rest */
ret = TRUE;
dest = g_malloc (nsym * sizeof (*dest));
memcpy (dest, src, end * sizeof (*dest));
nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
src + end, nsym - end, &uc_err);
if (has_invisible) {
/* Also filter zero width spaces */
gint32 new_len = 0;
UChar *t = dest, *h = dest;

if (!U_SUCCESS (uc_err)) {
if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
msg_warn_pool_check ("cannot normalise URL: %s",
u_errorName (uc_err));
ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;

for (gint32 i = 0; i < nsym; i ++) {
if (!IS_ZERO_WIDTH_SPACE (*h)) {
*t++ = *h++;
new_len ++;
}
else {
h ++;
}
}

goto out;
nsym = new_len;
}

/* We now convert it back to utf */
nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);

if (!U_SUCCESS (uc_err)) {
msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
u_errorName (uc_err));
msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
" input length: %d chars, unicode length: %d utf16 symbols",
u_errorName (uc_err), (gint)*len, (gint)nsym);

if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
}
else {
ret |= RSPAMD_UNICODE_NORM_ERROR;
}

goto out;
}


+ 9
- 1
src/libutil/str_util.h Dosyayı Görüntüle

@@ -402,6 +402,14 @@ struct UConverter *rspamd_get_utf8_converter (void);
struct UNormalizer2;
const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);

/**
 * Result flags reported by rspamd_normalise_unicode_inplace.
 * The values are distinct bits, so several of them can be OR-ed together
 * in a single result.
 */
enum rspamd_normalise_result {
	/* No flag is set */
	RSPAMD_UNICODE_NORM_NORMAL = 0x00,
	/* The input was not fully normalised and had to be normalised */
	RSPAMD_UNICODE_NORM_UNNORMAL = 0x01,
	/* Zero width spaces were found in the input and filtered out */
	RSPAMD_UNICODE_NORM_ZERO_SPACES = 0x02,
	/* A unicode conversion or normalisation step has failed */
	RSPAMD_UNICODE_NORM_ERROR = 0x04,
	/* Converting the result back to UTF8 has overflowed the buffer */
	RSPAMD_UNICODE_NORM_OVERFLOW = 0x08
};

/**
* Gets a string in UTF8 and normalises it to NFKC_Casefold form
* @param pool optional memory pool used for logging purposes
@@ -409,7 +417,7 @@ const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
* @param len
* @return bitwise OR of enum rspamd_normalise_result flags (RSPAMD_UNICODE_NORM_NORMAL, i.e. 0, when no flag is set)
*/
gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
gchar *start, guint *len);

enum rspamd_regexp_escape_flags {

Loading…
İptal
Kaydet