diff options
-rw-r--r-- | src/libutil/str_util.c | 81 | ||||
-rw-r--r-- | src/libutil/str_util.h | 10 |
2 files changed, 73 insertions, 18 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 27d50aead..2016808cf 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2420,7 +2420,7 @@ rspamd_get_unicode_normalizer (void) } -gboolean +enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, guint *len) { @@ -2430,7 +2430,8 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); gint32 nsym, end; UChar *src = NULL, *dest = NULL; - gboolean ret = FALSE; + enum rspamd_normalise_result ret = 0; + gboolean has_invisible = FALSE; /* We first need to convert data to UChars :( */ src = g_malloc ((*len + 1) * sizeof (*src)); @@ -2440,6 +2441,7 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, if (!U_SUCCESS (uc_err)) { msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s", u_errorName (uc_err)); + ret |= RSPAMD_UNICODE_NORM_ERROR; goto out; } @@ -2449,36 +2451,81 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, if (!U_SUCCESS (uc_err)) { msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s", u_errorName (uc_err)); + ret |= RSPAMD_UNICODE_NORM_ERROR; goto out; } - if (end == nsym) { - /* No normalisation needed */ + for (gint32 i = 0; i < nsym; i ++) { + if (IS_ZERO_WIDTH_SPACE (src[i])) { + has_invisible = TRUE; + break; + } + } + + uc_err = U_ZERO_ERROR; + + if (end != nsym) { + /* No normalisation needed, but we may still have invisible spaces */ + /* We copy sub(src, 0, end) to dest and normalise the rest */ + ret |= RSPAMD_UNICODE_NORM_UNNORMAL; + dest = g_malloc (nsym * sizeof (*dest)); + memcpy (dest, src, end * sizeof (*dest)); + nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym, + src + end, nsym - end, &uc_err); + + if (!U_SUCCESS (uc_err)) { + if (uc_err != U_BUFFER_OVERFLOW_ERROR) { + msg_warn_pool_check ("cannot normalise URL: %s", + u_errorName (uc_err)); + ret |= RSPAMD_UNICODE_NORM_ERROR; + } + + goto out; + } + } + else if (!has_invisible) { goto out; } + else { + dest = src; + src = NULL; + } - /* We copy sub(src, 0, end) to dest and normalise the rest */ - ret = TRUE; - dest = g_malloc (nsym * sizeof (*dest)); - memcpy (dest, src, end * sizeof (*dest)); - nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym, - src + end, nsym - end, &uc_err); + if (has_invisible) { + /* Also filter zero width spaces */ + gint32 new_len = 0; + UChar *t = dest, *h = dest; - if (!U_SUCCESS (uc_err)) { - if (uc_err != U_BUFFER_OVERFLOW_ERROR) { - msg_warn_pool_check ("cannot normalise URL: %s", - u_errorName (uc_err)); + ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES; + + for (gint32 i = 0; i < nsym; i ++) { + if (!IS_ZERO_WIDTH_SPACE (*h)) { + *t++ = *h++; + new_len ++; + } + else { + h ++; + } } - goto out; + nsym = new_len; } /* We now convert it back to utf */ nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err); if (!U_SUCCESS (uc_err)) { - msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s", - u_errorName (uc_err)); + msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s" + " input length: %d chars, unicode length: %d utf16 symbols", + u_errorName (uc_err), (gint)*len, (gint)nsym); + + if (uc_err == U_BUFFER_OVERFLOW_ERROR) { + ret |= RSPAMD_UNICODE_NORM_OVERFLOW; + } + else { + ret |= RSPAMD_UNICODE_NORM_ERROR; + } + goto out; } diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 742d34184..059665388 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -402,6 +402,14 @@ struct UConverter *rspamd_get_utf8_converter (void); struct UNormalizer2; const struct UNormalizer2 *rspamd_get_unicode_normalizer (void); +enum rspamd_normalise_result { + RSPAMD_UNICODE_NORM_NORMAL = 0, + RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), + RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), + RSPAMD_UNICODE_NORM_ERROR = (1 << 2), + RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) +}; + /** * Gets a string in UTF8 and normalises it to NFKC_Casefold form * @param pool optional memory pool used for logging purposes @@ -409,7 +417,7 @@ const struct UNormalizer2 *rspamd_get_unicode_normalizer (void); * @param len * @return TRUE if a string has been normalised */ -gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, +enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, guint *len); enum rspamd_regexp_escape_flags { |