summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/libutil/str_util.c 81
-rw-r--r-- src/libutil/str_util.h 10
2 files changed, 73 insertions, 18 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 27d50aead..2016808cf 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2420,7 +2420,7 @@ rspamd_get_unicode_normalizer (void)
}
-gboolean
+enum rspamd_normalise_result
rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
guint *len)
{
@@ -2430,7 +2430,8 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
gint32 nsym, end;
UChar *src = NULL, *dest = NULL;
- gboolean ret = FALSE;
+ enum rspamd_normalise_result ret = 0;
+ gboolean has_invisible = FALSE;
/* We first need to convert data to UChars :( */
src = g_malloc ((*len + 1) * sizeof (*src));
@@ -2440,6 +2441,7 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
if (!U_SUCCESS (uc_err)) {
msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
u_errorName (uc_err));
+ ret |= RSPAMD_UNICODE_NORM_ERROR;
goto out;
}
@@ -2449,36 +2451,81 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
if (!U_SUCCESS (uc_err)) {
msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
u_errorName (uc_err));
+ ret |= RSPAMD_UNICODE_NORM_ERROR;
goto out;
}
- if (end == nsym) {
- /* No normalisation needed */
+ for (gint32 i = 0; i < nsym; i ++) {
+ if (IS_ZERO_WIDTH_SPACE (src[i])) {
+ has_invisible = TRUE;
+ break;
+ }
+ }
+
+ uc_err = U_ZERO_ERROR;
+
+ if (end != nsym) {
+ /* Normalisation is needed here; invisible spaces, if any, are filtered afterwards */
+ /* We copy sub(src, 0, end) to dest and normalise the rest */
+ ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+ dest = g_malloc (nsym * sizeof (*dest));
+ memcpy (dest, src, end * sizeof (*dest));
+ nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
+ src + end, nsym - end, &uc_err);
+
+ if (!U_SUCCESS (uc_err)) {
+ if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+ msg_warn_pool_check ("cannot normalise URL: %s",
+ u_errorName (uc_err));
+ ret |= RSPAMD_UNICODE_NORM_ERROR;
+ }
+
+ goto out;
+ }
+ }
+ else if (!has_invisible) {
goto out;
}
+ else {
+ dest = src;
+ src = NULL;
+ }
- /* We copy sub(src, 0, end) to dest and normalise the rest */
- ret = TRUE;
- dest = g_malloc (nsym * sizeof (*dest));
- memcpy (dest, src, end * sizeof (*dest));
- nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
- src + end, nsym - end, &uc_err);
+ if (has_invisible) {
+ /* Also filter zero width spaces */
+ gint32 new_len = 0;
+ UChar *t = dest, *h = dest;
- if (!U_SUCCESS (uc_err)) {
- if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
- msg_warn_pool_check ("cannot normalise URL: %s",
- u_errorName (uc_err));
+ ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+
+ for (gint32 i = 0; i < nsym; i ++) {
+ if (!IS_ZERO_WIDTH_SPACE (*h)) {
+ *t++ = *h++;
+ new_len ++;
+ }
+ else {
+ h ++;
+ }
}
- goto out;
+ nsym = new_len;
}
/* We now convert it back to utf */
nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
if (!U_SUCCESS (uc_err)) {
- msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
- u_errorName (uc_err));
+ msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
+ " input length: %d chars, unicode length: %d utf16 symbols",
+ u_errorName (uc_err), (gint)*len, (gint)nsym);
+
+ if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+ ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
+ }
+ else {
+ ret |= RSPAMD_UNICODE_NORM_ERROR;
+ }
+
goto out;
}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 742d34184..059665388 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -402,6 +402,14 @@ struct UConverter *rspamd_get_utf8_converter (void);
struct UNormalizer2;
const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
+enum rspamd_normalise_result { /* bit flags returned by rspamd_normalise_unicode_inplace(); they are OR-ed together */
+ RSPAMD_UNICODE_NORM_NORMAL = 0, /* zero value: no flag set */
+ RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), /* set when the input was not fully normalised and had to be normalised */
+ RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), /* set when zero width spaces were found and filtered out */
+ RSPAMD_UNICODE_NORM_ERROR = (1 << 2), /* set when a unicode conversion or normalisation step failed */
+ RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) /* set when converting back to UTF8 failed with U_BUFFER_OVERFLOW_ERROR */
+};
+
/**
* Gets a string in UTF8 and normalises it to NFKC_Casefold form
* @param pool optional memory pool used for logging purposes
@@ -409,7 +417,7 @@ const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
* @param len
* @return bitwise OR of enum rspamd_normalise_result flags (0 if no flag was set)
*/
-gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
gchar *start, guint *len);
enum rspamd_regexp_escape_flags {