[Feature] Core: Normalise zero-width spaces in urls

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 16 Jan 2019 15:04:27 +0000 (15:04 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 16 Jan 2019 15:04:27 +0000 (15:04 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 16 Jan 2019 15:04:27 +0000 (15:04 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 16 Jan 2019 15:04:27 +0000 (15:04 +0000)
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c

index 27d50aead98205de35657b6b5432f9d04a0e070b..2016808cf89fee55813932a5cfe0cab49422e3dc 100644 (file)
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2420,7 +2420,7 @@ rspamd_get_unicode_normalizer (void)
  }
  
  
-gboolean
+enum rspamd_normalise_result
  rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
                 guint *len)
  {
@@ -2430,7 +2430,8 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
         const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
         gint32 nsym, end;
         UChar *src = NULL, *dest = NULL;
-       gboolean ret = FALSE;
+       enum rspamd_normalise_result ret = 0;
+       gboolean has_invisible = FALSE;
  
         /* We first need to convert data to UChars :( */
         src = g_malloc ((*len + 1) * sizeof (*src));
@@ -2440,6 +2441,7 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
         if (!U_SUCCESS (uc_err)) {
                 msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
                                 u_errorName (uc_err));
+               ret |= RSPAMD_UNICODE_NORM_ERROR;
                 goto out;
         }
  
@@ -2449,36 +2451,81 @@ rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
         if (!U_SUCCESS (uc_err)) {
                 msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
                                 u_errorName (uc_err));
+               ret |= RSPAMD_UNICODE_NORM_ERROR;
                 goto out;
         }
  
-       if (end == nsym) {
-               /* No normalisation needed */
+       for (gint32 i = 0; i < nsym; i ++) {
+               if (IS_ZERO_WIDTH_SPACE (src[i])) {
+                       has_invisible = TRUE;
+                       break;
+               }
+       }
+
+       uc_err = U_ZERO_ERROR;
+
+       if (end != nsym) {
+               /* Normalisation is needed, and we may also have invisible spaces */
+               /* We copy sub(src, 0, end) to dest and normalise the rest */
+               ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+               dest = g_malloc (nsym * sizeof (*dest));
+               memcpy (dest, src, end * sizeof (*dest));
+               nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
+                               src + end, nsym - end, &uc_err);
+
+               if (!U_SUCCESS (uc_err)) {
+                       if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+                               msg_warn_pool_check ("cannot normalise URL: %s",
+                                               u_errorName (uc_err));
+                               ret |= RSPAMD_UNICODE_NORM_ERROR;
+                       }
+
+                       goto out;
+               }
+       }
+       else if (!has_invisible) {
                 goto out;
         }
+       else {
+               dest = src;
+               src = NULL;
+       }
  
-       /* We copy sub(src, 0, end) to dest and normalise the rest */
-       ret = TRUE;
-       dest = g_malloc (nsym * sizeof (*dest));
-       memcpy (dest, src, end * sizeof (*dest));
-       nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
-                       src + end, nsym - end, &uc_err);
+       if (has_invisible) {
+               /* Also filter zero width spaces */
+               gint32 new_len = 0;
+               UChar *t = dest, *h = dest;
  
-       if (!U_SUCCESS (uc_err)) {
-               if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
-                       msg_warn_pool_check ("cannot normalise URL: %s",
-                                       u_errorName (uc_err));
+               ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+
+               for (gint32 i = 0; i < nsym; i ++) {
+                       if (!IS_ZERO_WIDTH_SPACE (*h)) {
+                               *t++ = *h++;
+                               new_len ++;
+                       }
+                       else {
+                               h ++;
+                       }
                 }
  
-               goto out;
+               nsym = new_len;
         }
  
         /* We now convert it back to utf */
         nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
  
         if (!U_SUCCESS (uc_err)) {
-               msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
-                               u_errorName (uc_err));
+               msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s"
+                                          " input length: %d chars, unicode length: %d utf16 symbols",
+                               u_errorName (uc_err), (gint)*len, (gint)nsym);
+
+               if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+                       ret |= RSPAMD_UNICODE_NORM_OVERFLOW;
+               }
+               else {
+                       ret |= RSPAMD_UNICODE_NORM_ERROR;
+               }
+
                 goto out;
         }
  
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h

index 742d34184307a2689065a8a8bc5ee8d4acad3a07..05966538869d95a961a81cfc38b656525f32a220 100644 (file)
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -402,6 +402,14 @@ struct UConverter *rspamd_get_utf8_converter (void);
  struct UNormalizer2;
  const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
  
+enum rspamd_normalise_result { /* bit flags OR-ed together by rspamd_normalise_unicode_inplace */
+       RSPAMD_UNICODE_NORM_NORMAL = 0, /* no flag set (initial value of the result) */
+       RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), /* normalisation check stopped before the end of input, so input was re-normalised */
+       RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), /* zero-width spaces were found and filtered out */
+       RSPAMD_UNICODE_NORM_ERROR = (1 << 2), /* an ICU conversion or normalisation call failed */
+       RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) /* conversion back to UTF-8 reported U_BUFFER_OVERFLOW_ERROR */
+};
+
  /**
   * Gets a string in UTF8 and normalises it to NFKC_Casefold form
   * @param pool optional memory pool used for logging purposes
@@ -409,7 +417,7 @@ const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
   * @param len
   * @return TRUE if a string has been normalised
   */
-gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
                 gchar *start, guint *len);
  
  enum rspamd_regexp_escape_flags {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 16 Jan 2019 15:04:27 +0000 (15:04 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 16 Jan 2019 15:04:27 +0000 (15:04 +0000)
src/libutil/str_util.c		patch \| blob \| history
src/libutil/str_util.h		patch \| blob \| history