7 files changed, 122 insertions, 5 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index c8917503d..b27e07fad 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -22,6 +22,7 @@
 #include "html_colors.h"
 #include "url.h"
 #include <unicode/uversion.h>
+#include <unicode/ucnv.h>
 #if U_ICU_VERSION_MAJOR_NUM >= 46
 #include <unicode/uidna.h>
 #endif
@@ -1469,6 +1470,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
 	*statep = state;
 }
 
+
+
 struct rspamd_url *
 rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 		struct html_tag_component *comp)
@@ -1554,9 +1557,15 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 	}
 
 	*d = '\0';
+	dlen = d - decoded;
 
 	url = rspamd_mempool_alloc0 (pool, sizeof (*url));
-	rc = rspamd_url_parse (url, decoded, d - decoded, pool);
+	/* rspamd_url_parse () memsets *url, so the flag must be set afterwards */
+	gboolean unnorm = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
+
+	rc = rspamd_url_parse (url, decoded, dlen, pool);
+	if (unnorm) {
+		url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+	}
 
 	if (rc == URI_ERRNO_OK) {
 		if (has_bad_chars) {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 1665ff379..ef187f94c 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1543,7 +1543,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 	gchar *p, *comp;
 	const gchar *end;
 	guint i, complen, ret, flags = 0;
-	gsize unquoted_len = 0;
+	guint unquoted_len = 0;
 
 	memset (uri, 0, sizeof (*uri));
 	memset (&u, 0, sizeof (u));
@@ -1649,10 +1649,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 			uri->protocollen);
 	rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
 	unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+	if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+		uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+	}
 	rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
 	if (uri->datalen) {
 		unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
+		if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) {
+			uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+		}
 		rspamd_url_shift (uri, unquoted_len, UF_PATH);
 		/* We now normalize path */
 		rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
@@ -1662,12 +1668,18 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 		unquoted_len = rspamd_url_decode (uri->query,
 				uri->query,
 				uri->querylen);
+		if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) {
+			uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+		}
 		rspamd_url_shift (uri, unquoted_len, UF_QUERY);
 	}
 	if (uri->fragmentlen) {
 		unquoted_len = rspamd_url_decode (uri->fragment,
 				uri->fragment,
 				uri->fragmentlen);
+		if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) {
+			uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+		}
 		rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
 	}
 
diff --git a/src/libserver/url.h b/src/libserver/url.h
index e6ccfc0f9..a02d3c9d0 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -26,6 +26,7 @@ enum rspamd_url_flags {
 	RSPAMD_URL_FLAG_HAS_PORT = 1 << 13,
 	RSPAMD_URL_FLAG_HAS_USER = 1 << 14,
 	RSPAMD_URL_FLAG_SCHEMALESS = 1 << 15,
+	RSPAMD_URL_FLAG_UNNORMALISED = 1 << 16,
 };
 
 struct rspamd_url_tag {
diff --git a/src/libutil/http.c b/src/libutil/http.c
index 5732f8b8e..c6b77ee15 100644
--- a/src/libutil/http.c
+++ b/src/libutil/http.c
@@ -3252,12 +3252,14 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn,
 			http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
 
 			if (u.field_set & (1 << UF_PATH)) {
+				guint unnorm_len;
 				lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
 				lookup.len = u.field_data[UF_PATH].len;
 
 				rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
 						lookup.len,
-						&lookup.len);
+						&unnorm_len);
+				lookup.len = unnorm_len;
 			}
 			else {
 				lookup.begin = msg->url->str;
@@ -3712,7 +3714,7 @@ rspamd_http_message_unref (struct rspamd_http_message *msg)
 
 
 void
-rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen)
+rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen)
 {
 	const gchar *p, *end, *slash = NULL, *dot = NULL;
 	gchar *o;
diff --git a/src/libutil/http.h b/src/libutil/http.h
index 1c418ebb8..4ce9e0a84 100644
--- a/src/libutil/http.h
+++ b/src/libutil/http.h
@@ -570,6 +570,6 @@ glong rspamd_http_date_format (gchar *buf, gsize len, time_t time);
  * @param len
  * @param nlen
  */
-void rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen);
+void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen);
 
 #endif /* HTTP_H_ */
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 8026ea7e5..ab6be966a 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -18,7 +18,11 @@
 #include "cryptobox.h"
 #include "url.h"
 #include "str_util.h"
+#include "logger.h"
 #include "contrib/t1ha/t1ha.h"
+#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
 #include <math.h>
 
 const guchar lc_map[256] = {
@@ -1958,3 +1962,82 @@ rspamd_memrchr (const void *m, gint c, gsize len)
 
 	return NULL;
 }
+
+/* Normalise UTF-8 `start' to NFKC in place, updating `*len'; returns TRUE if
+ * the input was not in NFKC already. `start' and `*len' are left untouched
+ * when the normalised text cannot be built or does not fit in place. */
+gboolean
+rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
+		guint *len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	static UConverter *utf8_conv = NULL;
+	static const UNormalizer2 *norm = NULL;
+	gint32 nsym, end, need;
+	UChar *src = NULL, *dest = NULL;
+	gboolean ret = FALSE;
+
+	if (utf8_conv == NULL) {
+		utf8_conv = ucnv_open ("UTF-8", &uc_err);
+		g_assert (U_SUCCESS (uc_err));
+		norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+		g_assert (U_SUCCESS (uc_err));
+	}
+
+	/* We first need to convert data to UChars :( */
+	src = g_malloc ((*len + 1) * sizeof (*src));
+	nsym = ucnv_toUChars (utf8_conv, src, *len + 1, start, *len, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	/* We can now check if we need to decompose */
+	end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	if (end == nsym) {
+		/* No normalisation needed */
+		goto out;
+	}
+
+	/* Copy sub(src, 0, end) to dest and normalise the rest; NFKC may expand,
+	 * so dest gets *len UChars: whatever fits back in place is no longer */
+	ret = TRUE;
+	dest = g_malloc (*len * sizeof (*dest));
+	memcpy (dest, src, end * sizeof (*dest));
+	nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, (gint32)*len,
+			src + end, nsym - end, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	/* Preflight the UTF-8 size so `start' is never partially overwritten */
+	need = ucnv_fromUChars (utf8_conv, NULL, 0, dest, nsym, &uc_err);
+	if (uc_err == U_BUFFER_OVERFLOW_ERROR && need <= (gint32)*len) {
+		uc_err = U_ZERO_ERROR;
+		need = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err);
+	}
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	*len = need;
+out:
+	g_free (src);
+	g_free (dest);
+
+	return ret;
+}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index ab97555ac..68ec5f0bd 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -361,4 +361,14 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
 	return FALSE;
 }
 
+/**
+ * Takes a UTF-8 string and normalises it in place to NFKC (compose) form
+ * @param pool optional memory pool used for logging purposes
+ * @param start string to normalise, rewritten in place when needed
+ * @param len in: byte length of start; out: length after normalisation
+ * @return TRUE if the input was found not to be in normal form already
+ */
+gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+		gchar *start, guint *len);
+
 #endif /* SRC_LIBUTIL_STR_UTIL_H_ */