aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/libserver/html.c11
-rw-r--r--src/libserver/url.c14
-rw-r--r--src/libserver/url.h1
-rw-r--r--src/libutil/http.c6
-rw-r--r--src/libutil/http.h2
-rw-r--r--src/libutil/str_util.c83
-rw-r--r--src/libutil/str_util.h10
7 files changed, 122 insertions, 5 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index c8917503d..b27e07fad 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -22,6 +22,7 @@
#include "html_colors.h"
#include "url.h"
#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
#if U_ICU_VERSION_MAJOR_NUM >= 46
#include <unicode/uidna.h>
#endif
@@ -1469,6 +1470,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
*statep = state;
}
+
+
struct rspamd_url *
rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
struct html_tag_component *comp)
@@ -1554,9 +1557,15 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
}
*d = '\0';
+ dlen = d - decoded;
url = rspamd_mempool_alloc0 (pool, sizeof (*url));
- rc = rspamd_url_parse (url, decoded, d - decoded, pool);
+
+ if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
+ url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
+
+ rc = rspamd_url_parse (url, decoded, dlen, pool);
if (rc == URI_ERRNO_OK) {
if (has_bad_chars) {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 1665ff379..ef187f94c 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -1543,7 +1543,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
gchar *p, *comp;
const gchar *end;
guint i, complen, ret, flags = 0;
- gsize unquoted_len = 0;
+ guint unquoted_len = 0;
memset (uri, 0, sizeof (*uri));
memset (&u, 0, sizeof (u));
@@ -1649,10 +1649,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
uri->protocollen);
rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->datalen) {
unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_PATH);
/* We now normalize path */
rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
@@ -1662,12 +1668,18 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
unquoted_len = rspamd_url_decode (uri->query,
uri->query,
uri->querylen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_QUERY);
}
if (uri->fragmentlen) {
unquoted_len = rspamd_url_decode (uri->fragment,
uri->fragment,
uri->fragmentlen);
+ if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) {
+ uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+ }
rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index e6ccfc0f9..a02d3c9d0 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -26,6 +26,7 @@ enum rspamd_url_flags {
RSPAMD_URL_FLAG_HAS_PORT = 1 << 13,
RSPAMD_URL_FLAG_HAS_USER = 1 << 14,
RSPAMD_URL_FLAG_SCHEMALESS = 1 << 15,
+ RSPAMD_URL_FLAG_UNNORMALISED = 1 << 16,
};
struct rspamd_url_tag {
diff --git a/src/libutil/http.c b/src/libutil/http.c
index 5732f8b8e..c6b77ee15 100644
--- a/src/libutil/http.c
+++ b/src/libutil/http.c
@@ -3252,12 +3252,14 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn,
http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u);
if (u.field_set & (1 << UF_PATH)) {
+ guint unnorm_len;
lookup.begin = msg->url->str + u.field_data[UF_PATH].off;
lookup.len = u.field_data[UF_PATH].len;
rspamd_http_normalize_path_inplace ((gchar *)lookup.begin,
lookup.len,
- &lookup.len);
+ &unnorm_len);
+ lookup.len = unnorm_len;
}
else {
lookup.begin = msg->url->str;
@@ -3712,7 +3714,7 @@ rspamd_http_message_unref (struct rspamd_http_message *msg)
void
-rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen)
+rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen)
{
const gchar *p, *end, *slash = NULL, *dot = NULL;
gchar *o;
diff --git a/src/libutil/http.h b/src/libutil/http.h
index 1c418ebb8..4ce9e0a84 100644
--- a/src/libutil/http.h
+++ b/src/libutil/http.h
@@ -570,6 +570,6 @@ glong rspamd_http_date_format (gchar *buf, gsize len, time_t time);
* @param len
* @param nlen
*/
-void rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen);
+void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen);
#endif /* HTTP_H_ */
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 8026ea7e5..ab6be966a 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -18,7 +18,11 @@
#include "cryptobox.h"
#include "url.h"
#include "str_util.h"
+#include "logger.h"
#include "contrib/t1ha/t1ha.h"
+#include <unicode/uversion.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
#include <math.h>
const guchar lc_map[256] = {
@@ -1958,3 +1962,82 @@ rspamd_memrchr (const void *m, gint c, gsize len)
return NULL;
}
+
+/**
+ * Normalises a UTF-8 string to NFKC form, rewriting it in place.
+ *
+ * The UTF-8 converter and the NFKC normaliser are created lazily on the
+ * first call and cached in function-level statics.
+ *
+ * @param pool memory pool, used for logging purposes only
+ * @param start UTF-8 buffer of *len bytes; it is overwritten only when the
+ * normalised text has been fully produced and fits into *len bytes
+ * @param len in: byte length of start; out: byte length of the normalised
+ * text (left unchanged when the buffer has not been rewritten)
+ * @return TRUE if the input was not already in NFKC form (even if the
+ * rewrite itself failed afterwards), FALSE if it was already normalised
+ * or an error occurred before that could be determined
+ */
+gboolean
+rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
+		guint *len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	static UConverter *utf8_conv = NULL;
+	static const UNormalizer2 *norm = NULL;
+	gint32 nsym, end, nbytes;
+	UChar *src = NULL, *dest = NULL;
+	gchar *utf8_out = NULL;
+	gboolean ret = FALSE;
+
+	if (utf8_conv == NULL) {
+		utf8_conv = ucnv_open ("UTF-8", &uc_err);
+		g_assert (U_SUCCESS (uc_err));
+		norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+		g_assert (U_SUCCESS (uc_err));
+	}
+
+	/* We first need to convert data to UChars :( */
+	src = g_malloc ((*len + 1) * sizeof (*src));
+	nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
+			start, *len, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	/* Find the length of the prefix that is already normalised */
+	end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	if (end == nsym) {
+		/* No normalisation needed */
+		goto out;
+	}
+
+	/* We copy sub(src, 0, end) to dest and normalise the rest */
+	ret = TRUE;
+	/*
+	 * NFKC may produce more UTF-16 units than the input had while still
+	 * producing fewer UTF-8 bytes. A result that fits back into *len bytes
+	 * never has more than *len UTF-16 units, so that capacity is enough.
+	 */
+	dest = g_malloc ((*len + 1) * sizeof (*dest));
+	memcpy (dest, src, end * sizeof (*dest));
+	nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, *len + 1,
+			src + end, nsym - end, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	/*
+	 * Convert back to UTF-8 through a scratch buffer: on overflow the
+	 * converter stops after filling the destination, so writing directly
+	 * into start would leave the caller with a half-rewritten string.
+	 */
+	utf8_out = g_malloc (*len);
+	nbytes = ucnv_fromUChars (utf8_conv, utf8_out, *len, dest, nsym, &uc_err);
+
+	if (!U_SUCCESS (uc_err)) {
+		msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s",
+				u_errorName (uc_err));
+		goto out;
+	}
+
+	memcpy (start, utf8_out, nbytes);
+	*len = nbytes;
+out:
+	/* g_free accepts NULL, so no guards are needed */
+	g_free (utf8_out);
+	g_free (dest);
+	g_free (src);
+
+	return ret;
+}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index ab97555ac..68ec5f0bd 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -361,4 +361,14 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
return FALSE;
}
+/**
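+ * Takes a string in UTF8 and normalises it in place to NFKC form
+ * @param pool optional memory pool used for logging purposes
+ * @param start string to normalise, rewritten in place
+ * @param len in: length of the string in bytes; out: length after normalisation
+ * @return TRUE if the string was not already in normalised form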
+ */
+gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
+ gchar *start, guint *len);
+
#endif /* SRC_LIBUTIL_STR_UTIL_H_ */