diff options
-rw-r--r-- | src/libserver/html.c | 11 | ||||
-rw-r--r-- | src/libserver/url.c | 14 | ||||
-rw-r--r-- | src/libserver/url.h | 1 | ||||
-rw-r--r-- | src/libutil/http.c | 6 | ||||
-rw-r--r-- | src/libutil/http.h | 2 | ||||
-rw-r--r-- | src/libutil/str_util.c | 83 | ||||
-rw-r--r-- | src/libutil/str_util.h | 10 |
7 files changed, 122 insertions, 5 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index c8917503d..b27e07fad 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -22,6 +22,7 @@ #include "html_colors.h" #include "url.h" #include <unicode/uversion.h> +#include <unicode/ucnv.h> #if U_ICU_VERSION_MAJOR_NUM >= 46 #include <unicode/uidna.h> #endif @@ -1469,6 +1470,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, *statep = state; } + + struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, struct html_tag_component *comp) @@ -1554,9 +1557,15 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, } *d = '\0'; + dlen = d - decoded; /* length of the decoded URL text; passed by address below so that normalisation can update it */ url = rspamd_mempool_alloc0 (pool, sizeof (*url)); - rc = rspamd_url_parse (url, decoded, d - decoded, pool); + /* Normalise the whole decoded URL before it is parsed. NOTE(review): rspamd_url_parse () contains memset (uri, 0, sizeof (*uri)) (visible in the url.c hunk of this patch), so the flag OR-ed into url->flags here appears to be wiped by the call below; confirm, and re-apply the flag after parsing if so. */ + if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) { + url->flags |= RSPAMD_URL_FLAG_UNNORMALISED; + } + + rc = rspamd_url_parse (url, decoded, dlen, pool); if (rc == URI_ERRNO_OK) { if (has_bad_chars) { diff --git a/src/libserver/url.c b/src/libserver/url.c index 1665ff379..ef187f94c 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1543,7 +1543,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, gchar *p, *comp; const gchar *end; guint i, complen, ret, flags = 0; - gsize unquoted_len = 0; + guint unquoted_len = 0; /* guint instead of gsize so that its address matches the guint pointer parameters of rspamd_normalise_unicode_inplace () and rspamd_http_normalize_path_inplace () as declared in this patch */ memset (uri, 0, sizeof (*uri)); memset (&u, 0, sizeof (u)); @@ -1649,10 +1649,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, uri->protocollen); rspamd_url_shift (uri, unquoted_len, UF_SCHEMA); unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen); + if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) { /* host as returned by rspamd_url_decode () is normalised in place; unquoted_len is updated on success and the URL is flagged when the text was not already normalised */ + uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; + } rspamd_url_shift (uri, unquoted_len, UF_HOST); if (uri->datalen) { unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen); + if (rspamd_normalise_unicode_inplace (pool, uri->data, 
&unquoted_len)) { /* same normalise-and-flag step for the path data */ + uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; + } rspamd_url_shift (uri, unquoted_len, UF_PATH); /* We now normalize path; the resulting length is written to unquoted_len through the guint pointer parameter */ rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len); @@ -1662,12 +1668,18 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, unquoted_len = rspamd_url_decode (uri->query, uri->query, uri->querylen); + if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) { /* same normalise-and-flag step for the query */ + uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; + } rspamd_url_shift (uri, unquoted_len, UF_QUERY); } if (uri->fragmentlen) { unquoted_len = rspamd_url_decode (uri->fragment, uri->fragment, uri->fragmentlen); + if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) { /* same normalise-and-flag step for the fragment */ + uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; + } rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT); } diff --git a/src/libserver/url.h b/src/libserver/url.h index e6ccfc0f9..a02d3c9d0 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -26,6 +26,7 @@ enum rspamd_url_flags { RSPAMD_URL_FLAG_HAS_PORT = 1 << 13, RSPAMD_URL_FLAG_HAS_USER = 1 << 14, RSPAMD_URL_FLAG_SCHEMALESS = 1 << 15, + RSPAMD_URL_FLAG_UNNORMALISED = 1 << 16, /* set in this patch whenever rspamd_normalise_unicode_inplace () returns TRUE for a URL or one of its components */ }; struct rspamd_url_tag { diff --git a/src/libutil/http.c b/src/libutil/http.c index 5732f8b8e..c6b77ee15 100644 --- a/src/libutil/http.c +++ b/src/libutil/http.c @@ -3252,12 +3252,14 @@ rspamd_http_router_finish_handler (struct rspamd_http_connection *conn, http_parser_parse_url (msg->url->str, msg->url->len, TRUE, &u); if (u.field_set & (1 << UF_PATH)) { + guint unnorm_len; /* guint temporary that receives the length written by rspamd_http_normalize_path_inplace (); copied into lookup.len afterwards */ lookup.begin = msg->url->str + u.field_data[UF_PATH].off; lookup.len = u.field_data[UF_PATH].len; rspamd_http_normalize_path_inplace ((gchar *)lookup.begin, lookup.len, - &lookup.len); + &unnorm_len); + lookup.len = unnorm_len; } else { lookup.begin = msg->url->str; @@ -3712,7 +3714,7 @@ rspamd_http_message_unref (struct rspamd_http_message *msg) void -rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen) 
+rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen) { const gchar *p, *end, *slash = NULL, *dot = NULL; gchar *o; diff --git a/src/libutil/http.h b/src/libutil/http.h index 1c418ebb8..4ce9e0a84 100644 --- a/src/libutil/http.h +++ b/src/libutil/http.h @@ -570,6 +570,6 @@ glong rspamd_http_date_format (gchar *buf, gsize len, time_t time); * @param len * @param nlen */ -void rspamd_http_normalize_path_inplace (gchar *path, gsize len, gsize *nlen); +void rspamd_http_normalize_path_inplace (gchar *path, guint len, guint *nlen); #endif /* HTTP_H_ */ diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 8026ea7e5..ab6be966a 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -18,7 +18,11 @@ #include "cryptobox.h" #include "url.h" #include "str_util.h" +#include "logger.h" #include "contrib/t1ha/t1ha.h" +#include <unicode/uversion.h> +#include <unicode/ucnv.h> +#include <unicode/unorm2.h> #include <math.h> const guchar lc_map[256] = { @@ -1958,3 +1962,82 @@ rspamd_memrchr (const void *m, gint c, gsize len) return NULL; } + /* Normalises the UTF-8 text in start (*len bytes) in place using the ICU "nfkc" normaliser. Steps: convert the text to UChars, find the already-normalised prefix with unorm2_spanQuickCheckYes (), normalise the remaining tail, convert the result back into start and store its length in *len. Returns FALSE when the text is already normalised or when the first conversion or the quick check fails. Returns TRUE as soon as a non-normalised tail is found, including the paths where a later step fails and *len is left untouched. pool is not referenced directly in the body; the header describes it as used for logging (presumably picked up by msg_warn_pool_check - confirm). */ +gboolean +rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, + guint *len) +{ + UErrorCode uc_err = U_ZERO_ERROR; + static UConverter *utf8_conv = NULL; + static const UNormalizer2 *norm = NULL; + gint32 nsym, end; + UChar *src = NULL, *dest = NULL; + gboolean ret = FALSE; + /* The converter and normaliser are created on the first call and kept in function-level statics; this function never closes them. NOTE(review): the converter is shared by every caller - confirm that callers never run concurrently. */ + if (utf8_conv == NULL) { + utf8_conv = ucnv_open ("UTF-8", &uc_err); + g_assert (U_SUCCESS (uc_err)); + norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); + g_assert (U_SUCCESS (uc_err)); + } + + /* We first need to convert data to UChars :( The temporary buffer holds *len + 1 UChars, the same capacity that is passed to ucnv_toUChars () */ + src = g_malloc ((*len + 1) * sizeof (*src)); + nsym = ucnv_toUChars (utf8_conv, src, *len + 1, + start, *len, &uc_err); + + if (!U_SUCCESS (uc_err)) { + msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s", + u_errorName (uc_err)); + goto out; + } + + /* We can now check if we need to decompose: end is the value returned by the quick check and is compared with nsym below */ + end = unorm2_spanQuickCheckYes 
(norm, src, nsym, &uc_err); + + if (!U_SUCCESS (uc_err)) { + msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s", + u_errorName (uc_err)); + goto out; + } + + if (end == nsym) { + /* No normalisation needed: ret is still FALSE and neither start nor *len has been touched */ + goto out; + } + + /* We copy sub(src, 0, end) to dest and normalise the rest. ret becomes TRUE before the work is done, so every failure path below still reports TRUE. NOTE(review): dest has room for only nsym UChars, i.e. the input length; if the normalised form needs more, the ICU call is expected to fail with a buffer overflow status and the text stays as it was - confirm against the ICU documentation. */ + ret = TRUE; + dest = g_malloc (nsym * sizeof (*dest)); + memcpy (dest, src, end * sizeof (*dest)); + nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym, + src + end, nsym - end, &uc_err); + + if (!U_SUCCESS (uc_err)) { + msg_warn_pool_check ("cannot normalise URL: %s", + u_errorName (uc_err)); + goto out; + } + + /* We now convert it back to utf, writing directly into start with the original length *len as capacity. NOTE(review): if the normalised UTF-8 is longer than the original, this call fails after it may already have overwritten part of start, while *len keeps its old value - confirm, and consider converting into a scratch buffer first. */ + nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err); + + if (!U_SUCCESS (uc_err)) { + msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s", + u_errorName (uc_err)); + goto out; + } + /* Reached only when every step succeeded: publish the new length */ + *len = nsym; + out: + /* Common exit: release both temporary UChar buffers */ + if (src) { + g_free (src); + } + + if (dest) { + g_free (dest); + } + + return ret; +} diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index ab97555ac..68ec5f0bd 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -361,4 +361,14 @@ rspamd_str_has_8bit (const guchar *beg, gsize len) return FALSE; } +/** + * Gets a string in UTF8 and normalises it in place to NFKC form (the implementation in str_util.c requests the "nfkc" normaliser in UNORM2_COMPOSE mode, not NFKC_Casefold) + * @param pool optional memory pool used for logging purposes + * @param start buffer holding the text; the normalised text is written back into the same buffer + * @param len in: length of the text, out: length of the normalised text (left unchanged when nothing is rewritten) + * @return TRUE if the text was found not to be fully normalised, FALSE if it already was or an early conversion step failed + */ +gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, + gchar *start, guint *len); + #endif /* SRC_LIBUTIL_STR_UTIL_H_ */ |