From 7815be9c25f95a04a3f530da8724c5c9eb15952e Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 2 Dec 2016 12:58:58 +0000 Subject: [PATCH] [Fix] Fix parsing of URLs with spaces and other bad chars --- src/libserver/html.c | 62 ++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index ce03118b6..8618930eb 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1230,8 +1230,11 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, gchar *decoded; gint rc; gsize decoded_len; - const gchar *p; - gchar *t, *h; + const gchar *p, *s; + gchar *d; + guint i, dlen; + gboolean has_bad_chars = FALSE; + static const gchar hexdigests[16] = "0123456789abcdef"; p = start; @@ -1260,36 +1263,55 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, } } - /* Also we need to perform url decode */ - decoded = rspamd_mempool_alloc (pool, len + 1); - rspamd_strlcpy (decoded, start, len + 1); - decoded_len = rspamd_decode_url (decoded, start, len); + s = start; + dlen = 0; - /* We also need to remove all internal newlines */ - t = decoded; - h = t; - - while (*h) { - if (*h == '\r' || *h == '\n') { - h ++; - decoded_len --; + for (i = 0; i < len; i ++) { + if (G_UNLIKELY (!g_ascii_isgraph (s[i]))) { + dlen += 3; } else { - *t++ = *h++; + dlen ++; } } - *t = *h; - if (comp) { - comp->start = decoded; - comp->len = decoded_len; + decoded = rspamd_mempool_alloc (pool, dlen + 1); + d = decoded; + + /* We also need to remove all internal newlines and encode unsafe characters */ + for (i = 0; i < len; i ++) { + if (G_UNLIKELY (s[i] == '\r' || s[i] == '\n')) { + continue; + } + else if (G_UNLIKELY (!g_ascii_isgraph (s[i]))) { + /* URL encode */ + *d++ = '%'; + *d++ = hexdigests[(s[i] >> 4) & 0xf]; + *d++ = hexdigests[s[i] & 0xf]; + has_bad_chars = TRUE; + } + else { + *d++ = s[i]; + } } + *d = '\0'; + url = rspamd_mempool_alloc (pool, sizeof (*url)); - rc = rspamd_url_parse (url, decoded, decoded_len, pool); + rc = rspamd_url_parse (url, decoded, d - decoded, pool); if (rc == URI_ERRNO_OK) { + if (has_bad_chars) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + decoded = url->string; + decoded_len = url->urllen; + + if (comp) { + comp->start = decoded; + comp->len = decoded_len; + } /* Spaces in href usually mean an attempt to obfuscate URL */ /* See https://github.com/vstakhov/rspamd/issues/593 */ #if 0 -- 2.39.5