From 627cdb74b5e027184583a0ebe4b0eb610e5c7194 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 23 Sep 2017 12:38:27 +0100 Subject: [PATCH] [Fix] Deal with URLs with no slashes after protocol --- src/libserver/html.c | 2 +- src/libserver/url.c | 108 +++++++++++++++++++++++++++---------------- src/libserver/url.h | 4 ++ 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index 3f479937a..905615794 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1497,7 +1497,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, } } - if (rspamd_substring_search (s, len, "://", 3) == (-1)) { + if (memchr (s, ':', len) == NULL) { /* We have no prefix */ dlen += sizeof ("http://") - 1; no_prefix = TRUE; diff --git a/src/libserver/url.c b/src/libserver/url.c index 918f7eee3..824c0dff8 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -46,6 +46,8 @@ #include "message.h" #include "multipattern.h" #include "contrib/uthash/utlist.h" +#include +#include typedef struct url_match_s { const gchar *m_begin; @@ -153,21 +155,17 @@ struct url_matcher static_matchers[] = { /* Common prefixes */ {"file://", "", url_file_start, url_file_end, 0, 0}, - {"file:\\\\", "", url_file_start, url_file_end, + {"file:\\\\", "", url_file_start, url_file_end, 0, 0}, {"ftp://", "", url_web_start, url_web_end, 0, 0}, - {"ftp:\\\\", "", url_web_start, url_web_end, + {"ftp:\\\\", "", url_web_start, url_web_end, 0, 0}, {"sftp://", "", url_web_start, url_web_end, 0, 0}, - {"http://", "", url_web_start, url_web_end, + {"http:", "", url_web_start, url_web_end, 0, 0}, - {"http:\\\\", "", url_web_start, url_web_end, - 0, 0}, - {"https://", "", url_web_start, url_web_end, - 0, 0}, - {"https:\\\\", "", url_web_start, url_web_end, + {"https:", "", url_web_start, url_web_end, 0, 0}, {"news://", "", url_web_start, url_web_end, 0, 0}, @@ -550,7 +548,7 @@ is_url_end (gchar c) static gint rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len, - gchar const **end, gboolean strict) + gchar const **end, gboolean strict, guint *flags) { const gchar *p = str, *c = str, *last = str + len; gchar t; @@ -591,6 +589,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len, p++; } else { + *flags |= RSPAMD_URL_FLAG_MISSINGSLASHES; st = parse_slash_slash; } break; @@ -704,11 +703,11 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len, static gint rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, - gchar const **end, gboolean strict, gboolean *obscured) + gchar const **end, gboolean strict, guint *flags) { const gchar *p = str, *c = str, *last = str + len, *slash = NULL; gchar t; - gunichar uc; + UChar32 uc; glong pt; gint ret = 1; gboolean user_seen = FALSE; @@ -766,6 +765,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, } else { st = parse_slash_slash; + *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES; } break; case parse_slash: @@ -833,15 +833,13 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, /* We have multiple at in fact */ st = parse_multiple_at; user_seen = TRUE; - - if (obscured) { - *obscured = TRUE; - } + *flags |= RSPAMD_URL_FLAG_OBSCURED; continue; } SET_U (u, UF_USERINFO); + *flags |= RSPAMD_URL_FLAG_HAS_USER; st = parse_at; } else if (!g_ascii_isgraph (t)) { @@ -867,6 +865,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, if (t == '@') { /* Empty password */ SET_U (u, UF_USERINFO); + *flags |= RSPAMD_URL_FLAG_HAS_USER; st = parse_at; } else { @@ -878,6 +877,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, case parse_password: if (t == '@') { /* XXX: password is not stored */ + *flags |= RSPAMD_URL_FLAG_HAS_USER; SET_U (u, UF_USERINFO); st = parse_at; } @@ -942,24 +942,32 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, st = parse_user; } else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') { - uc = g_utf8_get_char_validated (p, last - p); + if (*p & 0x80) { + *flags |= RSPAMD_URL_FLAG_IDN; + guint i = 0; - if (uc == (gunichar) -1) { - /* Bad utf8 */ - goto out; - } + U8_NEXT (p, i, last - p, uc); - if (!g_unichar_isalnum (uc)) { - /* Bad symbol */ - if (strict) { + if (uc < 0) { + /* Bad utf8 */ goto out; } - else { - goto set; + + if (!u_isalnum (uc)) { + /* Bad symbol */ + if (strict) { + goto out; + } + else { + goto set; + } } - } - p = g_utf8_next_char (p); + p = p + i; + } + else { + p ++; + } } else { p++; @@ -992,6 +1000,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, } if (u != NULL) { u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; } st = parse_suffix_slash; } @@ -1002,6 +1011,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, } if (u != NULL) { u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; } c = p + 1; @@ -1014,6 +1024,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, } if (u != NULL) { u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; } c = p + 1; @@ -1514,9 +1525,8 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, struct http_parser_url u; gchar *p, *comp; const gchar *end; - guint i, complen, ret; + guint i, complen, ret, flags = 0; gsize unquoted_len = 0; - gboolean obscured = FALSE; memset (uri, 0, sizeof (*uri)); memset (&u, 0, sizeof (u)); @@ -1531,14 +1541,14 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, if (len > sizeof ("mailto:") - 1) { /* For mailto: urls we also need to add slashes to make it a valid URL */ if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) { - ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE); + ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags); } else { - ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &obscured); + ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags); } } else { - ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &obscured); + ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags); } if (ret != 0) { @@ -1551,8 +1561,27 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, uri->raw = p; uri->rawlen = len; - uri->string = rspamd_mempool_alloc (pool, len + 1); - rspamd_strlcpy (uri->string, p, len + 1); + + if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) { + len += 2; + uri->string = rspamd_mempool_alloc (pool, len + 1); + memcpy (uri->string, p, u.field_data[UF_SCHEMA].len); + memcpy (uri->string + u.field_data[UF_SCHEMA].len, "://", 3); + rspamd_strlcpy (uri->string + u.field_data[UF_SCHEMA].len + 3, + p + u.field_data[UF_SCHEMA].len + 1, + len - 1 - u.field_data[UF_SCHEMA].len); + /* Compensate slashes added */ + for (i = UF_SCHEMA + 1; i < UF_MAX; i++) { + if (u.field_set & (1 << i)) { + u.field_data[i].off += 2; + } + } + } + else { + uri->string = rspamd_mempool_alloc (pool, len + 1); + rspamd_strlcpy (uri->string, p, len + 1); + } + uri->urllen = len; for (i = 0; i < UF_MAX; i++) { @@ -1591,15 +1620,12 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, } uri->port = u.port; + uri->flags = flags; if (!uri->hostlen) { return URI_ERRNO_HOST_MISSING; } - if (obscured) { - uri->flags |= RSPAMD_URL_FLAG_OBSCURED; - } - /* Now decode url symbols */ unquoted_len = rspamd_url_decode (uri->string, uri->string, @@ -1961,13 +1987,14 @@ url_web_end (struct url_callback_data *cb, { const gchar *last = NULL; gint len = cb->end - pos; + guint flags = 0; if (match->newline_pos && match->st != '<') { /* We should also limit our match end to the newline */ len = MIN (len, match->newline_pos - pos); } - if (rspamd_web_parse (NULL, pos, len, &last, FALSE, NULL) != 0) { + if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) { return FALSE; } @@ -2026,6 +2053,7 @@ url_email_end (struct url_callback_data *cb, const gchar *last = NULL; struct http_parser_url u; gint len = cb->end - pos; + guint flags = 0; if (match->newline_pos && match->st != '<') { /* We should also limit our match end to the newline */ @@ -2034,7 +2062,7 @@ url_email_end (struct url_callback_data *cb, if (!match->prefix || match->prefix[0] == '\0') { /* We have mailto:// at the beginning */ - if (rspamd_mailto_parse (&u, pos, len, &last, FALSE) != 0) { + if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) { return FALSE; } diff --git a/src/libserver/url.h b/src/libserver/url.h index a9466c636..3c0ff7599 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -21,6 +21,10 @@ enum rspamd_url_flags { RSPAMD_URL_FLAG_SCHEMAENCODED = 1 << 8, RSPAMD_URL_FLAG_PATHENCODED = 1 << 9, RSPAMD_URL_FLAG_QUERYENCODED = 1 << 10, + RSPAMD_URL_FLAG_MISSINGSLASHES = 1 << 11, + RSPAMD_URL_FLAG_IDN = 1 << 12, + RSPAMD_URL_FLAG_HAS_PORT = 1 << 13, + RSPAMD_URL_FLAG_HAS_USER = 1 << 12, }; struct rspamd_url_tag { -- 2.39.5