From 0f0717ee7ad5ee93f2ecfb24e8f57fbb42e8feca Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 16 Jan 2019 15:04:50 +0000 Subject: [PATCH] [Fix] Core: Implement logic to find some bad characters in URLs --- src/libserver/html.c | 65 +++++++++++++++++++++++++++---- src/libserver/url.c | 93 +++++++++++++++++++++++++++++++------------- src/libserver/url.h | 13 +++++-- src/plugins/surbl.c | 6 +-- 4 files changed, 136 insertions(+), 41 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index cbc0fe7da..e97a010fe 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -571,7 +571,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool, } } text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); - rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool); + rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool, + RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK) { disp_tok.len = text_url->hostlen; @@ -991,23 +992,61 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, state = ignore_bad_tag; } else { + const guchar *attr_name_end = in; + if (*in == '=') { state = parse_equal; } + else if (*in == '"') { + /* No equal or something sane but we have quote character */ + state = parse_start_dquote; + attr_name_end = in - 1; + + while (attr_name_end > *savep) { + if (!g_ascii_isalnum (*attr_name_end)) { + attr_name_end --; + } + else { + break; + } + } + + /* One character forward to obtain length */ + attr_name_end ++; + } else if (g_ascii_isspace (*in)) { state = spaces_before_eq; } else if (*in == '/') { tag->flags |= FL_CLOSED; } + else if (!g_ascii_isgraph (*in)) { + state = parse_value; + attr_name_end = in - 1; + + while (attr_name_end > *savep) { + if (!g_ascii_isalnum (*attr_name_end)) { + attr_name_end --; + } + else { + break; + } + } + + /* One character forward to obtain length */ + attr_name_end ++; + } else { return; } - if (!rspamd_html_parse_tag_component (pool, *savep, in, tag)) { + if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) { /* Ignore unknown params */ *savep = NULL; } + else if (state == parse_value) { + *savep = in + 1; + } } break; @@ -1153,7 +1192,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, tag->flags |= FL_CLOSED; store = TRUE; } - else if (g_ascii_isspace (*in) || *in == '>') { + else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') { store = TRUE; state = spaces_after_param; } @@ -1210,6 +1249,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, struct html_tag_component *comp) { struct rspamd_url *url; + guint saved_flags = 0; gchar *decoded; gint rc; gsize decoded_len; @@ -1301,13 +1341,23 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, url = rspamd_mempool_alloc0 (pool, sizeof (*url)); - if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) { - url->flags |= RSPAMD_URL_FLAG_UNNORMALISED; + enum rspamd_normalise_result norm_res; + + norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen); + + if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { + saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED; } - rc = rspamd_url_parse (url, decoded, dlen, pool); + if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) { + saved_flags |= RSPAMD_URL_FLAG_OBSCURED; + } + + rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); if (rc == URI_ERRNO_OK) { + url->flags |= saved_flags; + if (has_bad_chars) { url->flags |= RSPAMD_URL_FLAG_OBSCURED; } @@ -1439,7 +1489,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, rc = rspamd_url_parse (query_url, url_str, strlen (url_str), - pool); + pool, + RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && query_url->hostlen > 0) { diff --git a/src/libserver/url.c b/src/libserver/url.c index e27a2c39b..3a08ec748 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -564,8 +564,10 @@ is_url_end (gchar c) } static gint -rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len, - gchar const **end, gboolean strict, guint *flags) +rspamd_mailto_parse (struct http_parser_url *u, + const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, guint *flags) { const gchar *p = str, *c = str, *last = str + len; gchar t; @@ -711,7 +713,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len, *end = p; } - if (!strict) { + if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { return 0; } @@ -720,7 +722,9 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len, static gint rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, - gchar const **end, gboolean strict, guint *flags) + gchar const **end, + enum rspamd_url_parse_flags parse_flags, + guint *flags) { const gchar *p = str, *c = str, *last = str + len, *slash = NULL, *password_start = NULL, *user_start = NULL; @@ -763,7 +767,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, SET_U (u, UF_SCHEMA); } else if (!g_ascii_isalnum (t) && t != '+' && t != '-') { - if (!strict && p > c) { + if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) { /* We might have some domain, but no protocol */ st = parse_domain; p = c; @@ -985,7 +989,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, } else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') { if (*p & 0x80) { - *flags |= RSPAMD_URL_FLAG_IDN; + (*flags) |= RSPAMD_URL_FLAG_IDN; guint i = 0; U8_NEXT (p, i, last - p, uc); @@ -997,11 +1001,16 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, if (!u_isalnum (uc)) { /* Bad symbol */ - if (strict) { - goto out; + if (IS_ZERO_WIDTH_SPACE (uc)) { + (*flags) |= RSPAMD_URL_FLAG_OBSCURED; } else { - goto set; + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; + } } } @@ -1011,11 +1020,18 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, p ++; } else { - if (strict) { - goto out; + if (parse_flags & RSPAMD_URL_PARSE_HREF) { + /* We have to use all shit we are given here */ + p ++; + (*flags) |= RSPAMD_URL_FLAG_OBSCURED; } else { - goto set; + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; + } } } } @@ -1117,7 +1133,8 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, goto set; } else if (!g_ascii_isdigit (t)) { - if (strict || !g_ascii_isspace (t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) || + !g_ascii_isspace (t)) { goto out; } else { @@ -1148,7 +1165,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, goto set; } else if (is_lwsp (t)) { - if (strict) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace (t)) { goto set; } @@ -1172,7 +1189,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, goto set; } else if (is_lwsp (t)) { - if (strict) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace (t)) { goto set; } @@ -1189,7 +1206,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, goto set; } else if (is_lwsp (t)) { - if (strict) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace (t)) { goto set; } @@ -1602,8 +1619,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen, } enum uri_errno -rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, - rspamd_mempool_t *pool) +rspamd_url_parse (struct rspamd_url *uri, + gchar *uristring, gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags parse_flags) { struct http_parser_url u; gchar *p, *comp; @@ -1624,14 +1643,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, if (len > sizeof ("mailto:") - 1) { /* For mailto: urls we also need to add slashes to make it a valid URL */ if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) { - ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags); + ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags, + &flags); } else { - ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags); + ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags, + &flags); } } else { - ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags); + ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags, &flags); } if (ret != 0) { @@ -1715,9 +1736,11 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, uri->protocollen); rspamd_url_shift (uri, unquoted_len, UF_SCHEMA); unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen); + if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) { uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; } + rspamd_url_shift (uri, unquoted_len, UF_HOST); if (uri->datalen) { @@ -1730,6 +1753,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len); rspamd_url_shift (uri, unquoted_len, UF_PATH); } + if (uri->querylen) { unquoted_len = rspamd_url_decode (uri->query, uri->query, @@ -1739,6 +1763,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, } rspamd_url_shift (uri, unquoted_len, UF_QUERY); } + if (uri->fragmentlen) { unquoted_len = rspamd_url_decode (uri->fragment, uri->fragment, @@ -1769,7 +1794,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, uri->host, uri->hostlen, rspamd_tld_trie_callback, uri, NULL); - if (uri->tldlen == 0) { + if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && uri->tldlen == 0) { /* Ignore URL's without TLD if it is not a numeric URL */ if (!rspamd_url_is_ip (uri, pool)) { return URI_ERRNO_TLD_MISSING; @@ -1777,7 +1802,13 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, } if (uri->protocol == PROTOCOL_UNKNOWN) { - return URI_ERRNO_INVALID_PROTOCOL; + if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) { + return URI_ERRNO_INVALID_PROTOCOL; + } + else { + /* Hack, hack, hack */ + uri->protocol = PROTOCOL_HTTP; + } } return URI_ERRNO_OK; @@ -2089,7 +2120,8 @@ url_web_end (struct url_callback_data *cb, len = MIN (len, match->newline_pos - pos); } - if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) { + if (rspamd_web_parse (NULL, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { return FALSE; } @@ -2157,7 +2189,8 @@ url_email_end (struct url_callback_data *cb, if (!match->prefix || match->prefix[0] == '\0') { /* We have mailto:// at the beginning */ - if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) { + if (rspamd_mailto_parse (&u, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { return FALSE; } @@ -2470,7 +2503,9 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp, cb->fin = m.m_begin + m.m_len; url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); g_strstrip (cb->url_str); - rc = rspamd_url_parse (url, cb->url_str, strlen (cb->url_str), pool); + rc = rspamd_url_parse (url, cb->url_str, + strlen (cb->url_str), pool, + RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && url->hostlen > 0) { if (cb->prefix_added) { @@ -2583,7 +2618,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, rc = rspamd_url_parse (query_url, url_str, strlen (url_str), - task->task_pool); + task->task_pool, + RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && query_url->hostlen > 0) { @@ -2737,7 +2773,8 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, rc = rspamd_url_parse (query_url, url_str, strlen (url_str), - task->task_pool); + task->task_pool, + RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && url->hostlen > 0) { diff --git a/src/libserver/url.h b/src/libserver/url.h index b0cc10239..a9eda71de 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -104,6 +104,12 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool, struct rspamd_mime_text_part *part, gboolean is_html); +enum rspamd_url_parse_flags { + RSPAMD_URL_PARSE_TEXT = 0, + RSPAMD_URL_PARSE_HREF = (1u << 0), + RSPAMD_URL_PARSE_CHECK = (1 << 1), +}; + /* * Parse a single url into an uri structure * @param pool memory pool @@ -111,9 +117,10 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool, * @param uri url object, must be pre allocated */ enum uri_errno rspamd_url_parse (struct rspamd_url *uri, - gchar *uristring, - gsize len, - rspamd_mempool_t *pool); + gchar *uristring, + gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags flags); /* * Try to extract url from a text diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index 4bc17db20..ab9e5bb47 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -1660,7 +1660,7 @@ surbl_redirector_finish (struct rspamd_http_connection *conn, sizeof (*redirected_url)); rspamd_strlcpy (urlstr, hdr->begin, urllen + 1); r = rspamd_url_parse (redirected_url, urlstr, urllen, - task->task_pool); + task->task_pool, RSPAMD_URL_PARSE_TEXT); if (r == URI_ERRNO_OK) { if ((existing = g_hash_table_lookup (task->urls, redirected_url)) == NULL) { @@ -2120,7 +2120,7 @@ surbl_is_redirector_handler (lua_State *L) url_cpy = rspamd_mempool_alloc (task->task_pool, len); memcpy (url_cpy, url, len); - if (rspamd_url_parse (&uri, url_cpy, len, task->task_pool)) { + if (rspamd_url_parse (&uri, url_cpy, len, task->task_pool, RSPAMD_URL_PARSE_TEXT)) { msg_debug_surbl ("check url redirection %*s", uri.urllen, uri.string); @@ -2198,7 +2198,7 @@ surbl_continue_process_handler (lua_State *L) sizeof (*redirected_url)); rspamd_strlcpy (urlstr, nurl, urllen + 1); r = rspamd_url_parse (redirected_url, urlstr, urllen, - task->task_pool); + task->task_pool, RSPAMD_URL_PARSE_TEXT); if (r == URI_ERRNO_OK) { if (!g_hash_table_lookup (task->urls, redirected_url)) { -- 2.39.5