source.dussan.org Git - rspamd.git/commitdiff
[Fix] Core: Implement logic to find some bad characters in URLs
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 16 Jan 2019 15:04:50 +0000 (15:04 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 16 Jan 2019 15:04:50 +0000 (15:04 +0000)
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h
src/plugins/surbl.c

index cbc0fe7da6b319dc32eedd79be58cc168b70cc09..e97a010fe29e8819a24a6852b258eb5944961548 100644 (file)
@@ -571,7 +571,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
                        }
                }
                text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
-               rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);
+               rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool,
+                               RSPAMD_URL_PARSE_TEXT);
 
                if (rc == URI_ERRNO_OK) {
                        disp_tok.len = text_url->hostlen;
@@ -991,23 +992,61 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
                        state = ignore_bad_tag;
                }
                else {
+                       const guchar *attr_name_end = in;
+
                        if (*in == '=') {
                                state = parse_equal;
                        }
+                       else if (*in == '"') {
+                               /* No equal or something sane but we have quote character */
+                               state = parse_start_dquote;
+                               attr_name_end = in - 1;
+
+                               while (attr_name_end > *savep) {
+                                       if (!g_ascii_isalnum (*attr_name_end)) {
+                                               attr_name_end --;
+                                       }
+                                       else {
+                                               break;
+                                       }
+                               }
+
+                               /* One character forward to obtain length */
+                               attr_name_end ++;
+                       }
                        else if (g_ascii_isspace (*in)) {
                                state = spaces_before_eq;
                        }
                        else if (*in == '/') {
                                tag->flags |= FL_CLOSED;
                        }
+                       else if (!g_ascii_isgraph (*in)) {
+                               state = parse_value;
+                               attr_name_end = in - 1;
+
+                               while (attr_name_end > *savep) {
+                                       if (!g_ascii_isalnum (*attr_name_end)) {
+                                               attr_name_end --;
+                                       }
+                                       else {
+                                               break;
+                                       }
+                               }
+
+                               /* One character forward to obtain length */
+                               attr_name_end ++;
+                       }
                        else {
                                return;
                        }
 
-                       if (!rspamd_html_parse_tag_component (pool, *savep, in, tag)) {
+                       if (!rspamd_html_parse_tag_component (pool, *savep, attr_name_end, tag)) {
                                /* Ignore unknown params */
                                *savep = NULL;
                        }
+                       else if (state == parse_value) {
+                               *savep = in + 1;
+                       }
                }
 
                break;
@@ -1153,7 +1192,7 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
                        tag->flags |= FL_CLOSED;
                        store = TRUE;
                }
-               else if (g_ascii_isspace (*in) || *in == '>') {
+               else if (g_ascii_isspace (*in) || *in == '>' || *in == '"') {
                        store = TRUE;
                        state = spaces_after_param;
                }
@@ -1210,6 +1249,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
                struct html_tag_component *comp)
 {
        struct rspamd_url *url;
+       guint saved_flags = 0;
        gchar *decoded;
        gint rc;
        gsize decoded_len;
@@ -1301,13 +1341,23 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 
        url = rspamd_mempool_alloc0 (pool, sizeof (*url));
 
-       if (rspamd_normalise_unicode_inplace (pool, decoded, &dlen)) {
-               url->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
+       enum rspamd_normalise_result norm_res;
+
+       norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen);
+
+       if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) {
+               saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED;
        }
 
-       rc = rspamd_url_parse (url, decoded, dlen, pool);
+       if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) {
+               saved_flags |= RSPAMD_URL_FLAG_OBSCURED;
+       }
+
+       rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
 
        if (rc == URI_ERRNO_OK) {
+               url->flags |= saved_flags;
+
                if (has_bad_chars) {
                        url->flags |= RSPAMD_URL_FLAG_OBSCURED;
                }
@@ -1439,7 +1489,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
                        rc = rspamd_url_parse (query_url,
                                        url_str,
                                        strlen (url_str),
-                                       pool);
+                                       pool,
+                                       RSPAMD_URL_PARSE_TEXT);
 
                        if (rc == URI_ERRNO_OK &&
                                        query_url->hostlen > 0) {
index e27a2c39b14892031f2ac4e10c330e534c684f45..3a08ec74805264d9bb3536d332bbda8b9243c95e 100644 (file)
@@ -564,8 +564,10 @@ is_url_end (gchar c)
 }
 
 static gint
-rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
-               gchar const **end, gboolean strict, guint *flags)
+rspamd_mailto_parse (struct http_parser_url *u,
+                                        const gchar *str, gsize len,
+                                        gchar const **end,
+                                        enum rspamd_url_parse_flags parse_flags, guint *flags)
 {
        const gchar *p = str, *c = str, *last = str + len;
        gchar t;
@@ -711,7 +713,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
                *end = p;
        }
 
-       if (!strict) {
+       if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
                return 0;
        }
 
@@ -720,7 +722,9 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
 
 static gint
 rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
-               gchar const **end, gboolean strict, guint *flags)
+                                 gchar const **end,
+                                 enum rspamd_url_parse_flags parse_flags,
+                                 guint *flags)
 {
        const gchar *p = str, *c = str, *last = str + len, *slash = NULL,
                        *password_start = NULL, *user_start = NULL;
@@ -763,7 +767,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        SET_U (u, UF_SCHEMA);
                                }
                                else if (!g_ascii_isalnum (t) && t != '+' && t != '-') {
-                                       if (!strict && p > c) {
+                                       if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) {
                                                /* We might have some domain, but no protocol */
                                                st = parse_domain;
                                                p = c;
@@ -985,7 +989,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        }
                                        else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
                                                if (*p & 0x80) {
-                                                       *flags |= RSPAMD_URL_FLAG_IDN;
+                                                       (*flags) |= RSPAMD_URL_FLAG_IDN;
                                                        guint i = 0;
 
                                                        U8_NEXT (p, i, last - p, uc);
@@ -997,11 +1001,16 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 
                                                        if (!u_isalnum (uc)) {
                                                                /* Bad symbol */
-                                                               if (strict) {
-                                                                       goto out;
+                                                               if (IS_ZERO_WIDTH_SPACE (uc)) {
+                                                                       (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
                                                                }
                                                                else {
-                                                                       goto set;
+                                                                       if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+                                                                               goto out;
+                                                                       }
+                                                                       else {
+                                                                               goto set;
+                                                                       }
                                                                }
                                                        }
 
@@ -1011,11 +1020,18 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                                        p ++;
                                                }
                                                else {
-                                                       if (strict) {
-                                                               goto out;
+                                                       if (parse_flags & RSPAMD_URL_PARSE_HREF) {
+                                                               /* We have to use all shit we are given here */
+                                                               p ++;
+                                                               (*flags) |= RSPAMD_URL_FLAG_OBSCURED;
                                                        }
                                                        else {
-                                                               goto set;
+                                                               if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+                                                                       goto out;
+                                                               }
+                                                               else {
+                                                                       goto set;
+                                                               }
                                                        }
                                                }
                                        }
@@ -1117,7 +1133,8 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        goto set;
                                }
                                else if (!g_ascii_isdigit (t)) {
-                                       if (strict || !g_ascii_isspace (t)) {
+                                       if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) ||
+                                               !g_ascii_isspace (t)) {
                                                goto out;
                                        }
                                        else {
@@ -1148,7 +1165,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        goto set;
                                }
                                else if (is_lwsp (t)) {
-                                       if (strict) {
+                                       if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
                                                if (g_ascii_isspace (t)) {
                                                        goto set;
                                                }
@@ -1172,7 +1189,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        goto set;
                                }
                                else if (is_lwsp (t)) {
-                                       if (strict) {
+                                       if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
                                                if (g_ascii_isspace (t)) {
                                                        goto set;
                                                }
@@ -1189,7 +1206,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        goto set;
                                }
                                else if (is_lwsp (t)) {
-                                       if (strict) {
+                                       if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
                                                if (g_ascii_isspace (t)) {
                                                        goto set;
                                                }
@@ -1602,8 +1619,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen,
 }
 
 enum uri_errno
-rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
-               rspamd_mempool_t *pool)
+rspamd_url_parse (struct rspamd_url *uri,
+                                 gchar *uristring, gsize len,
+                                 rspamd_mempool_t *pool,
+                                 enum rspamd_url_parse_flags parse_flags)
 {
        struct http_parser_url u;
        gchar *p, *comp;
@@ -1624,14 +1643,16 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        if (len > sizeof ("mailto:") - 1) {
                /* For mailto: urls we also need to add slashes to make it a valid URL */
                if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
-                       ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags);
+                       ret = rspamd_mailto_parse (&u, uristring, len, &end, parse_flags,
+                                       &flags);
                }
                else {
-                       ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
+                       ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags,
+                                       &flags);
                }
        }
        else {
-               ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
+               ret = rspamd_web_parse (&u, uristring, len, &end, parse_flags, &flags);
        }
 
        if (ret != 0) {
@@ -1715,9 +1736,11 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                        uri->protocollen);
        rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
        unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
+
        if (rspamd_normalise_unicode_inplace (pool, uri->host, &unquoted_len)) {
                uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED;
        }
+
        rspamd_url_shift (uri, unquoted_len, UF_HOST);
 
        if (uri->datalen) {
@@ -1730,6 +1753,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
                rspamd_url_shift (uri, unquoted_len, UF_PATH);
        }
+
        if (uri->querylen) {
                unquoted_len = rspamd_url_decode (uri->query,
                                uri->query,
@@ -1739,6 +1763,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                }
                rspamd_url_shift (uri, unquoted_len, UF_QUERY);
        }
+
        if (uri->fragmentlen) {
                unquoted_len = rspamd_url_decode (uri->fragment,
                                uri->fragment,
@@ -1769,7 +1794,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                                uri->host, uri->hostlen,
                                rspamd_tld_trie_callback, uri, NULL);
 
-       if (uri->tldlen == 0) {
+       if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && uri->tldlen == 0) {
                /* Ignore URL's without TLD if it is not a numeric URL */
                if (!rspamd_url_is_ip (uri, pool)) {
                        return URI_ERRNO_TLD_MISSING;
@@ -1777,7 +1802,13 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        }
 
        if (uri->protocol == PROTOCOL_UNKNOWN) {
-               return URI_ERRNO_INVALID_PROTOCOL;
+               if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+                       return URI_ERRNO_INVALID_PROTOCOL;
+               }
+               else {
+                       /* Hack, hack, hack */
+                       uri->protocol = PROTOCOL_HTTP;
+               }
        }
 
        return URI_ERRNO_OK;
@@ -2089,7 +2120,8 @@ url_web_end (struct url_callback_data *cb,
                len = MIN (len, match->newline_pos - pos);
        }
 
-       if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) {
+       if (rspamd_web_parse (NULL, pos, len, &last,
+                       RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
                return FALSE;
        }
 
@@ -2157,7 +2189,8 @@ url_email_end (struct url_callback_data *cb,
 
        if (!match->prefix || match->prefix[0] == '\0') {
                /* We have mailto:// at the beginning */
-               if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) {
+               if (rspamd_mailto_parse (&u, pos, len, &last,
+                               RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
                        return FALSE;
                }
 
@@ -2470,7 +2503,9 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
                cb->fin = m.m_begin + m.m_len;
                url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
                g_strstrip (cb->url_str);
-               rc = rspamd_url_parse (url, cb->url_str, strlen (cb->url_str), pool);
+               rc = rspamd_url_parse (url, cb->url_str,
+                               strlen (cb->url_str), pool,
+                               RSPAMD_URL_PARSE_TEXT);
 
                if (rc == URI_ERRNO_OK && url->hostlen > 0) {
                        if (cb->prefix_added) {
@@ -2583,7 +2618,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
                        rc = rspamd_url_parse (query_url,
                                        url_str,
                                        strlen (url_str),
-                                       task->task_pool);
+                                       task->task_pool,
+                                       RSPAMD_URL_PARSE_TEXT);
 
                        if (rc == URI_ERRNO_OK &&
                                        query_url->hostlen > 0) {
@@ -2737,7 +2773,8 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
                        rc = rspamd_url_parse (query_url,
                                        url_str,
                                        strlen (url_str),
-                                       task->task_pool);
+                                       task->task_pool,
+                                       RSPAMD_URL_PARSE_TEXT);
 
                        if (rc == URI_ERRNO_OK &&
                                        url->hostlen > 0) {
index b0cc102394d903b596ce27f0b15aa338c9e11bf6..a9eda71de5daacca024d1cf5339a334e004ca053 100644 (file)
@@ -104,6 +104,12 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool,
        struct rspamd_mime_text_part *part,
        gboolean is_html);
 
+enum rspamd_url_parse_flags { /* Bit flags selecting how rspamd_url_parse and its mailto/web helpers treat questionable input */
+       RSPAMD_URL_PARSE_TEXT = 0, /* No flags set: strict parsing, as used for URLs extracted from text */
+       RSPAMD_URL_PARSE_HREF = (1u << 0), /* HTML href value: bad host characters are kept and flagged as obscured; a missing TLD or unknown protocol is not an error */
+       RSPAMD_URL_PARSE_CHECK = (1 << 1), /* Lenient boundary check (the former strict == FALSE mode) used by the URL end matchers */
+};
+
 /*
  * Parse a single url into an uri structure
  * @param pool memory pool
@@ -111,9 +117,10 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool,
  * @param uri url object, must be pre allocated
  */
 enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
-       gchar *uristring,
-       gsize len,
-       rspamd_mempool_t *pool);
+                                                                gchar *uristring,
+                                                                gsize len,
+                                                                rspamd_mempool_t *pool,
+                                                                enum rspamd_url_parse_flags flags);
 
 /*
  * Try to extract url from a text
index 4bc17db20846c51389125411066c2d6e50a9a8f7..ab9e5bb47eaf12bfa1b1fa8e6f700ce178018d87 100644 (file)
@@ -1660,7 +1660,7 @@ surbl_redirector_finish (struct rspamd_http_connection *conn,
                                        sizeof (*redirected_url));
                        rspamd_strlcpy (urlstr, hdr->begin, urllen + 1);
                        r = rspamd_url_parse (redirected_url, urlstr, urllen,
-                                       task->task_pool);
+                                       task->task_pool, RSPAMD_URL_PARSE_TEXT);
 
                        if (r == URI_ERRNO_OK) {
                                if ((existing = g_hash_table_lookup (task->urls, redirected_url)) == NULL) {
@@ -2120,7 +2120,7 @@ surbl_is_redirector_handler (lua_State *L)
                url_cpy = rspamd_mempool_alloc (task->task_pool, len);
                memcpy (url_cpy, url, len);
 
-               if (rspamd_url_parse (&uri, url_cpy, len, task->task_pool)) {
+               if (rspamd_url_parse (&uri, url_cpy, len, task->task_pool, RSPAMD_URL_PARSE_TEXT)) {
                        msg_debug_surbl ("check url redirection %*s", uri.urllen,
                                        uri.string);
 
@@ -2198,7 +2198,7 @@ surbl_continue_process_handler (lua_State *L)
                                        sizeof (*redirected_url));
                        rspamd_strlcpy (urlstr, nurl, urllen + 1);
                        r = rspamd_url_parse (redirected_url, urlstr, urllen,
-                                       task->task_pool);
+                                       task->task_pool, RSPAMD_URL_PARSE_TEXT);
 
                        if (r == URI_ERRNO_OK) {
                                if (!g_hash_table_lookup (task->urls, redirected_url)) {