source.dussan.org Git - rspamd.git/commitdiff
[Fix] Deal with URLs with no slashes after protocol
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 23 Sep 2017 11:38:27 +0000 (12:38 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 23 Sep 2017 11:38:27 +0000 (12:38 +0100)
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h

index 3f479937a500a3308c1eeeaa42ac3bf98604efe2..9056157948666fa6d09ce5fc6f222d91ec59c41c 100644 (file)
@@ -1497,7 +1497,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
                }
        }
 
-       if (rspamd_substring_search (s, len, "://", 3) == (-1)) {
+       if (memchr (s, ':', len) == NULL) {
                /* We have no prefix */
                dlen += sizeof ("http://") - 1;
                no_prefix = TRUE;
index 918f7eee33965ac7c49298dcf67b3e2e67a9e3b4..824c0dff88804e35e773a95d294434ea8ae6338c 100644 (file)
@@ -46,6 +46,8 @@
 #include "message.h"
 #include "multipattern.h"
 #include "contrib/uthash/utlist.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
 
 typedef struct url_match_s {
        const gchar *m_begin;
@@ -153,21 +155,17 @@ struct url_matcher static_matchers[] = {
                /* Common prefixes */
                {"file://",   "",          url_file_start,  url_file_end,
                                0, 0},
-               {"file:\\\\",   "",          url_file_start,  url_file_end,
+               {"file:\\\\",   "",        url_file_start,  url_file_end,
                                0, 0},
                {"ftp://",    "",          url_web_start,   url_web_end,
                                0, 0},
-               {"ftp:\\\\",    "",          url_web_start,   url_web_end,
+               {"ftp:\\\\",    "",        url_web_start,   url_web_end,
                                0, 0},
                {"sftp://",   "",          url_web_start,   url_web_end,
                                0, 0},
-               {"http://",   "",          url_web_start,   url_web_end,
+               {"http:",   "",            url_web_start,   url_web_end,
                                0, 0},
-               {"http:\\\\",   "",          url_web_start,   url_web_end,
-                               0, 0},
-               {"https://",  "",          url_web_start,   url_web_end,
-                               0, 0},
-               {"https:\\\\",  "",          url_web_start,   url_web_end,
+               {"https:",   "",           url_web_start,   url_web_end,
                                0, 0},
                {"news://",   "",          url_web_start,   url_web_end,
                                0, 0},
@@ -550,7 +548,7 @@ is_url_end (gchar c)
 
 static gint
 rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
-               gchar const **end, gboolean strict)
+               gchar const **end, gboolean strict, guint *flags)
 {
        const gchar *p = str, *c = str, *last = str + len;
        gchar t;
@@ -591,6 +589,7 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        p++;
                                }
                                else {
+                                       *flags |= RSPAMD_URL_FLAG_MISSINGSLASHES;
                                        st = parse_slash_slash;
                                }
                                break;
@@ -704,11 +703,11 @@ rspamd_mailto_parse (struct http_parser_url *u, const gchar *str, gsize len,
 
 static gint
 rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
-               gchar const **end, gboolean strict, gboolean *obscured)
+               gchar const **end, gboolean strict, guint *flags)
 {
        const gchar *p = str, *c = str, *last = str + len, *slash = NULL;
        gchar t;
-       gunichar uc;
+       UChar32 uc;
        glong pt;
        gint ret = 1;
        gboolean user_seen = FALSE;
@@ -766,6 +765,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                }
                                else {
                                        st = parse_slash_slash;
+                                       *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES;
                                }
                                break;
                        case parse_slash:
@@ -833,15 +833,13 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                                /* We have multiple at in fact */
                                                st = parse_multiple_at;
                                                user_seen = TRUE;
-
-                                               if (obscured) {
-                                                       *obscured = TRUE;
-                                               }
+                                               *flags |= RSPAMD_URL_FLAG_OBSCURED;
 
                                                continue;
                                        }
 
                                        SET_U (u, UF_USERINFO);
+                                       *flags |= RSPAMD_URL_FLAG_HAS_USER;
                                        st = parse_at;
                                }
                                else if (!g_ascii_isgraph (t)) {
@@ -867,6 +865,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                if (t == '@') {
                                        /* Empty password */
                                        SET_U (u, UF_USERINFO);
+                                       *flags |= RSPAMD_URL_FLAG_HAS_USER;
                                        st = parse_at;
                                }
                                else {
@@ -878,6 +877,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                        case parse_password:
                                if (t == '@') {
                                        /* XXX: password is not stored */
+                                       *flags |= RSPAMD_URL_FLAG_HAS_USER;
                                        SET_U (u, UF_USERINFO);
                                        st = parse_at;
                                }
@@ -942,24 +942,32 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                                st = parse_user;
                                        }
                                        else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
-                                               uc = g_utf8_get_char_validated (p, last - p);
+                                               if (*p & 0x80) {
+                                                       *flags |= RSPAMD_URL_FLAG_IDN;
+                                                       guint i = 0;
 
-                                               if (uc == (gunichar) -1) {
-                                                       /* Bad utf8 */
-                                                       goto out;
-                                               }
+                                                       U8_NEXT (p, i, last - p, uc);
 
-                                               if (!g_unichar_isalnum (uc)) {
-                                                       /* Bad symbol */
-                                                       if (strict) {
+                                                       if (uc < 0) {
+                                                               /* Bad utf8 */
                                                                goto out;
                                                        }
-                                                       else {
-                                                               goto set;
+
+                                                       if (!u_isalnum (uc)) {
+                                                               /* Bad symbol */
+                                                               if (strict) {
+                                                                       goto out;
+                                                               }
+                                                               else {
+                                                                       goto set;
+                                                               }
                                                        }
-                                               }
 
-                                               p = g_utf8_next_char (p);
+                                                       p = p + i;
+                                               }
+                                               else {
+                                                       p ++;
+                                               }
                                        }
                                        else {
                                                p++;
@@ -992,6 +1000,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        }
                                        if (u != NULL) {
                                                u->port = pt;
+                                               *flags |= RSPAMD_URL_FLAG_HAS_PORT;
                                        }
                                        st = parse_suffix_slash;
                                }
@@ -1002,6 +1011,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        }
                                        if (u != NULL) {
                                                u->port = pt;
+                                               *flags |= RSPAMD_URL_FLAG_HAS_PORT;
                                        }
 
                                        c = p + 1;
@@ -1014,6 +1024,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                        }
                                        if (u != NULL) {
                                                u->port = pt;
+                                               *flags |= RSPAMD_URL_FLAG_HAS_PORT;
                                        }
 
                                        c = p + 1;
@@ -1514,9 +1525,8 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        struct http_parser_url u;
        gchar *p, *comp;
        const gchar *end;
-       guint i, complen, ret;
+       guint i, complen, ret, flags = 0;
        gsize unquoted_len = 0;
-       gboolean obscured = FALSE;
 
        memset (uri, 0, sizeof (*uri));
        memset (&u, 0, sizeof (u));
@@ -1531,14 +1541,14 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        if (len > sizeof ("mailto:") - 1) {
                /* For mailto: urls we also need to add slashes to make it a valid URL */
                if (g_ascii_strncasecmp (p, "mailto:", sizeof ("mailto:") - 1) == 0) {
-                       ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE);
+                       ret = rspamd_mailto_parse (&u, uristring, len, &end, TRUE, &flags);
                }
                else {
-                       ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &obscured);
+                       ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
                }
        }
        else {
-               ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &obscured);
+               ret = rspamd_web_parse (&u, uristring, len, &end, TRUE, &flags);
        }
 
        if (ret != 0) {
@@ -1551,8 +1561,27 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 
        uri->raw = p;
        uri->rawlen = len;
-       uri->string = rspamd_mempool_alloc (pool, len + 1);
-       rspamd_strlcpy (uri->string, p, len + 1);
+
+       if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) {
+               len += 2; /* "scheme:rest" is rebuilt as "scheme://rest" */
+               uri->string = rspamd_mempool_alloc (pool, len + 1);
+               memcpy (uri->string, p, u.field_data[UF_SCHEMA].len);
+               memcpy (uri->string + u.field_data[UF_SCHEMA].len, "://", 3);
+               rspamd_strlcpy (uri->string + u.field_data[UF_SCHEMA].len + 3,
+                       p + u.field_data[UF_SCHEMA].len + 1,
+                               len - 2 - u.field_data[UF_SCHEMA].len); /* tail bytes + NUL: exactly the room left in the buffer */
+               /* Shift offsets of the fields after the schema by the two slashes added */
+               for (i = UF_SCHEMA + 1; i < UF_MAX; i++) {
+                       if (u.field_set & (1 << i)) {
+                               u.field_data[i].off += 2;
+                       }
+               }
+       }
+       else {
+               uri->string = rspamd_mempool_alloc (pool, len + 1);
+               rspamd_strlcpy (uri->string, p, len + 1);
+       }
+
        uri->urllen = len;
 
        for (i = 0; i < UF_MAX; i++) {
@@ -1591,15 +1620,12 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        }
 
        uri->port = u.port;
+       uri->flags = flags;
 
        if (!uri->hostlen) {
                return URI_ERRNO_HOST_MISSING;
        }
 
-       if (obscured) {
-               uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
-       }
-
        /* Now decode url symbols */
        unquoted_len = rspamd_url_decode (uri->string,
                        uri->string,
@@ -1961,13 +1987,14 @@ url_web_end (struct url_callback_data *cb,
 {
        const gchar *last = NULL;
        gint len = cb->end - pos;
+       guint flags = 0;
 
        if (match->newline_pos && match->st != '<') {
                /* We should also limit our match end to the newline */
                len = MIN (len, match->newline_pos - pos);
        }
 
-       if (rspamd_web_parse (NULL, pos, len, &last, FALSE, NULL) != 0) {
+       if (rspamd_web_parse (NULL, pos, len, &last, FALSE, &flags) != 0) {
                return FALSE;
        }
 
@@ -2026,6 +2053,7 @@ url_email_end (struct url_callback_data *cb,
        const gchar *last = NULL;
        struct http_parser_url u;
        gint len = cb->end - pos;
+       guint flags = 0;
 
        if (match->newline_pos && match->st != '<') {
                /* We should also limit our match end to the newline */
@@ -2034,7 +2062,7 @@ url_email_end (struct url_callback_data *cb,
 
        if (!match->prefix || match->prefix[0] == '\0') {
                /* We have mailto:// at the beginning */
-               if (rspamd_mailto_parse (&u, pos, len, &last, FALSE) != 0) {
+               if (rspamd_mailto_parse (&u, pos, len, &last, FALSE, &flags) != 0) {
                        return FALSE;
                }
 
index a9466c6369b6691972e949d2083681ee42f93e6f..3c0ff759974cff96038be9d4e633fd02af3b0239 100644 (file)
@@ -21,6 +21,10 @@ enum rspamd_url_flags {
        RSPAMD_URL_FLAG_SCHEMAENCODED = 1 << 8,
        RSPAMD_URL_FLAG_PATHENCODED = 1 << 9,
        RSPAMD_URL_FLAG_QUERYENCODED = 1 << 10,
+       RSPAMD_URL_FLAG_MISSINGSLASHES = 1 << 11, /* no "//" after the scheme colon */
+       RSPAMD_URL_FLAG_IDN = 1 << 12, /* a byte with the high bit set was seen by the web parser */
+       RSPAMD_URL_FLAG_HAS_PORT = 1 << 13, /* a port number was parsed and stored */
+       RSPAMD_URL_FLAG_HAS_USER = 1 << 14, /* userinfo part present; must not share a bit with FLAG_IDN */
 };
 
 struct rspamd_url_tag {