source.dussan.org Git - rspamd.git/commitdiff
More fixes to urls parser.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 20 Feb 2015 12:57:20 +0000 (12:57 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 20 Feb 2015 13:17:34 +0000 (13:17 +0000)
src/libserver/url.c

index 23620f11e1e9b005ac516b4625d8e34299e37797..2883f802735e061886616693b5a546142fe95c40 100644 (file)
@@ -739,17 +739,18 @@ enum {
        IS_URLSAFE)) != 0)
 
 void
-rspamd_unescape_uri (gchar **dst, gchar **src, gsize size)
+rspamd_unescape_uri (gchar *dst, const gchar *src, gsize size)
 {
-       gchar *d, *s, ch, c, decoded;
+       gchar *d, ch, c, decoded;
+       const gchar *s;
        enum {
                sw_usual = 0,
                sw_quoted,
                sw_quoted_second
        } state;
 
-       d = *dst;
-       s = *src;
+       d = dst;
+       s = src;
 
        state = 0;
        decoded = 0;
@@ -760,10 +761,6 @@ rspamd_unescape_uri (gchar **dst, gchar **src, gsize size)
 
                switch (state) {
                case sw_usual:
-                       if (ch == '?') {
-                               *d++ = ch;
-                               goto done;
-                       }
 
                        if (ch == '%') {
                                state = sw_quoted;
@@ -811,11 +808,6 @@ rspamd_unescape_uri (gchar **dst, gchar **src, gsize size)
                        if (c >= 'a' && c <= 'f') {
                                ch = ((decoded << 4) + c - 'a' + 10);
 
-                               if (ch == '?') {
-                                       *d++ = ch;
-                                       goto done;
-                               }
-
                                *d++ = ch;
                                break;
                        }
@@ -825,10 +817,7 @@ rspamd_unescape_uri (gchar **dst, gchar **src, gsize size)
                }
        }
 
-       done:
-
-       *dst = d;
-       *src = s;
+       *d = '\0';
 }
 
 const gchar *
@@ -1050,11 +1039,12 @@ static gint
 rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                gchar const **end, gboolean strict)
 {
-       const gchar *p = str, *c = str, *last = str + len;
+       const gchar *p = str, *c = str, *last = str + len, *slash = NULL;
        gchar t;
        gunichar uc;
        glong pt;
        gint ret = 1;
+       gboolean user_seen = FALSE;
        enum {
                parse_protocol,
                parse_slash,
@@ -1065,6 +1055,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                parse_password_start,
                parse_password,
                parse_domain,
+               parse_port_password,
                parse_port,
                parse_suffix_slash,
                parse_path,
@@ -1103,15 +1094,8 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                        break;
                case parse_slash_slash:
                        c = p;
-
-                       /* XXX: inefficient lookahead */
-                       if (strchr (p, '@') != NULL) {
-                               st = parse_user;
-                       }
-                       else {
-                               st = parse_domain;
-                       }
-
+                       st = parse_domain;
+                       slash = p;
                        break;
                case parse_user:
                        if (t == ':') {
@@ -1164,13 +1148,16 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                if (p - c == 0) {
                                        goto out;
                                }
-                               SET_U (u, UF_HOST);
-
                                if (t == '/') {
+                                       SET_U (u, UF_HOST);
                                        st = parse_suffix_slash;
                                }
-                               else {
-                                       st = parse_port;
+                               else if (!user_seen) {
+                                       /*
+                                        * Here we can have both port and password, hence we need
+                                        * to apply some heuristic here
+                                        */
+                                       st = parse_port_password;
                                        c = p + 1;
                                }
                                p ++;
@@ -1196,13 +1183,31 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
                                }
                        }
                        break;
+               case parse_port_password:
+                       if (g_ascii_isdigit (t)) {
+                               /* XXX: that breaks urls with passwords starting with number */
+                               st = parse_port;
+                               c = slash;
+                               SET_U (u, UF_HOST);
+                               c = p;
+                       }
+                       else {
+                               /* Rewind back */
+                               p = slash;
+                               c = slash;
+                               user_seen = TRUE;
+                               st = parse_user;
+                       }
+                       break;
                case parse_port:
                        if (t == '/') {
                                pt = strtoul (c, NULL, 10);
                                if (pt == 0 || pt > 65535) {
                                        goto out;
                                }
-                               u->port = pt;
+                               if (u != NULL) {
+                                       u->port = pt;
+                               }
                                st = parse_suffix_slash;
                        }
                        else if (!g_ascii_isdigit (t)) {
@@ -1432,14 +1437,8 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        }
 
        /* Now decode url symbols */
-       uri->string = rspamd_mempool_strdup (pool, p);
-
-       if (uri->datalen > 0) {
-               rspamd_unescape_uri (&uri->data, &uri->data, uri->datalen);
-       }
-       if (uri->querylen > 0) {
-               rspamd_unescape_uri (&uri->query, &uri->query, uri->querylen);
-       }
+       uri->string = p;
+       rspamd_unescape_uri (uri->string, uri->string, len);
        rspamd_str_lc (uri->string, uri->protocollen);
        rspamd_str_lc (uri->host,   uri->hostlen);