source.dussan.org Git - rspamd.git/commitdiff
Remove old crap functions from url parser code.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 3 Feb 2015 17:39:03 +0000 (17:39 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 17 Feb 2015 15:14:09 +0000 (15:14 +0000)
src/libmime/message.c
src/libserver/html.c
src/libserver/protocol.c
src/libserver/url.c
src/libserver/url.h
src/libutil/util.c
src/lua/lua_task.c
src/plugins/regexp.c
src/plugins/surbl.c
src/plugins/surbl.h

index 94137af157ca73b755927b388e8a2cd14511b320..d0549cbd56ef63647a5bf25759a5912380bf8f7c 100644 (file)
@@ -1233,7 +1233,7 @@ process_text_part (struct rspamd_task *task,
                        decode_entitles (text_part->content->data,
                                &text_part->content->len);
                }
-               url_parse_text (task->task_pool, task, text_part, TRUE);
+               rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
 
                rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
                rspamd_mempool_add_destructor (task->task_pool,
@@ -1260,7 +1260,7 @@ process_text_part (struct rspamd_task *task,
                                type,
                                text_part);
                text_part->orig = part_content;
-               url_parse_text (task->task_pool, task, text_part, FALSE);
+               rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
                rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
                task->text_parts = g_list_prepend (task->text_parts, text_part);
        }
@@ -1460,7 +1460,7 @@ process_message (struct rspamd_task *task)
        GMimeDataWrapper *wrapper;
        struct received_header *recv;
        gchar *mid, *url_str, *p, *end, *url_end;
-       struct uri *subject_url;
+       struct rspamd_url *subject_url;
        gsize len;
        gint rc;
 
@@ -1634,14 +1634,14 @@ process_message (struct rspamd_task *task)
 
                while (p < end) {
                        /* Search to the end of url */
-                       if (url_try_text (task->task_pool, p, end - p, NULL, &url_end,
+                       if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
                                &url_str, FALSE)) {
                                if (url_str != NULL) {
                                        subject_url = rspamd_mempool_alloc0 (task->task_pool,
-                                                       sizeof (struct uri));
+                                                       sizeof (struct rspamd_url));
                                        if (subject_url != NULL) {
                                                /* Try to parse url */
-                                               rc = parse_uri (subject_url, url_str, task->task_pool);
+                                               rc = rspamd_url_parse (subject_url, url_str, task->task_pool);
                                                if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
                                                        rc == URI_ERRNO_NO_HOST_SLASH) &&
                                                        subject_url->hostlen > 0) {
@@ -1656,7 +1656,7 @@ process_message (struct rspamd_task *task)
                                                else if (rc != URI_ERRNO_OK) {
                                                        msg_info ("extract of url '%s' failed: %s",
                                                                url_str,
-                                                               url_strerror (rc));
+                                                               rspamd_url_strerror (rc));
                                                }
                                        }
                                }
index 5055a9aae22db82a68b657b8358ed4876f4fbafe..7df9270c3b58a8aca03f28591f518a2145f078cb 100644 (file)
@@ -674,12 +674,12 @@ decode_entitles (gchar *s, guint * len)
 
 static void
 check_phishing (struct rspamd_task *task,
-       struct uri *href_url,
+       struct rspamd_url *href_url,
        const gchar *url_text,
        gsize remain,
        tag_id_t id)
 {
-       struct uri *new;
+       struct rspamd_url *new;
        gchar *url_str;
        const gchar *p, *c;
        gchar tagbuf[128];
@@ -732,12 +732,12 @@ check_phishing (struct rspamd_task *task,
                p++;
        }
 
-       if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str,
+       if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
                TRUE) && url_str != NULL) {
-               new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
+               new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
                if (new != NULL) {
                        g_strstrip (url_str);
-                       rc = parse_uri (new, url_str, task->task_pool);
+                       rc = rspamd_url_parse (new, url_str, task->task_pool);
 
                        if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc ==
                                URI_ERRNO_NO_HOST_SLASH) {
@@ -787,7 +787,7 @@ check_phishing (struct rspamd_task *task,
                        else {
                                msg_info ("extract of url '%s' failed: %s",
                                        url_str,
-                                       url_strerror (rc));
+                                       rspamd_url_strerror (rc));
                        }
                }
        }
@@ -804,7 +804,7 @@ parse_tag_url (struct rspamd_task *task,
 {
        gchar *c = NULL, *p, *url_text;
        gint len, rc;
-       struct uri *url;
+       struct rspamd_url *url;
        gboolean got_single_quote = FALSE, got_double_quote = FALSE;
 
        /* For A tags search for href= and for IMG tags search for src= */
@@ -885,8 +885,8 @@ parse_tag_url (struct rspamd_task *task,
                        return;
                }
 
-               url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri));
-               rc = parse_uri (url, url_text, task->task_pool);
+               url = rspamd_mempool_alloc (task->task_pool, sizeof (struct rspamd_url));
+               rc = rspamd_url_parse (url, url_text, task->task_pool);
 
                if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen !=
                        0) {
index b3feda154632f45d8674cc3f99b3333b24458363..f527ed6fc3944c8113370c6ac9c0a388ce596484 100644 (file)
@@ -492,7 +492,7 @@ static gboolean
 urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 {
        struct tree_cb_data *cb = ud;
-       struct uri *url = value;
+       struct rspamd_url *url = value;
        ucl_object_t *obj, *elt;
 
        if (!cb->task->extended_urls) {
@@ -550,7 +550,7 @@ static gboolean
 emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
 {
        struct tree_cb_data *cb = ud;
-       struct uri *url = value;
+       struct rspamd_url *url = value;
        ucl_object_t *obj;
 
        obj = ucl_object_fromlstring (url->user, url->userlen + url->hostlen + 1);
index 3e4ccc827a9b9f1469b6ed1c4bc21db801b0dc6c..22cb15759e82d6738468a4f8c8543384371e2a38 100644 (file)
@@ -29,6 +29,7 @@
 #include "main.h"
 #include "message.h"
 #include "trie.h"
+#include "http.h"
 
 #define POST_CHAR 1
 #define POST_CHAR_S "\001"
@@ -695,28 +696,6 @@ struct url_match_scanner {
 
 struct url_match_scanner *url_scanner = NULL;
 
-static const struct _proto protocol_backends[] = {
-       {"file", 0, NULL, 1, 0, 0, 0},
-       {"ftp", 21, NULL, 1, 0, 0, 0},
-       {"http", 80, NULL, 1, 0, 0, 0},
-       {"https", 443, NULL, 1, 0, 0, 1},
-       {"mailto", 25, NULL, 1, 0, 0, 0},
-       /* Keep these last! */
-       {NULL, 0, NULL, 0, 0, 1, 0}
-};
-
-/* Convert an ASCII hex digit to the corresponding number between 0
-   and 15.  H should be a hexadecimal digit that satisfies isxdigit;
-   otherwise, the result is undefined.  */
-#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : g_ascii_toupper (h) - 'A' + \
-       10)
-#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2))
-/* The reverse of the above: convert a number in the [0, 16) range to
-   the ASCII representation of the corresponding hexadecimal digit.
-   `+ 0' is there so you can't accidentally use it as an lvalue.  */
-#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0)
-#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0)
-
 static guchar url_scanner_table[256] = {
        1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  1,  1,  9,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
@@ -759,7 +738,7 @@ enum {
 
 
 const gchar *
-url_strerror (enum uri_errno err)
+rspamd_url_strerror (enum uri_errno err)
 {
        switch (err) {
        case URI_ERRNO_OK:
@@ -768,36 +747,16 @@ url_strerror (enum uri_errno err)
                return "The URI string was empty";
        case URI_ERRNO_INVALID_PROTOCOL:
                return "No protocol was found";
-       case URI_ERRNO_NO_SLASHES:
-               return "Slashes after protocol missing";
-       case URI_ERRNO_TOO_MANY_SLASHES:
-               return "Too many slashes after protocol";
-       case URI_ERRNO_TRAILING_DOTS:
-               return "'.' after host";
-       case URI_ERRNO_NO_HOST:
-               return "Host part is missing";
-       case URI_ERRNO_NO_PORT_COLON:
-               return "':' after host without port";
-       case URI_ERRNO_NO_HOST_SLASH:
-               return "Slash after host missing";
-       case URI_ERRNO_IPV6_SECURITY:
-               return "IPv6 security bug detected";
+       case URI_ERRNO_BAD_FORMAT:
+               return "Bad URL format";
+       case URI_ERRNO_BAD_ENCODING:
+               return "Invalid symbols encoded";
        case URI_ERRNO_INVALID_PORT:
                return "Port number is bad";
-       case URI_ERRNO_INVALID_PORT_RANGE:
-               return "Port number is not within 0-65535";
        }
        return NULL;
 }
 
-static gint
-check_uri_file (gchar *name)
-{
-       static const gchar chars[] = POST_CHAR_S "#?";
-
-       return strcspn (name, chars);
-}
-
 static gint
 url_init (void)
 {
@@ -843,590 +802,129 @@ url_init (void)
        return 0;
 }
 
-enum protocol
-get_protocol (gchar *name, gint namelen)
-{
-       /* These are really enum protocol values but can take on negative
-        * values and since 0 <= -1 for enum values it's better to use clean
-        * integer type. */
-       gint start, end;
-       enum protocol protocol;
-       guchar *pname;
-       gint pnamelen, minlen, compare;
-
-       /* Almost dichotomic search is used here */
-       /* Starting at the HTTP entry which is the most common that will make
-        * file and NNTP the next entries checked and amongst the third checks
-        * are proxy and FTP. */
-       start = 0;
-       end = PROTOCOL_UNKNOWN - 1;
-       protocol = PROTOCOL_HTTP;
-
-       while (start <= end) {
-               pname = protocol_backends[protocol].name;
-               pnamelen = strlen (pname);
-               minlen = MIN (pnamelen, namelen);
-               compare = g_ascii_strncasecmp (pname, name, minlen);
-
-               if (compare == 0) {
-                       if (pnamelen == namelen)
-                               return protocol;
-
-                       /* If the current protocol name is longer than the
-                        * protocol name being searched for move @end else move
-                        * @start. */
-                       compare = pnamelen > namelen ? 1 : -1;
-               }
-
-               if (compare > 0)
-                       end = protocol - 1;
-               else
-                       start = protocol + 1;
-
-               protocol = (start + end) / 2;
-       }
-
-       return PROTOCOL_UNKNOWN;
-}
-
-
-gint
-get_protocol_port (enum protocol protocol)
-{
-       return protocol_backends[protocol].port;
-}
-
-gint
-get_protocol_need_slashes (enum protocol protocol)
-{
-       return protocol_backends[protocol].need_slashes;
-}
-
-gint
-get_protocol_need_slash_after_host (enum protocol protocol)
-{
-       return protocol_backends[protocol].need_slash_after_host;
-}
-
-gint
-get_protocol_free_syntax (enum protocol protocol)
-{
-       return protocol_backends[protocol].free_syntax;
-}
-
-static gint
-get_protocol_length (const gchar *url)
-{
-       gchar *end = (gchar *)url;
-
-       /* Seek the end of the protocol name if any. */
-       /* RFC1738:
-        * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
-        * (but per its recommendations we accept "upalpha" too) */
-       while (*end && (g_ascii_isalnum (*end) || *end == '+'
-                       || *end == '-' || *end == '.')) {
-               end++;
-       }
-
-       /* Also return 0 if there's no protocol name (@end == @url). */
-       return (*end == ':') ? end - url : 0;
-}
-
-
-/*
- * Calcualte new length of unescaped hostlen
- */
-static guint
-url_calculate_escaped_hostlen (gchar *host, guint hostlen)
-{
-       guint i, result = hostlen;
-       gchar *p = host, c;
-
-       for (i = 0; i < hostlen; i++, p++) {
-               if (*p == '%' && g_ascii_isxdigit (*(p + 1)) &&
-                       g_ascii_isxdigit (*(p + 2)) && i < hostlen - 2) {
-                       c = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
-                       if (c != '\0') {
-                               result -= 2;
-                       }
-               }
-       }
-
-       return result;
-}
-
-void
-rspamd_url_unescape (gchar *s)
-{
-       gchar *t = s;                           /* t - tortoise */
-       gchar *h = s;                           /* h - hare     */
-
-       for (; *h; h++, t++) {
-               if (*h != '%') {
-                       *t = *h;
-               }
-               else {
-                       gchar c;
-                       if (!h[1] || !h[2] ||
-                               !(g_ascii_isxdigit (h[1]) && g_ascii_isxdigit (h[2]))) {
-                               *t = *h;
-                       }
-                       else {
-                               c = X2DIGITS_TO_NUM (h[1], h[2]);
-                               if (c != '\0') {
-                                       *t = c;
-                                       h += 2;
-                               }
-                               else {
-                                       *t = *h;
-                               }
-                       }
-               }
-       }
-       *t = '\0';
-}
-
-static void
-url_strip (gchar *s)
-{
-       gchar *t = s;                           /* t - tortoise */
-       gchar *h = s;                           /* h - hare     */
-
-       while (*h) {
-               if (g_ascii_isgraph (*h)) {
-                       *t = *h;
-                       t++;
-               }
-               h++;
-       }
-       *t = '\0';
-}
-
-static gchar *
-url_escape_1 (const gchar *s, gint allow_passthrough, rspamd_mempool_t * pool)
-{
-       const gchar *p1;
-       gchar *p2, *newstr;
-       gint newlen;
-       gint addition = 0;
-
-       for (p1 = s; *p1; p1++)
-               if (!is_urlsafe (*p1)) {
-                       addition += 2;      /* Two more characters (hex digits) */
-               }
-
-       if (!addition) {
-               if (allow_passthrough) {
-                       return (gchar *)s;
-               }
-               else {
-                       return rspamd_mempool_strdup (pool, s);
-               }
-       }
-
-       newlen = (p1 - s) + addition;
-       newstr = (gchar *)rspamd_mempool_alloc (pool, newlen + 1);
-
-       p1 = s;
-       p2 = newstr;
-       while (*p1) {
-               /* Quote the characters that match the test mask. */
-               if (!is_urlsafe (*p1)) {
-                       guchar c = *p1++;
-                       *p2++ = '%';
-                       *p2++ = XNUM_TO_DIGIT (c >> 4);
-                       *p2++ = XNUM_TO_DIGIT (c & 0xf);
-               }
-               else
-                       *p2++ = *p1++;
-       }
-       *p2 = '\0';
-
-       return newstr;
-}
-
-/* URL-escape the unsafe characters (see urlchr_table) in a given
-   string, returning a freshly allocated string.  */
-
-gchar *
-url_escape (const gchar *s, rspamd_mempool_t * pool)
-{
-       return url_escape_1 (s, 0, pool);
-}
-
-/* Decide whether the gchar at position P needs to be encoded.  (It is
-   not enough to pass a single gchar *P because the function may need
-   to inspect the surrounding context.)
-
-   Return 1 if the gchar should be escaped as %XX, 0 otherwise.  */
-
-static inline gboolean
-char_needs_escaping (const gchar *p)
-{
-       if (*p == '%') {
-               if (g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2))) {
-                       return FALSE;
-               }
-               else {
-                       return TRUE;
-               }
-       }
-       else if (!is_urlsafe (*p)) {
-               return TRUE;
-       }
-       return FALSE;
-}
-
-static gchar *
-rspamd_url_reencode_escapes (gchar *s, rspamd_mempool_t * pool)
-{
-       const gchar *p1;
-       gchar *newstr, *p2;
-       gint oldlen, newlen;
-
-       gint encode_count = 0;
-
-       /* First pass: inspect the string to see if there's anything to do,
-          and to calculate the new length.  */
-       for (p1 = s; *p1; p1++) {
-               if (char_needs_escaping (p1)) {
-                       ++encode_count;
-               }
-       }
-
-       if (!encode_count) {
-               /* The string is good as it is. */
-               return s;
-       }
-
-       oldlen = p1 - s;
-       /* Each encoding adds two characters (hex digits).  */
-       newlen = oldlen + 2 * encode_count;
-       newstr = rspamd_mempool_alloc (pool, newlen + 1);
-
-       /* Second pass: copy the string to the destination address, encoding
-          chars when needed.  */
-       p1 = s;
-       p2 = newstr;
-
-       while (*p1) {
-               if (char_needs_escaping (p1)) {
-                       guchar c = *p1++;
-                       *p2++ = '%';
-                       *p2++ = XNUM_TO_DIGIT (c >> 4);
-                       *p2++ = XNUM_TO_DIGIT (c & 0xf);
-               }
-               else {
-                       *p2++ = *p1++;
-               }
-       }
-
-       *p2 = '\0';
-       return newstr;
-}
-
-/*
- * Resolve "." and ".." elements of PATH by destructively modifying
- * PATH and return non-zero if PATH has been modified, zero otherwise.
- */
-
-static gboolean
-path_simplify (gchar *path)
-{
-       gchar *h = path;                            /* hare */
-       gchar *t = path;                            /* tortoise */
-       gchar *beg = path;                              /* boundary for backing the tortoise */
-       gchar *end = path + strlen (path);
-
-       while (h < end) {
-               /* Hare should be at the beginning of a path element. */
-               if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) {
-                       /* Ignore "./". */
-                       h += 2;
-               }
-               else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) {
-                       /* Handle "../" by retreating the tortoise by one path
-                          element -- but not past beginning.  */
-                       if (t > beg) {
-                               /* Move backwards until T hits the beginning of the
-                                  previous path element or the beginning of path. */
-                               for (--t; t > beg && t[-1] != '/'; t--) ;
-                       }
-                       else {
-                               /* If we're at the beginning, copy the "../" literally
-                                  move the beginning so a later ".." doesn't remove
-                                  it.  */
-                               beg = t + 3;
-                               goto regular;
-                       }
-                       h += 3;
-               }
-               else {
-regular:
-                       /* A regular path element.  If H hasn't advanced past T,
-                          simply skip to the next path element.  Otherwise, copy
-                          the path element until the next slash.  */
-                       if (t == h) {
-                               /* Skip the path element, including the slash.  */
-                               while (h < end && *h != '/')
-                                       t++, h++;
-                               if (h < end)
-                                       t++, h++;
-                       }
-                       else {
-                               /* Copy the path element, including the final slash.  */
-                               while (h < end && *h != '/')
-                                       *t++ = *h++;
-                               if (h < end)
-                                       *t++ = *h++;
-                       }
-               }
-       }
-
-       if (t != h)
-               *t = '\0';
-
-       return t != h;
-}
 
 enum uri_errno
-parse_uri (struct uri *uri, gchar *uristring, rspamd_mempool_t * pool)
+rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
+               rspamd_mempool_t *pool)
 {
-       guchar *prefix_end, *host_end, *p;
-       guchar *lbracket, *rbracket;
-       gint datalen, n, addrlen;
-       guchar *frag_or_post, *user_end, *port_end;
+       struct http_parser_url u;
+       gchar *p, *comp;
+       gint i, complen;
+
+       const struct {
+               enum rspamd_url_protocol proto;
+               const gchar *name;
+               gsize len;
+       } protocols[] = {
+               {
+                       .proto = PROTOCOL_FILE,
+                       .name = "file",
+                       .len = 4
+               },
+               {
+                       .proto = PROTOCOL_FTP,
+                       .name = "ftp",
+                       .len = 3
+               },
+               {
+                       .proto = PROTOCOL_HTTP,
+                       .name = "http",
+                       .len = 4
+               },
+               {
+                       .proto = PROTOCOL_HTTPS,
+                       .name = "https",
+                       .len = 5
+               },
+               {
+                       .proto = PROTOCOL_MAILTO,
+                       .name = "mailto",
+                       .len = 6
+               },
+               {
+                       .proto = PROTOCOL_UNKNOWN,
+                       .name = NULL,
+                       .len = 0
+               }
+       };
 
        memset (uri, 0, sizeof (*uri));
 
-       if (!*uristring) {
+       if (*uristring == '\0') {
                return URI_ERRNO_EMPTY;
        }
 
-       uri->string = rspamd_url_reencode_escapes (uristring, pool);
-       msg_debug ("reencoding escapes in original url: '%s'", struri (uri));
-       uri->protocollen = get_protocol_length (struri (uri));
-
-       /* Assume http as default protocol */
-       if (!uri->protocollen ||
-               (uri->protocol =
-               get_protocol (struri (uri), uri->protocollen)) == PROTOCOL_UNKNOWN) {
-               /* Make exception for numeric urls */
-               p = uri->string;
-               while (*p && (g_ascii_isalnum (*p) || *p == ':')) {
-                       p++;
-               }
-               if (*p == '\0') {
-                       return URI_ERRNO_INVALID_PROTOCOL;
-               }
-               p = g_strconcat ("http://", uri->string, NULL);
-               uri->string = rspamd_mempool_strdup (pool, p);
-               g_free (p);
-               uri->protocol = PROTOCOL_HTTP;
-               prefix_end = struri (uri) + 7;
+       p = g_uri_unescape_string (uristring, NULL);
+       if (p == NULL) {
+               return URI_ERRNO_BAD_ENCODING;
        }
-       else {
-               /* Figure out whether the protocol is known */
-               msg_debug ("getting protocol from url: %d", uri->protocol);
-
-               prefix_end = struri (uri) + uri->protocollen;   /* ':' */
 
-               /* Check if there's a digit after the protocol name. */
-               if (g_ascii_isdigit (*prefix_end)) {
-                       p = struri (uri);
-                       uri->ip_family = p[uri->protocollen] - '0';
-                       prefix_end++;
-               }
-               if (*prefix_end != ':') {
-                       msg_debug ("invalid protocol in uri");
-                       return URI_ERRNO_INVALID_PROTOCOL;
-               }
-               prefix_end++;
+       uri->string = p;
 
-               /* Skip slashes */
+       rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p);
 
-               if (prefix_end[0] == '/' && prefix_end[1] == '/') {
-                       if (prefix_end[2] == '/') {
-                               msg_debug ("too many '/' in uri");
-                               return URI_ERRNO_TOO_MANY_SLASHES;
-                       }
-
-                       prefix_end += 2;
-
-               }
-               else {
-                       msg_debug ("no '/' in uri");
-                       return URI_ERRNO_NO_SLASHES;
-               }
+       /*
+        * We assume here that urls has the sane scheme
+        */
+       if (http_parser_parse_url (p, len, 0, &u) != 0) {
+               return URI_ERRNO_BAD_FORMAT;
        }
 
-       if (get_protocol_free_syntax (uri->protocol)) {
-               uri->data = prefix_end;
-               uri->datalen = strlen (prefix_end);
-               return URI_ERRNO_OK;
-
-       }
-       else if (uri->protocol == PROTOCOL_FILE) {
-               datalen = check_uri_file (prefix_end);
-               frag_or_post = prefix_end + datalen;
-
-               /* Extract the fragment part. */
-               if (datalen >= 0) {
-                       if (*frag_or_post == '#') {
-                               uri->fragment = frag_or_post + 1;
-                               uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
-                               frag_or_post = uri->fragment + uri->fragmentlen;
-                       }
-                       if (*frag_or_post == POST_CHAR) {
-                               uri->post = frag_or_post + 1;
+       for (i = 0; i < UF_MAX; i ++) {
+               if (u.field_set & (1 << i)) {
+                       comp = p + u.field_data[i].off;
+                       complen = u.field_data[i].len;
+                       switch (i) {
+                       case UF_SCHEMA:
+                               uri->protocollen = u.field_data[i].len;
+                               break;
+                       case UF_HOST:
+                               uri->host = comp;
+                               uri->hostlen = complen;
+                               break;
+                       case UF_PATH:
+                               uri->data = comp;
+                               uri->datalen = complen;
+                               break;
+                       case UF_QUERY:
+                               uri->query = comp;
+                               uri->querylen = complen;
+                               break;
+                       case UF_FRAGMENT:
+                               uri->fragment = comp;
+                               uri->fragmentlen = complen;
+                               break;
+                       case UF_USERINFO:
+                               uri->user = comp;
+                               uri->userlen = complen;
+                               break;
+                       default:
+                               break;
                        }
                }
-               else {
-                       datalen = strlen (prefix_end);
-               }
-
-               uri->data = prefix_end;
-               uri->datalen = datalen;
-
-               return URI_ERRNO_OK;
-       }
-
-       /* Isolate host */
-
-       /* Get brackets enclosing IPv6 address */
-       lbracket = strchr (prefix_end, '[');
-       if (lbracket) {
-               rbracket = strchr (lbracket, ']');
-               /* [address] is handled only inside of hostname part (surprisingly). */
-               if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
-                       uri->ipv6 = 1;
-               else
-                       lbracket = rbracket = NULL;
-       }
-       else {
-               rbracket = NULL;
-       }
-
-       /* Possibly skip auth part */
-       host_end = prefix_end + strcspn (prefix_end, "@");
-
-       if (prefix_end + strcspn (prefix_end, "/?") > host_end && *host_end) {  /* we have auth info here */
-
-               /* Allow '@' in the password component */
-               while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
-                       host_end = host_end + 1 + strcspn (host_end + 1, "@");
-
-               user_end = strchr (prefix_end, ':');
-
-               if (!user_end || user_end > host_end) {
-                       uri->user = prefix_end;
-                       uri->userlen = host_end - prefix_end;
-               }
-               else {
-                       uri->user = prefix_end;
-                       uri->userlen = user_end - prefix_end;
-                       uri->password = user_end + 1;
-                       uri->passwordlen = host_end - user_end - 1;
-               }
-               prefix_end = host_end + 1;
-       }
-
-       if (uri->ipv6 && rbracket != NULL) {
-               host_end = rbracket + strcspn (rbracket, ":/?");
-       }
-       else {
-               host_end = prefix_end + strcspn (prefix_end, ":/?");
-       }
-
-       if (uri->ipv6) {
-               addrlen = rbracket - lbracket - 1;
-
-
-               uri->host = lbracket + 1;
-               uri->hostlen = addrlen;
        }
-       else {
-               uri->host = prefix_end;
-               uri->hostlen = host_end - prefix_end;
 
-               /* Trim trailing '.'s */
-               if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
-                       return URI_ERRNO_TRAILING_DOTS;
+       if (!uri->hostlen) {
+               return URI_ERRNO_BAD_FORMAT;
        }
 
-       if (*host_end == ':') {     /* we have port here */
-               port_end = host_end + 1 + strcspn (host_end + 1, "/");
-
-               host_end++;
-
-               uri->port = host_end;
-               uri->portlen = port_end - host_end;
-
-               if (uri->portlen == 0)
-                       return URI_ERRNO_NO_PORT_COLON;
-
-               /* We only use 8 bits for portlen so better check */
-               if ((gint)uri->portlen != port_end - host_end)
-                       return URI_ERRNO_INVALID_PORT;
-
-               /* test if port is number */
-               for (; host_end < port_end; host_end++)
-                       if (!g_ascii_isdigit (*host_end))
-                               return URI_ERRNO_INVALID_PORT;
+       rspamd_str_lc (uri->string, uri->protocollen);
+       rspamd_str_lc (uri->host,   uri->hostlen);
 
-               /* Check valid port value, and let show an error message
-                * about invalid url syntax. */
-               if (uri->port && uri->portlen) {
+       uri->protocol = PROTOCOL_UNKNOWN;
 
-                       errno = 0;
-                       n = strtol (uri->port, NULL, 10);
-                       if (errno || !uri_port_is_valid (n))
-                               return URI_ERRNO_INVALID_PORT;
+       for (i = 0; i < G_N_ELEMENTS (protocols); i ++) {
+               if (uri->protocollen == protocols[i].len) {
+                       if (memcmp (uri->string, protocols[i].name, uri->protocollen) == 0) {
+                               uri->protocol = i;
+                               break;
+                       }
                }
        }
 
-       if (*host_end == '/') {
-               host_end++;
+       if (uri->protocol == PROTOCOL_UNKNOWN) {
+               return URI_ERRNO_INVALID_PROTOCOL;
        }
-       else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end !=
-               '?') {
-               /* The need for slash after the host component depends on the
-                * need for a host component. -- The dangerous mind of Jonah */
-               if (!uri->hostlen)
-                       return URI_ERRNO_NO_HOST;
-
-               return URI_ERRNO_NO_HOST_SLASH;
-       }
-
-       /* Look for #fragment or POST_CHAR */
-       prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
-       uri->data = host_end;
-       uri->datalen = prefix_end - host_end;
-
-       if (*prefix_end == '#') {
-               uri->fragment = prefix_end + 1;
-               uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
-               prefix_end = uri->fragment + uri->fragmentlen;
-       }
-
-       if (*prefix_end == POST_CHAR) {
-               uri->post = prefix_end + 1;
-       }
-
-       rspamd_str_lc (uri->string, uri->protocollen);
-       rspamd_str_lc (uri->host,   uri->hostlen);
-       /* Decode %HH sequences in host name.  This is important not so much
-          to support %HH sequences in host names (which other browser
-          don't), but to support binary characters (which will have been
-          converted to %HH by reencode_escapes).  */
-       if (strchr (uri->host, '%')) {
-               uri->hostlen = url_calculate_escaped_hostlen (uri->host, uri->hostlen);
-       }
-
-       url_strip (struri (uri));
-       rspamd_url_unescape (uri->host);
-
-       path_simplify (uri->data);
 
        return URI_ERRNO_OK;
 }
@@ -1821,14 +1319,14 @@ url_email_end (const gchar *begin,
 }
 
 void
-url_parse_text (rspamd_mempool_t * pool,
+rspamd_url_text_extract (rspamd_mempool_t * pool,
        struct rspamd_task *task,
        struct mime_text_part *part,
        gboolean is_html)
 {
        gint rc;
        gchar *url_str = NULL, *url_start, *url_end;
-       struct uri *new;
+       struct rspamd_url *new;
        struct process_exception *ex;
        gchar *p, *end, *begin;
 
@@ -1843,18 +1341,17 @@ url_parse_text (rspamd_mempool_t * pool,
                end = begin + part->content->len;
                p = begin;
                while (p < end) {
-                       if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str,
+                       if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
                                is_html)) {
                                if (url_str != NULL) {
-                                       new = rspamd_mempool_alloc0 (pool, sizeof (struct uri));
+                                       new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
                                        ex =
                                                rspamd_mempool_alloc0 (pool,
                                                        sizeof (struct process_exception));
                                        if (new != NULL) {
                                                g_strstrip (url_str);
-                                               rc = parse_uri (new, url_str, pool);
-                                               if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
-                                                       rc == URI_ERRNO_NO_HOST_SLASH) &&
+                                               rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
+                                               if (rc == URI_ERRNO_OK &&
                                                        new->hostlen > 0) {
                                                        ex->pos = url_start - begin;
                                                        ex->len = url_end - url_start;
@@ -1877,7 +1374,7 @@ url_parse_text (rspamd_mempool_t * pool,
                                                else if (rc != URI_ERRNO_OK) {
                                                        msg_info ("extract of url '%s' failed: %s",
                                                                url_str,
-                                                               url_strerror (rc));
+                                                               rspamd_url_strerror (rc));
                                                }
                                        }
                                }
@@ -1897,7 +1394,7 @@ url_parse_text (rspamd_mempool_t * pool,
 }
 
 gboolean
-url_try_text (rspamd_mempool_t *pool,
+rspamd_url_find (rspamd_mempool_t *pool,
        const gchar *begin,
        gsize len,
        gchar **start,
index c9700436b3c9b3805baa68bb6d484042b7ded0e2..db3a3472c9013865fcacdc1d26d909e16e646a2a 100644 (file)
@@ -8,12 +8,9 @@
 struct rspamd_task;
 struct mime_text_part;
 
-struct uri {
-       /* The start of the uri (and thus start of the protocol string). */
+struct rspamd_url {
        gchar *string;
-
-       /* The internal type of protocol. Can _never_ be PROTOCOL_UNKNOWN. */
-       gint protocol; /* enum protocol */
+       gint protocol;
 
        gint ip_family;
 
@@ -22,20 +19,20 @@ struct uri {
        gchar *host;
        gchar *port;
        gchar *data;
+       gchar *query;
        gchar *fragment;
        gchar *post;
        gchar *surbl;
 
-       struct uri *phished_url;
+       struct rspamd_url *phished_url;
 
-       /* @protocollen should only be usable if @protocol is either
-        * PROTOCOL_USER or an uri string should be composed. */
        guint protocollen;
        guint userlen;
        guint passwordlen;
        guint hostlen;
        guint portlen;
        guint datalen;
+       guint querylen;
        guint fragmentlen;
        guint surbllen;
 
@@ -46,22 +43,16 @@ struct uri {
 };
 
 enum uri_errno {
-       URI_ERRNO_OK,           /* Parsing went well */
+       URI_ERRNO_OK = 0,           /* Parsing went well */
        URI_ERRNO_EMPTY,        /* The URI string was empty */
        URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
-       URI_ERRNO_NO_SLASHES,       /* Slashes after protocol missing */
-       URI_ERRNO_TOO_MANY_SLASHES, /* Too many slashes after protocol */
-       URI_ERRNO_TRAILING_DOTS,    /* '.' after host */
-       URI_ERRNO_NO_HOST,      /* Host part is missing */
-       URI_ERRNO_NO_PORT_COLON,    /* ':' after host without port */
-       URI_ERRNO_NO_HOST_SLASH,    /* Slash after host missing */
-       URI_ERRNO_IPV6_SECURITY,    /* IPv6 security bug detected */
        URI_ERRNO_INVALID_PORT,     /* Port number is bad */
-       URI_ERRNO_INVALID_PORT_RANGE    /* Port number is not within 0-65535 */
+       URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */
+       URI_ERRNO_BAD_FORMAT
 };
 
-enum protocol {
-       PROTOCOL_FILE,
+enum rspamd_url_protocol {
+       PROTOCOL_FILE = 0,
        PROTOCOL_FTP,
        PROTOCOL_HTTP,
        PROTOCOL_HTTPS,
@@ -78,7 +69,7 @@ enum protocol {
  * @param part current text part
  * @param is_html turn on html euristic
  */
-void url_parse_text (rspamd_mempool_t *pool,
+void rspamd_url_text_extract (rspamd_mempool_t *pool,
        struct rspamd_task *task,
        struct mime_text_part *part,
        gboolean is_html);
@@ -89,8 +80,9 @@ void url_parse_text (rspamd_mempool_t *pool,
  * @param uristring text form of url
  * @param uri url object, must be pre allocated
  */
-enum uri_errno parse_uri (struct uri *uri,
+enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
        gchar *uristring,
+       gsize len,
        rspamd_mempool_t *pool);
 
 /*
@@ -103,7 +95,7 @@ enum uri_errno parse_uri (struct uri *uri,
  * @param url_str storage for url string(or NULL)
  * @return TRUE if url is found in specified text
  */
-gboolean url_try_text (rspamd_mempool_t *pool,
+gboolean rspamd_url_find (rspamd_mempool_t *pool,
        const gchar *begin,
        gsize len,
        gchar **start,
@@ -114,7 +106,7 @@ gboolean url_try_text (rspamd_mempool_t *pool,
 /*
  * Return text representation of url parsing error
  */
-const gchar * url_strerror (enum uri_errno err);
+const gchar * rspamd_url_strerror (enum uri_errno err);
 
 /*
  * URL unescape characters in the specified string
index b45a684551da8949086e07c0897cd60248c607aa..932246542ac880fc6609991f86d73281ef326f66 100644 (file)
@@ -1427,7 +1427,7 @@ rspamd_strlcpy_tolower (gchar *dst, const gchar *src, gsize siz)
 gint
 rspamd_emails_cmp (gconstpointer a, gconstpointer b)
 {
-       const struct uri *u1 = a, *u2 = b;
+       const struct rspamd_url *u1 = a, *u2 = b;
        gint r;
 
        if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
@@ -1453,7 +1453,7 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
 gint
 rspamd_urls_cmp (gconstpointer a, gconstpointer b)
 {
-       const struct uri *u1 = a, *u2 = b;
+       const struct rspamd_url *u1 = a, *u2 = b;
        int r;
 
        if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
index b1a20647686ceb86041f60bfeff987e4afdd1409..3c92c7336a20df8cb97eb7c8ea35b24162df3936 100644 (file)
@@ -695,12 +695,12 @@ lua_check_image (lua_State * L)
        return ud ? *((struct rspamd_image **)ud) : NULL;
 }
 
-static struct uri *
+static struct rspamd_url *
 lua_check_url (lua_State * L)
 {
        void *ud = luaL_checkudata (L, 1, "rspamd{url}");
        luaL_argcheck (L, ud != NULL, 1, "'url' expected");
-       return ud ? *((struct uri **)ud) : NULL;
+       return ud ? *((struct rspamd_url **)ud) : NULL;
 }
 
 static int
@@ -934,10 +934,10 @@ struct lua_tree_cb_data {
 static gboolean
 lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
 {
-       struct uri **purl;
+       struct rspamd_url **purl;
        struct lua_tree_cb_data *cb = ud;
 
-       purl = lua_newuserdata (cb->L, sizeof (struct uri *));
+       purl = lua_newuserdata (cb->L, sizeof (struct rspamd_url *));
        rspamd_lua_setclass (cb->L, "rspamd{url}", -1);
        *purl = value;
        lua_rawseti (cb->L, -2, cb->i++);
@@ -2376,7 +2376,7 @@ lua_image_get_filename (lua_State *L)
 static gint
 lua_url_get_length (lua_State *L)
 {
-       struct uri *url = lua_check_url (L);
+       struct rspamd_url *url = lua_check_url (L);
 
        if (url != NULL) {
                lua_pushinteger (L, strlen (struri (url)));
@@ -2390,7 +2390,7 @@ lua_url_get_length (lua_State *L)
 static gint
 lua_url_get_host (lua_State *L)
 {
-       struct uri *url = lua_check_url (L);
+       struct rspamd_url *url = lua_check_url (L);
 
        if (url != NULL) {
                lua_pushlstring (L, url->host, url->hostlen);
@@ -2404,7 +2404,7 @@ lua_url_get_host (lua_State *L)
 static gint
 lua_url_get_user (lua_State *L)
 {
-       struct uri *url = lua_check_url (L);
+       struct rspamd_url *url = lua_check_url (L);
 
        if (url != NULL && url->user != NULL) {
                lua_pushlstring (L, url->user, url->userlen);
@@ -2419,7 +2419,7 @@ lua_url_get_user (lua_State *L)
 static gint
 lua_url_get_path (lua_State *L)
 {
-       struct uri *url = lua_check_url (L);
+       struct rspamd_url *url = lua_check_url (L);
 
        if (url != NULL) {
                lua_pushlstring (L, url->data, url->datalen);
@@ -2434,7 +2434,7 @@ lua_url_get_path (lua_State *L)
 static gint
 lua_url_get_text (lua_State *L)
 {
-       struct uri *url = lua_check_url (L);
+       struct rspamd_url *url = lua_check_url (L);
 
        if (url != NULL) {
                lua_pushstring (L, struri (url));
@@ -2449,7 +2449,7 @@ lua_url_get_text (lua_State *L)
 static gint
 lua_url_is_phished (lua_State *L)
 {
-       struct uri *url = lua_check_url (L);
+       struct rspamd_url *url = lua_check_url (L);
 
        if (url != NULL) {
                lua_pushboolean (L, url->is_phished);
@@ -2464,11 +2464,11 @@ lua_url_is_phished (lua_State *L)
 static gint
 lua_url_get_phished (lua_State *L)
 {
-       struct uri **purl, *url = lua_check_url (L);
+       struct rspamd_url **purl, *url = lua_check_url (L);
 
        if (url) {
                if (url->is_phished && url->phished_url != NULL) {
-                       purl = lua_newuserdata (L, sizeof (struct uri *));
+                       purl = lua_newuserdata (L, sizeof (struct rspamd_url *));
                        rspamd_lua_setclass (L, "rspamd{url}", -1);
                        *purl = url->phished_url;
 
index 39b8af1ec678b9e3da83171d2c5045327ca3a6f3..b1cc6e890bc4c809388ca3a7243901db38aa51fd 100644 (file)
@@ -378,7 +378,7 @@ static gboolean
 tree_url_callback (gpointer key, gpointer value, void *data)
 {
        struct url_regexp_param *param = data;
-       struct uri *url = value;
+       struct rspamd_url *url = value;
        GError *err = NULL;
 
        if (g_regex_match_full (param->regexp, struri (url), -1, 0, 0, NULL,
index 2fe35031aedca8397b871d31051a086d3b6ce142..201a6df18bfdc1510ad85444021894be558a91ab 100644 (file)
@@ -577,7 +577,7 @@ format_surbl_request (rspamd_mempool_t * pool,
        GError ** err,
        gboolean forced,
        GTree *tree,
-       struct uri *url)
+       struct rspamd_url *url)
 {
        GHashTable *t;
        gchar *result = NULL, *dots[MAX_LEVELS],
@@ -754,7 +754,7 @@ format_surbl_request (rspamd_mempool_t * pool,
 }
 
 static void
-make_surbl_requests (struct uri *url, struct rspamd_task *task,
+make_surbl_requests (struct rspamd_url *url, struct rspamd_task *task,
        struct suffix_item *suffix, gboolean forced, GTree *tree)
 {
        gchar *surbl_req;
@@ -954,7 +954,7 @@ redirector_callback (gint fd, short what, void *arg)
                                                struri (param->url),
                                                c);
                                        r =
-                                               parse_uri (param->url,
+                                               rspamd_url_parse (param->url,
                                                        rspamd_mempool_strdup (param->task->task_pool,
                                                        c), param->task->task_pool);
                                        if (r == URI_ERRNO_OK || r == URI_ERRNO_NO_SLASHES || r ==
@@ -986,7 +986,7 @@ redirector_callback (gint fd, short what, void *arg)
 
 
 static void
-register_redirector_call (struct uri *url, struct rspamd_task *task,
+register_redirector_call (struct rspamd_url *url, struct rspamd_task *task,
        struct suffix_item *suffix, const gchar *rule, GTree *tree)
 {
        gint s = -1;
@@ -1043,7 +1043,7 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
 {
        struct redirector_param *param = data;
        struct rspamd_task *task;
-       struct uri *url = value;
+       struct rspamd_url *url = value;
        gchar *red_domain;
        const gchar *pos;
        GRegex *re;
@@ -1135,7 +1135,7 @@ static gboolean
 calculate_buflen_cb (gpointer key, gpointer value, gpointer cbdata)
 {
        struct urls_tree_cb_data *cb = cbdata;
-       struct uri *url = value;
+       struct rspamd_url *url = value;
 
        cb->len += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1;
 
@@ -1146,7 +1146,7 @@ static gboolean
 write_urls_buffer (gpointer key, gpointer value, gpointer cbdata)
 {
        struct urls_tree_cb_data *cb = cbdata;
-       struct uri *url = value;
+       struct rspamd_url *url = value;
        rspamd_fstring_t f;
        gchar *urlstr;
        gsize len;
index 7701c8304cd7ac44e48bf574a8a18b60cf8e6f49..959a730ded3e78135106b368903c0124c78179a5 100644 (file)
@@ -46,14 +46,14 @@ struct suffix_item {
 };
 
 struct dns_param {
-       struct uri *url;
+       struct rspamd_url *url;
        struct rspamd_task *task;
        gchar *host_resolve;
        struct suffix_item *suffix;
 };
 
 struct redirector_param {
-       struct uri *url;
+       struct rspamd_url *url;
        struct rspamd_task *task;
        struct upstream *redirector;
        enum {