diff options
Diffstat (limited to 'src/libserver/url.c')
-rw-r--r-- | src/libserver/url.c | 723 |
1 files changed, 110 insertions, 613 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c index 3e4ccc827..22cb15759 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -29,6 +29,7 @@ #include "main.h" #include "message.h" #include "trie.h" +#include "http.h" #define POST_CHAR 1 #define POST_CHAR_S "\001" @@ -695,28 +696,6 @@ struct url_match_scanner { struct url_match_scanner *url_scanner = NULL; -static const struct _proto protocol_backends[] = { - {"file", 0, NULL, 1, 0, 0, 0}, - {"ftp", 21, NULL, 1, 0, 0, 0}, - {"http", 80, NULL, 1, 0, 0, 0}, - {"https", 443, NULL, 1, 0, 0, 1}, - {"mailto", 25, NULL, 1, 0, 0, 0}, - /* Keep these last! */ - {NULL, 0, NULL, 0, 0, 1, 0} -}; - -/* Convert an ASCII hex digit to the corresponding number between 0 - and 15. H should be a hexadecimal digit that satisfies isxdigit; - otherwise, the result is undefined. */ -#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : g_ascii_toupper (h) - 'A' + \ - 10) -#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2)) -/* The reverse of the above: convert a number in the [0, 16) range to - the ASCII representation of the corresponding hexadecimal digit. - `+ 0' is there so you can't accidentally use it as an lvalue. */ -#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0) -#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0) - static guchar url_scanner_table[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -759,7 +738,7 @@ enum { const gchar * -url_strerror (enum uri_errno err) +rspamd_url_strerror (enum uri_errno err) { switch (err) { case URI_ERRNO_OK: @@ -768,37 +747,17 @@ url_strerror (enum uri_errno err) return "The URI string was empty"; case URI_ERRNO_INVALID_PROTOCOL: return "No protocol was found"; - case URI_ERRNO_NO_SLASHES: - return "Slashes after protocol missing"; - case URI_ERRNO_TOO_MANY_SLASHES: - return "Too many slashes after protocol"; - case URI_ERRNO_TRAILING_DOTS: - return "'.' after host"; - case URI_ERRNO_NO_HOST: - return "Host part is missing"; - case URI_ERRNO_NO_PORT_COLON: - return "':' after host without port"; - case URI_ERRNO_NO_HOST_SLASH: - return "Slash after host missing"; - case URI_ERRNO_IPV6_SECURITY: - return "IPv6 security bug detected"; + case URI_ERRNO_BAD_FORMAT: + return "Bad URL format"; + case URI_ERRNO_BAD_ENCODING: + return "Invalid symbols encoded"; case URI_ERRNO_INVALID_PORT: return "Port number is bad"; - case URI_ERRNO_INVALID_PORT_RANGE: - return "Port number is not within 0-65535"; } return NULL; } static gint -check_uri_file (gchar *name) -{ - static const gchar chars[] = POST_CHAR_S "#?"; - - return strcspn (name, chars); -} - -static gint url_init (void) { guint i; @@ -843,590 +802,129 @@ url_init (void) return 0; } -enum protocol -get_protocol (gchar *name, gint namelen) -{ - /* These are really enum protocol values but can take on negative - * values and since 0 <= -1 for enum values it's better to use clean - * integer type. */ - gint start, end; - enum protocol protocol; - guchar *pname; - gint pnamelen, minlen, compare; - - /* Almost dichotomic search is used here */ - /* Starting at the HTTP entry which is the most common that will make - * file and NNTP the next entries checked and amongst the third checks - * are proxy and FTP. */ - start = 0; - end = PROTOCOL_UNKNOWN - 1; - protocol = PROTOCOL_HTTP; - - while (start <= end) { - pname = protocol_backends[protocol].name; - pnamelen = strlen (pname); - minlen = MIN (pnamelen, namelen); - compare = g_ascii_strncasecmp (pname, name, minlen); - - if (compare == 0) { - if (pnamelen == namelen) - return protocol; - - /* If the current protocol name is longer than the - * protocol name being searched for move @end else move - * @start. */ - compare = pnamelen > namelen ? 1 : -1; - } - - if (compare > 0) - end = protocol - 1; - else - start = protocol + 1; - - protocol = (start + end) / 2; - } - - return PROTOCOL_UNKNOWN; -} - - -gint -get_protocol_port (enum protocol protocol) -{ - return protocol_backends[protocol].port; -} - -gint -get_protocol_need_slashes (enum protocol protocol) -{ - return protocol_backends[protocol].need_slashes; -} - -gint -get_protocol_need_slash_after_host (enum protocol protocol) -{ - return protocol_backends[protocol].need_slash_after_host; -} - -gint -get_protocol_free_syntax (enum protocol protocol) -{ - return protocol_backends[protocol].free_syntax; -} - -static gint -get_protocol_length (const gchar *url) -{ - gchar *end = (gchar *)url; - - /* Seek the end of the protocol name if any. */ - /* RFC1738: - * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] - * (but per its recommendations we accept "upalpha" too) */ - while (*end && (g_ascii_isalnum (*end) || *end == '+' - || *end == '-' || *end == '.')) { - end++; - } - - /* Also return 0 if there's no protocol name (@end == @url). */ - return (*end == ':') ? end - url : 0; -} - - -/* - * Calcualte new length of unescaped hostlen - */ -static guint -url_calculate_escaped_hostlen (gchar *host, guint hostlen) -{ - guint i, result = hostlen; - gchar *p = host, c; - - for (i = 0; i < hostlen; i++, p++) { - if (*p == '%' && g_ascii_isxdigit (*(p + 1)) && - g_ascii_isxdigit (*(p + 2)) && i < hostlen - 2) { - c = X2DIGITS_TO_NUM (*(p + 1), *(p + 2)); - if (c != '\0') { - result -= 2; - } - } - } - - return result; -} - -void -rspamd_url_unescape (gchar *s) -{ - gchar *t = s; /* t - tortoise */ - gchar *h = s; /* h - hare */ - - for (; *h; h++, t++) { - if (*h != '%') { - *t = *h; - } - else { - gchar c; - if (!h[1] || !h[2] || - !(g_ascii_isxdigit (h[1]) && g_ascii_isxdigit (h[2]))) { - *t = *h; - } - else { - c = X2DIGITS_TO_NUM (h[1], h[2]); - if (c != '\0') { - *t = c; - h += 2; - } - else { - *t = *h; - } - } - } - } - *t = '\0'; -} - -static void -url_strip (gchar *s) -{ - gchar *t = s; /* t - tortoise */ - gchar *h = s; /* h - hare */ - - while (*h) { - if (g_ascii_isgraph (*h)) { - *t = *h; - t++; - } - h++; - } - *t = '\0'; -} - -static gchar * -url_escape_1 (const gchar *s, gint allow_passthrough, rspamd_mempool_t * pool) -{ - const gchar *p1; - gchar *p2, *newstr; - gint newlen; - gint addition = 0; - - for (p1 = s; *p1; p1++) - if (!is_urlsafe (*p1)) { - addition += 2; /* Two more characters (hex digits) */ - } - - if (!addition) { - if (allow_passthrough) { - return (gchar *)s; - } - else { - return rspamd_mempool_strdup (pool, s); - } - } - - newlen = (p1 - s) + addition; - newstr = (gchar *)rspamd_mempool_alloc (pool, newlen + 1); - - p1 = s; - p2 = newstr; - while (*p1) { - /* Quote the characters that match the test mask. */ - if (!is_urlsafe (*p1)) { - guchar c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); - } - else - *p2++ = *p1++; - } - *p2 = '\0'; - - return newstr; -} - -/* URL-escape the unsafe characters (see urlchr_table) in a given - string, returning a freshly allocated string. */ - -gchar * -url_escape (const gchar *s, rspamd_mempool_t * pool) -{ - return url_escape_1 (s, 0, pool); -} - -/* Decide whether the gchar at position P needs to be encoded. (It is - not enough to pass a single gchar *P because the function may need - to inspect the surrounding context.) - - Return 1 if the gchar should be escaped as %XX, 0 otherwise. */ - -static inline gboolean -char_needs_escaping (const gchar *p) -{ - if (*p == '%') { - if (g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2))) { - return FALSE; - } - else { - return TRUE; - } - } - else if (!is_urlsafe (*p)) { - return TRUE; - } - return FALSE; -} - -static gchar * -rspamd_url_reencode_escapes (gchar *s, rspamd_mempool_t * pool) -{ - const gchar *p1; - gchar *newstr, *p2; - gint oldlen, newlen; - - gint encode_count = 0; - - /* First pass: inspect the string to see if there's anything to do, - and to calculate the new length. */ - for (p1 = s; *p1; p1++) { - if (char_needs_escaping (p1)) { - ++encode_count; - } - } - - if (!encode_count) { - /* The string is good as it is. */ - return s; - } - - oldlen = p1 - s; - /* Each encoding adds two characters (hex digits). */ - newlen = oldlen + 2 * encode_count; - newstr = rspamd_mempool_alloc (pool, newlen + 1); - - /* Second pass: copy the string to the destination address, encoding - chars when needed. */ - p1 = s; - p2 = newstr; - - while (*p1) { - if (char_needs_escaping (p1)) { - guchar c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); - } - else { - *p2++ = *p1++; - } - } - - *p2 = '\0'; - return newstr; -} - -/* - * Resolve "." and ".." elements of PATH by destructively modifying - * PATH and return non-zero if PATH has been modified, zero otherwise. - */ - -static gboolean -path_simplify (gchar *path) -{ - gchar *h = path; /* hare */ - gchar *t = path; /* tortoise */ - gchar *beg = path; /* boundary for backing the tortoise */ - gchar *end = path + strlen (path); - - while (h < end) { - /* Hare should be at the beginning of a path element. */ - if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) { - /* Ignore "./". */ - h += 2; - } - else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) { - /* Handle "../" by retreating the tortoise by one path - element -- but not past beginning. */ - if (t > beg) { - /* Move backwards until T hits the beginning of the - previous path element or the beginning of path. */ - for (--t; t > beg && t[-1] != '/'; t--) ; - } - else { - /* If we're at the beginning, copy the "../" literally - move the beginning so a later ".." doesn't remove - it. */ - beg = t + 3; - goto regular; - } - h += 3; - } - else { -regular: - /* A regular path element. If H hasn't advanced past T, - simply skip to the next path element. Otherwise, copy - the path element until the next slash. */ - if (t == h) { - /* Skip the path element, including the slash. */ - while (h < end && *h != '/') - t++, h++; - if (h < end) - t++, h++; - } - else { - /* Copy the path element, including the final slash. */ - while (h < end && *h != '/') - *t++ = *h++; - if (h < end) - *t++ = *h++; - } - } - } - - if (t != h) - *t = '\0'; - - return t != h; -} enum uri_errno -parse_uri (struct uri *uri, gchar *uristring, rspamd_mempool_t * pool) +rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, + rspamd_mempool_t *pool) { - guchar *prefix_end, *host_end, *p; - guchar *lbracket, *rbracket; - gint datalen, n, addrlen; - guchar *frag_or_post, *user_end, *port_end; + struct http_parser_url u; + gchar *p, *comp; + gint i, complen; + + const struct { + enum rspamd_url_protocol proto; + const gchar *name; + gsize len; + } protocols[] = { + { + .proto = PROTOCOL_FILE, + .name = "file", + .len = 4 + }, + { + .proto = PROTOCOL_FTP, + .name = "ftp", + .len = 3 + }, + { + .proto = PROTOCOL_HTTP, + .name = "http", + .len = 4 + }, + { + .proto = PROTOCOL_HTTPS, + .name = "https", + .len = 5 + }, + { + .proto = PROTOCOL_MAILTO, + .name = "mailto", + .len = 6 + }, + { + .proto = PROTOCOL_UNKNOWN, + .name = NULL, + .len = 0 + } + }; memset (uri, 0, sizeof (*uri)); - if (!*uristring) { + if (*uristring == '\0') { return URI_ERRNO_EMPTY; } - uri->string = rspamd_url_reencode_escapes (uristring, pool); - msg_debug ("reencoding escapes in original url: '%s'", struri (uri)); - uri->protocollen = get_protocol_length (struri (uri)); - - /* Assume http as default protocol */ - if (!uri->protocollen || - (uri->protocol = - get_protocol (struri (uri), uri->protocollen)) == PROTOCOL_UNKNOWN) { - /* Make exception for numeric urls */ - p = uri->string; - while (*p && (g_ascii_isalnum (*p) || *p == ':')) { - p++; - } - if (*p == '\0') { - return URI_ERRNO_INVALID_PROTOCOL; - } - p = g_strconcat ("http://", uri->string, NULL); - uri->string = rspamd_mempool_strdup (pool, p); - g_free (p); - uri->protocol = PROTOCOL_HTTP; - prefix_end = struri (uri) + 7; + p = g_uri_unescape_string (uristring, NULL); + if (p == NULL) { + return URI_ERRNO_BAD_ENCODING; } - else { - /* Figure out whether the protocol is known */ - msg_debug ("getting protocol from url: %d", uri->protocol); - - prefix_end = struri (uri) + uri->protocollen; /* ':' */ - /* Check if there's a digit after the protocol name. */ - if (g_ascii_isdigit (*prefix_end)) { - p = struri (uri); - uri->ip_family = p[uri->protocollen] - '0'; - prefix_end++; - } - if (*prefix_end != ':') { - msg_debug ("invalid protocol in uri"); - return URI_ERRNO_INVALID_PROTOCOL; - } - prefix_end++; + uri->string = p; - /* Skip slashes */ + rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p); - if (prefix_end[0] == '/' && prefix_end[1] == '/') { - if (prefix_end[2] == '/') { - msg_debug ("too many '/' in uri"); - return URI_ERRNO_TOO_MANY_SLASHES; - } - - prefix_end += 2; - - } - else { - msg_debug ("no '/' in uri"); - return URI_ERRNO_NO_SLASHES; - } + /* + * We assume here that urls has the sane scheme + */ + if (http_parser_parse_url (p, len, 0, &u) != 0) { + return URI_ERRNO_BAD_FORMAT; } - if (get_protocol_free_syntax (uri->protocol)) { - uri->data = prefix_end; - uri->datalen = strlen (prefix_end); - return URI_ERRNO_OK; - - } - else if (uri->protocol == PROTOCOL_FILE) { - datalen = check_uri_file (prefix_end); - frag_or_post = prefix_end + datalen; - - /* Extract the fragment part. */ - if (datalen >= 0) { - if (*frag_or_post == '#') { - uri->fragment = frag_or_post + 1; - uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S); - frag_or_post = uri->fragment + uri->fragmentlen; - } - if (*frag_or_post == POST_CHAR) { - uri->post = frag_or_post + 1; + for (i = 0; i < UF_MAX; i ++) { + if (u.field_set & (1 << i)) { + comp = p + u.field_data[i].off; + complen = u.field_data[i].len; + switch (i) { + case UF_SCHEMA: + uri->protocollen = u.field_data[i].len; + break; + case UF_HOST: + uri->host = comp; + uri->hostlen = complen; + break; + case UF_PATH: + uri->data = comp; + uri->datalen = complen; + break; + case UF_QUERY: + uri->query = comp; + uri->querylen = complen; + break; + case UF_FRAGMENT: + uri->fragment = comp; + uri->fragmentlen = complen; + break; + case UF_USERINFO: + uri->user = comp; + uri->userlen = complen; + break; + default: + break; } } - else { - datalen = strlen (prefix_end); - } - - uri->data = prefix_end; - uri->datalen = datalen; - - return URI_ERRNO_OK; - } - - /* Isolate host */ - - /* Get brackets enclosing IPv6 address */ - lbracket = strchr (prefix_end, '['); - if (lbracket) { - rbracket = strchr (lbracket, ']'); - /* [address] is handled only inside of hostname part (surprisingly). */ - if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/")) - uri->ipv6 = 1; - else - lbracket = rbracket = NULL; - } - else { - rbracket = NULL; - } - - /* Possibly skip auth part */ - host_end = prefix_end + strcspn (prefix_end, "@"); - - if (prefix_end + strcspn (prefix_end, "/?") > host_end && *host_end) { /* we have auth info here */ - - /* Allow '@' in the password component */ - while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?")) - host_end = host_end + 1 + strcspn (host_end + 1, "@"); - - user_end = strchr (prefix_end, ':'); - - if (!user_end || user_end > host_end) { - uri->user = prefix_end; - uri->userlen = host_end - prefix_end; - } - else { - uri->user = prefix_end; - uri->userlen = user_end - prefix_end; - uri->password = user_end + 1; - uri->passwordlen = host_end - user_end - 1; - } - prefix_end = host_end + 1; - } - - if (uri->ipv6 && rbracket != NULL) { - host_end = rbracket + strcspn (rbracket, ":/?"); - } - else { - host_end = prefix_end + strcspn (prefix_end, ":/?"); - } - - if (uri->ipv6) { - addrlen = rbracket - lbracket - 1; - - - uri->host = lbracket + 1; - uri->hostlen = addrlen; } - else { - uri->host = prefix_end; - uri->hostlen = host_end - prefix_end; - /* Trim trailing '.'s */ - if (uri->hostlen && uri->host[uri->hostlen - 1] == '.') - return URI_ERRNO_TRAILING_DOTS; + if (!uri->hostlen) { + return URI_ERRNO_BAD_FORMAT; } - if (*host_end == ':') { /* we have port here */ - port_end = host_end + 1 + strcspn (host_end + 1, "/"); - - host_end++; - - uri->port = host_end; - uri->portlen = port_end - host_end; - - if (uri->portlen == 0) - return URI_ERRNO_NO_PORT_COLON; - - /* We only use 8 bits for portlen so better check */ - if ((gint)uri->portlen != port_end - host_end) - return URI_ERRNO_INVALID_PORT; - - /* test if port is number */ - for (; host_end < port_end; host_end++) - if (!g_ascii_isdigit (*host_end)) - return URI_ERRNO_INVALID_PORT; + rspamd_str_lc (uri->string, uri->protocollen); + rspamd_str_lc (uri->host, uri->hostlen); - /* Check valid port value, and let show an error message - * about invalid url syntax. */ - if (uri->port && uri->portlen) { + uri->protocol = PROTOCOL_UNKNOWN; - errno = 0; - n = strtol (uri->port, NULL, 10); - if (errno || !uri_port_is_valid (n)) - return URI_ERRNO_INVALID_PORT; + for (i = 0; i < G_N_ELEMENTS (protocols); i ++) { + if (uri->protocollen == protocols[i].len) { + if (memcmp (uri->string, protocols[i].name, uri->protocollen) == 0) { + uri->protocol = i; + break; + } } } - if (*host_end == '/') { - host_end++; + if (uri->protocol == PROTOCOL_UNKNOWN) { + return URI_ERRNO_INVALID_PROTOCOL; } - else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end != - '?') { - /* The need for slash after the host component depends on the - * need for a host component. -- The dangerous mind of Jonah */ - if (!uri->hostlen) - return URI_ERRNO_NO_HOST; - - return URI_ERRNO_NO_HOST_SLASH; - } - - /* Look for #fragment or POST_CHAR */ - prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S); - uri->data = host_end; - uri->datalen = prefix_end - host_end; - - if (*prefix_end == '#') { - uri->fragment = prefix_end + 1; - uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S); - prefix_end = uri->fragment + uri->fragmentlen; - } - - if (*prefix_end == POST_CHAR) { - uri->post = prefix_end + 1; - } - - rspamd_str_lc (uri->string, uri->protocollen); - rspamd_str_lc (uri->host, uri->hostlen); - /* Decode %HH sequences in host name. This is important not so much - to support %HH sequences in host names (which other browser - don't), but to support binary characters (which will have been - converted to %HH by reencode_escapes). */ - if (strchr (uri->host, '%')) { - uri->hostlen = url_calculate_escaped_hostlen (uri->host, uri->hostlen); - } - - url_strip (struri (uri)); - rspamd_url_unescape (uri->host); - - path_simplify (uri->data); return URI_ERRNO_OK; } @@ -1821,14 +1319,14 @@ url_email_end (const gchar *begin, } void -url_parse_text (rspamd_mempool_t * pool, +rspamd_url_text_extract (rspamd_mempool_t * pool, struct rspamd_task *task, struct mime_text_part *part, gboolean is_html) { gint rc; gchar *url_str = NULL, *url_start, *url_end; - struct uri *new; + struct rspamd_url *new; struct process_exception *ex; gchar *p, *end, *begin; @@ -1843,18 +1341,17 @@ url_parse_text (rspamd_mempool_t * pool, end = begin + part->content->len; p = begin; while (p < end) { - if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str, + if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, is_html)) { if (url_str != NULL) { - new = rspamd_mempool_alloc0 (pool, sizeof (struct uri)); + new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); ex = rspamd_mempool_alloc0 (pool, sizeof (struct process_exception)); if (new != NULL) { g_strstrip (url_str); - rc = parse_uri (new, url_str, pool); - if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || - rc == URI_ERRNO_NO_HOST_SLASH) && + rc = rspamd_url_parse (new, url_str, strlen (url_str), pool); + if (rc == URI_ERRNO_OK && new->hostlen > 0) { ex->pos = url_start - begin; ex->len = url_end - url_start; @@ -1877,7 +1374,7 @@ url_parse_text (rspamd_mempool_t * pool, else if (rc != URI_ERRNO_OK) { msg_info ("extract of url '%s' failed: %s", url_str, - url_strerror (rc)); + rspamd_url_strerror (rc)); } } } @@ -1897,7 +1394,7 @@ url_parse_text (rspamd_mempool_t * pool, } gboolean -url_try_text (rspamd_mempool_t *pool, +rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, gchar **start, |