From b4c0e9b59d3985726d9a346085172394a0495ce6 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 3 Feb 2015 17:39:03 +0000 Subject: [PATCH] Remove old crap functions from url parser code. --- src/libmime/message.c | 14 +- src/libserver/html.c | 18 +- src/libserver/protocol.c | 4 +- src/libserver/url.c | 723 ++++++--------------------------------- src/libserver/url.h | 38 +- src/libutil/util.c | 4 +- src/lua/lua_task.c | 24 +- src/plugins/regexp.c | 2 +- src/plugins/surbl.c | 14 +- src/plugins/surbl.h | 4 +- 10 files changed, 167 insertions(+), 678 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 869d0a06e..702b148cb 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1233,7 +1233,7 @@ process_text_part (struct rspamd_task *task, decode_entitles (text_part->content->data, &text_part->content->len); } - url_parse_text (task->task_pool, task, text_part, TRUE); + rspamd_url_text_extract (task->task_pool, task, text_part, TRUE); rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff); rspamd_mempool_add_destructor (task->task_pool, @@ -1260,7 +1260,7 @@ process_text_part (struct rspamd_task *task, type, text_part); text_part->orig = part_content; - url_parse_text (task->task_pool, task, text_part, FALSE); + rspamd_url_text_extract (task->task_pool, task, text_part, FALSE); rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff); task->text_parts = g_list_prepend (task->text_parts, text_part); } @@ -1460,7 +1460,7 @@ process_message (struct rspamd_task *task) GMimeDataWrapper *wrapper; struct received_header *recv; gchar *mid, *url_str, *p, *end, *url_end; - struct uri *subject_url; + struct rspamd_url *subject_url; gsize len; gint rc; @@ -1634,14 +1634,14 @@ process_message (struct rspamd_task *task) while (p < end) { /* Search to the end of url */ - if (url_try_text (task->task_pool, p, end - p, NULL, &url_end, + if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end, &url_str, FALSE)) { if (url_str != NULL) { subject_url = rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct uri)); + sizeof (struct rspamd_url)); if (subject_url != NULL) { /* Try to parse url */ - rc = parse_uri (subject_url, url_str, task->task_pool); + rc = rspamd_url_parse (subject_url, url_str, task->task_pool); if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && subject_url->hostlen > 0) { @@ -1656,7 +1656,7 @@ process_message (struct rspamd_task *task) else if (rc != URI_ERRNO_OK) { msg_info ("extract of url '%s' failed: %s", url_str, - url_strerror (rc)); + rspamd_url_strerror (rc)); } } } diff --git a/src/libserver/html.c b/src/libserver/html.c index 5055a9aae..7df9270c3 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -674,12 +674,12 @@ decode_entitles (gchar *s, guint * len) static void check_phishing (struct rspamd_task *task, - struct uri *href_url, + struct rspamd_url *href_url, const gchar *url_text, gsize remain, tag_id_t id) { - struct uri *new; + struct rspamd_url *new; gchar *url_str; const gchar *p, *c; gchar tagbuf[128]; @@ -732,12 +732,12 @@ check_phishing (struct rspamd_task *task, p++; } - if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str, + if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str, TRUE) && url_str != NULL) { - new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri)); + new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); if (new != NULL) { g_strstrip (url_str); - rc = parse_uri (new, url_str, task->task_pool); + rc = rspamd_url_parse (new, url_str, task->task_pool); if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { @@ -787,7 +787,7 @@ check_phishing (struct rspamd_task *task, else { msg_info ("extract of url '%s' failed: %s", url_str, - url_strerror (rc)); + rspamd_url_strerror (rc)); } } } @@ -804,7 +804,7 @@ parse_tag_url (struct rspamd_task *task, { gchar *c = NULL, *p, *url_text; gint len, rc; - struct uri *url; + struct rspamd_url *url; gboolean got_single_quote = FALSE, got_double_quote = FALSE; /* For A tags search for href= and for IMG tags search for src= */ @@ -885,8 +885,8 @@ parse_tag_url (struct rspamd_task *task, return; } - url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri)); - rc = parse_uri (url, url_text, task->task_pool); + url = rspamd_mempool_alloc (task->task_pool, sizeof (struct rspamd_url)); + rc = rspamd_url_parse (url, url_text, task->task_pool); if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) { diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index 25dcee1c0..e702bfc14 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -492,7 +492,7 @@ static gboolean urls_protocol_cb (gpointer key, gpointer value, gpointer ud) { struct tree_cb_data *cb = ud; - struct uri *url = value; + struct rspamd_url *url = value; ucl_object_t *obj, *elt; if (!cb->task->extended_urls) { @@ -550,7 +550,7 @@ static gboolean emails_protocol_cb (gpointer key, gpointer value, gpointer ud) { struct tree_cb_data *cb = ud; - struct uri *url = value; + struct rspamd_url *url = value; ucl_object_t *obj; obj = ucl_object_fromlstring (url->user, url->userlen + url->hostlen + 1); diff --git a/src/libserver/url.c b/src/libserver/url.c index 3e4ccc827..22cb15759 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -29,6 +29,7 @@ #include "main.h" #include "message.h" #include "trie.h" +#include "http.h" #define POST_CHAR 1 #define POST_CHAR_S "\001" @@ -695,28 +696,6 @@ struct url_match_scanner { struct url_match_scanner *url_scanner = NULL; -static const struct _proto protocol_backends[] = { - {"file", 0, NULL, 1, 0, 0, 0}, - {"ftp", 21, NULL, 1, 0, 0, 0}, - {"http", 80, NULL, 1, 0, 0, 0}, - {"https", 443, NULL, 1, 0, 0, 1}, - {"mailto", 25, NULL, 1, 0, 0, 0}, - /* Keep these last! */ - {NULL, 0, NULL, 0, 0, 1, 0} -}; - -/* Convert an ASCII hex digit to the corresponding number between 0 - and 15. H should be a hexadecimal digit that satisfies isxdigit; - otherwise, the result is undefined. */ -#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : g_ascii_toupper (h) - 'A' + \ - 10) -#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2)) -/* The reverse of the above: convert a number in the [0, 16) range to - the ASCII representation of the corresponding hexadecimal digit. - `+ 0' is there so you can't accidentally use it as an lvalue. */ -#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0) -#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0) - static guchar url_scanner_table[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -759,7 +738,7 @@ enum { const gchar * -url_strerror (enum uri_errno err) +rspamd_url_strerror (enum uri_errno err) { switch (err) { case URI_ERRNO_OK: @@ -768,36 +747,16 @@ url_strerror (enum uri_errno err) return "The URI string was empty"; case URI_ERRNO_INVALID_PROTOCOL: return "No protocol was found"; - case URI_ERRNO_NO_SLASHES: - return "Slashes after protocol missing"; - case URI_ERRNO_TOO_MANY_SLASHES: - return "Too many slashes after protocol"; - case URI_ERRNO_TRAILING_DOTS: - return "'.' after host"; - case URI_ERRNO_NO_HOST: - return "Host part is missing"; - case URI_ERRNO_NO_PORT_COLON: - return "':' after host without port"; - case URI_ERRNO_NO_HOST_SLASH: - return "Slash after host missing"; - case URI_ERRNO_IPV6_SECURITY: - return "IPv6 security bug detected"; + case URI_ERRNO_BAD_FORMAT: + return "Bad URL format"; + case URI_ERRNO_BAD_ENCODING: + return "Invalid symbols encoded"; case URI_ERRNO_INVALID_PORT: return "Port number is bad"; - case URI_ERRNO_INVALID_PORT_RANGE: - return "Port number is not within 0-65535"; } return NULL; } -static gint -check_uri_file (gchar *name) -{ - static const gchar chars[] = POST_CHAR_S "#?"; - - return strcspn (name, chars); -} - static gint url_init (void) { @@ -843,590 +802,129 @@ url_init (void) return 0; } -enum protocol -get_protocol (gchar *name, gint namelen) -{ - /* These are really enum protocol values but can take on negative - * values and since 0 <= -1 for enum values it's better to use clean - * integer type. */ - gint start, end; - enum protocol protocol; - guchar *pname; - gint pnamelen, minlen, compare; - - /* Almost dichotomic search is used here */ - /* Starting at the HTTP entry which is the most common that will make - * file and NNTP the next entries checked and amongst the third checks - * are proxy and FTP. */ - start = 0; - end = PROTOCOL_UNKNOWN - 1; - protocol = PROTOCOL_HTTP; - - while (start <= end) { - pname = protocol_backends[protocol].name; - pnamelen = strlen (pname); - minlen = MIN (pnamelen, namelen); - compare = g_ascii_strncasecmp (pname, name, minlen); - - if (compare == 0) { - if (pnamelen == namelen) - return protocol; - - /* If the current protocol name is longer than the - * protocol name being searched for move @end else move - * @start. */ - compare = pnamelen > namelen ? 1 : -1; - } - - if (compare > 0) - end = protocol - 1; - else - start = protocol + 1; - - protocol = (start + end) / 2; - } - - return PROTOCOL_UNKNOWN; -} - - -gint -get_protocol_port (enum protocol protocol) -{ - return protocol_backends[protocol].port; -} - -gint -get_protocol_need_slashes (enum protocol protocol) -{ - return protocol_backends[protocol].need_slashes; -} - -gint -get_protocol_need_slash_after_host (enum protocol protocol) -{ - return protocol_backends[protocol].need_slash_after_host; -} - -gint -get_protocol_free_syntax (enum protocol protocol) -{ - return protocol_backends[protocol].free_syntax; -} - -static gint -get_protocol_length (const gchar *url) -{ - gchar *end = (gchar *)url; - - /* Seek the end of the protocol name if any. */ - /* RFC1738: - * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] - * (but per its recommendations we accept "upalpha" too) */ - while (*end && (g_ascii_isalnum (*end) || *end == '+' - || *end == '-' || *end == '.')) { - end++; - } - - /* Also return 0 if there's no protocol name (@end == @url). */ - return (*end == ':') ? end - url : 0; -} - - -/* - * Calcualte new length of unescaped hostlen - */ -static guint -url_calculate_escaped_hostlen (gchar *host, guint hostlen) -{ - guint i, result = hostlen; - gchar *p = host, c; - - for (i = 0; i < hostlen; i++, p++) { - if (*p == '%' && g_ascii_isxdigit (*(p + 1)) && - g_ascii_isxdigit (*(p + 2)) && i < hostlen - 2) { - c = X2DIGITS_TO_NUM (*(p + 1), *(p + 2)); - if (c != '\0') { - result -= 2; - } - } - } - - return result; -} - -void -rspamd_url_unescape (gchar *s) -{ - gchar *t = s; /* t - tortoise */ - gchar *h = s; /* h - hare */ - - for (; *h; h++, t++) { - if (*h != '%') { - *t = *h; - } - else { - gchar c; - if (!h[1] || !h[2] || - !(g_ascii_isxdigit (h[1]) && g_ascii_isxdigit (h[2]))) { - *t = *h; - } - else { - c = X2DIGITS_TO_NUM (h[1], h[2]); - if (c != '\0') { - *t = c; - h += 2; - } - else { - *t = *h; - } - } - } - } - *t = '\0'; -} - -static void -url_strip (gchar *s) -{ - gchar *t = s; /* t - tortoise */ - gchar *h = s; /* h - hare */ - - while (*h) { - if (g_ascii_isgraph (*h)) { - *t = *h; - t++; - } - h++; - } - *t = '\0'; -} - -static gchar * -url_escape_1 (const gchar *s, gint allow_passthrough, rspamd_mempool_t * pool) -{ - const gchar *p1; - gchar *p2, *newstr; - gint newlen; - gint addition = 0; - - for (p1 = s; *p1; p1++) - if (!is_urlsafe (*p1)) { - addition += 2; /* Two more characters (hex digits) */ - } - - if (!addition) { - if (allow_passthrough) { - return (gchar *)s; - } - else { - return rspamd_mempool_strdup (pool, s); - } - } - - newlen = (p1 - s) + addition; - newstr = (gchar *)rspamd_mempool_alloc (pool, newlen + 1); - - p1 = s; - p2 = newstr; - while (*p1) { - /* Quote the characters that match the test mask. */ - if (!is_urlsafe (*p1)) { - guchar c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); - } - else - *p2++ = *p1++; - } - *p2 = '\0'; - - return newstr; -} - -/* URL-escape the unsafe characters (see urlchr_table) in a given - string, returning a freshly allocated string. */ - -gchar * -url_escape (const gchar *s, rspamd_mempool_t * pool) -{ - return url_escape_1 (s, 0, pool); -} - -/* Decide whether the gchar at position P needs to be encoded. (It is - not enough to pass a single gchar *P because the function may need - to inspect the surrounding context.) - - Return 1 if the gchar should be escaped as %XX, 0 otherwise. */ - -static inline gboolean -char_needs_escaping (const gchar *p) -{ - if (*p == '%') { - if (g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2))) { - return FALSE; - } - else { - return TRUE; - } - } - else if (!is_urlsafe (*p)) { - return TRUE; - } - return FALSE; -} - -static gchar * -rspamd_url_reencode_escapes (gchar *s, rspamd_mempool_t * pool) -{ - const gchar *p1; - gchar *newstr, *p2; - gint oldlen, newlen; - - gint encode_count = 0; - - /* First pass: inspect the string to see if there's anything to do, - and to calculate the new length. */ - for (p1 = s; *p1; p1++) { - if (char_needs_escaping (p1)) { - ++encode_count; - } - } - - if (!encode_count) { - /* The string is good as it is. */ - return s; - } - - oldlen = p1 - s; - /* Each encoding adds two characters (hex digits). */ - newlen = oldlen + 2 * encode_count; - newstr = rspamd_mempool_alloc (pool, newlen + 1); - - /* Second pass: copy the string to the destination address, encoding - chars when needed. */ - p1 = s; - p2 = newstr; - - while (*p1) { - if (char_needs_escaping (p1)) { - guchar c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); - } - else { - *p2++ = *p1++; - } - } - - *p2 = '\0'; - return newstr; -} - -/* - * Resolve "." and ".." elements of PATH by destructively modifying - * PATH and return non-zero if PATH has been modified, zero otherwise. - */ - -static gboolean -path_simplify (gchar *path) -{ - gchar *h = path; /* hare */ - gchar *t = path; /* tortoise */ - gchar *beg = path; /* boundary for backing the tortoise */ - gchar *end = path + strlen (path); - - while (h < end) { - /* Hare should be at the beginning of a path element. */ - if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) { - /* Ignore "./". */ - h += 2; - } - else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) { - /* Handle "../" by retreating the tortoise by one path - element -- but not past beginning. */ - if (t > beg) { - /* Move backwards until T hits the beginning of the - previous path element or the beginning of path. */ - for (--t; t > beg && t[-1] != '/'; t--) ; - } - else { - /* If we're at the beginning, copy the "../" literally - move the beginning so a later ".." doesn't remove - it. */ - beg = t + 3; - goto regular; - } - h += 3; - } - else { -regular: - /* A regular path element. If H hasn't advanced past T, - simply skip to the next path element. Otherwise, copy - the path element until the next slash. */ - if (t == h) { - /* Skip the path element, including the slash. */ - while (h < end && *h != '/') - t++, h++; - if (h < end) - t++, h++; - } - else { - /* Copy the path element, including the final slash. */ - while (h < end && *h != '/') - *t++ = *h++; - if (h < end) - *t++ = *h++; - } - } - } - - if (t != h) - *t = '\0'; - - return t != h; -} enum uri_errno -parse_uri (struct uri *uri, gchar *uristring, rspamd_mempool_t * pool) +rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, + rspamd_mempool_t *pool) { - guchar *prefix_end, *host_end, *p; - guchar *lbracket, *rbracket; - gint datalen, n, addrlen; - guchar *frag_or_post, *user_end, *port_end; + struct http_parser_url u; + gchar *p, *comp; + gint i, complen; + + const struct { + enum rspamd_url_protocol proto; + const gchar *name; + gsize len; + } protocols[] = { + { + .proto = PROTOCOL_FILE, + .name = "file", + .len = 4 + }, + { + .proto = PROTOCOL_FTP, + .name = "ftp", + .len = 3 + }, + { + .proto = PROTOCOL_HTTP, + .name = "http", + .len = 4 + }, + { + .proto = PROTOCOL_HTTPS, + .name = "https", + .len = 5 + }, + { + .proto = PROTOCOL_MAILTO, + .name = "mailto", + .len = 6 + }, + { + .proto = PROTOCOL_UNKNOWN, + .name = NULL, + .len = 0 + } + }; memset (uri, 0, sizeof (*uri)); - if (!*uristring) { + if (*uristring == '\0') { return URI_ERRNO_EMPTY; } - uri->string = rspamd_url_reencode_escapes (uristring, pool); - msg_debug ("reencoding escapes in original url: '%s'", struri (uri)); - uri->protocollen = get_protocol_length (struri (uri)); - - /* Assume http as default protocol */ - if (!uri->protocollen || - (uri->protocol = - get_protocol (struri (uri), uri->protocollen)) == PROTOCOL_UNKNOWN) { - /* Make exception for numeric urls */ - p = uri->string; - while (*p && (g_ascii_isalnum (*p) || *p == ':')) { - p++; - } - if (*p == '\0') { - return URI_ERRNO_INVALID_PROTOCOL; - } - p = g_strconcat ("http://", uri->string, NULL); - uri->string = rspamd_mempool_strdup (pool, p); - g_free (p); - uri->protocol = PROTOCOL_HTTP; - prefix_end = struri (uri) + 7; + p = g_uri_unescape_string (uristring, NULL); + if (p == NULL) { + return URI_ERRNO_BAD_ENCODING; } - else { - /* Figure out whether the protocol is known */ - msg_debug ("getting protocol from url: %d", uri->protocol); - - prefix_end = struri (uri) + uri->protocollen; /* ':' */ - /* Check if there's a digit after the protocol name. */ - if (g_ascii_isdigit (*prefix_end)) { - p = struri (uri); - uri->ip_family = p[uri->protocollen] - '0'; - prefix_end++; - } - if (*prefix_end != ':') { - msg_debug ("invalid protocol in uri"); - return URI_ERRNO_INVALID_PROTOCOL; - } - prefix_end++; + uri->string = p; - /* Skip slashes */ + rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p); - if (prefix_end[0] == '/' && prefix_end[1] == '/') { - if (prefix_end[2] == '/') { - msg_debug ("too many '/' in uri"); - return URI_ERRNO_TOO_MANY_SLASHES; - } - - prefix_end += 2; - - } - else { - msg_debug ("no '/' in uri"); - return URI_ERRNO_NO_SLASHES; - } + /* + * We assume here that urls has the sane scheme + */ + if (http_parser_parse_url (p, len, 0, &u) != 0) { + return URI_ERRNO_BAD_FORMAT; } - if (get_protocol_free_syntax (uri->protocol)) { - uri->data = prefix_end; - uri->datalen = strlen (prefix_end); - return URI_ERRNO_OK; - - } - else if (uri->protocol == PROTOCOL_FILE) { - datalen = check_uri_file (prefix_end); - frag_or_post = prefix_end + datalen; - - /* Extract the fragment part. */ - if (datalen >= 0) { - if (*frag_or_post == '#') { - uri->fragment = frag_or_post + 1; - uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S); - frag_or_post = uri->fragment + uri->fragmentlen; - } - if (*frag_or_post == POST_CHAR) { - uri->post = frag_or_post + 1; + for (i = 0; i < UF_MAX; i ++) { + if (u.field_set & (1 << i)) { + comp = p + u.field_data[i].off; + complen = u.field_data[i].len; + switch (i) { + case UF_SCHEMA: + uri->protocollen = u.field_data[i].len; + break; + case UF_HOST: + uri->host = comp; + uri->hostlen = complen; + break; + case UF_PATH: + uri->data = comp; + uri->datalen = complen; + break; + case UF_QUERY: + uri->query = comp; + uri->querylen = complen; + break; + case UF_FRAGMENT: + uri->fragment = comp; + uri->fragmentlen = complen; + break; + case UF_USERINFO: + uri->user = comp; + uri->userlen = complen; + break; + default: + break; } } - else { - datalen = strlen (prefix_end); - } - - uri->data = prefix_end; - uri->datalen = datalen; - - return URI_ERRNO_OK; - } - - /* Isolate host */ - - /* Get brackets enclosing IPv6 address */ - lbracket = strchr (prefix_end, '['); - if (lbracket) { - rbracket = strchr (lbracket, ']'); - /* [address] is handled only inside of hostname part (surprisingly). */ - if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/")) - uri->ipv6 = 1; - else - lbracket = rbracket = NULL; - } - else { - rbracket = NULL; - } - - /* Possibly skip auth part */ - host_end = prefix_end + strcspn (prefix_end, "@"); - - if (prefix_end + strcspn (prefix_end, "/?") > host_end && *host_end) { /* we have auth info here */ - - /* Allow '@' in the password component */ - while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?")) - host_end = host_end + 1 + strcspn (host_end + 1, "@"); - - user_end = strchr (prefix_end, ':'); - - if (!user_end || user_end > host_end) { - uri->user = prefix_end; - uri->userlen = host_end - prefix_end; - } - else { - uri->user = prefix_end; - uri->userlen = user_end - prefix_end; - uri->password = user_end + 1; - uri->passwordlen = host_end - user_end - 1; - } - prefix_end = host_end + 1; - } - - if (uri->ipv6 && rbracket != NULL) { - host_end = rbracket + strcspn (rbracket, ":/?"); - } - else { - host_end = prefix_end + strcspn (prefix_end, ":/?"); - } - - if (uri->ipv6) { - addrlen = rbracket - lbracket - 1; - - - uri->host = lbracket + 1; - uri->hostlen = addrlen; } - else { - uri->host = prefix_end; - uri->hostlen = host_end - prefix_end; - /* Trim trailing '.'s */ - if (uri->hostlen && uri->host[uri->hostlen - 1] == '.') - return URI_ERRNO_TRAILING_DOTS; + if (!uri->hostlen) { + return URI_ERRNO_BAD_FORMAT; } - if (*host_end == ':') { /* we have port here */ - port_end = host_end + 1 + strcspn (host_end + 1, "/"); - - host_end++; - - uri->port = host_end; - uri->portlen = port_end - host_end; - - if (uri->portlen == 0) - return URI_ERRNO_NO_PORT_COLON; - - /* We only use 8 bits for portlen so better check */ - if ((gint)uri->portlen != port_end - host_end) - return URI_ERRNO_INVALID_PORT; - - /* test if port is number */ - for (; host_end < port_end; host_end++) - if (!g_ascii_isdigit (*host_end)) - return URI_ERRNO_INVALID_PORT; + rspamd_str_lc (uri->string, uri->protocollen); + rspamd_str_lc (uri->host, uri->hostlen); - /* Check valid port value, and let show an error message - * about invalid url syntax. */ - if (uri->port && uri->portlen) { + uri->protocol = PROTOCOL_UNKNOWN; - errno = 0; - n = strtol (uri->port, NULL, 10); - if (errno || !uri_port_is_valid (n)) - return URI_ERRNO_INVALID_PORT; + for (i = 0; i < G_N_ELEMENTS (protocols); i ++) { + if (uri->protocollen == protocols[i].len) { + if (memcmp (uri->string, protocols[i].name, uri->protocollen) == 0) { + uri->protocol = i; + break; + } } } - if (*host_end == '/') { - host_end++; + if (uri->protocol == PROTOCOL_UNKNOWN) { + return URI_ERRNO_INVALID_PROTOCOL; } - else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end != - '?') { - /* The need for slash after the host component depends on the - * need for a host component. -- The dangerous mind of Jonah */ - if (!uri->hostlen) - return URI_ERRNO_NO_HOST; - - return URI_ERRNO_NO_HOST_SLASH; - } - - /* Look for #fragment or POST_CHAR */ - prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S); - uri->data = host_end; - uri->datalen = prefix_end - host_end; - - if (*prefix_end == '#') { - uri->fragment = prefix_end + 1; - uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S); - prefix_end = uri->fragment + uri->fragmentlen; - } - - if (*prefix_end == POST_CHAR) { - uri->post = prefix_end + 1; - } - - rspamd_str_lc (uri->string, uri->protocollen); - rspamd_str_lc (uri->host, uri->hostlen); - /* Decode %HH sequences in host name. This is important not so much - to support %HH sequences in host names (which other browser - don't), but to support binary characters (which will have been - converted to %HH by reencode_escapes). */ - if (strchr (uri->host, '%')) { - uri->hostlen = url_calculate_escaped_hostlen (uri->host, uri->hostlen); - } - - url_strip (struri (uri)); - rspamd_url_unescape (uri->host); - - path_simplify (uri->data); return URI_ERRNO_OK; } @@ -1821,14 +1319,14 @@ url_email_end (const gchar *begin, } void -url_parse_text (rspamd_mempool_t * pool, +rspamd_url_text_extract (rspamd_mempool_t * pool, struct rspamd_task *task, struct mime_text_part *part, gboolean is_html) { gint rc; gchar *url_str = NULL, *url_start, *url_end; - struct uri *new; + struct rspamd_url *new; struct process_exception *ex; gchar *p, *end, *begin; @@ -1843,18 +1341,17 @@ url_parse_text (rspamd_mempool_t * pool, end = begin + part->content->len; p = begin; while (p < end) { - if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str, + if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, is_html)) { if (url_str != NULL) { - new = rspamd_mempool_alloc0 (pool, sizeof (struct uri)); + new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); ex = rspamd_mempool_alloc0 (pool, sizeof (struct process_exception)); if (new != NULL) { g_strstrip (url_str); - rc = parse_uri (new, url_str, pool); - if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || - rc == URI_ERRNO_NO_HOST_SLASH) && + rc = rspamd_url_parse (new, url_str, strlen (url_str), pool); + if (rc == URI_ERRNO_OK && new->hostlen > 0) { ex->pos = url_start - begin; ex->len = url_end - url_start; @@ -1877,7 +1374,7 @@ url_parse_text (rspamd_mempool_t * pool, else if (rc != URI_ERRNO_OK) { msg_info ("extract of url '%s' failed: %s", url_str, - url_strerror (rc)); + rspamd_url_strerror (rc)); } } } @@ -1897,7 +1394,7 @@ url_parse_text (rspamd_mempool_t * pool, } gboolean -url_try_text (rspamd_mempool_t *pool, +rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, gchar **start, diff --git a/src/libserver/url.h b/src/libserver/url.h index c9700436b..db3a3472c 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -8,12 +8,9 @@ struct rspamd_task; struct mime_text_part; -struct uri { - /* The start of the uri (and thus start of the protocol string). */ +struct rspamd_url { gchar *string; - - /* The internal type of protocol. Can _never_ be PROTOCOL_UNKNOWN. */ - gint protocol; /* enum protocol */ + gint protocol; gint ip_family; @@ -22,20 +19,20 @@ struct uri { gchar *host; gchar *port; gchar *data; + gchar *query; gchar *fragment; gchar *post; gchar *surbl; - struct uri *phished_url; + struct rspamd_url *phished_url; - /* @protocollen should only be usable if @protocol is either - * PROTOCOL_USER or an uri string should be composed. */ guint protocollen; guint userlen; guint passwordlen; guint hostlen; guint portlen; guint datalen; + guint querylen; guint fragmentlen; guint surbllen; @@ -46,22 +43,16 @@ struct uri { }; enum uri_errno { - URI_ERRNO_OK, /* Parsing went well */ + URI_ERRNO_OK = 0, /* Parsing went well */ URI_ERRNO_EMPTY, /* The URI string was empty */ URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */ - URI_ERRNO_NO_SLASHES, /* Slashes after protocol missing */ - URI_ERRNO_TOO_MANY_SLASHES, /* Too many slashes after protocol */ - URI_ERRNO_TRAILING_DOTS, /* '.' after host */ - URI_ERRNO_NO_HOST, /* Host part is missing */ - URI_ERRNO_NO_PORT_COLON, /* ':' after host without port */ - URI_ERRNO_NO_HOST_SLASH, /* Slash after host missing */ - URI_ERRNO_IPV6_SECURITY, /* IPv6 security bug detected */ URI_ERRNO_INVALID_PORT, /* Port number is bad */ - URI_ERRNO_INVALID_PORT_RANGE /* Port number is not within 0-65535 */ + URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */ + URI_ERRNO_BAD_FORMAT }; -enum protocol { - PROTOCOL_FILE, +enum rspamd_url_protocol { + PROTOCOL_FILE = 0, PROTOCOL_FTP, PROTOCOL_HTTP, PROTOCOL_HTTPS, @@ -78,7 +69,7 @@ enum protocol { * @param part current text part * @param is_html turn on html euristic */ -void url_parse_text (rspamd_mempool_t *pool, +void rspamd_url_text_extract (rspamd_mempool_t *pool, struct rspamd_task *task, struct mime_text_part *part, gboolean is_html); @@ -89,8 +80,9 @@ void url_parse_text (rspamd_mempool_t *pool, * @param uristring text form of url * @param uri url object, must be pre allocated */ -enum uri_errno parse_uri (struct uri *uri, +enum uri_errno rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, + gsize len, rspamd_mempool_t *pool); /* @@ -103,7 +95,7 @@ enum uri_errno parse_uri (struct uri *uri, * @param url_str storage for url string(or NULL) * @return TRUE if url is found in specified text */ -gboolean url_try_text (rspamd_mempool_t *pool, +gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len, gchar **start, @@ -114,7 +106,7 @@ gboolean url_try_text (rspamd_mempool_t *pool, /* * Return text representation of url parsing error */ -const gchar * url_strerror (enum uri_errno err); +const gchar * rspamd_url_strerror (enum uri_errno err); /* * URL unescape characters in the specified string diff --git a/src/libutil/util.c b/src/libutil/util.c index 6d5682f25..f88ed8e72 100644 --- a/src/libutil/util.c +++ b/src/libutil/util.c @@ -1427,7 +1427,7 @@ rspamd_strlcpy_tolower (gchar *dst, const gchar *src, gsize siz) gint rspamd_emails_cmp (gconstpointer a, gconstpointer b) { - const struct uri *u1 = a, *u2 = b; + const struct rspamd_url *u1 = a, *u2 = b; gint r; if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { @@ -1453,7 +1453,7 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b) gint rspamd_urls_cmp (gconstpointer a, gconstpointer b) { - const struct uri *u1 = a, *u2 = b; + const struct rspamd_url *u1 = a, *u2 = b; int r; if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index ef52b4544..6fee606c4 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -683,12 +683,12 @@ lua_check_image (lua_State * L) return ud ? *((struct rspamd_image **)ud) : NULL; } -static struct uri * +static struct rspamd_url * lua_check_url (lua_State * L) { void *ud = luaL_checkudata (L, 1, "rspamd{url}"); luaL_argcheck (L, ud != NULL, 1, "'url' expected"); - return ud ? *((struct uri **)ud) : NULL; + return ud ? *((struct rspamd_url **)ud) : NULL; } static int @@ -924,10 +924,10 @@ struct lua_tree_cb_data { static gboolean lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) { - struct uri **purl; + struct rspamd_url **purl; struct lua_tree_cb_data *cb = ud; - purl = lua_newuserdata (cb->L, sizeof (struct uri *)); + purl = lua_newuserdata (cb->L, sizeof (struct rspamd_url *)); rspamd_lua_setclass (cb->L, "rspamd{url}", -1); *purl = value; lua_rawseti (cb->L, -2, cb->i++); @@ -2352,7 +2352,7 @@ lua_image_get_filename (lua_State *L) static gint lua_url_get_length (lua_State *L) { - struct uri *url = lua_check_url (L); + struct rspamd_url *url = lua_check_url (L); if (url != NULL) { lua_pushinteger (L, strlen (struri (url))); @@ -2366,7 +2366,7 @@ lua_url_get_length (lua_State *L) static gint lua_url_get_host (lua_State *L) { - struct uri *url = lua_check_url (L); + struct rspamd_url *url = lua_check_url (L); if (url != NULL) { lua_pushlstring (L, url->host, url->hostlen); @@ -2380,7 +2380,7 @@ lua_url_get_host (lua_State *L) static gint lua_url_get_user (lua_State *L) { - struct uri *url = lua_check_url (L); + struct rspamd_url *url = lua_check_url (L); if (url != NULL && url->user != NULL) { lua_pushlstring (L, url->user, url->userlen); @@ -2395,7 +2395,7 @@ lua_url_get_user (lua_State *L) static gint lua_url_get_path (lua_State *L) { - struct uri *url = lua_check_url (L); + struct rspamd_url *url = lua_check_url (L); if (url != NULL) { lua_pushlstring (L, url->data, url->datalen); @@ -2410,7 +2410,7 @@ lua_url_get_path (lua_State *L) static gint lua_url_get_text (lua_State *L) { - struct uri *url = lua_check_url (L); + struct rspamd_url *url = lua_check_url (L); if (url != NULL) { lua_pushstring (L, struri (url)); @@ -2425,7 +2425,7 @@ lua_url_get_text (lua_State *L) static gint lua_url_is_phished (lua_State *L) { - struct uri *url = lua_check_url (L); + struct rspamd_url *url = lua_check_url (L); if (url != NULL) { lua_pushboolean (L, url->is_phished); @@ -2440,11 +2440,11 @@ lua_url_is_phished (lua_State *L) static gint lua_url_get_phished (lua_State *L) { - struct uri **purl, *url = lua_check_url (L); + struct rspamd_url **purl, *url = lua_check_url (L); if (url) { if (url->is_phished && url->phished_url != NULL) { - purl = lua_newuserdata (L, sizeof (struct uri *)); + purl = lua_newuserdata (L, sizeof (struct rspamd_url *)); rspamd_lua_setclass (L, "rspamd{url}", -1); *purl = url->phished_url; diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 74ab46ab5..15eed1674 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -378,7 +378,7 @@ static gboolean tree_url_callback (gpointer key, gpointer value, void *data) { struct url_regexp_param *param = data; - struct uri *url = value; + struct rspamd_url *url = value; GError *err = NULL; if (g_regex_match_full (param->regexp, struri (url), -1, 0, 0, NULL, diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index b7a3a8337..df9227c08 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -576,7 +576,7 @@ format_surbl_request (rspamd_mempool_t * pool, GError ** err, gboolean forced, GTree *tree, - struct uri *url) + struct rspamd_url *url) { GHashTable *t; gchar *result = NULL, *dots[MAX_LEVELS], @@ -753,7 +753,7 @@ format_surbl_request (rspamd_mempool_t * pool, } static void -make_surbl_requests (struct uri *url, struct rspamd_task *task, +make_surbl_requests (struct rspamd_url *url, struct rspamd_task *task, struct suffix_item *suffix, gboolean forced, GTree *tree) { gchar *surbl_req; @@ -953,7 +953,7 @@ redirector_callback (gint fd, short what, void *arg) struri (param->url), c); r = - parse_uri (param->url, + rspamd_url_parse (param->url, rspamd_mempool_strdup (param->task->task_pool, c), param->task->task_pool); if (r == URI_ERRNO_OK || r == URI_ERRNO_NO_SLASHES || r == @@ -985,7 +985,7 @@ redirector_callback (gint fd, short what, void *arg) static void -register_redirector_call (struct uri *url, struct rspamd_task *task, +register_redirector_call (struct rspamd_url *url, struct rspamd_task *task, struct suffix_item *suffix, const gchar *rule, GTree *tree) { gint s = -1; @@ -1042,7 +1042,7 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data) { struct redirector_param *param = data; struct rspamd_task *task; - struct uri *url = value; + struct rspamd_url *url = value; gchar *red_domain; const gchar *pos; GRegex *re; @@ -1134,7 +1134,7 @@ static gboolean calculate_buflen_cb (gpointer key, gpointer value, gpointer cbdata) { struct urls_tree_cb_data *cb = cbdata; - struct uri *url = value; + struct rspamd_url *url = value; cb->len += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1; @@ -1145,7 +1145,7 @@ static gboolean write_urls_buffer (gpointer key, gpointer value, gpointer cbdata) { struct urls_tree_cb_data *cb = cbdata; - struct uri *url = value; + struct rspamd_url *url = value; rspamd_fstring_t f; gchar *urlstr; gsize len; diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h index 7701c8304..959a730de 100644 --- a/src/plugins/surbl.h +++ b/src/plugins/surbl.h @@ -46,14 +46,14 @@ struct suffix_item { }; struct dns_param { - struct uri *url; + struct rspamd_url *url; struct rspamd_task *task; gchar *host_resolve; struct suffix_item *suffix; }; struct redirector_param { - struct uri *url; + struct rspamd_url *url; struct rspamd_task *task; struct upstream *redirector; enum { -- 2.39.5