Remove obsolete functions from the URL parser code.

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-03 17:39:03 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-03 17:39:03 +0000
commit: b4c0e9b59d3985726d9a346085172394a0495ce6 (patch)
tree: f03a48c2206d37299f26c87f9ecd85d394a39271
parent: aa8b1c618148c2ead44e1cc643e9eb23423a4843 (diff)
download: rspamd-b4c0e9b59d3985726d9a346085172394a0495ce6.tar.gz
rspamd-b4c0e9b59d3985726d9a346085172394a0495ce6.zip
10 files changed, 167 insertions, 678 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 869d0a06e..702b148cb 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1233,7 +1233,7 @@ process_text_part (struct rspamd_task *task,
 			decode_entitles (text_part->content->data,
 				&text_part->content->len);
 		}
-		url_parse_text (task->task_pool, task, text_part, TRUE);
+		rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
 
 		rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
 		rspamd_mempool_add_destructor (task->task_pool,
@@ -1260,7 +1260,7 @@ process_text_part (struct rspamd_task *task,
 				type,
 				text_part);
 		text_part->orig = part_content;
-		url_parse_text (task->task_pool, task, text_part, FALSE);
+		rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
 		rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
 	}
@@ -1460,7 +1460,7 @@ process_message (struct rspamd_task *task)
 	GMimeDataWrapper *wrapper;
 	struct received_header *recv;
 	gchar *mid, *url_str, *p, *end, *url_end;
-	struct uri *subject_url;
+	struct rspamd_url *subject_url;
 	gsize len;
 	gint rc;
 
@@ -1634,14 +1634,14 @@ process_message (struct rspamd_task *task)
 
 		while (p < end) {
 			/* Search to the end of url */
-			if (url_try_text (task->task_pool, p, end - p, NULL, &url_end,
+			if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
 				&url_str, FALSE)) {
 				if (url_str != NULL) {
 					subject_url = rspamd_mempool_alloc0 (task->task_pool,
-							sizeof (struct uri));
+							sizeof (struct rspamd_url));
 					if (subject_url != NULL) {
 						/* Try to parse url */
-						rc = parse_uri (subject_url, url_str, task->task_pool);
+						rc = rspamd_url_parse (subject_url, url_str, task->task_pool);
 						if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
 							rc == URI_ERRNO_NO_HOST_SLASH) &&
 							subject_url->hostlen > 0) {
@@ -1656,7 +1656,7 @@ process_message (struct rspamd_task *task)
 						else if (rc != URI_ERRNO_OK) {
 							msg_info ("extract of url '%s' failed: %s",
 								url_str,
-								url_strerror (rc));
+								rspamd_url_strerror (rc));
 						}
 					}
 				}
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 5055a9aae..7df9270c3 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -674,12 +674,12 @@ decode_entitles (gchar *s, guint * len)
 
 static void
 check_phishing (struct rspamd_task *task,
-	struct uri *href_url,
+	struct rspamd_url *href_url,
 	const gchar *url_text,
 	gsize remain,
 	tag_id_t id)
 {
-	struct uri *new;
+	struct rspamd_url *new;
 	gchar *url_str;
 	const gchar *p, *c;
 	gchar tagbuf[128];
@@ -732,12 +732,12 @@ check_phishing (struct rspamd_task *task,
 		p++;
 	}
 
-	if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str,
+	if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
 		TRUE) && url_str != NULL) {
-		new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
+		new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
 		if (new != NULL) {
 			g_strstrip (url_str);
-			rc = parse_uri (new, url_str, task->task_pool);
+			rc = rspamd_url_parse (new, url_str, task->task_pool);
 
 			if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc ==
 				URI_ERRNO_NO_HOST_SLASH) {
@@ -787,7 +787,7 @@ check_phishing (struct rspamd_task *task,
 			else {
 				msg_info ("extract of url '%s' failed: %s",
 					url_str,
-					url_strerror (rc));
+					rspamd_url_strerror (rc));
 			}
 		}
 	}
@@ -804,7 +804,7 @@ parse_tag_url (struct rspamd_task *task,
 {
 	gchar *c = NULL, *p, *url_text;
 	gint len, rc;
-	struct uri *url;
+	struct rspamd_url *url;
 	gboolean got_single_quote = FALSE, got_double_quote = FALSE;
 
 	/* For A tags search for href= and for IMG tags search for src= */
@@ -885,8 +885,8 @@ parse_tag_url (struct rspamd_task *task,
 			return;
 		}
 
-		url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri));
-		rc = parse_uri (url, url_text, task->task_pool);
+		url = rspamd_mempool_alloc (task->task_pool, sizeof (struct rspamd_url));
+		rc = rspamd_url_parse (url, url_text, task->task_pool);
 
 		if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen !=
 			0) {
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 25dcee1c0..e702bfc14 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -492,7 +492,7 @@ static gboolean
 urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 {
 	struct tree_cb_data *cb = ud;
-	struct uri *url = value;
+	struct rspamd_url *url = value;
 	ucl_object_t *obj, *elt;
 
 	if (!cb->task->extended_urls) {
@@ -550,7 +550,7 @@ static gboolean
 emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
 {
 	struct tree_cb_data *cb = ud;
-	struct uri *url = value;
+	struct rspamd_url *url = value;
 	ucl_object_t *obj;
 
 	obj = ucl_object_fromlstring (url->user, url->userlen + url->hostlen + 1);
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 3e4ccc827..22cb15759 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -29,6 +29,7 @@
 #include "main.h"
 #include "message.h"
 #include "trie.h"
+#include "http.h"
 
 #define POST_CHAR 1
 #define POST_CHAR_S "\001"
@@ -695,28 +696,6 @@ struct url_match_scanner {
 
 struct url_match_scanner *url_scanner = NULL;
 
-static const struct _proto protocol_backends[] = {
-	{"file", 0, NULL, 1, 0, 0, 0},
-	{"ftp", 21, NULL, 1, 0, 0, 0},
-	{"http", 80, NULL, 1, 0, 0, 0},
-	{"https", 443, NULL, 1, 0, 0, 1},
-	{"mailto", 25, NULL, 1, 0, 0, 0},
-	/* Keep these last! */
-	{NULL, 0, NULL, 0, 0, 1, 0}
-};
-
-/* Convert an ASCII hex digit to the corresponding number between 0
-   and 15.  H should be a hexadecimal digit that satisfies isxdigit;
-   otherwise, the result is undefined.  */
-#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : g_ascii_toupper (h) - 'A' + \
-	10)
-#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2))
-/* The reverse of the above: convert a number in the [0, 16) range to
-   the ASCII representation of the corresponding hexadecimal digit.
-   `+ 0' is there so you can't accidentally use it as an lvalue.  */
-#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0)
-#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0)
-
 static guchar url_scanner_table[256] = {
 	1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  1,  1,  9,  1,  1,
 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
@@ -759,7 +738,7 @@ enum {
 
 
 const gchar *
-url_strerror (enum uri_errno err)
+rspamd_url_strerror (enum uri_errno err)
 {
 	switch (err) {
 	case URI_ERRNO_OK:
@@ -768,37 +747,17 @@ url_strerror (enum uri_errno err)
 		return "The URI string was empty";
 	case URI_ERRNO_INVALID_PROTOCOL:
 		return "No protocol was found";
-	case URI_ERRNO_NO_SLASHES:
-		return "Slashes after protocol missing";
-	case URI_ERRNO_TOO_MANY_SLASHES:
-		return "Too many slashes after protocol";
-	case URI_ERRNO_TRAILING_DOTS:
-		return "'.' after host";
-	case URI_ERRNO_NO_HOST:
-		return "Host part is missing";
-	case URI_ERRNO_NO_PORT_COLON:
-		return "':' after host without port";
-	case URI_ERRNO_NO_HOST_SLASH:
-		return "Slash after host missing";
-	case URI_ERRNO_IPV6_SECURITY:
-		return "IPv6 security bug detected";
+	case URI_ERRNO_BAD_FORMAT:
+		return "Bad URL format";
+	case URI_ERRNO_BAD_ENCODING:
+		return "Invalid symbols encoded";
 	case URI_ERRNO_INVALID_PORT:
 		return "Port number is bad";
-	case URI_ERRNO_INVALID_PORT_RANGE:
-		return "Port number is not within 0-65535";
 	}
 	return NULL;
 }
 
 static gint
-check_uri_file (gchar *name)
-{
-	static const gchar chars[] = POST_CHAR_S "#?";
-
-	return strcspn (name, chars);
-}
-
-static gint
 url_init (void)
 {
 	guint i;
@@ -843,590 +802,129 @@ url_init (void)
 	return 0;
 }
 
-enum protocol
-get_protocol (gchar *name, gint namelen)
-{
-	/* These are really enum protocol values but can take on negative
-	 * values and since 0 <= -1 for enum values it's better to use clean
-	 * integer type. */
-	gint start, end;
-	enum protocol protocol;
-	guchar *pname;
-	gint pnamelen, minlen, compare;
-
-	/* Almost dichotomic search is used here */
-	/* Starting at the HTTP entry which is the most common that will make
-	 * file and NNTP the next entries checked and amongst the third checks
-	 * are proxy and FTP. */
-	start = 0;
-	end = PROTOCOL_UNKNOWN - 1;
-	protocol = PROTOCOL_HTTP;
-
-	while (start <= end) {
-		pname = protocol_backends[protocol].name;
-		pnamelen = strlen (pname);
-		minlen = MIN (pnamelen, namelen);
-		compare = g_ascii_strncasecmp (pname, name, minlen);
-
-		if (compare == 0) {
-			if (pnamelen == namelen)
-				return protocol;
-
-			/* If the current protocol name is longer than the
-			 * protocol name being searched for move @end else move
-			 * @start. */
-			compare = pnamelen > namelen ? 1 : -1;
-		}
-
-		if (compare > 0)
-			end = protocol - 1;
-		else
-			start = protocol + 1;
-
-		protocol = (start + end) / 2;
-	}
-
-	return PROTOCOL_UNKNOWN;
-}
-
-
-gint
-get_protocol_port (enum protocol protocol)
-{
-	return protocol_backends[protocol].port;
-}
-
-gint
-get_protocol_need_slashes (enum protocol protocol)
-{
-	return protocol_backends[protocol].need_slashes;
-}
-
-gint
-get_protocol_need_slash_after_host (enum protocol protocol)
-{
-	return protocol_backends[protocol].need_slash_after_host;
-}
-
-gint
-get_protocol_free_syntax (enum protocol protocol)
-{
-	return protocol_backends[protocol].free_syntax;
-}
-
-static gint
-get_protocol_length (const gchar *url)
-{
-	gchar *end = (gchar *)url;
-
-	/* Seek the end of the protocol name if any. */
-	/* RFC1738:
-	 * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
-	 * (but per its recommendations we accept "upalpha" too) */
-	while (*end && (g_ascii_isalnum (*end) || *end == '+'
-			|| *end == '-' || *end == '.')) {
-		end++;
-	}
-
-	/* Also return 0 if there's no protocol name (@end == @url). */
-	return (*end == ':') ? end - url : 0;
-}
-
-
-/*
- * Calcualte new length of unescaped hostlen
- */
-static guint
-url_calculate_escaped_hostlen (gchar *host, guint hostlen)
-{
-	guint i, result = hostlen;
-	gchar *p = host, c;
-
-	for (i = 0; i < hostlen; i++, p++) {
-		if (*p == '%' && g_ascii_isxdigit (*(p + 1)) &&
-			g_ascii_isxdigit (*(p + 2)) && i < hostlen - 2) {
-			c = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
-			if (c != '\0') {
-				result -= 2;
-			}
-		}
-	}
-
-	return result;
-}
-
-void
-rspamd_url_unescape (gchar *s)
-{
-	gchar *t = s;                           /* t - tortoise */
-	gchar *h = s;                           /* h - hare     */
-
-	for (; *h; h++, t++) {
-		if (*h != '%') {
-			*t = *h;
-		}
-		else {
-			gchar c;
-			if (!h[1] || !h[2] ||
-				!(g_ascii_isxdigit (h[1]) && g_ascii_isxdigit (h[2]))) {
-				*t = *h;
-			}
-			else {
-				c = X2DIGITS_TO_NUM (h[1], h[2]);
-				if (c != '\0') {
-					*t = c;
-					h += 2;
-				}
-				else {
-					*t = *h;
-				}
-			}
-		}
-	}
-	*t = '\0';
-}
-
-static void
-url_strip (gchar *s)
-{
-	gchar *t = s;                           /* t - tortoise */
-	gchar *h = s;                           /* h - hare     */
-
-	while (*h) {
-		if (g_ascii_isgraph (*h)) {
-			*t = *h;
-			t++;
-		}
-		h++;
-	}
-	*t = '\0';
-}
-
-static gchar *
-url_escape_1 (const gchar *s, gint allow_passthrough, rspamd_mempool_t * pool)
-{
-	const gchar *p1;
-	gchar *p2, *newstr;
-	gint newlen;
-	gint addition = 0;
-
-	for (p1 = s; *p1; p1++)
-		if (!is_urlsafe (*p1)) {
-			addition += 2;      /* Two more characters (hex digits) */
-		}
-
-	if (!addition) {
-		if (allow_passthrough) {
-			return (gchar *)s;
-		}
-		else {
-			return rspamd_mempool_strdup (pool, s);
-		}
-	}
-
-	newlen = (p1 - s) + addition;
-	newstr = (gchar *)rspamd_mempool_alloc (pool, newlen + 1);
-
-	p1 = s;
-	p2 = newstr;
-	while (*p1) {
-		/* Quote the characters that match the test mask. */
-		if (!is_urlsafe (*p1)) {
-			guchar c = *p1++;
-			*p2++ = '%';
-			*p2++ = XNUM_TO_DIGIT (c >> 4);
-			*p2++ = XNUM_TO_DIGIT (c & 0xf);
-		}
-		else
-			*p2++ = *p1++;
-	}
-	*p2 = '\0';
-
-	return newstr;
-}
-
-/* URL-escape the unsafe characters (see urlchr_table) in a given
-   string, returning a freshly allocated string.  */
-
-gchar *
-url_escape (const gchar *s, rspamd_mempool_t * pool)
-{
-	return url_escape_1 (s, 0, pool);
-}
-
-/* Decide whether the gchar at position P needs to be encoded.  (It is
-   not enough to pass a single gchar *P because the function may need
-   to inspect the surrounding context.)
-
-   Return 1 if the gchar should be escaped as %XX, 0 otherwise.  */
-
-static inline gboolean
-char_needs_escaping (const gchar *p)
-{
-	if (*p == '%') {
-		if (g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2))) {
-			return FALSE;
-		}
-		else {
-			return TRUE;
-		}
-	}
-	else if (!is_urlsafe (*p)) {
-		return TRUE;
-	}
-	return FALSE;
-}
-
-static gchar *
-rspamd_url_reencode_escapes (gchar *s, rspamd_mempool_t * pool)
-{
-	const gchar *p1;
-	gchar *newstr, *p2;
-	gint oldlen, newlen;
-
-	gint encode_count = 0;
-
-	/* First pass: inspect the string to see if there's anything to do,
-	   and to calculate the new length.  */
-	for (p1 = s; *p1; p1++) {
-		if (char_needs_escaping (p1)) {
-			++encode_count;
-		}
-	}
-
-	if (!encode_count) {
-		/* The string is good as it is. */
-		return s;
-	}
-
-	oldlen = p1 - s;
-	/* Each encoding adds two characters (hex digits).  */
-	newlen = oldlen + 2 * encode_count;
-	newstr = rspamd_mempool_alloc (pool, newlen + 1);
-
-	/* Second pass: copy the string to the destination address, encoding
-	   chars when needed.  */
-	p1 = s;
-	p2 = newstr;
-
-	while (*p1) {
-		if (char_needs_escaping (p1)) {
-			guchar c = *p1++;
-			*p2++ = '%';
-			*p2++ = XNUM_TO_DIGIT (c >> 4);
-			*p2++ = XNUM_TO_DIGIT (c & 0xf);
-		}
-		else {
-			*p2++ = *p1++;
-		}
-	}
-
-	*p2 = '\0';
-	return newstr;
-}
-
-/*
- * Resolve "." and ".." elements of PATH by destructively modifying
- * PATH and return non-zero if PATH has been modified, zero otherwise.
- */
-
-static gboolean
-path_simplify (gchar *path)
-{
-	gchar *h = path;                            /* hare */
-	gchar *t = path;                            /* tortoise */
-	gchar *beg = path;                              /* boundary for backing the tortoise */
-	gchar *end = path + strlen (path);
-
-	while (h < end) {
-		/* Hare should be at the beginning of a path element. */
-		if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) {
-			/* Ignore "./". */
-			h += 2;
-		}
-		else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) {
-			/* Handle "../" by retreating the tortoise by one path
-			   element -- but not past beginning.  */
-			if (t > beg) {
-				/* Move backwards until T hits the beginning of the
-				   previous path element or the beginning of path. */
-				for (--t; t > beg && t[-1] != '/'; t--) ;
-			}
-			else {
-				/* If we're at the beginning, copy the "../" literally
-				   move the beginning so a later ".." doesn't remove
-				   it.  */
-				beg = t + 3;
-				goto regular;
-			}
-			h += 3;
-		}
-		else {
-regular:
-			/* A regular path element.  If H hasn't advanced past T,
-			   simply skip to the next path element.  Otherwise, copy
-			   the path element until the next slash.  */
-			if (t == h) {
-				/* Skip the path element, including the slash.  */
-				while (h < end && *h != '/')
-					t++, h++;
-				if (h < end)
-					t++, h++;
-			}
-			else {
-				/* Copy the path element, including the final slash.  */
-				while (h < end && *h != '/')
-					*t++ = *h++;
-				if (h < end)
-					*t++ = *h++;
-			}
-		}
-	}
-
-	if (t != h)
-		*t = '\0';
-
-	return t != h;
-}
 
 enum uri_errno
-parse_uri (struct uri *uri, gchar *uristring, rspamd_mempool_t * pool)
+rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
+		rspamd_mempool_t *pool)
 {
-	guchar *prefix_end, *host_end, *p;
-	guchar *lbracket, *rbracket;
-	gint datalen, n, addrlen;
-	guchar *frag_or_post, *user_end, *port_end;
+	struct http_parser_url u;
+	gchar *p, *comp;
+	gint i, complen;
+
+	const struct {
+		enum rspamd_url_protocol proto;
+		const gchar *name;
+		gsize len;
+	} protocols[] = {
+		{
+			.proto = PROTOCOL_FILE,
+			.name = "file",
+			.len = 4
+		},
+		{
+			.proto = PROTOCOL_FTP,
+			.name = "ftp",
+			.len = 3
+		},
+		{
+			.proto = PROTOCOL_HTTP,
+			.name = "http",
+			.len = 4
+		},
+		{
+			.proto = PROTOCOL_HTTPS,
+			.name = "https",
+			.len = 5
+		},
+		{
+			.proto = PROTOCOL_MAILTO,
+			.name = "mailto",
+			.len = 6
+		},
+		{
+			.proto = PROTOCOL_UNKNOWN,
+			.name = NULL,
+			.len = 0
+		}
+	};
 
 	memset (uri, 0, sizeof (*uri));
 
-	if (!*uristring) {
+	if (*uristring == '\0') {
 		return URI_ERRNO_EMPTY;
 	}
 
-	uri->string = rspamd_url_reencode_escapes (uristring, pool);
-	msg_debug ("reencoding escapes in original url: '%s'", struri (uri));
-	uri->protocollen = get_protocol_length (struri (uri));
-
-	/* Assume http as default protocol */
-	if (!uri->protocollen ||
-		(uri->protocol =
-		get_protocol (struri (uri), uri->protocollen)) == PROTOCOL_UNKNOWN) {
-		/* Make exception for numeric urls */
-		p = uri->string;
-		while (*p && (g_ascii_isalnum (*p) || *p == ':')) {
-			p++;
-		}
-		if (*p == '\0') {
-			return URI_ERRNO_INVALID_PROTOCOL;
-		}
-		p = g_strconcat ("http://", uri->string, NULL);
-		uri->string = rspamd_mempool_strdup (pool, p);
-		g_free (p);
-		uri->protocol = PROTOCOL_HTTP;
-		prefix_end = struri (uri) + 7;
+	p = g_uri_unescape_string (uristring, NULL);
+	if (p == NULL) {
+		return URI_ERRNO_BAD_ENCODING;
 	}
-	else {
-		/* Figure out whether the protocol is known */
-		msg_debug ("getting protocol from url: %d", uri->protocol);
-
-		prefix_end = struri (uri) + uri->protocollen;   /* ':' */
 
-		/* Check if there's a digit after the protocol name. */
-		if (g_ascii_isdigit (*prefix_end)) {
-			p = struri (uri);
-			uri->ip_family = p[uri->protocollen] - '0';
-			prefix_end++;
-		}
-		if (*prefix_end != ':') {
-			msg_debug ("invalid protocol in uri");
-			return URI_ERRNO_INVALID_PROTOCOL;
-		}
-		prefix_end++;
+	uri->string = p;
 
-		/* Skip slashes */
+	rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p);
 
-		if (prefix_end[0] == '/' && prefix_end[1] == '/') {
-			if (prefix_end[2] == '/') {
-				msg_debug ("too many '/' in uri");
-				return URI_ERRNO_TOO_MANY_SLASHES;
-			}
-
-			prefix_end += 2;
-
-		}
-		else {
-			msg_debug ("no '/' in uri");
-			return URI_ERRNO_NO_SLASHES;
-		}
+	/*
+	 * We assume here that the URL has a sane scheme
+	 */
+	if (http_parser_parse_url (p, len, 0, &u) != 0) {
+		return URI_ERRNO_BAD_FORMAT;
 	}
 
-	if (get_protocol_free_syntax (uri->protocol)) {
-		uri->data = prefix_end;
-		uri->datalen = strlen (prefix_end);
-		return URI_ERRNO_OK;
-
-	}
-	else if (uri->protocol == PROTOCOL_FILE) {
-		datalen = check_uri_file (prefix_end);
-		frag_or_post = prefix_end + datalen;
-
-		/* Extract the fragment part. */
-		if (datalen >= 0) {
-			if (*frag_or_post == '#') {
-				uri->fragment = frag_or_post + 1;
-				uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
-				frag_or_post = uri->fragment + uri->fragmentlen;
-			}
-			if (*frag_or_post == POST_CHAR) {
-				uri->post = frag_or_post + 1;
+	for (i = 0; i < UF_MAX; i ++) {
+		if (u.field_set & (1 << i)) {
+			comp = p + u.field_data[i].off;
+			complen = u.field_data[i].len;
+			switch (i) {
+			case UF_SCHEMA:
+				uri->protocollen = u.field_data[i].len;
+				break;
+			case UF_HOST:
+				uri->host = comp;
+				uri->hostlen = complen;
+				break;
+			case UF_PATH:
+				uri->data = comp;
+				uri->datalen = complen;
+				break;
+			case UF_QUERY:
+				uri->query = comp;
+				uri->querylen = complen;
+				break;
+			case UF_FRAGMENT:
+				uri->fragment = comp;
+				uri->fragmentlen = complen;
+				break;
+			case UF_USERINFO:
+				uri->user = comp;
+				uri->userlen = complen;
+				break;
+			default:
+				break;
 			}
 		}
-		else {
-			datalen = strlen (prefix_end);
-		}
-
-		uri->data = prefix_end;
-		uri->datalen = datalen;
-
-		return URI_ERRNO_OK;
-	}
-
-	/* Isolate host */
-
-	/* Get brackets enclosing IPv6 address */
-	lbracket = strchr (prefix_end, '[');
-	if (lbracket) {
-		rbracket = strchr (lbracket, ']');
-		/* [address] is handled only inside of hostname part (surprisingly). */
-		if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
-			uri->ipv6 = 1;
-		else
-			lbracket = rbracket = NULL;
-	}
-	else {
-		rbracket = NULL;
-	}
-
-	/* Possibly skip auth part */
-	host_end = prefix_end + strcspn (prefix_end, "@");
-
-	if (prefix_end + strcspn (prefix_end, "/?") > host_end && *host_end) {  /* we have auth info here */
-
-		/* Allow '@' in the password component */
-		while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
-			host_end = host_end + 1 + strcspn (host_end + 1, "@");
-
-		user_end = strchr (prefix_end, ':');
-
-		if (!user_end || user_end > host_end) {
-			uri->user = prefix_end;
-			uri->userlen = host_end - prefix_end;
-		}
-		else {
-			uri->user = prefix_end;
-			uri->userlen = user_end - prefix_end;
-			uri->password = user_end + 1;
-			uri->passwordlen = host_end - user_end - 1;
-		}
-		prefix_end = host_end + 1;
-	}
-
-	if (uri->ipv6 && rbracket != NULL) {
-		host_end = rbracket + strcspn (rbracket, ":/?");
-	}
-	else {
-		host_end = prefix_end + strcspn (prefix_end, ":/?");
-	}
-
-	if (uri->ipv6) {
-		addrlen = rbracket - lbracket - 1;
-
-
-		uri->host = lbracket + 1;
-		uri->hostlen = addrlen;
 	}
-	else {
-		uri->host = prefix_end;
-		uri->hostlen = host_end - prefix_end;
 
-		/* Trim trailing '.'s */
-		if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
-			return URI_ERRNO_TRAILING_DOTS;
+	if (!uri->hostlen) {
+		return URI_ERRNO_BAD_FORMAT;
 	}
 
-	if (*host_end == ':') {     /* we have port here */
-		port_end = host_end + 1 + strcspn (host_end + 1, "/");
-
-		host_end++;
-
-		uri->port = host_end;
-		uri->portlen = port_end - host_end;
-
-		if (uri->portlen == 0)
-			return URI_ERRNO_NO_PORT_COLON;
-
-		/* We only use 8 bits for portlen so better check */
-		if ((gint)uri->portlen != port_end - host_end)
-			return URI_ERRNO_INVALID_PORT;
-
-		/* test if port is number */
-		for (; host_end < port_end; host_end++)
-			if (!g_ascii_isdigit (*host_end))
-				return URI_ERRNO_INVALID_PORT;
+	rspamd_str_lc (uri->string, uri->protocollen);
+	rspamd_str_lc (uri->host,   uri->hostlen);
 
-		/* Check valid port value, and let show an error message
-		 * about invalid url syntax. */
-		if (uri->port && uri->portlen) {
+	uri->protocol = PROTOCOL_UNKNOWN;
 
-			errno = 0;
-			n = strtol (uri->port, NULL, 10);
-			if (errno || !uri_port_is_valid (n))
-				return URI_ERRNO_INVALID_PORT;
+	for (i = 0; i < G_N_ELEMENTS (protocols); i ++) {
+		if (uri->protocollen == protocols[i].len) {
+			if (memcmp (uri->string, protocols[i].name, uri->protocollen) == 0) {
+				uri->protocol = i;
+				break;
+			}
 		}
 	}
 
-	if (*host_end == '/') {
-		host_end++;
+	if (uri->protocol == PROTOCOL_UNKNOWN) {
+		return URI_ERRNO_INVALID_PROTOCOL;
 	}
-	else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end !=
-		'?') {
-		/* The need for slash after the host component depends on the
-		 * need for a host component. -- The dangerous mind of Jonah */
-		if (!uri->hostlen)
-			return URI_ERRNO_NO_HOST;
-
-		return URI_ERRNO_NO_HOST_SLASH;
-	}
-
-	/* Look for #fragment or POST_CHAR */
-	prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
-	uri->data = host_end;
-	uri->datalen = prefix_end - host_end;
-
-	if (*prefix_end == '#') {
-		uri->fragment = prefix_end + 1;
-		uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
-		prefix_end = uri->fragment + uri->fragmentlen;
-	}
-
-	if (*prefix_end == POST_CHAR) {
-		uri->post = prefix_end + 1;
-	}
-
-	rspamd_str_lc (uri->string, uri->protocollen);
-	rspamd_str_lc (uri->host,   uri->hostlen);
-	/* Decode %HH sequences in host name.  This is important not so much
-	   to support %HH sequences in host names (which other browser
-	   don't), but to support binary characters (which will have been
-	   converted to %HH by reencode_escapes).  */
-	if (strchr (uri->host, '%')) {
-		uri->hostlen = url_calculate_escaped_hostlen (uri->host, uri->hostlen);
-	}
-
-	url_strip (struri (uri));
-	rspamd_url_unescape (uri->host);
-
-	path_simplify (uri->data);
 
 	return URI_ERRNO_OK;
 }
@@ -1821,14 +1319,14 @@ url_email_end (const gchar *begin,
 }
 
 void
-url_parse_text (rspamd_mempool_t * pool,
+rspamd_url_text_extract (rspamd_mempool_t * pool,
 	struct rspamd_task *task,
 	struct mime_text_part *part,
 	gboolean is_html)
 {
 	gint rc;
 	gchar *url_str = NULL, *url_start, *url_end;
-	struct uri *new;
+	struct rspamd_url *new;
 	struct process_exception *ex;
 	gchar *p, *end, *begin;
 
@@ -1843,18 +1341,17 @@ url_parse_text (rspamd_mempool_t * pool,
 		end = begin + part->content->len;
 		p = begin;
 		while (p < end) {
-			if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str,
+			if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
 				is_html)) {
 				if (url_str != NULL) {
-					new = rspamd_mempool_alloc0 (pool, sizeof (struct uri));
+					new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
 					ex =
 						rspamd_mempool_alloc0 (pool,
 							sizeof (struct process_exception));
 					if (new != NULL) {
 						g_strstrip (url_str);
-						rc = parse_uri (new, url_str, pool);
-						if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
-							rc == URI_ERRNO_NO_HOST_SLASH) &&
+						rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
+						if (rc == URI_ERRNO_OK &&
 							new->hostlen > 0) {
 							ex->pos = url_start - begin;
 							ex->len = url_end - url_start;
@@ -1877,7 +1374,7 @@ url_parse_text (rspamd_mempool_t * pool,
 						else if (rc != URI_ERRNO_OK) {
 							msg_info ("extract of url '%s' failed: %s",
 								url_str,
-								url_strerror (rc));
+								rspamd_url_strerror (rc));
 						}
 					}
 				}
@@ -1897,7 +1394,7 @@ url_parse_text (rspamd_mempool_t * pool,
 }
 
 gboolean
-url_try_text (rspamd_mempool_t *pool,
+rspamd_url_find (rspamd_mempool_t *pool,
 	const gchar *begin,
 	gsize len,
 	gchar **start,
diff --git a/src/libserver/url.h b/src/libserver/url.h
index c9700436b..db3a3472c 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -8,12 +8,9 @@
 struct rspamd_task;
 struct mime_text_part;
 
-struct uri {
-	/* The start of the uri (and thus start of the protocol string). */
+struct rspamd_url {
 	gchar *string;
-
-	/* The internal type of protocol. Can _never_ be PROTOCOL_UNKNOWN. */
-	gint protocol; /* enum protocol */
+	gint protocol;
 
 	gint ip_family;
 
@@ -22,20 +19,20 @@ struct uri {
 	gchar *host;
 	gchar *port;
 	gchar *data;
+	gchar *query;
 	gchar *fragment;
 	gchar *post;
 	gchar *surbl;
 
-	struct uri *phished_url;
+	struct rspamd_url *phished_url;
 
-	/* @protocollen should only be usable if @protocol is either
-	 * PROTOCOL_USER or an uri string should be composed. */
 	guint protocollen;
 	guint userlen;
 	guint passwordlen;
 	guint hostlen;
 	guint portlen;
 	guint datalen;
+	guint querylen;
 	guint fragmentlen;
 	guint surbllen;
 
@@ -46,22 +43,16 @@ struct uri {
 };
 
 enum uri_errno {
-	URI_ERRNO_OK,           /* Parsing went well */
+	URI_ERRNO_OK = 0,           /* Parsing went well */
 	URI_ERRNO_EMPTY,        /* The URI string was empty */
 	URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
-	URI_ERRNO_NO_SLASHES,       /* Slashes after protocol missing */
-	URI_ERRNO_TOO_MANY_SLASHES, /* Too many slashes after protocol */
-	URI_ERRNO_TRAILING_DOTS,    /* '.' after host */
-	URI_ERRNO_NO_HOST,      /* Host part is missing */
-	URI_ERRNO_NO_PORT_COLON,    /* ':' after host without port */
-	URI_ERRNO_NO_HOST_SLASH,    /* Slash after host missing */
-	URI_ERRNO_IPV6_SECURITY,    /* IPv6 security bug detected */
 	URI_ERRNO_INVALID_PORT,     /* Port number is bad */
-	URI_ERRNO_INVALID_PORT_RANGE    /* Port number is not within 0-65535 */
+	URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */
+	URI_ERRNO_BAD_FORMAT
 };
 
-enum protocol {
-	PROTOCOL_FILE,
+enum rspamd_url_protocol {
+	PROTOCOL_FILE = 0,
 	PROTOCOL_FTP,
 	PROTOCOL_HTTP,
 	PROTOCOL_HTTPS,
@@ -78,7 +69,7 @@ enum protocol {
  * @param part current text part
  * @param is_html turn on html euristic
  */
-void url_parse_text (rspamd_mempool_t *pool,
+void rspamd_url_text_extract (rspamd_mempool_t *pool,
 	struct rspamd_task *task,
 	struct mime_text_part *part,
 	gboolean is_html);
@@ -89,8 +80,9 @@ void url_parse_text (rspamd_mempool_t *pool,
  * @param uristring text form of url
  * @param uri url object, must be pre allocated
  */
-enum uri_errno parse_uri (struct uri *uri,
+enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
 	gchar *uristring,
+	gsize len,
 	rspamd_mempool_t *pool);
 
 /*
@@ -103,7 +95,7 @@ enum uri_errno parse_uri (struct uri *uri,
  * @param url_str storage for url string(or NULL)
  * @return TRUE if url is found in specified text
  */
-gboolean url_try_text (rspamd_mempool_t *pool,
+gboolean rspamd_url_find (rspamd_mempool_t *pool,
 	const gchar *begin,
 	gsize len,
 	gchar **start,
@@ -114,7 +106,7 @@ gboolean url_try_text (rspamd_mempool_t *pool,
 /*
  * Return text representation of url parsing error
  */
-const gchar * url_strerror (enum uri_errno err);
+const gchar * rspamd_url_strerror (enum uri_errno err);
 
 /*
  * URL unescape characters in the specified string
diff --git a/src/libutil/util.c b/src/libutil/util.c
index 6d5682f25..f88ed8e72 100644
--- a/src/libutil/util.c
+++ b/src/libutil/util.c
@@ -1427,7 +1427,7 @@ rspamd_strlcpy_tolower (gchar *dst, const gchar *src, gsize siz)
 gint
 rspamd_emails_cmp (gconstpointer a, gconstpointer b)
 {
-	const struct uri *u1 = a, *u2 = b;
+	const struct rspamd_url *u1 = a, *u2 = b;
 	gint r;
 
 	if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
@@ -1453,7 +1453,7 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
 gint
 rspamd_urls_cmp (gconstpointer a, gconstpointer b)
 {
-	const struct uri *u1 = a, *u2 = b;
+	const struct rspamd_url *u1 = a, *u2 = b;
 	int r;
 
 	if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index ef52b4544..6fee606c4 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -683,12 +683,12 @@ lua_check_image (lua_State * L)
 	return ud ? *((struct rspamd_image **)ud) : NULL;
 }
 
-static struct uri *
+static struct rspamd_url *
 lua_check_url (lua_State * L)
 {
 	void *ud = luaL_checkudata (L, 1, "rspamd{url}");
 	luaL_argcheck (L, ud != NULL, 1, "'url' expected");
-	return ud ? *((struct uri **)ud) : NULL;
+	return ud ? *((struct rspamd_url **)ud) : NULL;
 }
 
 static int
@@ -924,10 +924,10 @@ struct lua_tree_cb_data {
 static gboolean
 lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
 {
-	struct uri **purl;
+	struct rspamd_url **purl;
 	struct lua_tree_cb_data *cb = ud;
 
-	purl = lua_newuserdata (cb->L, sizeof (struct uri *));
+	purl = lua_newuserdata (cb->L, sizeof (struct rspamd_url *));
 	rspamd_lua_setclass (cb->L, "rspamd{url}", -1);
 	*purl = value;
 	lua_rawseti (cb->L, -2, cb->i++);
@@ -2352,7 +2352,7 @@ lua_image_get_filename (lua_State *L)
 static gint
 lua_url_get_length (lua_State *L)
 {
-	struct uri *url = lua_check_url (L);
+	struct rspamd_url *url = lua_check_url (L);
 
 	if (url != NULL) {
 		lua_pushinteger (L, strlen (struri (url)));
@@ -2366,7 +2366,7 @@ lua_url_get_length (lua_State *L)
 static gint
 lua_url_get_host (lua_State *L)
 {
-	struct uri *url = lua_check_url (L);
+	struct rspamd_url *url = lua_check_url (L);
 
 	if (url != NULL) {
 		lua_pushlstring (L, url->host, url->hostlen);
@@ -2380,7 +2380,7 @@ lua_url_get_host (lua_State *L)
 static gint
 lua_url_get_user (lua_State *L)
 {
-	struct uri *url = lua_check_url (L);
+	struct rspamd_url *url = lua_check_url (L);
 
 	if (url != NULL && url->user != NULL) {
 		lua_pushlstring (L, url->user, url->userlen);
@@ -2395,7 +2395,7 @@ lua_url_get_user (lua_State *L)
 static gint
 lua_url_get_path (lua_State *L)
 {
-	struct uri *url = lua_check_url (L);
+	struct rspamd_url *url = lua_check_url (L);
 
 	if (url != NULL) {
 		lua_pushlstring (L, url->data, url->datalen);
@@ -2410,7 +2410,7 @@ lua_url_get_path (lua_State *L)
 static gint
 lua_url_get_text (lua_State *L)
 {
-	struct uri *url = lua_check_url (L);
+	struct rspamd_url *url = lua_check_url (L);
 
 	if (url != NULL) {
 		lua_pushstring (L, struri (url));
@@ -2425,7 +2425,7 @@ lua_url_get_text (lua_State *L)
 static gint
 lua_url_is_phished (lua_State *L)
 {
-	struct uri *url = lua_check_url (L);
+	struct rspamd_url *url = lua_check_url (L);
 
 	if (url != NULL) {
 		lua_pushboolean (L, url->is_phished);
@@ -2440,11 +2440,11 @@ lua_url_is_phished (lua_State *L)
 static gint
 lua_url_get_phished (lua_State *L)
 {
-	struct uri **purl, *url = lua_check_url (L);
+	struct rspamd_url **purl, *url = lua_check_url (L);
 
 	if (url) {
 		if (url->is_phished && url->phished_url != NULL) {
-			purl = lua_newuserdata (L, sizeof (struct uri *));
+			purl = lua_newuserdata (L, sizeof (struct rspamd_url *));
 			rspamd_lua_setclass (L, "rspamd{url}", -1);
 			*purl = url->phished_url;
 
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 74ab46ab5..15eed1674 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -378,7 +378,7 @@ static gboolean
 tree_url_callback (gpointer key, gpointer value, void *data)
 {
 	struct url_regexp_param *param = data;
-	struct uri *url = value;
+	struct rspamd_url *url = value;
 	GError *err = NULL;
 
 	if (g_regex_match_full (param->regexp, struri (url), -1, 0, 0, NULL,
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index b7a3a8337..df9227c08 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -576,7 +576,7 @@ format_surbl_request (rspamd_mempool_t * pool,
 	GError ** err,
 	gboolean forced,
 	GTree *tree,
-	struct uri *url)
+	struct rspamd_url *url)
 {
 	GHashTable *t;
 	gchar *result = NULL, *dots[MAX_LEVELS],
@@ -753,7 +753,7 @@ format_surbl_request (rspamd_mempool_t * pool,
 }
 
 static void
-make_surbl_requests (struct uri *url, struct rspamd_task *task,
+make_surbl_requests (struct rspamd_url *url, struct rspamd_task *task,
 	struct suffix_item *suffix, gboolean forced, GTree *tree)
 {
 	gchar *surbl_req;
@@ -953,7 +953,7 @@ redirector_callback (gint fd, short what, void *arg)
 						struri (param->url),
 						c);
 					r =
-						parse_uri (param->url,
+						rspamd_url_parse (param->url,
 							rspamd_mempool_strdup (param->task->task_pool,
 							c), param->task->task_pool);
 					if (r == URI_ERRNO_OK || r == URI_ERRNO_NO_SLASHES || r ==
@@ -985,7 +985,7 @@ redirector_callback (gint fd, short what, void *arg)
 
 
 static void
-register_redirector_call (struct uri *url, struct rspamd_task *task,
+register_redirector_call (struct rspamd_url *url, struct rspamd_task *task,
 	struct suffix_item *suffix, const gchar *rule, GTree *tree)
 {
 	gint s = -1;
@@ -1042,7 +1042,7 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
 {
 	struct redirector_param *param = data;
 	struct rspamd_task *task;
-	struct uri *url = value;
+	struct rspamd_url *url = value;
 	gchar *red_domain;
 	const gchar *pos;
 	GRegex *re;
@@ -1134,7 +1134,7 @@ static gboolean
 calculate_buflen_cb (gpointer key, gpointer value, gpointer cbdata)
 {
 	struct urls_tree_cb_data *cb = cbdata;
-	struct uri *url = value;
+	struct rspamd_url *url = value;
 
 	cb->len += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1;
 
@@ -1145,7 +1145,7 @@ static gboolean
 write_urls_buffer (gpointer key, gpointer value, gpointer cbdata)
 {
 	struct urls_tree_cb_data *cb = cbdata;
-	struct uri *url = value;
+	struct rspamd_url *url = value;
 	rspamd_fstring_t f;
 	gchar *urlstr;
 	gsize len;
diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h
index 7701c8304..959a730de 100644
--- a/src/plugins/surbl.h
+++ b/src/plugins/surbl.h
@@ -46,14 +46,14 @@ struct suffix_item {
 };
 
 struct dns_param {
-	struct uri *url;
+	struct rspamd_url *url;
 	struct rspamd_task *task;
 	gchar *host_resolve;
 	struct suffix_item *suffix;
 };
 
 struct redirector_param {
-	struct uri *url;
+	struct rspamd_url *url;
 	struct rspamd_task *task;
 	struct upstream *redirector;
 	enum {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-02-03 17:39:03 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-02-03 17:39:03 +0000
commit	b4c0e9b59d3985726d9a346085172394a0495ce6 (patch)
tree	f03a48c2206d37299f26c87f9ecd85d394a39271
parent	aa8b1c618148c2ead44e1cc643e9eb23423a4843 (diff)
download	rspamd-b4c0e9b59d3985726d9a346085172394a0495ce6.tar.gz rspamd-b4c0e9b59d3985726d9a346085172394a0495ce6.zip