[Fix] Use multipattern in url matcher

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-04-14 13:07:40 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-04-14 13:07:40 +0100
commit: f2f1ea684b61abb0c810a0a1fb26c07b0e019d06 (patch)
tree: 64d6096ed40f12eebdbd9cd5e76a5fdf7ad3bda9
parent: da58466e4e5f47ab916db936580ed67d75218c28 (diff)
download: rspamd-f2f1ea684b61abb0c810a0a1fb26c07b0e019d06.tar.gz
rspamd-f2f1ea684b61abb0c810a0a1fb26c07b0e019d06.zip
3 files changed, 283 insertions, 277 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 49034acf6..793f1c5a0 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -683,13 +683,12 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
 	gboolean *url_found)
 {
 	struct rspamd_url *text_url;
-	gint rc, state = 0;
+	gint rc;
 	gchar *url_str = NULL;
 
 	*url_found = FALSE;
 
-	if (rspamd_url_find (pool, url_text, len, NULL, NULL, &url_str,
-		TRUE, &state) && url_str != NULL) {
+	if (rspamd_url_find (pool, url_text, len, &url_str, TRUE) && url_str != NULL) {
 		text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
 		rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);
 
@@ -1235,15 +1234,13 @@ static void
 rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
 		GHashTable *target)
 {
-	gint nstate = 0;
 	struct rspamd_url *query_url;
 	gchar *url_str;
 	gint rc;
 
 	if (url->querylen > 0) {
 
-		if (rspamd_url_find (pool, url->query, url->querylen, NULL, NULL,
-				&url_str, TRUE, &nstate)) {
+		if (rspamd_url_find (pool, url->query, url->querylen, &url_str, TRUE)) {
 			query_url = rspamd_mempool_alloc0 (pool,
 					sizeof (struct rspamd_url));
 
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 700ffe34b..94b7964fb 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -46,7 +46,7 @@
 #include "rspamd.h"
 #include "message.h"
 #include "http.h"
-#include "acism.h"
+#include "multipattern.h"
 #include "http_parser.h"
 
 typedef struct url_match_s {
@@ -65,7 +65,7 @@ typedef struct url_match_s {
 struct url_callback_data;
 
 struct url_matcher {
-	gchar *pattern;
+	const gchar *pattern;
 	const gchar *prefix;
 
 	gboolean (*start) (struct url_callback_data *cb,
@@ -77,6 +77,7 @@ struct url_matcher {
 			url_match_t *match);
 
 	gint flags;
+	gsize patlen;
 };
 
 static gboolean url_file_start (struct url_callback_data *cb,
@@ -114,38 +115,38 @@ static gboolean url_email_end (struct url_callback_data *cb,
 struct url_matcher static_matchers[] = {
 		/* Common prefixes */
 		{"file://",   "",          url_file_start,  url_file_end,
-				0},
+				0, 0},
 		{"ftp://",    "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"sftp://",   "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"http://",   "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"https://",  "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"news://",   "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"nntp://",   "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"telnet://", "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"webcal://", "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"mailto:",   "",          url_email_start, url_email_end,
-				0},
+				0, 0},
 		{"callto://", "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"h323:",     "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"sip:",      "",          url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"www.",      "http://",   url_web_start,   url_web_end,
-				0},
+				0, 0},
 		{"ftp.",      "ftp://",    url_web_start,   url_web_end,
-				URL_FLAG_NOHTML},
+				URL_FLAG_NOHTML, 0},
 		/* Likely emails */
 		{"@",         "mailto://", url_email_start, url_email_end,
-				URL_FLAG_NOHTML}
+				URL_FLAG_NOHTML, 0}
 };
 
 struct url_callback_data {
@@ -158,12 +159,13 @@ struct url_callback_data {
 	const gchar *fin;
 	const gchar *end;
 	const gchar *last_at;
+	url_insert_function func;
+	void *funcd;
 };
 
 struct url_match_scanner {
 	GArray *matchers;
-	GArray *patterns;
-	ac_trie_t *search_trie;
+	struct rspamd_multipattern *search_trie;
 };
 
 struct url_match_scanner *url_scanner = NULL;
@@ -339,9 +341,8 @@ rspamd_url_parse_tld_file (const gchar *fname,
 {
 	FILE *f;
 	struct url_matcher m;
-	ac_trie_pat_t pat;
 	gchar *linebuf = NULL, *p;
-	gsize buflen = 0, patlen;
+	gsize buflen = 0;
 	gssize r;
 	gint flags;
 
@@ -372,6 +373,7 @@ rspamd_url_parse_tld_file (const gchar *fname,
 
 		flags = URL_FLAG_NOHTML | URL_FLAG_TLD_MATCH;
 
+#ifndef WITH_HYPERSCAN
 		if (linebuf[0] == '*') {
 			flags |= URL_FLAG_STAR_MATCH;
 			p = strchr (linebuf, '.');
@@ -385,16 +387,16 @@ rspamd_url_parse_tld_file (const gchar *fname,
 		else {
 			p = linebuf;
 		}
+#else
+		p = linebuf;
+#endif
 
-		patlen = strlen (p);
-		m.pattern = g_malloc (patlen + 2);
-		m.pattern[0] = '.';
 		m.flags = flags;
-		pat.ptr = m.pattern;
-		pat.len = patlen + 1;
-		rspamd_strlcpy (&m.pattern[1], p, patlen + 1);
+		rspamd_multipattern_add_pattern (url_scanner->search_trie, p);
+		m.pattern = rspamd_multipattern_get_pattern (url_scanner->search_trie,
+				rspamd_multipattern_get_npatterns (url_scanner->search_trie) - 1);
+		m.patlen = strlen (m.pattern);
 		g_array_append_val (url_scanner->matchers, m);
-		g_array_append_val (url_scanner->patterns, pat);
 	}
 
 	free (linebuf);
@@ -405,26 +407,26 @@ static void
 rspamd_url_add_static_matchers (struct url_match_scanner *sc)
 {
 	gint n = G_N_ELEMENTS (static_matchers), i;
-	ac_trie_pat_t pat;
 
 	g_array_append_vals (sc->matchers, static_matchers, n);
 
 	for (i = 0; i < n; i++) {
-		pat.ptr = static_matchers[i].pattern;
-		pat.len = strlen (pat.ptr);
-		g_array_append_val (sc->patterns, pat);
+		rspamd_multipattern_add_pattern (sc->search_trie,
+				static_matchers[i].pattern); /* use the scanner passed in, like sc->matchers above */
 	}
 }
 
 void
 rspamd_url_init (const gchar *tld_file)
 {
+	GError *err = NULL;
+
 	if (url_scanner == NULL) {
 		url_scanner = g_malloc (sizeof (struct url_match_scanner));
 		url_scanner->matchers = g_array_sized_new (FALSE, TRUE,
 				sizeof (struct url_matcher), 512);
-		url_scanner->patterns = g_array_sized_new (FALSE, TRUE,
-				sizeof (ac_trie_pat_t), 512);
+		url_scanner->search_trie = rspamd_multipattern_create_sized (512,
+				RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE);
 		rspamd_url_add_static_matchers (url_scanner);
 
 		if (tld_file != NULL) {
@@ -435,12 +437,14 @@ rspamd_url_init (const gchar *tld_file)
 					"tld extension file is not specified, url matching is limited");
 		}
 
-		url_scanner->search_trie = acism_create (
-				(const ac_trie_pat_t *) url_scanner->patterns->data,
-				url_scanner->patterns->len);
+		if (!rspamd_multipattern_compile (url_scanner->search_trie, &err)) {
+			msg_err ("cannot compile tld patterns, url matching will be "
+					"broken completely: %e", err);
+			g_error_free (err);
+		}
 
-		msg_info ("initialized ac_trie of %ud elements",
-				url_scanner->patterns->len);
+		msg_info ("initialized trie of %ud elements",
+				url_scanner->matchers->len);
 	}
 }
 
@@ -1078,31 +1082,35 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len,
 #undef SET_U
 
 static gint
-rspamd_tld_trie_callback (int strnum, int textpos, void *context)
+rspamd_tld_trie_callback (struct rspamd_multipattern *mp,
+		guint strnum,
+		gint match_start,
+		gint match_pos,
+		const gchar *text,
+		gsize len,
+		void *context)
 {
 	struct url_matcher *matcher;
 	const gchar *start, *pos, *p;
 	struct rspamd_url *url = context;
-	ac_trie_pat_t *pat;
 	gint ndots = 1;
 
 	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
 			strnum);
-	pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
 
 	if (matcher->flags & URL_FLAG_STAR_MATCH) {
 		/* Skip one more tld component */
 		ndots = 2;
 	}
 
-	pos = url->host + textpos - pat->len;
+	pos = text + match_start;
 	p = pos - 1;
 	start = url->host;
 
-	if (*pos != '.' || textpos != (gint) url->hostlen) {
+	if (*pos != '.' || match_pos != (gint) url->hostlen) {
 		/* Something weird has been found */
-		if (textpos == (gint) url->hostlen - 1) {
-			pos = url->host + textpos;
+		if (match_pos == (gint) url->hostlen - 1) {
+			pos = url->host + match_pos;
 			if (*pos == '.') {
 				/* This is dot at the end of domain */
 				url->hostlen--;
@@ -1401,7 +1409,6 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 	const gchar *end;
 	guint i, complen, ret;
 	gsize unquoted_len = 0;
-	gint state = 0;
 
 	const struct {
 		enum rspamd_url_protocol proto;
@@ -1557,8 +1564,9 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 	}
 
 	/* Find TLD part */
-	if (acism_lookup (url_scanner->search_trie, uri->host, uri->hostlen,
-			rspamd_tld_trie_callback, uri, &state, true) == 0) {
+	if (rspamd_multipattern_lookup (url_scanner->search_trie,
+			uri->host, uri->hostlen,
+			rspamd_tld_trie_callback, uri, NULL) == 0) {
 		/* Ignore URL's without TLD if it is not a numeric URL */
 		if (!rspamd_url_is_ip (uri, pool)) {
 			return URI_ERRNO_TLD_MISSING;
@@ -1579,30 +1587,34 @@ struct tld_trie_cbdata {
 };
 
 static gint
-rspamd_tld_trie_find_callback (int strnum, int textpos, void *context)
+rspamd_tld_trie_find_callback (struct rspamd_multipattern *mp,
+		guint strnum,
+		gint match_start,
+		gint match_pos,
+		const gchar *text,
+		gsize len,
+		void *context)
 {
 	struct url_matcher *matcher;
 	const gchar *start, *pos, *p;
 	struct tld_trie_cbdata *cbdata = context;
-	ac_trie_pat_t *pat;
 	gint ndots = 1;
 
 	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
 			strnum);
-	pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
 
 	if (matcher->flags & URL_FLAG_STAR_MATCH) {
 		/* Skip one more tld component */
 		ndots = 2;
 	}
 
-	pos = cbdata->begin + textpos - pat->len;
+	pos = text + match_start;
 	p = pos - 1;
-	start = cbdata->begin;
+	start = text;
 
-	if (*pos != '.' || textpos != (gint)cbdata->len) {
+	if (*pos != '.' || match_pos != (gint)cbdata->len) {
 		/* Something weird has been found */
-		if (textpos != (gint)cbdata->len - 1) {
+		if (match_pos != (gint)cbdata->len - 1) {
 			/* Search more */
 			return 0;
 		}
@@ -1632,7 +1644,6 @@ gboolean
 rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out)
 {
 	struct tld_trie_cbdata cbdata;
-	gint state = 0;
 
 	g_assert (in != NULL);
 	g_assert (out != NULL);
@@ -1642,8 +1653,8 @@ rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out)
 	cbdata.len = inlen;
 	cbdata.out = out;
 
-	if (acism_lookup (url_scanner->search_trie, in, inlen,
-			rspamd_tld_trie_find_callback, &cbdata, &state, true) == 0) {
+	if (rspamd_multipattern_lookup (url_scanner->search_trie, in, inlen,
+			rspamd_tld_trie_find_callback, &cbdata, NULL) == 0) {
 		return FALSE;
 	}
 
@@ -1966,160 +1977,67 @@ url_email_end (struct url_callback_data *cb,
 	return FALSE;
 }
 
-void
-rspamd_url_text_extract (rspamd_mempool_t *pool,
-		struct rspamd_task *task,
-		struct mime_text_part *part,
-		gboolean is_html)
+static gboolean
+rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos,
+		const gchar *end)
 {
-	gint rc, state = 0;
-	gchar *url_str = NULL;
-	struct rspamd_url *url;
-	struct process_exception *ex;
-	const gchar *p, *end, *begin, *url_start, *url_end;
-
-	if (part->content == NULL || part->content->len == 0) {
-		msg_warn_task ("got empty text part");
-		return;
-	}
-
-	begin = part->content->data;
-	end = begin + part->content->len;
-	p = begin;
-	while (p < end) {
-		if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
-				is_html, &state)) {
-			if (url_str != NULL) {
-				url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
-				ex =
-						rspamd_mempool_alloc0 (pool,
-								sizeof (struct process_exception));
-				if (url != NULL) {
-					g_strstrip (url_str);
-					rc = rspamd_url_parse (url, url_str, strlen (url_str),
-							pool);
-					if (rc == URI_ERRNO_OK &&
-						url->hostlen > 0) {
-						ex->pos = url_start - begin;
-						ex->len = url_end - url_start;
-						if (url->protocol == PROTOCOL_MAILTO) {
-							if (url->userlen > 0) {
-								if (!g_hash_table_lookup (task->emails, url)) {
-									g_hash_table_insert (task->emails, url,
-											url);
-								}
-							}
-						}
-						else {
-							if (!g_hash_table_lookup (task->urls, url)) {
-								g_hash_table_insert (task->urls, url, url);
-							}
-						}
-						part->urls_offset = g_list_prepend (
-								part->urls_offset,
-								ex);
-
-						/* We also search the query for additional url inside */
-						if (url->querylen > 0) {
-							gint nstate = 0;
-							struct rspamd_url *query_url;
-
-							if (rspamd_url_find (pool,
-									url->query,
-									url->querylen,
-									NULL,
-									NULL,
-									&url_str,
-									is_html,
-									&nstate)) {
-
-								query_url = rspamd_mempool_alloc0 (pool,
-										sizeof (struct rspamd_url));
-								rc = rspamd_url_parse (query_url,
-										url_str,
-										strlen (url_str),
-										pool);
-								if (rc == URI_ERRNO_OK &&
-										url->hostlen > 0) {
-									msg_debug_task ("found url %s in query of url"
-											" %*s", url_str, url->querylen, url->query);
-
-									if (!g_hash_table_lookup (task->urls,
-											query_url)) {
-										g_hash_table_insert (task->urls,
-												query_url,
-												query_url);
-									}
-								}
-							}
+	if (matcher->flags & URL_FLAG_TLD_MATCH) {
+		/* Immediately check pos for valid chars */
+		if (pos < end) {
+			if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' &&
+					*pos != ':' && !is_url_end (*pos)) {
+				if (*pos == '.') {
+					/* We allow . at the end of the domain however */
+					pos++;
+					if (pos < end) {
+						if (!g_ascii_isspace (*pos) && *pos != '/' &&
+								*pos != '?' && *pos != ':' && !is_url_end (*pos)) {
+							return FALSE;
 						}
 					}
-					else if (rc != URI_ERRNO_OK) {
-						msg_info_task ("extract of url '%s' failed: %s",
-								url_str,
-								rspamd_url_strerror (rc));
-					}
+				}
+				else {
+					return FALSE;
 				}
 			}
 		}
-		else {
-			break;
-		}
-		p = url_end + 1;
-	}
-	/* Handle offsets of this part */
-	if (part->urls_offset != NULL) {
-		part->urls_offset = g_list_reverse (part->urls_offset);
-		rspamd_mempool_add_destructor (task->task_pool,
-				(rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
 	}
+
+	return TRUE;
 }
 
 static gint
-rspamd_url_trie_callback (int strnum, int textpos, void *context)
+rspamd_url_trie_callback (struct rspamd_multipattern *mp,
+		guint strnum,
+		gint match_start,
+		gint match_pos,
+		const gchar *text,
+		gsize len,
+		void *context)
 {
 	struct url_matcher *matcher;
 	url_match_t m;
 	const gchar *pos;
 	struct url_callback_data *cb = context;
-	ac_trie_pat_t *pat;
 
 	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
 			strnum);
+
 	if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
 		/* Do not try to match non-html like urls in html texts */
 		return 0;
 	}
 
-	if (matcher->flags & URL_FLAG_TLD_MATCH) {
-		/* Immediately check pos for valid chars */
-		pos = &cb->begin[textpos];
-		if (pos < cb->end) {
-			if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' &&
-				*pos != ':' && !is_url_end (*pos)) {
-				if (*pos == '.') {
-					/* We allow . at the end of the domain however */
-					pos++;
-					if (pos < cb->end) {
-						if (!g_ascii_isspace (*pos) && *pos != '/' &&
-							*pos != '?' && *pos != ':' && !is_url_end (*pos)) {
-							return 0;
-						}
-					}
-				}
-				else {
-					return 0;
-				}
-			}
-		}
-	}
+	pos = &cb->begin[match_pos];
 
-	pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
+	if (!rspamd_url_trie_is_match (matcher, pos, cb->end)) {
+		return 0;
+	}
 
 	m.pattern = matcher->pattern;
 	m.prefix = matcher->prefix;
 	m.add_prefix = FALSE;
-	pos = cb->begin + textpos - pat->len;
+	pos = cb->begin + match_start;
 
 	if (matcher->start (cb, pos, &m) &&
 			matcher->end (cb, pos, &m)) {
@@ -2155,14 +2073,11 @@ gboolean
 rspamd_url_find (rspamd_mempool_t *pool,
 		const gchar *begin,
 		gsize len,
-		const gchar **start,
-		const gchar **fin,
 		gchar **url_str,
-		gboolean is_html,
-		gint *statep)
+		gboolean is_html)
 {
 	struct url_callback_data cb;
-	gint ret, state;
+	gint ret;
 
 	memset (&cb, 0, sizeof (cb));
 	cb.begin = begin;
@@ -2170,27 +2085,10 @@ rspamd_url_find (rspamd_mempool_t *pool,
 	cb.is_html = is_html;
 	cb.pool = pool;
 
-	if (statep != NULL) {
-		state = *statep;
-	}
-	else {
-		state = 0;
-	}
-
-	ret = acism_lookup (url_scanner->search_trie, begin, len,
-			rspamd_url_trie_callback, &cb, &state, true);
-
-	if (statep) {
-		*statep = state;
-	}
+	ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
+			rspamd_url_trie_callback, &cb, NULL);
 
 	if (ret) {
-		if (start) {
-			*start = cb.start;
-		}
-		if (fin) {
-			*fin = cb.fin;
-		}
 		if (url_str) {
 			*url_str = cb.url_str;
 		}
@@ -2201,67 +2099,188 @@ rspamd_url_find (rspamd_mempool_t *pool,
 	return FALSE;
 }
 
-struct rspamd_url *
-rspamd_url_get_next (rspamd_mempool_t *pool,
-		const gchar *start, gchar const **pos, gint *statep)
+static gint
+rspamd_url_trie_generic_callback (struct rspamd_multipattern *mp,
+		guint strnum, /* index into url_scanner->matchers */
+		gint match_start, /* offset in text where the hit begins */
+		gint match_pos, /* offset in text just past the hit */
+		const gchar *text,
+		gsize len, /* length of text */
+		void *context) /* struct url_callback_data * */
 {
-	const gchar *p, *end, *url_start, *url_end;
-	gchar *url_str = NULL;
-	struct rspamd_url *new;
+	struct rspamd_url *url;
+	struct url_matcher *matcher;
+	url_match_t m;
+	const gchar *pos;
+	struct url_callback_data *cb = context;
 	gint rc;
+	rspamd_mempool_t *pool;
+	/* Lookup callback: build the url around a pattern hit, parse it, report it via cb->func */
+	matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
+			strnum);
+	pool = cb->pool;
 
-	end = start + strlen (start);
+	if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
+		/* Do not try to match non-html like urls in html texts */
+		return 0;
+	}
+
+	pos = text + match_pos; /* TLD hits are validated against the byte after the hit */
+
+	if (!rspamd_url_trie_is_match (matcher, pos, text + len)) {
+		return 0;
+	}
+
+	m.pattern = matcher->pattern;
+	m.prefix = matcher->prefix;
+	m.add_prefix = FALSE;
+	pos = text + match_start; /* start/end get the hit's start, as in rspamd_url_trie_callback */
+	if (matcher->start (cb, pos, &m) &&
+			matcher->end (cb, pos, &m)) {
+		if (m.add_prefix || matcher->prefix[0] != '\0') {
+			cb->len = m.m_len + strlen (matcher->prefix);
+			cb->url_str = rspamd_mempool_alloc (cb->pool, cb->len + 1);
+			cb->len = rspamd_snprintf (cb->url_str,
+					cb->len + 1,
+					"%s%*s",
+					m.prefix,
+					(gint)m.m_len,
+					m.m_begin);
+		}
+		else {
+			cb->url_str = rspamd_mempool_alloc (cb->pool, m.m_len + 1);
+			rspamd_strlcpy (cb->url_str, m.m_begin, m.m_len + 1);
+		}
 
-	if (pos == NULL || *pos == NULL) {
-		p = start;
+		cb->start = m.m_begin;
+		cb->fin = m.m_begin + m.m_len;
+		url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
+		g_strstrip (cb->url_str); /* may shorten the string, so measure it afterwards */
+		rc = rspamd_url_parse (url, cb->url_str, strlen (cb->url_str), pool);
+
+		if (rc == URI_ERRNO_OK && url->hostlen > 0) {
+			if (cb->func) {
+				cb->func (url, cb->start - text, cb->fin - text, cb->funcd);
+			}
+		}
+		else if (rc != URI_ERRNO_OK) {
+			msg_info_pool_check ("extract of url '%s' failed: %s",
+					cb->url_str,
+					rspamd_url_strerror (rc));
+		}
 	}
 	else {
-		p = *pos;
+		cb->url_str = NULL;
 	}
 
-	if (p < end) {
-		if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
-				FALSE, statep)) {
-			if (url_str != NULL) {
-				new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
+	/* Continue search */
+	return 0;
+}
 
-				if (new != NULL) {
-					g_strstrip (url_str);
-					rc = rspamd_url_parse (new, url_str, strlen (url_str),
-							pool);
-					if (rc == URI_ERRNO_OK &&
-						new->hostlen > 0) {
+struct rspamd_url_mimepart_cbdata {
+	struct rspamd_task *task;
+	struct mime_text_part *part;
+};
 
-						if (new->protocol == PROTOCOL_MAILTO) {
-							if (new->userlen > 0) {
-								return new;
-							}
-						}
-						else {
-							return new;
-						}
-					}
-					else if (rc != URI_ERRNO_OK) {
-						rspamd_default_log_function (G_LOG_LEVEL_INFO,
-								pool->tag.tagname, pool->tag.uid,
-								G_STRFUNC,
-								"extract of url '%s' failed: %s",
-								url_str,
-								rspamd_url_strerror (rc));
-					}
+static void
+rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
+		gsize end_offset, gpointer ud)
+{
+	struct rspamd_url_mimepart_cbdata *cbd = ud; /* filled in by rspamd_url_text_extract */
+	struct process_exception *ex;
+	struct rspamd_task *task;
+	gchar *url_str = NULL;
+	struct rspamd_url *query_url;
+	gint rc;
+
+	task = cbd->task;
+	ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+	/* Remember which span of the scanned text the url occupies */
+	ex->pos = start_offset;
+	ex->len = end_offset - start_offset;
+
+	if (url->protocol == PROTOCOL_MAILTO) {
+		if (url->userlen > 0) {
+			if (!g_hash_table_lookup (task->emails, url)) {
+				g_hash_table_insert (task->emails, url,
+						url);
+			}
+		}
+	}
+	else {
+		if (!g_hash_table_lookup (task->urls, url)) {
+			g_hash_table_insert (task->urls, url, url);
+		}
+	}
+
+	cbd->part->urls_offset = g_list_prepend (
+			cbd->part->urls_offset,
+			ex);
+
+	/* We also search the query for additional url inside */
+	if (url->querylen > 0) {
+		if (rspamd_url_find (task->task_pool,
+				url->query,
+				url->querylen,
+				&url_str,
+				IS_PART_HTML (cbd->part)) && url_str != NULL) {
+
+			query_url = rspamd_mempool_alloc0 (task->task_pool,
+					sizeof (struct rspamd_url));
+			rc = rspamd_url_parse (query_url,
+					url_str,
+					strlen (url_str),
+					task->task_pool);
+			/* The host check must apply to the nested url, not the outer one */
+			if (rc == URI_ERRNO_OK &&
+					query_url->hostlen > 0) {
+				msg_debug_task ("found url %s in query of url"
+						" %*s", url_str, url->querylen, url->query);
+
+				if (!g_hash_table_lookup (task->urls,
+						query_url)) {
+					g_hash_table_insert (task->urls,
+							query_url,
+							query_url);
 				}
 			}
 		}
-		p = url_end + 1;
+	}
+}
 
-		if (pos != NULL) {
-			*pos = p;
-		}
+void
+rspamd_url_text_extract (rspamd_mempool_t *pool,
+		struct rspamd_task *task,
+		struct mime_text_part *part,
+		gboolean is_html)
+{
+	struct url_callback_data cb;
+	struct rspamd_url_mimepart_cbdata mcbd;
+
+	if (part->content == NULL || part->content->len == 0) {
+		msg_warn_task ("got empty text part");
+		return;
 	}
 
-	return NULL;
-}
+	memset (&cb, 0, sizeof (cb));
+	cb.begin = part->content->data;
+	cb.end = part->content->data + part->content->len;
+	cb.is_html = is_html;
+	cb.pool = pool;
 
-/*
- * vi: ts=4
- */
+	mcbd.task = task;
+	mcbd.part = part;
+	cb.funcd = &mcbd;
+	cb.func = rspamd_url_text_part_callback;
+
+	rspamd_multipattern_lookup (url_scanner->search_trie, cb.begin,
+			part->content->len,
+			rspamd_url_trie_generic_callback, &cb, NULL);
+
+	/* Handle offsets of this part */
+	if (part->urls_offset != NULL) {
+		part->urls_offset = g_list_reverse (part->urls_offset);
+		rspamd_mempool_add_destructor (task->task_pool,
+				(rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+	}
+}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 7dfcb05af..3af11d638 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -106,26 +106,13 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
 gboolean rspamd_url_find (rspamd_mempool_t *pool,
 	const gchar *begin,
 	gsize len,
-	const gchar **start,
-	const gchar **end,
 	gchar **url_str,
-	gboolean is_html,
-	gint *statep);
+	gboolean is_html);
 /*
  * Return text representation of url parsing error
  */
 const gchar * rspamd_url_strerror (enum uri_errno err);
 
-/**
- * Convenience routine to extract urls from an arbitrarty text
- * @param pool
- * @param start
- * @param pos
- * @return url or NULL
- */
-struct rspamd_url *
-rspamd_url_get_next (rspamd_mempool_t *pool,
-		const gchar *start, gchar const **pos, gint *statep);
 
 /**
  * Find TLD for a specified host string
@@ -136,4 +123,7 @@ rspamd_url_get_next (rspamd_mempool_t *pool,
  */
 gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out);
 
+typedef void (*url_insert_function) (struct rspamd_url *url,
+		gsize start_offset, gsize end_offset, void *ud);
+
 #endif
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2016-04-14 13:07:40 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2016-04-14 13:07:40 +0100
commit	f2f1ea684b61abb0c810a0a1fb26c07b0e019d06 (patch)
tree	64d6096ed40f12eebdbd9cd5e76a5fdf7ad3bda9
parent	da58466e4e5f47ab916db936580ed67d75218c28 (diff)
download	rspamd-f2f1ea684b61abb0c810a0a1fb26c07b0e019d06.tar.gz rspamd-f2f1ea684b61abb0c810a0a1fb26c07b0e019d06.zip