* Rework url parsing algorithms

* Adapt all parts of rspamd to the new url parser
* Improve url-extracter utility by avoiding cut&paste of mime parsing
* Small fixes to rspamc client
* Bump version to 0.1.3
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-06-02 19:32:34 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-06-02 19:32:34 +0400
commit: 7bae787900fea17ca82393886217c6287d7e8cea (patch)
tree: 4f358b3624d7b2ba6c86a25057d4ba7db10965ae /src
parent: 4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff)
download: rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz
rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip
11 files changed, 141 insertions, 45 deletions
diff --git a/src/lmtp.c b/src/lmtp.c
index d649d11c3..5f2878a08 100644
--- a/src/lmtp.c
+++ b/src/lmtp.c
@@ -109,6 +109,9 @@ free_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft)
 		else {
 			rspamd_remove_dispatcher (lmtp->task->dispatcher);
 		}
+		if (lmtp->task->urls) {
+			g_list_free (lmtp->task->urls);
+		}
 		close (lmtp->task->sock);
 		g_free (lmtp->task);
 		g_free (lmtp);
@@ -230,7 +233,6 @@ accept_socket (int fd, short what, void *arg)
 	new_task->state = READ_COMMAND;
 	new_task->sock = nfd;
 	new_task->cfg = worker->srv->cfg;
-	TAILQ_INIT (&new_task->urls);
 	new_task->task_pool = memory_pool_new (memory_pool_get_size ());
 	/* Add destructor for recipients list (it would be better to use anonymous function here */
 	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
diff --git a/src/main.h b/src/main.h
index c1a057824..924fe13ed 100644
--- a/src/main.h
+++ b/src/main.h
@@ -183,7 +183,7 @@ struct worker_task {
 	GList *parts;												/**< list of parsed parts							*/
 	GList *text_parts;											/**< list of text parts								*/
 	char *raw_headers;											/**< list of raw headers							*/
-	TAILQ_HEAD (uriq, uri) urls;								/**< list of parsed urls							*/
+	GList *urls;												/**< list of parsed urls							*/
 	GHashTable *results;										/**< hash table of metric_result indexed by 
 																 *    metric's name									*/
 	GHashTable *re_cache;										/**< cache for matched or not matched regexps		*/
diff --git a/src/message.c b/src/message.c
index 510d407e9..f664122d0 100644
--- a/src/message.c
+++ b/src/message.c
@@ -301,28 +301,36 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 		text_part->is_balanced = TRUE;
 		text_part->html_nodes = NULL;
 		text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
+		text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+		text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
 
 		if (text_part->html_nodes == NULL) {
-			url_parse_text (task, text_part->orig, FALSE);
+			url_parse_text (task->task_pool, task, text_part, FALSE);
 		}
 		else {
-			url_parse_text (task, text_part->orig, TRUE);
+			url_parse_text (task->task_pool, task, text_part, FALSE);
+			url_parse_text (task->task_pool, task, text_part, TRUE);
 		}
 
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->html_urls);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
 	} 
 	else if (g_mime_content_type_is_type (type, "text", "plain")) {
 		msg_debug ("mime_foreach_callback: got urls from text/plain part");
-		url_parse_text (task, part_content, FALSE);
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
 		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
 		text_part->content = text_part->orig;
 		text_part->is_html = FALSE;
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+		text_part->html_urls = NULL;
+		text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+		url_parse_text (task->task_pool, task, text_part, FALSE);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
 	}
 }
 
@@ -489,8 +497,10 @@ process_message (struct worker_task *task)
 	if (task->rcpts) {
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts);
 	}
-
-	task->worker->srv->stat->messages_scanned ++;
+	
+	if (task->worker) {
+		task->worker->srv->stat->messages_scanned ++;
+	}
 
 	/* free the parser (and the stream) */
 	g_object_unref (parser);
diff --git a/src/message.h b/src/message.h
index 72711638f..9a63b0824 100644
--- a/src/message.h
+++ b/src/message.h
@@ -21,6 +21,8 @@ struct mime_text_part {
 	GByteArray *orig;
 	GByteArray *content;
 	GNode *html_nodes;
+	GTree *urls;
+	GTree *html_urls;
 	fuzzy_hash_t *fuzzy;
 };
 
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 1b109bf55..fbe36f291 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -237,6 +237,28 @@ find_raw_header_pos (const char *headers, const char *headerv)
 	return NULL;
 }
 
+struct url_regexp_param {
+	struct worker_task *task;
+	GRegex *regexp;
+	struct rspamd_regexp *re;
+	gboolean found;
+};
+
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+	struct url_regexp_param *param = data;
+	struct uri *url = value;
+
+	if (g_regex_match (param->regexp, struri (url), 0, NULL) == TRUE) {
+		task_cache_add (param->task, param->re, 1);
+		param->found = TRUE;
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
 static gsize
 process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 {
@@ -244,7 +266,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 	struct mime_text_part *part, *tmp;
 	GList *cur, *headerlist;
 	GRegex *regexp;
-	struct uri *url;
+	struct url_regexp_param callback_param;
 	int r;
 	
 
@@ -333,13 +355,30 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 			return 0;
 		case REGEXP_URL:
 			msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text);
-			TAILQ_FOREACH (url, &task->urls, next) {
-				if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) {
-					task_cache_add (task, re, 1);
-					return 1;
+			cur = g_list_first (task->text_parts);
+			while (cur) {
+				part = (struct mime_text_part *)cur->data;
+				if (part->is_raw) {
+					regexp = re->raw_regexp;
 				}
+				else {
+					regexp = re->regexp;
+				}
+				callback_param.task = task;
+				callback_param.regexp = regexp;
+				callback_param.re = re;
+				callback_param.found = FALSE;
+				if (part->urls) {
+					g_tree_foreach (part->urls, tree_url_callback, &callback_param);
+				}
+				if (part->html_urls && callback_param.found == FALSE) {
+					g_tree_foreach (part->html_urls, tree_url_callback, &callback_param);
+				}
+				cur = g_list_next (cur);
+			}
+			if (callback_param.found == FALSE) {
+				task_cache_add (task, re, 0);
 			}
-			task_cache_add (task, re, 0);
 			return 0;
 		case REGEXP_RAW_HEADER:
 			msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text);
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index d44c7fbe7..1514cae1c 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -28,6 +28,7 @@
 
 #include "../config.h"
 #include "../util.h"
+#include "../message.h"
 #include <evdns.h>
 
 #include "surbl.h"
@@ -647,29 +648,53 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree *url_
 	event_add (&param->ev, &timeout);
 }
 
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+	struct redirector_param *param = data;
+	struct uri *url = value;
+
+	msg_debug ("surbl_test_url: check url %s", struri (url));
+	if (surbl_module_ctx->use_redirector) {
+		register_redirector_call (url, param->task, param->tree);
+		param->task->save.saved++;
+	}
+	else {
+		if (param->task->worker->srv->cfg->memcached_servers_num > 0) {
+			register_memcached_call (url, param->task, param->tree);
+			param->task->save.saved++;
+		}
+		else {
+			make_surbl_requests (url, param->task, param->tree);
+		}
+	}
+
+	return FALSE;
+}
+
 static int 
 surbl_test_url (struct worker_task *task)
 {
-	struct uri *url;
 	GTree *url_tree;
+	GList *cur;
+	struct mime_text_part *part;
+	struct redirector_param param;
 
 	url_tree = g_tree_new ((GCompareFunc)g_ascii_strcasecmp);
-
-	TAILQ_FOREACH (url, &task->urls, next) {
-		msg_debug ("surbl_test_url: check url %s", struri (url));
-		if (surbl_module_ctx->use_redirector) {
-			register_redirector_call (url, task, url_tree);
-			task->save.saved++;
+	
+	param.tree = url_tree;
+	param.task = task;
+	cur = task->text_parts;
+	while (cur) {
+		part = cur->data;
+		if (part->urls) {
+			g_tree_foreach (part->urls, tree_url_callback, &param); 
 		}
-		else {
-			if (task->worker->srv->cfg->memcached_servers_num > 0) {
-				register_memcached_call (url, task, url_tree);
-				task->save.saved++;
-			}
-			else {
-				make_surbl_requests (url, task, url_tree);
-			}
+		if (part->html_urls) {
+			g_tree_foreach (part->html_urls, tree_url_callback, &param); 
 		}
+
+		cur = g_list_next (cur);
 	}
 
 	memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, url_tree);
diff --git a/src/protocol.c b/src/protocol.c
index 2cd025287..c551bb783 100644
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -369,10 +369,13 @@ show_url_header (struct worker_task *task)
 	int r = 0;
 	char outbuf[OUTBUFSIZ], c;
 	struct uri *url;
+	GList *cur;
 	f_str_t host;
 
 	r = snprintf (outbuf, sizeof (outbuf), "Urls: ");
-	TAILQ_FOREACH (url, &task->urls, next) {
+	cur = task->urls;
+	while (cur) {
+		url = cur->data;
 		host.begin = url->host;
 		host.len = url->hostlen;
 		/* Skip long hosts to avoid protocol coollisions */
@@ -386,7 +389,7 @@ show_url_header (struct worker_task *task)
 			r = 0;
 		}
 		/* Write url host to buf */
-		if (TAILQ_NEXT (url, next) != NULL) {
+		if (g_list_next (cur) != NULL) {
 			c = *(host.begin + host.len);
 			*(host.begin + host.len) = '\0';
 			msg_debug ("show_url_header: write url: %s", host.begin);
@@ -400,6 +403,7 @@ show_url_header (struct worker_task *task)
 			r += snprintf (outbuf + r, sizeof (outbuf) - r, "%s" CRLF, host.begin);
 			*(host.begin + host.len) = c;
 		}
+		cur = g_list_next (cur);
 	}
 	rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE);
 }
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 1b47289a2..4527e699c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -122,6 +122,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
 	token_node_t *new = NULL;
 	f_str_t url_domain;
 	struct uri *url;
+	GList *cur;
 	uint32_t h;
 
 	if (*tree == NULL) {
@@ -129,7 +130,9 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
 		memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
 	}
 	
-	TAILQ_FOREACH (url, &task->urls, next) {
+	cur = task->urls;
+	while (cur) {
+		url = cur->data;
 		url_domain.begin = url->host;
 		url_domain.len = url->hostlen;
 		new = memory_pool_alloc (pool, sizeof (token_node_t));
@@ -139,6 +142,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
 		if (g_tree_lookup (*tree, new) == NULL) {
 			g_tree_insert (*tree, new, new);
 		}
+		cur = g_list_next (cur);
 	}
 
 	return TRUE;
diff --git a/src/url.c b/src/url.c
index 875358ae0..cc58a2caf 100644
--- a/src/url.c
+++ b/src/url.c
@@ -23,10 +23,11 @@
  */
 
 #include "config.h"
+#include "url.h"
 #include "util.h"
 #include "fstring.h"
 #include "main.h"
-#include "url.h"
+#include "message.h"
 
 #define POST_CHAR 1
 #define POST_CHAR_S "\001"
@@ -853,7 +854,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
 }
 
 void 
-url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
+url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
 {
 	GMatchInfo *info;
 	GError *err = NULL;
@@ -861,26 +862,32 @@ url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
 	char *url_str = NULL;
 	struct uri *new;
 	
-	if (!content->data || content->len == 0) {
+	if (!part->orig->data || part->orig->len == 0) {
 		msg_warn ("url_parse_text: got empty text part");
 		return;
 	}
 
 	if (url_init () == 0) {
-		rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+		if (is_html) {
+			rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err);
+		}
+		else {
+			rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err);
+		
+		}
 		if (rc) {
 			while (g_match_info_matches (info)) {
 				url_str = g_match_info_fetch (info, is_html ? 1 : 0);
 				msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
 				if (url_str != NULL) {
-					new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-					if (new != NULL) {
-						rc = parse_uri (new, url_str, task->task_pool);
-						if (rc != URI_ERRNO_OK) {
-							msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
-						}
-						if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-							TAILQ_INSERT_TAIL (&task->urls, new, next);
+					if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+						new = memory_pool_alloc (pool, sizeof (struct uri));
+						if (new != NULL) {
+							rc = parse_uri (new, url_str, pool);
+							if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+								g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+								task->urls = g_list_prepend (task->urls, new);
+							}
 						}
 					}
 				}
diff --git a/src/url.h b/src/url.h
index 2c367548d..7860f544a 100644
--- a/src/url.h
+++ b/src/url.h
@@ -6,6 +6,7 @@
 #include "mem_pool.h"
 
 struct worker_task;
+struct mime_text_part;
 
 struct uri {
 	/* The start of the uri (and thus start of the protocol string). */
@@ -73,7 +74,7 @@ enum protocol {
 
 #define struri(uri) ((uri)->string)
 
-void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
+void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
 enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
 
 #endif
diff --git a/src/worker.c b/src/worker.c
index 3f223241b..af4575919 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -130,6 +130,9 @@ free_task (struct worker_task *task, gboolean is_soft)
 		if (task->text_parts) {
 			g_list_free (task->text_parts);
 		}
+		if (task->urls) {
+			g_list_free (task->urls);
+		}
 		memory_pool_delete (task->task_pool);
 		if (is_soft) {
 			/* Plan dispatcher shutdown */
@@ -287,7 +290,6 @@ accept_socket (int fd, short what, void *arg)
 #endif
 	io_tv.tv_sec = WORKER_IO_TIMEOUT;
 	io_tv.tv_usec = 0;
-	TAILQ_INIT (&new_task->urls);
 	new_task->task_pool = memory_pool_new (memory_pool_get_size ());
 	/* Add destructor for recipients list (it would be better to use anonymous function here */
 	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-06-02 19:32:34 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-06-02 19:32:34 +0400
commit	7bae787900fea17ca82393886217c6287d7e8cea (patch)
tree	4f358b3624d7b2ba6c86a25057d4ba7db10965ae /src
parent	4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff)
download	rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip