From 7bae787900fea17ca82393886217c6287d7e8cea Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Date: Tue, 2 Jun 2009 19:32:34 +0400
Subject: [PATCH] * Rework url parsing algorithms * Adapt all parts of rspamd
 to the new url parser * Improve url-extracter utility by avoiding cut&paste of
 mime parsing * Small fixes to rspamc client * Bump version to 0.1.3

---
 CMakeLists.txt              |   2 +-
 rspamc.pl.in                |   3 +-
 src/lmtp.c                  |   4 +-
 src/main.h                  |   2 +-
 src/message.c               |  20 ++++--
 src/message.h               |   2 +
 src/plugins/regexp.c        |  51 ++++++++++++--
 src/plugins/surbl.c         |  55 ++++++++++-----
 src/protocol.c              |   8 ++-
 src/tokenizers/tokenizers.c |   6 +-
 src/url.c                   |  31 +++++----
 src/url.h                   |   3 +-
 src/worker.c                |   4 +-
 test/rspamd_url_test.c      |  48 +------------
 utils/url_extracter.c       | 130 ++++++------------------------------
 15 files changed, 164 insertions(+), 205 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f744fff03..2d55565e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ PROJECT(rspamd C)
 
 SET(RSPAMD_VERSION_MAJOR 0)
 SET(RSPAMD_VERSION_MINOR 1)
-SET(RSPAMD_VERSION_PATCH 2)
+SET(RSPAMD_VERSION_PATCH 3)
 
 SET(RSPAMD_VERSION         "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}")
 SET(RSPAMD_MASTER_SITE_URL "http://cebka.pp.ru/hg/rspamd")
diff --git a/rspamc.pl.in b/rspamc.pl.in
index 6681fd6d9..9d5712117 100755
--- a/rspamc.pl.in
+++ b/rspamc.pl.in
@@ -200,7 +200,7 @@ sub do_control_command {
 my %args;
 getopt('c:h:p:Ps:', \%args);
 my $cmd = shift;
-my $do_parse_config = 0;
+my $do_parse_config = 1;
 
 if (!defined ($cmd) || $cmd eq '') {
     HELP_MESSAGE();
@@ -230,6 +230,7 @@ if (defined ($args{h})) {
     $cfg{'host'} = $args{h};
     if ($args{h} =~ /^\/.*$/) {
         $cfg{'is_unix'} = 1;
+        $do_parse_config = 0;
     }
 }
 if (defined ($args{p})) {
diff --git a/src/lmtp.c b/src/lmtp.c
index d649d11c3..5f2878a08 100644
--- a/src/lmtp.c
+++ b/src/lmtp.c
@@ -109,6 +109,9 @@ free_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft)
 		else {
 			rspamd_remove_dispatcher (lmtp->task->dispatcher);
 		}
+		if (lmtp->task->urls) {
+			g_list_free (lmtp->task->urls);
+		}
 		close (lmtp->task->sock);
 		g_free (lmtp->task);
 		g_free (lmtp);
@@ -230,7 +233,6 @@ accept_socket (int fd, short what, void *arg)
 	new_task->state = READ_COMMAND;
 	new_task->sock = nfd;
 	new_task->cfg = worker->srv->cfg;
-	TAILQ_INIT (&new_task->urls);
 	new_task->task_pool = memory_pool_new (memory_pool_get_size ());
 	/* Add destructor for recipients list (it would be better to use anonymous function here */
 	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
diff --git a/src/main.h b/src/main.h
index c1a057824..924fe13ed 100644
--- a/src/main.h
+++ b/src/main.h
@@ -183,7 +183,7 @@ struct worker_task {
 	GList *parts;												/**< list of parsed parts							*/
 	GList *text_parts;											/**< list of text parts								*/
 	char *raw_headers;											/**< list of raw headers							*/
-	TAILQ_HEAD (uriq, uri) urls;								/**< list of parsed urls							*/
+	GList *urls;												/**< list of parsed urls							*/
 	GHashTable *results;										/**< hash table of metric_result indexed by 
 																 *    metric's name									*/
 	GHashTable *re_cache;										/**< cache for matched or not matched regexps		*/
diff --git a/src/message.c b/src/message.c
index 510d407e9..f664122d0 100644
--- a/src/message.c
+++ b/src/message.c
@@ -301,28 +301,36 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 		text_part->is_balanced = TRUE;
 		text_part->html_nodes = NULL;
 		text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
+		text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+		text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
 
 		if (text_part->html_nodes == NULL) {
-			url_parse_text (task, text_part->orig, FALSE);
+			url_parse_text (task->task_pool, task, text_part, FALSE);
 		}
 		else {
-			url_parse_text (task, text_part->orig, TRUE);
+			url_parse_text (task->task_pool, task, text_part, FALSE);
+			url_parse_text (task->task_pool, task, text_part, TRUE);
 		}
 
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->html_urls);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
 	} 
 	else if (g_mime_content_type_is_type (type, "text", "plain")) {
 		msg_debug ("mime_foreach_callback: got urls from text/plain part");
-		url_parse_text (task, part_content, FALSE);
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
 		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
 		text_part->content = text_part->orig;
 		text_part->is_html = FALSE;
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+		text_part->html_urls = NULL;
+		text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+		url_parse_text (task->task_pool, task, text_part, FALSE);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
 	}
 }
 
@@ -489,8 +497,10 @@ process_message (struct worker_task *task)
 	if (task->rcpts) {
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts);
 	}
-
-	task->worker->srv->stat->messages_scanned ++;
+	
+	if (task->worker) {
+		task->worker->srv->stat->messages_scanned ++;
+	}
 
 	/* free the parser (and the stream) */
 	g_object_unref (parser);
diff --git a/src/message.h b/src/message.h
index 72711638f..9a63b0824 100644
--- a/src/message.h
+++ b/src/message.h
@@ -21,6 +21,8 @@ struct mime_text_part {
 	GByteArray *orig;
 	GByteArray *content;
 	GNode *html_nodes;
+	GTree *urls;
+	GTree *html_urls;
 	fuzzy_hash_t *fuzzy;
 };
 
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 1b109bf55..fbe36f291 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -237,6 +237,28 @@ find_raw_header_pos (const char *headers, const char *headerv)
 	return NULL;
 }
 
+struct url_regexp_param {
+	struct worker_task *task;
+	GRegex *regexp;
+	struct rspamd_regexp *re;
+	gboolean found;
+};
+
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+	struct url_regexp_param *param = data;
+	struct uri *url = value;
+
+	if (g_regex_match (param->regexp, struri (url), 0, NULL) == TRUE) {
+		task_cache_add (param->task, param->re, 1);
+		param->found = TRUE;
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
 static gsize
 process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 {
@@ -244,7 +266,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 	struct mime_text_part *part, *tmp;
 	GList *cur, *headerlist;
 	GRegex *regexp;
-	struct uri *url;
+	struct url_regexp_param callback_param;
 	int r;
 	
 
@@ -333,13 +355,30 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 			return 0;
 		case REGEXP_URL:
 			msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text);
-			TAILQ_FOREACH (url, &task->urls, next) {
-				if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) {
-					task_cache_add (task, re, 1);
-					return 1;
+			cur = g_list_first (task->text_parts);
+			while (cur) {
+				part = (struct mime_text_part *)cur->data;
+				if (part->is_raw) {
+					regexp = re->raw_regexp;
 				}
+				else {
+					regexp = re->regexp;
+				}
+				callback_param.task = task;
+				callback_param.regexp = regexp;
+				callback_param.re = re;
+				callback_param.found = FALSE;
+				if (part->urls) {
+					g_tree_foreach (part->urls, tree_url_callback, &callback_param);
+				}
+				if (part->html_urls && callback_param.found == FALSE) {
+					g_tree_foreach (part->html_urls, tree_url_callback, &callback_param);
+				}
+				cur = g_list_next (cur);
+			}
+			if (callback_param.found == FALSE) {
+				task_cache_add (task, re, 0);
 			}
-			task_cache_add (task, re, 0);
 			return 0;
 		case REGEXP_RAW_HEADER:
 			msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text);
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index d44c7fbe7..1514cae1c 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -28,6 +28,7 @@
 
 #include "../config.h"
 #include "../util.h"
+#include "../message.h"
 #include <evdns.h>
 
 #include "surbl.h"
@@ -647,29 +648,53 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree *url_
 	event_add (&param->ev, &timeout);
 }
 
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+	struct redirector_param *param = data;
+	struct uri *url = value;
+
+	msg_debug ("surbl_test_url: check url %s", struri (url));
+	if (surbl_module_ctx->use_redirector) {
+		register_redirector_call (url, param->task, param->tree);
+		param->task->save.saved++;
+	}
+	else {
+		if (param->task->worker->srv->cfg->memcached_servers_num > 0) {
+			register_memcached_call (url, param->task, param->tree);
+			param->task->save.saved++;
+		}
+		else {
+			make_surbl_requests (url, param->task, param->tree);
+		}
+	}
+
+	return FALSE;
+}
+
 static int 
 surbl_test_url (struct worker_task *task)
 {
-	struct uri *url;
 	GTree *url_tree;
+	GList *cur;
+	struct mime_text_part *part;
+	struct redirector_param param;
 
 	url_tree = g_tree_new ((GCompareFunc)g_ascii_strcasecmp);
-
-	TAILQ_FOREACH (url, &task->urls, next) {
-		msg_debug ("surbl_test_url: check url %s", struri (url));
-		if (surbl_module_ctx->use_redirector) {
-			register_redirector_call (url, task, url_tree);
-			task->save.saved++;
+	
+	param.tree = url_tree;
+	param.task = task;
+	cur = task->text_parts;
+	while (cur) {
+		part = cur->data;
+		if (part->urls) {
+			g_tree_foreach (part->urls, tree_url_callback, &param); 
 		}
-		else {
-			if (task->worker->srv->cfg->memcached_servers_num > 0) {
-				register_memcached_call (url, task, url_tree);
-				task->save.saved++;
-			}
-			else {
-				make_surbl_requests (url, task, url_tree);
-			}
+		if (part->html_urls) {
+			g_tree_foreach (part->html_urls, tree_url_callback, &param); 
 		}
+
+		cur = g_list_next (cur);
 	}
 
 	memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, url_tree);
diff --git a/src/protocol.c b/src/protocol.c
index 2cd025287..c551bb783 100644
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -369,10 +369,13 @@ show_url_header (struct worker_task *task)
 	int r = 0;
 	char outbuf[OUTBUFSIZ], c;
 	struct uri *url;
+	GList *cur;
 	f_str_t host;
 
 	r = snprintf (outbuf, sizeof (outbuf), "Urls: ");
-	TAILQ_FOREACH (url, &task->urls, next) {
+	cur = task->urls;
+	while (cur) {
+		url = cur->data;
 		host.begin = url->host;
 		host.len = url->hostlen;
 		/* Skip long hosts to avoid protocol coollisions */
@@ -386,7 +389,7 @@ show_url_header (struct worker_task *task)
 			r = 0;
 		}
 		/* Write url host to buf */
-		if (TAILQ_NEXT (url, next) != NULL) {
+		if (g_list_next (cur) != NULL) {
 			c = *(host.begin + host.len);
 			*(host.begin + host.len) = '\0';
 			msg_debug ("show_url_header: write url: %s", host.begin);
@@ -400,6 +403,7 @@ show_url_header (struct worker_task *task)
 			r += snprintf (outbuf + r, sizeof (outbuf) - r, "%s" CRLF, host.begin);
 			*(host.begin + host.len) = c;
 		}
+		cur = g_list_next (cur);
 	}
 	rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE);
 }
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 1b47289a2..4527e699c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -122,6 +122,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
 	token_node_t *new = NULL;
 	f_str_t url_domain;
 	struct uri *url;
+	GList *cur;
 	uint32_t h;
 
 	if (*tree == NULL) {
@@ -129,7 +130,9 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
 		memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
 	}
 	
-	TAILQ_FOREACH (url, &task->urls, next) {
+	cur = task->urls;
+	while (cur) {
+		url = cur->data;
 		url_domain.begin = url->host;
 		url_domain.len = url->hostlen;
 		new = memory_pool_alloc (pool, sizeof (token_node_t));
@@ -139,6 +142,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
 		if (g_tree_lookup (*tree, new) == NULL) {
 			g_tree_insert (*tree, new, new);
 		}
+		cur = g_list_next (cur);
 	}
 
 	return TRUE;
diff --git a/src/url.c b/src/url.c
index 875358ae0..cc58a2caf 100644
--- a/src/url.c
+++ b/src/url.c
@@ -23,10 +23,11 @@
  */
 
 #include "config.h"
+#include "url.h"
 #include "util.h"
 #include "fstring.h"
 #include "main.h"
-#include "url.h"
+#include "message.h"
 
 #define POST_CHAR 1
 #define POST_CHAR_S "\001"
@@ -853,7 +854,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
 }
 
 void 
-url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
+url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
 {
 	GMatchInfo *info;
 	GError *err = NULL;
@@ -861,26 +862,32 @@ url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
 	char *url_str = NULL;
 	struct uri *new;
 	
-	if (!content->data || content->len == 0) {
+	if (!part->orig->data || part->orig->len == 0) {
 		msg_warn ("url_parse_text: got empty text part");
 		return;
 	}
 
 	if (url_init () == 0) {
-		rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+		if (is_html) {
+			rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err);
+		}
+		else {
+			rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err);
+		
+		}
 		if (rc) {
 			while (g_match_info_matches (info)) {
 				url_str = g_match_info_fetch (info, is_html ? 1 : 0);
 				msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
 				if (url_str != NULL) {
-					new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-					if (new != NULL) {
-						rc = parse_uri (new, url_str, task->task_pool);
-						if (rc != URI_ERRNO_OK) {
-							msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
-						}
-						if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-							TAILQ_INSERT_TAIL (&task->urls, new, next);
+					if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+						new = memory_pool_alloc (pool, sizeof (struct uri));
+						if (new != NULL) {
+							rc = parse_uri (new, url_str, pool);
+							if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+								g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+								task->urls = g_list_prepend (task->urls, new);
+							}
 						}
 					}
 				}
diff --git a/src/url.h b/src/url.h
index 2c367548d..7860f544a 100644
--- a/src/url.h
+++ b/src/url.h
@@ -6,6 +6,7 @@
 #include "mem_pool.h"
 
 struct worker_task;
+struct mime_text_part;
 
 struct uri {
 	/* The start of the uri (and thus start of the protocol string). */
@@ -73,7 +74,7 @@ enum protocol {
 
 #define struri(uri) ((uri)->string)
 
-void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
+void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
 enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
 
 #endif
diff --git a/src/worker.c b/src/worker.c
index 3f223241b..af4575919 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -130,6 +130,9 @@ free_task (struct worker_task *task, gboolean is_soft)
 		if (task->text_parts) {
 			g_list_free (task->text_parts);
 		}
+		if (task->urls) {
+			g_list_free (task->urls);
+		}
 		memory_pool_delete (task->task_pool);
 		if (is_soft) {
 			/* Plan dispatcher shutdown */
@@ -287,7 +290,6 @@ accept_socket (int fd, short what, void *arg)
 #endif
 	io_tv.tv_sec = WORKER_IO_TIMEOUT;
 	io_tv.tv_usec = 0;
-	TAILQ_INIT (&new_task->urls);
 	new_task->task_pool = memory_pool_new (memory_pool_get_size ());
 	/* Add destructor for recipients list (it would be better to use anonymous function here */
 	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c
index 36c9e439b..f716c1ab0 100644
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -70,51 +70,5 @@ const char *test_html = "<some_tag>This is test file with <a href=\"http://micro
 void
 rspamd_url_test_func ()
 {
-	GByteArray *text, *html;
-	struct worker_task task;
-	struct uri *url;
-	int i = 0;
-
-	text = g_byte_array_new();
-	text->data = (gchar *)test_text;
-	text->len = strlen (test_text);
-	html = g_byte_array_new();
-	html->data = (gchar *)test_html;
-	html->len = strlen (test_html);
-	bzero (&task, sizeof (task));
-	TAILQ_INIT (&task.urls);
-	task.task_pool = memory_pool_new (8192);
-	
-	g_test_timer_start ();
-	g_test_message ("Testing text URL regexp parser");
-	url_parse_text (&task, text, FALSE);
-
-	TAILQ_FOREACH (url, &task.urls, next) {
-		msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
-		i ++;
-	}
-
-	while (!TAILQ_EMPTY (&task.urls)) {
-		url = TAILQ_FIRST (&task.urls);
-		TAILQ_REMOVE (&task.urls, url, next);
-	}
-	/* g_assert (i == 39); */
-
-	msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ());
-	i = 0;
-	g_test_timer_start ();
-	g_test_message ("Testing html URL regexp parser");
-	url_parse_text (&task, html, TRUE);
-
-	TAILQ_FOREACH (url, &task.urls, next) {
-		msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
-		i ++;
-	}
-
-	while (!TAILQ_EMPTY (&task.urls)) {
-		url = TAILQ_FIRST (&task.urls);
-		TAILQ_REMOVE (&task.urls, url, next);
-	}
-	g_assert (i == 1);
-	msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ());
+	/* XXX: maybe write test for this */
 }
diff --git a/utils/url_extracter.c b/utils/url_extracter.c
index ac8e8be4e..97bf72c47 100644
--- a/utils/url_extracter.c
+++ b/utils/url_extracter.c
@@ -24,107 +24,24 @@
 #include "../src/main.h"
 #include "../src/cfg_file.h"
 #include "../src/url.h"
+#include "../src/util.h"
 #include "../src/message.h"
 
 rspamd_hash_t *counters = NULL;
-#ifdef GMIME24
-static void
-mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
-#else
-static void
-mime_foreach_callback (GMimeObject *part, gpointer user_data)
-#endif
-{
-	struct worker_task *task = (struct worker_task *)user_data;
-	struct mime_part *mime_part;
-	GMimeContentType *type;
-	GMimeDataWrapper *wrapper;
-	GMimeStream *part_stream;
-	GByteArray *part_content;
-	GMimeMessage *message;
-	
-	/* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-	
-	/* find out what class 'part' is... */
-	if (GMIME_IS_MESSAGE_PART (part)) {
-		/* message/rfc822 or message/news */
-		printf ("Message part found\n");
-		
-		/* g_mime_message_foreach_part() won't descend into
-                   child message parts, so if we want to count any
-                   subparts of this child message, we'll have to call
-                   g_mime_message_foreach_part() again here. */
-		
-		message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-#ifdef GMIME24
-		g_mime_message_foreach (message, mime_foreach_callback, task);
-#else
-		g_mime_message_foreach_part (message, mime_foreach_callback, task);
-#endif
-		g_object_unref (message);
-	} else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
-		/* message/partial */
-		printf ("Message/partial part found\n");
-		
-		/* this is an incomplete message part, probably a
-                   large message that the sender has broken into
-                   smaller parts and is sending us bit by bit. we
-                   could save some info about it so that we could
-                   piece this back together again once we get all the
-                   parts? */
-	} else if (GMIME_IS_MULTIPART (part)) {
-		/* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-		
-		/* we'll get to finding out if this is a signed/encrypted multipart later... */
-	} else if (GMIME_IS_PART (part)) {
-		printf ("Normal part found\n");
-		/* a normal leaf part, could be text/plain or image/jpeg etc */
-		wrapper = g_mime_part_get_content_object (GMIME_PART (part));
-		if (wrapper != NULL) {
-			part_stream = g_mime_stream_mem_new ();
-			printf ("Get new wrapper object for normal part\n");
-			if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
-				printf ("Write wrapper to stream\n");
-				part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
-#ifdef GMIME24
-				type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
-#else
-				type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-#endif
-				mime_part = g_malloc (sizeof (struct mime_part));
-				mime_part->type = type;
-				mime_part->content = part_content;
-				task->parts =  g_list_prepend (task->parts, mime_part);
-				if (g_mime_content_type_is_type (type, "text", "html")) {
-					printf ("Found text/html part\n");
-					url_parse_text (task, part_content, TRUE);
-				} 
-				else if (g_mime_content_type_is_type (type, "text", "plain")) {
-					printf ("Found text/plain part\n");
-					url_parse_text (task, part_content, FALSE);
-				}
-			}
-		}
-	} else {
-		g_assert_not_reached ();
-	}
-}
-
 
 int
 main (int argc, char **argv)
 {
-	GMimeMessage *message;
-	GMimeParser *parser;
-	GMimeStream *stream;
 	struct worker_task task;
 	struct uri *url;
 	char *buf = NULL;
 	size_t pos = 0, size = 65535;
+	GList *cur;
 	
 	g_mem_set_vtable(glib_mem_profiler_table);
 	g_mime_init (0);
 	bzero (&task, sizeof (struct worker_task));
+	task.task_pool = memory_pool_new (memory_pool_get_size ());
 	
 	/* Preallocate buffer */
 	buf = g_malloc (size);
@@ -137,32 +54,23 @@ main (int argc, char **argv)
 			buf = g_realloc (buf, size);
 		}
 	}
-
-	stream = g_mime_stream_mem_new_with_buffer (buf, pos);
-	/* create a new parser object to parse the stream */
-	parser = g_mime_parser_new_with_stream (stream);
-
-	/* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
-	g_object_unref (stream);
-
-	/* parse the message from the stream */
-	message = g_mime_parser_construct_message (parser);
 	
-	task.message = message;
-	task.task_pool = memory_pool_new (memory_pool_get_size ());
-	TAILQ_INIT (&task.urls);
-
-	/* free the parser (and the stream) */
-	g_object_unref (parser);
+	task.cfg = memory_pool_alloc0 (task.task_pool, sizeof (struct config_file));
+	task.cfg->log_level = G_LOG_LEVEL_CRITICAL;
+	task.cfg->log_fd = STDERR_FILENO;
+	g_log_set_default_handler (file_log_function, task.cfg);
 
-#ifdef GMIME24
-	g_mime_message_foreach (message, mime_foreach_callback, &task);
-#else
-	g_mime_message_foreach_part (message, mime_foreach_callback, &task);
-#endif
-
-	TAILQ_FOREACH (url, &task.urls, next) {
-		printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data);
+	task.msg = memory_pool_alloc (task.task_pool, sizeof (f_str_t));
+	task.msg->begin = buf;
+	task.msg->len = pos;
+	process_message (&task);
+	
+	cur = task.urls;
+	while (cur) {
+		url = cur->data;
+		printf ("%s\n", struri (url));
+		cur = g_list_next (cur);
 	}
-
+	
+	return 0;
 }
-- 
2.39.5