5 files changed, 37 insertions, 86 deletions
diff --git a/src/message.c b/src/message.c
index 14f9245cb..4d6ba95d1 100644
--- a/src/message.c
+++ b/src/message.c
@@ -278,7 +278,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 
 	if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
 		msg_debug ("mime_foreach_callback: got urls from text/html part");
-		url_parse_html (task, part_content);
+		url_parse_text (task, part_content, TRUE);
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
 		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
@@ -290,7 +290,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 	} 
 	else if (g_mime_content_type_is_type (type, "text", "plain")) {
 		msg_debug ("mime_foreach_callback: got urls from text/plain part");
-		url_parse_text (task, part_content);
+		url_parse_text (task, part_content, FALSE);
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
 		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
diff --git a/src/url.c b/src/url.c
index c9a187536..875358ae0 100644
--- a/src/url.c
+++ b/src/url.c
@@ -853,11 +853,10 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
 }
 
 void 
-url_parse_text (struct worker_task *task, GByteArray *content)
+url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
 {
 	GMatchInfo *info;
 	GError *err = NULL;
-	int pos = 0, start;
 	int rc;
 	char *url_str = NULL;
 	struct uri *new;
@@ -868,86 +867,39 @@ url_parse_text (struct worker_task *task, GByteArray *content)
 	}
 
 	if (url_init () == 0) {
-		do {
-			rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err);
-			if (rc) {
-				if (g_match_info_matches (info)) {
-					g_match_info_fetch_pos (info, 0, &start, &pos);
-					url_str = g_match_info_fetch (info, 0);
-					msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
-					if (url_str != NULL) {
-						new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-						if (new != NULL) {
-							rc = parse_uri (new, url_str, task->task_pool);
-							if (rc != URI_ERRNO_OK) {
-								msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
-							}
-							if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-								TAILQ_INSERT_TAIL (&task->urls, new, next);
-							}
+		rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+		if (rc) {
+			while (g_match_info_matches (info)) {
+				url_str = g_match_info_fetch (info, is_html ? 1 : 0);
+				msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
+				if (url_str != NULL) {
+					new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+					if (new != NULL) {
+						rc = parse_uri (new, url_str, task->task_pool);
+						if (rc != URI_ERRNO_OK) {
+							msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
 						}
-					}
-					memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
-				}
-			}
-			else if (err != NULL) {
-				msg_debug ("url_parse_text: error matching regexp: %s", err->message);
-				g_free (err);
-			}
-			else {
-				msg_debug ("url_parse_text: cannot find url pattern in given string");
-			}
-			g_match_info_free (info);
-		} while (rc);
-	}
-}
-
-void 
-url_parse_html (struct worker_task *task, GByteArray *content)
-{
-	GMatchInfo *info;
-	GError *err = NULL;
-	int pos = 0, start;
-	int rc;
-	char *url_str = NULL;
-	struct uri *new;
-
-	if (!content->data || content->len == 0) {
-		msg_warn ("url_parse_text: got empty text part");
-		return;
-	}
-
-	if (url_init () == 0) {
-		do {
-			rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err);
-			if (rc) {
-				if (g_match_info_matches (info)) {
-					g_match_info_fetch_pos (info, 0, &start, &pos);
-					url_str = g_match_info_fetch (info, 1);
-					msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
-					if (url_str != NULL) {
-						new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-						if (new != NULL) {
-							rc = parse_uri (new, url_str, task->task_pool);
-							if (rc != URI_ERRNO_OK) {
-								msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
-							}
-							if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-								TAILQ_INSERT_TAIL (&task->urls, new, next);
-							}
+						if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+							TAILQ_INSERT_TAIL (&task->urls, new, next);
 						}
 					}
-					memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
 				}
+				memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
+				/* Advance to the next match; a failure sets err and ends the loop */
+				g_match_info_next (info, &err);
 			}
-			else if (err) {
-				msg_debug ("url_parse_html: error matching regexp: %s", err->message);
-				g_free (err);
-			}
-			else {
-				msg_debug ("url_parse_html: cannot find url pattern in given string");
-			}
-			g_match_info_free (info);
-		} while (rc);
+		}
+		else if (err == NULL) {
+			msg_debug ("url_parse_text: cannot find url pattern in given string");
+		}
+		if (err != NULL) {
+			msg_debug ("url_parse_text: error matching regexp: %s", err->message);
+			g_error_free (err);
+		}
+		g_match_info_free (info);
 	}
 }
+
+/*
+ * vi: ts=4
+ */
diff --git a/src/url.h b/src/url.h
index 08a5cb0a6..2c367548d 100644
--- a/src/url.h
+++ b/src/url.h
@@ -73,8 +73,7 @@ enum protocol {
 
 #define struri(uri) ((uri)->string)
 
-void url_parse_html (struct worker_task *task, GByteArray *part);
-void url_parse_text (struct worker_task *task, GByteArray *part);
+void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
 enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
 
 #endif
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c
index 808659757..36c9e439b 100644
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -87,7 +87,7 @@ rspamd_url_test_func ()
 	
 	g_test_timer_start ();
 	g_test_message ("Testing text URL regexp parser");
-	url_parse_text (&task, text);
+	url_parse_text (&task, text, FALSE);
 
 	TAILQ_FOREACH (url, &task.urls, next) {
 		msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
@@ -104,7 +104,7 @@ rspamd_url_test_func ()
 	i = 0;
 	g_test_timer_start ();
 	g_test_message ("Testing html URL regexp parser");
-	url_parse_html (&task, html);
+	url_parse_text (&task, html, TRUE);
 
 	TAILQ_FOREACH (url, &task.urls, next) {
 		msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
diff --git a/utils/url_extracter.c b/utils/url_extracter.c
index 8e16a689f..1b98fa67e 100644
--- a/utils/url_extracter.c
+++ b/utils/url_extracter.c
@@ -96,11 +96,11 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
 				task->parts =  g_list_prepend (task->parts, mime_part);
 				if (g_mime_content_type_is_type (type, "text", "html")) {
 					printf ("Found text/html part\n");
-					url_parse_html (task, part_content);
+					url_parse_text (task, part_content, TRUE);
 				} 
 				else if (g_mime_content_type_is_type (type, "text", "plain")) {
 					printf ("Found text/plain part\n");
-					url_parse_text (task, part_content);
+					url_parse_text (task, part_content, FALSE);
 				}
 			}
 		}