about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-04-17 19:04:04 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-04-17 19:04:04 +0400
commit c0cd8e414ad614ed7a3d9c2122abffc78db7602c (patch)
tree 760d166ae0d815a76c41e6237bcf4965dcfea08b
parent 83f0dbe021888839dbcc3b3d6dff48b8da21cffb (diff)
download rspamd-c0cd8e414ad614ed7a3d9c2122abffc78db7602c.tar.gz
rspamd-c0cd8e414ad614ed7a3d9c2122abffc78db7602c.zip
* Fix URL extraction, avoid code duplication
-rw-r--r-- src/message.c 4
-rw-r--r-- src/url.c 108
-rw-r--r-- src/url.h 3
-rw-r--r-- test/rspamd_url_test.c 4
-rw-r--r-- utils/url_extracter.c 4
5 files changed, 37 insertions, 86 deletions
diff --git a/src/message.c b/src/message.c
index 14f9245cb..4d6ba95d1 100644
--- a/src/message.c
+++ b/src/message.c
@@ -278,7 +278,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
msg_debug ("mime_foreach_callback: got urls from text/html part");
- url_parse_html (task, part_content);
+ url_parse_text (task, part_content, TRUE);
text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
@@ -290,7 +290,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
}
else if (g_mime_content_type_is_type (type, "text", "plain")) {
msg_debug ("mime_foreach_callback: got urls from text/plain part");
- url_parse_text (task, part_content);
+ url_parse_text (task, part_content, FALSE);
text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
diff --git a/src/url.c b/src/url.c
index c9a187536..875358ae0 100644
--- a/src/url.c
+++ b/src/url.c
@@ -853,11 +853,10 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
}
void
-url_parse_text (struct worker_task *task, GByteArray *content)
+url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
{
GMatchInfo *info;
GError *err = NULL;
- int pos = 0, start;
int rc;
char *url_str = NULL;
struct uri *new;
@@ -868,86 +867,39 @@ url_parse_text (struct worker_task *task, GByteArray *content)
}
if (url_init () == 0) {
- do {
- rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err);
- if (rc) {
- if (g_match_info_matches (info)) {
- g_match_info_fetch_pos (info, 0, &start, &pos);
- url_str = g_match_info_fetch (info, 0);
- msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
- if (url_str != NULL) {
- new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
- if (new != NULL) {
- rc = parse_uri (new, url_str, task->task_pool);
- if (rc != URI_ERRNO_OK) {
- msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
- }
- if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
- TAILQ_INSERT_TAIL (&task->urls, new, next);
- }
+ rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+ if (rc) {
+ while (g_match_info_matches (info)) {
+ url_str = g_match_info_fetch (info, is_html ? 1 : 0);
+ msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
+ if (url_str != NULL) {
+ new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+ if (new != NULL) {
+ rc = parse_uri (new, url_str, task->task_pool);
+ if (rc != URI_ERRNO_OK) {
+ msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
}
- }
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
- }
- }
- else if (err != NULL) {
- msg_debug ("url_parse_text: error matching regexp: %s", err->message);
- g_free (err);
- }
- else {
- msg_debug ("url_parse_text: cannot find url pattern in given string");
- }
- g_match_info_free (info);
- } while (rc);
- }
-}
-
-void
-url_parse_html (struct worker_task *task, GByteArray *content)
-{
- GMatchInfo *info;
- GError *err = NULL;
- int pos = 0, start;
- int rc;
- char *url_str = NULL;
- struct uri *new;
-
- if (!content->data || content->len == 0) {
- msg_warn ("url_parse_text: got empty text part");
- return;
- }
-
- if (url_init () == 0) {
- do {
- rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err);
- if (rc) {
- if (g_match_info_matches (info)) {
- g_match_info_fetch_pos (info, 0, &start, &pos);
- url_str = g_match_info_fetch (info, 1);
- msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
- if (url_str != NULL) {
- new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
- if (new != NULL) {
- rc = parse_uri (new, url_str, task->task_pool);
- if (rc != URI_ERRNO_OK) {
- msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
- }
- if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
- TAILQ_INSERT_TAIL (&task->urls, new, next);
- }
+ if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+ TAILQ_INSERT_TAIL (&task->urls, new, next);
}
}
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
}
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
+ /* Get next match */
+ g_match_info_next (info, &err);
}
- else if (err) {
- msg_debug ("url_parse_html: error matching regexp: %s", err->message);
- g_free (err);
- }
- else {
- msg_debug ("url_parse_html: cannot find url pattern in given string");
- }
- g_match_info_free (info);
- } while (rc);
+ }
+ else if (err != NULL) {
+ msg_debug ("url_parse_text: error matching regexp: %s", err->message);
+ g_free (err);
+ }
+ else {
+ msg_debug ("url_parse_text: cannot find url pattern in given string");
+ }
+ g_match_info_free (info);
}
}
+
+/*
+ * vi: ts=4
+ */
diff --git a/src/url.h b/src/url.h
index 08a5cb0a6..2c367548d 100644
--- a/src/url.h
+++ b/src/url.h
@@ -73,8 +73,7 @@ enum protocol {
#define struri(uri) ((uri)->string)
-void url_parse_html (struct worker_task *task, GByteArray *part);
-void url_parse_text (struct worker_task *task, GByteArray *part);
+void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
#endif
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c
index 808659757..36c9e439b 100644
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -87,7 +87,7 @@ rspamd_url_test_func ()
g_test_timer_start ();
g_test_message ("Testing text URL regexp parser");
- url_parse_text (&task, text);
+ url_parse_text (&task, text, FALSE);
TAILQ_FOREACH (url, &task.urls, next) {
msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
@@ -104,7 +104,7 @@ rspamd_url_test_func ()
i = 0;
g_test_timer_start ();
g_test_message ("Testing html URL regexp parser");
- url_parse_html (&task, html);
+ url_parse_text (&task, html, TRUE);
TAILQ_FOREACH (url, &task.urls, next) {
msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
diff --git a/utils/url_extracter.c b/utils/url_extracter.c
index 8e16a689f..1b98fa67e 100644
--- a/utils/url_extracter.c
+++ b/utils/url_extracter.c
@@ -96,11 +96,11 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
task->parts = g_list_prepend (task->parts, mime_part);
if (g_mime_content_type_is_type (type, "text", "html")) {
printf ("Found text/html part\n");
- url_parse_html (task, part_content);
+ url_parse_text (task, part_content, TRUE);
}
else if (g_mime_content_type_is_type (type, "text", "plain")) {
printf ("Found text/plain part\n");
- url_parse_text (task, part_content);
+ url_parse_text (task, part_content, FALSE);
}
}
}