diff options
-rw-r--r-- | src/message.c | 4 | ||||
-rw-r--r-- | src/url.c | 108 | ||||
-rw-r--r-- | src/url.h | 3 | ||||
-rw-r--r-- | test/rspamd_url_test.c | 4 | ||||
-rw-r--r-- | utils/url_extracter.c | 4 |
5 files changed, 37 insertions, 86 deletions
diff --git a/src/message.c b/src/message.c index 14f9245cb..4d6ba95d1 100644 --- a/src/message.c +++ b/src/message.c @@ -278,7 +278,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) { msg_debug ("mime_foreach_callback: got urls from text/html part"); - url_parse_html (task, part_content); + url_parse_text (task, part_content, TRUE); text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); @@ -290,7 +290,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont } else if (g_mime_content_type_is_type (type, "text", "plain")) { msg_debug ("mime_foreach_callback: got urls from text/plain part"); - url_parse_text (task, part_content); + url_parse_text (task, part_content, FALSE); text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); @@ -853,11 +853,10 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool) } void -url_parse_text (struct worker_task *task, GByteArray *content) +url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html) { GMatchInfo *info; GError *err = NULL; - int pos = 0, start; int rc; char *url_str = NULL; struct uri *new; @@ -868,86 +867,39 @@ url_parse_text (struct worker_task *task, GByteArray *content) } if (url_init () == 0) { - do { - rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err); - if (rc) { - if (g_match_info_matches (info)) { - g_match_info_fetch_pos (info, 0, &start, &pos); - url_str = g_match_info_fetch (info, 0); - msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str); - if (url_str != NULL) { - new = memory_pool_alloc (task->task_pool, sizeof (struct uri)); - if (new != NULL) { - rc = parse_uri (new, url_str, task->task_pool); - if (rc != URI_ERRNO_OK) { - msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc)); - } - if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { - TAILQ_INSERT_TAIL (&task->urls, new, next); - } + rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err); + if (rc) { + while (g_match_info_matches (info)) { + url_str = g_match_info_fetch (info, is_html ? 1 : 0); + msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off"); + if (url_str != NULL) { + new = memory_pool_alloc (task->task_pool, sizeof (struct uri)); + if (new != NULL) { + rc = parse_uri (new, url_str, task->task_pool); + if (rc != URI_ERRNO_OK) { + msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc)); } - } - memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str); - } - } - else if (err != NULL) { - msg_debug ("url_parse_text: error matching regexp: %s", err->message); - g_free (err); - } - else { - msg_debug ("url_parse_text: cannot find url pattern in given string"); - } - g_match_info_free (info); - } while (rc); - } -} - -void -url_parse_html (struct worker_task *task, GByteArray *content) -{ - GMatchInfo *info; - GError *err = NULL; - int pos = 0, start; - int rc; - char *url_str = NULL; - struct uri *new; - - if (!content->data || content->len == 0) { - msg_warn ("url_parse_text: got empty text part"); - return; - } - - if (url_init () == 0) { - do { - rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err); - if (rc) { - if (g_match_info_matches (info)) { - g_match_info_fetch_pos (info, 0, &start, &pos); - url_str = g_match_info_fetch (info, 1); - msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str); - if (url_str != NULL) { - new = memory_pool_alloc (task->task_pool, sizeof (struct uri)); - if (new != NULL) { - rc = parse_uri (new, url_str, task->task_pool); - if (rc != URI_ERRNO_OK) { - msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc)); - } - if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { - TAILQ_INSERT_TAIL (&task->urls, new, next); - } + if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { + TAILQ_INSERT_TAIL (&task->urls, new, next); } } - memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str); } + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str); + /* Get next match */ + g_match_info_next (info, &err); } - else if (err) { - msg_debug ("url_parse_html: error matching regexp: %s", err->message); - g_free (err); - } - else { - msg_debug ("url_parse_html: cannot find url pattern in given string"); - } - g_match_info_free (info); - } while (rc); + } + else if (err != NULL) { + msg_debug ("url_parse_text: error matching regexp: %s", err->message); + g_free (err); + } + else { + msg_debug ("url_parse_text: cannot find url pattern in given string"); + } + g_match_info_free (info); } } + +/* + * vi: ts=4 + */ @@ -73,8 +73,7 @@ enum protocol { #define struri(uri) ((uri)->string) -void url_parse_html (struct worker_task *task, GByteArray *part); -void url_parse_text (struct worker_task *task, GByteArray *part); +void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html); enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool); #endif diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c index 808659757..36c9e439b 100644 --- a/test/rspamd_url_test.c +++ b/test/rspamd_url_test.c @@ -87,7 +87,7 @@ rspamd_url_test_func () g_test_timer_start (); g_test_message ("Testing text URL regexp parser"); - url_parse_text (&task, text); + url_parse_text (&task, text, FALSE); TAILQ_FOREACH (url, &task.urls, next) { msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data); @@ -104,7 +104,7 @@ rspamd_url_test_func () i = 0; g_test_timer_start (); g_test_message ("Testing html URL regexp parser"); - url_parse_html (&task, html); + url_parse_text (&task, html, TRUE); TAILQ_FOREACH (url, &task.urls, next) { msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data); diff --git a/utils/url_extracter.c b/utils/url_extracter.c index 8e16a689f..1b98fa67e 100644 --- a/utils/url_extracter.c +++ b/utils/url_extracter.c @@ -96,11 +96,11 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data) task->parts = g_list_prepend (task->parts, mime_part); if (g_mime_content_type_is_type (type, "text", "html")) { printf ("Found text/html part\n"); - url_parse_html (task, part_content); + url_parse_text (task, part_content, TRUE); } else if (g_mime_content_type_is_type (type, "text", "plain")) { printf ("Found text/plain part\n"); - url_parse_text (task, part_content); + url_parse_text (task, part_content, FALSE); } } } |