diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-06-02 19:32:34 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-06-02 19:32:34 +0400 |
commit | 7bae787900fea17ca82393886217c6287d7e8cea (patch) | |
tree | 4f358b3624d7b2ba6c86a25057d4ba7db10965ae /src/message.c | |
parent | 4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff) | |
download | rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip |
* Rework url parsing algorithms
* Adapt all parts of rspamd to the new url parser
* Improve url-extracter utility by avoiding copy-and-paste of the mime parsing code
* Small fixes to rspamc client
* Bump version to 0.1.3
Diffstat (limited to 'src/message.c')
-rw-r--r-- | src/message.c | 20 |
1 files changed, 15 insertions, 5 deletions
diff --git a/src/message.c b/src/message.c index 510d407e9..f664122d0 100644 --- a/src/message.c +++ b/src/message.c @@ -301,28 +301,36 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part->is_balanced = TRUE; text_part->html_nodes = NULL; text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL); + text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); + text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); if (text_part->html_nodes == NULL) { - url_parse_text (task, text_part->orig, FALSE); + url_parse_text (task->task_pool, task, text_part, FALSE); } else { - url_parse_text (task, text_part->orig, TRUE); + url_parse_text (task->task_pool, task, text_part, FALSE); + url_parse_text (task->task_pool, task, text_part, TRUE); } text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->html_urls); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls); task->text_parts = g_list_prepend (task->text_parts, text_part); } else if (g_mime_content_type_is_type (type, "text", "plain")) { msg_debug ("mime_foreach_callback: got urls from text/plain part"); - url_parse_text (task, part_content, FALSE); text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); text_part->content = text_part->orig; text_part->is_html = FALSE; text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + text_part->html_urls = NULL; + text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); + url_parse_text (task->task_pool, task, text_part, FALSE); task->text_parts = g_list_prepend (task->text_parts, 
text_part); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls); } } @@ -489,8 +497,10 @@ process_message (struct worker_task *task) if (task->rcpts) { memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts); } - - task->worker->srv->stat->messages_scanned ++; + + if (task->worker) { + task->worker->srv->stat->messages_scanned ++; + } /* free the parser (and the stream) */ g_object_unref (parser); |