diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-06-02 19:32:34 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-06-02 19:32:34 +0400 |
commit | 7bae787900fea17ca82393886217c6287d7e8cea (patch) | |
tree | 4f358b3624d7b2ba6c86a25057d4ba7db10965ae /utils/url_extracter.c | |
parent | 4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff) | |
download | rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip |
* Rework url parsing algorithms
* Adapt all parts of rspamd to the new url parser
* Improve url-extracter utility by avoiding cut&paste of mime parsing
* Small fixes to rspamc client
* Bump version to 0.1.3
Diffstat (limited to 'utils/url_extracter.c')
-rw-r--r-- | utils/url_extracter.c | 130 |
1 files changed, 19 insertions, 111 deletions
diff --git a/utils/url_extracter.c b/utils/url_extracter.c index ac8e8be4e..97bf72c47 100644 --- a/utils/url_extracter.c +++ b/utils/url_extracter.c @@ -24,107 +24,24 @@ #include "../src/main.h" #include "../src/cfg_file.h" #include "../src/url.h" +#include "../src/util.h" #include "../src/message.h" rspamd_hash_t *counters = NULL; -#ifdef GMIME24 -static void -mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data) -#else -static void -mime_foreach_callback (GMimeObject *part, gpointer user_data) -#endif -{ - struct worker_task *task = (struct worker_task *)user_data; - struct mime_part *mime_part; - GMimeContentType *type; - GMimeDataWrapper *wrapper; - GMimeStream *part_stream; - GByteArray *part_content; - GMimeMessage *message; - - /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ - - /* find out what class 'part' is... */ - if (GMIME_IS_MESSAGE_PART (part)) { - /* message/rfc822 or message/news */ - printf ("Message part found\n"); - - /* g_mime_message_foreach_part() won't descend into - child message parts, so if we want to count any - subparts of this child message, we'll have to call - g_mime_message_foreach_part() again here. */ - - message = g_mime_message_part_get_message ((GMimeMessagePart *) part); -#ifdef GMIME24 - g_mime_message_foreach (message, mime_foreach_callback, task); -#else - g_mime_message_foreach_part (message, mime_foreach_callback, task); -#endif - g_object_unref (message); - } else if (GMIME_IS_MESSAGE_PARTIAL (part)) { - /* message/partial */ - printf ("Message/partial part found\n"); - - /* this is an incomplete message part, probably a - large message that the sender has broken into - smaller parts and is sending us bit by bit. we - could save some info about it so that we could - piece this back together again once we get all the - parts? 
*/ - } else if (GMIME_IS_MULTIPART (part)) { - /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ - - /* we'll get to finding out if this is a signed/encrypted multipart later... */ - } else if (GMIME_IS_PART (part)) { - printf ("Normal part found\n"); - /* a normal leaf part, could be text/plain or image/jpeg etc */ - wrapper = g_mime_part_get_content_object (GMIME_PART (part)); - if (wrapper != NULL) { - part_stream = g_mime_stream_mem_new (); - printf ("Get new wrapper object for normal part\n"); - if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { - printf ("Write wrapper to stream\n"); - part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream)); -#ifdef GMIME24 - type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part)); -#else - type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part)); -#endif - mime_part = g_malloc (sizeof (struct mime_part)); - mime_part->type = type; - mime_part->content = part_content; - task->parts = g_list_prepend (task->parts, mime_part); - if (g_mime_content_type_is_type (type, "text", "html")) { - printf ("Found text/html part\n"); - url_parse_text (task, part_content, TRUE); - } - else if (g_mime_content_type_is_type (type, "text", "plain")) { - printf ("Found text/plain part\n"); - url_parse_text (task, part_content, FALSE); - } - } - } - } else { - g_assert_not_reached (); - } -} - int main (int argc, char **argv) { - GMimeMessage *message; - GMimeParser *parser; - GMimeStream *stream; struct worker_task task; struct uri *url; char *buf = NULL; size_t pos = 0, size = 65535; + GList *cur; g_mem_set_vtable(glib_mem_profiler_table); g_mime_init (0); bzero (&task, sizeof (struct worker_task)); + task.task_pool = memory_pool_new (memory_pool_get_size ()); /* Preallocate buffer */ buf = g_malloc (size); @@ -137,32 +54,23 @@ main (int argc, char **argv) buf = g_realloc (buf, size); } } - - 
stream = g_mime_stream_mem_new_with_buffer (buf, pos); - /* create a new parser object to parse the stream */ - parser = g_mime_parser_new_with_stream (stream); - - /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */ - g_object_unref (stream); - - /* parse the message from the stream */ - message = g_mime_parser_construct_message (parser); - task.message = message; - task.task_pool = memory_pool_new (memory_pool_get_size ()); - TAILQ_INIT (&task.urls); - - /* free the parser (and the stream) */ - g_object_unref (parser); + task.cfg = memory_pool_alloc0 (task.task_pool, sizeof (struct config_file)); + task.cfg->log_level = G_LOG_LEVEL_CRITICAL; + task.cfg->log_fd = STDERR_FILENO; + g_log_set_default_handler (file_log_function, task.cfg); -#ifdef GMIME24 - g_mime_message_foreach (message, mime_foreach_callback, &task); -#else - g_mime_message_foreach_part (message, mime_foreach_callback, &task); -#endif - - TAILQ_FOREACH (url, &task.urls, next) { - printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data); + task.msg = memory_pool_alloc (task.task_pool, sizeof (f_str_t)); + task.msg->begin = buf; + task.msg->len = pos; + process_message (&task); + + cur = task.urls; + while (cur) { + url = cur->data; + printf ("%s\n", struri (url)); + cur = g_list_next (cur); } - + + return 0; } |