diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-09-10 17:58:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-09-10 17:58:54 +0400 |
commit | fe7ebd5be9d1352f7a3727bfbfabb6453321e269 (patch) | |
tree | 3888171e8e16362cecbefca6ad6548243ba9a8b9 /utils/url_extracter.c | |
parent | 57e765ce78c6b9746cddab4c3415dc386552151f (diff) | |
download | rspamd-fe7ebd5be9d1352f7a3727bfbfabb6453321e269.tar.gz rspamd-fe7ebd5be9d1352f7a3727bfbfabb6453321e269.zip |
* Add utility for extracting urls from message
* Rework build system
Diffstat (limited to 'utils/url_extracter.c')
-rw-r--r-- | utils/url_extracter.c | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/utils/url_extracter.c b/utils/url_extracter.c new file mode 100644 index 000000000..dc2138e6f --- /dev/null +++ b/utils/url_extracter.c @@ -0,0 +1,140 @@ +#include <sys/types.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/param.h> + +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <syslog.h> +#include <fcntl.h> +#include <stdlib.h> + +#include <gmime/gmime.h> + +#include "../config.h" +#include "../main.h" +#include "../cfg_file.h" +#include "../url.h" + +static void +mime_foreach_callback (GMimeObject *part, gpointer user_data) +{ + struct worker_task *task = (struct worker_task *)user_data; + struct mime_part *mime_part; + GMimeContentType *type; + GMimeDataWrapper *wrapper; + GMimeStream *part_stream; + GByteArray *part_content; + + /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ + + /* find out what class 'part' is... */ + if (GMIME_IS_MESSAGE_PART (part)) { + /* message/rfc822 or message/news */ + printf ("Message part found\n"); + GMimeMessage *message; + + /* g_mime_message_foreach_part() won't descend into + child message parts, so if we want to count any + subparts of this child message, we'll have to call + g_mime_message_foreach_part() again here. */ + + message = g_mime_message_part_get_message ((GMimeMessagePart *) part); + g_mime_message_foreach_part (message, mime_foreach_callback, task); + g_object_unref (message); + } else if (GMIME_IS_MESSAGE_PARTIAL (part)) { + /* message/partial */ + printf ("Message/partial part found\n"); + + /* this is an incomplete message part, probably a + large message that the sender has broken into + smaller parts and is sending us bit by bit. we + could save some info about it so that we could + piece this back together again once we get all the + parts? */ + } else if (GMIME_IS_MULTIPART (part)) { + /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ + + /* we'll get to finding out if this is a signed/encrypted multipart later... */ + } else if (GMIME_IS_PART (part)) { + printf ("Normal part found\n"); + /* a normal leaf part, could be text/plain or image/jpeg etc */ + wrapper = g_mime_part_get_content_object (GMIME_PART (part)); + if (wrapper != NULL) { + part_stream = g_mime_stream_mem_new (); + printf ("Get new wrapper object for normal part\n"); + if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { + printf ("Write wrapper to stream\n"); + part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream)); + type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part)); + mime_part = g_malloc (sizeof (struct mime_part)); + mime_part->type = type; + mime_part->content = part_content; + TAILQ_INSERT_TAIL (&task->parts, mime_part, next); + if (g_mime_content_type_is_type (type, "text", "html")) { + printf ("Found text/html part\n"); + url_parse_html (task, part_content); + } + else if (g_mime_content_type_is_type (type, "text", "plain")) { + printf ("Found text/plain part\n"); + url_parse_text (task, part_content); + } + } + } + } else { + g_assert_not_reached (); + } +} + + +int +main (int argc, char **argv) +{ + GMimeMessage *message; + GMimeParser *parser; + GMimeStream *stream; + struct worker_task task; + struct uri *url; + char *buf = NULL; + size_t pos = 0, size = 65535; + + g_mem_set_vtable(glib_mem_profiler_table); + g_mime_init (0); + + /* Preallocate buffer */ + buf = g_malloc (size); + + while (!feof (stdin)) { + *(buf + pos) = getchar (); + pos ++; + if (pos == size) { + size *= 2; + buf = g_realloc (buf, size); + } + } + + stream = g_mime_stream_mem_new_with_buffer (buf, pos); + /* create a new parser object to parse the stream */ + parser = g_mime_parser_new_with_stream (stream); + + /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */ + g_object_unref (stream); + + /* parse the message from the stream */ + message = g_mime_parser_construct_message (parser); + + task.message = message; + TAILQ_INIT (&task.urls); + TAILQ_INIT (&task.parts); + + /* free the parser (and the stream) */ + g_object_unref (parser); + + g_mime_message_foreach_part (message, mime_foreach_callback, &task); + + TAILQ_FOREACH (url, &task.urls, next) { + printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data); + } + +} |