* Add utility for extracting urls from message

* Rework build system
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-09-10 17:58:54 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-09-10 17:58:54 +0400
commit: fe7ebd5be9d1352f7a3727bfbfabb6453321e269 (patch)
tree: 3888171e8e16362cecbefca6ad6548243ba9a8b9 /utils
parent: 57e765ce78c6b9746cddab4c3415dc386552151f (diff)
download: rspamd-fe7ebd5be9d1352f7a3727bfbfabb6453321e269.tar.gz
rspamd-fe7ebd5be9d1352f7a3727bfbfabb6453321e269.zip
2 files changed, 152 insertions, 0 deletions
diff --git a/utils/Makefile.in b/utils/Makefile.in
new file mode 100644
index 000000000..86f3eab60
--- /dev/null
+++ b/utils/Makefile.in
@@ -0,0 +1,12 @@
+.PHONY: all clean dist-clean
+# Default target: build the stand-alone URL extraction utility.
+all: url_extracter
+# Link this directory's objects with url.o and util.o from the parent directory.
+url_extracter: $(OBJECTS) ../url.o ../util.o
+	$(CC) $(PTHREAD_LDFLAGS) $(LDFLAGS) $(OBJECTS) ../url.o ../util.o $(LIBS) -o url_extracter
+# Remove object files, the built binary and core dumps.
+clean:
+	rm -f *.o url_extracter *.core
+# Like clean, but also remove the Makefile (presumably generated from Makefile.in).
+dist-clean: clean
+	rm -f Makefile
diff --git a/utils/url_extracter.c b/utils/url_extracter.c
new file mode 100644
index 000000000..dc2138e6f
--- /dev/null
+++ b/utils/url_extracter.c
@@ -0,0 +1,140 @@
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/param.h>
+
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#include <gmime/gmime.h>
+
+#include "../config.h"
+#include "../main.h"
+#include "../cfg_file.h"
+#include "../url.h"
+
+/* Callback for g_mime_message_foreach_part(): stores every leaf part in task->parts and
+ * extracts URLs from text/html and text/plain parts; user_data is the struct worker_task. */
+static void
+mime_foreach_callback (GMimeObject *part, gpointer user_data)
+{
+	struct worker_task *task = (struct worker_task *)user_data;
+	struct mime_part *mime_part;
+	GMimeContentType *type;
+	GMimeDataWrapper *wrapper;
+	GMimeStream *part_stream;
+	GByteArray *part_content;
+
+	/* find out what class 'part' is... */
+	if (GMIME_IS_MESSAGE_PART (part)) {
+		/* message/rfc822 or message/news */
+		GMimeMessage *message;
+		printf ("Message part found\n");
+		/* g_mime_message_foreach_part() won't descend into
+		   child message parts, so if we want to count any
+		   subparts of this child message, we'll have to call
+		   g_mime_message_foreach_part() again here. */
+		message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
+		g_mime_message_foreach_part (message, mime_foreach_callback, task);
+		g_object_unref (message);
+	} else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
+		/* message/partial */
+		printf ("Message/partial part found\n");
+		/* this is an incomplete message part, probably a
+		   large message that the sender has broken into
+		   smaller parts and is sending us bit by bit. we
+		   could save some info about it so that we could
+		   piece this back together again once we get all the
+		   parts? */
+	} else if (GMIME_IS_MULTIPART (part)) {
+		/* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc: nothing is done here yet */
+	} else if (GMIME_IS_PART (part)) {
+		printf ("Normal part found\n");
+		/* a normal leaf part, could be text/plain or image/jpeg etc */
+		/* NOTE(review): if this GMime version returns a new reference here, wrapper is never unreffed - confirm */
+		wrapper = g_mime_part_get_content_object (GMIME_PART (part));
+		if (wrapper != NULL) {
+			part_stream = g_mime_stream_mem_new ();
+			printf ("Get new wrapper object for normal part\n");
+			if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
+				printf ("Write wrapper to stream\n");
+				/* NOTE(review): the byte array appears to be owned by part_stream, so the stream is kept alive - confirm */
+				part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
+				type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
+				mime_part = g_malloc (sizeof (*mime_part));
+				mime_part->type = type;
+				mime_part->content = part_content;
+				TAILQ_INSERT_TAIL (&task->parts, mime_part, next);
+				if (g_mime_content_type_is_type (type, "text", "html")) {
+					printf ("Found text/html part\n");
+					url_parse_html (task, part_content);
+				}
+				else if (g_mime_content_type_is_type (type, "text", "plain")) {
+					printf ("Found text/plain part\n");
+					url_parse_text (task, part_content);
+				}
+			} else {
+				/* nothing references the stream after a failed write: release it instead of leaking */
+				g_object_unref (part_stream);
+			}
+		}
+	} else {
+		g_assert_not_reached ();
+	}
+}
+
+
+/* Read a message from stdin, parse it with GMime and print every URL found; returns 0 on success, 1 on parse failure. */
+int
+main (int argc, char **argv)
+{
+	GMimeMessage *message;
+	GMimeParser *parser;
+	GMimeStream *stream;
+	struct worker_task task = {0};	/* zeroed so the parsers never see indeterminate fields */
+	struct uri *url;
+	char *buf = NULL;
+	size_t pos = 0, size = 65535;
+	int c;	/* int, not char, so EOF stays distinguishable from data bytes */
+
+	g_mem_set_vtable(glib_mem_profiler_table);
+	g_mime_init (0);
+	/* Preallocate buffer; it is doubled whenever it fills up */
+	buf = g_malloc (size);
+
+	/* Test getchar() itself: looping on feof() would store the EOF result as a trailing data byte */
+	while ((c = getchar ()) != EOF) {
+		buf[pos++] = (char)c;
+		if (pos == size) {
+			size *= 2;
+			buf = g_realloc (buf, size);
+		}
+	}
+
+	stream = g_mime_stream_mem_new_with_buffer (buf, pos);
+	/* create a new parser object to parse the stream */
+	parser = g_mime_parser_new_with_stream (stream);
+	/* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
+	g_object_unref (stream);
+
+	/* parse the message from the stream, then free the parser (and the stream) */
+	message = g_mime_parser_construct_message (parser);
+	g_object_unref (parser);
+	if (message == NULL) {
+		fprintf (stderr, "Cannot parse message from stdin\n");
+		return 1;
+	}
+
+	task.message = message;
+	TAILQ_INIT (&task.urls);
+	TAILQ_INIT (&task.parts);
+	g_mime_message_foreach_part (message, mime_foreach_callback, &task);
+
+	TAILQ_FOREACH (url, &task.urls, next) {
+		printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data);
+	}
+	return 0;
+}
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2008-09-10 17:58:54 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2008-09-10 17:58:54 +0400
commit	fe7ebd5be9d1352f7a3727bfbfabb6453321e269 (patch)
tree	3888171e8e16362cecbefca6ad6548243ba9a8b9 /utils
parent	57e765ce78c6b9746cddab4c3415dc386552151f (diff)
download	rspamd-fe7ebd5be9d1352f7a3727bfbfabb6453321e269.tar.gz rspamd-fe7ebd5be9d1352f7a3727bfbfabb6453321e269.zip