aboutsummaryrefslogtreecommitdiffstats
path: root/utils/url_extracter.c
blob: ede39ec27c8c1d86313eba4a6b77a3f3d303ca43 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/param.h>

#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <syslog.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>

#include <gmime/gmime.h>

#include "../src/config.h"
#if !defined(HAVE_OWN_QUEUE_H) && defined(HAVE_SYS_QUEUE_H)
#include <sys/queue.h>
#endif
#ifdef HAVE_OWN_QUEUE_H
#include "../src/queue.h"
#endif

#include "../src/main.h"
#include "../src/cfg_file.h"
#include "../src/url.h"
#include "../src/message.h"

/*
 * Per-part callback handed to g_mime_message_foreach_part().
 *
 * part      - the MIME node currently being visited
 * user_data - the struct worker_task supplied by the caller; every leaf part
 *             whose content could be written out is prepended to task->parts,
 *             and text/html and text/plain parts are passed on to
 *             url_parse_html() / url_parse_text()
 *
 * Embedded messages are walked recursively; message/partial and multipart
 * containers are recognised but not processed here.
 */
static void
mime_foreach_callback (GMimeObject *part, gpointer user_data)
{
	struct worker_task *task = (struct worker_task *)user_data;
	struct mime_part *mime_part;
	GMimeContentType *type;
	GMimeDataWrapper *wrapper;
	GMimeStream *part_stream;
	GByteArray *part_content;
	GMimeMessage *message;

	/* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */

	/* find out what class 'part' is... */
	if (GMIME_IS_MESSAGE_PART (part)) {
		/* message/rfc822 or message/news */
		printf ("Message part found\n");

		/* g_mime_message_foreach_part() won't descend into
		   child message parts, so if we want to count any
		   subparts of this child message, we'll have to call
		   g_mime_message_foreach_part() again here. */
		message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
		g_mime_message_foreach_part (message, mime_foreach_callback, task);
		g_object_unref (message);
	} else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
		/* message/partial */
		printf ("Message/partial part found\n");

		/* this is an incomplete message part, probably a
		   large message that the sender has broken into
		   smaller parts and is sending us bit by bit. we
		   could save some info about it so that we could
		   piece this back together again once we get all the
		   parts? */
	} else if (GMIME_IS_MULTIPART (part)) {
		/* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */

		/* we'll get to finding out if this is a signed/encrypted multipart later... */
	} else if (GMIME_IS_PART (part)) {
		/* a normal leaf part, could be text/plain or image/jpeg etc */
		printf ("Normal part found\n");

		/* NOTE(review): some GMime releases hand back a new reference from
		   g_mime_part_get_content_object(); if the targeted version does,
		   'wrapper' must be unreffed once written out - confirm. */
		wrapper = g_mime_part_get_content_object (GMIME_PART (part));
		if (wrapper == NULL) {
			return;
		}

		part_stream = g_mime_stream_mem_new ();
		printf ("Get new wrapper object for normal part\n");
		if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) == -1) {
			/* nothing was stored from this stream, so drop it instead of leaking it */
			g_object_unref (part_stream);
			return;
		}
		printf ("Write wrapper to stream\n");

		/* part_content is obtained from part_stream and is kept in
		   task->parts, so the stream is intentionally not released here
		   (presumably the stream owns the byte array - verify against the
		   GMime stream-mem documentation) */
		part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
		type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));

		mime_part = g_malloc (sizeof (*mime_part));
		mime_part->type = type;
		mime_part->content = part_content;
		task->parts = g_list_prepend (task->parts, mime_part);

		if (g_mime_content_type_is_type (type, "text", "html")) {
			printf ("Found text/html part\n");
			url_parse_html (task, part_content);
		}
		else if (g_mime_content_type_is_type (type, "text", "plain")) {
			printf ("Found text/plain part\n");
			url_parse_text (task, part_content);
		}
	} else {
		g_assert_not_reached ();
	}
}


/*
 * Reads a complete message from stdin, parses it with GMime, visits every
 * MIME part with mime_foreach_callback() and prints each URL that ended up
 * in task.urls.
 *
 * Returns 0 on success and EXIT_FAILURE when no message could be constructed
 * from the input.
 */
int
main (int argc, char **argv)
{
	GMimeMessage *message;
	GMimeParser *parser;
	GMimeStream *stream;
	struct worker_task task;
	struct uri *url;
	char *buf = NULL;
	size_t pos = 0, size = 65535;
	int c;

	(void)argc;
	(void)argv;

	g_mem_set_vtable(glib_mem_profiler_table);
	g_mime_init (0);

	/* The callback prepends to task.parts, so the task must start out
	   zeroed rather than holding stack garbage */
	memset (&task, 0, sizeof (task));
	TAILQ_INIT (&task.urls);

	/* Preallocate buffer */
	buf = g_malloc (size);

	/* Test the value returned by getchar() itself: feof() only turns true
	   after a read has already failed, which would store the EOF marker as
	   a bogus trailing byte */
	while ((c = getchar ()) != EOF) {
		buf[pos++] = (char)c;
		if (pos == size) {
			size *= 2;
			buf = g_realloc (buf, size);
		}
	}

	/* NOTE(review): 'buf' is not freed here; whether the stream copies the
	   data or keeps pointing at it should be confirmed before adding a free */
	stream = g_mime_stream_mem_new_with_buffer (buf, pos);
	/* create a new parser object to parse the stream */
	parser = g_mime_parser_new_with_stream (stream);

	/* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
	g_object_unref (stream);

	/* parse the message from the stream */
	message = g_mime_parser_construct_message (parser);

	/* free the parser (and the stream) */
	g_object_unref (parser);

	if (message == NULL) {
		fprintf (stderr, "Cannot parse message from stdin\n");
		return EXIT_FAILURE;
	}

	task.message = message;

	g_mime_message_foreach_part (message, mime_foreach_callback, &task);

	TAILQ_FOREACH (url, &task.urls, next) {
		printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data);
	}

	return 0;
}