summaryrefslogtreecommitdiffstats
path: root/utils/url_extracter.c
blob: ac8e8be4e956b84c5d9094678b3cd31241af0afb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/param.h>

#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <syslog.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <gmime/gmime.h>

#include "../src/config.h"
#if !defined(HAVE_OWN_QUEUE_H) && defined(HAVE_SYS_QUEUE_H)
#include <sys/queue.h>
#endif
#ifdef HAVE_OWN_QUEUE_H
#include "../src/queue.h"
#endif

#include "../src/main.h"
#include "../src/cfg_file.h"
#include "../src/url.h"
#include "../src/message.h"

/* NOTE(review): not referenced anywhere in this file; presumably defined here
 * to satisfy an external reference from the rspamd sources this utility is
 * linked with -- confirm before removing. */
rspamd_hash_t *counters = NULL;
/*
 * Per-part callback for g_mime_message_foreach() (GMime 2.4) or
 * g_mime_message_foreach_part() (GMime 2.2).
 *
 * part      - the MIME object currently being visited
 * user_data - the struct worker_task that collects the results
 *
 * Embedded messages are walked recursively.  For every leaf part the decoded
 * content is stored in a new struct mime_part that is prepended to
 * task->parts; text/html and text/plain parts are additionally passed to
 * url_parse_text() so that their URLs are extracted.
 */
#ifdef GMIME24
static void
mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
#else
static void
mime_foreach_callback (GMimeObject *part, gpointer user_data)
#endif
{
	struct worker_task *task = (struct worker_task *)user_data;
	struct mime_part *mime_part;
	GMimeContentType *type;
	GMimeDataWrapper *wrapper;
	GMimeStream *part_stream;
	GByteArray *part_content;
	GMimeMessage *message;
	
	/* find out what class 'part' is... */
	if (GMIME_IS_MESSAGE_PART (part)) {
		/* message/rfc822 or message/news */
		printf ("Message part found\n");
		
		/* the foreach helpers do not descend into child messages,
		   so walk the embedded message explicitly */
		message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
		if (message != NULL) {
#ifdef GMIME24
			/* GMime 2.4 hands back a borrowed pointer: it must not be unreffed */
			g_mime_message_foreach (message, mime_foreach_callback, task);
#else
			g_mime_message_foreach_part (message, mime_foreach_callback, task);
			/* GMime 2.2 hands back a new reference that we have to drop */
			g_object_unref (message);
#endif
		}
	} else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
		/* message/partial: a fragment of a larger message split by
		   the sender; fragments are not reassembled here */
		printf ("Message/partial part found\n");
	} else if (GMIME_IS_MULTIPART (part)) {
		/* multipart/mixed, multipart/alternative, multipart/related,
		   multipart/signed, multipart/encrypted, etc: containers only,
		   their children are visited by separate callback invocations */
	} else if (GMIME_IS_PART (part)) {
		printf ("Normal part found\n");
		/* a normal leaf part, could be text/plain or image/jpeg etc */
		wrapper = g_mime_part_get_content_object (GMIME_PART (part));
		if (wrapper != NULL) {
			part_stream = g_mime_stream_mem_new ();
			printf ("Get new wrapper object for normal part\n");
			if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
				printf ("Write wrapper to stream\n");
				/* the byte array belongs to part_stream, so the stream is
				   deliberately kept alive for as long as the part is stored */
				part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
#ifdef GMIME24
				type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
#else
				type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
#endif
				mime_part = g_malloc (sizeof (*mime_part));
				mime_part->type = type;
				mime_part->content = part_content;
				task->parts = g_list_prepend (task->parts, mime_part);
				if (g_mime_content_type_is_type (type, "text", "html")) {
					printf ("Found text/html part\n");
					url_parse_text (task, part_content, TRUE);
				}
				else if (g_mime_content_type_is_type (type, "text", "plain")) {
					printf ("Found text/plain part\n");
					url_parse_text (task, part_content, FALSE);
				}
			}
			else {
				/* decoding failed: nothing references the stream, release it */
				g_object_unref (part_stream);
			}
#ifndef GMIME24
			/* GMime 2.2 returns the content object with an extra reference */
			g_object_unref (wrapper);
#endif
		}
	} else {
		g_assert_not_reached ();
	}
}


/*
 * Read one complete mail message from stdin, parse it with GMime and print
 * every URL that was found in its text parts.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE if the input could not be
 * parsed as a MIME message.
 */
int
main (int argc, char **argv)
{
	GMimeMessage *message;
	GMimeParser *parser;
	GMimeStream *stream;
	struct worker_task task;
	struct uri *url;
	char *buf = NULL;
	size_t pos = 0, size = 65535;
	int c;
	
	g_mem_set_vtable(glib_mem_profiler_table);
	g_mime_init (0);
	memset (&task, 0, sizeof (task));
	
	/* Preallocate buffer */
	buf = g_malloc (size);

	/* Test the value returned by getchar() rather than feof(): feof() only
	   becomes true after a failed read, which would store the EOF sentinel
	   as a bogus trailing byte of the message */
	while ((c = getchar ()) != EOF) {
		buf[pos++] = (char) c;
		if (pos == size) {
			size *= 2;
			buf = g_realloc (buf, size);
		}
	}

	/* the memory stream keeps its own copy of the data */
	stream = g_mime_stream_mem_new_with_buffer (buf, pos);
	g_free (buf);
	/* create a new parser object to parse the stream */
	parser = g_mime_parser_new_with_stream (stream);

	/* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
	g_object_unref (stream);

	/* parse the message from the stream */
	message = g_mime_parser_construct_message (parser);

	/* free the parser (and the stream) */
	g_object_unref (parser);

	if (message == NULL) {
		fprintf (stderr, "Cannot parse message from stdin\n");
		return EXIT_FAILURE;
	}
	
	task.message = message;
	task.task_pool = memory_pool_new (memory_pool_get_size ());
	TAILQ_INIT (&task.urls);

#ifdef GMIME24
	g_mime_message_foreach (message, mime_foreach_callback, &task);
#else
	g_mime_message_foreach_part (message, mime_foreach_callback, &task);
#endif

	TAILQ_FOREACH (url, &task.urls, next) {
		printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data);
	}

	return EXIT_SUCCESS;
}