summaryrefslogtreecommitdiffstats
path: root/utils/url_extracter.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-06-02 19:32:34 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-06-02 19:32:34 +0400
commit 7bae787900fea17ca82393886217c6287d7e8cea (patch)
tree 4f358b3624d7b2ba6c86a25057d4ba7db10965ae /utils/url_extracter.c
parent 4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff)
download rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz
rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip
* Rework url parsing algorithms
* Adopt all parts of rspamd for new url parser
* Improve url-extracter utility by avoiding cut&paste of mime parsing
* Small fixes to rspamc client
* Bump version to 0.1.3
Diffstat (limited to 'utils/url_extracter.c')
-rw-r--r-- utils/url_extracter.c 130
1 files changed, 19 insertions, 111 deletions
diff --git a/utils/url_extracter.c b/utils/url_extracter.c
index ac8e8be4e..97bf72c47 100644
--- a/utils/url_extracter.c
+++ b/utils/url_extracter.c
@@ -24,107 +24,24 @@
#include "../src/main.h"
#include "../src/cfg_file.h"
#include "../src/url.h"
+#include "../src/util.h"
#include "../src/message.h"
rspamd_hash_t *counters = NULL;
-#ifdef GMIME24
-static void
-mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
-#else
-static void
-mime_foreach_callback (GMimeObject *part, gpointer user_data)
-#endif
-{
- struct worker_task *task = (struct worker_task *)user_data;
- struct mime_part *mime_part;
- GMimeContentType *type;
- GMimeDataWrapper *wrapper;
- GMimeStream *part_stream;
- GByteArray *part_content;
- GMimeMessage *message;
-
- /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-
- /* find out what class 'part' is... */
- if (GMIME_IS_MESSAGE_PART (part)) {
- /* message/rfc822 or message/news */
- printf ("Message part found\n");
-
- /* g_mime_message_foreach_part() won't descend into
- child message parts, so if we want to count any
- subparts of this child message, we'll have to call
- g_mime_message_foreach_part() again here. */
-
- message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-#ifdef GMIME24
- g_mime_message_foreach (message, mime_foreach_callback, task);
-#else
- g_mime_message_foreach_part (message, mime_foreach_callback, task);
-#endif
- g_object_unref (message);
- } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
- /* message/partial */
- printf ("Message/partial part found\n");
-
- /* this is an incomplete message part, probably a
- large message that the sender has broken into
- smaller parts and is sending us bit by bit. we
- could save some info about it so that we could
- piece this back together again once we get all the
- parts? */
- } else if (GMIME_IS_MULTIPART (part)) {
- /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-
- /* we'll get to finding out if this is a signed/encrypted multipart later... */
- } else if (GMIME_IS_PART (part)) {
- printf ("Normal part found\n");
- /* a normal leaf part, could be text/plain or image/jpeg etc */
- wrapper = g_mime_part_get_content_object (GMIME_PART (part));
- if (wrapper != NULL) {
- part_stream = g_mime_stream_mem_new ();
- printf ("Get new wrapper object for normal part\n");
- if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
- printf ("Write wrapper to stream\n");
- part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
-#ifdef GMIME24
- type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
-#else
- type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-#endif
- mime_part = g_malloc (sizeof (struct mime_part));
- mime_part->type = type;
- mime_part->content = part_content;
- task->parts = g_list_prepend (task->parts, mime_part);
- if (g_mime_content_type_is_type (type, "text", "html")) {
- printf ("Found text/html part\n");
- url_parse_text (task, part_content, TRUE);
- }
- else if (g_mime_content_type_is_type (type, "text", "plain")) {
- printf ("Found text/plain part\n");
- url_parse_text (task, part_content, FALSE);
- }
- }
- }
- } else {
- g_assert_not_reached ();
- }
-}
-
int
main (int argc, char **argv)
{
- GMimeMessage *message;
- GMimeParser *parser;
- GMimeStream *stream;
struct worker_task task;
struct uri *url;
char *buf = NULL;
size_t pos = 0, size = 65535;
+ GList *cur;
g_mem_set_vtable(glib_mem_profiler_table);
g_mime_init (0);
bzero (&task, sizeof (struct worker_task));
+ task.task_pool = memory_pool_new (memory_pool_get_size ());
/* Preallocate buffer */
buf = g_malloc (size);
@@ -137,32 +54,23 @@ main (int argc, char **argv)
buf = g_realloc (buf, size);
}
}
-
- stream = g_mime_stream_mem_new_with_buffer (buf, pos);
- /* create a new parser object to parse the stream */
- parser = g_mime_parser_new_with_stream (stream);
-
- /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
- g_object_unref (stream);
-
- /* parse the message from the stream */
- message = g_mime_parser_construct_message (parser);
- task.message = message;
- task.task_pool = memory_pool_new (memory_pool_get_size ());
- TAILQ_INIT (&task.urls);
-
- /* free the parser (and the stream) */
- g_object_unref (parser);
+ task.cfg = memory_pool_alloc0 (task.task_pool, sizeof (struct config_file));
+ task.cfg->log_level = G_LOG_LEVEL_CRITICAL;
+ task.cfg->log_fd = STDERR_FILENO;
+ g_log_set_default_handler (file_log_function, task.cfg);
-#ifdef GMIME24
- g_mime_message_foreach (message, mime_foreach_callback, &task);
-#else
- g_mime_message_foreach_part (message, mime_foreach_callback, &task);
-#endif
-
- TAILQ_FOREACH (url, &task.urls, next) {
- printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data);
+ task.msg = memory_pool_alloc (task.task_pool, sizeof (f_str_t));
+ task.msg->begin = buf;
+ task.msg->len = pos;
+ process_message (&task);
+
+ cur = task.urls;
+ while (cur) {
+ url = cur->data;
+ printf ("%s\n", struri (url));
+ cur = g_list_next (cur);
}
-
+
+ return 0;
}