]> source.dussan.org Git - rspamd.git/commitdiff
* Rewrite message parser
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 21 Jan 2009 14:25:06 +0000 (17:25 +0300)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 21 Jan 2009 14:25:06 +0000 (17:25 +0300)
* Change mime parts storage
* Add html tags striping (ported from php code)
* Rework learning to process only text and striped html parts

12 files changed:
configure
perl/rspamd.xs
src/controller.c
src/filter.c
src/main.h
src/message.c [new file with mode: 0644]
src/message.h [new file with mode: 0644]
src/plugins/regexp.c
src/tokenizers/osb.c
src/tokenizers/tokenizers.c
src/tokenizers/tokenizers.h
src/worker.c

index 76b2bbb608128985d2d95db5a0923fa82e989934..31d55cfac7d8cca773a9ab26dd9f1bdb00ce7cb2 100755 (executable)
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
 
 SRCDIR="src"
 OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c message.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
 MODULES="surbl regexp"
 
 CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
index 7020e34635a0fd322540d6a4927eea51146fbe88..4dfc9e6658ce59d878acb885a4620744dfb92a2a 100644 (file)
@@ -13,6 +13,7 @@
 
 #include "../src/config.h"
 #include "../src/main.h"
+#include "../src/message.h"
 #include "../src/cfg_file.h"
 #include "../src/perl.h"
 #include "../src/mem_pool.h"
@@ -86,12 +87,8 @@ rspamd_task_get_part (r, num)
        if (number < 0 || number > r->parts_count - 1) {
                XSRETURN_UNDEF;
        }
-
-       TAILQ_FOREACH (part, &r->parts, next) {
-               if (--number == 0) {
-                       break;
-               }
-       }
+       
+       part = g_list_nth_data (r->parts, number);
        RETVAL = newHV();
        type = g_mime_content_type_to_string (part->type);
 
index 56eeb042af843419b624d3550cef94e3bb8530c9..1815fc6553ac1ee9a7333e6b08cae8ffda4a3e76 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "util.h"
 #include "main.h"
+#include "message.h"
 #include "protocol.h"
 #include "upstream.h"
 #include "cfg_file.h"
@@ -98,9 +99,20 @@ completion_func (gpointer elem)
 static void
 free_session (struct controller_session *session)
 {
+       GList *part;
+       struct mime_part *p;
+       
        msg_debug ("free_session: freeing session %p", session);
        bufferevent_disable (session->bev, EV_READ | EV_WRITE);
        bufferevent_free (session->bev);
+       
+       while ((part = g_list_first (session->parts))) {
+               session->parts = g_list_remove_link (session->parts, part);
+               p = (struct mime_part *)part->data;
+               g_byte_array_free (p->content, FALSE);
+               g_list_free_1 (part);
+       }
+
        memory_pool_delete (session->session_pool);
        g_free (session);
 }
@@ -231,7 +243,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
                                        bufferevent_write (session->bev, out_buf, r);
                                        return;
                                }
-                               session->learn_buf = memory_pool_alloc (session->session_pool, sizeof (f_str_buf_t));
+                               session->learn_buf = memory_pool_alloc0 (session->session_pool, sizeof (f_str_buf_t));
                                session->learn_buf->buf = fstralloc (session->session_pool, size);
                                if (session->learn_buf->buf == NULL) {
                                        r = snprintf (out_buf, sizeof (out_buf), "allocating buffer for learn failed" CRLF);
@@ -328,8 +340,11 @@ read_socket (struct bufferevent *bev, void *arg)
        struct controller_session *session = (struct controller_session *)arg;
        int len, i;
        char *s, **params, *cmd, out_buf[128];
-       GList *comp_list;
-       GTree *tokens;
+       GList *comp_list, *cur = NULL;
+       GTree *tokens = NULL;
+       GByteArray *content = NULL;
+       struct mime_part *p;
+       f_str_t c;
 
        switch (session->state) {
                case STATE_COMMAND:
@@ -387,18 +402,32 @@ read_socket (struct bufferevent *bev, void *arg)
                                session->learn_buf->pos += i;
                                update_buf_size (session->learn_buf);
                                if (session->learn_buf->free == 0) {
-                                       tokens = session->learn_tokenizer->tokenize_func (session->learn_tokenizer, session->session_pool, session->learn_buf->buf);
-                                       if (tokens == NULL) {
-                                               i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
-                                               bufferevent_write (bev, out_buf, i);
-                                               session->state = STATE_REPLY;
-                                               return;
+                                       process_learn (session);
+                                       while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
+                                               c.begin = content->data;
+                                               c.len = content->len;
+                                               if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer, 
+                                                                       session->session_pool, &c, &tokens)) {
+                                                       i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
+                                                       bufferevent_write (bev, out_buf, i);
+                                                       session->state = STATE_REPLY;
+                                                       return;
+                                               }
                                        }
                                        session->learn_classifier->learn_func (session->worker->srv->statfile_pool, session->learn_filename, tokens, session->in_class);
                                        session->worker->srv->stat->messages_learned ++;
                                        i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
                                        bufferevent_write (bev, out_buf, i);
                                        bufferevent_enable (bev, EV_WRITE);
+
+                                       /* Clean learned parts */
+                                       while ((cur = g_list_first (session->parts))) {
+                                               session->parts = g_list_remove_link (session->parts, cur);
+                                               p = (struct mime_part *)cur->data;
+                                               g_byte_array_free (p->content, FALSE);
+                                               g_list_free_1 (cur);
+                                       }
+
                                        session->state = STATE_REPLY;
                                        break;
                                }
index 443784ad63d8d6c565d71599d6c40652147c4bde..f45d718f279ac40d7249684aedee9a548265761b 100644 (file)
@@ -6,6 +6,7 @@
 #include "mem_pool.h"
 #include "filter.h"
 #include "main.h"
+#include "message.h"
 #include "cfg_file.h"
 #include "perl.h"
 #include "util.h"
@@ -354,9 +355,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
        struct statfile_callback_data *data= (struct statfile_callback_data *)arg;
        struct worker_task *task = data->task;
        struct statfile *st = (struct statfile *)value;
-       GTree *tokens;
+       GTree *tokens = NULL;
        char *filename;
        double weight, *w;
+       GList *cur = NULL;
+       GByteArray *content;
+       f_str_t c;
        
        if (g_list_length (task->rcpt) == 1) {
                filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data);
@@ -371,11 +375,14 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
        }
        
        if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
-               /* Tree would be freed at task pool freeing */
-               tokens = st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, task->msg->buf);
-               if (tokens == NULL) {
-                       msg_info ("statfiles_callback: cannot tokenize input");
-                       return;
+               while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) {
+                       c.begin = content->data;
+                       c.len = content->len;
+                       /* Tree would be freed at task pool freeing */
+                       if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
+                               msg_info ("statfiles_callback: cannot tokenize input");
+                               return;
+                       }
                }
                g_hash_table_insert (data->tokens, st->tokenizer, tokens);
        }
index b2e362c9d2be162f6f78bcae13e7b3bab1209ca5..e6594429b2845c776a6ed52e9fe25e4e089b632e 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef RPOP_MAIN_H
-#define RPOP_MAIN_H
+#ifndef RSPAMD_MAIN_H
+#define RSPAMD_MAIN_H
 
 #include "config.h"
 
@@ -84,6 +84,7 @@ struct pidfh;
 struct config_file;
 struct tokenizer;
 struct classifier;
+struct mime_part;
 
 /* Server statistics */
 struct rspamd_stat {
@@ -111,11 +112,6 @@ struct rspamd_main {
        TAILQ_HEAD (workq, rspamd_worker) workers;
 };
 
-struct mime_part {
-       GMimeContentType *type;
-       GByteArray *content;
-       TAILQ_ENTRY (mime_part) next;
-};
 
 struct save_point {
        void *entry;
@@ -144,6 +140,7 @@ struct controller_session {
        struct classifier *learn_classifier;
        char *learn_filename;
        f_str_buf_t *learn_buf;
+       GList *parts;
        int in_class;
 };
 
@@ -178,7 +175,7 @@ struct worker_task {
        /* Message */
        GMimeMessage *message;
        /* All parts of message */
-       TAILQ_HEAD (mime_partq, mime_part) parts;
+       GList *parts;
        /* URLs extracted from message */
        TAILQ_HEAD (uriq, uri) urls;
        /* Hash of metric result structures */
diff --git a/src/message.c b/src/message.c
new file mode 100644 (file)
index 0000000..4569831
--- /dev/null
@@ -0,0 +1,442 @@
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <signal.h>
+#include <netdb.h>
+
+#include <glib.h>
+#include <gmime/gmime.h>
+
+#include "util.h"
+#include "main.h"
+#include "message.h"
+#include "cfg_file.h"
+#include "modules.h"
+
+GByteArray*
+strip_html_tags (GByteArray *src, int *stateptr)
+{
+       uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc;
+       int br, i = 0, depth = 0, in_q = 0;
+       int state = 0;
+       GByteArray *buf;
+
+       if (stateptr)
+               state = *stateptr;
+       
+       buf = g_byte_array_sized_new (src->len);
+       g_byte_array_append (buf, src->data, src->len);
+
+       c = *src->data;
+       lc = '\0';
+       p = src->data;
+       rp = buf->data;
+       br = 0;
+
+       while (i < src->len) {
+               switch (c) {
+                       case '\0':
+                               break;
+                       case '<':
+                               if (g_ascii_isspace(*(p + 1))) {
+                                       goto reg_char;
+                               }
+                               if (state == 0) {
+                                       lc = '<';
+                                       state = 1;
+                               } else if (state == 1) {
+                                       depth++;
+                               }
+                               break;
+
+                       case '(':
+                               if (state == 2) {
+                                       if (lc != '"' && lc != '\'') {
+                                               lc = '(';
+                                               br++;
+                                       }
+                               } else if (state == 0) {
+                                       *(rp++) = c;
+                               }
+                               break;  
+
+                       case ')':
+                               if (state == 2) {
+                                       if (lc != '"' && lc != '\'') {
+                                               lc = ')';
+                                               br--;
+                                       }
+                               } else if (state == 0) {
+                                       *(rp++) = c;
+                               }
+                               break;  
+
+                       case '>':
+                               if (depth) {
+                                       depth--;
+                                       break;
+                               }
+
+                               if (in_q) {
+                                       break;
+                               }
+
+                               switch (state) {
+                                       case 1: /* HTML/XML */
+                                               lc = '>';
+                                               in_q = state = 0;
+                                               
+                                               break;
+                                               
+                                       case 2: /* PHP */
+                                               if (!br && lc != '\"' && *(p-1) == '?') {
+                                                       in_q = state = 0;
+                                                       tp = tbuf;
+                                               }
+                                               break;
+                                               
+                                       case 3:
+                                               in_q = state = 0;
+                                               tp = tbuf;
+                                               break;
+
+                                       case 4: /* JavaScript/CSS/etc... */
+                                               if (p >= src->data + 2 && *(p-1) == '-' && *(p-2) == '-') {
+                                                       in_q = state = 0;
+                                                       tp = tbuf;
+                                               }
+                                               break;
+
+                                       default:
+                                               *(rp++) = c;
+                                               break;
+                               }
+                               break;
+
+                       case '"':
+                       case '\'':
+                               if (state == 2 && *(p-1) != '\\') {
+                                       if (lc == c) {
+                                               lc = '\0';
+                                       } else if (lc != '\\') {
+                                               lc = c;
+                                       }
+                               } else if (state == 0) {
+                                       *(rp++) = c;
+                               }
+                               if (state && p != src->data && *(p-1) != '\\' && (!in_q || *p == in_q)) {
+                                       if (in_q) {
+                                               in_q = 0;
+                                       } else {
+                                               in_q = *p;
+                                       }
+                               }
+                               break;
+                       
+                       case '!': 
+                               /* JavaScript & Other HTML scripting languages */
+                               if (state == 1 && *(p-1) == '<') { 
+                                       state = 3;
+                                       lc = c;
+                               } else {
+                                       if (state == 0) {
+                                               *(rp++) = c;
+                                       }
+                               }
+                               break;
+
+                       case '-':
+                               if (state == 3 && p >= src->data + 2 && *(p-1) == '-' && *(p-2) == '!') {
+                                       state = 4;
+                               } else {
+                                       goto reg_char;
+                               }
+                               break;
+
+                       case '?':
+
+                               if (state == 1 && *(p-1) == '<') { 
+                                       br = 0;
+                                       state = 2;
+                                       break;
+                               }
+
+                       case 'E':
+                       case 'e':
+                               /* !DOCTYPE exception */
+                               if (state == 3 && p > src->data + 6
+                                                    && tolower(*(p-1)) == 'p'
+                                                && tolower(*(p-2)) == 'y'
+                                                    && tolower(*(p-3)) == 't'
+                                                    && tolower(*(p-4)) == 'c'
+                                                    && tolower(*(p-5)) == 'o'
+                                                    && tolower(*(p-6)) == 'd') {
+                                       state = 1;
+                                       break;
+                               }
+                               /* fall-through */
+
+                       case 'l':
+
+                               /* swm: If we encounter '<?xml' then we shouldn't be in
+                                * state == 2 (PHP). Switch back to HTML.
+                                */
+
+                               if (state == 2 && p > src->data + 2 && *(p-1) == 'm' && *(p-2) == 'x') {
+                                       state = 1;
+                                       break;
+                               }
+
+                               /* fall-through */
+                       default:
+reg_char:
+                               if (state == 0) {
+                                       *(rp++) = c;
+                               } 
+                               break;
+               }
+               c = *(++p);
+               i++;
+       }       
+       if (rp < buf->data + src->len) {
+               *rp = '\0';
+               g_byte_array_set_size (buf, rp - buf->data);
+       }
+
+       if (stateptr)
+               *stateptr = state;
+
+       return buf;
+}
+
+static void
+free_byte_array_callback (void *pointer)
+{
+       GByteArray *arr = (GByteArray *)pointer;
+       g_byte_array_free (arr, TRUE);
+}
+
+static void
+mime_foreach_callback (GMimeObject *part, gpointer user_data)
+{
+       struct worker_task *task = (struct worker_task *)user_data;
+       struct mime_part *mime_part;
+       GMimeContentType *type;
+       GMimeDataWrapper *wrapper;
+       GMimeStream *part_stream;
+       GByteArray *part_content;
+       
+       task->parts_count ++;
+       
+       /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
+       
+       /* find out what class 'part' is... */
+       if (GMIME_IS_MESSAGE_PART (part)) {
+               /* message/rfc822 or message/news */
+               GMimeMessage *message;
+               
+               /* g_mime_message_foreach_part() won't descend into
+                   child message parts, so if we want to count any
+                   subparts of this child message, we'll have to call
+                   g_mime_message_foreach_part() again here. */
+               
+               message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
+               g_mime_message_foreach_part (message, mime_foreach_callback, task);
+               g_object_unref (message);
+       } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
+               /* message/partial */
+               
+               /* this is an incomplete message part, probably a
+                   large message that the sender has broken into
+                   smaller parts and is sending us bit by bit. we
+                   could save some info about it so that we could
+                   piece this back together again once we get all the
+                   parts? */
+       } else if (GMIME_IS_MULTIPART (part)) {
+               /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
+               
+               /* we'll get to finding out if this is a signed/encrypted multipart later... */
+       } else if (GMIME_IS_PART (part)) {
+               /* a normal leaf part, could be text/plain or image/jpeg etc */
+               wrapper = g_mime_part_get_content_object (GMIME_PART (part));
+               if (wrapper != NULL) {
+                       part_stream = g_mime_stream_mem_new ();
+                       if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
+                               part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
+                               type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
+                               mime_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_part));
+                               mime_part->type = type;
+                               mime_part->content = part_content;
+                               task->parts = g_list_prepend (task->parts, mime_part);
+                               if (g_mime_content_type_is_type (type, "text", "html")) {
+                                       url_parse_html (task, part_content);
+                               } 
+                               else if (g_mime_content_type_is_type (type, "text", "plain")) {
+                                       url_parse_text (task, part_content);
+                               }
+                       }
+               }
+       } else {
+               g_assert_not_reached ();
+       }
+}
+
+int
+process_message (struct worker_task *task)
+{
+       GMimeMessage *message;
+       GMimeParser *parser;
+       GMimeStream *stream;
+
+       stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len);
+       /* create a new parser object to parse the stream */
+       parser = g_mime_parser_new_with_stream (stream);
+
+       /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
+       g_object_unref (stream);
+
+       /* parse the message from the stream */
+       message = g_mime_parser_construct_message (parser);
+       
+       task->message = message;
+       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_object_unref, task->message);
+
+       /* free the parser (and the stream) */
+       g_object_unref (parser);
+
+       g_mime_message_foreach_part (message, mime_foreach_callback, task);
+       
+       msg_info ("process_message: found %d parts in message", task->parts_count);
+
+       task->worker->srv->stat->messages_scanned ++;
+
+       return 0;
+}
+
+static void
+mime_learn_foreach_callback (GMimeObject *part, gpointer user_data)
+{
+       struct controller_session *session = (struct controller_session *)user_data;
+       struct mime_part *mime_part;
+       GMimeContentType *type;
+       GMimeDataWrapper *wrapper;
+       GMimeStream *part_stream;
+       GByteArray *part_content;
+       
+       /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
+       
+       /* find out what class 'part' is... */
+       if (GMIME_IS_MESSAGE_PART (part)) {
+               /* message/rfc822 or message/news */
+               GMimeMessage *message;
+               
+               /* g_mime_message_foreach_part() won't descend into
+                   child message parts, so if we want to count any
+                   subparts of this child message, we'll have to call
+                   g_mime_message_foreach_part() again here. */
+               
+               message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
+               g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
+               g_object_unref (message);
+       } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
+               /* message/partial */
+               
+               /* this is an incomplete message part, probably a
+                   large message that the sender has broken into
+                   smaller parts and is sending us bit by bit. we
+                   could save some info about it so that we could
+                   piece this back together again once we get all the
+                   parts? */
+       } else if (GMIME_IS_MULTIPART (part)) {
+               /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
+               
+               /* we'll get to finding out if this is a signed/encrypted multipart later... */
+       } else if (GMIME_IS_PART (part)) {
+               /* a normal leaf part, could be text/plain or image/jpeg etc */
+               wrapper = g_mime_part_get_content_object (GMIME_PART (part));
+               if (wrapper != NULL) {
+                       part_stream = g_mime_stream_mem_new ();
+                       if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
+                               part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
+                               type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
+                               mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part));
+                               mime_part->type = type;
+                               mime_part->content = part_content;
+                               session->parts = g_list_prepend (session->parts, mime_part);
+                       }
+               }
+       } else {
+               g_assert_not_reached ();
+       }
+}
+
+int
+process_learn (struct controller_session *session)
+{
+       GMimeMessage *message;
+       GMimeParser *parser;
+       GMimeStream *stream;
+
+       stream = g_mime_stream_mem_new_with_buffer (session->learn_buf->buf->begin, session->learn_buf->buf->len);
+       /* create a new parser object to parse the stream */
+       parser = g_mime_parser_new_with_stream (stream);
+
+       /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
+       g_object_unref (stream);
+
+       /* parse the message from the stream */
+       message = g_mime_parser_construct_message (parser);
+       
+       memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message);
+
+       /* free the parser (and the stream) */
+       g_object_unref (parser);
+
+       g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
+       
+       return 0;
+}
+
+GByteArray* 
+get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
+{
+       GByteArray *ret = NULL;
+       struct mime_part *p;
+
+       if (*cur == NULL) {
+               *cur = g_list_first (parts);
+       }
+       else {
+               *cur = g_list_next (*cur);
+       }
+       
+       while (*cur) {
+               p = (*cur)->data;
+               /* For text/plain just return bytes */
+               if (g_mime_content_type_is_type (p->type, "text", "plain")) {
+                       msg_debug ("get_next_text_part: text/plain part");
+                       return p->content;
+               }
+               else if (g_mime_content_type_is_type (p->type, "text", "html")) {
+                       msg_debug ("get_next_text_part: try to strip html tags");
+                       ret = strip_html_tags (p->content, NULL);
+                       memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
+                       return ret;
+               }
+               else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) {
+                       msg_debug ("get_next_text_part: try to strip html tags");
+                       ret = strip_html_tags (p->content, NULL);
+                       memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
+                       return ret;
+               }
+               *cur = g_list_next (*cur);
+       }
+       
+       return NULL;
+}
diff --git a/src/message.h b/src/message.h
new file mode 100644 (file)
index 0000000..106b325
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef RSPAMD_MESSAGE_H
+#define RSPAMD_MESSAGE_H
+
+#include "config.h"
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#ifndef HAVE_OWN_QUEUE_H
+#include <sys/queue.h>
+#else
+#include "queue.h"
+#endif
+#include <sys/time.h>
+
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <signal.h>
+#include <event.h>
+
+#include "main.h"
+
+#include <glib.h>
+#include <gmime/gmime.h>
+
+
+struct mime_part {
+       GMimeContentType *type;
+       GByteArray *content;
+       TAILQ_ENTRY (mime_part) next;
+};
+
+int process_message (struct worker_task *task);
+int process_learn (struct controller_session *session);
+GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur);
+
+#endif
index f82543a7e88c95dabe4f4aca12c6370d1e2fa319..6722e7b92a8c969e2b646330d6ea179c1b553ed0 100644 (file)
@@ -19,6 +19,7 @@
 
 #include "../config.h"
 #include "../main.h"
+#include "../message.h"
 #include "../modules.h"
 #include "../cfg_file.h"
 
@@ -125,6 +126,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 {
        char *headerv;
        struct mime_part *part;
+       GList *cur;
        struct uri *url;
 
        switch (re->type) {
@@ -155,10 +157,13 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
                        break;
                case REGEXP_MIME:
                        msg_debug ("process_regexp: checking mime regexp: /%s/", re->regexp_text);
-                       TAILQ_FOREACH (part, &task->parts, next) {
+                       cur = g_list_first (task->parts);
+                       while (cur) {
+                               part = (struct mime_part *)cur->data;
                                if (g_regex_match_full (re->regexp, part->content->data, part->content->len, 0, 0, NULL, NULL) == TRUE) {
                                        return 1;
                                }
+                               cur = g_list_next (cur);
                        }
                        return 0;
                case REGEXP_MESSAGE:
index 45164467560813bc161fa5c750050645e2117956..122fa22416fb6f3d403e9a8c5acde4a9438bdd29 100644 (file)
@@ -20,11 +20,10 @@ static const int primes[] = {
        797, 3277,
 };
 
-GTree *
-osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
+int
+osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
 {
        token_node_t *new = NULL;
-       GTree *tree;
        f_str_t token = { NULL, 0, 0 };
        uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
        int i;
@@ -33,9 +32,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
        for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
                hashpipe[i] = 0xABCDEF;
        }
+       
+       if (*tree == NULL) {
+               *tree = g_tree_new (token_node_compare_func);
+               memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
+       }
 
-       tree = g_tree_new (token_node_compare_func);
-       memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);
+       msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
 
        while (tokenizer->get_next_word (input, &token)) {
                /* Shift hashpipe */
@@ -43,7 +46,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
                        hashpipe[i] = hashpipe[i - 1];
                }
                hashpipe[0] = fstrhash (&token);
-               msg_debug ("osb_tokenize_text: text token %s, hash: %d", fstrcstr (&token, pool), hashpipe[0]);
                
                for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
                        h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
@@ -52,14 +54,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
                        new->h1 = h1;
                        new->h2 = h2;
 
-                       if (g_tree_lookup (tree, new) == NULL) {
-                               msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
-                               g_tree_insert (tree, new, new);
+                       if (g_tree_lookup (*tree, new) == NULL) {
+                               g_tree_insert (*tree, new, new);
                        }
                }
        }
 
-       return tree;
+       return TRUE;
 }
 
 /*
index f0481e00d1cbf4ceb4c042a7f2691c36e3c8d15f..6c92f9a97aa9e6cae80a28827a3c912b52bd8ae8 100644 (file)
@@ -60,13 +60,15 @@ get_next_word (f_str_t *buf, f_str_t *token)
 
        pos = token->begin;
        /* Skip non graph symbols */
-       while (remain-- && !g_ascii_isgraph (*pos)) {
+       while (remain > 0 && !g_ascii_isgraph (*pos)) {
                token->begin ++;
                pos ++;
+               remain --;
        }
-       while (remain-- && g_ascii_isgraph (*pos)) {
+       while (remain > 0 && g_ascii_isgraph (*pos)) {
                token->len ++;
                pos ++;
+               remain --;
        }
 
        if (token->len == 0) {
index c3453a945e687df8c1fc71ebc183459e0814c03a..10c8ae7aacf36442bb97ce9a64ad970391a4ab23 100644 (file)
@@ -22,7 +22,7 @@ typedef struct token_node_s {
 /* Common tokenizer structure */
 struct tokenizer {
        char *name;
-       GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+       int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
        f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
 };
 
@@ -33,7 +33,7 @@ struct tokenizer* get_tokenizer (char *name);
 /* Get next word from specified f_str_t buf */
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 /* OSB tokenize function */
-GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
 
 /* Array of all defined tokenizers */
 extern struct tokenizer tokenizers[];
index 4a6f9acdd69e05ae5bc3bc0775943a603e6e1f83..809adddda9a418f09a0419488f98f4c13f8eeb9f 100644 (file)
@@ -28,6 +28,7 @@
 #include "cfg_file.h"
 #include "url.h"
 #include "modules.h"
+#include "message.h"
 
 #define TASK_POOL_SIZE 4095
 
@@ -79,16 +80,18 @@ rcpt_destruct (void *pointer)
 static void
 free_task (struct worker_task *task)
 {
-       struct mime_part *part;
+       GList *part;
+       struct mime_part *p;
 
        if (task) {
                if (task->memc_ctx) {
                        memc_close_ctx (task->memc_ctx);
                }
-               while (!TAILQ_EMPTY (&task->parts)) {
-                       part = TAILQ_FIRST (&task->parts);
-                       g_byte_array_free (part->content, FALSE);
-                       TAILQ_REMOVE (&task->parts, part, next);
+               while ((part = g_list_first (task->parts))) {
+                       task->parts = g_list_remove_link (task->parts, part);
+                       p = (struct mime_part *)part->data;
+                       g_byte_array_free (p->content, FALSE);
+                       g_list_free_1 (part);
                }
                memory_pool_delete (task->task_pool);
                bufferevent_disable (task->bev, EV_READ | EV_WRITE);
@@ -98,102 +101,7 @@ free_task (struct worker_task *task)
        }
 }
 
-static void
-mime_foreach_callback (GMimeObject *part, gpointer user_data)
-{
-       struct worker_task *task = (struct worker_task *)user_data;
-       struct mime_part *mime_part;
-       GMimeContentType *type;
-       GMimeDataWrapper *wrapper;
-       GMimeStream *part_stream;
-       GByteArray *part_content;
-       
-       task->parts_count ++;
-       
-       /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-       
-       /* find out what class 'part' is... */
-       if (GMIME_IS_MESSAGE_PART (part)) {
-               /* message/rfc822 or message/news */
-               GMimeMessage *message;
-               
-               /* g_mime_message_foreach_part() won't descend into
-                   child message parts, so if we want to count any
-                   subparts of this child message, we'll have to call
-                   g_mime_message_foreach_part() again here. */
-               
-               message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-               g_mime_message_foreach_part (message, mime_foreach_callback, task);
-               g_object_unref (message);
-       } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
-               /* message/partial */
-               
-               /* this is an incomplete message part, probably a
-                   large message that the sender has broken into
-                   smaller parts and is sending us bit by bit. we
-                   could save some info about it so that we could
-                   piece this back together again once we get all the
-                   parts? */
-       } else if (GMIME_IS_MULTIPART (part)) {
-               /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-               
-               /* we'll get to finding out if this is a signed/encrypted multipart later... */
-       } else if (GMIME_IS_PART (part)) {
-               /* a normal leaf part, could be text/plain or image/jpeg etc */
-               wrapper = g_mime_part_get_content_object (GMIME_PART (part));
-               if (wrapper != NULL) {
-                       part_stream = g_mime_stream_mem_new ();
-                       if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
-                               part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
-                               type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-                               mime_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_part));
-                               mime_part->type = type;
-                               mime_part->content = part_content;
-                               TAILQ_INSERT_TAIL (&task->parts, mime_part, next);
-                               if (g_mime_content_type_is_type (type, "text", "html")) {
-                                       url_parse_html (task, part_content);
-                               } 
-                               else if (g_mime_content_type_is_type (type, "text", "plain")) {
-                                       url_parse_text (task, part_content);
-                               }
-                       }
-               }
-       } else {
-               g_assert_not_reached ();
-       }
-}
 
-static int
-process_message (struct worker_task *task)
-{
-       GMimeMessage *message;
-       GMimeParser *parser;
-       GMimeStream *stream;
-
-       stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len);
-       /* create a new parser object to parse the stream */
-       parser = g_mime_parser_new_with_stream (stream);
-
-       /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
-       g_object_unref (stream);
-
-       /* parse the message from the stream */
-       message = g_mime_parser_construct_message (parser);
-       
-       task->message = message;
-       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_object_unref, task->message);
-
-       /* free the parser (and the stream) */
-       g_object_unref (parser);
-
-       g_mime_message_foreach_part (message, mime_foreach_callback, task);
-       
-       msg_info ("process_message: found %d parts in message", task->parts_count);
-
-       task->worker->srv->stat->messages_scanned ++;
-
-       return process_filters (task);
-}
 
 static void
 read_socket (struct bufferevent *bev, void *arg)
@@ -225,6 +133,7 @@ read_socket (struct bufferevent *bev, void *arg)
                                update_buf_size (task->msg);
                                if (task->msg->free == 0) {
                                        r = process_message (task);
+                                       r = process_filters (task);
                                        if (r == -1) {
                                                task->last_error = "Filter processing error";
                                                task->error_code = RSPAMD_FILTER_ERROR;
@@ -318,7 +227,6 @@ accept_socket (int fd, short what, void *arg)
        new_task->sock = nfd;
        new_task->cfg = worker->srv->cfg;
        TAILQ_INIT (&new_task->urls);
-       TAILQ_INIT (&new_task->parts);
        new_task->task_pool = memory_pool_new (memory_pool_get_size ());
        /* Add destructor for recipients list (it would be better to use anonymous function here */
        memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);