summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-01-21 17:25:06 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-01-21 17:25:06 +0300
commit1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898 (patch)
treef0a714e2e87ebd50f6016c8cc7f2a8e03a9cc2d8
parent87c9659fdd08bbbc0eb796afccf7237a03181498 (diff)
downloadrspamd-1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898.tar.gz
rspamd-1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898.zip
* Rewrite message parser
* Change mime parts storage * Add html tags striping (ported from php code) * Rework learning to process only text and striped html parts
-rwxr-xr-xconfigure2
-rw-r--r--perl/rspamd.xs9
-rw-r--r--src/controller.c47
-rw-r--r--src/filter.c19
-rw-r--r--src/main.h13
-rw-r--r--src/message.c442
-rw-r--r--src/message.h38
-rw-r--r--src/plugins/regexp.c7
-rw-r--r--src/tokenizers/osb.c21
-rw-r--r--src/tokenizers/tokenizers.c6
-rw-r--r--src/tokenizers/tokenizers.h4
-rw-r--r--src/worker.c110
12 files changed, 572 insertions, 146 deletions
diff --git a/configure b/configure
index 76b2bbb60..31d55cfac 100755
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
SRCDIR="src"
OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c message.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
MODULES="surbl regexp"
CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
diff --git a/perl/rspamd.xs b/perl/rspamd.xs
index 7020e3463..4dfc9e665 100644
--- a/perl/rspamd.xs
+++ b/perl/rspamd.xs
@@ -13,6 +13,7 @@
#include "../src/config.h"
#include "../src/main.h"
+#include "../src/message.h"
#include "../src/cfg_file.h"
#include "../src/perl.h"
#include "../src/mem_pool.h"
@@ -86,12 +87,8 @@ rspamd_task_get_part (r, num)
if (number < 0 || number > r->parts_count - 1) {
XSRETURN_UNDEF;
}
-
- TAILQ_FOREACH (part, &r->parts, next) {
- if (--number == 0) {
- break;
- }
- }
+
+ part = g_list_nth_data (r->parts, number);
RETVAL = newHV();
type = g_mime_content_type_to_string (part->type);
diff --git a/src/controller.c b/src/controller.c
index 56eeb042a..1815fc655 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -18,6 +18,7 @@
#include "util.h"
#include "main.h"
+#include "message.h"
#include "protocol.h"
#include "upstream.h"
#include "cfg_file.h"
@@ -98,9 +99,20 @@ completion_func (gpointer elem)
static void
free_session (struct controller_session *session)
{
+ GList *part;
+ struct mime_part *p;
+
msg_debug ("free_session: freeing session %p", session);
bufferevent_disable (session->bev, EV_READ | EV_WRITE);
bufferevent_free (session->bev);
+
+ while ((part = g_list_first (session->parts))) {
+ session->parts = g_list_remove_link (session->parts, part);
+ p = (struct mime_part *)part->data;
+ g_byte_array_free (p->content, FALSE);
+ g_list_free_1 (part);
+ }
+
memory_pool_delete (session->session_pool);
g_free (session);
}
@@ -231,7 +243,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
bufferevent_write (session->bev, out_buf, r);
return;
}
- session->learn_buf = memory_pool_alloc (session->session_pool, sizeof (f_str_buf_t));
+ session->learn_buf = memory_pool_alloc0 (session->session_pool, sizeof (f_str_buf_t));
session->learn_buf->buf = fstralloc (session->session_pool, size);
if (session->learn_buf->buf == NULL) {
r = snprintf (out_buf, sizeof (out_buf), "allocating buffer for learn failed" CRLF);
@@ -328,8 +340,11 @@ read_socket (struct bufferevent *bev, void *arg)
struct controller_session *session = (struct controller_session *)arg;
int len, i;
char *s, **params, *cmd, out_buf[128];
- GList *comp_list;
- GTree *tokens;
+ GList *comp_list, *cur = NULL;
+ GTree *tokens = NULL;
+ GByteArray *content = NULL;
+ struct mime_part *p;
+ f_str_t c;
switch (session->state) {
case STATE_COMMAND:
@@ -387,18 +402,32 @@ read_socket (struct bufferevent *bev, void *arg)
session->learn_buf->pos += i;
update_buf_size (session->learn_buf);
if (session->learn_buf->free == 0) {
- tokens = session->learn_tokenizer->tokenize_func (session->learn_tokenizer, session->session_pool, session->learn_buf->buf);
- if (tokens == NULL) {
- i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
- bufferevent_write (bev, out_buf, i);
- session->state = STATE_REPLY;
- return;
+ process_learn (session);
+ while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
+ c.begin = content->data;
+ c.len = content->len;
+ if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer,
+ session->session_pool, &c, &tokens)) {
+ i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
+ bufferevent_write (bev, out_buf, i);
+ session->state = STATE_REPLY;
+ return;
+ }
}
session->learn_classifier->learn_func (session->worker->srv->statfile_pool, session->learn_filename, tokens, session->in_class);
session->worker->srv->stat->messages_learned ++;
i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
bufferevent_write (bev, out_buf, i);
bufferevent_enable (bev, EV_WRITE);
+
+ /* Clean learned parts */
+ while ((cur = g_list_first (session->parts))) {
+ session->parts = g_list_remove_link (session->parts, cur);
+ p = (struct mime_part *)cur->data;
+ g_byte_array_free (p->content, FALSE);
+ g_list_free_1 (cur);
+ }
+
session->state = STATE_REPLY;
break;
}
diff --git a/src/filter.c b/src/filter.c
index 443784ad6..f45d718f2 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -6,6 +6,7 @@
#include "mem_pool.h"
#include "filter.h"
#include "main.h"
+#include "message.h"
#include "cfg_file.h"
#include "perl.h"
#include "util.h"
@@ -354,9 +355,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
struct statfile_callback_data *data= (struct statfile_callback_data *)arg;
struct worker_task *task = data->task;
struct statfile *st = (struct statfile *)value;
- GTree *tokens;
+ GTree *tokens = NULL;
char *filename;
double weight, *w;
+ GList *cur = NULL;
+ GByteArray *content;
+ f_str_t c;
if (g_list_length (task->rcpt) == 1) {
filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data);
@@ -371,11 +375,14 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
}
if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
- /* Tree would be freed at task pool freeing */
- tokens = st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, task->msg->buf);
- if (tokens == NULL) {
- msg_info ("statfiles_callback: cannot tokenize input");
- return;
+ while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) {
+ c.begin = content->data;
+ c.len = content->len;
+ /* Tree would be freed at task pool freeing */
+ if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
+ msg_info ("statfiles_callback: cannot tokenize input");
+ return;
+ }
}
g_hash_table_insert (data->tokens, st->tokenizer, tokens);
}
diff --git a/src/main.h b/src/main.h
index b2e362c9d..e6594429b 100644
--- a/src/main.h
+++ b/src/main.h
@@ -1,5 +1,5 @@
-#ifndef RPOP_MAIN_H
-#define RPOP_MAIN_H
+#ifndef RSPAMD_MAIN_H
+#define RSPAMD_MAIN_H
#include "config.h"
@@ -84,6 +84,7 @@ struct pidfh;
struct config_file;
struct tokenizer;
struct classifier;
+struct mime_part;
/* Server statistics */
struct rspamd_stat {
@@ -111,11 +112,6 @@ struct rspamd_main {
TAILQ_HEAD (workq, rspamd_worker) workers;
};
-struct mime_part {
- GMimeContentType *type;
- GByteArray *content;
- TAILQ_ENTRY (mime_part) next;
-};
struct save_point {
void *entry;
@@ -144,6 +140,7 @@ struct controller_session {
struct classifier *learn_classifier;
char *learn_filename;
f_str_buf_t *learn_buf;
+ GList *parts;
int in_class;
};
@@ -178,7 +175,7 @@ struct worker_task {
/* Message */
GMimeMessage *message;
/* All parts of message */
- TAILQ_HEAD (mime_partq, mime_part) parts;
+ GList *parts;
/* URLs extracted from message */
TAILQ_HEAD (uriq, uri) urls;
/* Hash of metric result structures */
diff --git a/src/message.c b/src/message.c
new file mode 100644
index 000000000..456983183
--- /dev/null
+++ b/src/message.c
@@ -0,0 +1,442 @@
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <signal.h>
+#include <netdb.h>
+
+#include <glib.h>
+#include <gmime/gmime.h>
+
+#include "util.h"
+#include "main.h"
+#include "message.h"
+#include "cfg_file.h"
+#include "modules.h"
+
+GByteArray*
+strip_html_tags (GByteArray *src, int *stateptr)
+{
+ uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc;
+ int br, i = 0, depth = 0, in_q = 0;
+ int state = 0;
+ GByteArray *buf;
+
+ if (stateptr)
+ state = *stateptr;
+
+ buf = g_byte_array_sized_new (src->len);
+ g_byte_array_append (buf, src->data, src->len);
+
+ c = *src->data;
+ lc = '\0';
+ p = src->data;
+ rp = buf->data;
+ br = 0;
+
+ while (i < src->len) {
+ switch (c) {
+ case '\0':
+ break;
+ case '<':
+ if (g_ascii_isspace(*(p + 1))) {
+ goto reg_char;
+ }
+ if (state == 0) {
+ lc = '<';
+ state = 1;
+ } else if (state == 1) {
+ depth++;
+ }
+ break;
+
+ case '(':
+ if (state == 2) {
+ if (lc != '"' && lc != '\'') {
+ lc = '(';
+ br++;
+ }
+ } else if (state == 0) {
+ *(rp++) = c;
+ }
+ break;
+
+ case ')':
+ if (state == 2) {
+ if (lc != '"' && lc != '\'') {
+ lc = ')';
+ br--;
+ }
+ } else if (state == 0) {
+ *(rp++) = c;
+ }
+ break;
+
+ case '>':
+ if (depth) {
+ depth--;
+ break;
+ }
+
+ if (in_q) {
+ break;
+ }
+
+ switch (state) {
+ case 1: /* HTML/XML */
+ lc = '>';
+ in_q = state = 0;
+
+ break;
+
+ case 2: /* PHP */
+ if (!br && lc != '\"' && *(p-1) == '?') {
+ in_q = state = 0;
+ tp = tbuf;
+ }
+ break;
+
+ case 3:
+ in_q = state = 0;
+ tp = tbuf;
+ break;
+
+ case 4: /* JavaScript/CSS/etc... */
+ if (p >= src->data + 2 && *(p-1) == '-' && *(p-2) == '-') {
+ in_q = state = 0;
+ tp = tbuf;
+ }
+ break;
+
+ default:
+ *(rp++) = c;
+ break;
+ }
+ break;
+
+ case '"':
+ case '\'':
+ if (state == 2 && *(p-1) != '\\') {
+ if (lc == c) {
+ lc = '\0';
+ } else if (lc != '\\') {
+ lc = c;
+ }
+ } else if (state == 0) {
+ *(rp++) = c;
+ }
+ if (state && p != src->data && *(p-1) != '\\' && (!in_q || *p == in_q)) {
+ if (in_q) {
+ in_q = 0;
+ } else {
+ in_q = *p;
+ }
+ }
+ break;
+
+ case '!':
+ /* JavaScript & Other HTML scripting languages */
+ if (state == 1 && *(p-1) == '<') {
+ state = 3;
+ lc = c;
+ } else {
+ if (state == 0) {
+ *(rp++) = c;
+ }
+ }
+ break;
+
+ case '-':
+ if (state == 3 && p >= src->data + 2 && *(p-1) == '-' && *(p-2) == '!') {
+ state = 4;
+ } else {
+ goto reg_char;
+ }
+ break;
+
+ case '?':
+
+ if (state == 1 && *(p-1) == '<') {
+ br = 0;
+ state = 2;
+ break;
+ }
+
+ case 'E':
+ case 'e':
+ /* !DOCTYPE exception */
+ if (state == 3 && p > src->data + 6
+ && tolower(*(p-1)) == 'p'
+ && tolower(*(p-2)) == 'y'
+ && tolower(*(p-3)) == 't'
+ && tolower(*(p-4)) == 'c'
+ && tolower(*(p-5)) == 'o'
+ && tolower(*(p-6)) == 'd') {
+ state = 1;
+ break;
+ }
+ /* fall-through */
+
+ case 'l':
+
+ /* swm: If we encounter '<?xml' then we shouldn't be in
+ * state == 2 (PHP). Switch back to HTML.
+ */
+
+ if (state == 2 && p > src->data + 2 && *(p-1) == 'm' && *(p-2) == 'x') {
+ state = 1;
+ break;
+ }
+
+ /* fall-through */
+ default:
+reg_char:
+ if (state == 0) {
+ *(rp++) = c;
+ }
+ break;
+ }
+ c = *(++p);
+ i++;
+ }
+ if (rp < buf->data + src->len) {
+ *rp = '\0';
+ g_byte_array_set_size (buf, rp - buf->data);
+ }
+
+ if (stateptr)
+ *stateptr = state;
+
+ return buf;
+}
+
+static void
+free_byte_array_callback (void *pointer)
+{
+ GByteArray *arr = (GByteArray *)pointer;
+ g_byte_array_free (arr, TRUE);
+}
+
+static void
+mime_foreach_callback (GMimeObject *part, gpointer user_data)
+{
+ struct worker_task *task = (struct worker_task *)user_data;
+ struct mime_part *mime_part;
+ GMimeContentType *type;
+ GMimeDataWrapper *wrapper;
+ GMimeStream *part_stream;
+ GByteArray *part_content;
+
+ task->parts_count ++;
+
+ /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
+
+ /* find out what class 'part' is... */
+ if (GMIME_IS_MESSAGE_PART (part)) {
+ /* message/rfc822 or message/news */
+ GMimeMessage *message;
+
+ /* g_mime_message_foreach_part() won't descend into
+ child message parts, so if we want to count any
+ subparts of this child message, we'll have to call
+ g_mime_message_foreach_part() again here. */
+
+ message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
+ g_mime_message_foreach_part (message, mime_foreach_callback, task);
+ g_object_unref (message);
+ } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
+ /* message/partial */
+
+ /* this is an incomplete message part, probably a
+ large message that the sender has broken into
+ smaller parts and is sending us bit by bit. we
+ could save some info about it so that we could
+ piece this back together again once we get all the
+ parts? */
+ } else if (GMIME_IS_MULTIPART (part)) {
+ /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
+
+ /* we'll get to finding out if this is a signed/encrypted multipart later... */
+ } else if (GMIME_IS_PART (part)) {
+ /* a normal leaf part, could be text/plain or image/jpeg etc */
+ wrapper = g_mime_part_get_content_object (GMIME_PART (part));
+ if (wrapper != NULL) {
+ part_stream = g_mime_stream_mem_new ();
+ if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
+ part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
+ type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
+ mime_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_part));
+ mime_part->type = type;
+ mime_part->content = part_content;
+ task->parts = g_list_prepend (task->parts, mime_part);
+ if (g_mime_content_type_is_type (type, "text", "html")) {
+ url_parse_html (task, part_content);
+ }
+ else if (g_mime_content_type_is_type (type, "text", "plain")) {
+ url_parse_text (task, part_content);
+ }
+ }
+ }
+ } else {
+ g_assert_not_reached ();
+ }
+}
+
+int
+process_message (struct worker_task *task)
+{
+ GMimeMessage *message;
+ GMimeParser *parser;
+ GMimeStream *stream;
+
+ stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len);
+ /* create a new parser object to parse the stream */
+ parser = g_mime_parser_new_with_stream (stream);
+
+ /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
+ g_object_unref (stream);
+
+ /* parse the message from the stream */
+ message = g_mime_parser_construct_message (parser);
+
+ task->message = message;
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_object_unref, task->message);
+
+ /* free the parser (and the stream) */
+ g_object_unref (parser);
+
+ g_mime_message_foreach_part (message, mime_foreach_callback, task);
+
+ msg_info ("process_message: found %d parts in message", task->parts_count);
+
+ task->worker->srv->stat->messages_scanned ++;
+
+ return 0;
+}
+
+static void
+mime_learn_foreach_callback (GMimeObject *part, gpointer user_data)
+{
+ struct controller_session *session = (struct controller_session *)user_data;
+ struct mime_part *mime_part;
+ GMimeContentType *type;
+ GMimeDataWrapper *wrapper;
+ GMimeStream *part_stream;
+ GByteArray *part_content;
+
+ /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
+
+ /* find out what class 'part' is... */
+ if (GMIME_IS_MESSAGE_PART (part)) {
+ /* message/rfc822 or message/news */
+ GMimeMessage *message;
+
+ /* g_mime_message_foreach_part() won't descend into
+ child message parts, so if we want to count any
+ subparts of this child message, we'll have to call
+ g_mime_message_foreach_part() again here. */
+
+ message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
+ g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
+ g_object_unref (message);
+ } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
+ /* message/partial */
+
+ /* this is an incomplete message part, probably a
+ large message that the sender has broken into
+ smaller parts and is sending us bit by bit. we
+ could save some info about it so that we could
+ piece this back together again once we get all the
+ parts? */
+ } else if (GMIME_IS_MULTIPART (part)) {
+ /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
+
+ /* we'll get to finding out if this is a signed/encrypted multipart later... */
+ } else if (GMIME_IS_PART (part)) {
+ /* a normal leaf part, could be text/plain or image/jpeg etc */
+ wrapper = g_mime_part_get_content_object (GMIME_PART (part));
+ if (wrapper != NULL) {
+ part_stream = g_mime_stream_mem_new ();
+ if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
+ part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
+ type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
+ mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part));
+ mime_part->type = type;
+ mime_part->content = part_content;
+ session->parts = g_list_prepend (session->parts, mime_part);
+ }
+ }
+ } else {
+ g_assert_not_reached ();
+ }
+}
+
+int
+process_learn (struct controller_session *session)
+{
+ GMimeMessage *message;
+ GMimeParser *parser;
+ GMimeStream *stream;
+
+ stream = g_mime_stream_mem_new_with_buffer (session->learn_buf->buf->begin, session->learn_buf->buf->len);
+ /* create a new parser object to parse the stream */
+ parser = g_mime_parser_new_with_stream (stream);
+
+ /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
+ g_object_unref (stream);
+
+ /* parse the message from the stream */
+ message = g_mime_parser_construct_message (parser);
+
+ memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message);
+
+ /* free the parser (and the stream) */
+ g_object_unref (parser);
+
+ g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
+
+ return 0;
+}
+
+GByteArray*
+get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
+{
+ GByteArray *ret = NULL;
+ struct mime_part *p;
+
+ if (*cur == NULL) {
+ *cur = g_list_first (parts);
+ }
+ else {
+ *cur = g_list_next (*cur);
+ }
+
+ while (*cur) {
+ p = (*cur)->data;
+ /* For text/plain just return bytes */
+ if (g_mime_content_type_is_type (p->type, "text", "plain")) {
+ msg_debug ("get_next_text_part: text/plain part");
+ return p->content;
+ }
+ else if (g_mime_content_type_is_type (p->type, "text", "html")) {
+ msg_debug ("get_next_text_part: try to strip html tags");
+ ret = strip_html_tags (p->content, NULL);
+ memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
+ return ret;
+ }
+ else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) {
+ msg_debug ("get_next_text_part: try to strip html tags");
+ ret = strip_html_tags (p->content, NULL);
+ memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
+ return ret;
+ }
+ *cur = g_list_next (*cur);
+ }
+
+ return NULL;
+}
diff --git a/src/message.h b/src/message.h
new file mode 100644
index 000000000..106b32559
--- /dev/null
+++ b/src/message.h
@@ -0,0 +1,38 @@
+#ifndef RSPAMD_MESSAGE_H
+#define RSPAMD_MESSAGE_H
+
+#include "config.h"
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#ifndef HAVE_OWN_QUEUE_H
+#include <sys/queue.h>
+#else
+#include "queue.h"
+#endif
+#include <sys/time.h>
+
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <signal.h>
+#include <event.h>
+
+#include "main.h"
+
+#include <glib.h>
+#include <gmime/gmime.h>
+
+
+struct mime_part {
+ GMimeContentType *type;
+ GByteArray *content;
+ TAILQ_ENTRY (mime_part) next;
+};
+
+int process_message (struct worker_task *task);
+int process_learn (struct controller_session *session);
+GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur);
+
+#endif
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index f82543a7e..6722e7b92 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -19,6 +19,7 @@
#include "../config.h"
#include "../main.h"
+#include "../message.h"
#include "../modules.h"
#include "../cfg_file.h"
@@ -125,6 +126,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
{
char *headerv;
struct mime_part *part;
+ GList *cur;
struct uri *url;
switch (re->type) {
@@ -155,10 +157,13 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
break;
case REGEXP_MIME:
msg_debug ("process_regexp: checking mime regexp: /%s/", re->regexp_text);
- TAILQ_FOREACH (part, &task->parts, next) {
+ cur = g_list_first (task->parts);
+ while (cur) {
+ part = (struct mime_part *)cur->data;
if (g_regex_match_full (re->regexp, part->content->data, part->content->len, 0, 0, NULL, NULL) == TRUE) {
return 1;
}
+ cur = g_list_next (cur);
}
return 0;
case REGEXP_MESSAGE:
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 451644675..122fa2241 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -20,11 +20,10 @@ static const int primes[] = {
797, 3277,
};
-GTree *
-osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input)
+int
+osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
{
token_node_t *new = NULL;
- GTree *tree;
f_str_t token = { NULL, 0, 0 };
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -33,9 +32,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) {
hashpipe[i] = 0xABCDEF;
}
+
+ if (*tree == NULL) {
+ *tree = g_tree_new (token_node_compare_func);
+ memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
+ }
- tree = g_tree_new (token_node_compare_func);
- memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree);
+ msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
while (tokenizer->get_next_word (input, &token)) {
/* Shift hashpipe */
@@ -43,7 +46,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
hashpipe[i] = hashpipe[i - 1];
}
hashpipe[0] = fstrhash (&token);
- msg_debug ("osb_tokenize_text: text token %s, hash: %d", fstrcstr (&token, pool), hashpipe[0]);
for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) {
h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1];
@@ -52,14 +54,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
new->h1 = h1;
new->h2 = h2;
- if (g_tree_lookup (tree, new) == NULL) {
- msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2);
- g_tree_insert (tree, new, new);
+ if (g_tree_lookup (*tree, new) == NULL) {
+ g_tree_insert (*tree, new, new);
}
}
}
- return tree;
+ return TRUE;
}
/*
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index f0481e00d..6c92f9a97 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -60,13 +60,15 @@ get_next_word (f_str_t *buf, f_str_t *token)
pos = token->begin;
/* Skip non graph symbols */
- while (remain-- && !g_ascii_isgraph (*pos)) {
+ while (remain > 0 && !g_ascii_isgraph (*pos)) {
token->begin ++;
pos ++;
+ remain --;
}
- while (remain-- && g_ascii_isgraph (*pos)) {
+ while (remain > 0 && g_ascii_isgraph (*pos)) {
token->len ++;
pos ++;
+ remain --;
}
if (token->len == 0) {
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index c3453a945..10c8ae7aa 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -22,7 +22,7 @@ typedef struct token_node_s {
/* Common tokenizer structure */
struct tokenizer {
char *name;
- GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+ int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token);
};
@@ -33,7 +33,7 @@ struct tokenizer* get_tokenizer (char *name);
/* Get next word from specified f_str_t buf */
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
-GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input);
+int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
/* Array of all defined tokenizers */
extern struct tokenizer tokenizers[];
diff --git a/src/worker.c b/src/worker.c
index 4a6f9acdd..809adddda 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -28,6 +28,7 @@
#include "cfg_file.h"
#include "url.h"
#include "modules.h"
+#include "message.h"
#define TASK_POOL_SIZE 4095
@@ -79,16 +80,18 @@ rcpt_destruct (void *pointer)
static void
free_task (struct worker_task *task)
{
- struct mime_part *part;
+ GList *part;
+ struct mime_part *p;
if (task) {
if (task->memc_ctx) {
memc_close_ctx (task->memc_ctx);
}
- while (!TAILQ_EMPTY (&task->parts)) {
- part = TAILQ_FIRST (&task->parts);
- g_byte_array_free (part->content, FALSE);
- TAILQ_REMOVE (&task->parts, part, next);
+ while ((part = g_list_first (task->parts))) {
+ task->parts = g_list_remove_link (task->parts, part);
+ p = (struct mime_part *)part->data;
+ g_byte_array_free (p->content, FALSE);
+ g_list_free_1 (part);
}
memory_pool_delete (task->task_pool);
bufferevent_disable (task->bev, EV_READ | EV_WRITE);
@@ -98,102 +101,7 @@ free_task (struct worker_task *task)
}
}
-static void
-mime_foreach_callback (GMimeObject *part, gpointer user_data)
-{
- struct worker_task *task = (struct worker_task *)user_data;
- struct mime_part *mime_part;
- GMimeContentType *type;
- GMimeDataWrapper *wrapper;
- GMimeStream *part_stream;
- GByteArray *part_content;
-
- task->parts_count ++;
-
- /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-
- /* find out what class 'part' is... */
- if (GMIME_IS_MESSAGE_PART (part)) {
- /* message/rfc822 or message/news */
- GMimeMessage *message;
-
- /* g_mime_message_foreach_part() won't descend into
- child message parts, so if we want to count any
- subparts of this child message, we'll have to call
- g_mime_message_foreach_part() again here. */
-
- message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
- g_mime_message_foreach_part (message, mime_foreach_callback, task);
- g_object_unref (message);
- } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
- /* message/partial */
-
- /* this is an incomplete message part, probably a
- large message that the sender has broken into
- smaller parts and is sending us bit by bit. we
- could save some info about it so that we could
- piece this back together again once we get all the
- parts? */
- } else if (GMIME_IS_MULTIPART (part)) {
- /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-
- /* we'll get to finding out if this is a signed/encrypted multipart later... */
- } else if (GMIME_IS_PART (part)) {
- /* a normal leaf part, could be text/plain or image/jpeg etc */
- wrapper = g_mime_part_get_content_object (GMIME_PART (part));
- if (wrapper != NULL) {
- part_stream = g_mime_stream_mem_new ();
- if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
- part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
- type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
- mime_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_part));
- mime_part->type = type;
- mime_part->content = part_content;
- TAILQ_INSERT_TAIL (&task->parts, mime_part, next);
- if (g_mime_content_type_is_type (type, "text", "html")) {
- url_parse_html (task, part_content);
- }
- else if (g_mime_content_type_is_type (type, "text", "plain")) {
- url_parse_text (task, part_content);
- }
- }
- }
- } else {
- g_assert_not_reached ();
- }
-}
-static int
-process_message (struct worker_task *task)
-{
- GMimeMessage *message;
- GMimeParser *parser;
- GMimeStream *stream;
-
- stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len);
- /* create a new parser object to parse the stream */
- parser = g_mime_parser_new_with_stream (stream);
-
- /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
- g_object_unref (stream);
-
- /* parse the message from the stream */
- message = g_mime_parser_construct_message (parser);
-
- task->message = message;
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_object_unref, task->message);
-
- /* free the parser (and the stream) */
- g_object_unref (parser);
-
- g_mime_message_foreach_part (message, mime_foreach_callback, task);
-
- msg_info ("process_message: found %d parts in message", task->parts_count);
-
- task->worker->srv->stat->messages_scanned ++;
-
- return process_filters (task);
-}
static void
read_socket (struct bufferevent *bev, void *arg)
@@ -225,6 +133,7 @@ read_socket (struct bufferevent *bev, void *arg)
update_buf_size (task->msg);
if (task->msg->free == 0) {
r = process_message (task);
+ r = process_filters (task);
if (r == -1) {
task->last_error = "Filter processing error";
task->error_code = RSPAMD_FILTER_ERROR;
@@ -318,7 +227,6 @@ accept_socket (int fd, short what, void *arg)
new_task->sock = nfd;
new_task->cfg = worker->srv->cfg;
TAILQ_INIT (&new_task->urls);
- TAILQ_INIT (&new_task->parts);
new_task->task_pool = memory_pool_new (memory_pool_get_size ());
/* Add destructor for recipients list (it would be better to use anonymous function here */
memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);