From 1dc0f6ad2c2e97e11881a7e1b0a4142e65f50898 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 21 Jan 2009 17:25:06 +0300 Subject: [PATCH] * Rewrite message parser * Change mime parts storage * Add html tags striping (ported from php code) * Rework learning to process only text and striped html parts --- configure | 2 +- perl/rspamd.xs | 9 +- src/controller.c | 47 +++- src/filter.c | 19 +- src/main.h | 13 +- src/message.c | 442 ++++++++++++++++++++++++++++++++++++ src/message.h | 38 ++++ src/plugins/regexp.c | 7 +- src/tokenizers/osb.c | 21 +- src/tokenizers/tokenizers.c | 6 +- src/tokenizers/tokenizers.h | 4 +- src/worker.c | 110 +-------- 12 files changed, 572 insertions(+), 146 deletions(-) create mode 100644 src/message.c create mode 100644 src/message.h diff --git a/configure b/configure index 76b2bbb60..31d55cfac 100755 --- a/configure +++ b/configure @@ -24,7 +24,7 @@ CACHE="config.cache" SRCDIR="src" OBJDIR="src/.obj" -SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}" +SOURCES="upstream.c cfg_utils.c memcached.c message.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}" MODULES="surbl regexp" CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter" diff --git a/perl/rspamd.xs b/perl/rspamd.xs index 7020e3463..4dfc9e665 100644 --- a/perl/rspamd.xs +++ b/perl/rspamd.xs @@ -13,6 +13,7 @@ #include "../src/config.h" #include "../src/main.h" +#include "../src/message.h" #include "../src/cfg_file.h" #include "../src/perl.h" #include "../src/mem_pool.h" @@ -86,12 +87,8 @@ rspamd_task_get_part (r, num) if (number < 0 || number > r->parts_count - 1) { XSRETURN_UNDEF; } - - TAILQ_FOREACH (part, &r->parts, next) { - if (--number == 0) { - break; - } - } + + part = g_list_nth_data (r->parts, number); RETVAL = newHV(); type = g_mime_content_type_to_string (part->type); diff --git a/src/controller.c b/src/controller.c index 56eeb042a..1815fc655 100644 --- a/src/controller.c +++ b/src/controller.c @@ -18,6 +18,7 @@ #include "util.h" #include "main.h" +#include "message.h" #include "protocol.h" #include "upstream.h" #include "cfg_file.h" @@ -98,9 +99,20 @@ completion_func (gpointer elem) static void free_session (struct controller_session *session) { + GList *part; + struct mime_part *p; + msg_debug ("free_session: freeing session %p", session); bufferevent_disable (session->bev, EV_READ | EV_WRITE); bufferevent_free (session->bev); + + while ((part = g_list_first (session->parts))) { + session->parts = g_list_remove_link (session->parts, part); + p = (struct mime_part *)part->data; + g_byte_array_free (p->content, FALSE); + g_list_free_1 (part); + } + memory_pool_delete (session->session_pool); g_free (session); } @@ -231,7 +243,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control bufferevent_write (session->bev, out_buf, r); return; } - session->learn_buf = memory_pool_alloc (session->session_pool, sizeof (f_str_buf_t)); + session->learn_buf = memory_pool_alloc0 (session->session_pool, sizeof (f_str_buf_t)); session->learn_buf->buf = fstralloc (session->session_pool, size); if (session->learn_buf->buf == NULL) { r = snprintf (out_buf, sizeof (out_buf), "allocating buffer for learn failed" CRLF); @@ -328,8 +340,11 @@ read_socket (struct bufferevent *bev, void *arg) struct controller_session *session = (struct controller_session *)arg; int len, i; char *s, **params, *cmd, out_buf[128]; - GList *comp_list; - GTree *tokens; + GList *comp_list, *cur = NULL; + GTree *tokens = NULL; + GByteArray *content = NULL; + struct mime_part *p; + f_str_t c; switch (session->state) { case STATE_COMMAND: @@ -387,18 +402,32 @@ read_socket (struct bufferevent *bev, void *arg) session->learn_buf->pos += i; update_buf_size (session->learn_buf); if (session->learn_buf->free == 0) { - tokens = session->learn_tokenizer->tokenize_func (session->learn_tokenizer, session->session_pool, session->learn_buf->buf); - if (tokens == NULL) { - i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF); - bufferevent_write (bev, out_buf, i); - session->state = STATE_REPLY; - return; + process_learn (session); + while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) { + c.begin = content->data; + c.len = content->len; + if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer, + session->session_pool, &c, &tokens)) { + i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF); + bufferevent_write (bev, out_buf, i); + session->state = STATE_REPLY; + return; + } } session->learn_classifier->learn_func (session->worker->srv->statfile_pool, session->learn_filename, tokens, session->in_class); session->worker->srv->stat->messages_learned ++; i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF); bufferevent_write (bev, out_buf, i); bufferevent_enable (bev, EV_WRITE); + + /* Clean learned parts */ + while ((cur = g_list_first (session->parts))) { + session->parts = g_list_remove_link (session->parts, cur); + p = (struct mime_part *)cur->data; + g_byte_array_free (p->content, FALSE); + g_list_free_1 (cur); + } + session->state = STATE_REPLY; break; } diff --git a/src/filter.c b/src/filter.c index 443784ad6..f45d718f2 100644 --- a/src/filter.c +++ b/src/filter.c @@ -6,6 +6,7 @@ #include "mem_pool.h" #include "filter.h" #include "main.h" +#include "message.h" #include "cfg_file.h" #include "perl.h" #include "util.h" @@ -354,9 +355,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg) struct statfile_callback_data *data= (struct statfile_callback_data *)arg; struct worker_task *task = data->task; struct statfile *st = (struct statfile *)value; - GTree *tokens; + GTree *tokens = NULL; char *filename; double weight, *w; + GList *cur = NULL; + GByteArray *content; + f_str_t c; if (g_list_length (task->rcpt) == 1) { filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data); @@ -371,11 +375,14 @@ statfiles_callback (gpointer key, gpointer value, void *arg) } if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) { - /* Tree would be freed at task pool freeing */ - tokens = st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, task->msg->buf); - if (tokens == NULL) { - msg_info ("statfiles_callback: cannot tokenize input"); - return; + while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) { + c.begin = content->data; + c.len = content->len; + /* Tree would be freed at task pool freeing */ + if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) { + msg_info ("statfiles_callback: cannot tokenize input"); + return; + } } g_hash_table_insert (data->tokens, st->tokenizer, tokens); } diff --git a/src/main.h b/src/main.h index b2e362c9d..e6594429b 100644 --- a/src/main.h +++ b/src/main.h @@ -1,5 +1,5 @@ -#ifndef RPOP_MAIN_H -#define RPOP_MAIN_H +#ifndef RSPAMD_MAIN_H +#define RSPAMD_MAIN_H #include "config.h" @@ -84,6 +84,7 @@ struct pidfh; struct config_file; struct tokenizer; struct classifier; +struct mime_part; /* Server statistics */ struct rspamd_stat { @@ -111,11 +112,6 @@ struct rspamd_main { TAILQ_HEAD (workq, rspamd_worker) workers; }; -struct mime_part { - GMimeContentType *type; - GByteArray *content; - TAILQ_ENTRY (mime_part) next; -}; struct save_point { void *entry; @@ -144,6 +140,7 @@ struct controller_session { struct classifier *learn_classifier; char *learn_filename; f_str_buf_t *learn_buf; + GList *parts; int in_class; }; @@ -178,7 +175,7 @@ struct worker_task { /* Message */ GMimeMessage *message; /* All parts of message */ - TAILQ_HEAD (mime_partq, mime_part) parts; + GList *parts; /* URLs extracted from message */ TAILQ_HEAD (uriq, uri) urls; /* Hash of metric result structures */ diff --git a/src/message.c b/src/message.c new file mode 100644 index 000000000..456983183 --- /dev/null +++ b/src/message.c @@ -0,0 +1,442 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "util.h" +#include "main.h" +#include "message.h" +#include "cfg_file.h" +#include "modules.h" + +GByteArray* +strip_html_tags (GByteArray *src, int *stateptr) +{ + uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc; + int br, i = 0, depth = 0, in_q = 0; + int state = 0; + GByteArray *buf; + + if (stateptr) + state = *stateptr; + + buf = g_byte_array_sized_new (src->len); + g_byte_array_append (buf, src->data, src->len); + + c = *src->data; + lc = '\0'; + p = src->data; + rp = buf->data; + br = 0; + + while (i < src->len) { + switch (c) { + case '\0': + break; + case '<': + if (g_ascii_isspace(*(p + 1))) { + goto reg_char; + } + if (state == 0) { + lc = '<'; + state = 1; + } else if (state == 1) { + depth++; + } + break; + + case '(': + if (state == 2) { + if (lc != '"' && lc != '\'') { + lc = '('; + br++; + } + } else if (state == 0) { + *(rp++) = c; + } + break; + + case ')': + if (state == 2) { + if (lc != '"' && lc != '\'') { + lc = ')'; + br--; + } + } else if (state == 0) { + *(rp++) = c; + } + break; + + case '>': + if (depth) { + depth--; + break; + } + + if (in_q) { + break; + } + + switch (state) { + case 1: /* HTML/XML */ + lc = '>'; + in_q = state = 0; + + break; + + case 2: /* PHP */ + if (!br && lc != '\"' && *(p-1) == '?') { + in_q = state = 0; + tp = tbuf; + } + break; + + case 3: + in_q = state = 0; + tp = tbuf; + break; + + case 4: /* JavaScript/CSS/etc... */ + if (p >= src->data + 2 && *(p-1) == '-' && *(p-2) == '-') { + in_q = state = 0; + tp = tbuf; + } + break; + + default: + *(rp++) = c; + break; + } + break; + + case '"': + case '\'': + if (state == 2 && *(p-1) != '\\') { + if (lc == c) { + lc = '\0'; + } else if (lc != '\\') { + lc = c; + } + } else if (state == 0) { + *(rp++) = c; + } + if (state && p != src->data && *(p-1) != '\\' && (!in_q || *p == in_q)) { + if (in_q) { + in_q = 0; + } else { + in_q = *p; + } + } + break; + + case '!': + /* JavaScript & Other HTML scripting languages */ + if (state == 1 && *(p-1) == '<') { + state = 3; + lc = c; + } else { + if (state == 0) { + *(rp++) = c; + } + } + break; + + case '-': + if (state == 3 && p >= src->data + 2 && *(p-1) == '-' && *(p-2) == '!') { + state = 4; + } else { + goto reg_char; + } + break; + + case '?': + + if (state == 1 && *(p-1) == '<') { + br = 0; + state = 2; + break; + } + + case 'E': + case 'e': + /* !DOCTYPE exception */ + if (state == 3 && p > src->data + 6 + && tolower(*(p-1)) == 'p' + && tolower(*(p-2)) == 'y' + && tolower(*(p-3)) == 't' + && tolower(*(p-4)) == 'c' + && tolower(*(p-5)) == 'o' + && tolower(*(p-6)) == 'd') { + state = 1; + break; + } + /* fall-through */ + + case 'l': + + /* swm: If we encounter ' src->data + 2 && *(p-1) == 'm' && *(p-2) == 'x') { + state = 1; + break; + } + + /* fall-through */ + default: +reg_char: + if (state == 0) { + *(rp++) = c; + } + break; + } + c = *(++p); + i++; + } + if (rp < buf->data + src->len) { + *rp = '\0'; + g_byte_array_set_size (buf, rp - buf->data); + } + + if (stateptr) + *stateptr = state; + + return buf; +} + +static void +free_byte_array_callback (void *pointer) +{ + GByteArray *arr = (GByteArray *)pointer; + g_byte_array_free (arr, TRUE); +} + +static void +mime_foreach_callback (GMimeObject *part, gpointer user_data) +{ + struct worker_task *task = (struct worker_task *)user_data; + struct mime_part *mime_part; + GMimeContentType *type; + GMimeDataWrapper *wrapper; + GMimeStream *part_stream; + GByteArray *part_content; + + task->parts_count ++; + + /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ + + /* find out what class 'part' is... */ + if (GMIME_IS_MESSAGE_PART (part)) { + /* message/rfc822 or message/news */ + GMimeMessage *message; + + /* g_mime_message_foreach_part() won't descend into + child message parts, so if we want to count any + subparts of this child message, we'll have to call + g_mime_message_foreach_part() again here. */ + + message = g_mime_message_part_get_message ((GMimeMessagePart *) part); + g_mime_message_foreach_part (message, mime_foreach_callback, task); + g_object_unref (message); + } else if (GMIME_IS_MESSAGE_PARTIAL (part)) { + /* message/partial */ + + /* this is an incomplete message part, probably a + large message that the sender has broken into + smaller parts and is sending us bit by bit. we + could save some info about it so that we could + piece this back together again once we get all the + parts? */ + } else if (GMIME_IS_MULTIPART (part)) { + /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ + + /* we'll get to finding out if this is a signed/encrypted multipart later... */ + } else if (GMIME_IS_PART (part)) { + /* a normal leaf part, could be text/plain or image/jpeg etc */ + wrapper = g_mime_part_get_content_object (GMIME_PART (part)); + if (wrapper != NULL) { + part_stream = g_mime_stream_mem_new (); + if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { + part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream)); + type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part)); + mime_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_part)); + mime_part->type = type; + mime_part->content = part_content; + task->parts = g_list_prepend (task->parts, mime_part); + if (g_mime_content_type_is_type (type, "text", "html")) { + url_parse_html (task, part_content); + } + else if (g_mime_content_type_is_type (type, "text", "plain")) { + url_parse_text (task, part_content); + } + } + } + } else { + g_assert_not_reached (); + } +} + +int +process_message (struct worker_task *task) +{ + GMimeMessage *message; + GMimeParser *parser; + GMimeStream *stream; + + stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len); + /* create a new parser object to parse the stream */ + parser = g_mime_parser_new_with_stream (stream); + + /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */ + g_object_unref (stream); + + /* parse the message from the stream */ + message = g_mime_parser_construct_message (parser); + + task->message = message; + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_object_unref, task->message); + + /* free the parser (and the stream) */ + g_object_unref (parser); + + g_mime_message_foreach_part (message, mime_foreach_callback, task); + + msg_info ("process_message: found %d parts in message", task->parts_count); + + task->worker->srv->stat->messages_scanned ++; + + return 0; +} + +static void +mime_learn_foreach_callback (GMimeObject *part, gpointer user_data) +{ + struct controller_session *session = (struct controller_session *)user_data; + struct mime_part *mime_part; + GMimeContentType *type; + GMimeDataWrapper *wrapper; + GMimeStream *part_stream; + GByteArray *part_content; + + /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ + + /* find out what class 'part' is... */ + if (GMIME_IS_MESSAGE_PART (part)) { + /* message/rfc822 or message/news */ + GMimeMessage *message; + + /* g_mime_message_foreach_part() won't descend into + child message parts, so if we want to count any + subparts of this child message, we'll have to call + g_mime_message_foreach_part() again here. */ + + message = g_mime_message_part_get_message ((GMimeMessagePart *) part); + g_mime_message_foreach_part (message, mime_learn_foreach_callback, session); + g_object_unref (message); + } else if (GMIME_IS_MESSAGE_PARTIAL (part)) { + /* message/partial */ + + /* this is an incomplete message part, probably a + large message that the sender has broken into + smaller parts and is sending us bit by bit. we + could save some info about it so that we could + piece this back together again once we get all the + parts? */ + } else if (GMIME_IS_MULTIPART (part)) { + /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ + + /* we'll get to finding out if this is a signed/encrypted multipart later... */ + } else if (GMIME_IS_PART (part)) { + /* a normal leaf part, could be text/plain or image/jpeg etc */ + wrapper = g_mime_part_get_content_object (GMIME_PART (part)); + if (wrapper != NULL) { + part_stream = g_mime_stream_mem_new (); + if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { + part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream)); + type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part)); + mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part)); + mime_part->type = type; + mime_part->content = part_content; + session->parts = g_list_prepend (session->parts, mime_part); + } + } + } else { + g_assert_not_reached (); + } +} + +int +process_learn (struct controller_session *session) +{ + GMimeMessage *message; + GMimeParser *parser; + GMimeStream *stream; + + stream = g_mime_stream_mem_new_with_buffer (session->learn_buf->buf->begin, session->learn_buf->buf->len); + /* create a new parser object to parse the stream */ + parser = g_mime_parser_new_with_stream (stream); + + /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */ + g_object_unref (stream); + + /* parse the message from the stream */ + message = g_mime_parser_construct_message (parser); + + memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message); + + /* free the parser (and the stream) */ + g_object_unref (parser); + + g_mime_message_foreach_part (message, mime_learn_foreach_callback, session); + + return 0; +} + +GByteArray* +get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) +{ + GByteArray *ret = NULL; + struct mime_part *p; + + if (*cur == NULL) { + *cur = g_list_first (parts); + } + else { + *cur = g_list_next (*cur); + } + + while (*cur) { + p = (*cur)->data; + /* For text/plain just return bytes */ + if (g_mime_content_type_is_type (p->type, "text", "plain")) { + msg_debug ("get_next_text_part: text/plain part"); + return p->content; + } + else if (g_mime_content_type_is_type (p->type, "text", "html")) { + msg_debug ("get_next_text_part: try to strip html tags"); + ret = strip_html_tags (p->content, NULL); + memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret); + return ret; + } + else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) { + msg_debug ("get_next_text_part: try to strip html tags"); + ret = strip_html_tags (p->content, NULL); + memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret); + return ret; + } + *cur = g_list_next (*cur); + } + + return NULL; +} diff --git a/src/message.h b/src/message.h new file mode 100644 index 000000000..106b32559 --- /dev/null +++ b/src/message.h @@ -0,0 +1,38 @@ +#ifndef RSPAMD_MESSAGE_H +#define RSPAMD_MESSAGE_H + +#include "config.h" + +#include +#include +#ifndef HAVE_OWN_QUEUE_H +#include +#else +#include "queue.h" +#endif +#include + +#include +#include +#include + +#include +#include + +#include "main.h" + +#include +#include + + +struct mime_part { + GMimeContentType *type; + GByteArray *content; + TAILQ_ENTRY (mime_part) next; +}; + +int process_message (struct worker_task *task); +int process_learn (struct controller_session *session); +GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur); + +#endif diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index f82543a7e..6722e7b92 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -19,6 +19,7 @@ #include "../config.h" #include "../main.h" +#include "../message.h" #include "../modules.h" #include "../cfg_file.h" @@ -125,6 +126,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) { char *headerv; struct mime_part *part; + GList *cur; struct uri *url; switch (re->type) { @@ -155,10 +157,13 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) break; case REGEXP_MIME: msg_debug ("process_regexp: checking mime regexp: /%s/", re->regexp_text); - TAILQ_FOREACH (part, &task->parts, next) { + cur = g_list_first (task->parts); + while (cur) { + part = (struct mime_part *)cur->data; if (g_regex_match_full (re->regexp, part->content->data, part->content->len, 0, 0, NULL, NULL) == TRUE) { return 1; } + cur = g_list_next (cur); } return 0; case REGEXP_MESSAGE: diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 451644675..122fa2241 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -20,11 +20,10 @@ static const int primes[] = { 797, 3277, }; -GTree * -osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input) +int +osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree) { token_node_t *new = NULL; - GTree *tree; f_str_t token = { NULL, 0, 0 }; uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; int i; @@ -33,9 +32,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in for (i = 0; i < FEATURE_WINDOW_SIZE; i ++) { hashpipe[i] = 0xABCDEF; } + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree); + } - tree = g_tree_new (token_node_compare_func); - memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, tree); + msg_debug ("osb_tokenize_text: got input length: %zd", input->len); while (tokenizer->get_next_word (input, &token)) { /* Shift hashpipe */ @@ -43,7 +46,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in hashpipe[i] = hashpipe[i - 1]; } hashpipe[0] = fstrhash (&token); - msg_debug ("osb_tokenize_text: text token %s, hash: %d", fstrcstr (&token, pool), hashpipe[0]); for (i = 1; i < FEATURE_WINDOW_SIZE; i ++) { h1 = hashpipe[0]* primes[0] + hashpipe[i] * primes[i<<1]; @@ -52,14 +54,13 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in new->h1 = h1; new->h2 = h2; - if (g_tree_lookup (tree, new) == NULL) { - msg_debug ("osb_tokenize_text: append new token, h1=%u, h2=%u", h1, h2); - g_tree_insert (tree, new, new); + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); } } } - return tree; + return TRUE; } /* diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index f0481e00d..6c92f9a97 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -60,13 +60,15 @@ get_next_word (f_str_t *buf, f_str_t *token) pos = token->begin; /* Skip non graph symbols */ - while (remain-- && !g_ascii_isgraph (*pos)) { + while (remain > 0 && !g_ascii_isgraph (*pos)) { token->begin ++; pos ++; + remain --; } - while (remain-- && g_ascii_isgraph (*pos)) { + while (remain > 0 && g_ascii_isgraph (*pos)) { token->len ++; pos ++; + remain --; } if (token->len == 0) { diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index c3453a945..10c8ae7aa 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -22,7 +22,7 @@ typedef struct token_node_s { /* Common tokenizer structure */ struct tokenizer { char *name; - GTree* (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); + int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); }; @@ -33,7 +33,7 @@ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ f_str_t *get_next_word (f_str_t *buf, f_str_t *token); /* OSB tokenize function */ -GTree* osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input); +int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); /* Array of all defined tokenizers */ extern struct tokenizer tokenizers[]; diff --git a/src/worker.c b/src/worker.c index 4a6f9acdd..809adddda 100644 --- a/src/worker.c +++ b/src/worker.c @@ -28,6 +28,7 @@ #include "cfg_file.h" #include "url.h" #include "modules.h" +#include "message.h" #define TASK_POOL_SIZE 4095 @@ -79,16 +80,18 @@ rcpt_destruct (void *pointer) static void free_task (struct worker_task *task) { - struct mime_part *part; + GList *part; + struct mime_part *p; if (task) { if (task->memc_ctx) { memc_close_ctx (task->memc_ctx); } - while (!TAILQ_EMPTY (&task->parts)) { - part = TAILQ_FIRST (&task->parts); - g_byte_array_free (part->content, FALSE); - TAILQ_REMOVE (&task->parts, part, next); + while ((part = g_list_first (task->parts))) { + task->parts = g_list_remove_link (task->parts, part); + p = (struct mime_part *)part->data; + g_byte_array_free (p->content, FALSE); + g_list_free_1 (part); } memory_pool_delete (task->task_pool); bufferevent_disable (task->bev, EV_READ | EV_WRITE); @@ -98,102 +101,7 @@ free_task (struct worker_task *task) } } -static void -mime_foreach_callback (GMimeObject *part, gpointer user_data) -{ - struct worker_task *task = (struct worker_task *)user_data; - struct mime_part *mime_part; - GMimeContentType *type; - GMimeDataWrapper *wrapper; - GMimeStream *part_stream; - GByteArray *part_content; - - task->parts_count ++; - - /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ - - /* find out what class 'part' is... */ - if (GMIME_IS_MESSAGE_PART (part)) { - /* message/rfc822 or message/news */ - GMimeMessage *message; - - /* g_mime_message_foreach_part() won't descend into - child message parts, so if we want to count any - subparts of this child message, we'll have to call - g_mime_message_foreach_part() again here. */ - - message = g_mime_message_part_get_message ((GMimeMessagePart *) part); - g_mime_message_foreach_part (message, mime_foreach_callback, task); - g_object_unref (message); - } else if (GMIME_IS_MESSAGE_PARTIAL (part)) { - /* message/partial */ - - /* this is an incomplete message part, probably a - large message that the sender has broken into - smaller parts and is sending us bit by bit. we - could save some info about it so that we could - piece this back together again once we get all the - parts? */ - } else if (GMIME_IS_MULTIPART (part)) { - /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ - - /* we'll get to finding out if this is a signed/encrypted multipart later... */ - } else if (GMIME_IS_PART (part)) { - /* a normal leaf part, could be text/plain or image/jpeg etc */ - wrapper = g_mime_part_get_content_object (GMIME_PART (part)); - if (wrapper != NULL) { - part_stream = g_mime_stream_mem_new (); - if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { - part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream)); - type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part)); - mime_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_part)); - mime_part->type = type; - mime_part->content = part_content; - TAILQ_INSERT_TAIL (&task->parts, mime_part, next); - if (g_mime_content_type_is_type (type, "text", "html")) { - url_parse_html (task, part_content); - } - else if (g_mime_content_type_is_type (type, "text", "plain")) { - url_parse_text (task, part_content); - } - } - } - } else { - g_assert_not_reached (); - } -} -static int -process_message (struct worker_task *task) -{ - GMimeMessage *message; - GMimeParser *parser; - GMimeStream *stream; - - stream = g_mime_stream_mem_new_with_buffer (task->msg->buf->begin, task->msg->buf->len); - /* create a new parser object to parse the stream */ - parser = g_mime_parser_new_with_stream (stream); - - /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */ - g_object_unref (stream); - - /* parse the message from the stream */ - message = g_mime_parser_construct_message (parser); - - task->message = message; - memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_object_unref, task->message); - - /* free the parser (and the stream) */ - g_object_unref (parser); - - g_mime_message_foreach_part (message, mime_foreach_callback, task); - - msg_info ("process_message: found %d parts in message", task->parts_count); - - task->worker->srv->stat->messages_scanned ++; - - return process_filters (task); -} static void read_socket (struct bufferevent *bev, void *arg) @@ -225,6 +133,7 @@ read_socket (struct bufferevent *bev, void *arg) update_buf_size (task->msg); if (task->msg->free == 0) { r = process_message (task); + r = process_filters (task); if (r == -1) { task->last_error = "Filter processing error"; task->error_code = RSPAMD_FILTER_ERROR; @@ -318,7 +227,6 @@ accept_socket (int fd, short what, void *arg) new_task->sock = nfd; new_task->cfg = worker->srv->cfg; TAILQ_INIT (&new_task->urls); - TAILQ_INIT (&new_task->parts); new_task->task_pool = memory_pool_new (memory_pool_get_size ()); /* Add destructor for recipients list (it would be better to use anonymous function here */ memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task); -- 2.39.5