about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-09-25 17:33:16 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-09-25 17:33:16 +0400
commit a9ab6be27de01f12ce6f201a4efa3eda8be0e36b (patch)
tree 3c2cc297cc7c904d20a36dc783668632ed8088c2 /src
parent 3dd95c9525babd0ba5be237663132a69ebf71a2a (diff)
download rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.tar.gz
rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.zip
* Fix learning
Diffstat (limited to 'src')
-rw-r--r-- src/classifiers/winnow.c 8
-rw-r--r-- src/controller.c 47
-rw-r--r-- src/message.c 155
-rw-r--r-- src/message.h 16
4 files changed, 37 insertions, 189 deletions
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index 94b342525..6abd973ed 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -149,7 +149,11 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input
void
winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
{
- struct winnow_callback_data data;
+ struct winnow_callback_data data = {
+ .file = NULL,
+ .sum = 0,
+ .count = 0,
+ };
GList *cur;
struct statfile *st;
@@ -157,8 +161,6 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, G
g_assert (ctx != NULL);
data.pool = pool;
- data.sum = 0;
- data.count = 0;
data.in_class = in_class;
data.now = time (NULL);
data.ctx = ctx;
diff --git a/src/controller.c b/src/controller.c
index 0e11b6ae1..82dc02c0f 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -306,7 +306,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
return;
}
- session->learn_symbol = *cmd_args;
+ session->learn_symbol = memory_pool_strdup (session->session_pool, *cmd_args);
cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args);
if (cl == NULL) {
r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args);
@@ -399,12 +399,12 @@ controller_read_socket (f_str_t *in, void *arg)
{
struct controller_session *session = (struct controller_session *)arg;
struct classifier_ctx *cls_ctx;
- int len, i;
+ int len, i, r;
char *s, **params, *cmd, out_buf[128];
+ struct worker_task *task;
+ struct mime_text_part *part;
GList *comp_list, *cur = NULL;
GTree *tokens = NULL;
- GByteArray *content = NULL;
- struct mime_part *p;
f_str_t c;
switch (session->state) {
@@ -450,33 +450,50 @@ controller_read_socket (f_str_t *in, void *arg)
break;
case STATE_LEARN:
session->learn_buf = in;
- process_learn (session);
- while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
- c.begin = content->data;
- c.len = content->len;
+ task = construct_task (session->worker);
+
+ task->msg = memory_pool_alloc (task->task_pool, sizeof (f_str_t));
+ task->msg->begin = in->begin;
+ task->msg->len = in->len;
+
+ r = process_message (task);
+ if (r == -1) {
+ msg_warn ("read_socket: processing of message failed");
+ free_task (task, FALSE);
+ session->state = STATE_REPLY;
+ r = snprintf (out_buf, sizeof (out_buf), "cannot process message" CRLF);
+ rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
+ return FALSE;
+ }
+ cur = g_list_first (task->text_parts);
+ while (cur) {
+ part = cur->data;
+ if (part->is_empty) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ c.begin = part->content->data;
+ c.len = part->content->len;
+
if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
session->session_pool, &c, &tokens)) {
i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
+ free_task (task, FALSE);
if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
return FALSE;
}
session->state = STATE_REPLY;
return TRUE;
}
+ cur = g_list_next (cur);
}
cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier);
session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
session->learn_symbol, tokens, session->in_class);
session->worker->srv->stat->messages_learned ++;
- /* Clean learned parts */
- while ((cur = g_list_first (session->parts))) {
- session->parts = g_list_remove_link (session->parts, cur);
- p = (struct mime_part *)cur->data;
- g_byte_array_free (p->content, FALSE);
- g_list_free_1 (cur);
- }
+ free_task (task, FALSE);
i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
return FALSE;
diff --git a/src/message.c b/src/message.c
index 1416f22ab..f1886e687 100644
--- a/src/message.c
+++ b/src/message.c
@@ -770,161 +770,6 @@ process_message (struct worker_task *task)
return 0;
}
-#ifdef GMIME24
-static void
-mime_learn_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
-#else
-static void
-mime_learn_foreach_callback (GMimeObject *part, gpointer user_data)
-#endif
-{
- struct controller_session *session = (struct controller_session *)user_data;
- struct mime_part *mime_part;
- GMimeContentType *type;
- GMimeDataWrapper *wrapper;
- GMimeStream *part_stream;
- GByteArray *part_content;
-
- /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-
- /* find out what class 'part' is... */
- if (GMIME_IS_MESSAGE_PART (part)) {
- /* message/rfc822 or message/news */
- GMimeMessage *message;
-
- /* g_mime_message_foreach_part() won't descend into
- child message parts, so if we want to count any
- subparts of this child message, we'll have to call
- g_mime_message_foreach_part() again here. */
- message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-#ifdef GMIME24
- g_mime_message_foreach (message, mime_learn_foreach_callback, session);
-#else
- g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
-#endif
- g_object_unref (message);
- } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
- /* message/partial */
-
- /* this is an incomplete message part, probably a
- large message that the sender has broken into
- smaller parts and is sending us bit by bit. we
- could save some info about it so that we could
- piece this back together again once we get all the
- parts? */
- } else if (GMIME_IS_MULTIPART (part)) {
- /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-
- /* we'll get to finding out if this is a signed/encrypted multipart later... */
- } else if (GMIME_IS_PART (part)) {
- /* a normal leaf part, could be text/plain or image/jpeg etc */
- wrapper = g_mime_part_get_content_object (GMIME_PART (part));
- if (wrapper != NULL) {
- part_stream = g_mime_stream_mem_new ();
- if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
- g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (part_stream), FALSE);
- part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
- g_object_unref (part_stream);
-#ifdef GMIME24
- type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
-#else
- type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-#endif
- mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part));
- mime_part->type = type;
- mime_part->content = part_content;
- session->parts = g_list_prepend (session->parts, mime_part);
- }
- g_object_unref (wrapper);
- }
- } else {
- g_assert_not_reached ();
- }
-}
-
-int
-process_learn (struct controller_session *session)
-{
- GMimeMessage *message;
- GMimeParser *parser;
- GMimeStream *stream;
- GByteArray *tmp;
-
- tmp = memory_pool_alloc (session->session_pool, sizeof (GByteArray));
- tmp->data = session->learn_buf->begin;
- tmp->len = session->learn_buf->len;
- stream = g_mime_stream_mem_new_with_byte_array (tmp);
- /*
- * This causes g_mime_stream not to free memory by itself as it is memory allocated by
- * pool allocator
- */
- g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
-
- /* create a new parser object to parse the stream */
- parser = g_mime_parser_new_with_stream (stream);
-
- /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
- g_object_unref (stream);
-
- /* parse the message from the stream */
- message = g_mime_parser_construct_message (parser);
-
- memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message);
-
-#ifdef GMIME24
- g_mime_message_foreach (message, mime_learn_foreach_callback, session);
-#else
- g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
-#endif
-
- /* free the parser (and the stream) */
- g_object_unref (parser);
-
- return 0;
-}
-
-/*
- * XXX: remove this function for learning
- */
-GByteArray*
-get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
-{
- struct mime_part *p;
-
- if (*cur == NULL) {
- *cur = g_list_first (parts);
- }
- else {
- *cur = g_list_next (*cur);
- }
-
- while (*cur) {
- p = (*cur)->data;
- /* For text/plain just return bytes */
- if (g_mime_content_type_is_type (p->type, "text", "plain")) {
- msg_debug ("get_next_text_part: text/plain part");
- return p->content;
- }
-#if 0
- else if (g_mime_content_type_is_type (p->type, "text", "html")) {
- msg_debug ("get_next_text_part: try to strip html tags");
- ret = strip_html_tags (p->content, NULL);
- memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
- return ret;
- }
- else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) {
- msg_debug ("get_next_text_part: try to strip html tags");
- ret = strip_html_tags (p->content, NULL);
- memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
- return ret;
- }
-#endif
- *cur = g_list_next (*cur);
- }
-
- return NULL;
-}
-
struct raw_header {
struct raw_header *next;
char *name;
diff --git a/src/message.h b/src/message.h
index 939379ced..13b93b881 100644
--- a/src/message.h
+++ b/src/message.h
@@ -47,22 +47,6 @@ struct received_header {
*/
int process_message (struct worker_task *task);
-/*
- * Process message for learning statfile classifier.
- * It extract text and html parts and strip tags from html parts
- * @param session session that contains message
- * @return 0 allways (may be changed in future)
- */
-int process_learn (struct controller_session *session);
-
-/**
- * Return next text part (or html with stripped tags) for specified list
- * @param pool memory pool in which place object
- * @param parts current position in list
- * @param cur pointer to which we save current position after processing
- */
-GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur);
-
void message_set_header (GMimeMessage *message, const char *field, const char *value);
GList* message_get_header (memory_pool_t *pool, GMimeMessage *message, const char *field);