* Fix learning

author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-09-25 17:33:16 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-09-25 17:33:16 +0400
commit: a9ab6be27de01f12ce6f201a4efa3eda8be0e36b (patch)
tree: 3c2cc297cc7c904d20a36dc783668632ed8088c2 /src
parent: 3dd95c9525babd0ba5be237663132a69ebf71a2a (diff)
download: rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.tar.gz rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.zip
4 files changed, 37 insertions, 189 deletions
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index 94b342525..6abd973ed 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -149,7 +149,11 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input
 void
 winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
 {
-	struct winnow_callback_data data;
+	struct winnow_callback_data data = { 
+		.file = NULL, 
+		.sum = 0,
+		.count = 0,
+	};
 	GList *cur;
 	struct statfile *st;
 	
@@ -157,8 +161,6 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, G
 	g_assert (ctx != NULL);
 
 	data.pool = pool;
-	data.sum = 0;
-	data.count = 0;
 	data.in_class = in_class;
 	data.now = time (NULL);
 	data.ctx = ctx;
diff --git a/src/controller.c b/src/controller.c
index 0e11b6ae1..82dc02c0f 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -306,7 +306,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
 					return;
 				}
 
-				session->learn_symbol = *cmd_args;
+				session->learn_symbol = memory_pool_strdup (session->session_pool, *cmd_args);
 				cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args);
 				if (cl == NULL) {
 					r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args);
@@ -399,12 +399,12 @@ controller_read_socket (f_str_t *in, void *arg)
 {
 	struct controller_session *session = (struct controller_session *)arg;
 	struct classifier_ctx *cls_ctx;
-	int len, i;
+	int len, i, r;
 	char *s, **params, *cmd, out_buf[128];
+    struct worker_task *task;
+    struct mime_text_part *part;
 	GList *comp_list, *cur = NULL;
 	GTree *tokens = NULL;
-	GByteArray *content = NULL;
-	struct mime_part *p;
 	f_str_t c;
 
 	switch (session->state) {
@@ -450,33 +450,50 @@ controller_read_socket (f_str_t *in, void *arg)
 			break;
 		case STATE_LEARN:
 			session->learn_buf = in;
-			process_learn (session);
-			while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
-				c.begin = content->data;
-				c.len = content->len;
+           	task = construct_task (session->worker);
+	
+	        task->msg = memory_pool_alloc (task->task_pool, sizeof (f_str_t));
+	        task->msg->begin = in->begin;
+	        task->msg->len = in->len;
+
+	        r = process_message (task);
+        	if (r == -1) {
+                msg_warn ("read_socket: processing of message failed");
+                free_task (task, FALSE);
+                session->state = STATE_REPLY;
+                r = snprintf (out_buf, sizeof (out_buf), "cannot process message" CRLF);
+                rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
+                return FALSE;
+            } 
+            cur = g_list_first (task->text_parts);
+			while (cur) {
+				part = cur->data;
+				if (part->is_empty) {
+					cur = g_list_next (cur);
+					continue;
+				}
+				c.begin = part->content->data;
+				c.len = part->content->len;
+
 				if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, 
 							session->session_pool, &c, &tokens)) {
 					i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
+					free_task (task, FALSE);
 					if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
                         return FALSE;
                     }
 					session->state = STATE_REPLY;
 					return TRUE;
 				}
+				cur = g_list_next (cur);
 			}
 			cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier);
 			session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
 													session->learn_symbol, tokens, session->in_class);
 			session->worker->srv->stat->messages_learned ++;
 
-			/* Clean learned parts */
-			while ((cur = g_list_first (session->parts))) {
-				session->parts = g_list_remove_link (session->parts, cur);
-				p = (struct mime_part *)cur->data;
-				g_byte_array_free (p->content, FALSE);
-				g_list_free_1 (cur);
-			}
 
+            free_task (task, FALSE);
 			i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
 			if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
                 return FALSE;
diff --git a/src/message.c b/src/message.c
index 1416f22ab..f1886e687 100644
--- a/src/message.c
+++ b/src/message.c
@@ -770,161 +770,6 @@ process_message (struct worker_task *task)
 	return 0;
 }
 
-#ifdef GMIME24
-static void
-mime_learn_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
-#else
-static void
-mime_learn_foreach_callback (GMimeObject *part, gpointer user_data)
-#endif
-{
-	struct controller_session *session = (struct controller_session *)user_data;
-	struct mime_part *mime_part;
-	GMimeContentType *type;
-	GMimeDataWrapper *wrapper;
-	GMimeStream *part_stream;
-	GByteArray *part_content;
-	
-	/* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-	
-	/* find out what class 'part' is... */
-	if (GMIME_IS_MESSAGE_PART (part)) {
-		/* message/rfc822 or message/news */
-		GMimeMessage *message;
-		
-		/* g_mime_message_foreach_part() won't descend into
-                   child message parts, so if we want to count any
-                   subparts of this child message, we'll have to call
-                   g_mime_message_foreach_part() again here. */
-		message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-#ifdef GMIME24
-		g_mime_message_foreach (message, mime_learn_foreach_callback, session);
-#else
-		g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
-#endif
-		g_object_unref (message);
-	} else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
-		/* message/partial */
-		
-		/* this is an incomplete message part, probably a
-                   large message that the sender has broken into
-                   smaller parts and is sending us bit by bit. we
-                   could save some info about it so that we could
-                   piece this back together again once we get all the
-                   parts? */
-	} else if (GMIME_IS_MULTIPART (part)) {
-		/* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-		
-		/* we'll get to finding out if this is a signed/encrypted multipart later... */
-	} else if (GMIME_IS_PART (part)) {
-		/* a normal leaf part, could be text/plain or image/jpeg etc */
-		wrapper = g_mime_part_get_content_object (GMIME_PART (part));
-		if (wrapper != NULL) {
-			part_stream = g_mime_stream_mem_new ();
-			if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
-				g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (part_stream), FALSE);
-				part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
-				g_object_unref (part_stream);
-#ifdef GMIME24
-				type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
-#else
-				type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-#endif
-				mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part));
-				mime_part->type = type;
-				mime_part->content = part_content;
-				session->parts = g_list_prepend (session->parts, mime_part);
-			}
-			g_object_unref (wrapper);
-		}
-	} else {
-		g_assert_not_reached ();
-	}
-}
-
-int
-process_learn (struct controller_session *session)
-{
-	GMimeMessage *message;
-	GMimeParser *parser;
-	GMimeStream *stream;
-	GByteArray *tmp;
-    
-	tmp = memory_pool_alloc (session->session_pool, sizeof (GByteArray));
-	tmp->data = session->learn_buf->begin;
-	tmp->len = session->learn_buf->len;
-	stream = g_mime_stream_mem_new_with_byte_array (tmp);
-	/* 
-	 * This causes g_mime_stream not to free memory by itself as it is memory allocated by
-	 * pool allocator
-	 */
-	g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
-
-	/* create a new parser object to parse the stream */
-	parser = g_mime_parser_new_with_stream (stream);
-
-	/* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
-	g_object_unref (stream);
-
-	/* parse the message from the stream */
-	message = g_mime_parser_construct_message (parser);
-	
-	memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message);
-
-#ifdef GMIME24
-	g_mime_message_foreach (message, mime_learn_foreach_callback, session);
-#else
-	g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
-#endif
-
-	/* free the parser (and the stream) */
-	g_object_unref (parser);
-	
-	return 0;
-}
-
-/*
- * XXX: remove this function for learning
- */
-GByteArray* 
-get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
-{
-	struct mime_part *p;
-
-	if (*cur == NULL) {
-		*cur = g_list_first (parts);
-	}
-	else {
-		*cur = g_list_next (*cur);
-	}
-	
-	while (*cur) {
-		p = (*cur)->data;
-		/* For text/plain just return bytes */
-		if (g_mime_content_type_is_type (p->type, "text", "plain")) {
-			msg_debug ("get_next_text_part: text/plain part");
-			return p->content;
-		}
-#if 0
-		else if (g_mime_content_type_is_type (p->type, "text", "html")) {
-			msg_debug ("get_next_text_part: try to strip html tags");
-			ret = strip_html_tags (p->content, NULL);
-			memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
-			return ret;
-		}
-		else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) {
-			msg_debug ("get_next_text_part: try to strip html tags");
-			ret = strip_html_tags (p->content, NULL);
-			memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
-			return ret;
-		}
-#endif
-		*cur = g_list_next (*cur);
-	}
-	
-	return NULL;
-}
-
 struct raw_header {
     struct raw_header *next;
     char *name;
diff --git a/src/message.h b/src/message.h
index 939379ced..13b93b881 100644
--- a/src/message.h
+++ b/src/message.h
@@ -47,22 +47,6 @@ struct received_header {
  */
 int process_message (struct worker_task *task);
 
-/*
- * Process message for learning statfile classifier. 
- * It extract text and html parts and strip tags from html parts
- * @param session session that contains message
- * @return 0 allways (may be changed in future) 
- */
-int process_learn (struct controller_session *session);
-
-/**
- * Return next text part (or html with stripped tags) for specified list
- * @param pool memory pool in which place object
- * @param parts current position in list
- * @param cur pointer to which we save current position after processing
- */
-GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur);
-
 void message_set_header (GMimeMessage *message, const char *field, const char *value);
 GList* message_get_header (memory_pool_t *pool, GMimeMessage *message, const char *field);
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-09-25 17:33:16 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-09-25 17:33:16 +0400
commit	a9ab6be27de01f12ce6f201a4efa3eda8be0e36b (patch)
tree	3c2cc297cc7c904d20a36dc783668632ed8088c2 /src
parent	3dd95c9525babd0ba5be237663132a69ebf71a2a (diff)
download	rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.tar.gz rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.zip