diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-09-25 17:33:16 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-09-25 17:33:16 +0400 |
commit | a9ab6be27de01f12ce6f201a4efa3eda8be0e36b (patch) | |
tree | 3c2cc297cc7c904d20a36dc783668632ed8088c2 /src | |
parent | 3dd95c9525babd0ba5be237663132a69ebf71a2a (diff) | |
download | rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.tar.gz rspamd-a9ab6be27de01f12ce6f201a4efa3eda8be0e36b.zip |
* Fix learning
Diffstat (limited to 'src')
-rw-r--r-- | src/classifiers/winnow.c | 8 |
-rw-r--r-- | src/controller.c | 47 |
-rw-r--r-- | src/message.c | 155 |
-rw-r--r-- | src/message.h | 16 |
4 files changed, 37 insertions, 189 deletions
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index 94b342525..6abd973ed 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -149,7 +149,11 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input void winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class) { - struct winnow_callback_data data; + struct winnow_callback_data data = { + .file = NULL, + .sum = 0, + .count = 0, + }; GList *cur; struct statfile *st; @@ -157,8 +161,6 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, G g_assert (ctx != NULL); data.pool = pool; - data.sum = 0; - data.count = 0; data.in_class = in_class; data.now = time (NULL); data.ctx = ctx; diff --git a/src/controller.c b/src/controller.c index 0e11b6ae1..82dc02c0f 100644 --- a/src/controller.c +++ b/src/controller.c @@ -306,7 +306,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control return; } - session->learn_symbol = *cmd_args; + session->learn_symbol = memory_pool_strdup (session->session_pool, *cmd_args); cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args); if (cl == NULL) { r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args); @@ -399,12 +399,12 @@ controller_read_socket (f_str_t *in, void *arg) { struct controller_session *session = (struct controller_session *)arg; struct classifier_ctx *cls_ctx; - int len, i; + int len, i, r; char *s, **params, *cmd, out_buf[128]; + struct worker_task *task; + struct mime_text_part *part; GList *comp_list, *cur = NULL; GTree *tokens = NULL; - GByteArray *content = NULL; - struct mime_part *p; f_str_t c; switch (session->state) { @@ -450,33 +450,50 @@ controller_read_socket (f_str_t *in, void *arg) break; case STATE_LEARN: session->learn_buf = in; - process_learn (session); - while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != 
NULL) { - c.begin = content->data; - c.len = content->len; + task = construct_task (session->worker); + + task->msg = memory_pool_alloc (task->task_pool, sizeof (f_str_t)); + task->msg->begin = in->begin; + task->msg->len = in->len; + + r = process_message (task); + if (r == -1) { + msg_warn ("read_socket: processing of message failed"); + free_task (task, FALSE); + session->state = STATE_REPLY; + r = snprintf (out_buf, sizeof (out_buf), "cannot process message" CRLF); + rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE); + return FALSE; + } + cur = g_list_first (task->text_parts); + while (cur) { + part = cur->data; + if (part->is_empty) { + cur = g_list_next (cur); + continue; + } + c.begin = part->content->data; + c.len = part->content->len; + if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, session->session_pool, &c, &tokens)) { i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF); + free_task (task, FALSE); if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { return FALSE; } session->state = STATE_REPLY; return TRUE; } + cur = g_list_next (cur); } cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier); session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool, session->learn_symbol, tokens, session->in_class); session->worker->srv->stat->messages_learned ++; - /* Clean learned parts */ - while ((cur = g_list_first (session->parts))) { - session->parts = g_list_remove_link (session->parts, cur); - p = (struct mime_part *)cur->data; - g_byte_array_free (p->content, FALSE); - g_list_free_1 (cur); - } + free_task (task, FALSE); i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF); if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { return FALSE; diff --git a/src/message.c b/src/message.c index 1416f22ab..f1886e687 100644 
--- a/src/message.c +++ b/src/message.c @@ -770,161 +770,6 @@ process_message (struct worker_task *task) return 0; } -#ifdef GMIME24 -static void -mime_learn_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data) -#else -static void -mime_learn_foreach_callback (GMimeObject *part, gpointer user_data) -#endif -{ - struct controller_session *session = (struct controller_session *)user_data; - struct mime_part *mime_part; - GMimeContentType *type; - GMimeDataWrapper *wrapper; - GMimeStream *part_stream; - GByteArray *part_content; - - /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ - - /* find out what class 'part' is... */ - if (GMIME_IS_MESSAGE_PART (part)) { - /* message/rfc822 or message/news */ - GMimeMessage *message; - - /* g_mime_message_foreach_part() won't descend into - child message parts, so if we want to count any - subparts of this child message, we'll have to call - g_mime_message_foreach_part() again here. */ - message = g_mime_message_part_get_message ((GMimeMessagePart *) part); -#ifdef GMIME24 - g_mime_message_foreach (message, mime_learn_foreach_callback, session); -#else - g_mime_message_foreach_part (message, mime_learn_foreach_callback, session); -#endif - g_object_unref (message); - } else if (GMIME_IS_MESSAGE_PARTIAL (part)) { - /* message/partial */ - - /* this is an incomplete message part, probably a - large message that the sender has broken into - smaller parts and is sending us bit by bit. we - could save some info about it so that we could - piece this back together again once we get all the - parts? */ - } else if (GMIME_IS_MULTIPART (part)) { - /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ - - /* we'll get to finding out if this is a signed/encrypted multipart later... 
*/ - } else if (GMIME_IS_PART (part)) { - /* a normal leaf part, could be text/plain or image/jpeg etc */ - wrapper = g_mime_part_get_content_object (GMIME_PART (part)); - if (wrapper != NULL) { - part_stream = g_mime_stream_mem_new (); - if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { - g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (part_stream), FALSE); - part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream)); - g_object_unref (part_stream); -#ifdef GMIME24 - type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part)); -#else - type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part)); -#endif - mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part)); - mime_part->type = type; - mime_part->content = part_content; - session->parts = g_list_prepend (session->parts, mime_part); - } - g_object_unref (wrapper); - } - } else { - g_assert_not_reached (); - } -} - -int -process_learn (struct controller_session *session) -{ - GMimeMessage *message; - GMimeParser *parser; - GMimeStream *stream; - GByteArray *tmp; - - tmp = memory_pool_alloc (session->session_pool, sizeof (GByteArray)); - tmp->data = session->learn_buf->begin; - tmp->len = session->learn_buf->len; - stream = g_mime_stream_mem_new_with_byte_array (tmp); - /* - * This causes g_mime_stream not to free memory by itself as it is memory allocated by - * pool allocator - */ - g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE); - - /* create a new parser object to parse the stream */ - parser = g_mime_parser_new_with_stream (stream); - - /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */ - g_object_unref (stream); - - /* parse the message from the stream */ - message = g_mime_parser_construct_message (parser); - - memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message); - -#ifdef GMIME24 - 
g_mime_message_foreach (message, mime_learn_foreach_callback, session); -#else - g_mime_message_foreach_part (message, mime_learn_foreach_callback, session); -#endif - - /* free the parser (and the stream) */ - g_object_unref (parser); - - return 0; -} - -/* - * XXX: remove this function for learning - */ -GByteArray* -get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur) -{ - struct mime_part *p; - - if (*cur == NULL) { - *cur = g_list_first (parts); - } - else { - *cur = g_list_next (*cur); - } - - while (*cur) { - p = (*cur)->data; - /* For text/plain just return bytes */ - if (g_mime_content_type_is_type (p->type, "text", "plain")) { - msg_debug ("get_next_text_part: text/plain part"); - return p->content; - } -#if 0 - else if (g_mime_content_type_is_type (p->type, "text", "html")) { - msg_debug ("get_next_text_part: try to strip html tags"); - ret = strip_html_tags (p->content, NULL); - memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret); - return ret; - } - else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) { - msg_debug ("get_next_text_part: try to strip html tags"); - ret = strip_html_tags (p->content, NULL); - memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret); - return ret; - } -#endif - *cur = g_list_next (*cur); - } - - return NULL; -} - struct raw_header { struct raw_header *next; char *name; diff --git a/src/message.h b/src/message.h index 939379ced..13b93b881 100644 --- a/src/message.h +++ b/src/message.h @@ -47,22 +47,6 @@ struct received_header { */ int process_message (struct worker_task *task); -/* - * Process message for learning statfile classifier. 
- * It extract text and html parts and strip tags from html parts - * @param session session that contains message - * @return 0 allways (may be changed in future) - */ -int process_learn (struct controller_session *session); - -/** - * Return next text part (or html with stripped tags) for specified list - * @param pool memory pool in which place object - * @param parts current position in list - * @param cur pointer to which we save current position after processing - */ -GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur); - void message_set_header (GMimeMessage *message, const char *field, const char *value); GList* message_get_header (memory_pool_t *pool, GMimeMessage *message, const char *field); |