From: Vsevolod Stakhov Date: Fri, 23 Jan 2015 21:55:14 +0000 (+0000) Subject: Rework tokenization invocation. X-Git-Tag: 0.9.0~840 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d;p=rspamd.git Rework tokenization invocation. --- diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 2c17e4b7e..042f8a3bd 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -25,6 +25,7 @@ #include "stat_api.h" #include "main.h" #include "stat_internal.h" +#include "message.h" #include "lua/lua_common.h" #include @@ -73,6 +74,54 @@ rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool, return tok; } +/* + * Tokenize task using the tokenizer specified + */ +static void +rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok) +{ + struct mime_text_part *part; + GArray *words; + gchar *sub; + GList *cur; + + cur = task->text_parts; + + while (cur != NULL) { + part = (struct mime_text_part *)cur->data; + + if (!part->is_empty && part->words != NULL) { + /* + * XXX: Use normalized words if needed here + */ + tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool, + part->words, tok->tokens, part->is_utf); + } + + cur = g_list_next (cur); + } + + if (task->subject != NULL) { + sub = task->subject; + } + else { + sub = (gchar *)g_mime_message_get_subject (task->message); + } + + if (sub != NULL) { + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); + if (words != NULL) { + tok->tokenizer->tokenize_func (tok->tokenizer, + task->task_pool, + words, + tok->tokens, + TRUE); + g_array_free (words, TRUE); + } + } +} + gboolean rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err) @@ -80,7 +129,6 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err) struct rspamd_stat_classifier *cls; struct rspamd_classifier_config *clcf; GList *cur; - guint i; struct rspamd_stat_ctx *st_ctx; struct rspamd_tokenizer_runtime *tklist = NULL, *tok; @@ -109,8 +157,9 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err) return FALSE; } + rspamd_stat_process_tokenize (st_ctx, task, tok); - if (!rspamd_stat_preprocess (st_ctx, task, cls, err)) { + if (!rspamd_stat_preprocess (st_ctx, cls, task, err)) { return FALSE; } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 54b83d33e..6ec7b1e10 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -198,43 +198,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, return res; } - -void -tokenize_subject (struct rspamd_task *task, GTree ** tree) -{ - gchar *sub; - struct rspamd_stat_tokenizer *osb_tokenizer; - GArray *words; - - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_tree_destroy, *tree); - } - - osb_tokenizer = rspamd_stat_get_tokenizer ("osb-text"); - - /* Try to use pre-defined subject */ - if (task->subject != NULL) { - sub = task->subject; - } - else { - sub = (gchar *)g_mime_message_get_subject (task->message); - } - - if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); - if (words != NULL) { - osb_tokenizer->tokenize_func (osb_tokenizer, - task->task_pool, - words, - *tree, - TRUE); - g_array_free (words, TRUE); - } - } -} - /* * vi:ts=4 */ diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 7d4523bfb..d4c116e13 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -37,9 +37,6 @@ int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer, GTree *tokens, gboolean is_utf); -/* Make tokens for a subject */ -void tokenize_subject (struct rspamd_task *task, GTree ** tree); - #endif /* * vi:ts=4