diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-01-23 21:55:14 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-01-23 21:55:14 +0000 |
commit | b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d (patch) | |
tree | 57bd1d0f8cc1f896956105ccfbd66c69615c7d3c /src | |
parent | c07fb26ef4e2f903522a6cf85617d60acebd9ad2 (diff) | |
download | rspamd-b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d.tar.gz rspamd-b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d.zip |
Rework tokenization invocation.
Diffstat (limited to 'src')
mode | file | lines changed |
---|---|---|
-rw-r--r-- | src/libstat/stat_process.c | 53 |
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 37 |
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 3 |

3 files changed, 51 insertions, 42 deletions
```diff
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 2c17e4b7e..042f8a3bd 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -25,6 +25,7 @@
 #include "stat_api.h"
 #include "main.h"
 #include "stat_internal.h"
+#include "message.h"
 #include "lua/lua_common.h"
 #include <utlist.h>
@@ -73,6 +74,54 @@ rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool,
 	return tok;
 }
+/*
+ * Tokenize task using the tokenizer specified
+ */
+static void
+rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
+		struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok)
+{
+	struct mime_text_part *part;
+	GArray *words;
+	gchar *sub;
+	GList *cur;
+
+	cur = task->text_parts;
+
+	while (cur != NULL) {
+		part = (struct mime_text_part *)cur->data;
+
+		if (!part->is_empty && part->words != NULL) {
+			/*
+			 * XXX: Use normalized words if needed here
+			 */
+			tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool,
+				part->words, tok->tokens, part->is_utf);
+		}
+
+		cur = g_list_next (cur);
+	}
+
+	if (task->subject != NULL) {
+		sub = task->subject;
+	}
+	else {
+		sub = (gchar *)g_mime_message_get_subject (task->message);
+	}
+
+	if (sub != NULL) {
+		words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+		if (words != NULL) {
+			tok->tokenizer->tokenize_func (tok->tokenizer,
+				task->task_pool,
+				words,
+				tok->tokens,
+				TRUE);
+			g_array_free (words, TRUE);
+		}
+	}
+}
+
 gboolean
 rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
@@ -80,7 +129,6 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
 	struct rspamd_stat_classifier *cls;
 	struct rspamd_classifier_config *clcf;
 	GList *cur;
-	guint i;
 	struct rspamd_stat_ctx *st_ctx;
 	struct rspamd_tokenizer_runtime *tklist = NULL, *tok;
@@ -109,8 +157,9 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
 		return FALSE;
 	}
+	rspamd_stat_process_tokenize (st_ctx, task, tok);
-	if (!rspamd_stat_preprocess (st_ctx, task, cls, err)) {
+	if (!rspamd_stat_preprocess (st_ctx, cls, task, err)) {
 		return FALSE;
 	}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 54b83d33e..6ec7b1e10 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -198,43 +198,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 	return res;
 }
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
-	gchar *sub;
-	struct rspamd_stat_tokenizer *osb_tokenizer;
-	GArray *words;
-
-	if (*tree == NULL) {
-		*tree = g_tree_new (token_node_compare_func);
-		rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t) g_tree_destroy, *tree);
-	}
-
-	osb_tokenizer = rspamd_stat_get_tokenizer ("osb-text");
-
-	/* Try to use pre-defined subject */
-	if (task->subject != NULL) {
-		sub = task->subject;
-	}
-	else {
-		sub = (gchar *)g_mime_message_get_subject (task->message);
-	}
-
-	if (sub != NULL) {
-		words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
-		if (words != NULL) {
-			osb_tokenizer->tokenize_func (osb_tokenizer,
-				task->task_pool,
-				words,
-				*tree,
-				TRUE);
-			g_array_free (words, TRUE);
-		}
-	}
-}
-
 /*
  * vi:ts=4
  */
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 7d4523bfb..d4c116e13 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -37,9 +37,6 @@ int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
 	GTree *tokens,
 	gboolean is_utf);
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
 #endif
 /*
  * vi:ts=4
```