aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-23 21:55:14 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-23 21:55:14 +0000
commit b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d (patch)
tree 57bd1d0f8cc1f896956105ccfbd66c69615c7d3c
parent c07fb26ef4e2f903522a6cf85617d60acebd9ad2 (diff)
download rspamd-b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d.tar.gz
rspamd-b18ee66bc2febb8caf3a9d651cd9c48f29c15a2d.zip
Rework tokenization invocation.
-rw-r--r-- src/libstat/stat_process.c 53
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 37
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 3
3 files changed, 51 insertions, 42 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 2c17e4b7e..042f8a3bd 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -25,6 +25,7 @@
#include "stat_api.h"
#include "main.h"
#include "stat_internal.h"
+#include "message.h"
#include "lua/lua_common.h"
#include <utlist.h>
@@ -73,6 +74,54 @@ rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool,
return tok;
}
+/*
+ * Tokenize task using the tokenizer specified: words of each non-empty text part, then of the subject, are passed to tok->tokenizer->tokenize_func with tok->tokens
+ */
+static void
+rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
+ struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok)
+{
+ struct mime_text_part *part;
+ GArray *words;
+ gchar *sub;
+ GList *cur;
+
+ cur = task->text_parts;
+ /* First pass: walk the text parts list and tokenize the words stored in each part */
+ while (cur != NULL) {
+ part = (struct mime_text_part *)cur->data;
+
+ if (!part->is_empty && part->words != NULL) {
+ /*
+ * XXX: Use normalized words if needed here
+ */
+ tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool,
+ part->words, tok->tokens, part->is_utf);
+ }
+
+ cur = g_list_next (cur);
+ }
+ /* Second pass: the subject; task->subject, when set, is used instead of the message header */
+ if (task->subject != NULL) {
+ sub = task->subject;
+ }
+ else {
+ sub = (gchar *)g_mime_message_get_subject (task->message);
+ }
+ /* The subject is split with is_utf = TRUE and the temporary words array is freed after use */
+ if (sub != NULL) {
+ words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+ if (words != NULL) {
+ tok->tokenizer->tokenize_func (tok->tokenizer,
+ task->task_pool,
+ words,
+ tok->tokens,
+ TRUE);
+ g_array_free (words, TRUE);
+ }
+ }
+}
+
gboolean
rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
@@ -80,7 +129,6 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
struct rspamd_stat_classifier *cls;
struct rspamd_classifier_config *clcf;
GList *cur;
- guint i;
struct rspamd_stat_ctx *st_ctx;
struct rspamd_tokenizer_runtime *tklist = NULL, *tok;
@@ -109,8 +157,9 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
return FALSE;
}
+ rspamd_stat_process_tokenize (st_ctx, task, tok);
- if (!rspamd_stat_preprocess (st_ctx, task, cls, err)) {
+ if (!rspamd_stat_preprocess (st_ctx, cls, task, err)) {
return FALSE;
}
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 54b83d33e..6ec7b1e10 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -198,43 +198,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
return res;
}
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
- gchar *sub;
- struct rspamd_stat_tokenizer *osb_tokenizer;
- GArray *words;
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
- }
-
- osb_tokenizer = rspamd_stat_get_tokenizer ("osb-text");
-
- /* Try to use pre-defined subject */
- if (task->subject != NULL) {
- sub = task->subject;
- }
- else {
- sub = (gchar *)g_mime_message_get_subject (task->message);
- }
-
- if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
- if (words != NULL) {
- osb_tokenizer->tokenize_func (osb_tokenizer,
- task->task_pool,
- words,
- *tree,
- TRUE);
- g_array_free (words, TRUE);
- }
- }
-}
-
/*
* vi:ts=4
*/
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 7d4523bfb..d4c116e13 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -37,9 +37,6 @@ int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
GTree *tokens,
gboolean is_utf);
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
#endif
/*
* vi:ts=4