source.dussan.org Git - rspamd.git/commitdiff
Rework tokenization invocation.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 23 Jan 2015 21:55:14 +0000 (21:55 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 23 Jan 2015 21:55:14 +0000 (21:55 +0000)
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h

index 2c17e4b7e9d3b9c89fb7bbf94254c4a835c7a576..042f8a3bd2fb94bbfb57eea8de132374be958f9e 100644 (file)
@@ -25,6 +25,7 @@
 #include "stat_api.h"
 #include "main.h"
 #include "stat_internal.h"
+#include "message.h"
 #include "lua/lua_common.h"
 #include <utlist.h>
 
@@ -73,6 +74,54 @@ rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool,
        return tok;
 }
 
+/* Tokenize a task with the given tokenizer runtime: the words of every text part
+ * that is not empty and has words, then the words of the subject (task->subject,
+ * else the MIME message subject), go to tok->tokenizer->tokenize_func with tok->tokens. */
+static void
+rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
+               struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok)
+{
+       struct mime_text_part *part;
+       GArray *words;
+       gchar *sub;
+       GList *cur;
+
+       cur = task->text_parts;
+
+       while (cur != NULL) {
+               part = (struct mime_text_part *)cur->data;
+
+               if (!part->is_empty && part->words != NULL) {
+                       /*
+                        * XXX: Use normalized words if needed here
+                        */
+                       tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool,
+                                       part->words, tok->tokens, part->is_utf);
+               }
+
+               cur = g_list_next (cur);
+       }
+
+       if (task->subject != NULL) {
+               sub = task->subject;
+       }
+       else {
+               sub = (gchar *)g_mime_message_get_subject (task->message);
+       }
+
+       if (sub != NULL) {
+               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+               if (words != NULL) {
+                       tok->tokenizer->tokenize_func (tok->tokenizer,
+                                       task->task_pool,
+                                       words,
+                                       tok->tokens,
+                                       TRUE);
+                       g_array_free (words, TRUE);
+               }
+       }
+}
+
 
 gboolean
 rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
@@ -80,7 +129,6 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
        struct rspamd_stat_classifier *cls;
        struct rspamd_classifier_config *clcf;
        GList *cur;
-       guint i;
        struct rspamd_stat_ctx *st_ctx;
        struct rspamd_tokenizer_runtime *tklist = NULL, *tok;
 
@@ -109,8 +157,9 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
                        return FALSE;
                }
 
+               rspamd_stat_process_tokenize (st_ctx, task, tok);
 
-               if (!rspamd_stat_preprocess (st_ctx, task, cls, err)) {
+               if (!rspamd_stat_preprocess (st_ctx, cls, task, err)) {
                        return FALSE;
                }
 
index 54b83d33e646e770cdf83e6ea4ae26d34815f6cb..6ec7b1e10819cc6275f4daebbdf14d413e9ae714 100644 (file)
@@ -198,43 +198,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
        return res;
 }
 
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
-       gchar *sub;
-       struct rspamd_stat_tokenizer *osb_tokenizer;
-       GArray *words;
-
-       if (*tree == NULL) {
-               *tree = g_tree_new (token_node_compare_func);
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
-       }
-
-       osb_tokenizer = rspamd_stat_get_tokenizer ("osb-text");
-
-       /* Try to use pre-defined subject */
-       if (task->subject != NULL) {
-               sub = task->subject;
-       }
-       else {
-               sub = (gchar *)g_mime_message_get_subject (task->message);
-       }
-
-       if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
-               if (words != NULL) {
-                       osb_tokenizer->tokenize_func (osb_tokenizer,
-                                       task->task_pool,
-                                       words,
-                                       *tree,
-                                       TRUE);
-                       g_array_free (words, TRUE);
-               }
-       }
-}
-
 /*
  * vi:ts=4
  */
index 7d4523bfbca2a2d9ff9ca36bfb21c1194f350fe1..d4c116e1327c35a8691869bfd6ad57c63db2218c 100644 (file)
@@ -37,9 +37,6 @@ int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
        GTree *tokens,
        gboolean is_utf);
 
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
 #endif
 /*
  * vi:ts=4