]> source.dussan.org Git - rspamd.git/commitdiff
Add initial processing routines.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 23 Jan 2015 16:49:42 +0000 (16:49 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 23 Jan 2015 16:49:42 +0000 (16:49 +0000)
src/libstat/backends/mmaped_file.c
src/libstat/classifiers/bayes.c
src/libstat/classifiers/classifiers.h
src/libstat/stat_process.c [new file with mode: 0644]
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizers.c

index 49f4d5ba4e234a49cf1c4ffe6fe03efdb63103d6..f703f7f5b3a03ee4f5562e96b0a0be4b6cdf2598 100644 (file)
@@ -23,7 +23,7 @@
  */
 
 #include "config.h"
-
+#include "stat_internal.h"
 #include "main.h"
 
 #define CHAIN_LENGTH 128
index 54db73d9e8bb4c0069277d8f55c070db12701ec7..6e068b79d430c245093cf177149e677659694ac3 100644 (file)
  * Bayesian classifier
  */
 #include "classifiers.h"
-#include "tokenizers.h"
 #include "main.h"
 #include "filter.h"
 #include "cfg_file.h"
-#include "lua/lua_common.h"
+#include "stat_internal.h"
 
 #define LOCAL_PROB_DENOM 16.0
 
@@ -203,8 +202,7 @@ bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
 gboolean
 bayes_classify (struct classifier_ctx * ctx,
        GTree *input,
-       struct rspamd_task *task,
-       lua_State *L)
+       struct rspamd_task *task)
 {
        struct bayes_callback_data data;
        gchar *value;
@@ -228,6 +226,8 @@ bayes_classify (struct classifier_ctx * ctx,
                }
        }
 
+       cur = ctx->cfg->statfiles;
+#if 0
        cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
        if (cur) {
                rspamd_mempool_add_destructor (task->task_pool,
@@ -236,6 +236,8 @@ bayes_classify (struct classifier_ctx * ctx,
        else {
                cur = ctx->cfg->statfiles;
        }
+#endif
+
 
        data.statfiles_num = g_list_length (cur);
        data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
@@ -312,7 +314,6 @@ bayes_learn_spam (struct classifier_ctx * ctx,
        GTree *input,
        struct rspamd_task *task,
        gboolean is_spam,
-       lua_State *L,
        GError **err)
 {
        struct bayes_callback_data data;
index 6a77f5aed5016216cf34ed0f81db80a65a24da89..e2bf57f81f30d443da20c5712723ef06cfd20963 100644 (file)
@@ -2,6 +2,7 @@
 #define CLASSIFIERS_H
 
 #include "config.h"
+#include "mem_pool.h"
 
 /* Consider this value as 0 */
 #define ALPHA 0.0001
@@ -22,10 +23,9 @@ struct rspamd_stat_classifier {
        struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
                struct rspamd_classifier_config *cf);
        gboolean (*classify_func)(struct classifier_ctx * ctx,
-               GTree *input, struct rspamd_task *task,
-               lua_State *L);
+               GTree *input, struct rspamd_task *task);
        gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
-               GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L,
+               GTree *input, struct rspamd_task *task, gboolean is_spam,
                GError **err);
 };
 
@@ -34,13 +34,11 @@ struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
        struct rspamd_classifier_config *cf);
 gboolean bayes_classify (struct classifier_ctx * ctx,
        GTree *input,
-       struct rspamd_task *task,
-       lua_State *L);
+       struct rspamd_task *task);
 gboolean bayes_learn_spam (struct classifier_ctx * ctx,
        GTree *input,
        struct rspamd_task *task,
        gboolean is_spam,
-       lua_State *L,
        GError **err);
 
 #endif
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
new file mode 100644 (file)
index 0000000..2c17e4b
--- /dev/null
@@ -0,0 +1,121 @@
+/* Copyright (c) 2015, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "stat_api.h"
+#include "main.h"
+#include "stat_internal.h"
+#include "lua/lua_common.h"
+#include <utlist.h>
+
+/*
+ * Preprocess a task for the given classifier.
+ * This is a placeholder that performs no work yet.
+ *
+ * st_ctx - statistics context
+ * cls    - classifier the task is being prepared for
+ * task   - task being processed
+ * err    - error sink, left untouched by the placeholder
+ *
+ * Returns TRUE: a no-op cannot fail, and rspamd_stat_classify aborts when
+ * this function reports FALSE.
+ */
+static gboolean
+rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, struct rspamd_stat_classifier *cls,
+               struct rspamd_task *task, GError **err)
+{
+       /*
+        * Falling off the end of a non-void function whose value is used by
+        * the caller is undefined behaviour, so report success explicitly.
+        */
+       return TRUE;
+}
+
+struct rspamd_tokenizer_runtime { /* Node of the tokenizer runtime list, one per distinct tokenizer name */
+       GTree *tokens; /* Token tree created with token_node_compare_func */
+       const gchar *name; /* Tokenizer name; list lookups compare it with strcmp */
+       struct rspamd_stat_tokenizer *tokenizer; /* Result of rspamd_stat_get_tokenizer (name) */
+       struct rspamd_tokenizer_runtime *next; /* Link used by the utlist LL_* macros */
+};
+
+/*
+ * Return the runtime for the tokenizer called `name`, creating it on demand.
+ *
+ * An entry already present in `*ls` with the same name is reused. Otherwise
+ * a new entry is allocated from `pool`, gets a fresh token tree (with
+ * g_tree_destroy registered as a pool destructor) and is pushed to the head
+ * of `*ls`.
+ *
+ * Returns NULL when rspamd_stat_get_tokenizer finds nothing for `name`.
+ */
+static struct rspamd_tokenizer_runtime *
+rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool,
+               struct rspamd_tokenizer_runtime **ls)
+{
+       struct rspamd_tokenizer_runtime *item, *fresh;
+
+       /* Reuse a runtime created earlier for the same tokenizer name */
+       LL_FOREACH (*ls, item) {
+               if (strcmp (item->name, name) == 0) {
+                       return item;
+               }
+       }
+
+       /* Nothing cached: build a new runtime entry */
+       fresh = rspamd_mempool_alloc (pool, sizeof (*fresh));
+       fresh->tokenizer = rspamd_stat_get_tokenizer (name);
+
+       if (fresh->tokenizer == NULL) {
+               return NULL;
+       }
+
+       fresh->tokens = g_tree_new (token_node_compare_func);
+       rspamd_mempool_add_destructor (pool,
+                       (rspamd_mempool_destruct_t)g_tree_destroy, fresh->tokens);
+       fresh->name = name;
+       LL_PREPEND (*ls, fresh);
+
+       return fresh;
+}
+
+
+/*
+ * Walk every classifier configured for the task, resolve its classifier and
+ * tokenizer implementations and run the preprocessing step for it.
+ *
+ * task - task to classify; its cfg->classifiers list drives the loop
+ * L    - Lua state; not referenced by this routine
+ * err  - set when a classifier or tokenizer type is unknown
+ *
+ * Returns TRUE when all classifiers were processed, FALSE on the first
+ * failure.
+ */
+gboolean
+rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
+{
+       struct rspamd_stat_classifier *cls;
+       struct rspamd_classifier_config *clcf;
+       GList *cur;
+       struct rspamd_stat_ctx *st_ctx;
+       struct rspamd_tokenizer_runtime *tklist = NULL, *tok;
+
+       st_ctx = rspamd_stat_get_ctx ();
+       g_assert (st_ctx != NULL);
+
+       cur = g_list_first (task->cfg->classifiers);
+
+       while (cur) {
+               clcf = (struct rspamd_classifier_config *)cur->data;
+               cls = rspamd_stat_get_classifier (clcf->classifier);
+
+               if (cls == NULL) {
+                       /* Adjacent literals are concatenated: keep the separating space */
+                       g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined "
+                                       "for classifiers", clcf->classifier);
+                       return FALSE;
+               }
+
+               /* Classifiers sharing a tokenizer name share one runtime entry */
+               tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, task->task_pool,
+                               &tklist);
+
+               if (tok == NULL) {
+                       g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined "
+                                       "for tokenizers", clcf->tokenizer);
+                       return FALSE;
+               }
+
+               /* Argument order follows the definition: context, classifier, task */
+               if (!rspamd_stat_preprocess (st_ctx, cls, task, err)) {
+                       return FALSE;
+               }
+
+               cur = g_list_next (cur);
+       }
+
+       return TRUE;
+}
index f9307ded46666017f19093fc76d6188ce68fa3cf..0a8d01ce1328b37ca3b95baee67a7d17c17ebd1e 100644 (file)
@@ -26,8 +26,8 @@
  * OSB tokenizer
  */
 
-#include <sys/types.h>
 #include "tokenizers.h"
+#include "stat_internal.h"
 
 /* Size for features pipe */
 #define FEATURE_WINDOW_SIZE 5
@@ -83,8 +83,8 @@ osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
                                memcpy(new->data, &h1, sizeof(h1));
                                memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
 
-                               if (g_tree_lookup (*tree, new) == NULL) {
-                                       g_tree_insert (*tree, new, new);
+                               if (g_tree_lookup (tree, new) == NULL) {
+                                       g_tree_insert (tree, new, new);
                                }
                        }
                }
index 5cc2a83eab6ad3d95cdc2a049174db02dc17a882..54b83d33e646e770cdf83e6ea4ae26d34815f6cb 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "main.h"
 #include "tokenizers.h"
+#include "stat_internal.h"
 
 const int primes[] = {
        1, 7,
@@ -227,10 +228,8 @@ tokenize_subject (struct rspamd_task *task, GTree ** tree)
                        osb_tokenizer->tokenize_func (osb_tokenizer,
                                        task->task_pool,
                                        words,
-                                       tree,
-                                       FALSE,
-                                       TRUE,
-                                       NULL);
+                                       *tree,
+                                       TRUE);
                        g_array_free (words, TRUE);
                }
        }