From: Vsevolod Stakhov Date: Fri, 23 Jan 2015 16:49:42 +0000 (+0000) Subject: Add initial processing routines. X-Git-Tag: 0.9.0~842 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=caa189167ab79c8fb71de4c74ed4b97ceb619550;p=rspamd.git Add initial processing routines. --- diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c index 49f4d5ba4..f703f7f5b 100644 --- a/src/libstat/backends/mmaped_file.c +++ b/src/libstat/backends/mmaped_file.c @@ -23,7 +23,7 @@ */ #include "config.h" - +#include "stat_internal.h" #include "main.h" #define CHAIN_LENGTH 128 diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 54db73d9e..6e068b79d 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -26,11 +26,10 @@ * Bayesian classifier */ #include "classifiers.h" -#include "tokenizers.h" #include "main.h" #include "filter.h" #include "cfg_file.h" -#include "lua/lua_common.h" +#include "stat_internal.h" #define LOCAL_PROB_DENOM 16.0 @@ -203,8 +202,7 @@ bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg) gboolean bayes_classify (struct classifier_ctx * ctx, GTree *input, - struct rspamd_task *task, - lua_State *L) + struct rspamd_task *task) { struct bayes_callback_data data; gchar *value; @@ -228,6 +226,8 @@ bayes_classify (struct classifier_ctx * ctx, } } + cur = ctx->cfg->statfiles; +#if 0 cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); if (cur) { rspamd_mempool_add_destructor (task->task_pool, @@ -236,6 +236,8 @@ bayes_classify (struct classifier_ctx * ctx, else { cur = ctx->cfg->statfiles; } +#endif + data.statfiles_num = g_list_length (cur); data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num); @@ -312,7 +314,6 @@ bayes_learn_spam (struct classifier_ctx * ctx, GTree *input, struct rspamd_task *task, gboolean is_spam, - lua_State *L, GError **err) { struct bayes_callback_data data; diff --git a/src/libstat/classifiers/classifiers.h b/src/libstat/classifiers/classifiers.h index 6a77f5aed..e2bf57f81 100644 --- a/src/libstat/classifiers/classifiers.h +++ b/src/libstat/classifiers/classifiers.h @@ -2,6 +2,7 @@ #define CLASSIFIERS_H #include "config.h" +#include "mem_pool.h" /* Consider this value as 0 */ #define ALPHA 0.0001 @@ -22,10 +23,9 @@ struct rspamd_stat_classifier { struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool, struct rspamd_classifier_config *cf); gboolean (*classify_func)(struct classifier_ctx * ctx, - GTree *input, struct rspamd_task *task, - lua_State *L); + GTree *input, struct rspamd_task *task); gboolean (*learn_spam_func)(struct classifier_ctx * ctx, - GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, + GTree *input, struct rspamd_task *task, gboolean is_spam, GError **err); }; @@ -34,13 +34,11 @@ struct classifier_ctx * bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cf); gboolean bayes_classify (struct classifier_ctx * ctx, GTree *input, - struct rspamd_task *task, - lua_State *L); + struct rspamd_task *task); gboolean bayes_learn_spam (struct classifier_ctx * ctx, GTree *input, struct rspamd_task *task, gboolean is_spam, - lua_State *L, GError **err); #endif diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c new file mode 100644 index 000000000..2c17e4b7e --- /dev/null +++ b/src/libstat/stat_process.c @@ -0,0 +1,121 @@ +/* Copyright (c) 2015, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "stat_api.h" +#include "main.h" +#include "stat_internal.h" +#include "lua/lua_common.h" +#include + +static gboolean +rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, struct rspamd_stat_classifier *cls, + struct rspamd_task *task, GError **err) +{ + +} + +struct rspamd_tokenizer_runtime { + GTree *tokens; + const gchar *name; + struct rspamd_stat_tokenizer *tokenizer; + struct rspamd_tokenizer_runtime *next; +}; + +static struct rspamd_tokenizer_runtime * +rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool, + struct rspamd_tokenizer_runtime **ls) +{ + struct rspamd_tokenizer_runtime *tok = NULL, *cur; + + LL_FOREACH (*ls, cur) { + if (strcmp (cur->name, name) == 0) { + tok = cur; + break; + } + } + + if (tok == NULL) { + tok = rspamd_mempool_alloc (pool, sizeof (*tok)); + tok->tokenizer = rspamd_stat_get_tokenizer (name); + + if (tok->tokenizer == NULL) { + return NULL; + } + + tok->tokens = g_tree_new (token_node_compare_func); + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t)g_tree_destroy, tok->tokens); + tok->name = name; + LL_PREPEND(*ls, tok); + } + + return tok; +} + + +gboolean +rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err) +{ + struct rspamd_stat_classifier *cls; + struct rspamd_classifier_config *clcf; + GList *cur; + guint i; + struct rspamd_stat_ctx *st_ctx; + struct rspamd_tokenizer_runtime *tklist = NULL, *tok; + + + st_ctx = rspamd_stat_get_ctx (); + g_assert (st_ctx != NULL); + + cur = g_list_first (task->cfg->classifiers); + + while (cur) { + clcf = (struct rspamd_classifier_config *)cur->data; + cls = rspamd_stat_get_classifier (clcf->classifier); + + if (cls == NULL) { + g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" + "for classifiers", clcf->classifier); + return FALSE; + } + + tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, task->task_pool, + &tklist); + + if (tok == NULL) { + g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" + "for tokenizers", clcf->tokenizer); + return FALSE; + } + + + if (!rspamd_stat_preprocess (st_ctx, task, cls, err)) { + return FALSE; + } + + cur = g_list_next (cur); + } + + return TRUE; +} diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index f9307ded4..0a8d01ce1 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -26,8 +26,8 @@ * OSB tokenizer */ -#include #include "tokenizers.h" +#include "stat_internal.h" /* Size for features pipe */ #define FEATURE_WINDOW_SIZE 5 @@ -83,8 +83,8 @@ osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer, memcpy(new->data, &h1, sizeof(h1)); memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); + if (g_tree_lookup (tree, new) == NULL) { + g_tree_insert (tree, new, new); } } } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 5cc2a83ea..54b83d33e 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -28,6 +28,7 @@ #include "main.h" #include "tokenizers.h" +#include "stat_internal.h" const int primes[] = { 1, 7, @@ -227,10 +228,8 @@ tokenize_subject (struct rspamd_task *task, GTree ** tree) osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, words, - tree, - FALSE, - TRUE, - NULL); + *tree, + TRUE); g_array_free (words, TRUE); } }