From 7cf83c6b4d31dfdc8e314f31c8a6c9fd9408f6ea Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 16 Jan 2015 15:32:30 +0000 Subject: [PATCH] No more winnow. --- src/libstat/CMakeLists.txt | 3 +- src/libstat/classifiers.h | 28 -- src/libstat/classifiers/classifiers.c | 8 - src/libstat/classifiers/winnow.c | 694 -------------------------- 4 files changed, 1 insertion(+), 732 deletions(-) delete mode 100644 src/libstat/classifiers/winnow.c diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt index 6254a41a6..810570f20 100644 --- a/src/libstat/CMakeLists.txt +++ b/src/libstat/CMakeLists.txt @@ -5,8 +5,7 @@ SET(TOKENIZERSSRC tokenizers/tokenizers.c tokenizers/osb.c) SET(CLASSIFIERSSRC classifiers/classifiers.c - classifiers/bayes.c - classifiers/winnow.c) + classifiers/bayes.c) ADD_LIBRARY(rspamd-stat ${LINK_TYPE} ${LIBSTATSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC}) IF(NOT DEBIAN_BUILD) diff --git a/src/libstat/classifiers.h b/src/libstat/classifiers.h index fd1b63bcf..d13178486 100644 --- a/src/libstat/classifiers.h +++ b/src/libstat/classifiers.h @@ -47,34 +47,6 @@ struct classifier { /* Get classifier structure by name or return NULL if this name is not found */ struct classifier * get_classifier (const char *name); -/* Winnow algorithm */ -struct classifier_ctx * winnow_init (rspamd_mempool_t *pool, - struct rspamd_classifier_config *cf); -gboolean winnow_classify (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - lua_State *L); -gboolean winnow_learn (struct classifier_ctx * ctx, - statfile_pool_t *pool, - const char *symbol, - GTree *input, - gboolean in_class, - double *sum, - double multiplier, - GError **err); -gboolean winnow_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err); -GList * winnow_weights (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task); - /* Bayes algorithm */ struct classifier_ctx * bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cf); diff --git a/src/libstat/classifiers/classifiers.c b/src/libstat/classifiers/classifiers.c index 95dd52c44..6af7d2dc8 100644 --- a/src/libstat/classifiers/classifiers.c +++ b/src/libstat/classifiers/classifiers.c @@ -29,14 +29,6 @@ #include "classifiers.h" struct classifier classifiers[] = { - { - .name = "winnow", - .init_func = winnow_init, - .classify_func = winnow_classify, - .learn_func = winnow_learn, - .learn_spam_func = winnow_learn_spam, - .weights_func = winnow_weights - }, { .name = "bayes", .init_func = bayes_init, diff --git a/src/libstat/classifiers/winnow.c b/src/libstat/classifiers/winnow.c deleted file mode 100644 index 68d456968..000000000 --- a/src/libstat/classifiers/winnow.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Winnow classifier - */ - -#include "classifiers.h" -#include "tokenizers.h" -#include "main.h" -#include "filter.h" -#include "cfg_file.h" -#include "lua/lua_common.h" - -#define WINNOW_PROMOTION 1.23 -#define WINNOW_DEMOTION 0.83 - -#define MEDIAN_WINDOW_SIZE 5 - -#define MAX_WEIGHT G_MAXDOUBLE / 2. - - - -#define MAX_LEARN_ITERATIONS 100 - -static inline GQuark -winnow_error_quark (void) -{ - return g_quark_from_static_string ("winnow-error"); -} - -struct winnow_callback_data { - statfile_pool_t *pool; - struct classifier_ctx *ctx; - stat_file_t *file; - stat_file_t *learn_file; - long double sum; - long double start; - double multiplier; - guint32 count; - guint32 new_blocks; - gboolean in_class; - gboolean do_demote; - gboolean fresh_run; - time_t now; -}; - -static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION; - - - -static gboolean -winnow_classify_callback (gpointer key, gpointer value, gpointer data) -{ - token_node_t *node = key; - struct winnow_callback_data *cd = data; - double v; - - /* Consider that not found blocks have value 1 */ - v = - statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, - cd->now); - if (fabs (v) > ALPHA) { - cd->sum += v; - } - else { - cd->sum += 1.0; - cd->new_blocks++; - } - - cd->count++; - - return FALSE; -} - -static gboolean -winnow_learn_callback (gpointer key, gpointer value, gpointer data) -{ - token_node_t *node = key; - struct winnow_callback_data *cd = data; - double v, c; - - c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION / - cd->multiplier; - - /* Consider that not found blocks have value 1 */ - v = - statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, - cd->now); - if (fabs (v) < ALPHA) { - /* Block not found, insert new */ - cd->start += 1; - if (cd->file == cd->learn_file) { - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - c); - node->value = c; - cd->new_blocks++; - } - } - else { - cd->start += v; - /* Here we just increase the extra value of block */ - if (cd->fresh_run) { - node->extra = 0; - } - else { - node->extra++; - } - node->value = v; - - if (node->extra > 1) { - /* - * Assume that this node is common for several statfiles, so - * decrease its weight proportianally - */ - if (node->value > max_common_weight) { - /* Static fluctuation */ - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - 0.); - node->value = 0.; - } - else if (node->value > WINNOW_PROMOTION * cd->multiplier) { - /* Try to decrease its value */ - /* XXX: it is more intelligent to add some adaptive filter here */ - if (cd->file == cd->learn_file) { - if (node->value > max_common_weight / 2.) { - node->value *= c; - } - else { - /* - * Too high token value that exists also in other - * statfiles, may be statistic error, so decrease it - * slightly - */ - node->value *= WINNOW_DEMOTION; - } - } - else { - node->value = WINNOW_DEMOTION / cd->multiplier; - } - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - node->value); - } - } - else if (cd->file == cd->learn_file) { - /* New block or block that is in only one statfile */ - /* Set some limit on growing */ - if (v > MAX_WEIGHT) { - node->value = v; - } - else { - node->value *= c; - } - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - node->value); - } - else if (cd->do_demote) { - /* Demote blocks in file */ - node->value *= WINNOW_DEMOTION / cd->multiplier; - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - node->value); - } - } - - - cd->sum += node->value; - - cd->count++; - - return FALSE; -} - -struct classifier_ctx * -winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg) -{ - struct classifier_ctx *ctx = - rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); - - ctx->pool = pool; - ctx->cfg = cfg; - - return ctx; -} - -gboolean -winnow_classify (struct classifier_ctx *ctx, - statfile_pool_t * pool, - GTree * input, - struct rspamd_task *task, - lua_State *L) -{ - struct winnow_callback_data data; - char *sumbuf, *value; - long double res = 0., max = 0.; - GList *cur; - struct rspamd_statfile_config *st, *sel = NULL; - int nodes, minnodes; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - data.pool = pool; - data.now = time (NULL); - data.ctx = ctx; - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not classify message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - return FALSE; - } - } - - cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); - if (cur) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, cur); - } - else { - cur = ctx->cfg->statfiles; - } - - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - data.new_blocks = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((data.file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { - msg_warn ("cannot open %s, skip it", st->path); - cur = g_list_next (cur); - continue; - } - } - - if (data.file != NULL) { - g_tree_foreach (input, winnow_classify_callback, &data); - } - - if (data.count != 0) { - res = data.sum / (double)data.count; - } - else { - res = 0; - } - if (res > max) { - max = res; - sel = st; - } - cur = g_list_next (cur); - } - - if (sel != NULL) { -#ifdef WITH_LUA - max = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, max, L); -#endif -#ifdef HAVE_TANHL - max = tanhl (max); -#else - /* - * As some implementations of libm does not support tanhl, try to use - * tanh - */ - max = tanh ((double) max); -#endif - sumbuf = rspamd_mempool_alloc (task->task_pool, 32); - rspamd_snprintf (sumbuf, 32, "%.2F", max); - cur = g_list_prepend (NULL, sumbuf); - rspamd_task_insert_result (task, sel->symbol, max, cur); - } - - return TRUE; -} - -GList * -winnow_weights (struct classifier_ctx *ctx, - statfile_pool_t * pool, - GTree * input, - struct rspamd_task *task) -{ - struct winnow_callback_data data; - long double res = 0.; - GList *cur, *resl = NULL; - struct rspamd_statfile_config *st; - struct classify_weight *w; - char *value; - int nodes, minnodes; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - data.pool = pool; - data.now = time (NULL); - data.ctx = ctx; - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not classify message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - return NULL; - } - } - - cur = ctx->cfg->statfiles; - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((data.file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { - msg_warn ("cannot open %s, skip it", st->path); - cur = g_list_next (cur); - continue; - } - } - - if (data.file != NULL) { - g_tree_foreach (input, winnow_classify_callback, &data); - } - - w = - rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct classify_weight)); - if (data.count != 0) { - res = data.sum / (double)data.count; - } - else { - res = 0; - } - w->name = st->symbol; - w->weight = res; - resl = g_list_prepend (resl, w); - cur = g_list_next (cur); - } - - if (resl != NULL) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, resl); - } - - return resl; - -} - - -gboolean -winnow_learn (struct classifier_ctx *ctx, - statfile_pool_t *pool, - const char *symbol, - GTree * input, - int in_class, - double *sum, - double multiplier, - GError **err) -{ - struct winnow_callback_data data = { - .file = NULL, - .multiplier = multiplier - }; - char *value; - int nodes, minnodes, iterations = 0; - struct rspamd_statfile_config *st, *sel_st = NULL; - stat_file_t *sel = NULL, *to_learn; - long double res = 0., max = 0., start_value = 0., end_value = 0.; - double learn_threshold = 0.0; - GList *cur, *to_demote = NULL; - gboolean force_learn = FALSE; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - data.pool = pool; - data.in_class = in_class; - data.now = time (NULL); - data.ctx = ctx; - - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not learn message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - if (sum != NULL) { - *sum = 0; - } - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "message contains too few tokens: %d, while min is %d", - nodes, minnodes); - return FALSE; - } - } - if (ctx->cfg->opts && - (value = - g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) { - learn_threshold = strtod (value, NULL); - } - - if (learn_threshold <= 1.0 && learn_threshold >= 0) { - /* Classify message and check target statfile score */ - cur = ctx->cfg->statfiles; - while (cur) { - /* Open or create all statfiles inside classifier */ - st = cur->data; - if (statfile_pool_is_open (pool, st->path) == NULL) { - if (statfile_pool_open (pool, st->path, st->size, - FALSE) == NULL) { - msg_warn ("cannot open %s", st->path); - if (statfile_pool_create (pool, st->path, st->size) == -1) { - msg_err ("cannot create statfile %s", st->path); - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "cannot create statfile: %s", - st->path); - return FALSE; - } - if (statfile_pool_open (pool, st->path, st->size, - FALSE) == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "open statfile %s after creation", - st->path); - msg_err ("cannot open statfile %s after creation", - st->path); - return FALSE; - } - } - } - if (strcmp (st->symbol, symbol) == 0) { - sel_st = st; - - } - cur = g_list_next (cur); - } - - if (sel_st == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "cannot find statfile for symbol %s", - symbol); - msg_err ("cannot find statfile for symbol %s", symbol); - return FALSE; - } - - to_learn = statfile_pool_is_open (pool, sel_st->path); - if (to_learn == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles", - sel_st->path); - return FALSE; - } - /* Check target statfile */ - data.file = to_learn; - data.sum = 0; - data.count = 0; - data.new_blocks = 0; - g_tree_foreach (input, winnow_classify_callback, &data); - if (data.count > 0) { - max = data.sum / (double)data.count; - } - else { - max = 0; - } - /* If most of blocks are not presented in targeted statfile do forced learn */ - if (max < 1 + learn_threshold) { - force_learn = TRUE; - } - /* Check other statfiles */ - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles", - st->path); - return FALSE; - } - g_tree_foreach (input, winnow_classify_callback, &data); - if (data.count != 0) { - res = data.sum / data.count; - } - else { - res = 0; - } - if (to_learn != data.file && res - max > 1 - learn_threshold) { - /* Demote tokens in this statfile */ - to_demote = g_list_prepend (to_demote, data.file); - } - cur = g_list_next (cur); - } - } - else { - msg_err ( - "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration"); - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "bad learn_threshold setting: %.2f", - learn_threshold); - return FALSE; - } - /* If to_demote list is empty this message is already classified correctly */ - if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) { - msg_info ( - "this message is already of class %s with threshold %.2f and weight %.2F", - sel_st->symbol, - learn_threshold, - max); - goto end; - } - data.learn_file = to_learn; - end_value = max; - do { - cur = ctx->cfg->statfiles; - data.fresh_run = TRUE; - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - data.new_blocks = 0; - data.start = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - return FALSE; - } - if (to_demote != NULL && - g_list_find (to_demote, data.file) != NULL) { - data.do_demote = TRUE; - } - else { - data.do_demote = FALSE; - } - - statfile_pool_lock_file (pool, data.file); - g_tree_foreach (input, winnow_learn_callback, &data); - statfile_pool_unlock_file (pool, data.file); - if (data.count != 0) { - res = data.sum / data.count; - } - else { - res = 0; - } - if (res > max) { - max = res; - sel = data.file; - } - if (data.file == to_learn) { - if (data.count > 0) { - start_value = data.start / data.count; - } - end_value = res; - } - cur = g_list_next (cur); - data.fresh_run = FALSE; - } - - data.multiplier *= WINNOW_PROMOTION; - msg_info ( - "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f", - iterations + 1, - symbol, - start_value, - end_value, - data.multiplier); - } while ((in_class ? sel != to_learn : sel == - to_learn) && iterations++ < MAX_LEARN_ITERATIONS); - - if (iterations >= MAX_LEARN_ITERATIONS) { - msg_warn ( - "learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G", - sel_st->symbol, - MAX_LEARN_ITERATIONS, - max); - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "learning statfile %s was not fully successfull: iterations count is limited to %d", - sel_st->symbol, MAX_LEARN_ITERATIONS); - return FALSE; - } - else { - msg_info ( - "learned statfile %s successfully with %d iterations and sum %G", - sel_st->symbol, - iterations + 1, - max); - } - - -end: - if (sum) { -#ifdef HAVE_TANHL - *sum = (double)tanhl (max); -#else - /* - * As some implementations of libm does not support tanhl, try to use - * tanh - */ - *sum = tanh ((double) max); -#endif - } - return TRUE; -} - -gboolean -winnow_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err) -{ - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "learn spam is not supported for winnow" - ); - return FALSE; -} -- 2.39.5