aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-01-16 15:32:30 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-01-16 15:32:30 +0000
commit7cf83c6b4d31dfdc8e314f31c8a6c9fd9408f6ea (patch)
tree60f2de5914605316fbb3db5d2f251dd3bb1e0a3d /src/libstat
parentb5597411a2a4f9b46c0076ccddb95f8eacc1cb7f (diff)
downloadrspamd-7cf83c6b4d31dfdc8e314f31c8a6c9fd9408f6ea.tar.gz
rspamd-7cf83c6b4d31dfdc8e314f31c8a6c9fd9408f6ea.zip
No more winnow.
Diffstat (limited to 'src/libstat')
-rw-r--r--src/libstat/CMakeLists.txt3
-rw-r--r--src/libstat/classifiers.h28
-rw-r--r--src/libstat/classifiers/classifiers.c8
-rw-r--r--src/libstat/classifiers/winnow.c694
4 files changed, 1 insertions, 732 deletions
diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt
index 6254a41a6..810570f20 100644
--- a/src/libstat/CMakeLists.txt
+++ b/src/libstat/CMakeLists.txt
@@ -5,8 +5,7 @@ SET(TOKENIZERSSRC tokenizers/tokenizers.c
tokenizers/osb.c)
SET(CLASSIFIERSSRC classifiers/classifiers.c
- classifiers/bayes.c
- classifiers/winnow.c)
+ classifiers/bayes.c)
ADD_LIBRARY(rspamd-stat ${LINK_TYPE} ${LIBSTATSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC})
IF(NOT DEBIAN_BUILD)
diff --git a/src/libstat/classifiers.h b/src/libstat/classifiers.h
index fd1b63bcf..d13178486 100644
--- a/src/libstat/classifiers.h
+++ b/src/libstat/classifiers.h
@@ -47,34 +47,6 @@ struct classifier {
/* Get classifier structure by name or return NULL if this name is not found */
struct classifier * get_classifier (const char *name);
-/* Winnow algorithm */
-struct classifier_ctx * winnow_init (rspamd_mempool_t *pool,
- struct rspamd_classifier_config *cf);
-gboolean winnow_classify (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- lua_State *L);
-gboolean winnow_learn (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- const char *symbol,
- GTree *input,
- gboolean in_class,
- double *sum,
- double multiplier,
- GError **err);
-gboolean winnow_learn_spam (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- gboolean is_spam,
- lua_State *L,
- GError **err);
-GList * winnow_weights (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task);
-
/* Bayes algorithm */
struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
struct rspamd_classifier_config *cf);
diff --git a/src/libstat/classifiers/classifiers.c b/src/libstat/classifiers/classifiers.c
index 95dd52c44..6af7d2dc8 100644
--- a/src/libstat/classifiers/classifiers.c
+++ b/src/libstat/classifiers/classifiers.c
@@ -30,14 +30,6 @@
struct classifier classifiers[] = {
{
- .name = "winnow",
- .init_func = winnow_init,
- .classify_func = winnow_classify,
- .learn_func = winnow_learn,
- .learn_spam_func = winnow_learn_spam,
- .weights_func = winnow_weights
- },
- {
.name = "bayes",
.init_func = bayes_init,
.classify_func = bayes_classify,
diff --git a/src/libstat/classifiers/winnow.c b/src/libstat/classifiers/winnow.c
deleted file mode 100644
index 68d456968..000000000
--- a/src/libstat/classifiers/winnow.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Winnow classifier
- */
-
-#include "classifiers.h"
-#include "tokenizers.h"
-#include "main.h"
-#include "filter.h"
-#include "cfg_file.h"
-#include "lua/lua_common.h"
-
-#define WINNOW_PROMOTION 1.23
-#define WINNOW_DEMOTION 0.83
-
-#define MEDIAN_WINDOW_SIZE 5
-
-#define MAX_WEIGHT G_MAXDOUBLE / 2.
-
-
-
-#define MAX_LEARN_ITERATIONS 100
-
-static inline GQuark
-winnow_error_quark (void)
-{
- return g_quark_from_static_string ("winnow-error");
-}
-
-struct winnow_callback_data {
- statfile_pool_t *pool;
- struct classifier_ctx *ctx;
- stat_file_t *file;
- stat_file_t *learn_file;
- long double sum;
- long double start;
- double multiplier;
- guint32 count;
- guint32 new_blocks;
- gboolean in_class;
- gboolean do_demote;
- gboolean fresh_run;
- time_t now;
-};
-
-static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION;
-
-
-
-static gboolean
-winnow_classify_callback (gpointer key, gpointer value, gpointer data)
-{
- token_node_t *node = key;
- struct winnow_callback_data *cd = data;
- double v;
-
- /* Consider that not found blocks have value 1 */
- v =
- statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
- cd->now);
- if (fabs (v) > ALPHA) {
- cd->sum += v;
- }
- else {
- cd->sum += 1.0;
- cd->new_blocks++;
- }
-
- cd->count++;
-
- return FALSE;
-}
-
-static gboolean
-winnow_learn_callback (gpointer key, gpointer value, gpointer data)
-{
- token_node_t *node = key;
- struct winnow_callback_data *cd = data;
- double v, c;
-
- c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION /
- cd->multiplier;
-
- /* Consider that not found blocks have value 1 */
- v =
- statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
- cd->now);
- if (fabs (v) < ALPHA) {
- /* Block not found, insert new */
- cd->start += 1;
- if (cd->file == cd->learn_file) {
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- c);
- node->value = c;
- cd->new_blocks++;
- }
- }
- else {
- cd->start += v;
- /* Here we just increase the extra value of block */
- if (cd->fresh_run) {
- node->extra = 0;
- }
- else {
- node->extra++;
- }
- node->value = v;
-
- if (node->extra > 1) {
- /*
- * Assume that this node is common for several statfiles, so
- * decrease its weight proportianally
- */
- if (node->value > max_common_weight) {
- /* Static fluctuation */
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- 0.);
- node->value = 0.;
- }
- else if (node->value > WINNOW_PROMOTION * cd->multiplier) {
- /* Try to decrease its value */
- /* XXX: it is more intelligent to add some adaptive filter here */
- if (cd->file == cd->learn_file) {
- if (node->value > max_common_weight / 2.) {
- node->value *= c;
- }
- else {
- /*
- * Too high token value that exists also in other
- * statfiles, may be statistic error, so decrease it
- * slightly
- */
- node->value *= WINNOW_DEMOTION;
- }
- }
- else {
- node->value = WINNOW_DEMOTION / cd->multiplier;
- }
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- node->value);
- }
- }
- else if (cd->file == cd->learn_file) {
- /* New block or block that is in only one statfile */
- /* Set some limit on growing */
- if (v > MAX_WEIGHT) {
- node->value = v;
- }
- else {
- node->value *= c;
- }
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- node->value);
- }
- else if (cd->do_demote) {
- /* Demote blocks in file */
- node->value *= WINNOW_DEMOTION / cd->multiplier;
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- node->value);
- }
- }
-
-
- cd->sum += node->value;
-
- cd->count++;
-
- return FALSE;
-}
-
-struct classifier_ctx *
-winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
-{
- struct classifier_ctx *ctx =
- rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
-
- ctx->pool = pool;
- ctx->cfg = cfg;
-
- return ctx;
-}
-
-gboolean
-winnow_classify (struct classifier_ctx *ctx,
- statfile_pool_t * pool,
- GTree * input,
- struct rspamd_task *task,
- lua_State *L)
-{
- struct winnow_callback_data data;
- char *sumbuf, *value;
- long double res = 0., max = 0.;
- GList *cur;
- struct rspamd_statfile_config *st, *sel = NULL;
- int nodes, minnodes;
-
- g_assert (pool != NULL);
- g_assert (ctx != NULL);
-
- data.pool = pool;
- data.now = time (NULL);
- data.ctx = ctx;
-
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
- minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) {
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
- if (nodes < minnodes) {
- msg_info (
- "do not classify message as it has too few tokens: %d, while %d min",
- nodes,
- minnodes);
- return FALSE;
- }
- }
-
- cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
- if (cur) {
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free, cur);
- }
- else {
- cur = ctx->cfg->statfiles;
- }
-
- while (cur) {
- st = cur->data;
- data.sum = 0;
- data.count = 0;
- data.new_blocks = 0;
- if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((data.file =
- statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
- msg_warn ("cannot open %s, skip it", st->path);
- cur = g_list_next (cur);
- continue;
- }
- }
-
- if (data.file != NULL) {
- g_tree_foreach (input, winnow_classify_callback, &data);
- }
-
- if (data.count != 0) {
- res = data.sum / (double)data.count;
- }
- else {
- res = 0;
- }
- if (res > max) {
- max = res;
- sel = st;
- }
- cur = g_list_next (cur);
- }
-
- if (sel != NULL) {
-#ifdef WITH_LUA
- max = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, max, L);
-#endif
-#ifdef HAVE_TANHL
- max = tanhl (max);
-#else
- /*
- * As some implementations of libm does not support tanhl, try to use
- * tanh
- */
- max = tanh ((double) max);
-#endif
- sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
- rspamd_snprintf (sumbuf, 32, "%.2F", max);
- cur = g_list_prepend (NULL, sumbuf);
- rspamd_task_insert_result (task, sel->symbol, max, cur);
- }
-
- return TRUE;
-}
-
-GList *
-winnow_weights (struct classifier_ctx *ctx,
- statfile_pool_t * pool,
- GTree * input,
- struct rspamd_task *task)
-{
- struct winnow_callback_data data;
- long double res = 0.;
- GList *cur, *resl = NULL;
- struct rspamd_statfile_config *st;
- struct classify_weight *w;
- char *value;
- int nodes, minnodes;
-
- g_assert (pool != NULL);
- g_assert (ctx != NULL);
-
- data.pool = pool;
- data.now = time (NULL);
- data.ctx = ctx;
-
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
- minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) {
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
- if (nodes < minnodes) {
- msg_info (
- "do not classify message as it has too few tokens: %d, while %d min",
- nodes,
- minnodes);
- return NULL;
- }
- }
-
- cur = ctx->cfg->statfiles;
- while (cur) {
- st = cur->data;
- data.sum = 0;
- data.count = 0;
- if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((data.file =
- statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
- msg_warn ("cannot open %s, skip it", st->path);
- cur = g_list_next (cur);
- continue;
- }
- }
-
- if (data.file != NULL) {
- g_tree_foreach (input, winnow_classify_callback, &data);
- }
-
- w =
- rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct classify_weight));
- if (data.count != 0) {
- res = data.sum / (double)data.count;
- }
- else {
- res = 0;
- }
- w->name = st->symbol;
- w->weight = res;
- resl = g_list_prepend (resl, w);
- cur = g_list_next (cur);
- }
-
- if (resl != NULL) {
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free, resl);
- }
-
- return resl;
-
-}
-
-
-gboolean
-winnow_learn (struct classifier_ctx *ctx,
- statfile_pool_t *pool,
- const char *symbol,
- GTree * input,
- int in_class,
- double *sum,
- double multiplier,
- GError **err)
-{
- struct winnow_callback_data data = {
- .file = NULL,
- .multiplier = multiplier
- };
- char *value;
- int nodes, minnodes, iterations = 0;
- struct rspamd_statfile_config *st, *sel_st = NULL;
- stat_file_t *sel = NULL, *to_learn;
- long double res = 0., max = 0., start_value = 0., end_value = 0.;
- double learn_threshold = 0.0;
- GList *cur, *to_demote = NULL;
- gboolean force_learn = FALSE;
-
- g_assert (pool != NULL);
- g_assert (ctx != NULL);
-
- data.pool = pool;
- data.in_class = in_class;
- data.now = time (NULL);
- data.ctx = ctx;
-
-
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
- minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) {
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
- if (nodes < minnodes) {
- msg_info (
- "do not learn message as it has too few tokens: %d, while %d min",
- nodes,
- minnodes);
- if (sum != NULL) {
- *sum = 0;
- }
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, minnodes);
- return FALSE;
- }
- }
- if (ctx->cfg->opts &&
- (value =
- g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
- learn_threshold = strtod (value, NULL);
- }
-
- if (learn_threshold <= 1.0 && learn_threshold >= 0) {
- /* Classify message and check target statfile score */
- cur = ctx->cfg->statfiles;
- while (cur) {
- /* Open or create all statfiles inside classifier */
- st = cur->data;
- if (statfile_pool_is_open (pool, st->path) == NULL) {
- if (statfile_pool_open (pool, st->path, st->size,
- FALSE) == NULL) {
- msg_warn ("cannot open %s", st->path);
- if (statfile_pool_create (pool, st->path, st->size) == -1) {
- msg_err ("cannot create statfile %s", st->path);
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- st->path);
- return FALSE;
- }
- if (statfile_pool_open (pool, st->path, st->size,
- FALSE) == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "open statfile %s after creation",
- st->path);
- msg_err ("cannot open statfile %s after creation",
- st->path);
- return FALSE;
- }
- }
- }
- if (strcmp (st->symbol, symbol) == 0) {
- sel_st = st;
-
- }
- cur = g_list_next (cur);
- }
-
- if (sel_st == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "cannot find statfile for symbol %s",
- symbol);
- msg_err ("cannot find statfile for symbol %s", symbol);
- return FALSE;
- }
-
- to_learn = statfile_pool_is_open (pool, sel_st->path);
- if (to_learn == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
- sel_st->path);
- return FALSE;
- }
- /* Check target statfile */
- data.file = to_learn;
- data.sum = 0;
- data.count = 0;
- data.new_blocks = 0;
- g_tree_foreach (input, winnow_classify_callback, &data);
- if (data.count > 0) {
- max = data.sum / (double)data.count;
- }
- else {
- max = 0;
- }
- /* If most of blocks are not presented in targeted statfile do forced learn */
- if (max < 1 + learn_threshold) {
- force_learn = TRUE;
- }
- /* Check other statfiles */
- while (cur) {
- st = cur->data;
- data.sum = 0;
- data.count = 0;
- if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
- st->path);
- return FALSE;
- }
- g_tree_foreach (input, winnow_classify_callback, &data);
- if (data.count != 0) {
- res = data.sum / data.count;
- }
- else {
- res = 0;
- }
- if (to_learn != data.file && res - max > 1 - learn_threshold) {
- /* Demote tokens in this statfile */
- to_demote = g_list_prepend (to_demote, data.file);
- }
- cur = g_list_next (cur);
- }
- }
- else {
- msg_err (
- "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "bad learn_threshold setting: %.2f",
- learn_threshold);
- return FALSE;
- }
- /* If to_demote list is empty this message is already classified correctly */
- if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
- msg_info (
- "this message is already of class %s with threshold %.2f and weight %.2F",
- sel_st->symbol,
- learn_threshold,
- max);
- goto end;
- }
- data.learn_file = to_learn;
- end_value = max;
- do {
- cur = ctx->cfg->statfiles;
- data.fresh_run = TRUE;
- while (cur) {
- st = cur->data;
- data.sum = 0;
- data.count = 0;
- data.new_blocks = 0;
- data.start = 0;
- if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- return FALSE;
- }
- if (to_demote != NULL &&
- g_list_find (to_demote, data.file) != NULL) {
- data.do_demote = TRUE;
- }
- else {
- data.do_demote = FALSE;
- }
-
- statfile_pool_lock_file (pool, data.file);
- g_tree_foreach (input, winnow_learn_callback, &data);
- statfile_pool_unlock_file (pool, data.file);
- if (data.count != 0) {
- res = data.sum / data.count;
- }
- else {
- res = 0;
- }
- if (res > max) {
- max = res;
- sel = data.file;
- }
- if (data.file == to_learn) {
- if (data.count > 0) {
- start_value = data.start / data.count;
- }
- end_value = res;
- }
- cur = g_list_next (cur);
- data.fresh_run = FALSE;
- }
-
- data.multiplier *= WINNOW_PROMOTION;
- msg_info (
- "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f",
- iterations + 1,
- symbol,
- start_value,
- end_value,
- data.multiplier);
- } while ((in_class ? sel != to_learn : sel ==
- to_learn) && iterations++ < MAX_LEARN_ITERATIONS);
-
- if (iterations >= MAX_LEARN_ITERATIONS) {
- msg_warn (
- "learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G",
- sel_st->symbol,
- MAX_LEARN_ITERATIONS,
- max);
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "learning statfile %s was not fully successfull: iterations count is limited to %d",
- sel_st->symbol, MAX_LEARN_ITERATIONS);
- return FALSE;
- }
- else {
- msg_info (
- "learned statfile %s successfully with %d iterations and sum %G",
- sel_st->symbol,
- iterations + 1,
- max);
- }
-
-
-end:
- if (sum) {
-#ifdef HAVE_TANHL
- *sum = (double)tanhl (max);
-#else
- /*
- * As some implementations of libm does not support tanhl, try to use
- * tanh
- */
- *sum = tanh ((double) max);
-#endif
- }
- return TRUE;
-}
-
-gboolean
-winnow_learn_spam (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- gboolean is_spam,
- lua_State *L,
- GError **err)
-{
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "learn spam is not supported for winnow"
- );
- return FALSE;
-}