source.dussan.org Git - rspamd.git/commitdiff
Reorganize statfiles and classifiers into libstat.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 16 Jan 2015 15:28:40 +0000 (15:28 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 16 Jan 2015 15:28:40 +0000 (15:28 +0000)
29 files changed:
CMakeLists.txt
src/CMakeLists.txt
src/classifiers/bayes.c [deleted file]
src/classifiers/classifiers.c [deleted file]
src/classifiers/classifiers.h [deleted file]
src/classifiers/winnow.c [deleted file]
src/controller.c
src/libmime/filter.c
src/libmime/message.c
src/libserver/CMakeLists.txt
src/libserver/binlog.c
src/libserver/cfg_rcl.c
src/libserver/cfg_utils.c
src/libserver/statfile_sync.c
src/libstat/CMakeLists.txt [new file with mode: 0644]
src/libstat/classifiers.h [new file with mode: 0644]
src/libstat/classifiers/bayes.c [new file with mode: 0644]
src/libstat/classifiers/classifiers.c [new file with mode: 0644]
src/libstat/classifiers/winnow.c [new file with mode: 0644]
src/libstat/stat_api.h [new file with mode: 0644]
src/libstat/tokenizers.h [new file with mode: 0644]
src/libstat/tokenizers/osb.c [new file with mode: 0644]
src/libstat/tokenizers/tokenizers.c [new file with mode: 0644]
src/lua/lua_classifier.c
src/lua/lua_config.c
src/lua/lua_task.c
src/tokenizers/osb.c [deleted file]
src/tokenizers/tokenizers.c [deleted file]
src/tokenizers/tokenizers.h [deleted file]

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 924cae91e3c629353d49e6d9c984ac9d01bfe55c..955a6fcbfeda5e1a3d3e5f5e98ec24a229ccab13 100644 (file)
@@ -278,7 +278,7 @@ ENDMACRO()
 ############################# CONFIG SECTION #############################################
 # Initial set
 
-INCLUDE_DIRECTORIES(src/libutil src/libserver src/libmime)
+INCLUDE_DIRECTORIES(src/libutil src/libserver src/libmime src/libstat)
 
 IF(CMAKE_INSTALL_PREFIX)
     SET(PREFIX ${CMAKE_INSTALL_PREFIX})
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ced57d20b69acdd9af54c8dec37fa5be3e864664..1c67416acff7282723de15be6c19c3bb06ea7e54 100644 (file)
@@ -66,6 +66,7 @@ ADD_SUBDIRECTORY(lua)
 ADD_SUBDIRECTORY(libutil)
 ADD_SUBDIRECTORY(libserver)
 ADD_SUBDIRECTORY(libmime)
+ADD_SUBDIRECTORY(libstat)
 ADD_SUBDIRECTORY(client)
                                
 SET(RSPAMDSRC  ${CMAKE_CURRENT_BINARY_DIR}/modules.c
@@ -97,6 +98,7 @@ IF(NOT DEBIAN_BUILD)
        SET_TARGET_PROPERTIES(rspamd PROPERTIES VERSION ${RSPAMD_VERSION})
 ENDIF(NOT DEBIAN_BUILD)
 
+TARGET_LINK_LIBRARIES(rspamd rspamd-stat)
 TARGET_LINK_LIBRARIES(rspamd rspamd-mime)
 TARGET_LINK_LIBRARIES(rspamd rspamd-server)
 TARGET_LINK_LIBRARIES(rspamd rspamd-util)
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
deleted file mode 100644 (file)
index 0afd310..0000000
+++ /dev/null
@@ -1,597 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Bayesian classifier
- */
-#include "classifiers.h"
-#include "tokenizers/tokenizers.h"
-#include "main.h"
-#include "filter.h"
-#include "cfg_file.h"
-#include "binlog.h"
-#include "lua/lua_common.h"
-
-#define LOCAL_PROB_DENOM 16.0
-
-static inline GQuark
-bayes_error_quark (void)
-{
-       return g_quark_from_static_string ("bayes-error");
-}
-
-struct bayes_statfile_data {
-       guint64 hits;
-       guint64 total_hits;
-       double value;
-       struct rspamd_statfile_config *st;
-       stat_file_t *file;
-};
-
-struct bayes_callback_data {
-       statfile_pool_t *pool;
-       struct classifier_ctx *ctx;
-       gboolean in_class;
-       time_t now;
-       stat_file_t *file;
-       struct bayes_statfile_data *statfiles;
-       guint32 statfiles_num;
-       guint64 total_spam;
-       guint64 total_ham;
-       guint64 processed_tokens;
-       gsize max_tokens;
-       double spam_probability;
-       double ham_probability;
-};
-
-static gboolean
-bayes_learn_callback (gpointer key, gpointer value, gpointer data)
-{
-       token_node_t *node = key;
-       struct bayes_callback_data *cd = data;
-       gint c;
-       guint64 v;
-
-       c = (cd->in_class) ? 1 : -1;
-
-       /* Consider that not found blocks have value 1 */
-       v =
-               statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
-                       cd->now);
-       if (v == 0 && c > 0) {
-               statfile_pool_set_block (cd->pool,
-                       cd->file,
-                       node->h1,
-                       node->h2,
-                       cd->now,
-                       c);
-               cd->processed_tokens++;
-       }
-       else if (v != 0) {
-               if (G_LIKELY (c > 0)) {
-                       v++;
-               }
-               else if (c < 0) {
-                       if (v != 0) {
-                               v--;
-                       }
-               }
-               statfile_pool_set_block (cd->pool,
-                       cd->file,
-                       node->h1,
-                       node->h2,
-                       cd->now,
-                       v);
-               cd->processed_tokens++;
-       }
-
-       if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
-               /* Stop learning on max tokens */
-               return TRUE;
-       }
-       return FALSE;
-}
-
-/**
- * Returns probability of chisquare > value with specified number of freedom
- * degrees
- * @param value value to test
- * @param freedom_deg number of degrees of freedom
- * @return
- */
-static gdouble
-inv_chi_square (gdouble value, gint freedom_deg)
-{
-       long double prob, sum;
-       gint i;
-
-       if ((freedom_deg & 1) != 0) {
-               msg_err ("non-odd freedom degrees count: %d", freedom_deg);
-               return 0;
-       }
-
-       value /= 2.;
-       errno = 0;
-#ifdef HAVE_EXPL
-       prob = expl (-value);
-#elif defined(HAVE_EXP2L)
-       prob = exp2l (-value * log2 (M_E));
-#else
-       prob = exp (-value);
-#endif
-       if (errno == ERANGE) {
-               msg_err ("exp overflow");
-               return 0;
-       }
-       sum = prob;
-       for (i = 1; i < freedom_deg / 2; i++) {
-               prob *= value / (gdouble)i;
-               sum += prob;
-       }
-
-       return MIN (1.0, sum);
-}
-
-/*
- * In this callback we calculate local probabilities for tokens
- */
-static gboolean
-bayes_classify_callback (gpointer key, gpointer value, gpointer data)
-{
-
-       token_node_t *node = key;
-       struct bayes_callback_data *cd = data;
-       guint i;
-       struct bayes_statfile_data *cur;
-       guint64 spam_count = 0, ham_count = 0, total_count = 0;
-       double spam_prob, spam_freq, ham_freq, bayes_spam_prob;
-
-       for (i = 0; i < cd->statfiles_num; i++) {
-               cur = &cd->statfiles[i];
-               cur->value = statfile_pool_get_block (cd->pool,
-                               cur->file,
-                               node->h1,
-                               node->h2,
-                               cd->now);
-               if (cur->value > 0) {
-                       cur->total_hits += cur->value;
-                       if (cur->st->is_spam) {
-                               spam_count += cur->value;
-                       }
-                       else {
-                               ham_count += cur->value;
-                       }
-                       total_count += cur->value;
-               }
-       }
-
-       /* Probability for this token */
-       if (total_count > 0) {
-               spam_freq = ((double)spam_count / MAX (1., (double)cd->total_spam));
-               ham_freq = ((double)ham_count / MAX (1., (double)cd->total_ham));
-               spam_prob = spam_freq / (spam_freq + ham_freq);
-               bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count);
-               cd->spam_probability += log (bayes_spam_prob);
-               cd->ham_probability += log (1. - bayes_spam_prob);
-               cd->processed_tokens++;
-       }
-
-       if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
-               /* Stop classifying on max tokens */
-               return TRUE;
-       }
-
-       return FALSE;
-}
-
-struct classifier_ctx *
-bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
-{
-       struct classifier_ctx *ctx =
-               rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
-
-       ctx->pool = pool;
-       ctx->cfg = cfg;
-       ctx->debug = FALSE;
-
-       return ctx;
-}
-
-gboolean
-bayes_classify (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       lua_State *L)
-{
-       struct bayes_callback_data data;
-       gchar *value;
-       gint nodes, i = 0, selected_st = -1, cnt;
-       gint minnodes;
-       guint64 maxhits = 0, rev;
-       double final_prob, h, s;
-       struct rspamd_statfile_config *st;
-       stat_file_t *file;
-       GList *cur;
-       char *sumbuf;
-
-       g_assert (pool != NULL);
-       g_assert (ctx != NULL);
-
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-               minnodes = strtol (value, NULL, 10);
-               nodes = g_tree_nnodes (input);
-               if (nodes > FEATURE_WINDOW_SIZE) {
-                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-               }
-               if (nodes < minnodes) {
-                       return FALSE;
-               }
-       }
-
-       cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
-       if (cur) {
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t)g_list_free, cur);
-       }
-       else {
-               cur = ctx->cfg->statfiles;
-       }
-
-       data.statfiles_num = g_list_length (cur);
-       data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
-       data.pool = pool;
-       data.now = time (NULL);
-       data.ctx = ctx;
-
-       data.processed_tokens = 0;
-       data.spam_probability = 0;
-       data.ham_probability = 0;
-       data.total_ham = 0;
-       data.total_spam = 0;
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
-               minnodes = rspamd_config_parse_limit (value, -1);
-               data.max_tokens = minnodes;
-       }
-       else {
-               data.max_tokens = 0;
-       }
-
-       while (cur) {
-               /* Select statfile to classify */
-               st = cur->data;
-               if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
-                       if ((file =
-                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
-                               msg_warn ("cannot open %s", st->path);
-                               cur = g_list_next (cur);
-                               data.statfiles_num--;
-                               continue;
-                       }
-               }
-               data.statfiles[i].file = file;
-               data.statfiles[i].st = st;
-               statfile_get_revision (file, &rev, NULL);
-               if (st->is_spam) {
-                       data.total_spam += rev;
-               }
-               else {
-                       data.total_ham += rev;
-               }
-
-               cur = g_list_next (cur);
-               i++;
-       }
-
-       cnt = i;
-
-       g_tree_foreach (input, bayes_classify_callback, &data);
-
-       if (data.processed_tokens == 0 || data.spam_probability == 0) {
-               final_prob = 0;
-       }
-       else {
-               h = 1 - inv_chi_square (-2. * data.spam_probability,
-                               2 * data.processed_tokens);
-               s = 1 - inv_chi_square (-2. * data.ham_probability,
-                               2 * data.processed_tokens);
-               final_prob = (s + 1 - h) / 2.;
-       }
-
-       if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
-
-               sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
-               for (i = 0; i < cnt; i++) {
-                       if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) ||
-                               (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
-                               continue;
-                       }
-                       if (data.statfiles[i].total_hits > maxhits) {
-                               maxhits = data.statfiles[i].total_hits;
-                               selected_st = i;
-                       }
-               }
-               if (selected_st == -1) {
-                       msg_err (
-                               "unexpected classifier error: cannot select desired statfile");
-               }
-               else {
-                       /* Calculate ham probability correctly */
-                       if (final_prob < 0.5) {
-                               final_prob = 1. - final_prob;
-                       }
-                       rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
-                       cur = g_list_prepend (NULL, sumbuf);
-                       rspamd_task_insert_result (task,
-                               data.statfiles[selected_st].st->symbol,
-                               final_prob,
-                               cur);
-               }
-       }
-
-       g_free (data.statfiles);
-
-       return TRUE;
-}
-
-gboolean
-bayes_learn (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       const char *symbol,
-       GTree *input,
-       gboolean in_class,
-       double *sum,
-       double multiplier,
-       GError **err)
-{
-       struct bayes_callback_data data;
-       gchar *value;
-       gint nodes;
-       gint minnodes;
-       struct rspamd_statfile_config *st, *sel_st = NULL;
-       stat_file_t *to_learn;
-       GList *cur;
-
-       g_assert (pool != NULL);
-       g_assert (ctx != NULL);
-
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-               minnodes = strtol (value, NULL, 10);
-               nodes = g_tree_nnodes (input);
-               if (nodes > FEATURE_WINDOW_SIZE) {
-                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-               }
-               if (nodes < minnodes) {
-                       msg_info (
-                               "do not learn message as it has too few tokens: %d, while %d min",
-                               nodes,
-                               minnodes);
-                       *sum = 0;
-                       g_set_error (err,
-                               bayes_error_quark (),           /* error domain */
-                               1,                                  /* error code */
-                               "message contains too few tokens: %d, while min is %d",
-                               nodes, (int)minnodes);
-                       return FALSE;
-               }
-       }
-
-       data.pool = pool;
-       data.in_class = in_class;
-       data.now = time (NULL);
-       data.ctx = ctx;
-       data.processed_tokens = 0;
-       data.processed_tokens = 0;
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
-               minnodes = rspamd_config_parse_limit (value, -1);
-               data.max_tokens = minnodes;
-       }
-       else {
-               data.max_tokens = 0;
-       }
-       cur = ctx->cfg->statfiles;
-       while (cur) {
-               /* Select statfile to learn */
-               st = cur->data;
-               if (strcmp (st->symbol, symbol) == 0) {
-                       sel_st = st;
-                       break;
-               }
-               cur = g_list_next (cur);
-       }
-       if (sel_st == NULL) {
-               g_set_error (err,
-                       bayes_error_quark (),           /* error domain */
-                       1,                              /* error code */
-                       "cannot find statfile for symbol: %s",
-                       symbol);
-               return FALSE;
-       }
-       if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
-               if ((to_learn =
-                       statfile_pool_open (pool, sel_st->path, sel_st->size,
-                       FALSE)) == NULL) {
-                       msg_warn ("cannot open %s", sel_st->path);
-                       if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
-                               msg_err ("cannot create statfile %s", sel_st->path);
-                               g_set_error (err,
-                                       bayes_error_quark (),           /* error domain */
-                                       1,                              /* error code */
-                                       "cannot create statfile: %s",
-                                       sel_st->path);
-                               return FALSE;
-                       }
-                       if ((to_learn =
-                               statfile_pool_open (pool, sel_st->path, sel_st->size,
-                               FALSE)) == NULL) {
-                               g_set_error (err,
-                                       bayes_error_quark (),           /* error domain */
-                                       1,                              /* error code */
-                                       "cannot open statfile %s after creation",
-                                       sel_st->path);
-                               msg_err ("cannot open statfile %s after creation",
-                                       sel_st->path);
-                               return FALSE;
-                       }
-               }
-       }
-       data.file = to_learn;
-       statfile_pool_lock_file (pool, data.file);
-       g_tree_foreach (input, bayes_learn_callback, &data);
-       statfile_inc_revision (to_learn);
-       statfile_pool_unlock_file (pool, data.file);
-
-       if (sum != NULL) {
-               *sum = data.processed_tokens;
-       }
-
-       return TRUE;
-}
-
-gboolean
-bayes_learn_spam (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       gboolean is_spam,
-       lua_State *L,
-       GError **err)
-{
-       struct bayes_callback_data data;
-       gchar *value;
-       gint nodes;
-       gint minnodes;
-       struct rspamd_statfile_config *st;
-       stat_file_t *file;
-       GList *cur;
-       gboolean skip_labels;
-
-       g_assert (pool != NULL);
-       g_assert (ctx != NULL);
-
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-               minnodes = strtol (value, NULL, 10);
-               nodes = g_tree_nnodes (input);
-               if (nodes > FEATURE_WINDOW_SIZE) {
-                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-               }
-               if (nodes < minnodes) {
-                       g_set_error (err,
-                               bayes_error_quark (),           /* error domain */
-                               1,                              /* error code */
-                               "message contains too few tokens: %d, while min is %d",
-                               nodes, (int)minnodes);
-                       return FALSE;
-               }
-       }
-
-       cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
-       if (cur) {
-               skip_labels = FALSE;
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t)g_list_free, cur);
-       }
-       else {
-               /* Do not try to learn specific statfiles if pre callback returned nil */
-               skip_labels = TRUE;
-               cur = ctx->cfg->statfiles;
-       }
-
-       data.pool = pool;
-       data.now = time (NULL);
-       data.ctx = ctx;
-       data.in_class = TRUE;
-
-       data.processed_tokens = 0;
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
-               minnodes = rspamd_config_parse_limit (value, -1);
-               data.max_tokens = minnodes;
-       }
-       else {
-               data.max_tokens = 0;
-       }
-
-       while (cur) {
-               /* Select statfiles to learn */
-               st = cur->data;
-               if (st->is_spam != is_spam || (skip_labels && st->label)) {
-                       cur = g_list_next (cur);
-                       continue;
-               }
-               if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
-                       if ((file =
-                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
-                               msg_warn ("cannot open %s", st->path);
-                               if (statfile_pool_create (pool, st->path, st->size) == -1) {
-                                       msg_err ("cannot create statfile %s", st->path);
-                                       g_set_error (err,
-                                               bayes_error_quark (),           /* error domain */
-                                               1,                              /* error code */
-                                               "cannot create statfile: %s",
-                                               st->path);
-                                       return FALSE;
-                               }
-                               if ((file =
-                                       statfile_pool_open (pool, st->path, st->size,
-                                       FALSE)) == NULL) {
-                                       g_set_error (err,
-                                               bayes_error_quark (),           /* error domain */
-                                               1,                              /* error code */
-                                               "cannot open statfile %s after creation",
-                                               st->path);
-                                       msg_err ("cannot open statfile %s after creation",
-                                               st->path);
-                                       return FALSE;
-                               }
-                       }
-               }
-               data.file = file;
-               statfile_pool_lock_file (pool, data.file);
-               g_tree_foreach (input, bayes_learn_callback, &data);
-               statfile_inc_revision (file);
-               statfile_pool_unlock_file (pool, data.file);
-               maybe_write_binlog (ctx->cfg, st, file, input);
-               msg_info ("increase revision for %s", st->path);
-
-               cur = g_list_next (cur);
-       }
-
-       return TRUE;
-}
-
-GList *
-bayes_weights (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task)
-{
-       /* This function is unimplemented with new normalizer */
-       return NULL;
-}
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
deleted file mode 100644 (file)
index 95dd52c..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Common classifier functions
- */
-
-#include "classifiers.h"
-
-struct classifier classifiers[] = {
-       {
-               .name = "winnow",
-               .init_func = winnow_init,
-               .classify_func = winnow_classify,
-               .learn_func = winnow_learn,
-               .learn_spam_func = winnow_learn_spam,
-               .weights_func = winnow_weights
-       },
-       {
-               .name = "bayes",
-               .init_func = bayes_init,
-               .classify_func = bayes_classify,
-               .learn_func = bayes_learn,
-               .learn_spam_func = bayes_learn_spam,
-               .weights_func = bayes_weights
-       }
-};
-
-struct classifier *
-get_classifier (const char *name)
-{
-       guint i;
-
-       for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) {
-               if (strcmp (classifiers[i].name, name) == 0) {
-                       return &classifiers[i];
-               }
-       }
-
-       return NULL;
-}
-
-/*
- * vi:ts=4
- */
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
deleted file mode 100644 (file)
index 8e59fc5..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef CLASSIFIERS_H
-#define CLASSIFIERS_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "statfile.h"
-#include "tokenizers/tokenizers.h"
-#include <lua.h>
-
-/* Consider this value as 0 */
-#define ALPHA 0.0001
-
-struct rspamd_classifier_config;
-struct rspamd_task;
-
-struct classifier_ctx {
-       rspamd_mempool_t *pool;
-       GHashTable *results;
-       gboolean debug;
-       struct rspamd_classifier_config *cfg;
-};
-
-struct classify_weight {
-       const char *name;
-       long double weight;
-};
-
-/* Common classifier structure */
-struct classifier {
-       char *name;
-       struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
-               struct rspamd_classifier_config *cf);
-       gboolean (*classify_func)(struct classifier_ctx * ctx,
-               statfile_pool_t *pool, GTree *input, struct rspamd_task *task,
-               lua_State *L);
-       gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
-               const char *symbol, GTree *input, gboolean in_class,
-               double *sum, double multiplier, GError **err);
-       gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
-               statfile_pool_t *pool,
-               GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L,
-               GError **err);
-       GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
-               GTree *input, struct rspamd_task *task);
-};
-
-/* Get classifier structure by name or return NULL if this name is not found */
-struct classifier * get_classifier (const char *name);
-
-/* Winnow algorithm */
-struct classifier_ctx * winnow_init (rspamd_mempool_t *pool,
-       struct rspamd_classifier_config *cf);
-gboolean winnow_classify (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       lua_State *L);
-gboolean winnow_learn (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       const char *symbol,
-       GTree *input,
-       gboolean in_class,
-       double *sum,
-       double multiplier,
-       GError **err);
-gboolean winnow_learn_spam (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       gboolean is_spam,
-       lua_State *L,
-       GError **err);
-GList * winnow_weights (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task);
-
-/* Bayes algorithm */
-struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
-       struct rspamd_classifier_config *cf);
-gboolean bayes_classify (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       lua_State *L);
-gboolean bayes_learn (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       const char *symbol,
-       GTree *input,
-       gboolean in_class,
-       double *sum,
-       double multiplier,
-       GError **err);
-gboolean bayes_learn_spam (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       gboolean is_spam,
-       lua_State *L,
-       GError **err);
-GList * bayes_weights (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task);
-/* Array of all defined classifiers */
-extern struct classifier classifiers[];
-
-#endif
-/*
- * vi:ts=4
- */
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
deleted file mode 100644 (file)
index 4bfe086..0000000
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Winnow classifier
- */
-
-#include "classifiers.h"
-#include "tokenizers/tokenizers.h"
-#include "main.h"
-#include "filter.h"
-#include "cfg_file.h"
-#include "lua/lua_common.h"
-
-#define WINNOW_PROMOTION 1.23
-#define WINNOW_DEMOTION 0.83
-
-#define MEDIAN_WINDOW_SIZE 5
-
-#define MAX_WEIGHT G_MAXDOUBLE / 2.
-
-
-
-#define MAX_LEARN_ITERATIONS 100
-
-static inline GQuark
-winnow_error_quark (void)
-{
-       return g_quark_from_static_string ("winnow-error");
-}
-
-struct winnow_callback_data {
-       statfile_pool_t *pool;
-       struct classifier_ctx *ctx;
-       stat_file_t *file;
-       stat_file_t *learn_file;
-       long double sum;
-       long double start;
-       double multiplier;
-       guint32 count;
-       guint32 new_blocks;
-       gboolean in_class;
-       gboolean do_demote;
-       gboolean fresh_run;
-       time_t now;
-};
-
-static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION;
-
-
-
-static gboolean
-winnow_classify_callback (gpointer key, gpointer value, gpointer data)
-{
-       token_node_t *node = key;
-       struct winnow_callback_data *cd = data;
-       double v;
-
-       /* Consider that not found blocks have value 1 */
-       v =
-               statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
-                       cd->now);
-       if (fabs (v) > ALPHA) {
-               cd->sum += v;
-       }
-       else {
-               cd->sum += 1.0;
-               cd->new_blocks++;
-       }
-
-       cd->count++;
-
-       return FALSE;
-}
-
-static gboolean
-winnow_learn_callback (gpointer key, gpointer value, gpointer data)
-{
-       token_node_t *node = key;
-       struct winnow_callback_data *cd = data;
-       double v, c;
-
-       c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION /
-               cd->multiplier;
-
-       /* Consider that not found blocks have value 1 */
-       v =
-               statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
-                       cd->now);
-       if (fabs (v) < ALPHA) {
-               /* Block not found, insert new */
-               cd->start += 1;
-               if (cd->file == cd->learn_file) {
-                       statfile_pool_set_block (cd->pool,
-                               cd->file,
-                               node->h1,
-                               node->h2,
-                               cd->now,
-                               c);
-                       node->value = c;
-                       cd->new_blocks++;
-               }
-       }
-       else {
-               cd->start += v;
-               /* Here we just increase the extra value of block */
-               if (cd->fresh_run) {
-                       node->extra = 0;
-               }
-               else {
-                       node->extra++;
-               }
-               node->value = v;
-
-               if (node->extra > 1) {
-                       /*
-                        * Assume that this node is common for several statfiles, so
-                        * decrease its weight proportianally
-                        */
-                       if (node->value > max_common_weight) {
-                               /* Static fluctuation */
-                               statfile_pool_set_block (cd->pool,
-                                       cd->file,
-                                       node->h1,
-                                       node->h2,
-                                       cd->now,
-                                       0.);
-                               node->value = 0.;
-                       }
-                       else if (node->value > WINNOW_PROMOTION * cd->multiplier) {
-                               /* Try to decrease its value */
-                               /* XXX: it is more intelligent to add some adaptive filter here */
-                               if (cd->file == cd->learn_file) {
-                                       if (node->value > max_common_weight / 2.) {
-                                               node->value *= c;
-                                       }
-                                       else {
-                                               /*
-                                                * Too high token value that exists also in other
-                                                * statfiles, may be statistic error, so decrease it
-                                                * slightly
-                                                */
-                                               node->value *= WINNOW_DEMOTION;
-                                       }
-                               }
-                               else {
-                                       node->value = WINNOW_DEMOTION / cd->multiplier;
-                               }
-                               statfile_pool_set_block (cd->pool,
-                                       cd->file,
-                                       node->h1,
-                                       node->h2,
-                                       cd->now,
-                                       node->value);
-                       }
-               }
-               else if (cd->file == cd->learn_file) {
-                       /* New block or block that is in only one statfile */
-                       /* Set some limit on growing */
-                       if (v > MAX_WEIGHT) {
-                               node->value = v;
-                       }
-                       else {
-                               node->value *= c;
-                       }
-                       statfile_pool_set_block (cd->pool,
-                               cd->file,
-                               node->h1,
-                               node->h2,
-                               cd->now,
-                               node->value);
-               }
-               else if (cd->do_demote) {
-                       /* Demote blocks in file */
-                       node->value *= WINNOW_DEMOTION / cd->multiplier;
-                       statfile_pool_set_block (cd->pool,
-                               cd->file,
-                               node->h1,
-                               node->h2,
-                               cd->now,
-                               node->value);
-               }
-       }
-
-
-       cd->sum += node->value;
-
-       cd->count++;
-
-       return FALSE;
-}
-
-struct classifier_ctx *
-winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
-{
-       struct classifier_ctx *ctx =
-               rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
-
-       ctx->pool = pool;
-       ctx->cfg = cfg;
-
-       return ctx;
-}
-
-gboolean
-winnow_classify (struct classifier_ctx *ctx,
-       statfile_pool_t * pool,
-       GTree * input,
-       struct rspamd_task *task,
-       lua_State *L)
-{
-       struct winnow_callback_data data;
-       char *sumbuf, *value;
-       long double res = 0., max = 0.;
-       GList *cur;
-       struct rspamd_statfile_config *st, *sel = NULL;
-       int nodes, minnodes;
-
-       g_assert (pool != NULL);
-       g_assert (ctx != NULL);
-
-       data.pool = pool;
-       data.now = time (NULL);
-       data.ctx = ctx;
-
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-               minnodes = strtol (value, NULL, 10);
-               nodes = g_tree_nnodes (input);
-               if (nodes > FEATURE_WINDOW_SIZE) {
-                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-               }
-               if (nodes < minnodes) {
-                       msg_info (
-                               "do not classify message as it has too few tokens: %d, while %d min",
-                               nodes,
-                               minnodes);
-                       return FALSE;
-               }
-       }
-
-       cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
-       if (cur) {
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t)g_list_free, cur);
-       }
-       else {
-               cur = ctx->cfg->statfiles;
-       }
-
-       while (cur) {
-               st = cur->data;
-               data.sum = 0;
-               data.count = 0;
-               data.new_blocks = 0;
-               if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
-                       if ((data.file =
-                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
-                               msg_warn ("cannot open %s, skip it", st->path);
-                               cur = g_list_next (cur);
-                               continue;
-                       }
-               }
-
-               if (data.file != NULL) {
-                       g_tree_foreach (input, winnow_classify_callback, &data);
-               }
-
-               if (data.count != 0) {
-                       res = data.sum / (double)data.count;
-               }
-               else {
-                       res = 0;
-               }
-               if (res > max) {
-                       max = res;
-                       sel = st;
-               }
-               cur = g_list_next (cur);
-       }
-
-       if (sel != NULL) {
-#ifdef WITH_LUA
-               max = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, max, L);
-#endif
-#ifdef HAVE_TANHL
-               max = tanhl (max);
-#else
-               /*
-                * As some implementations of libm does not support tanhl, try to use
-                * tanh
-                */
-               max = tanh ((double) max);
-#endif
-               sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
-               rspamd_snprintf (sumbuf, 32, "%.2F", max);
-               cur = g_list_prepend (NULL, sumbuf);
-               rspamd_task_insert_result (task, sel->symbol, max, cur);
-       }
-
-       return TRUE;
-}
-
-GList *
-winnow_weights (struct classifier_ctx *ctx,
-       statfile_pool_t * pool,
-       GTree * input,
-       struct rspamd_task *task)
-{
-       struct winnow_callback_data data;
-       long double res = 0.;
-       GList *cur, *resl = NULL;
-       struct rspamd_statfile_config *st;
-       struct classify_weight *w;
-       char *value;
-       int nodes, minnodes;
-
-       g_assert (pool != NULL);
-       g_assert (ctx != NULL);
-
-       data.pool = pool;
-       data.now = time (NULL);
-       data.ctx = ctx;
-
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-               minnodes = strtol (value, NULL, 10);
-               nodes = g_tree_nnodes (input);
-               if (nodes > FEATURE_WINDOW_SIZE) {
-                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-               }
-               if (nodes < minnodes) {
-                       msg_info (
-                               "do not classify message as it has too few tokens: %d, while %d min",
-                               nodes,
-                               minnodes);
-                       return NULL;
-               }
-       }
-
-       cur = ctx->cfg->statfiles;
-       while (cur) {
-               st = cur->data;
-               data.sum = 0;
-               data.count = 0;
-               if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
-                       if ((data.file =
-                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
-                               msg_warn ("cannot open %s, skip it", st->path);
-                               cur = g_list_next (cur);
-                               continue;
-                       }
-               }
-
-               if (data.file != NULL) {
-                       g_tree_foreach (input, winnow_classify_callback, &data);
-               }
-
-               w =
-                       rspamd_mempool_alloc0 (task->task_pool,
-                               sizeof (struct classify_weight));
-               if (data.count != 0) {
-                       res = data.sum / (double)data.count;
-               }
-               else {
-                       res = 0;
-               }
-               w->name = st->symbol;
-               w->weight = res;
-               resl = g_list_prepend (resl, w);
-               cur = g_list_next (cur);
-       }
-
-       if (resl != NULL) {
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t)g_list_free, resl);
-       }
-
-       return resl;
-
-}
-
-
-gboolean
-winnow_learn (struct classifier_ctx *ctx,
-       statfile_pool_t *pool,
-       const char *symbol,
-       GTree * input,
-       int in_class,
-       double *sum,
-       double multiplier,
-       GError **err)
-{
-       struct winnow_callback_data data = {
-               .file = NULL,
-               .multiplier = multiplier
-       };
-       char *value;
-       int nodes, minnodes, iterations = 0;
-       struct rspamd_statfile_config *st, *sel_st = NULL;
-       stat_file_t *sel = NULL, *to_learn;
-       long double res = 0., max = 0., start_value = 0., end_value = 0.;
-       double learn_threshold = 0.0;
-       GList *cur, *to_demote = NULL;
-       gboolean force_learn = FALSE;
-
-       g_assert (pool != NULL);
-       g_assert (ctx != NULL);
-
-       data.pool = pool;
-       data.in_class = in_class;
-       data.now = time (NULL);
-       data.ctx = ctx;
-
-
-       if (ctx->cfg->opts &&
-               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-               minnodes = strtol (value, NULL, 10);
-               nodes = g_tree_nnodes (input);
-               if (nodes > FEATURE_WINDOW_SIZE) {
-                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-               }
-               if (nodes < minnodes) {
-                       msg_info (
-                               "do not learn message as it has too few tokens: %d, while %d min",
-                               nodes,
-                               minnodes);
-                       if (sum != NULL) {
-                               *sum = 0;
-                       }
-                       g_set_error (err,
-                               winnow_error_quark (),              /* error domain */
-                               1,                                  /* error code */
-                               "message contains too few tokens: %d, while min is %d",
-                               nodes, minnodes);
-                       return FALSE;
-               }
-       }
-       if (ctx->cfg->opts &&
-               (value =
-               g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
-               learn_threshold = strtod (value, NULL);
-       }
-
-       if (learn_threshold <= 1.0 && learn_threshold >= 0) {
-               /* Classify message and check target statfile score */
-               cur = ctx->cfg->statfiles;
-               while (cur) {
-                       /* Open or create all statfiles inside classifier */
-                       st = cur->data;
-                       if (statfile_pool_is_open (pool, st->path) == NULL) {
-                               if (statfile_pool_open (pool, st->path, st->size,
-                                       FALSE) == NULL) {
-                                       msg_warn ("cannot open %s", st->path);
-                                       if (statfile_pool_create (pool, st->path, st->size) == -1) {
-                                               msg_err ("cannot create statfile %s", st->path);
-                                               g_set_error (err,
-                                                       winnow_error_quark (),          /* error domain */
-                                                       1,                              /* error code */
-                                                       "cannot create statfile: %s",
-                                                       st->path);
-                                               return FALSE;
-                                       }
-                                       if (statfile_pool_open (pool, st->path, st->size,
-                                               FALSE) == NULL) {
-                                               g_set_error (err,
-                                                       winnow_error_quark (),          /* error domain */
-                                                       1,                              /* error code */
-                                                       "open statfile %s after creation",
-                                                       st->path);
-                                               msg_err ("cannot open statfile %s after creation",
-                                                       st->path);
-                                               return FALSE;
-                                       }
-                               }
-                       }
-                       if (strcmp (st->symbol, symbol) == 0) {
-                               sel_st = st;
-
-                       }
-                       cur = g_list_next (cur);
-               }
-
-               if (sel_st == NULL) {
-                       g_set_error (err,
-                               winnow_error_quark (),          /* error domain */
-                               1,                              /* error code */
-                               "cannot find statfile for symbol %s",
-                               symbol);
-                       msg_err ("cannot find statfile for symbol %s", symbol);
-                       return FALSE;
-               }
-
-               to_learn = statfile_pool_is_open (pool, sel_st->path);
-               if (to_learn == NULL) {
-                       g_set_error (err,
-                               winnow_error_quark (),          /* error domain */
-                               1,                              /* error code */
-                               "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
-                               sel_st->path);
-                       return FALSE;
-               }
-               /* Check target statfile */
-               data.file = to_learn;
-               data.sum = 0;
-               data.count = 0;
-               data.new_blocks = 0;
-               g_tree_foreach (input, winnow_classify_callback, &data);
-               if (data.count > 0) {
-                       max = data.sum / (double)data.count;
-               }
-               else {
-                       max = 0;
-               }
-               /* If most of blocks are not presented in targeted statfile do forced learn */
-               if (max < 1 + learn_threshold) {
-                       force_learn = TRUE;
-               }
-               /* Check other statfiles */
-               while (cur) {
-                       st = cur->data;
-                       data.sum = 0;
-                       data.count = 0;
-                       if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
-                               g_set_error (err,
-                                       winnow_error_quark (),          /* error domain */
-                                       1,                              /* error code */
-                                       "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
-                                       st->path);
-                               return FALSE;
-                       }
-                       g_tree_foreach (input, winnow_classify_callback, &data);
-                       if (data.count != 0) {
-                               res = data.sum / data.count;
-                       }
-                       else {
-                               res = 0;
-                       }
-                       if (to_learn != data.file && res - max > 1 - learn_threshold) {
-                               /* Demote tokens in this statfile */
-                               to_demote = g_list_prepend (to_demote, data.file);
-                       }
-                       cur = g_list_next (cur);
-               }
-       }
-       else {
-               msg_err (
-                       "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
-               g_set_error (err,
-                       winnow_error_quark (),          /* error domain */
-                       1,                              /* error code */
-                       "bad learn_threshold setting: %.2f",
-                       learn_threshold);
-               return FALSE;
-       }
-       /* If to_demote list is empty this message is already classified correctly */
-       if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
-               msg_info (
-                       "this message is already of class %s with threshold %.2f and weight %.2F",
-                       sel_st->symbol,
-                       learn_threshold,
-                       max);
-               goto end;
-       }
-       data.learn_file = to_learn;
-       end_value = max;
-       do {
-               cur = ctx->cfg->statfiles;
-               data.fresh_run = TRUE;
-               while (cur) {
-                       st = cur->data;
-                       data.sum = 0;
-                       data.count = 0;
-                       data.new_blocks = 0;
-                       data.start = 0;
-                       if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
-                               return FALSE;
-                       }
-                       if (to_demote != NULL &&
-                               g_list_find (to_demote, data.file) != NULL) {
-                               data.do_demote = TRUE;
-                       }
-                       else {
-                               data.do_demote = FALSE;
-                       }
-
-                       statfile_pool_lock_file (pool, data.file);
-                       g_tree_foreach (input, winnow_learn_callback, &data);
-                       statfile_pool_unlock_file (pool, data.file);
-                       if (data.count != 0) {
-                               res = data.sum / data.count;
-                       }
-                       else {
-                               res = 0;
-                       }
-                       if (res > max) {
-                               max = res;
-                               sel = data.file;
-                       }
-                       if (data.file == to_learn) {
-                               if (data.count > 0) {
-                                       start_value = data.start / data.count;
-                               }
-                               end_value = res;
-                       }
-                       cur = g_list_next (cur);
-                       data.fresh_run = FALSE;
-               }
-
-               data.multiplier *= WINNOW_PROMOTION;
-               msg_info (
-                       "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f",
-                       iterations + 1,
-                       symbol,
-                       start_value,
-                       end_value,
-                       data.multiplier);
-       } while ((in_class ? sel != to_learn : sel ==
-               to_learn)  && iterations++ < MAX_LEARN_ITERATIONS);
-
-       if (iterations >= MAX_LEARN_ITERATIONS) {
-               msg_warn (
-                       "learning statfile %s  was not fully successfull: iterations count is limited to %d, final sum is %G",
-                       sel_st->symbol,
-                       MAX_LEARN_ITERATIONS,
-                       max);
-               g_set_error (err,
-                       winnow_error_quark (),          /* error domain */
-                       1,                              /* error code */
-                       "learning statfile %s  was not fully successfull: iterations count is limited to %d",
-                       sel_st->symbol, MAX_LEARN_ITERATIONS);
-               return FALSE;
-       }
-       else {
-               msg_info (
-                       "learned statfile %s successfully with %d iterations and sum %G",
-                       sel_st->symbol,
-                       iterations + 1,
-                       max);
-       }
-
-
-end:
-       if (sum) {
-#ifdef HAVE_TANHL
-               *sum = (double)tanhl (max);
-#else
-               /*
-                * As some implementations of libm does not support tanhl, try to use
-                * tanh
-                */
-               *sum = tanh ((double) max);
-#endif
-       }
-       return TRUE;
-}
-
-gboolean
-winnow_learn_spam (struct classifier_ctx * ctx,
-       statfile_pool_t *pool,
-       GTree *input,
-       struct rspamd_task *task,
-       gboolean is_spam,
-       lua_State *L,
-       GError **err)
-{
-       g_set_error (err,
-               winnow_error_quark (),                  /* error domain */
-               1,                                      /* error code */
-               "learn spam is not supported for winnow"
-               );
-       return FALSE;
-}
index 33422f78243ae8b11985d17f3f1f6b5d7cd45d48..3b6436490c071ce21692d18cbe94150852a20e43 100644 (file)
@@ -23,8 +23,8 @@
 
 
 #include "config.h"
-#include "tokenizers/tokenizers.h"
-#include "classifiers/classifiers.h"
+#include "tokenizers.h"
+#include "classifiers.h"
 #include "libserver/dynamic_cfg.h"
 #include "libutil/rrd.h"
 #include "libutil/map.h"
index a1f042aae7f4040185af0588d816b0a0bb8b8f3a..48285ea0a365d65f4ee1020023135b260a180fb2 100644 (file)
@@ -32,8 +32,8 @@
 #include "expressions.h"
 #include "binlog.h"
 #include "diff.h"
-#include "classifiers/classifiers.h"
-#include "tokenizers/tokenizers.h"
+#include "classifiers.h"
+#include "tokenizers.h"
 
 #ifdef WITH_LUA
 #   include "lua/lua_common.h"
index 94137af157ca73b755927b388e8a2cd14511b320..682e0cf82c80567dca77d85a541f09a460056a18 100644 (file)
@@ -30,7 +30,7 @@
 #include "html.h"
 #include "images.h"
 #include "utlist.h"
-#include "tokenizers/tokenizers.h"
+#include "tokenizers.h"
 
 #include <iconv.h>
 
index 8696da7ba5fd5a34162f27955f60cf64a6790aa0..307611301b22f8d5910f10d88eea2f4f786736e4 100644 (file)
@@ -21,13 +21,6 @@ SET(LIBRSPAMDSERVERSRC
                                url.c
                                worker_util.c)
 
-SET(TOKENIZERSSRC  ../tokenizers/tokenizers.c
-                               ../tokenizers/osb.c)
-
-SET(CLASSIFIERSSRC ../classifiers/classifiers.c
-                ../classifiers/bayes.c
-                               ../classifiers/winnow.c)
-
 # Librspamd-server
 
 #IF(WITH_DB)
@@ -37,7 +30,7 @@ SET(CLASSIFIERSSRC ../classifiers/classifiers.c
 #      LIST(APPEND LIBRSPAMDSERVERSRC kvstorage_sqlite.c)
 #ENDIF(WITH_SQLITE)
                                
-ADD_LIBRARY(rspamd-server ${LINK_TYPE} ${LIBRSPAMDSERVERSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC})
+ADD_LIBRARY(rspamd-server ${LINK_TYPE} ${LIBRSPAMDSERVERSRC})
 IF(NOT DEBIAN_BUILD)
        SET_TARGET_PROPERTIES(rspamd-server PROPERTIES VERSION ${RSPAMD_VERSION})
 ENDIF(NOT DEBIAN_BUILD)
index 5eeae7ac23f274b723a1ce036a1bd02f75377f50..c4801633927e53d84891bd3f37b37b4943cf5d99 100644 (file)
@@ -25,7 +25,7 @@
 #include "config.h"
 #include "binlog.h"
 #include "cfg_file.h"
-#include "tokenizers/tokenizers.h"
+#include "tokenizers.h"
 
 #define BINLOG_SUFFIX ".binlog"
 #define BACKUP_SUFFIX ".old"
index 13ef400ed62e3e94bc9d24d6a442ebb1f32dfb5f..44db06a0bdd86de699554296854d87a26a24e9e5 100644 (file)
@@ -28,8 +28,8 @@
 #include "cfg_file.h"
 #include "lua/lua_common.h"
 #include "expressions.h"
-#include "classifiers/classifiers.h"
-#include "tokenizers/tokenizers.h"
+#include "classifiers.h"
+#include "tokenizers.h"
 
 
 struct rspamd_rcl_default_handler_data {
index e28f6445ec3a8989c854d68e7574ffb4f7fbd9ab..b53a2690c6d1bbba8997ba9d8356b436e023c709 100644 (file)
@@ -29,7 +29,7 @@
 #include "main.h"
 #include "uthash_strcase.h"
 #include "filter.h"
-#include "classifiers/classifiers.h"
+#include "classifiers.h"
 #include "lua/lua_common.h"
 #include "kvstorage_config.h"
 #include "map.h"
index 23ed96e12941d3276d55725896fa3b9af49d4328..62f84805956d787718e39cec043a90b4cd12f656 100644 (file)
@@ -24,8 +24,8 @@
 
 #include "config.h"
 #include "cfg_file.h"
-#include "tokenizers/tokenizers.h"
-#include "classifiers/classifiers.h"
+#include "tokenizers.h"
+#include "classifiers.h"
 #include "statfile.h"
 #include "binlog.h"
 #include "buffer.h"
diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt
new file mode 100644 (file)
index 0000000..6254a41
--- /dev/null
@@ -0,0 +1,27 @@
+# Librspamdserver
+SET(LIBSTATSRC
+                               )
+SET(TOKENIZERSSRC  tokenizers/tokenizers.c
+                               tokenizers/osb.c)
+
+SET(CLASSIFIERSSRC classifiers/classifiers.c
+                classifiers/bayes.c
+                               classifiers/winnow.c)
+                               
+ADD_LIBRARY(rspamd-stat ${LINK_TYPE} ${LIBSTATSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC})
+IF(NOT DEBIAN_BUILD)
+       SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES VERSION ${RSPAMD_VERSION})
+ENDIF(NOT DEBIAN_BUILD)
+SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES LINKER_LANGUAGE C COMPILE_FLAGS "-DRSPAMD_LIB")
+TARGET_LINK_LIBRARIES(rspamd-stat rspamd-server)
+
+IF(CMAKE_COMPILER_IS_GNUCC)
+SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES COMPILE_FLAGS "-DRSPAMD_LIB -fno-strict-aliasing")
+ENDIF(CMAKE_COMPILER_IS_GNUCC)
+
+
+IF(NO_SHARED MATCHES "OFF")
+       INSTALL(TARGETS rspamd-stat
+       LIBRARY DESTINATION ${LIBDIR} 
+       PUBLIC_HEADER DESTINATION ${INCLUDEDIR})
+ENDIF(NO_SHARED MATCHES "OFF")
diff --git a/src/libstat/classifiers.h b/src/libstat/classifiers.h
new file mode 100644 (file)
index 0000000..fd1b63b
--- /dev/null
@@ -0,0 +1,111 @@
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "statfile.h"
+#include "tokenizers.h"
+#include <lua.h>
+
+/* Consider this value as 0 */
+#define ALPHA 0.0001
+
+struct rspamd_classifier_config;
+struct rspamd_task;
+
+/*
+ * Runtime context of a classifier instance, as returned by init_func.
+ */
+struct classifier_ctx {
+       /* Memory pool; bayes_init stores the pool the context was allocated from */
+       rspamd_mempool_t *pool;
+       /* Hash table of results; NOTE(review): bayes_init does not set this field */
+       GHashTable *results;
+       /* Debug flag; bayes_init sets it to FALSE */
+       gboolean debug;
+       /* Classifier configuration (options and statfiles list are read from it) */
+       struct rspamd_classifier_config *cfg;
+};
+
+/*
+ * Name/weight pair.
+ * NOTE(review): presumably the element type of the list returned by
+ * weights_func -- confirm against the winnow implementation.
+ */
+struct classify_weight {
+       const char *name;
+       long double weight;
+};
+
+/*
+ * Common classifier structure: a named table of entry points.
+ * One instance per algorithm is stored in the classifiers[] array.
+ */
+struct classifier {
+       /* Name used by get_classifier () for lookup */
+       char *name;
+       /* Create a context for the given configuration, allocated from pool */
+       struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
+               struct rspamd_classifier_config *cf);
+       /* Classify the token tree `input` for `task` */
+       gboolean (*classify_func)(struct classifier_ctx * ctx,
+               statfile_pool_t *pool, GTree *input, struct rspamd_task *task,
+               lua_State *L);
+       /* Learn (in_class) or unlearn the token tree into the statfile named by symbol */
+       gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
+               const char *symbol, GTree *input, gboolean in_class,
+               double *sum, double multiplier, GError **err);
+       /* Learn the token tree as spam or ham, depending on is_spam */
+       gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
+               statfile_pool_t *pool,
+               GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L,
+               GError **err);
+       /* Return a list of weights; the bayes implementation always returns NULL */
+       GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
+               GTree *input, struct rspamd_task *task);
+};
+
+/*
+ * Get classifier structure by name (exact strcmp match against the
+ * classifiers[] array) or return NULL if this name is not found
+ */
+struct classifier * get_classifier (const char *name);
+
+/*
+ * Winnow algorithm.
+ * These entry points fill the "winnow" slot of the classifiers[] array;
+ * their signatures mirror the function pointers of struct classifier.
+ */
+struct classifier_ctx * winnow_init (rspamd_mempool_t *pool,
+       struct rspamd_classifier_config *cf);
+gboolean winnow_classify (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       lua_State *L);
+gboolean winnow_learn (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       const char *symbol,
+       GTree *input,
+       gboolean in_class,
+       double *sum,
+       double multiplier,
+       GError **err);
+gboolean winnow_learn_spam (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       gboolean is_spam,
+       lua_State *L,
+       GError **err);
+GList * winnow_weights (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task);
+
+/*
+ * Bayes algorithm.
+ * These entry points fill the "bayes" slot of the classifiers[] array.
+ */
+/* Allocate a context from pool and bind it to the configuration cf */
+struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
+       struct rspamd_classifier_config *cf);
+/* Classify the token tree; may insert a statfile symbol into the task result */
+gboolean bayes_classify (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       lua_State *L);
+/* Learn or unlearn tokens into the statfile whose symbol equals `symbol` */
+gboolean bayes_learn (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       const char *symbol,
+       GTree *input,
+       gboolean in_class,
+       double *sum,
+       double multiplier,
+       GError **err);
+/* Learn tokens into every statfile whose is_spam flag equals `is_spam` */
+gboolean bayes_learn_spam (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       gboolean is_spam,
+       lua_State *L,
+       GError **err);
+/* Unimplemented for bayes: always returns NULL */
+GList * bayes_weights (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task);
+/* Array of all defined classifiers (defined in classifiers/classifiers.c) */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
new file mode 100644 (file)
index 0000000..3416969
--- /dev/null
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Bayesian classifier
+ */
+#include "classifiers.h"
+#include "tokenizers.h"
+#include "main.h"
+#include "filter.h"
+#include "cfg_file.h"
+#include "binlog.h"
+#include "lua/lua_common.h"
+
+#define LOCAL_PROB_DENOM 16.0
+
+/* GError domain used for every error reported by the bayes classifier */
+static inline GQuark
+bayes_error_quark (void)
+{
+       GQuark domain = g_quark_from_static_string ("bayes-error");
+
+       return domain;
+}
+
+/*
+ * Per-statfile state used while classifying a message.
+ */
+struct bayes_statfile_data {
+       /* NOTE(review): not referenced by the code visible in this file section */
+       guint64 hits;
+       /* Sum of positive block values found in this statfile over all tokens */
+       guint64 total_hits;
+       /* Block value read for the token currently being processed */
+       double value;
+       /* Statfile configuration (is_spam and symbol are read from it) */
+       struct rspamd_statfile_config *st;
+       /* Opened statfile */
+       stat_file_t *file;
+};
+
+/*
+ * State shared between bayes_classify/bayes_learn* and the GTree
+ * traversal callbacks (passed as the callback `data` argument).
+ */
+struct bayes_callback_data {
+       /* Statfile pool used for block reads and writes */
+       statfile_pool_t *pool;
+       /* Classifier context of the current call */
+       struct classifier_ctx *ctx;
+       /* Learning only: TRUE to increment token counters, FALSE to decrement */
+       gboolean in_class;
+       /* Timestamp passed to the statfile block accessors */
+       time_t now;
+       /* Learning only: statfile being updated */
+       stat_file_t *file;
+       /* Classification only: array of statfiles_num entries */
+       struct bayes_statfile_data *statfiles;
+       guint32 statfiles_num;
+       /* Classification only: summed revisions of spam / ham statfiles */
+       guint64 total_spam;
+       guint64 total_ham;
+       /* Number of tokens that contributed so far */
+       guint64 processed_tokens;
+       /* Token limit; 0 disables the limit */
+       gsize max_tokens;
+       /* Classification only: accumulated log-probabilities */
+       double spam_probability;
+       double ham_probability;
+};
+
+/*
+ * GTree traversal callback used for learning.
+ * key is a token_node_t, data is a struct bayes_callback_data.
+ * A token already present in the statfile is incremented (learn) or
+ * decremented (unlearn); a missing token is created with value 1 when
+ * learning and left untouched when unlearning.
+ * Returns TRUE (stop traversal) once max_tokens is exceeded.
+ */
+static gboolean
+bayes_learn_callback (gpointer key, gpointer value, gpointer data)
+{
+       struct bayes_callback_data *cbdata = data;
+       token_node_t *token = key;
+       gint step = cbdata->in_class ? 1 : -1;
+       guint64 stored;
+
+       stored = statfile_pool_get_block (cbdata->pool, cbdata->file,
+                       token->h1, token->h2, cbdata->now);
+
+       if (stored != 0) {
+               /* Known token: adjust the stored counter by one */
+               if (step > 0) {
+                       stored++;
+               }
+               else {
+                       stored--;
+               }
+               statfile_pool_set_block (cbdata->pool, cbdata->file,
+                       token->h1, token->h2, cbdata->now, stored);
+               cbdata->processed_tokens++;
+       }
+       else if (step > 0) {
+               /* Unknown token: consider that not found blocks get value 1 */
+               statfile_pool_set_block (cbdata->pool, cbdata->file,
+                       token->h1, token->h2, cbdata->now, step);
+               cbdata->processed_tokens++;
+       }
+
+       /* Stop learning on max tokens */
+       return (cbdata->max_tokens != 0 &&
+               cbdata->processed_tokens > cbdata->max_tokens) ? TRUE : FALSE;
+}
+
+/**
+ * Returns probability of chisquare > value with specified number of freedom
+ * degrees. Only an even number of degrees is supported: an odd count is
+ * logged and yields 0.
+ * @param value value to test
+ * @param freedom_deg number of degrees of freedom (must be even)
+ * @return probability clamped to at most 1.0, or 0 on invalid input or
+ * when the exponent calculation reports ERANGE
+ */
+static gdouble
+inv_chi_square (gdouble value, gint freedom_deg)
+{
+       long double prob, sum;
+       gint i;
+
+       if ((freedom_deg & 1) != 0) {
+               /* The check rejects odd counts, so report it as such */
+               msg_err ("non-even freedom degrees count: %d", freedom_deg);
+               return 0;
+       }
+
+       value /= 2.;
+       /* errno is cleared so that ERANGE below can only come from exp */
+       errno = 0;
+#ifdef HAVE_EXPL
+       prob = expl (-value);
+#elif defined(HAVE_EXP2L)
+       prob = exp2l (-value * log2 (M_E));
+#else
+       prob = exp (-value);
+#endif
+       if (errno == ERANGE) {
+               msg_err ("exp overflow");
+               return 0;
+       }
+       /* Sum the series exp(-m) * m^i / i! for i in [0, freedom_deg / 2) */
+       sum = prob;
+       for (i = 1; i < freedom_deg / 2; i++) {
+               prob *= value / (gdouble)i;
+               sum += prob;
+       }
+
+       return MIN (1.0, sum);
+}
+
+/*
+ * GTree traversal callback for classification: computes the local
+ * probability of a single token and adds its logarithm to the running
+ * spam/ham sums kept in the callback data.
+ * Returns TRUE (stop traversal) once max_tokens is exceeded.
+ */
+static gboolean
+bayes_classify_callback (gpointer key, gpointer value, gpointer data)
+{
+       struct bayes_callback_data *cbdata = data;
+       token_node_t *token = key;
+       struct bayes_statfile_data *sf;
+       guint64 spam_hits = 0, ham_hits = 0, all_hits = 0;
+       double spam_freq, ham_freq, spam_prob, bayes_spam_prob;
+       guint idx;
+
+       /* Collect the token's counters from every opened statfile */
+       for (idx = 0; idx < cbdata->statfiles_num; idx++) {
+               sf = &cbdata->statfiles[idx];
+               sf->value = statfile_pool_get_block (cbdata->pool,
+                               sf->file,
+                               token->h1,
+                               token->h2,
+                               cbdata->now);
+               if (!(sf->value > 0)) {
+                       continue;
+               }
+
+               sf->total_hits += sf->value;
+               if (sf->st->is_spam) {
+                       spam_hits += sf->value;
+               }
+               else {
+                       ham_hits += sf->value;
+               }
+               all_hits += sf->value;
+       }
+
+       /* Probability for this token */
+       if (all_hits > 0) {
+               spam_freq = ((double)spam_hits / MAX (1., (double)cbdata->total_spam));
+               ham_freq = ((double)ham_hits / MAX (1., (double)cbdata->total_ham));
+               spam_prob = spam_freq / (spam_freq + ham_freq);
+               bayes_spam_prob = (0.5 + spam_prob * all_hits) / (1. + all_hits);
+               cbdata->spam_probability += log (bayes_spam_prob);
+               cbdata->ham_probability += log (1. - bayes_spam_prob);
+               cbdata->processed_tokens++;
+       }
+
+       /* Stop classifying on max tokens */
+       return (cbdata->max_tokens != 0 &&
+               cbdata->processed_tokens > cbdata->max_tokens) ? TRUE : FALSE;
+}
+
+/*
+ * Create a bayes classifier context.
+ * The context is allocated from `pool` and keeps references to both the
+ * pool and the classifier configuration `cfg`. Every field is set
+ * explicitly because the allocation is not assumed to return zeroed memory.
+ */
+struct classifier_ctx *
+bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
+{
+       struct classifier_ctx *ctx =
+               rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
+
+       ctx->pool = pool;
+       ctx->cfg = cfg;
+       ctx->debug = FALSE;
+       /* Do not leave a dangling pointer in the unused results table slot */
+       ctx->results = NULL;
+
+       return ctx;
+}
+
+/*
+ * Classify the token tree `input` for `task`.
+ *
+ * Opens the statfiles to consult, accumulates per-token probabilities with
+ * bayes_classify_callback and combines them through inv_chi_square. When
+ * the final probability differs from 0.5 by more than 0.05, the symbol of
+ * the matching (spam or ham) statfile with the most hits is inserted into
+ * the task result.
+ *
+ * Returns FALSE only when the "min_tokens" option is set and the message
+ * has too few tokens; TRUE otherwise, even if no symbol was inserted.
+ */
+gboolean
+bayes_classify (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       lua_State *L)
+{
+       struct bayes_callback_data data;
+       gchar *value;
+       gint nodes, i = 0, selected_st = -1, cnt;
+       gint minnodes;
+       guint64 maxhits = 0, rev;
+       double final_prob, h, s;
+       struct rspamd_statfile_config *st;
+       stat_file_t *file;
+       GList *cur;
+       char *sumbuf;
+
+       g_assert (pool != NULL);
+       g_assert (ctx != NULL);
+
+       /* Optional lower bound on the (window-adjusted) number of tokens */
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+               minnodes = strtol (value, NULL, 10);
+               nodes = g_tree_nnodes (input);
+               if (nodes > FEATURE_WINDOW_SIZE) {
+                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+               }
+               if (nodes < minnodes) {
+                       return FALSE;
+               }
+       }
+
+       /*
+        * A list returned by the lua pre callbacks is freed together with the
+        * task pool; otherwise fall back to all configured statfiles
+        */
+       cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+       if (cur) {
+               rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t)g_list_free, cur);
+       }
+       else {
+               cur = ctx->cfg->statfiles;
+       }
+
+       data.statfiles_num = g_list_length (cur);
+       data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
+       data.pool = pool;
+       data.now = time (NULL);
+       data.ctx = ctx;
+
+       data.processed_tokens = 0;
+       data.spam_probability = 0;
+       data.ham_probability = 0;
+       data.total_ham = 0;
+       data.total_spam = 0;
+       /* minnodes is reused here as a temporary for the parsed max_tokens limit */
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+               minnodes = rspamd_config_parse_limit (value, -1);
+               data.max_tokens = minnodes;
+       }
+       else {
+               data.max_tokens = 0;
+       }
+
+       while (cur) {
+               /* Select statfile to classify */
+               st = cur->data;
+               if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+                       if ((file =
+                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+                               /* Unopenable statfiles are skipped and not counted */
+                               msg_warn ("cannot open %s", st->path);
+                               cur = g_list_next (cur);
+                               data.statfiles_num--;
+                               continue;
+                       }
+               }
+               data.statfiles[i].file = file;
+               data.statfiles[i].st = st;
+               /* Statfile revisions are summed per class and used as totals */
+               statfile_get_revision (file, &rev, NULL);
+               if (st->is_spam) {
+                       data.total_spam += rev;
+               }
+               else {
+                       data.total_ham += rev;
+               }
+
+               cur = g_list_next (cur);
+               i++;
+       }
+
+       /* cnt is the number of statfiles that were actually opened */
+       cnt = i;
+
+       g_tree_foreach (input, bayes_classify_callback, &data);
+
+       if (data.processed_tokens == 0 || data.spam_probability == 0) {
+               final_prob = 0;
+       }
+       else {
+               /* Combine the accumulated log-probabilities of both classes */
+               h = 1 - inv_chi_square (-2. * data.spam_probability,
+                               2 * data.processed_tokens);
+               s = 1 - inv_chi_square (-2. * data.ham_probability,
+                               2 * data.processed_tokens);
+               final_prob = (s + 1 - h) / 2.;
+       }
+
+       /* Only report a result that is far enough from the undecided value 0.5 */
+       if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
+
+               sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
+               /* Among statfiles of the winning class pick the one with most hits */
+               for (i = 0; i < cnt; i++) {
+                       if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) ||
+                               (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
+                               continue;
+                       }
+                       if (data.statfiles[i].total_hits > maxhits) {
+                               maxhits = data.statfiles[i].total_hits;
+                               selected_st = i;
+                       }
+               }
+               if (selected_st == -1) {
+                       msg_err (
+                               "unexpected classifier error: cannot select desired statfile");
+               }
+               else {
+                       /* Calculate ham probability correctly */
+                       if (final_prob < 0.5) {
+                               final_prob = 1. - final_prob;
+                       }
+                       rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
+                       cur = g_list_prepend (NULL, sumbuf);
+                       rspamd_task_insert_result (task,
+                               data.statfiles[selected_st].st->symbol,
+                               final_prob,
+                               cur);
+               }
+       }
+
+       g_free (data.statfiles);
+
+       return TRUE;
+}
+
+/*
+ * Learn (in_class == TRUE) or unlearn the token tree `input` into the
+ * statfile whose symbol equals `symbol`.
+ *
+ * @param ctx classifier context (must not be NULL)
+ * @param pool statfile pool (must not be NULL)
+ * @param symbol symbol of the statfile to update
+ * @param input tree of token_node_t tokens
+ * @param in_class TRUE to increment token counters, FALSE to decrement
+ * @param sum optional output (may be NULL): number of processed tokens,
+ * or 0 when the message has too few tokens
+ * @param multiplier not used by this implementation
+ * @param err filled on failure
+ * @return TRUE on success; FALSE when the message has too few tokens, no
+ * statfile matches `symbol`, or the statfile cannot be created/opened
+ */
+gboolean
+bayes_learn (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       const char *symbol,
+       GTree *input,
+       gboolean in_class,
+       double *sum,
+       double multiplier,
+       GError **err)
+{
+       struct bayes_callback_data data;
+       gchar *value;
+       gint nodes;
+       gint minnodes;
+       struct rspamd_statfile_config *st, *sel_st = NULL;
+       stat_file_t *to_learn;
+       GList *cur;
+
+       g_assert (pool != NULL);
+       g_assert (ctx != NULL);
+
+       /* Optional lower bound on the (window-adjusted) number of tokens */
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+               minnodes = strtol (value, NULL, 10);
+               nodes = g_tree_nnodes (input);
+               if (nodes > FEATURE_WINDOW_SIZE) {
+                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+               }
+               if (nodes < minnodes) {
+                       msg_info (
+                               "do not learn message as it has too few tokens: %d, while %d min",
+                               nodes,
+                               minnodes);
+                       /* sum is optional, exactly as on the success path below */
+                       if (sum != NULL) {
+                               *sum = 0;
+                       }
+                       g_set_error (err,
+                               bayes_error_quark (),           /* error domain */
+                               1,                              /* error code */
+                               "message contains too few tokens: %d, while min is %d",
+                               nodes, (int)minnodes);
+                       return FALSE;
+               }
+       }
+
+       data.pool = pool;
+       data.in_class = in_class;
+       data.now = time (NULL);
+       data.ctx = ctx;
+       data.processed_tokens = 0;
+       /* minnodes is reused here as a temporary for the parsed max_tokens limit */
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+               minnodes = rspamd_config_parse_limit (value, -1);
+               data.max_tokens = minnodes;
+       }
+       else {
+               data.max_tokens = 0;
+       }
+       cur = ctx->cfg->statfiles;
+       while (cur) {
+               /* Select statfile to learn */
+               st = cur->data;
+               if (strcmp (st->symbol, symbol) == 0) {
+                       sel_st = st;
+                       break;
+               }
+               cur = g_list_next (cur);
+       }
+       if (sel_st == NULL) {
+               g_set_error (err,
+                       bayes_error_quark (),           /* error domain */
+                       1,                              /* error code */
+                       "cannot find statfile for symbol: %s",
+                       symbol);
+               return FALSE;
+       }
+       /* Reuse an already opened statfile, else open it, creating it on demand */
+       if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
+               if ((to_learn =
+                       statfile_pool_open (pool, sel_st->path, sel_st->size,
+                       FALSE)) == NULL) {
+                       msg_warn ("cannot open %s", sel_st->path);
+                       if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
+                               msg_err ("cannot create statfile %s", sel_st->path);
+                               g_set_error (err,
+                                       bayes_error_quark (),           /* error domain */
+                                       1,                              /* error code */
+                                       "cannot create statfile: %s",
+                                       sel_st->path);
+                               return FALSE;
+                       }
+                       if ((to_learn =
+                               statfile_pool_open (pool, sel_st->path, sel_st->size,
+                               FALSE)) == NULL) {
+                               g_set_error (err,
+                                       bayes_error_quark (),           /* error domain */
+                                       1,                              /* error code */
+                                       "cannot open statfile %s after creation",
+                                       sel_st->path);
+                               msg_err ("cannot open statfile %s after creation",
+                                       sel_st->path);
+                               return FALSE;
+                       }
+               }
+       }
+       /* Token update and revision bump happen under the statfile lock */
+       data.file = to_learn;
+       statfile_pool_lock_file (pool, data.file);
+       g_tree_foreach (input, bayes_learn_callback, &data);
+       statfile_inc_revision (to_learn);
+       statfile_pool_unlock_file (pool, data.file);
+
+       if (sum != NULL) {
+               *sum = data.processed_tokens;
+       }
+
+       return TRUE;
+}
+
+/*
+ * Learn the token tree `input` into every statfile of the requested class
+ * (spam when is_spam is TRUE, ham otherwise).
+ *
+ * The statfile list comes from the lua pre callbacks; when they return
+ * nothing, all configured statfiles are used but labelled ones are skipped.
+ * Each selected statfile is created if it cannot be opened, updated under
+ * its lock, has its revision increased and is passed to maybe_write_binlog.
+ *
+ * Returns FALSE (with err set) when the message has too few tokens or a
+ * statfile cannot be created/opened; TRUE otherwise.
+ */
+gboolean
+bayes_learn_spam (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       gboolean is_spam,
+       lua_State *L,
+       GError **err)
+{
+       struct bayes_callback_data data;
+       gchar *value;
+       gint nodes;
+       gint minnodes;
+       struct rspamd_statfile_config *st;
+       stat_file_t *file;
+       GList *cur;
+       gboolean skip_labels;
+
+       g_assert (pool != NULL);
+       g_assert (ctx != NULL);
+
+       /* Optional lower bound on the (window-adjusted) number of tokens */
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+               minnodes = strtol (value, NULL, 10);
+               nodes = g_tree_nnodes (input);
+               if (nodes > FEATURE_WINDOW_SIZE) {
+                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+               }
+               if (nodes < minnodes) {
+                       g_set_error (err,
+                               bayes_error_quark (),           /* error domain */
+                               1,                              /* error code */
+                               "message contains too few tokens: %d, while min is %d",
+                               nodes, (int)minnodes);
+                       return FALSE;
+               }
+       }
+
+       cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
+       if (cur) {
+               /* The callback-provided list is freed together with the task pool */
+               skip_labels = FALSE;
+               rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t)g_list_free, cur);
+       }
+       else {
+               /* Do not try to learn specific statfiles if pre callback returned nil */
+               skip_labels = TRUE;
+               cur = ctx->cfg->statfiles;
+       }
+
+       data.pool = pool;
+       data.now = time (NULL);
+       data.ctx = ctx;
+       /* Token counters are always incremented here, never decremented */
+       data.in_class = TRUE;
+
+       /* processed_tokens is not reset between statfiles inside the loop below */
+       data.processed_tokens = 0;
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+               minnodes = rspamd_config_parse_limit (value, -1);
+               data.max_tokens = minnodes;
+       }
+       else {
+               data.max_tokens = 0;
+       }
+
+       while (cur) {
+               /* Select statfiles to learn */
+               st = cur->data;
+               if (st->is_spam != is_spam || (skip_labels && st->label)) {
+                       cur = g_list_next (cur);
+                       continue;
+               }
+               /* Reuse an already opened statfile, else open it, creating it on demand */
+               if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+                       if ((file =
+                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+                               msg_warn ("cannot open %s", st->path);
+                               if (statfile_pool_create (pool, st->path, st->size) == -1) {
+                                       msg_err ("cannot create statfile %s", st->path);
+                                       g_set_error (err,
+                                               bayes_error_quark (),           /* error domain */
+                                               1,                              /* error code */
+                                               "cannot create statfile: %s",
+                                               st->path);
+                                       return FALSE;
+                               }
+                               if ((file =
+                                       statfile_pool_open (pool, st->path, st->size,
+                                       FALSE)) == NULL) {
+                                       g_set_error (err,
+                                               bayes_error_quark (),           /* error domain */
+                                               1,                              /* error code */
+                                               "cannot open statfile %s after creation",
+                                               st->path);
+                                       msg_err ("cannot open statfile %s after creation",
+                                               st->path);
+                                       return FALSE;
+                               }
+                       }
+               }
+               /* Token update and revision bump happen under the statfile lock */
+               data.file = file;
+               statfile_pool_lock_file (pool, data.file);
+               g_tree_foreach (input, bayes_learn_callback, &data);
+               statfile_inc_revision (file);
+               statfile_pool_unlock_file (pool, data.file);
+               maybe_write_binlog (ctx->cfg, st, file, input);
+               msg_info ("increase revision for %s", st->path);
+
+               cur = g_list_next (cur);
+       }
+
+       return TRUE;
+}
+
+/*
+ * weights_func slot for the bayes classifier.
+ * All arguments are ignored and NULL is always returned.
+ */
+GList *
+bayes_weights (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task)
+{
+       /* This function is unimplemented with new normalizer */
+       return NULL;
+}
diff --git a/src/libstat/classifiers/classifiers.c b/src/libstat/classifiers/classifiers.c
new file mode 100644 (file)
index 0000000..95dd52c
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Common classifier functions
+ */
+
+#include "classifiers.h"
+
+/*
+ * Table of all available classifiers; get_classifier () searches it by
+ * the name field.
+ */
+struct classifier classifiers[] = {
+       {
+               .name = "winnow",
+               .init_func = winnow_init,
+               .classify_func = winnow_classify,
+               .learn_func = winnow_learn,
+               .learn_spam_func = winnow_learn_spam,
+               .weights_func = winnow_weights
+       },
+       {
+               .name = "bayes",
+               .init_func = bayes_init,
+               .classify_func = bayes_classify,
+               .learn_func = bayes_learn,
+               .learn_spam_func = bayes_learn_spam,
+               .weights_func = bayes_weights
+       }
+};
+
+/*
+ * Look up a classifier by its exact (case-sensitive) name.
+ * Returns a pointer to the matching entry of the classifiers[] table, or
+ * NULL when name is NULL or no entry matches.
+ */
+struct classifier *
+get_classifier (const char *name)
+{
+       guint i;
+
+       if (name == NULL) {
+               /* Passing NULL to strcmp is undefined behaviour */
+               return NULL;
+       }
+
+       for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) {
+               if (strcmp (classifiers[i].name, name) == 0) {
+                       return &classifiers[i];
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/libstat/classifiers/winnow.c b/src/libstat/classifiers/winnow.c
new file mode 100644 (file)
index 0000000..68d4569
--- /dev/null
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Winnow classifier
+ */
+
+#include "classifiers.h"
+#include "tokenizers.h"
+#include "main.h"
+#include "filter.h"
+#include "cfg_file.h"
+#include "lua/lua_common.h"
+
+/* Factor applied to the weight of a token when it is promoted */
+#define WINNOW_PROMOTION 1.23
+/* Factor applied to the weight of a token when it is demoted */
+#define WINNOW_DEMOTION 0.83
+
+#define MEDIAN_WINDOW_SIZE 5
+
+/*
+ * Weights above this limit are not promoted any further.
+ * The expansion is parenthesised so that it is safe inside any expression.
+ */
+#define MAX_WEIGHT (G_MAXDOUBLE / 2.)
+
+
+
+/* Upper bound for the number of passes of the learn loop in winnow_learn () */
+#define MAX_LEARN_ITERATIONS 100
+
+/*
+ * Error domain of the GError objects that are set by the winnow classifier
+ */
+static inline GQuark
+winnow_error_quark (void)
+{
+       const GQuark domain = g_quark_from_static_string ("winnow-error");
+
+       return domain;
+}
+
+/* State shared between the winnow functions and their GTree callbacks */
+struct winnow_callback_data {
+       statfile_pool_t *pool;          /* pool used to get and set blocks */
+       struct classifier_ctx *ctx;     /* classifier context of the caller */
+       stat_file_t *file;              /* statfile that is processed now */
+       stat_file_t *learn_file;        /* statfile that is being learned */
+       long double sum;                /* sum of token weights (after the update when learning) */
+       long double start;              /* sum of token weights as read before a learn pass updates them */
+       double multiplier;              /* scales the promotion and demotion factors */
+       guint32 count;                  /* number of tokens visited */
+       guint32 new_blocks;             /* tokens not found (classify) or newly inserted (learn) */
+       gboolean in_class;              /* selects the promotion (TRUE) or demotion factor for learn_file */
+       gboolean do_demote;             /* demote tokens of `file' when it is not learn_file */
+       gboolean fresh_run;             /* TRUE for the first statfile of a learn pass: resets node->extra */
+       time_t now;                     /* timestamp passed to the statfile block functions */
+};
+
+/*
+ * Limit for tokens that are found in several statfiles: winnow_learn_callback ()
+ * resets weights above it to zero
+ */
+static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION;
+
+
+
+/*
+ * GTree traversal callback used for classification: adds the weight that the
+ * token (tree key) has in the current statfile to the running sum.
+ * Always returns FALSE, so the traversal is never interrupted.
+ */
+static gboolean
+winnow_classify_callback (gpointer key, gpointer value, gpointer data)
+{
+       struct winnow_callback_data *cbdata = data;
+       token_node_t *token = key;
+       double weight;
+
+       weight = statfile_pool_get_block (cbdata->pool, cbdata->file,
+                       token->h1, token->h2, cbdata->now);
+
+       if (!(fabs (weight) > ALPHA)) {
+               /* Blocks that are absent from the statfile are counted as 1 */
+               cbdata->new_blocks++;
+               weight = 1.0;
+       }
+
+       cbdata->sum += weight;
+       cbdata->count++;
+
+       return FALSE;
+}
+
+/*
+ * GTree traversal callback used for learning: updates the weight of the token
+ * (tree key) in the current statfile (cd->file).
+ * The weight read from the statfile is added to cd->start, the resulting
+ * node->value is added to cd->sum and cd->count is incremented.
+ * Always returns FALSE, so the traversal is never interrupted.
+ */
+static gboolean
+winnow_learn_callback (gpointer key, gpointer value, gpointer data)
+{
+       token_node_t *node = key;
+       struct winnow_callback_data *cd = data;
+       double v, c;
+
+       /* Factor for the learned statfile: promotion if in class, demotion otherwise */
+       c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION /
+               cd->multiplier;
+
+       /* Consider that not found blocks have value 1 */
+       v =
+               statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
+                       cd->now);
+       if (fabs (v) < ALPHA) {
+               /* Block not found, insert new */
+               /*
+                * NOTE(review): when cd->file is not the learned statfile,
+                * node->value is left untouched here, so the value from the
+                * previously processed statfile is added to cd->sum below;
+                * confirm that this is intended
+                */
+               cd->start += 1;
+               if (cd->file == cd->learn_file) {
+                       statfile_pool_set_block (cd->pool,
+                               cd->file,
+                               node->h1,
+                               node->h2,
+                               cd->now,
+                               c);
+                       node->value = c;
+                       cd->new_blocks++;
+               }
+       }
+       else {
+               cd->start += v;
+               /* Here we just increase the extra value of block */
+               if (cd->fresh_run) {
+                       node->extra = 0;
+               }
+               else {
+                       node->extra++;
+               }
+               node->value = v;
+
+               if (node->extra > 1) {
+                       /*
+                        * Assume that this node is common for several statfiles, so
+                        * decrease its weight proportionally
+                        */
+                       if (node->value > max_common_weight) {
+                               /* Static fluctuation */
+                               statfile_pool_set_block (cd->pool,
+                                       cd->file,
+                                       node->h1,
+                                       node->h2,
+                                       cd->now,
+                                       0.);
+                               node->value = 0.;
+                       }
+                       else if (node->value > WINNOW_PROMOTION * cd->multiplier) {
+                               /* Try to decrease its value */
+                               /* XXX: it is more intelligent to add some adaptive filter here */
+                               if (cd->file == cd->learn_file) {
+                                       if (node->value > max_common_weight / 2.) {
+                                               node->value *= c;
+                                       }
+                                       else {
+                                               /*
+                                                * Too high token value that exists also in other
+                                                * statfiles, may be statistic error, so decrease it
+                                                * slightly
+                                                */
+                                               node->value *= WINNOW_DEMOTION;
+                                       }
+                               }
+                               else {
+                                       node->value = WINNOW_DEMOTION / cd->multiplier;
+                               }
+                               statfile_pool_set_block (cd->pool,
+                                       cd->file,
+                                       node->h1,
+                                       node->h2,
+                                       cd->now,
+                                       node->value);
+                       }
+               }
+               else if (cd->file == cd->learn_file) {
+                       /* New block or block that is in only one statfile */
+                       /* Set some limit on growing */
+                       if (v > MAX_WEIGHT) {
+                               node->value = v;
+                       }
+                       else {
+                               node->value *= c;
+                       }
+                       statfile_pool_set_block (cd->pool,
+                               cd->file,
+                               node->h1,
+                               node->h2,
+                               cd->now,
+                               node->value);
+               }
+               else if (cd->do_demote) {
+                       /* Demote blocks in file */
+                       node->value *= WINNOW_DEMOTION / cd->multiplier;
+                       statfile_pool_set_block (cd->pool,
+                               cd->file,
+                               node->h1,
+                               node->h2,
+                               cd->now,
+                               node->value);
+               }
+       }
+
+
+       cd->sum += node->value;
+
+       cd->count++;
+
+       return FALSE;
+}
+
+/*
+ * Allocate a classifier context from the memory pool and bind it to the pool
+ * and to the classifier configuration
+ */
+struct classifier_ctx *
+winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
+{
+       struct classifier_ctx *result;
+
+       result = rspamd_mempool_alloc (pool, sizeof (*result));
+       result->cfg = cfg;
+       result->pool = pool;
+
+       return result;
+}
+
+/*
+ * Classify the input tokens against the statfiles of the classifier and insert
+ * the symbol of the statfile with the highest average weight into the task.
+ * The statfiles are taken from the lua pre callbacks or, if they return
+ * nothing, from the classifier configuration.
+ * Returns FALSE if the message has fewer tokens than the `min_tokens' option
+ * requires and TRUE otherwise (even if no symbol has been inserted).
+ */
+gboolean
+winnow_classify (struct classifier_ctx *ctx,
+       statfile_pool_t * pool,
+       GTree * input,
+       struct rspamd_task *task,
+       lua_State *L)
+{
+       struct winnow_callback_data cbdata;
+       struct rspamd_statfile_config *stcf, *best = NULL;
+       long double score, best_score = 0.;
+       GList *it;
+       char *min_tokens = NULL, *scorebuf;
+       int ntokens, limit;
+
+       g_assert (pool != NULL);
+       g_assert (ctx != NULL);
+
+       cbdata.pool = pool;
+       cbdata.now = time (NULL);
+       cbdata.ctx = ctx;
+
+       if (ctx->cfg->opts) {
+               min_tokens = g_hash_table_lookup (ctx->cfg->opts, "min_tokens");
+       }
+
+       if (min_tokens != NULL) {
+               limit = strtol (min_tokens, NULL, 10);
+               ntokens = g_tree_nnodes (input);
+               if (ntokens > FEATURE_WINDOW_SIZE) {
+                       ntokens = ntokens / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+               }
+               if (ntokens < limit) {
+                       msg_info (
+                               "do not classify message as it has too few tokens: %d, while %d min",
+                               ntokens,
+                               limit);
+                       return FALSE;
+               }
+       }
+
+       /* A list returned by the pre callbacks is released together with the task */
+       it = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+       if (it == NULL) {
+               it = ctx->cfg->statfiles;
+       }
+       else {
+               rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t)g_list_free, it);
+       }
+
+       for (; it != NULL; it = g_list_next (it)) {
+               stcf = it->data;
+               cbdata.sum = 0;
+               cbdata.count = 0;
+               cbdata.new_blocks = 0;
+
+               cbdata.file = statfile_pool_is_open (pool, stcf->path);
+               if (cbdata.file == NULL) {
+                       cbdata.file = statfile_pool_open (pool, stcf->path, stcf->size,
+                                       FALSE);
+               }
+               if (cbdata.file == NULL) {
+                       msg_warn ("cannot open %s, skip it", stcf->path);
+                       continue;
+               }
+
+               g_tree_foreach (input, winnow_classify_callback, &cbdata);
+
+               /* Average weight of a token in this statfile */
+               if (cbdata.count == 0) {
+                       score = 0;
+               }
+               else {
+                       score = cbdata.sum / (double)cbdata.count;
+               }
+
+               if (score > best_score) {
+                       best_score = score;
+                       best = stcf;
+               }
+       }
+
+       if (best == NULL) {
+               /* No statfile has a positive score, nothing to insert */
+               return TRUE;
+       }
+
+#ifdef WITH_LUA
+       best_score = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, best_score, L);
+#endif
+#ifdef HAVE_TANHL
+       best_score = tanhl (best_score);
+#else
+       /*
+        * Some implementations of libm lack tanhl, so fall back to tanh
+        */
+       best_score = tanh ((double) best_score);
+#endif
+       scorebuf = rspamd_mempool_alloc (task->task_pool, 32);
+       rspamd_snprintf (scorebuf, 32, "%.2F", best_score);
+       it = g_list_prepend (NULL, scorebuf);
+       rspamd_task_insert_result (task, best->symbol, best_score, it);
+
+       return TRUE;
+}
+
+/*
+ * Calculate the average token weight of the input for each statfile from the
+ * classifier configuration.
+ * Returns a list of `struct classify_weight' elements (symbol name and weight)
+ * that are allocated from the task pool; the list itself is released by a
+ * destructor registered in the task pool. NULL is returned if the message has
+ * fewer tokens than the `min_tokens' option requires or if no statfile has
+ * been processed.
+ */
+GList *
+winnow_weights (struct classifier_ctx *ctx,
+       statfile_pool_t * pool,
+       GTree * input,
+       struct rspamd_task *task)
+{
+       struct winnow_callback_data data;
+       long double res = 0.;
+       GList *cur, *resl = NULL;
+       struct rspamd_statfile_config *st;
+       struct classify_weight *w;
+       char *value;
+       int nodes, minnodes;
+
+       g_assert (pool != NULL);
+       g_assert (ctx != NULL);
+
+       data.pool = pool;
+       data.now = time (NULL);
+       data.ctx = ctx;
+
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+               minnodes = strtol (value, NULL, 10);
+               nodes = g_tree_nnodes (input);
+               if (nodes > FEATURE_WINDOW_SIZE) {
+                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+               }
+               if (nodes < minnodes) {
+                       msg_info (
+                               "do not classify message as it has too few tokens: %d, while %d min",
+                               nodes,
+                               minnodes);
+                       return NULL;
+               }
+       }
+
+       cur = ctx->cfg->statfiles;
+       while (cur) {
+               st = cur->data;
+               data.sum = 0;
+               data.count = 0;
+               /*
+                * The classify callback increments this counter, so it must not be
+                * left uninitialised
+                */
+               data.new_blocks = 0;
+               if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+                       if ((data.file =
+                               statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+                               msg_warn ("cannot open %s, skip it", st->path);
+                               cur = g_list_next (cur);
+                               continue;
+                       }
+               }
+
+               /* data.file is always valid here */
+               g_tree_foreach (input, winnow_classify_callback, &data);
+
+               w =
+                       rspamd_mempool_alloc0 (task->task_pool,
+                               sizeof (struct classify_weight));
+               if (data.count != 0) {
+                       res = data.sum / (double)data.count;
+               }
+               else {
+                       res = 0;
+               }
+               w->name = st->symbol;
+               w->weight = res;
+               resl = g_list_prepend (resl, w);
+               cur = g_list_next (cur);
+       }
+
+       if (resl != NULL) {
+               rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t)g_list_free, resl);
+       }
+
+       return resl;
+
+}
+
+
+/*
+ * Learn the statfile that corresponds to `symbol' with the input tokens.
+ *
+ * All statfiles of the classifier are opened (and created if needed), then the
+ * input is classified against the target statfile and against the other ones:
+ * statfiles that outscore the target are scheduled for demotion. After that
+ * learn passes are repeated, with a growing multiplier, until the target
+ * statfile has (in_class != 0) or has not (in_class == 0) the best score or
+ * until MAX_LEARN_ITERATIONS is exceeded.
+ *
+ * ctx          classifier context
+ * pool         statfile pool
+ * symbol       symbol of the statfile to learn
+ * input        tree of tokens
+ * in_class     promote (non-zero) or demote (zero) tokens in the target
+ * sum          if not NULL, receives tanh of the final weight (0 if the
+ *              message has too few tokens)
+ * multiplier   initial multiplier for promotion and demotion factors
+ * err          error location that is set when FALSE is returned
+ *
+ * Returns TRUE if the statfile has been learned or if the message is already
+ * of the required class, FALSE otherwise.
+ */
+gboolean
+winnow_learn (struct classifier_ctx *ctx,
+       statfile_pool_t *pool,
+       const char *symbol,
+       GTree * input,
+       int in_class,
+       double *sum,
+       double multiplier,
+       GError **err)
+{
+       struct winnow_callback_data data = {
+               .file = NULL,
+               .multiplier = multiplier
+       };
+       char *value;
+       int nodes, minnodes, iterations = 0;
+       struct rspamd_statfile_config *st, *sel_st = NULL;
+       stat_file_t *sel = NULL, *to_learn;
+       long double res = 0., max = 0., start_value = 0., end_value = 0.;
+       double learn_threshold = 0.0;
+       GList *cur, *to_demote = NULL;
+       gboolean force_learn = FALSE;
+
+       g_assert (pool != NULL);
+       g_assert (ctx != NULL);
+
+       data.pool = pool;
+       data.in_class = in_class;
+       data.now = time (NULL);
+       data.ctx = ctx;
+
+
+       if (ctx->cfg->opts &&
+               (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+               minnodes = strtol (value, NULL, 10);
+               nodes = g_tree_nnodes (input);
+               if (nodes > FEATURE_WINDOW_SIZE) {
+                       nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+               }
+               if (nodes < minnodes) {
+                       msg_info (
+                               "do not learn message as it has too few tokens: %d, while %d min",
+                               nodes,
+                               minnodes);
+                       if (sum != NULL) {
+                               *sum = 0;
+                       }
+                       g_set_error (err,
+                               winnow_error_quark (),              /* error domain */
+                               1,                                  /* error code */
+                               "message contains too few tokens: %d, while min is %d",
+                               nodes, minnodes);
+                       return FALSE;
+               }
+       }
+       if (ctx->cfg->opts &&
+               (value =
+               g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
+               learn_threshold = strtod (value, NULL);
+       }
+
+       if (learn_threshold <= 1.0 && learn_threshold >= 0) {
+               /* Classify message and check target statfile score */
+               cur = ctx->cfg->statfiles;
+               while (cur) {
+                       /* Open or create all statfiles inside classifier */
+                       st = cur->data;
+                       if (statfile_pool_is_open (pool, st->path) == NULL) {
+                               if (statfile_pool_open (pool, st->path, st->size,
+                                       FALSE) == NULL) {
+                                       msg_warn ("cannot open %s", st->path);
+                                       if (statfile_pool_create (pool, st->path, st->size) == -1) {
+                                               msg_err ("cannot create statfile %s", st->path);
+                                               g_set_error (err,
+                                                       winnow_error_quark (),          /* error domain */
+                                                       1,                              /* error code */
+                                                       "cannot create statfile: %s",
+                                                       st->path);
+                                               return FALSE;
+                                       }
+                                       if (statfile_pool_open (pool, st->path, st->size,
+                                               FALSE) == NULL) {
+                                               g_set_error (err,
+                                                       winnow_error_quark (),          /* error domain */
+                                                       1,                              /* error code */
+                                                       "open statfile %s after creation",
+                                                       st->path);
+                                               msg_err ("cannot open statfile %s after creation",
+                                                       st->path);
+                                               return FALSE;
+                                       }
+                               }
+                       }
+                       if (strcmp (st->symbol, symbol) == 0) {
+                               sel_st = st;
+
+                       }
+                       cur = g_list_next (cur);
+               }
+
+               if (sel_st == NULL) {
+                       g_set_error (err,
+                               winnow_error_quark (),          /* error domain */
+                               1,                              /* error code */
+                               "cannot find statfile for symbol %s",
+                               symbol);
+                       msg_err ("cannot find statfile for symbol %s", symbol);
+                       return FALSE;
+               }
+
+               to_learn = statfile_pool_is_open (pool, sel_st->path);
+               if (to_learn == NULL) {
+                       g_set_error (err,
+                               winnow_error_quark (),          /* error domain */
+                               1,                              /* error code */
+                               "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+                               sel_st->path);
+                       return FALSE;
+               }
+               /* Check target statfile */
+               data.file = to_learn;
+               data.sum = 0;
+               data.count = 0;
+               data.new_blocks = 0;
+               g_tree_foreach (input, winnow_classify_callback, &data);
+               if (data.count > 0) {
+                       max = data.sum / (double)data.count;
+               }
+               else {
+                       max = 0;
+               }
+               /* If most of blocks are not presented in targeted statfile do forced learn */
+               if (max < 1 + learn_threshold) {
+                       force_learn = TRUE;
+               }
+               /*
+                * Check other statfiles: the iterator has been exhausted by the
+                * loop above, so rewind it, otherwise nothing is ever checked
+                */
+               cur = ctx->cfg->statfiles;
+               while (cur) {
+                       st = cur->data;
+                       data.sum = 0;
+                       data.count = 0;
+                       if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+                               g_set_error (err,
+                                       winnow_error_quark (),          /* error domain */
+                                       1,                              /* error code */
+                                       "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+                                       st->path);
+                               g_list_free (to_demote);
+                               return FALSE;
+                       }
+                       g_tree_foreach (input, winnow_classify_callback, &data);
+                       if (data.count != 0) {
+                               res = data.sum / data.count;
+                       }
+                       else {
+                               res = 0;
+                       }
+                       if (to_learn != data.file && res - max > 1 - learn_threshold) {
+                               /* Demote tokens in this statfile */
+                               to_demote = g_list_prepend (to_demote, data.file);
+                       }
+                       cur = g_list_next (cur);
+               }
+       }
+       else {
+               msg_err (
+                       "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
+               g_set_error (err,
+                       winnow_error_quark (),          /* error domain */
+                       1,                              /* error code */
+                       "bad learn_threshold setting: %.2f",
+                       learn_threshold);
+               return FALSE;
+       }
+       /* If to_demote list is empty this message is already classified correctly */
+       if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
+               msg_info (
+                       "this message is already of class %s with threshold %.2f and weight %.2F",
+                       sel_st->symbol,
+                       learn_threshold,
+                       max);
+               goto end;
+       }
+       data.learn_file = to_learn;
+       end_value = max;
+       do {
+               cur = ctx->cfg->statfiles;
+               data.fresh_run = TRUE;
+               while (cur) {
+                       st = cur->data;
+                       data.sum = 0;
+                       data.count = 0;
+                       data.new_blocks = 0;
+                       data.start = 0;
+                       if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+                               /* Do not fail silently: callers expect err to be set */
+                               g_set_error (err,
+                                       winnow_error_quark (),          /* error domain */
+                                       1,                              /* error code */
+                                       "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+                                       st->path);
+                               g_list_free (to_demote);
+                               return FALSE;
+                       }
+                       if (to_demote != NULL &&
+                               g_list_find (to_demote, data.file) != NULL) {
+                               data.do_demote = TRUE;
+                       }
+                       else {
+                               data.do_demote = FALSE;
+                       }
+
+                       statfile_pool_lock_file (pool, data.file);
+                       g_tree_foreach (input, winnow_learn_callback, &data);
+                       statfile_pool_unlock_file (pool, data.file);
+                       if (data.count != 0) {
+                               res = data.sum / data.count;
+                       }
+                       else {
+                               res = 0;
+                       }
+                       if (res > max) {
+                               max = res;
+                               sel = data.file;
+                       }
+                       if (data.file == to_learn) {
+                               if (data.count > 0) {
+                                       start_value = data.start / data.count;
+                               }
+                               end_value = res;
+                       }
+                       cur = g_list_next (cur);
+                       data.fresh_run = FALSE;
+               }
+
+               /* Each next pass promotes and demotes more aggressively */
+               data.multiplier *= WINNOW_PROMOTION;
+               msg_info (
+                       "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f",
+                       iterations + 1,
+                       symbol,
+                       start_value,
+                       end_value,
+                       data.multiplier);
+       } while ((in_class ? sel != to_learn : sel ==
+               to_learn)  && iterations++ < MAX_LEARN_ITERATIONS);
+
+       if (iterations >= MAX_LEARN_ITERATIONS) {
+               msg_warn (
+                       "learning statfile %s  was not fully successfull: iterations count is limited to %d, final sum is %G",
+                       sel_st->symbol,
+                       MAX_LEARN_ITERATIONS,
+                       max);
+               g_set_error (err,
+                       winnow_error_quark (),          /* error domain */
+                       1,                              /* error code */
+                       "learning statfile %s  was not fully successfull: iterations count is limited to %d",
+                       sel_st->symbol, MAX_LEARN_ITERATIONS);
+               g_list_free (to_demote);
+               return FALSE;
+       }
+       else {
+               msg_info (
+                       "learned statfile %s successfully with %d iterations and sum %G",
+                       sel_st->symbol,
+                       iterations + 1,
+                       max);
+       }
+
+
+end:
+       /* The list holds borrowed statfile pointers, so only the cells are freed */
+       g_list_free (to_demote);
+
+       if (sum) {
+#ifdef HAVE_TANHL
+               *sum = (double)tanhl (max);
+#else
+               /*
+                * As some implementations of libm does not support tanhl, try to use
+                * tanh
+                */
+               *sum = tanh ((double) max);
+#endif
+       }
+       return TRUE;
+}
+
+/*
+ * Spam/ham learning entry of the classifier interface: winnow does not
+ * implement it, so the error is set and FALSE is returned unconditionally
+ */
+gboolean
+winnow_learn_spam (struct classifier_ctx * ctx,
+       statfile_pool_t *pool,
+       GTree *input,
+       struct rspamd_task *task,
+       gboolean is_spam,
+       lua_State *L,
+       GError **err)
+{
+       /* Error code 1 in the winnow error domain, the message has no arguments */
+       g_set_error_literal (err, winnow_error_quark (), 1,
+               "learn spam is not supported for winnow");
+
+       return FALSE;
+}
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
new file mode 100644 (file)
index 0000000..0e2bf86
--- /dev/null
@@ -0,0 +1,59 @@
+/* Copyright (c) 2015, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef STAT_API_H_
+#define STAT_API_H_
+
+#include "config.h"
+#include "task.h"
+
+/**
+ * @file stat_api.h
+ * High level statistics API
+ */
+
+/**
+ * Initialise statistics modules
+ * @param cfg rspamd configuration
+ */
+void rspamd_stat_init (struct rspamd_config *cfg);
+
+/**
+ * Classify the task specified and insert symbols if needed
+ * @param task task to classify
+ * @param err error return location (presumably set on failure - TODO confirm)
+ * @return TRUE if task has been classified
+ */
+gboolean rspamd_stat_classify (struct rspamd_task *task, GError **err);
+
+
+/**
+ * Learn task as spam or ham, task must be processed prior to this call
+ * @param task task to learn
+ * @param spam if TRUE learn spam, otherwise learn ham
+ * @param err error return location (presumably set on failure - TODO confirm)
+ * @return TRUE if task has been learned
+ */
+gboolean rspamd_stat_learn (struct rspamd_task *task, gboolean spam, GError **err);
+
+
+/**
+ * NOTE(review): not documented by the author; presumably the counterpart of
+ * rspamd_stat_init () - confirm against the implementation
+ */
+void rspamd_stat_unload (void);
+
+#endif /* STAT_API_H_ */
diff --git a/src/libstat/tokenizers.h b/src/libstat/tokenizers.h
new file mode 100644 (file)
index 0000000..ed47e0a
--- /dev/null
@@ -0,0 +1,64 @@
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "main.h"
+
+/* Size for features pipe */
+#define FEATURE_WINDOW_SIZE 5
+
+/* Single token; classifiers receive such nodes as keys of a GTree */
+typedef struct token_node_s {
+       guint32 h1;             /* first key of the token's block in a statfile */
+       guint32 h2;             /* second key of the token's block in a statfile */
+       double value;           /* weight of the token, updated by classifiers during learning */
+       uintptr_t extra;        /* scratch counter: winnow learning counts statfiles containing the token here */
+} token_node_t;
+
+/* Common tokenizer structure */
+struct tokenizer {
+       /* Name of the tokenizer (see get_tokenizer ()) */
+       gchar *name;
+       /* Tokenization routine; osb_tokenize_text () has this signature */
+       gint (*tokenize_func)(struct tokenizer *tokenizer,
+                       rspamd_mempool_t *pool,
+                       GArray *words,
+                       GTree **cur,
+                       gboolean save_token,
+                       gboolean is_utf,
+                       GList *exceptions);
+       /* Word extraction routine; rspamd_tokenizer_get_word () has this signature */
+       gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
+};
+
+/* Compare two token nodes */
+int token_node_compare_func (gconstpointer a, gconstpointer b);
+
+/* Get tokenizer structure by name or return NULL if this name is not found */
+struct tokenizer * get_tokenizer (const char *name);
+
+/* Get next word from specified rspamd_fstring_t buf */
+gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
+               rspamd_fstring_t *token, GList **exceptions);
+
+/* Tokenize text into array of words (rspamd_fstring_t type) */
+GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+               gsize min_len, GList **exceptions);
+
+/* OSB tokenize function */
+int osb_tokenize_text (struct tokenizer *tokenizer,
+       rspamd_mempool_t *pool,
+       GArray *input,
+       GTree **cur,
+       gboolean save_token,
+       gboolean is_utf,
+       GList *exceptions);
+
+/* Make tokens for a subject */
+void tokenize_subject (struct rspamd_task *task, GTree ** tree);
+
+/* Array of all defined tokenizers */
+extern struct tokenizer tokenizers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
new file mode 100644 (file)
index 0000000..9dd12a8
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/* Minimum length of token */
+#define MIN_LEN 4
+
+extern const int primes[];
+
+int
+osb_tokenize_text (struct tokenizer *tokenizer, /* not referenced by this implementation */
+       rspamd_mempool_t * pool,        /* nodes and word copies are allocated from it */
+       GArray * input, /* rspamd_fstring_t words; NULL makes the call return FALSE */
+       GTree ** tree,  /* created here, with a pool destructor, when *tree is NULL */
+       gboolean save_token,    /* keep a pool copy of the current word in node->extra */
+       gboolean is_utf,        /* handed to rspamd_fstrhash_lc */
+       GList *exceptions)      /* not referenced by this implementation */
+{
+       token_node_t *new = NULL;
+       rspamd_fstring_t *token = NULL;
+       guint32 hashpipe[FEATURE_WINDOW_SIZE], *newest, h1, h2;
+       gint i, processed = 0;
+       guint w;
+       /* Pairs the hash of the newest word with each earlier word still inside the window */
+       if (input == NULL) {
+               return FALSE;
+       }
+
+       if (*tree == NULL) {
+               *tree = g_tree_new (token_node_compare_func);
+               rspamd_mempool_add_destructor (pool,
+                       (rspamd_mempool_destruct_t) g_tree_destroy,
+                       *tree);
+       }
+
+       memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+
+       for (w = 0; w < input->len; w ++) {
+               token = &g_array_index (input, rspamd_fstring_t, w);
+
+               if (processed < FEATURE_WINDOW_SIZE) {
+                       /* Warm-up: fill slots from the last index down, no tokens yet */
+                       hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
+                               rspamd_fstrhash_lc (token, is_utf);
+               }
+               else {
+                       /* Drop the oldest hash and put the new one at index 0 */
+                       for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
+                               hashpipe[i] = hashpipe[i - 1];
+                       }
+                       hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+                       processed++;
+
+                       for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
+                               h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
+                               h2 = hashpipe[0] * primes[1] + hashpipe[i] *
+                                       primes[(i << 1) - 1];
+                               new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
+                               new->h1 = h1;
+                               new->h2 = h2;
+                               if (save_token) {
+                                       new->extra =
+                                               (uintptr_t)rspamd_mempool_fstrdup (pool, token);
+                               }
+
+                               if (g_tree_lookup (*tree, new) == NULL) {
+                                       g_tree_insert (*tree, new, new);
+                               }
+                       }
+               }
+       }
+
+       if (processed <= FEATURE_WINDOW_SIZE) {
+               /* Pipe never shifted: the newest word is at WINDOW - processed, slots below it hold filler */
+               newest = &hashpipe[FEATURE_WINDOW_SIZE - processed];
+               for (i = 1; i < processed; i++) {
+                       new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
+                       new->h1 = newest[0] * primes[0] + newest[i] * primes[i << 1];
+                       new->h2 = newest[0] * primes[1] + newest[i] * primes[(i << 1) - 1];
+                       if (save_token) {
+                               new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
+                       }
+
+                       if (g_tree_lookup (*tree, new) == NULL) {
+                               g_tree_insert (*tree, new, new);
+                       }
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
new file mode 100644 (file)
index 0000000..3e6c745
--- /dev/null
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "main.h"
+#include "tokenizers.h"
+
+struct tokenizer tokenizers[] = {       /* table searched by get_tokenizer () */
+       {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},     /* name, tokenize_func, get_next_word */
+};
+
+const int primes[] = {  /* hash multipliers; osb_tokenize_text reads indices 0..8 only */
+       1, 7,
+       3, 13,
+       5, 29,
+       11, 51,         /* 51 = 3 * 17: despite the array name not every entry is prime */
+       23, 101,
+       47, 203,
+       97, 407,
+       197, 817,
+       397, 1637,
+       797, 3277,      /* changing any value changes every h1/h2 that osb_tokenize_text produces */
+};
+
+const gchar t_delimiters[256] = {       /* 1 marks a delimiter byte; sized so that any guchar value 0..255 is a valid index */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+       1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
+       1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+       1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0   /* entry 255 is not listed and is therefore zero-initialized (not a delimiter) */
+};
+
+struct tokenizer *
+get_tokenizer (const char *name)        /* returns the tokenizers[] entry called `name`, or NULL */
+{
+       guint i;
+       /* strcmp with a NULL argument is undefined, so a missing name simply means "not found" */
+       for (i = 0; name != NULL && i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
+               if (strcmp (tokenizers[i].name, name) == 0) {
+                       return &tokenizers[i];
+               }
+       }
+
+       return NULL;
+}
+
+int
+token_node_compare_func (gconstpointer a, gconstpointer b)     /* orders nodes by h1, then by h2; returns -1, 0 or 1 */
+{
+       const token_node_t *aa = a, *bb = b;
+       /* Compare rather than subtract: the difference of two guint32 values can flip sign as an int */
+       if (aa->h1 == bb->h1) {
+               return (aa->h2 > bb->h2) - (aa->h2 < bb->h2);
+       }
+
+       return (aa->h1 > bb->h1) - (aa->h1 < bb->h1);
+}
+
+/* Put the next word of buf into token; returns the position to resume from, or NULL when scanning stops */
+gchar *
+rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
+{
+       gsize remain, pos;
+       guchar *p;
+       struct process_exception *ex = NULL;
+
+       if (buf == NULL) {
+               return NULL;
+       }
+
+       if (exceptions != NULL && *exceptions != NULL) {
+               ex = (*exceptions)->data;       /* only the head of the list is examined in one call */
+       }
+
+       if (token->begin == NULL) {     /* first call for this buf: choose the starting position */
+               if (ex != NULL) {
+                       if (ex->pos == 0) {
+                               token->begin = buf->begin + ex->len;
+                               token->len = ex->len;   /* has no effect: len is reset right after this block */
+                       }
+                       else {
+                               token->begin = buf->begin;
+                               token->len = 0;
+                       }
+               }
+               else {
+                       token->begin = buf->begin;
+                       token->len = 0;
+               }
+       }
+
+       token->len = 0;
+
+       pos = token->begin - buf->begin;
+       if (pos >= buf->len) {
+               return NULL;
+       }
+
+       remain = buf->len - pos;
+       p = token->begin;
+       /* Skip delimiters; NOTE(review): do-while steps over the byte at the start position without testing it */
+       do {
+               if (ex != NULL && ex->pos == pos) {
+                       /* Go to the next exception */
+                       *exceptions = g_list_next (*exceptions);
+                       return p + ex->len;     /* caller resumes ex->len bytes ahead */
+               }
+               pos++;
+               p++;
+               remain--;
+       } while (remain > 0 && t_delimiters[*p]);
+
+       token->begin = p;       /* first byte of the word */
+
+       while (remain > 0 && !t_delimiters[*p]) {       /* extend the word up to the next delimiter */
+               if (ex != NULL && ex->pos == pos) {
+                       *exceptions = g_list_next (*exceptions);
+                       return p + ex->len;
+               }
+               token->len++;
+               pos++;
+               remain--;
+               p++;
+       }
+
+       if (remain == 0) {
+               return NULL;    /* NOTE(review): token may hold a final word here, yet NULL makes rspamd_tokenize_text stop without storing it */
+       }
+
+       return p;
+}
+
+GArray *
+rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+               gsize min_len, GList **exceptions)
+{
+       rspamd_fstring_t src, word;
+       GArray *words;
+       gchar *next;
+       gsize nchars;
+
+       /* Nothing to split: callers get NULL rather than an empty array */
+       if (text == NULL || len == 0) {
+               return NULL;
+       }
+
+       src.begin = text;
+       src.len = len;
+       src.size = len;
+       word.begin = NULL;
+       word.len = 0;
+       words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+
+       for (;;) {
+               next = rspamd_tokenizer_get_word (&src, &word, exceptions);
+               if (next == NULL) {
+                       break;
+               }
+
+               /* Length is counted in characters for UTF-8 text, in bytes otherwise */
+               nchars = is_utf ? g_utf8_strlen (word.begin, word.len) : word.len;
+               if (min_len == 0 || nchars >= min_len) {
+                       g_array_append_val (words, word);
+               }
+
+               /* Continue scanning from where the splitter stopped */
+               word.begin = next;
+       }
+
+       return words;
+}
+
+
+void
+tokenize_subject (struct rspamd_task *task, GTree ** tree)
+{
+       struct tokenizer *osb;
+       GArray *subject_words;
+       gchar *subject;
+
+       /* Make sure the caller gets a tree even when there is no subject at all */
+       if (*tree == NULL) {
+               *tree = g_tree_new (token_node_compare_func);
+               rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+       }
+
+       osb = get_tokenizer ("osb-text");
+
+       /* A subject already stored in the task wins over the one in the MIME message */
+       subject = task->subject;
+       if (subject == NULL) {
+               subject = (gchar *)g_mime_message_get_subject (task->message);
+       }
+
+       if (subject == NULL) {
+               return;
+       }
+
+       subject_words = rspamd_tokenize_text (subject, strlen (subject), TRUE, 0, NULL);
+       if (subject_words == NULL) {
+               return;
+       }
+
+       /* The subject is always hashed as UTF-8 and the words are not saved in the nodes */
+       osb->tokenize_func (osb, task->task_pool,
+                       subject_words, tree,
+                       FALSE, TRUE, NULL);
+       g_array_free (subject_words, TRUE);
+}
+
+/*
+ * vi:ts=4
+ */
index 1edca48576b1a09dc78946296a88e1deb45f3778..346f5d64ba272ca9280cb4783af59c2288e6bf75 100644 (file)
@@ -25,7 +25,7 @@
 
 #include "lua_common.h"
 #include "cfg_file.h"
-#include "classifiers/classifiers.h"
+#include "classifiers.h"
 
 /* Classifier methods */
 LUA_FUNCTION_DEF (classifier, register_pre_callback);
index 3043013bae5a902f2fe3b5cab2db6003dde5c668..c1eec7655bf70a8cf883f103de2bd247e601b727 100644 (file)
@@ -29,7 +29,7 @@
 #include "message.h"
 #include "radix.h"
 #include "trie.h"
-#include "classifiers/classifiers.h"
+#include "classifiers.h"
 
 /***
  * This module is used to configure rspamd and is normally available as global
index 0a81e3d8bcbb95ab4631ee7c425af1bf80c2fcd4..4f1a461760113a498b6f748c45b411b899963f9c 100644 (file)
@@ -33,8 +33,8 @@
 #include "images.h"
 #include "cfg_file.h"
 #include "statfile.h"
-#include "tokenizers/tokenizers.h"
-#include "classifiers/classifiers.h"
+#include "tokenizers.h"
+#include "classifiers.h"
 #include "binlog.h"
 #include "statfile_sync.h"
 #include "diff.h"
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
deleted file mode 100644 (file)
index 9dd12a8..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * OSB tokenizer
- */
-
-#include <sys/types.h>
-#include "tokenizers.h"
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
-
-int
-osb_tokenize_text (struct tokenizer *tokenizer,
-       rspamd_mempool_t * pool,
-       GArray * input,
-       GTree ** tree,
-       gboolean save_token,
-       gboolean is_utf,
-       GList *exceptions)
-{
-       token_node_t *new = NULL;
-       rspamd_fstring_t *token;
-       guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
-       gint i, processed = 0;
-       guint w;
-
-       if (input == NULL) {
-               return FALSE;
-       }
-
-       if (*tree == NULL) {
-               *tree = g_tree_new (token_node_compare_func);
-               rspamd_mempool_add_destructor (pool,
-                       (rspamd_mempool_destruct_t) g_tree_destroy,
-                       *tree);
-       }
-
-       memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
-
-       for (w = 0; w < input->len; w ++) {
-               token = &g_array_index (input, rspamd_fstring_t, w);
-
-               if (processed < FEATURE_WINDOW_SIZE) {
-                       /* Just fill a hashpipe */
-                       hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
-                               rspamd_fstrhash_lc (token, is_utf);
-               }
-               else {
-                       /* Shift hashpipe */
-                       for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
-                               hashpipe[i] = hashpipe[i - 1];
-                       }
-                       hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
-                       processed++;
-
-                       for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
-                               h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                               h2 = hashpipe[0] * primes[1] + hashpipe[i] *
-                                       primes[(i << 1) - 1];
-                               new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
-                               new->h1 = h1;
-                               new->h2 = h2;
-                               if (save_token) {
-                                       new->extra =
-                                               (uintptr_t)rspamd_mempool_fstrdup (pool, token);
-                               }
-
-                               if (g_tree_lookup (*tree, new) == NULL) {
-                                       g_tree_insert (*tree, new, new);
-                               }
-                       }
-               }
-       }
-
-       if (processed <= FEATURE_WINDOW_SIZE) {
-               for (i = 1; i < processed; i++) {
-                       h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
-                       h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
-                       new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
-                       new->h1 = h1;
-                       new->h2 = h2;
-                       if (save_token) {
-                               new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
-                       }
-
-                       if (g_tree_lookup (*tree, new) == NULL) {
-                               g_tree_insert (*tree, new, new);
-                       }
-               }
-       }
-
-       return TRUE;
-}
-
-/*
- * vi:ts=4
- */
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
deleted file mode 100644 (file)
index 3e6c745..0000000
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Common tokenization functions
- */
-
-#include <sys/types.h>
-#include "main.h"
-#include "tokenizers.h"
-
-struct tokenizer tokenizers[] = {
-       {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
-};
-
-const int primes[] = {
-       1, 7,
-       3, 13,
-       5, 29,
-       11, 51,
-       23, 101,
-       47, 203,
-       97, 407,
-       197, 817,
-       397, 1637,
-       797, 3277,
-};
-
-const gchar t_delimiters[255] = {
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-       1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
-       1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-       1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       0, 0, 0, 0, 0
-};
-
-struct tokenizer *
-get_tokenizer (const char *name)
-{
-       guint i;
-
-       for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
-               if (strcmp (tokenizers[i].name, name) == 0) {
-                       return &tokenizers[i];
-               }
-       }
-
-       return NULL;
-}
-
-int
-token_node_compare_func (gconstpointer a, gconstpointer b)
-{
-       const token_node_t *aa = a, *bb = b;
-
-       if (aa->h1 == bb->h1) {
-               return aa->h2 - bb->h2;
-       }
-
-       return aa->h1 - bb->h1;
-}
-
-/* Get next word from specified f_str_t buf */
-gchar *
-rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
-{
-       gsize remain, pos;
-       guchar *p;
-       struct process_exception *ex = NULL;
-
-       if (buf == NULL) {
-               return NULL;
-       }
-
-       if (exceptions != NULL && *exceptions != NULL) {
-               ex = (*exceptions)->data;
-       }
-
-       if (token->begin == NULL) {
-               if (ex != NULL) {
-                       if (ex->pos == 0) {
-                               token->begin = buf->begin + ex->len;
-                               token->len = ex->len;
-                       }
-                       else {
-                               token->begin = buf->begin;
-                               token->len = 0;
-                       }
-               }
-               else {
-                       token->begin = buf->begin;
-                       token->len = 0;
-               }
-       }
-
-       token->len = 0;
-
-       pos = token->begin - buf->begin;
-       if (pos >= buf->len) {
-               return NULL;
-       }
-
-       remain = buf->len - pos;
-       p = token->begin;
-       /* Skip non delimiters symbols */
-       do {
-               if (ex != NULL && ex->pos == pos) {
-                       /* Go to the next exception */
-                       *exceptions = g_list_next (*exceptions);
-                       return p + ex->len;
-               }
-               pos++;
-               p++;
-               remain--;
-       } while (remain > 0 && t_delimiters[*p]);
-
-       token->begin = p;
-
-       while (remain > 0 && !t_delimiters[*p]) {
-               if (ex != NULL && ex->pos == pos) {
-                       *exceptions = g_list_next (*exceptions);
-                       return p + ex->len;
-               }
-               token->len++;
-               pos++;
-               remain--;
-               p++;
-       }
-
-       if (remain == 0) {
-               return NULL;
-       }
-
-       return p;
-}
-
-GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList **exceptions)
-{
-       rspamd_fstring_t token, buf;
-       gchar *pos;
-       gsize l;
-       GArray *res;
-
-       if (len == 0 || text == NULL) {
-               return NULL;
-       }
-
-       buf.begin = text;
-       buf.len = len;
-       buf.size = buf.len;
-       token.begin = NULL;
-       token.len = 0;
-
-       res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
-       while ((pos = rspamd_tokenizer_get_word (&buf,
-                       &token, exceptions)) != NULL) {
-               if (is_utf) {
-                       l = g_utf8_strlen (token.begin, token.len);
-               }
-               else {
-                       l = token.len;
-               }
-               if (min_len > 0 && l < min_len) {
-                       token.begin = pos;
-                       continue;
-               }
-               g_array_append_val (res, token);
-
-               token.begin = pos;
-       }
-
-       return res;
-}
-
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
-       gchar *sub;
-       struct tokenizer *osb_tokenizer;
-       GArray *words;
-
-       if (*tree == NULL) {
-               *tree = g_tree_new (token_node_compare_func);
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
-       }
-
-       osb_tokenizer = get_tokenizer ("osb-text");
-
-       /* Try to use pre-defined subject */
-       if (task->subject != NULL) {
-               sub = task->subject;
-       }
-       else {
-               sub = (gchar *)g_mime_message_get_subject (task->message);
-       }
-
-       if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
-               if (words != NULL) {
-                       osb_tokenizer->tokenize_func (osb_tokenizer,
-                                       task->task_pool,
-                                       words,
-                                       tree,
-                                       FALSE,
-                                       TRUE,
-                                       NULL);
-                       g_array_free (words, TRUE);
-               }
-       }
-}
-
-/*
- * vi:ts=4
- */
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
deleted file mode 100644 (file)
index ed47e0a..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef TOKENIZERS_H
-#define TOKENIZERS_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "main.h"
-
-/* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-typedef struct token_node_s {
-       guint32 h1;
-       guint32 h2;
-       double value;
-       uintptr_t extra;
-} token_node_t;
-
-/* Common tokenizer structure */
-struct tokenizer {
-       gchar *name;
-       gint (*tokenize_func)(struct tokenizer *tokenizer,
-                       rspamd_mempool_t *pool,
-                       GArray *words,
-                       GTree **cur,
-                       gboolean save_token,
-                       gboolean is_utf,
-                       GList *exceptions);
-       gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
-};
-
-/* Compare two token nodes */
-int token_node_compare_func (gconstpointer a, gconstpointer b);
-
-/* Get tokenizer structure by name or return NULL if this name is not found */
-struct tokenizer * get_tokenizer (const char *name);
-
-/* Get next word from specified f_str_t buf */
-gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
-               rspamd_fstring_t *token, GList **exceptions);
-
-/* Tokenize text into array of words (rspamd_fstring_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList **exceptions);
-
-/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer,
-       rspamd_mempool_t *pool,
-       GArray *input,
-       GTree **cur,
-       gboolean save_token,
-       gboolean is_utf,
-       GList *exceptions);
-
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
-/* Array of all defined tokenizers */
-extern struct tokenizer tokenizers[];
-
-#endif
-/*
- * vi:ts=4
- */