From: Vsevolod Stakhov Date: Fri, 16 Jan 2015 15:28:40 +0000 (+0000) Subject: Reorganize statfiles and classifiers into libstat. X-Git-Tag: 0.9.0~871 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=b5597411a2a4f9b46c0076ccddb95f8eacc1cb7f;p=rspamd.git Reorganize statfiles and classifiers into libstat. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 924cae91e..955a6fcbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -278,7 +278,7 @@ ENDMACRO() ############################# CONFIG SECTION ############################################# # Initial set -INCLUDE_DIRECTORIES(src/libutil src/libserver src/libmime) +INCLUDE_DIRECTORIES(src/libutil src/libserver src/libmime src/libstat) IF(CMAKE_INSTALL_PREFIX) SET(PREFIX ${CMAKE_INSTALL_PREFIX}) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ced57d20b..1c67416ac 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -66,6 +66,7 @@ ADD_SUBDIRECTORY(lua) ADD_SUBDIRECTORY(libutil) ADD_SUBDIRECTORY(libserver) ADD_SUBDIRECTORY(libmime) +ADD_SUBDIRECTORY(libstat) ADD_SUBDIRECTORY(client) SET(RSPAMDSRC ${CMAKE_CURRENT_BINARY_DIR}/modules.c @@ -97,6 +98,7 @@ IF(NOT DEBIAN_BUILD) SET_TARGET_PROPERTIES(rspamd PROPERTIES VERSION ${RSPAMD_VERSION}) ENDIF(NOT DEBIAN_BUILD) +TARGET_LINK_LIBRARIES(rspamd rspamd-stat) TARGET_LINK_LIBRARIES(rspamd rspamd-mime) TARGET_LINK_LIBRARIES(rspamd rspamd-server) TARGET_LINK_LIBRARIES(rspamd rspamd-util) diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c deleted file mode 100644 index 0afd3109c..000000000 --- a/src/classifiers/bayes.c +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Bayesian classifier - */ -#include "classifiers.h" -#include "tokenizers/tokenizers.h" -#include "main.h" -#include "filter.h" -#include "cfg_file.h" -#include "binlog.h" -#include "lua/lua_common.h" - -#define LOCAL_PROB_DENOM 16.0 - -static inline GQuark -bayes_error_quark (void) -{ - return g_quark_from_static_string ("bayes-error"); -} - -struct bayes_statfile_data { - guint64 hits; - guint64 total_hits; - double value; - struct rspamd_statfile_config *st; - stat_file_t *file; -}; - -struct bayes_callback_data { - statfile_pool_t *pool; - struct classifier_ctx *ctx; - gboolean in_class; - time_t now; - stat_file_t *file; - struct bayes_statfile_data *statfiles; - guint32 statfiles_num; - guint64 total_spam; - guint64 total_ham; - guint64 processed_tokens; - gsize max_tokens; - double spam_probability; - double ham_probability; -}; - -static gboolean -bayes_learn_callback (gpointer key, gpointer value, gpointer data) -{ - token_node_t *node = key; - struct bayes_callback_data *cd = data; - gint c; - guint64 v; - - c = (cd->in_class) ? 1 : -1; - - /* Consider that not found blocks have value 1 */ - v = - statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, - cd->now); - if (v == 0 && c > 0) { - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - c); - cd->processed_tokens++; - } - else if (v != 0) { - if (G_LIKELY (c > 0)) { - v++; - } - else if (c < 0) { - if (v != 0) { - v--; - } - } - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - v); - cd->processed_tokens++; - } - - if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) { - /* Stop learning on max tokens */ - return TRUE; - } - return FALSE; -} - -/** - * Returns probability of chisquare > value with specified number of freedom - * degrees - * @param value value to test - * @param freedom_deg number of degrees of freedom - * @return - */ -static gdouble -inv_chi_square (gdouble value, gint freedom_deg) -{ - long double prob, sum; - gint i; - - if ((freedom_deg & 1) != 0) { - msg_err ("non-odd freedom degrees count: %d", freedom_deg); - return 0; - } - - value /= 2.; - errno = 0; -#ifdef HAVE_EXPL - prob = expl (-value); -#elif defined(HAVE_EXP2L) - prob = exp2l (-value * log2 (M_E)); -#else - prob = exp (-value); -#endif - if (errno == ERANGE) { - msg_err ("exp overflow"); - return 0; - } - sum = prob; - for (i = 1; i < freedom_deg / 2; i++) { - prob *= value / (gdouble)i; - sum += prob; - } - - return MIN (1.0, sum); -} - -/* - * In this callback we calculate local probabilities for tokens - */ -static gboolean -bayes_classify_callback (gpointer key, gpointer value, gpointer data) -{ - - token_node_t *node = key; - struct bayes_callback_data *cd = data; - guint i; - struct bayes_statfile_data *cur; - guint64 spam_count = 0, ham_count = 0, total_count = 0; - double spam_prob, spam_freq, ham_freq, bayes_spam_prob; - - for (i = 0; i < cd->statfiles_num; i++) { - cur = &cd->statfiles[i]; - cur->value = statfile_pool_get_block (cd->pool, - cur->file, - node->h1, - node->h2, - cd->now); - if (cur->value > 0) { - cur->total_hits += cur->value; - if (cur->st->is_spam) { - spam_count += cur->value; - } - else { - ham_count += cur->value; - } - total_count += cur->value; - } - } - - /* Probability for this token */ - if (total_count > 0) { - spam_freq = ((double)spam_count / MAX (1., (double)cd->total_spam)); - ham_freq = ((double)ham_count / MAX (1., (double)cd->total_ham)); - spam_prob = spam_freq / (spam_freq + ham_freq); - bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count); - cd->spam_probability += log (bayes_spam_prob); - cd->ham_probability += log (1. - bayes_spam_prob); - cd->processed_tokens++; - } - - if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) { - /* Stop classifying on max tokens */ - return TRUE; - } - - return FALSE; -} - -struct classifier_ctx * -bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg) -{ - struct classifier_ctx *ctx = - rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); - - ctx->pool = pool; - ctx->cfg = cfg; - ctx->debug = FALSE; - - return ctx; -} - -gboolean -bayes_classify (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - lua_State *L) -{ - struct bayes_callback_data data; - gchar *value; - gint nodes, i = 0, selected_st = -1, cnt; - gint minnodes; - guint64 maxhits = 0, rev; - double final_prob, h, s; - struct rspamd_statfile_config *st; - stat_file_t *file; - GList *cur; - char *sumbuf; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - return FALSE; - } - } - - cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); - if (cur) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, cur); - } - else { - cur = ctx->cfg->statfiles; - } - - data.statfiles_num = g_list_length (cur); - data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num); - data.pool = pool; - data.now = time (NULL); - data.ctx = ctx; - - data.processed_tokens = 0; - data.spam_probability = 0; - data.ham_probability = 0; - data.total_ham = 0; - data.total_spam = 0; - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { - minnodes = rspamd_config_parse_limit (value, -1); - data.max_tokens = minnodes; - } - else { - data.max_tokens = 0; - } - - while (cur) { - /* Select statfile to classify */ - st = cur->data; - if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { - msg_warn ("cannot open %s", st->path); - cur = g_list_next (cur); - data.statfiles_num--; - continue; - } - } - data.statfiles[i].file = file; - data.statfiles[i].st = st; - statfile_get_revision (file, &rev, NULL); - if (st->is_spam) { - data.total_spam += rev; - } - else { - data.total_ham += rev; - } - - cur = g_list_next (cur); - i++; - } - - cnt = i; - - g_tree_foreach (input, bayes_classify_callback, &data); - - if (data.processed_tokens == 0 || data.spam_probability == 0) { - final_prob = 0; - } - else { - h = 1 - inv_chi_square (-2. * data.spam_probability, - 2 * data.processed_tokens); - s = 1 - inv_chi_square (-2. * data.ham_probability, - 2 * data.processed_tokens); - final_prob = (s + 1 - h) / 2.; - } - - if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { - - sumbuf = rspamd_mempool_alloc (task->task_pool, 32); - for (i = 0; i < cnt; i++) { - if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) || - (final_prob < 0.5 && data.statfiles[i].st->is_spam)) { - continue; - } - if (data.statfiles[i].total_hits > maxhits) { - maxhits = data.statfiles[i].total_hits; - selected_st = i; - } - } - if (selected_st == -1) { - msg_err ( - "unexpected classifier error: cannot select desired statfile"); - } - else { - /* Calculate ham probability correctly */ - if (final_prob < 0.5) { - final_prob = 1. - final_prob; - } - rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.); - cur = g_list_prepend (NULL, sumbuf); - rspamd_task_insert_result (task, - data.statfiles[selected_st].st->symbol, - final_prob, - cur); - } - } - - g_free (data.statfiles); - - return TRUE; -} - -gboolean -bayes_learn (struct classifier_ctx * ctx, - statfile_pool_t *pool, - const char *symbol, - GTree *input, - gboolean in_class, - double *sum, - double multiplier, - GError **err) -{ - struct bayes_callback_data data; - gchar *value; - gint nodes; - gint minnodes; - struct rspamd_statfile_config *st, *sel_st = NULL; - stat_file_t *to_learn; - GList *cur; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not learn message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - *sum = 0; - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "message contains too few tokens: %d, while min is %d", - nodes, (int)minnodes); - return FALSE; - } - } - - data.pool = pool; - data.in_class = in_class; - data.now = time (NULL); - data.ctx = ctx; - data.processed_tokens = 0; - data.processed_tokens = 0; - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { - minnodes = rspamd_config_parse_limit (value, -1); - data.max_tokens = minnodes; - } - else { - data.max_tokens = 0; - } - cur = ctx->cfg->statfiles; - while (cur) { - /* Select statfile to learn */ - st = cur->data; - if (strcmp (st->symbol, symbol) == 0) { - sel_st = st; - break; - } - cur = g_list_next (cur); - } - if (sel_st == NULL) { - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot find statfile for symbol: %s", - symbol); - return FALSE; - } - if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) { - if ((to_learn = - statfile_pool_open (pool, sel_st->path, sel_st->size, - FALSE)) == NULL) { - msg_warn ("cannot open %s", sel_st->path); - if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) { - msg_err ("cannot create statfile %s", sel_st->path); - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot create statfile: %s", - sel_st->path); - return FALSE; - } - if ((to_learn = - statfile_pool_open (pool, sel_st->path, sel_st->size, - FALSE)) == NULL) { - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot open statfile %s after creation", - sel_st->path); - msg_err ("cannot open statfile %s after creation", - sel_st->path); - return FALSE; - } - } - } - data.file = to_learn; - statfile_pool_lock_file (pool, data.file); - g_tree_foreach (input, bayes_learn_callback, &data); - statfile_inc_revision (to_learn); - statfile_pool_unlock_file (pool, data.file); - - if (sum != NULL) { - *sum = data.processed_tokens; - } - - return TRUE; -} - -gboolean -bayes_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err) -{ - struct bayes_callback_data data; - gchar *value; - gint nodes; - gint minnodes; - struct rspamd_statfile_config *st; - stat_file_t *file; - GList *cur; - gboolean skip_labels; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "message contains too few tokens: %d, while min is %d", - nodes, (int)minnodes); - return FALSE; - } - } - - cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L); - if (cur) { - skip_labels = FALSE; - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, cur); - } - else { - /* Do not try to learn specific statfiles if pre callback returned nil */ - skip_labels = TRUE; - cur = ctx->cfg->statfiles; - } - - data.pool = pool; - data.now = time (NULL); - data.ctx = ctx; - data.in_class = TRUE; - - data.processed_tokens = 0; - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { - minnodes = rspamd_config_parse_limit (value, -1); - data.max_tokens = minnodes; - } - else { - data.max_tokens = 0; - } - - while (cur) { - /* Select statfiles to learn */ - st = cur->data; - if (st->is_spam != is_spam || (skip_labels && st->label)) { - cur = g_list_next (cur); - continue; - } - if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { - msg_warn ("cannot open %s", st->path); - if (statfile_pool_create (pool, st->path, st->size) == -1) { - msg_err ("cannot create statfile %s", st->path); - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot create statfile: %s", - st->path); - return FALSE; - } - if ((file = - statfile_pool_open (pool, st->path, st->size, - FALSE)) == NULL) { - g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot open statfile %s after creation", - st->path); - msg_err ("cannot open statfile %s after creation", - st->path); - return FALSE; - } - } - } - data.file = file; - statfile_pool_lock_file (pool, data.file); - g_tree_foreach (input, bayes_learn_callback, &data); - statfile_inc_revision (file); - statfile_pool_unlock_file (pool, data.file); - maybe_write_binlog (ctx->cfg, st, file, input); - msg_info ("increase revision for %s", st->path); - - cur = g_list_next (cur); - } - - return TRUE; -} - -GList * -bayes_weights (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task) -{ - /* This function is unimplemented with new normalizer */ - return NULL; -} diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c deleted file mode 100644 index 95dd52c44..000000000 --- a/src/classifiers/classifiers.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Common classifier functions - */ - -#include "classifiers.h" - -struct classifier classifiers[] = { - { - .name = "winnow", - .init_func = winnow_init, - .classify_func = winnow_classify, - .learn_func = winnow_learn, - .learn_spam_func = winnow_learn_spam, - .weights_func = winnow_weights - }, - { - .name = "bayes", - .init_func = bayes_init, - .classify_func = bayes_classify, - .learn_func = bayes_learn, - .learn_spam_func = bayes_learn_spam, - .weights_func = bayes_weights - } -}; - -struct classifier * -get_classifier (const char *name) -{ - guint i; - - for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) { - if (strcmp (classifiers[i].name, name) == 0) { - return &classifiers[i]; - } - } - - return NULL; -} - -/* - * vi:ts=4 - */ diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h deleted file mode 100644 index 8e59fc555..000000000 --- a/src/classifiers/classifiers.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef CLASSIFIERS_H -#define CLASSIFIERS_H - -#include "config.h" -#include "mem_pool.h" -#include "statfile.h" -#include "tokenizers/tokenizers.h" -#include - -/* Consider this value as 0 */ -#define ALPHA 0.0001 - -struct rspamd_classifier_config; -struct rspamd_task; - -struct classifier_ctx { - rspamd_mempool_t *pool; - GHashTable *results; - gboolean debug; - struct rspamd_classifier_config *cfg; -}; - -struct classify_weight { - const char *name; - long double weight; -}; - -/* Common classifier structure */ -struct classifier { - char *name; - struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool, - struct rspamd_classifier_config *cf); - gboolean (*classify_func)(struct classifier_ctx * ctx, - statfile_pool_t *pool, GTree *input, struct rspamd_task *task, - lua_State *L); - gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool, - const char *symbol, GTree *input, gboolean in_class, - double *sum, double multiplier, GError **err); - gboolean (*learn_spam_func)(struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, - GError **err); - GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool, - GTree *input, struct rspamd_task *task); -}; - -/* Get classifier structure by name or return NULL if this name is not found */ -struct classifier * get_classifier (const char *name); - -/* Winnow algorithm */ -struct classifier_ctx * winnow_init (rspamd_mempool_t *pool, - struct rspamd_classifier_config *cf); -gboolean winnow_classify (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - lua_State *L); -gboolean winnow_learn (struct classifier_ctx * ctx, - statfile_pool_t *pool, - const char *symbol, - GTree *input, - gboolean in_class, - double *sum, - double multiplier, - GError **err); -gboolean winnow_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err); -GList * winnow_weights (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task); - -/* Bayes algorithm */ -struct classifier_ctx * bayes_init (rspamd_mempool_t *pool, - struct rspamd_classifier_config *cf); -gboolean bayes_classify (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - lua_State *L); -gboolean bayes_learn (struct classifier_ctx * ctx, - statfile_pool_t *pool, - const char *symbol, - GTree *input, - gboolean in_class, - double *sum, - double multiplier, - GError **err); -gboolean bayes_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err); -GList * bayes_weights (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task); -/* Array of all defined classifiers */ -extern struct classifier classifiers[]; - -#endif -/* - * vi:ts=4 - */ diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c deleted file mode 100644 index 4bfe086bf..000000000 --- a/src/classifiers/winnow.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Winnow classifier - */ - -#include "classifiers.h" -#include "tokenizers/tokenizers.h" -#include "main.h" -#include "filter.h" -#include "cfg_file.h" -#include "lua/lua_common.h" - -#define WINNOW_PROMOTION 1.23 -#define WINNOW_DEMOTION 0.83 - -#define MEDIAN_WINDOW_SIZE 5 - -#define MAX_WEIGHT G_MAXDOUBLE / 2. - - - -#define MAX_LEARN_ITERATIONS 100 - -static inline GQuark -winnow_error_quark (void) -{ - return g_quark_from_static_string ("winnow-error"); -} - -struct winnow_callback_data { - statfile_pool_t *pool; - struct classifier_ctx *ctx; - stat_file_t *file; - stat_file_t *learn_file; - long double sum; - long double start; - double multiplier; - guint32 count; - guint32 new_blocks; - gboolean in_class; - gboolean do_demote; - gboolean fresh_run; - time_t now; -}; - -static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION; - - - -static gboolean -winnow_classify_callback (gpointer key, gpointer value, gpointer data) -{ - token_node_t *node = key; - struct winnow_callback_data *cd = data; - double v; - - /* Consider that not found blocks have value 1 */ - v = - statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, - cd->now); - if (fabs (v) > ALPHA) { - cd->sum += v; - } - else { - cd->sum += 1.0; - cd->new_blocks++; - } - - cd->count++; - - return FALSE; -} - -static gboolean -winnow_learn_callback (gpointer key, gpointer value, gpointer data) -{ - token_node_t *node = key; - struct winnow_callback_data *cd = data; - double v, c; - - c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION / - cd->multiplier; - - /* Consider that not found blocks have value 1 */ - v = - statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, - cd->now); - if (fabs (v) < ALPHA) { - /* Block not found, insert new */ - cd->start += 1; - if (cd->file == cd->learn_file) { - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - c); - node->value = c; - cd->new_blocks++; - } - } - else { - cd->start += v; - /* Here we just increase the extra value of block */ - if (cd->fresh_run) { - node->extra = 0; - } - else { - node->extra++; - } - node->value = v; - - if (node->extra > 1) { - /* - * Assume that this node is common for several statfiles, so - * decrease its weight proportianally - */ - if (node->value > max_common_weight) { - /* Static fluctuation */ - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - 0.); - node->value = 0.; - } - else if (node->value > WINNOW_PROMOTION * cd->multiplier) { - /* Try to decrease its value */ - /* XXX: it is more intelligent to add some adaptive filter here */ - if (cd->file == cd->learn_file) { - if (node->value > max_common_weight / 2.) { - node->value *= c; - } - else { - /* - * Too high token value that exists also in other - * statfiles, may be statistic error, so decrease it - * slightly - */ - node->value *= WINNOW_DEMOTION; - } - } - else { - node->value = WINNOW_DEMOTION / cd->multiplier; - } - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - node->value); - } - } - else if (cd->file == cd->learn_file) { - /* New block or block that is in only one statfile */ - /* Set some limit on growing */ - if (v > MAX_WEIGHT) { - node->value = v; - } - else { - node->value *= c; - } - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - node->value); - } - else if (cd->do_demote) { - /* Demote blocks in file */ - node->value *= WINNOW_DEMOTION / cd->multiplier; - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - node->value); - } - } - - - cd->sum += node->value; - - cd->count++; - - return FALSE; -} - -struct classifier_ctx * -winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg) -{ - struct classifier_ctx *ctx = - rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); - - ctx->pool = pool; - ctx->cfg = cfg; - - return ctx; -} - -gboolean -winnow_classify (struct classifier_ctx *ctx, - statfile_pool_t * pool, - GTree * input, - struct rspamd_task *task, - lua_State *L) -{ - struct winnow_callback_data data; - char *sumbuf, *value; - long double res = 0., max = 0.; - GList *cur; - struct rspamd_statfile_config *st, *sel = NULL; - int nodes, minnodes; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - data.pool = pool; - data.now = time (NULL); - data.ctx = ctx; - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not classify message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - return FALSE; - } - } - - cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); - if (cur) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, cur); - } - else { - cur = ctx->cfg->statfiles; - } - - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - data.new_blocks = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((data.file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { - msg_warn ("cannot open %s, skip it", st->path); - cur = g_list_next (cur); - continue; - } - } - - if (data.file != NULL) { - g_tree_foreach (input, winnow_classify_callback, &data); - } - - if (data.count != 0) { - res = data.sum / (double)data.count; - } - else { - res = 0; - } - if (res > max) { - max = res; - sel = st; - } - cur = g_list_next (cur); - } - - if (sel != NULL) { -#ifdef WITH_LUA - max = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, max, L); -#endif -#ifdef HAVE_TANHL - max = tanhl (max); -#else - /* - * As some implementations of libm does not support tanhl, try to use - * tanh - */ - max = tanh ((double) max); -#endif - sumbuf = rspamd_mempool_alloc (task->task_pool, 32); - rspamd_snprintf (sumbuf, 32, "%.2F", max); - cur = g_list_prepend (NULL, sumbuf); - rspamd_task_insert_result (task, sel->symbol, max, cur); - } - - return TRUE; -} - -GList * -winnow_weights (struct classifier_ctx *ctx, - statfile_pool_t * pool, - GTree * input, - struct rspamd_task *task) -{ - struct winnow_callback_data data; - long double res = 0.; - GList *cur, *resl = NULL; - struct rspamd_statfile_config *st; - struct classify_weight *w; - char *value; - int nodes, minnodes; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - data.pool = pool; - data.now = time (NULL); - data.ctx = ctx; - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not classify message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - return NULL; - } - } - - cur = ctx->cfg->statfiles; - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((data.file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { - msg_warn ("cannot open %s, skip it", st->path); - cur = g_list_next (cur); - continue; - } - } - - if (data.file != NULL) { - g_tree_foreach (input, winnow_classify_callback, &data); - } - - w = - rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct classify_weight)); - if (data.count != 0) { - res = data.sum / (double)data.count; - } - else { - res = 0; - } - w->name = st->symbol; - w->weight = res; - resl = g_list_prepend (resl, w); - cur = g_list_next (cur); - } - - if (resl != NULL) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, resl); - } - - return resl; - -} - - -gboolean -winnow_learn (struct classifier_ctx *ctx, - statfile_pool_t *pool, - const char *symbol, - GTree * input, - int in_class, - double *sum, - double multiplier, - GError **err) -{ - struct winnow_callback_data data = { - .file = NULL, - .multiplier = multiplier - }; - char *value; - int nodes, minnodes, iterations = 0; - struct rspamd_statfile_config *st, *sel_st = NULL; - stat_file_t *sel = NULL, *to_learn; - long double res = 0., max = 0., start_value = 0., end_value = 0.; - double learn_threshold = 0.0; - GList *cur, *to_demote = NULL; - gboolean force_learn = FALSE; - - g_assert (pool != NULL); - g_assert (ctx != NULL); - - data.pool = pool; - data.in_class = in_class; - data.now = time (NULL); - data.ctx = ctx; - - - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { - minnodes = strtol (value, NULL, 10); - nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } - if (nodes < minnodes) { - msg_info ( - "do not learn message as it has too few tokens: %d, while %d min", - nodes, - minnodes); - if (sum != NULL) { - *sum = 0; - } - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "message contains too few tokens: %d, while min is %d", - nodes, minnodes); - return FALSE; - } - } - if (ctx->cfg->opts && - (value = - g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) { - learn_threshold = strtod (value, NULL); - } - - if (learn_threshold <= 1.0 && learn_threshold >= 0) { - /* Classify message and check target statfile score */ - cur = ctx->cfg->statfiles; - while (cur) { - /* Open or create all statfiles inside classifier */ - st = cur->data; - if (statfile_pool_is_open (pool, st->path) == NULL) { - if (statfile_pool_open (pool, st->path, st->size, - FALSE) == NULL) { - msg_warn ("cannot open %s", st->path); - if (statfile_pool_create (pool, st->path, st->size) == -1) { - msg_err ("cannot create statfile %s", st->path); - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "cannot create statfile: %s", - st->path); - return FALSE; - } - if (statfile_pool_open (pool, st->path, st->size, - FALSE) == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "open statfile %s after creation", - st->path); - msg_err ("cannot open statfile %s after creation", - st->path); - return FALSE; - } - } - } - if (strcmp (st->symbol, symbol) == 0) { - sel_st = st; - - } - cur = g_list_next (cur); - } - - if (sel_st == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "cannot find statfile for symbol %s", - symbol); - msg_err ("cannot find statfile for symbol %s", symbol); - return FALSE; - } - - to_learn = statfile_pool_is_open (pool, sel_st->path); - if (to_learn == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles", - sel_st->path); - return FALSE; - } - /* Check target statfile */ - data.file = to_learn; - data.sum = 0; - data.count = 0; - data.new_blocks = 0; - g_tree_foreach (input, winnow_classify_callback, &data); - if (data.count > 0) { - max = data.sum / (double)data.count; - } - else { - max = 0; - } - /* If most of blocks are not presented in targeted statfile do forced learn */ - if (max < 1 + learn_threshold) { - force_learn = TRUE; - } - /* Check other statfiles */ - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles", - st->path); - return FALSE; - } - g_tree_foreach (input, winnow_classify_callback, &data); - if (data.count != 0) { - res = data.sum / data.count; - } - else { - res = 0; - } - if (to_learn != data.file && res - max > 1 - learn_threshold) { - /* Demote tokens in this statfile */ - to_demote = g_list_prepend (to_demote, data.file); - } - cur = g_list_next (cur); - } - } - else { - msg_err ( - "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration"); - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "bad learn_threshold setting: %.2f", - learn_threshold); - return FALSE; - } - /* If to_demote list is empty this message is already classified correctly */ - if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) { - msg_info ( - "this message is already of class %s with threshold %.2f and weight %.2F", - sel_st->symbol, - learn_threshold, - max); - goto end; - } - data.learn_file = to_learn; - end_value = max; - do { - cur = ctx->cfg->statfiles; - data.fresh_run = TRUE; - while (cur) { - st = cur->data; - data.sum = 0; - data.count = 0; - data.new_blocks = 0; - data.start = 0; - if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { - return FALSE; - } - if (to_demote != NULL && - g_list_find (to_demote, data.file) != NULL) { - data.do_demote = TRUE; - } - else { - data.do_demote = FALSE; - } - - statfile_pool_lock_file (pool, data.file); - g_tree_foreach (input, winnow_learn_callback, &data); - statfile_pool_unlock_file (pool, data.file); - if (data.count != 0) { - res = data.sum / data.count; - } - else { - res = 0; - } - if (res > max) { - max = res; - sel = data.file; - } - if (data.file == to_learn) { - if (data.count > 0) { - start_value = data.start / data.count; - } - end_value = res; - } - cur = g_list_next (cur); - data.fresh_run = FALSE; - } - - data.multiplier *= WINNOW_PROMOTION; - msg_info ( - "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f", - iterations + 1, - symbol, - start_value, - end_value, - data.multiplier); - } while ((in_class ? sel != to_learn : sel == - to_learn) && iterations++ < MAX_LEARN_ITERATIONS); - - if (iterations >= MAX_LEARN_ITERATIONS) { - msg_warn ( - "learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G", - sel_st->symbol, - MAX_LEARN_ITERATIONS, - max); - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "learning statfile %s was not fully successfull: iterations count is limited to %d", - sel_st->symbol, MAX_LEARN_ITERATIONS); - return FALSE; - } - else { - msg_info ( - "learned statfile %s successfully with %d iterations and sum %G", - sel_st->symbol, - iterations + 1, - max); - } - - -end: - if (sum) { -#ifdef HAVE_TANHL - *sum = (double)tanhl (max); -#else - /* - * As some implementations of libm does not support tanhl, try to use - * tanh - */ - *sum = tanh ((double) max); -#endif - } - return TRUE; -} - -gboolean -winnow_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err) -{ - g_set_error (err, - winnow_error_quark (), /* error domain */ - 1, /* error code */ - "learn spam is not supported for winnow" - ); - return FALSE; -} diff --git a/src/controller.c b/src/controller.c index 33422f782..3b6436490 100644 --- a/src/controller.c +++ b/src/controller.c @@ -23,8 +23,8 @@ #include "config.h" -#include "tokenizers/tokenizers.h" -#include "classifiers/classifiers.h" +#include "tokenizers.h" +#include "classifiers.h" #include "libserver/dynamic_cfg.h" #include "libutil/rrd.h" #include "libutil/map.h" diff --git a/src/libmime/filter.c b/src/libmime/filter.c index a1f042aae..48285ea0a 100644 --- a/src/libmime/filter.c +++ b/src/libmime/filter.c @@ -32,8 +32,8 @@ #include "expressions.h" #include "binlog.h" #include "diff.h" -#include "classifiers/classifiers.h" -#include "tokenizers/tokenizers.h" +#include "classifiers.h" +#include "tokenizers.h" #ifdef WITH_LUA # include "lua/lua_common.h" diff --git a/src/libmime/message.c b/src/libmime/message.c index 94137af15..682e0cf82 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -30,7 +30,7 @@ #include "html.h" #include "images.h" #include "utlist.h" -#include "tokenizers/tokenizers.h" +#include "tokenizers.h" #include diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt index 8696da7ba..307611301 100644 --- a/src/libserver/CMakeLists.txt +++ b/src/libserver/CMakeLists.txt @@ -21,13 +21,6 @@ SET(LIBRSPAMDSERVERSRC url.c worker_util.c) -SET(TOKENIZERSSRC ../tokenizers/tokenizers.c - ../tokenizers/osb.c) - -SET(CLASSIFIERSSRC ../classifiers/classifiers.c - ../classifiers/bayes.c - ../classifiers/winnow.c) - # Librspamd-server #IF(WITH_DB) @@ -37,7 +30,7 @@ SET(CLASSIFIERSSRC ../classifiers/classifiers.c # LIST(APPEND LIBRSPAMDSERVERSRC kvstorage_sqlite.c) #ENDIF(WITH_SQLITE) -ADD_LIBRARY(rspamd-server ${LINK_TYPE} ${LIBRSPAMDSERVERSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC}) +ADD_LIBRARY(rspamd-server ${LINK_TYPE} ${LIBRSPAMDSERVERSRC}) IF(NOT DEBIAN_BUILD) SET_TARGET_PROPERTIES(rspamd-server PROPERTIES VERSION ${RSPAMD_VERSION}) ENDIF(NOT DEBIAN_BUILD) diff --git a/src/libserver/binlog.c b/src/libserver/binlog.c index 5eeae7ac2..c48016339 100644 --- a/src/libserver/binlog.c +++ b/src/libserver/binlog.c @@ -25,7 +25,7 @@ #include "config.h" #include "binlog.h" #include "cfg_file.h" -#include "tokenizers/tokenizers.h" +#include "tokenizers.h" #define BINLOG_SUFFIX ".binlog" #define BACKUP_SUFFIX ".old" diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index 13ef400ed..44db06a0b 100644 --- a/src/libserver/cfg_rcl.c +++ b/src/libserver/cfg_rcl.c @@ -28,8 +28,8 @@ #include "cfg_file.h" #include "lua/lua_common.h" #include "expressions.h" -#include "classifiers/classifiers.h" -#include "tokenizers/tokenizers.h" +#include "classifiers.h" +#include "tokenizers.h" struct rspamd_rcl_default_handler_data { diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c index e28f6445e..b53a2690c 100644 --- a/src/libserver/cfg_utils.c +++ b/src/libserver/cfg_utils.c @@ -29,7 +29,7 @@ #include "main.h" #include "uthash_strcase.h" #include "filter.h" -#include "classifiers/classifiers.h" +#include "classifiers.h" #include "lua/lua_common.h" #include "kvstorage_config.h" #include "map.h" diff --git a/src/libserver/statfile_sync.c b/src/libserver/statfile_sync.c index 23ed96e12..62f848059 100644 --- a/src/libserver/statfile_sync.c +++ b/src/libserver/statfile_sync.c @@ -24,8 +24,8 @@ #include "config.h" #include "cfg_file.h" -#include "tokenizers/tokenizers.h" -#include "classifiers/classifiers.h" +#include "tokenizers.h" +#include "classifiers.h" #include "statfile.h" #include "binlog.h" #include "buffer.h" diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt new file mode 100644 index 000000000..6254a41a6 --- /dev/null +++ b/src/libstat/CMakeLists.txt @@ -0,0 +1,27 @@ +# Librspamdserver +SET(LIBSTATSRC + ) +SET(TOKENIZERSSRC tokenizers/tokenizers.c + tokenizers/osb.c) + +SET(CLASSIFIERSSRC classifiers/classifiers.c + classifiers/bayes.c + classifiers/winnow.c) + +ADD_LIBRARY(rspamd-stat ${LINK_TYPE} ${LIBSTATSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC}) +IF(NOT DEBIAN_BUILD) + SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES VERSION ${RSPAMD_VERSION}) +ENDIF(NOT DEBIAN_BUILD) +SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES LINKER_LANGUAGE C COMPILE_FLAGS "-DRSPAMD_LIB") +TARGET_LINK_LIBRARIES(rspamd-stat rspamd-server) + +IF(CMAKE_COMPILER_IS_GNUCC) +SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES COMPILE_FLAGS "-DRSPAMD_LIB -fno-strict-aliasing") +ENDIF(CMAKE_COMPILER_IS_GNUCC) + + +IF(NO_SHARED MATCHES "OFF") + INSTALL(TARGETS rspamd-stat + LIBRARY DESTINATION ${LIBDIR} + PUBLIC_HEADER DESTINATION ${INCLUDEDIR}) +ENDIF(NO_SHARED MATCHES "OFF") diff --git a/src/libstat/classifiers.h b/src/libstat/classifiers.h new file mode 100644 index 000000000..fd1b63bcf --- /dev/null +++ b/src/libstat/classifiers.h @@ -0,0 +1,111 @@ +#ifndef CLASSIFIERS_H +#define CLASSIFIERS_H + +#include "config.h" +#include "mem_pool.h" +#include "statfile.h" +#include "tokenizers.h" +#include + +/* Consider this value as 0 */ +#define ALPHA 0.0001 + +struct rspamd_classifier_config; +struct rspamd_task; + +struct classifier_ctx { + rspamd_mempool_t *pool; + GHashTable *results; + gboolean debug; + struct rspamd_classifier_config *cfg; +}; + +struct classify_weight { + const char *name; + long double weight; +}; + +/* Common classifier structure */ +struct classifier { + char *name; + struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool, + struct rspamd_classifier_config *cf); + gboolean (*classify_func)(struct classifier_ctx * ctx, + statfile_pool_t *pool, GTree *input, struct rspamd_task *task, + lua_State *L); + gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool, + const char *symbol, GTree *input, gboolean in_class, + double *sum, double multiplier, GError **err); + gboolean (*learn_spam_func)(struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, + GError **err); + GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool, + GTree *input, struct rspamd_task *task); +}; + +/* Get classifier structure by name or return NULL if this name is not found */ +struct classifier * get_classifier (const char *name); + +/* Winnow algorithm */ +struct classifier_ctx * winnow_init (rspamd_mempool_t *pool, + struct rspamd_classifier_config *cf); +gboolean winnow_classify (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + lua_State *L); +gboolean winnow_learn (struct classifier_ctx * ctx, + statfile_pool_t *pool, + const char *symbol, + GTree *input, + gboolean in_class, + double *sum, + double multiplier, + GError **err); +gboolean winnow_learn_spam (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + gboolean is_spam, + lua_State *L, + GError **err); +GList * winnow_weights (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task); + +/* Bayes algorithm */ +struct classifier_ctx * bayes_init (rspamd_mempool_t *pool, + struct rspamd_classifier_config *cf); +gboolean bayes_classify (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + lua_State *L); +gboolean bayes_learn (struct classifier_ctx * ctx, + statfile_pool_t *pool, + const char *symbol, + GTree *input, + gboolean in_class, + double *sum, + double multiplier, + GError **err); +gboolean bayes_learn_spam (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + gboolean is_spam, + lua_State *L, + GError **err); +GList * bayes_weights (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task); +/* Array of all defined classifiers */ +extern struct classifier classifiers[]; + +#endif +/* + * vi:ts=4 + */ diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c new file mode 100644 index 000000000..34169697e --- /dev/null +++ b/src/libstat/classifiers/bayes.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Bayesian classifier + */ +#include "classifiers.h" +#include "tokenizers.h" +#include "main.h" +#include "filter.h" +#include "cfg_file.h" +#include "binlog.h" +#include "lua/lua_common.h" + +#define LOCAL_PROB_DENOM 16.0 + +static inline GQuark +bayes_error_quark (void) +{ + return g_quark_from_static_string ("bayes-error"); +} + +struct bayes_statfile_data { + guint64 hits; + guint64 total_hits; + double value; + struct rspamd_statfile_config *st; + stat_file_t *file; +}; + +struct bayes_callback_data { + statfile_pool_t *pool; + struct classifier_ctx *ctx; + gboolean in_class; + time_t now; + stat_file_t *file; + struct bayes_statfile_data *statfiles; + guint32 statfiles_num; + guint64 total_spam; + guint64 total_ham; + guint64 processed_tokens; + gsize max_tokens; + double spam_probability; + double ham_probability; +}; + +static gboolean +bayes_learn_callback (gpointer key, gpointer value, gpointer data) +{ + token_node_t *node = key; + struct bayes_callback_data *cd = data; + gint c; + guint64 v; + + c = (cd->in_class) ? 1 : -1; + + /* Consider that not found blocks have value 1 */ + v = + statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, + cd->now); + if (v == 0 && c > 0) { + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + c); + cd->processed_tokens++; + } + else if (v != 0) { + if (G_LIKELY (c > 0)) { + v++; + } + else if (c < 0) { + if (v != 0) { + v--; + } + } + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + v); + cd->processed_tokens++; + } + + if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) { + /* Stop learning on max tokens */ + return TRUE; + } + return FALSE; +} + +/** + * Returns probability of chisquare > value with specified number of freedom + * degrees + * @param value value to test + * @param freedom_deg number of degrees of freedom + * @return + */ +static gdouble +inv_chi_square (gdouble value, gint freedom_deg) +{ + long double prob, sum; + gint i; + + if ((freedom_deg & 1) != 0) { + msg_err ("non-odd freedom degrees count: %d", freedom_deg); + return 0; + } + + value /= 2.; + errno = 0; +#ifdef HAVE_EXPL + prob = expl (-value); +#elif defined(HAVE_EXP2L) + prob = exp2l (-value * log2 (M_E)); +#else + prob = exp (-value); +#endif + if (errno == ERANGE) { + msg_err ("exp overflow"); + return 0; + } + sum = prob; + for (i = 1; i < freedom_deg / 2; i++) { + prob *= value / (gdouble)i; + sum += prob; + } + + return MIN (1.0, sum); +} + +/* + * In this callback we calculate local probabilities for tokens + */ +static gboolean +bayes_classify_callback (gpointer key, gpointer value, gpointer data) +{ + + token_node_t *node = key; + struct bayes_callback_data *cd = data; + guint i; + struct bayes_statfile_data *cur; + guint64 spam_count = 0, ham_count = 0, total_count = 0; + double spam_prob, spam_freq, ham_freq, bayes_spam_prob; + + for (i = 0; i < cd->statfiles_num; i++) { + cur = &cd->statfiles[i]; + cur->value = statfile_pool_get_block (cd->pool, + cur->file, + node->h1, + node->h2, + cd->now); + if (cur->value > 0) { + cur->total_hits += cur->value; + if (cur->st->is_spam) { + spam_count += cur->value; + } + else { + ham_count += cur->value; + } + total_count += cur->value; + } + } + + /* Probability for this token */ + if (total_count > 0) { + spam_freq = ((double)spam_count / MAX (1., (double)cd->total_spam)); + ham_freq = ((double)ham_count / MAX (1., (double)cd->total_ham)); + spam_prob = spam_freq / (spam_freq + ham_freq); + bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count); + cd->spam_probability += log (bayes_spam_prob); + cd->ham_probability += log (1. - bayes_spam_prob); + cd->processed_tokens++; + } + + if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) { + /* Stop classifying on max tokens */ + return TRUE; + } + + return FALSE; +} + +struct classifier_ctx * +bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg) +{ + struct classifier_ctx *ctx = + rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); + + ctx->pool = pool; + ctx->cfg = cfg; + ctx->debug = FALSE; + + return ctx; +} + +gboolean +bayes_classify (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + lua_State *L) +{ + struct bayes_callback_data data; + gchar *value; + gint nodes, i = 0, selected_st = -1, cnt; + gint minnodes; + guint64 maxhits = 0, rev; + double final_prob, h, s; + struct rspamd_statfile_config *st; + stat_file_t *file; + GList *cur; + char *sumbuf; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + return FALSE; + } + } + + cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); + if (cur) { + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t)g_list_free, cur); + } + else { + cur = ctx->cfg->statfiles; + } + + data.statfiles_num = g_list_length (cur); + data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num); + data.pool = pool; + data.now = time (NULL); + data.ctx = ctx; + + data.processed_tokens = 0; + data.spam_probability = 0; + data.ham_probability = 0; + data.total_ham = 0; + data.total_spam = 0; + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + minnodes = rspamd_config_parse_limit (value, -1); + data.max_tokens = minnodes; + } + else { + data.max_tokens = 0; + } + + while (cur) { + /* Select statfile to classify */ + st = cur->data; + if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((file = + statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + msg_warn ("cannot open %s", st->path); + cur = g_list_next (cur); + data.statfiles_num--; + continue; + } + } + data.statfiles[i].file = file; + data.statfiles[i].st = st; + statfile_get_revision (file, &rev, NULL); + if (st->is_spam) { + data.total_spam += rev; + } + else { + data.total_ham += rev; + } + + cur = g_list_next (cur); + i++; + } + + cnt = i; + + g_tree_foreach (input, bayes_classify_callback, &data); + + if (data.processed_tokens == 0 || data.spam_probability == 0) { + final_prob = 0; + } + else { + h = 1 - inv_chi_square (-2. * data.spam_probability, + 2 * data.processed_tokens); + s = 1 - inv_chi_square (-2. * data.ham_probability, + 2 * data.processed_tokens); + final_prob = (s + 1 - h) / 2.; + } + + if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { + + sumbuf = rspamd_mempool_alloc (task->task_pool, 32); + for (i = 0; i < cnt; i++) { + if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) || + (final_prob < 0.5 && data.statfiles[i].st->is_spam)) { + continue; + } + if (data.statfiles[i].total_hits > maxhits) { + maxhits = data.statfiles[i].total_hits; + selected_st = i; + } + } + if (selected_st == -1) { + msg_err ( + "unexpected classifier error: cannot select desired statfile"); + } + else { + /* Calculate ham probability correctly */ + if (final_prob < 0.5) { + final_prob = 1. - final_prob; + } + rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.); + cur = g_list_prepend (NULL, sumbuf); + rspamd_task_insert_result (task, + data.statfiles[selected_st].st->symbol, + final_prob, + cur); + } + } + + g_free (data.statfiles); + + return TRUE; +} + +gboolean +bayes_learn (struct classifier_ctx * ctx, + statfile_pool_t *pool, + const char *symbol, + GTree *input, + gboolean in_class, + double *sum, + double multiplier, + GError **err) +{ + struct bayes_callback_data data; + gchar *value; + gint nodes; + gint minnodes; + struct rspamd_statfile_config *st, *sel_st = NULL; + stat_file_t *to_learn; + GList *cur; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + msg_info ( + "do not learn message as it has too few tokens: %d, while %d min", + nodes, + minnodes); + *sum = 0; + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "message contains too few tokens: %d, while min is %d", + nodes, (int)minnodes); + return FALSE; + } + } + + data.pool = pool; + data.in_class = in_class; + data.now = time (NULL); + data.ctx = ctx; + data.processed_tokens = 0; + data.processed_tokens = 0; + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + minnodes = rspamd_config_parse_limit (value, -1); + data.max_tokens = minnodes; + } + else { + data.max_tokens = 0; + } + cur = ctx->cfg->statfiles; + while (cur) { + /* Select statfile to learn */ + st = cur->data; + if (strcmp (st->symbol, symbol) == 0) { + sel_st = st; + break; + } + cur = g_list_next (cur); + } + if (sel_st == NULL) { + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "cannot find statfile for symbol: %s", + symbol); + return FALSE; + } + if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) { + if ((to_learn = + statfile_pool_open (pool, sel_st->path, sel_st->size, + FALSE)) == NULL) { + msg_warn ("cannot open %s", sel_st->path); + if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) { + msg_err ("cannot create statfile %s", sel_st->path); + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "cannot create statfile: %s", + sel_st->path); + return FALSE; + } + if ((to_learn = + statfile_pool_open (pool, sel_st->path, sel_st->size, + FALSE)) == NULL) { + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "cannot open statfile %s after creation", + sel_st->path); + msg_err ("cannot open statfile %s after creation", + sel_st->path); + return FALSE; + } + } + } + data.file = to_learn; + statfile_pool_lock_file (pool, data.file); + g_tree_foreach (input, bayes_learn_callback, &data); + statfile_inc_revision (to_learn); + statfile_pool_unlock_file (pool, data.file); + + if (sum != NULL) { + *sum = data.processed_tokens; + } + + return TRUE; +} + +gboolean +bayes_learn_spam (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + gboolean is_spam, + lua_State *L, + GError **err) +{ + struct bayes_callback_data data; + gchar *value; + gint nodes; + gint minnodes; + struct rspamd_statfile_config *st; + stat_file_t *file; + GList *cur; + gboolean skip_labels; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "message contains too few tokens: %d, while min is %d", + nodes, (int)minnodes); + return FALSE; + } + } + + cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L); + if (cur) { + skip_labels = FALSE; + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t)g_list_free, cur); + } + else { + /* Do not try to learn specific statfiles if pre callback returned nil */ + skip_labels = TRUE; + cur = ctx->cfg->statfiles; + } + + data.pool = pool; + data.now = time (NULL); + data.ctx = ctx; + data.in_class = TRUE; + + data.processed_tokens = 0; + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + minnodes = rspamd_config_parse_limit (value, -1); + data.max_tokens = minnodes; + } + else { + data.max_tokens = 0; + } + + while (cur) { + /* Select statfiles to learn */ + st = cur->data; + if (st->is_spam != is_spam || (skip_labels && st->label)) { + cur = g_list_next (cur); + continue; + } + if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((file = + statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + msg_warn ("cannot open %s", st->path); + if (statfile_pool_create (pool, st->path, st->size) == -1) { + msg_err ("cannot create statfile %s", st->path); + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "cannot create statfile: %s", + st->path); + return FALSE; + } + if ((file = + statfile_pool_open (pool, st->path, st->size, + FALSE)) == NULL) { + g_set_error (err, + bayes_error_quark (), /* error domain */ + 1, /* error code */ + "cannot open statfile %s after creation", + st->path); + msg_err ("cannot open statfile %s after creation", + st->path); + return FALSE; + } + } + } + data.file = file; + statfile_pool_lock_file (pool, data.file); + g_tree_foreach (input, bayes_learn_callback, &data); + statfile_inc_revision (file); + statfile_pool_unlock_file (pool, data.file); + maybe_write_binlog (ctx->cfg, st, file, input); + msg_info ("increase revision for %s", st->path); + + cur = g_list_next (cur); + } + + return TRUE; +} + +GList * +bayes_weights (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task) +{ + /* This function is unimplemented with new normalizer */ + return NULL; +} diff --git a/src/libstat/classifiers/classifiers.c b/src/libstat/classifiers/classifiers.c new file mode 100644 index 000000000..95dd52c44 --- /dev/null +++ b/src/libstat/classifiers/classifiers.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Common classifier functions + */ + +#include "classifiers.h" + +struct classifier classifiers[] = { + { + .name = "winnow", + .init_func = winnow_init, + .classify_func = winnow_classify, + .learn_func = winnow_learn, + .learn_spam_func = winnow_learn_spam, + .weights_func = winnow_weights + }, + { + .name = "bayes", + .init_func = bayes_init, + .classify_func = bayes_classify, + .learn_func = bayes_learn, + .learn_spam_func = bayes_learn_spam, + .weights_func = bayes_weights + } +}; + +struct classifier * +get_classifier (const char *name) +{ + guint i; + + for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) { + if (strcmp (classifiers[i].name, name) == 0) { + return &classifiers[i]; + } + } + + return NULL; +} + +/* + * vi:ts=4 + */ diff --git a/src/libstat/classifiers/winnow.c b/src/libstat/classifiers/winnow.c new file mode 100644 index 000000000..68d456968 --- /dev/null +++ b/src/libstat/classifiers/winnow.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Winnow classifier + */ + +#include "classifiers.h" +#include "tokenizers.h" +#include "main.h" +#include "filter.h" +#include "cfg_file.h" +#include "lua/lua_common.h" + +#define WINNOW_PROMOTION 1.23 +#define WINNOW_DEMOTION 0.83 + +#define MEDIAN_WINDOW_SIZE 5 + +#define MAX_WEIGHT G_MAXDOUBLE / 2. + + + +#define MAX_LEARN_ITERATIONS 100 + +static inline GQuark +winnow_error_quark (void) +{ + return g_quark_from_static_string ("winnow-error"); +} + +struct winnow_callback_data { + statfile_pool_t *pool; + struct classifier_ctx *ctx; + stat_file_t *file; + stat_file_t *learn_file; + long double sum; + long double start; + double multiplier; + guint32 count; + guint32 new_blocks; + gboolean in_class; + gboolean do_demote; + gboolean fresh_run; + time_t now; +}; + +static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION; + + + +static gboolean +winnow_classify_callback (gpointer key, gpointer value, gpointer data) +{ + token_node_t *node = key; + struct winnow_callback_data *cd = data; + double v; + + /* Consider that not found blocks have value 1 */ + v = + statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, + cd->now); + if (fabs (v) > ALPHA) { + cd->sum += v; + } + else { + cd->sum += 1.0; + cd->new_blocks++; + } + + cd->count++; + + return FALSE; +} + +static gboolean +winnow_learn_callback (gpointer key, gpointer value, gpointer data) +{ + token_node_t *node = key; + struct winnow_callback_data *cd = data; + double v, c; + + c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION / + cd->multiplier; + + /* Consider that not found blocks have value 1 */ + v = + statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, + cd->now); + if (fabs (v) < ALPHA) { + /* Block not found, insert new */ + cd->start += 1; + if (cd->file == cd->learn_file) { + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + c); + node->value = c; + cd->new_blocks++; + } + } + else { + cd->start += v; + /* Here we just increase the extra value of block */ + if (cd->fresh_run) { + node->extra = 0; + } + else { + node->extra++; + } + node->value = v; + + if (node->extra > 1) { + /* + * Assume that this node is common for several statfiles, so + * decrease its weight proportianally + */ + if (node->value > max_common_weight) { + /* Static fluctuation */ + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + 0.); + node->value = 0.; + } + else if (node->value > WINNOW_PROMOTION * cd->multiplier) { + /* Try to decrease its value */ + /* XXX: it is more intelligent to add some adaptive filter here */ + if (cd->file == cd->learn_file) { + if (node->value > max_common_weight / 2.) { + node->value *= c; + } + else { + /* + * Too high token value that exists also in other + * statfiles, may be statistic error, so decrease it + * slightly + */ + node->value *= WINNOW_DEMOTION; + } + } + else { + node->value = WINNOW_DEMOTION / cd->multiplier; + } + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + node->value); + } + } + else if (cd->file == cd->learn_file) { + /* New block or block that is in only one statfile */ + /* Set some limit on growing */ + if (v > MAX_WEIGHT) { + node->value = v; + } + else { + node->value *= c; + } + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + node->value); + } + else if (cd->do_demote) { + /* Demote blocks in file */ + node->value *= WINNOW_DEMOTION / cd->multiplier; + statfile_pool_set_block (cd->pool, + cd->file, + node->h1, + node->h2, + cd->now, + node->value); + } + } + + + cd->sum += node->value; + + cd->count++; + + return FALSE; +} + +struct classifier_ctx * +winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg) +{ + struct classifier_ctx *ctx = + rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); + + ctx->pool = pool; + ctx->cfg = cfg; + + return ctx; +} + +gboolean +winnow_classify (struct classifier_ctx *ctx, + statfile_pool_t * pool, + GTree * input, + struct rspamd_task *task, + lua_State *L) +{ + struct winnow_callback_data data; + char *sumbuf, *value; + long double res = 0., max = 0.; + GList *cur; + struct rspamd_statfile_config *st, *sel = NULL; + int nodes, minnodes; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + data.pool = pool; + data.now = time (NULL); + data.ctx = ctx; + + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + msg_info ( + "do not classify message as it has too few tokens: %d, while %d min", + nodes, + minnodes); + return FALSE; + } + } + + cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); + if (cur) { + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t)g_list_free, cur); + } + else { + cur = ctx->cfg->statfiles; + } + + while (cur) { + st = cur->data; + data.sum = 0; + data.count = 0; + data.new_blocks = 0; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((data.file = + statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + msg_warn ("cannot open %s, skip it", st->path); + cur = g_list_next (cur); + continue; + } + } + + if (data.file != NULL) { + g_tree_foreach (input, winnow_classify_callback, &data); + } + + if (data.count != 0) { + res = data.sum / (double)data.count; + } + else { + res = 0; + } + if (res > max) { + max = res; + sel = st; + } + cur = g_list_next (cur); + } + + if (sel != NULL) { +#ifdef WITH_LUA + max = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, max, L); +#endif +#ifdef HAVE_TANHL + max = tanhl (max); +#else + /* + * As some implementations of libm does not support tanhl, try to use + * tanh + */ + max = tanh ((double) max); +#endif + sumbuf = rspamd_mempool_alloc (task->task_pool, 32); + rspamd_snprintf (sumbuf, 32, "%.2F", max); + cur = g_list_prepend (NULL, sumbuf); + rspamd_task_insert_result (task, sel->symbol, max, cur); + } + + return TRUE; +} + +GList * +winnow_weights (struct classifier_ctx *ctx, + statfile_pool_t * pool, + GTree * input, + struct rspamd_task *task) +{ + struct winnow_callback_data data; + long double res = 0.; + GList *cur, *resl = NULL; + struct rspamd_statfile_config *st; + struct classify_weight *w; + char *value; + int nodes, minnodes; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + data.pool = pool; + data.now = time (NULL); + data.ctx = ctx; + + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + msg_info ( + "do not classify message as it has too few tokens: %d, while %d min", + nodes, + minnodes); + return NULL; + } + } + + cur = ctx->cfg->statfiles; + while (cur) { + st = cur->data; + data.sum = 0; + data.count = 0; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((data.file = + statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + msg_warn ("cannot open %s, skip it", st->path); + cur = g_list_next (cur); + continue; + } + } + + if (data.file != NULL) { + g_tree_foreach (input, winnow_classify_callback, &data); + } + + w = + rspamd_mempool_alloc0 (task->task_pool, + sizeof (struct classify_weight)); + if (data.count != 0) { + res = data.sum / (double)data.count; + } + else { + res = 0; + } + w->name = st->symbol; + w->weight = res; + resl = g_list_prepend (resl, w); + cur = g_list_next (cur); + } + + if (resl != NULL) { + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t)g_list_free, resl); + } + + return resl; + +} + + +gboolean +winnow_learn (struct classifier_ctx *ctx, + statfile_pool_t *pool, + const char *symbol, + GTree * input, + int in_class, + double *sum, + double multiplier, + GError **err) +{ + struct winnow_callback_data data = { + .file = NULL, + .multiplier = multiplier + }; + char *value; + int nodes, minnodes, iterations = 0; + struct rspamd_statfile_config *st, *sel_st = NULL; + stat_file_t *sel = NULL, *to_learn; + long double res = 0., max = 0., start_value = 0., end_value = 0.; + double learn_threshold = 0.0; + GList *cur, *to_demote = NULL; + gboolean force_learn = FALSE; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + data.pool = pool; + data.in_class = in_class; + data.now = time (NULL); + data.ctx = ctx; + + + if (ctx->cfg->opts && + (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + msg_info ( + "do not learn message as it has too few tokens: %d, while %d min", + nodes, + minnodes); + if (sum != NULL) { + *sum = 0; + } + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "message contains too few tokens: %d, while min is %d", + nodes, minnodes); + return FALSE; + } + } + if (ctx->cfg->opts && + (value = + g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) { + learn_threshold = strtod (value, NULL); + } + + if (learn_threshold <= 1.0 && learn_threshold >= 0) { + /* Classify message and check target statfile score */ + cur = ctx->cfg->statfiles; + while (cur) { + /* Open or create all statfiles inside classifier */ + st = cur->data; + if (statfile_pool_is_open (pool, st->path) == NULL) { + if (statfile_pool_open (pool, st->path, st->size, + FALSE) == NULL) { + msg_warn ("cannot open %s", st->path); + if (statfile_pool_create (pool, st->path, st->size) == -1) { + msg_err ("cannot create statfile %s", st->path); + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "cannot create statfile: %s", + st->path); + return FALSE; + } + if (statfile_pool_open (pool, st->path, st->size, + FALSE) == NULL) { + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "open statfile %s after creation", + st->path); + msg_err ("cannot open statfile %s after creation", + st->path); + return FALSE; + } + } + } + if (strcmp (st->symbol, symbol) == 0) { + sel_st = st; + + } + cur = g_list_next (cur); + } + + if (sel_st == NULL) { + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "cannot find statfile for symbol %s", + symbol); + msg_err ("cannot find statfile for symbol %s", symbol); + return FALSE; + } + + to_learn = statfile_pool_is_open (pool, sel_st->path); + if (to_learn == NULL) { + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles", + sel_st->path); + return FALSE; + } + /* Check target statfile */ + data.file = to_learn; + data.sum = 0; + data.count = 0; + data.new_blocks = 0; + g_tree_foreach (input, winnow_classify_callback, &data); + if (data.count > 0) { + max = data.sum / (double)data.count; + } + else { + max = 0; + } + /* If most of blocks are not presented in targeted statfile do forced learn */ + if (max < 1 + learn_threshold) { + force_learn = TRUE; + } + /* Check other statfiles */ + while (cur) { + st = cur->data; + data.sum = 0; + data.count = 0; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles", + st->path); + return FALSE; + } + g_tree_foreach (input, winnow_classify_callback, &data); + if (data.count != 0) { + res = data.sum / data.count; + } + else { + res = 0; + } + if (to_learn != data.file && res - max > 1 - learn_threshold) { + /* Demote tokens in this statfile */ + to_demote = g_list_prepend (to_demote, data.file); + } + cur = g_list_next (cur); + } + } + else { + msg_err ( + "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration"); + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "bad learn_threshold setting: %.2f", + learn_threshold); + return FALSE; + } + /* If to_demote list is empty this message is already classified correctly */ + if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) { + msg_info ( + "this message is already of class %s with threshold %.2f and weight %.2F", + sel_st->symbol, + learn_threshold, + max); + goto end; + } + data.learn_file = to_learn; + end_value = max; + do { + cur = ctx->cfg->statfiles; + data.fresh_run = TRUE; + while (cur) { + st = cur->data; + data.sum = 0; + data.count = 0; + data.new_blocks = 0; + data.start = 0; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + return FALSE; + } + if (to_demote != NULL && + g_list_find (to_demote, data.file) != NULL) { + data.do_demote = TRUE; + } + else { + data.do_demote = FALSE; + } + + statfile_pool_lock_file (pool, data.file); + g_tree_foreach (input, winnow_learn_callback, &data); + statfile_pool_unlock_file (pool, data.file); + if (data.count != 0) { + res = data.sum / data.count; + } + else { + res = 0; + } + if (res > max) { + max = res; + sel = data.file; + } + if (data.file == to_learn) { + if (data.count > 0) { + start_value = data.start / data.count; + } + end_value = res; + } + cur = g_list_next (cur); + data.fresh_run = FALSE; + } + + data.multiplier *= WINNOW_PROMOTION; + msg_info ( + "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f", + iterations + 1, + symbol, + start_value, + end_value, + data.multiplier); + } while ((in_class ? sel != to_learn : sel == + to_learn) && iterations++ < MAX_LEARN_ITERATIONS); + + if (iterations >= MAX_LEARN_ITERATIONS) { + msg_warn ( + "learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G", + sel_st->symbol, + MAX_LEARN_ITERATIONS, + max); + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "learning statfile %s was not fully successfull: iterations count is limited to %d", + sel_st->symbol, MAX_LEARN_ITERATIONS); + return FALSE; + } + else { + msg_info ( + "learned statfile %s successfully with %d iterations and sum %G", + sel_st->symbol, + iterations + 1, + max); + } + + +end: + if (sum) { +#ifdef HAVE_TANHL + *sum = (double)tanhl (max); +#else + /* + * As some implementations of libm does not support tanhl, try to use + * tanh + */ + *sum = tanh ((double) max); +#endif + } + return TRUE; +} + +gboolean +winnow_learn_spam (struct classifier_ctx * ctx, + statfile_pool_t *pool, + GTree *input, + struct rspamd_task *task, + gboolean is_spam, + lua_State *L, + GError **err) +{ + g_set_error (err, + winnow_error_quark (), /* error domain */ + 1, /* error code */ + "learn spam is not supported for winnow" + ); + return FALSE; +} diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h new file mode 100644 index 000000000..0e2bf86b8 --- /dev/null +++ b/src/libstat/stat_api.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2015, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef STAT_API_H_ +#define STAT_API_H_ + +#include "config.h" +#include "task.h" + +/** + * @file stat_api.h + * High level statistics API + */ + +/** + * Initialise statistics modules + * @param cfg + */ +void rspamd_stat_init (struct rspamd_config *cfg); + +/** + * Classify the task specified and insert symbols if needed + * @param task + * @return TRUE if task has been classified + */ +gboolean rspamd_stat_classify (struct rspamd_task *task, GError **err); + + +/** + * Learn task as spam or ham, task must be processed prior to this call + * @param task task to learn + * @param spam if TRUE learn spam, otherwise learn ham + * @return TRUE if task has been learned + */ +gboolean rspamd_stat_learn (struct rspamd_task *task, gboolean spam, GError **err); + + +void rspamd_stat_unload (void); + +#endif /* STAT_API_H_ */ diff --git a/src/libstat/tokenizers.h b/src/libstat/tokenizers.h new file mode 100644 index 000000000..ed47e0add --- /dev/null +++ b/src/libstat/tokenizers.h @@ -0,0 +1,64 @@ +#ifndef TOKENIZERS_H +#define TOKENIZERS_H + +#include "config.h" +#include "mem_pool.h" +#include "fstring.h" +#include "main.h" + +/* Size for features pipe */ +#define FEATURE_WINDOW_SIZE 5 + +typedef struct token_node_s { + guint32 h1; + guint32 h2; + double value; + uintptr_t extra; +} token_node_t; + +/* Common tokenizer structure */ +struct tokenizer { + gchar *name; + gint (*tokenize_func)(struct tokenizer *tokenizer, + rspamd_mempool_t *pool, + GArray *words, + GTree **cur, + gboolean save_token, + gboolean is_utf, + GList *exceptions); + gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); +}; + +/* Compare two token nodes */ +int token_node_compare_func (gconstpointer a, gconstpointer b); + +/* Get tokenizer structure by name or return NULL if this name is not found */ +struct tokenizer * get_tokenizer (const char *name); + +/* Get next word from specified f_str_t buf */ +gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf, + rspamd_fstring_t *token, GList **exceptions); + +/* Tokenize text into array of words (rspamd_fstring_t type) */ +GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, + gsize min_len, GList **exceptions); + +/* OSB tokenize function */ +int osb_tokenize_text (struct tokenizer *tokenizer, + rspamd_mempool_t *pool, + GArray *input, + GTree **cur, + gboolean save_token, + gboolean is_utf, + GList *exceptions); + +/* Make tokens for a subject */ +void tokenize_subject (struct rspamd_task *task, GTree ** tree); + +/* Array of all defined tokenizers */ +extern struct tokenizer tokenizers[]; + +#endif +/* + * vi:ts=4 + */ diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c new file mode 100644 index 000000000..9dd12a8dd --- /dev/null +++ b/src/libstat/tokenizers/osb.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * OSB tokenizer + */ + +#include +#include "tokenizers.h" + +/* Minimum length of token */ +#define MIN_LEN 4 + +extern const int primes[]; + +int +osb_tokenize_text (struct tokenizer *tokenizer, + rspamd_mempool_t * pool, + GArray * input, + GTree ** tree, + gboolean save_token, + gboolean is_utf, + GList *exceptions) +{ + token_node_t *new = NULL; + rspamd_fstring_t *token; + guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; + gint i, processed = 0; + guint w; + + if (input == NULL) { + return FALSE; + } + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t) g_tree_destroy, + *tree); + } + + memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); + + for (w = 0; w < input->len; w ++) { + token = &g_array_index (input, rspamd_fstring_t, w); + + if (processed < FEATURE_WINDOW_SIZE) { + /* Just fill a hashpipe */ + hashpipe[FEATURE_WINDOW_SIZE - ++processed] = + rspamd_fstrhash_lc (token, is_utf); + } + else { + /* Shift hashpipe */ + for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); + processed++; + + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * + primes[(i << 1) - 1]; + new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = + (uintptr_t)rspamd_mempool_fstrdup (pool, token); + } + + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + } + + if (processed <= FEATURE_WINDOW_SIZE) { + for (i = 1; i < processed; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token); + } + + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + } + + return TRUE; +} + +/* + * vi:ts=4 + */ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c new file mode 100644 index 000000000..3e6c745ec --- /dev/null +++ b/src/libstat/tokenizers/tokenizers.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2009-2012, Vsevolod Stakhov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Common tokenization functions + */ + +#include +#include "main.h" +#include "tokenizers.h" + +struct tokenizer tokenizers[] = { + {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word}, +}; + +const int primes[] = { + 1, 7, + 3, 13, + 5, 29, + 11, 51, + 23, 101, + 47, 203, + 97, 407, + 197, 817, + 397, 1637, + 797, 3277, +}; + +const gchar t_delimiters[255] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 +}; + +struct tokenizer * +get_tokenizer (const char *name) +{ + guint i; + + for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) { + if (strcmp (tokenizers[i].name, name) == 0) { + return &tokenizers[i]; + } + } + + return NULL; +} + +int +token_node_compare_func (gconstpointer a, gconstpointer b) +{ + const token_node_t *aa = a, *bb = b; + + if (aa->h1 == bb->h1) { + return aa->h2 - bb->h2; + } + + return aa->h1 - bb->h1; +} + +/* Get next word from specified f_str_t buf */ +gchar * +rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) +{ + gsize remain, pos; + guchar *p; + struct process_exception *ex = NULL; + + if (buf == NULL) { + return NULL; + } + + if (exceptions != NULL && *exceptions != NULL) { + ex = (*exceptions)->data; + } + + if (token->begin == NULL) { + if (ex != NULL) { + if (ex->pos == 0) { + token->begin = buf->begin + ex->len; + token->len = ex->len; + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + else { + token->begin = buf->begin; + token->len = 0; + } + } + + token->len = 0; + + pos = token->begin - buf->begin; + if (pos >= buf->len) { + return NULL; + } + + remain = buf->len - pos; + p = token->begin; + /* Skip non delimiters symbols */ + do { + if (ex != NULL && ex->pos == pos) { + /* Go to the next exception */ + *exceptions = g_list_next (*exceptions); + return p + ex->len; + } + pos++; + p++; + remain--; + } while (remain > 0 && t_delimiters[*p]); + + token->begin = p; + + while (remain > 0 && !t_delimiters[*p]) { + if (ex != NULL && ex->pos == pos) { + *exceptions = g_list_next (*exceptions); + return p + ex->len; + } + token->len++; + pos++; + remain--; + p++; + } + + if (remain == 0) { + return NULL; + } + + return p; +} + +GArray * +rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, + gsize min_len, GList **exceptions) +{ + rspamd_fstring_t token, buf; + gchar *pos; + gsize l; + GArray *res; + + if (len == 0 || text == NULL) { + return NULL; + } + + buf.begin = text; + buf.len = len; + buf.size = buf.len; + token.begin = NULL; + token.len = 0; + + res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); + while ((pos = rspamd_tokenizer_get_word (&buf, + &token, exceptions)) != NULL) { + if (is_utf) { + l = g_utf8_strlen (token.begin, token.len); + } + else { + l = token.len; + } + if (min_len > 0 && l < min_len) { + token.begin = pos; + continue; + } + g_array_append_val (res, token); + + token.begin = pos; + } + + return res; +} + + +void +tokenize_subject (struct rspamd_task *task, GTree ** tree) +{ + gchar *sub; + struct tokenizer *osb_tokenizer; + GArray *words; + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) g_tree_destroy, *tree); + } + + osb_tokenizer = get_tokenizer ("osb-text"); + + /* Try to use pre-defined subject */ + if (task->subject != NULL) { + sub = task->subject; + } + else { + sub = (gchar *)g_mime_message_get_subject (task->message); + } + + if (sub != NULL) { + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); + if (words != NULL) { + osb_tokenizer->tokenize_func (osb_tokenizer, + task->task_pool, + words, + tree, + FALSE, + TRUE, + NULL); + g_array_free (words, TRUE); + } + } +} + +/* + * vi:ts=4 + */ diff --git a/src/lua/lua_classifier.c b/src/lua/lua_classifier.c index 1edca4857..346f5d64b 100644 --- a/src/lua/lua_classifier.c +++ b/src/lua/lua_classifier.c @@ -25,7 +25,7 @@ #include "lua_common.h" #include "cfg_file.h" -#include "classifiers/classifiers.h" +#include "classifiers.h" /* Classifier methods */ LUA_FUNCTION_DEF (classifier, register_pre_callback); diff --git a/src/lua/lua_config.c b/src/lua/lua_config.c index 3043013ba..c1eec7655 100644 --- a/src/lua/lua_config.c +++ b/src/lua/lua_config.c @@ -29,7 +29,7 @@ #include "message.h" #include "radix.h" #include "trie.h" -#include "classifiers/classifiers.h" +#include "classifiers.h" /*** * This module is used to configure rspamd and is normally available as global diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 0a81e3d8b..4f1a46176 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -33,8 +33,8 @@ #include "images.h" #include "cfg_file.h" #include "statfile.h" -#include "tokenizers/tokenizers.h" -#include "classifiers/classifiers.h" +#include "tokenizers.h" +#include "classifiers.h" #include "binlog.h" #include "statfile_sync.h" #include "diff.h" diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c deleted file mode 100644 index 9dd12a8dd..000000000 --- a/src/tokenizers/osb.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * OSB tokenizer - */ - -#include -#include "tokenizers.h" - -/* Minimum length of token */ -#define MIN_LEN 4 - -extern const int primes[]; - -int -osb_tokenize_text (struct tokenizer *tokenizer, - rspamd_mempool_t * pool, - GArray * input, - GTree ** tree, - gboolean save_token, - gboolean is_utf, - GList *exceptions) -{ - token_node_t *new = NULL; - rspamd_fstring_t *token; - guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, processed = 0; - guint w; - - if (input == NULL) { - return FALSE; - } - - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t) g_tree_destroy, - *tree); - } - - memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); - - for (w = 0; w < input->len; w ++) { - token = &g_array_index (input, rspamd_fstring_t, w); - - if (processed < FEATURE_WINDOW_SIZE) { - /* Just fill a hashpipe */ - hashpipe[FEATURE_WINDOW_SIZE - ++processed] = - rspamd_fstrhash_lc (token, is_utf); - } - else { - /* Shift hashpipe */ - for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { - hashpipe[i] = hashpipe[i - 1]; - } - hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); - processed++; - - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * - primes[(i << 1) - 1]; - new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = - (uintptr_t)rspamd_mempool_fstrdup (pool, token); - } - - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } - } - } - } - - if (processed <= FEATURE_WINDOW_SIZE) { - for (i = 1; i < processed; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token); - } - - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } - } - } - - return TRUE; -} - -/* - * vi:ts=4 - */ diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c deleted file mode 100644 index 3e6c745ec..000000000 --- a/src/tokenizers/tokenizers.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2009-2012, Vsevolod Stakhov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Common tokenization functions - */ - -#include -#include "main.h" -#include "tokenizers.h" - -struct tokenizer tokenizers[] = { - {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word}, -}; - -const int primes[] = { - 1, 7, - 3, 13, - 5, 29, - 11, 51, - 23, 101, - 47, 203, - 97, 407, - 197, 817, - 397, 1637, - 797, 3277, -}; - -const gchar t_delimiters[255] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 -}; - -struct tokenizer * -get_tokenizer (const char *name) -{ - guint i; - - for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) { - if (strcmp (tokenizers[i].name, name) == 0) { - return &tokenizers[i]; - } - } - - return NULL; -} - -int -token_node_compare_func (gconstpointer a, gconstpointer b) -{ - const token_node_t *aa = a, *bb = b; - - if (aa->h1 == bb->h1) { - return aa->h2 - bb->h2; - } - - return aa->h1 - bb->h1; -} - -/* Get next word from specified f_str_t buf */ -gchar * -rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) -{ - gsize remain, pos; - guchar *p; - struct process_exception *ex = NULL; - - if (buf == NULL) { - return NULL; - } - - if (exceptions != NULL && *exceptions != NULL) { - ex = (*exceptions)->data; - } - - if (token->begin == NULL) { - if (ex != NULL) { - if (ex->pos == 0) { - token->begin = buf->begin + ex->len; - token->len = ex->len; - } - else { - token->begin = buf->begin; - token->len = 0; - } - } - else { - token->begin = buf->begin; - token->len = 0; - } - } - - token->len = 0; - - pos = token->begin - buf->begin; - if (pos >= buf->len) { - return NULL; - } - - remain = buf->len - pos; - p = token->begin; - /* Skip non delimiters symbols */ - do { - if (ex != NULL && ex->pos == pos) { - /* Go to the next exception */ - *exceptions = g_list_next (*exceptions); - return p + ex->len; - } - pos++; - p++; - remain--; - } while (remain > 0 && t_delimiters[*p]); - - token->begin = p; - - while (remain > 0 && !t_delimiters[*p]) { - if (ex != NULL && ex->pos == pos) { - *exceptions = g_list_next (*exceptions); - return p + ex->len; - } - token->len++; - pos++; - remain--; - p++; - } - - if (remain == 0) { - return NULL; - } - - return p; -} - -GArray * -rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList **exceptions) -{ - rspamd_fstring_t token, buf; - gchar *pos; - gsize l; - GArray *res; - - if (len == 0 || text == NULL) { - return NULL; - } - - buf.begin = text; - buf.len = len; - buf.size = buf.len; - token.begin = NULL; - token.len = 0; - - res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); - while ((pos = rspamd_tokenizer_get_word (&buf, - &token, exceptions)) != NULL) { - if (is_utf) { - l = g_utf8_strlen (token.begin, token.len); - } - else { - l = token.len; - } - if (min_len > 0 && l < min_len) { - token.begin = pos; - continue; - } - g_array_append_val (res, token); - - token.begin = pos; - } - - return res; -} - - -void -tokenize_subject (struct rspamd_task *task, GTree ** tree) -{ - gchar *sub; - struct tokenizer *osb_tokenizer; - GArray *words; - - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_tree_destroy, *tree); - } - - osb_tokenizer = get_tokenizer ("osb-text"); - - /* Try to use pre-defined subject */ - if (task->subject != NULL) { - sub = task->subject; - } - else { - sub = (gchar *)g_mime_message_get_subject (task->message); - } - - if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); - if (words != NULL) { - osb_tokenizer->tokenize_func (osb_tokenizer, - task->task_pool, - words, - tree, - FALSE, - TRUE, - NULL); - g_array_free (words, TRUE); - } - } -} - -/* - * vi:ts=4 - */ diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h deleted file mode 100644 index ed47e0add..000000000 --- a/src/tokenizers/tokenizers.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef TOKENIZERS_H -#define TOKENIZERS_H - -#include "config.h" -#include "mem_pool.h" -#include "fstring.h" -#include "main.h" - -/* Size for features pipe */ -#define FEATURE_WINDOW_SIZE 5 - -typedef struct token_node_s { - guint32 h1; - guint32 h2; - double value; - uintptr_t extra; -} token_node_t; - -/* Common tokenizer structure */ -struct tokenizer { - gchar *name; - gint (*tokenize_func)(struct tokenizer *tokenizer, - rspamd_mempool_t *pool, - GArray *words, - GTree **cur, - gboolean save_token, - gboolean is_utf, - GList *exceptions); - gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); -}; - -/* Compare two token nodes */ -int token_node_compare_func (gconstpointer a, gconstpointer b); - -/* Get tokenizer structure by name or return NULL if this name is not found */ -struct tokenizer * get_tokenizer (const char *name); - -/* Get next word from specified f_str_t buf */ -gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf, - rspamd_fstring_t *token, GList **exceptions); - -/* Tokenize text into array of words (rspamd_fstring_t type) */ -GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList **exceptions); - -/* OSB tokenize function */ -int osb_tokenize_text (struct tokenizer *tokenizer, - rspamd_mempool_t *pool, - GArray *input, - GTree **cur, - gboolean save_token, - gboolean is_utf, - GList *exceptions); - -/* Make tokens for a subject */ -void tokenize_subject (struct rspamd_task *task, GTree ** tree); - -/* Array of all defined tokenizers */ -extern struct tokenizer tokenizers[]; - -#endif -/* - * vi:ts=4 - */