############################# CONFIG SECTION #############################################
# Initial set
-INCLUDE_DIRECTORIES(src/libutil src/libserver src/libmime)
+INCLUDE_DIRECTORIES(src/libutil src/libserver src/libmime src/libstat)
IF(CMAKE_INSTALL_PREFIX)
SET(PREFIX ${CMAKE_INSTALL_PREFIX})
ADD_SUBDIRECTORY(libutil)
ADD_SUBDIRECTORY(libserver)
ADD_SUBDIRECTORY(libmime)
+ADD_SUBDIRECTORY(libstat)
ADD_SUBDIRECTORY(client)
SET(RSPAMDSRC ${CMAKE_CURRENT_BINARY_DIR}/modules.c
SET_TARGET_PROPERTIES(rspamd PROPERTIES VERSION ${RSPAMD_VERSION})
ENDIF(NOT DEBIAN_BUILD)
+TARGET_LINK_LIBRARIES(rspamd rspamd-stat)
TARGET_LINK_LIBRARIES(rspamd rspamd-mime)
TARGET_LINK_LIBRARIES(rspamd rspamd-server)
TARGET_LINK_LIBRARIES(rspamd rspamd-util)
+++ /dev/null
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Bayesian classifier
- */
-#include "classifiers.h"
-#include "tokenizers/tokenizers.h"
-#include "main.h"
-#include "filter.h"
-#include "cfg_file.h"
-#include "binlog.h"
-#include "lua/lua_common.h"
-
-#define LOCAL_PROB_DENOM 16.0
-
-static inline GQuark
-bayes_error_quark (void)
-{
-	/* Error domain attached to every GError raised by the bayes classifier */
-	const GQuark domain = g_quark_from_static_string ("bayes-error");
-
-	return domain;
-}
-
-struct bayes_statfile_data { /* Per-statfile state kept while one message is classified */
- guint64 hits; /* not referenced by the code visible in this section */
- guint64 total_hits; /* sum of the positive block values read from this statfile */
- double value; /* block value read for the token currently being processed */
- struct rspamd_statfile_config *st; /* configuration entry of the statfile */
- stat_file_t *file; /* opened statfile handle */
-};
-
-struct bayes_callback_data { /* State shared between the bayes entry points and their per-token tree callbacks */
- statfile_pool_t *pool; /* statfile pool passed to the block get/set calls */
- struct classifier_ctx *ctx; /* classifier context of the caller */
- gboolean in_class; /* learning: TRUE increments token counters, FALSE decrements them */
- time_t now; /* timestamp passed to the block get/set calls */
- stat_file_t *file; /* learning: statfile being updated */
- struct bayes_statfile_data *statfiles; /* classifying: one slot per opened statfile */
- guint32 statfiles_num; /* classifying: number of slots iterated by the callback */
- guint64 total_spam; /* sum of the revisions of spam statfiles */
- guint64 total_ham; /* sum of the revisions of non-spam statfiles */
- guint64 processed_tokens; /* tokens that contributed so far */
- gsize max_tokens; /* traversal stops once processed_tokens exceeds it; 0 disables the limit */
- double spam_probability; /* running sum of log (p) over contributing tokens */
- double ham_probability; /* running sum of log (1 - p) over contributing tokens */
-};
-
-/*
- * Tree traversal callback used while learning: adjusts the counter of a single
- * token in the statfile being learned. Returning TRUE stops the traversal.
- */
-static gboolean
-bayes_learn_callback (gpointer key, gpointer value, gpointer data)
-{
-	struct bayes_callback_data *cbdata = data;
-	token_node_t *token = key;
-	const gint delta = cbdata->in_class ? 1 : -1;
-	guint64 hits;
-
-	hits = statfile_pool_get_block (cbdata->pool, cbdata->file,
-			token->h1, token->h2, cbdata->now);
-
-	if (hits != 0) {
-		/* Known token: move its counter one step in the learning direction */
-		if (delta > 0) {
-			hits++;
-		}
-		else {
-			hits--;
-		}
-		statfile_pool_set_block (cbdata->pool, cbdata->file,
-			token->h1, token->h2, cbdata->now, hits);
-		cbdata->processed_tokens++;
-	}
-	else if (delta > 0) {
-		/* Unknown token is only inserted when learning in-class */
-		statfile_pool_set_block (cbdata->pool, cbdata->file,
-			token->h1, token->h2, cbdata->now, delta);
-		cbdata->processed_tokens++;
-	}
-
-	/* Stop learning once the optional token limit has been exceeded */
-	return cbdata->max_tokens != 0 &&
-		   cbdata->processed_tokens > cbdata->max_tokens;
-}
-
-/**
- * Returns probability of chisquare > value with specified number of freedom
- * degrees. With m = value / 2 the result is exp (-m) * (1 + m + m^2/2! + ...)
- * summed over freedom_deg / 2 terms and clamped to 1.
- * @param value value to test
- * @param freedom_deg number of degrees of freedom, must be even
- * @return the probability (at most 1.0); 0 when freedom_deg is odd or when
- * the exponent computation reports ERANGE
- */
-static gdouble
-inv_chi_square (gdouble value, gint freedom_deg)
-{
-	long double prob, sum;
-	gint i;
-
-	if ((freedom_deg & 1) != 0) {
-		/* The series below is only defined for an even number of degrees */
-		msg_err ("odd freedom degrees count: %d", freedom_deg);
-		return 0;
-	}
-
-	value /= 2.;
-	/* errno is the only error channel of the exp family, so reset it first */
-	errno = 0;
-#ifdef HAVE_EXPL
-	prob = expl (-value);
-#elif defined(HAVE_EXP2L)
-	prob = exp2l (-value * log2 (M_E));
-#else
-	prob = exp (-value);
-#endif
-	if (errno == ERANGE) {
-		msg_err ("exp overflow");
-		return 0;
-	}
-	sum = prob;
-	/* Each iteration derives the next series term from the previous one */
-	for (i = 1; i < freedom_deg / 2; i++) {
-		prob *= value / (gdouble)i;
-		sum += prob;
-	}
-
-	return MIN (1.0, sum);
-}
-
-/*
- * Per-token classification callback: reads the token counter from every
- * statfile and folds the resulting token probability into the running
- * log-probability sums. Returning TRUE stops the traversal.
- */
-static gboolean
-bayes_classify_callback (gpointer key, gpointer value, gpointer data)
-{
-	struct bayes_callback_data *cbdata = data;
-	token_node_t *token = key;
-	struct bayes_statfile_data *sf;
-	guint64 spam_hits = 0, ham_hits = 0, all_hits = 0;
-	double token_spam_prob, spam_ratio, ham_ratio, smoothed_prob;
-	guint idx;
-
-	for (idx = 0; idx < cbdata->statfiles_num; idx++) {
-		sf = &cbdata->statfiles[idx];
-		sf->value = statfile_pool_get_block (cbdata->pool, sf->file,
-				token->h1, token->h2, cbdata->now);
-
-		if (!(sf->value > 0)) {
-			/* This statfile does not know the token */
-			continue;
-		}
-
-		sf->total_hits += sf->value;
-		if (sf->st->is_spam) {
-			spam_hits += sf->value;
-		}
-		else {
-			ham_hits += sf->value;
-		}
-		all_hits += sf->value;
-	}
-
-	if (all_hits > 0) {
-		/* Frequencies are relative to the number of learned messages per class */
-		spam_ratio = ((double)spam_hits / MAX (1., (double)cbdata->total_spam));
-		ham_ratio = ((double)ham_hits / MAX (1., (double)cbdata->total_ham));
-		token_spam_prob = spam_ratio / (spam_ratio + ham_ratio);
-		/* Pull the estimate towards 0.5 for tokens with few hits */
-		smoothed_prob = (0.5 + token_spam_prob * all_hits) / (1. + all_hits);
-		cbdata->spam_probability += log (smoothed_prob);
-		cbdata->ham_probability += log (1. - smoothed_prob);
-		cbdata->processed_tokens++;
-	}
-
-	/* Stop classifying once the optional token limit has been exceeded */
-	return cbdata->max_tokens != 0 &&
-		   cbdata->processed_tokens > cbdata->max_tokens;
-}
-
-/*
- * Allocate a classifier context from `pool` and bind it to `cfg`.
- * Every field is set explicitly because the context is obtained with
- * rspamd_mempool_alloc, not the zeroing rspamd_mempool_alloc0 variant.
- */
-struct classifier_ctx *
-bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
-{
-	struct classifier_ctx *ctx =
-		rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
-
-	ctx->pool = pool;
-	ctx->cfg = cfg;
-	ctx->results = NULL;
-	ctx->debug = FALSE;
-
-	return ctx;
-}
-
-gboolean
-bayes_classify (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- lua_State *L)
-{ /*
- * Classify the token tree `input` against the statfiles returned by the Lua
- * pre-callbacks (or, when they return nothing, the configured statfiles).
- * When the combined probability differs from 0.5 by more than 0.05, the
- * symbol of the selected statfile is inserted into the task result.
- * Returns FALSE only when the message has fewer tokens than "min_tokens",
- * TRUE in every other case.
- */
- struct bayes_callback_data data;
- gchar *value;
- gint nodes, i = 0, selected_st = -1, cnt;
- gint minnodes;
- guint64 maxhits = 0, rev;
- double final_prob, h, s;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
- GList *cur;
- char *sumbuf;
-
- g_assert (pool != NULL);
- g_assert (ctx != NULL);
-
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
- minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) { /* large trees are scaled down before the comparison */
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
- if (nodes < minnodes) {
- return FALSE;
- }
- }
-
- cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
- if (cur) { /* the returned list is released together with the task pool */
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free, cur);
- }
- else { /* nothing returned: use every configured statfile */
- cur = ctx->cfg->statfiles;
- }
-
- data.statfiles_num = g_list_length (cur);
- data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
- data.pool = pool;
- data.now = time (NULL);
- data.ctx = ctx;
-
- data.processed_tokens = 0;
- data.spam_probability = 0;
- data.ham_probability = 0;
- data.total_ham = 0;
- data.total_spam = 0;
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
- minnodes = rspamd_config_parse_limit (value, -1); /* minnodes is reused as scratch for the parsed limit */
- data.max_tokens = minnodes;
- }
- else {
- data.max_tokens = 0;
- }
-
- while (cur) {
- /* Select statfile to classify */
- st = cur->data;
- if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((file =
- statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
- msg_warn ("cannot open %s", st->path);
- cur = g_list_next (cur);
- data.statfiles_num--; /* keeps the count equal to the number of filled slots */
- continue;
- }
- }
- data.statfiles[i].file = file;
- data.statfiles[i].st = st;
- statfile_get_revision (file, &rev, NULL);
- if (st->is_spam) { /* revisions are used as the per-class totals in the callback */
- data.total_spam += rev;
- }
- else {
- data.total_ham += rev;
- }
-
- cur = g_list_next (cur);
- i++;
- }
-
- cnt = i; /* number of statfiles that were actually opened */
-
- g_tree_foreach (input, bayes_classify_callback, &data);
-
- if (data.processed_tokens == 0 || data.spam_probability == 0) {
- final_prob = 0;
- }
- else { /* combine the two log-probability sums through inv_chi_square */
- h = 1 - inv_chi_square (-2. * data.spam_probability,
- 2 * data.processed_tokens);
- s = 1 - inv_chi_square (-2. * data.ham_probability,
- 2 * data.processed_tokens);
- final_prob = (s + 1 - h) / 2.;
- }
-
- if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
-
- sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
- for (i = 0; i < cnt; i++) {
- if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) ||
- (final_prob < 0.5 && data.statfiles[i].st->is_spam)) { /* statfile belongs to the other class */
- continue;
- }
- if (data.statfiles[i].total_hits > maxhits) { /* prefer the statfile with the most hits */
- maxhits = data.statfiles[i].total_hits;
- selected_st = i;
- }
- }
- if (selected_st == -1) {
- msg_err (
- "unexpected classifier error: cannot select desired statfile");
- }
- else {
- /* Calculate ham probability correctly */
- if (final_prob < 0.5) {
- final_prob = 1. - final_prob;
- }
- rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
- cur = g_list_prepend (NULL, sumbuf); /* single-element list carrying the formatted percentage */
- rspamd_task_insert_result (task,
- data.statfiles[selected_st].st->symbol,
- final_prob,
- cur);
- }
- }
-
- g_free (data.statfiles);
-
- return TRUE;
-}
-
-/*
- * Learn the tokens of `input` into the single statfile whose symbol equals
- * `symbol`; the statfile is created when it cannot be opened.
- *
- * @param ctx classifier context, must not be NULL
- * @param pool statfile pool, must not be NULL
- * @param symbol symbol of the statfile to learn
- * @param input tree of tokens
- * @param in_class TRUE to increment token counters, FALSE to decrement them
- * @param sum optional output, may be NULL: number of processed tokens, or 0
- * when the message is rejected for having too few tokens
- * @param multiplier not used by this classifier
- * @param err error location
- * @return TRUE on success, FALSE with `err` set otherwise
- */
-gboolean
-bayes_learn (struct classifier_ctx * ctx,
-	statfile_pool_t *pool,
-	const char *symbol,
-	GTree *input,
-	gboolean in_class,
-	double *sum,
-	double multiplier,
-	GError **err)
-{
-	struct bayes_callback_data data;
-	gchar *value;
-	gint nodes;
-	gint minnodes;
-	struct rspamd_statfile_config *st, *sel_st = NULL;
-	stat_file_t *to_learn;
-	GList *cur;
-
-	g_assert (pool != NULL);
-	g_assert (ctx != NULL);
-
-	if (ctx->cfg->opts &&
-		(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-		minnodes = strtol (value, NULL, 10);
-		nodes = g_tree_nnodes (input);
-		if (nodes > FEATURE_WINDOW_SIZE) {
-			nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-		}
-		if (nodes < minnodes) {
-			msg_info (
-				"do not learn message as it has too few tokens: %d, while %d min",
-				nodes,
-				minnodes);
-			/* sum is optional here exactly as on the success path below */
-			if (sum != NULL) {
-				*sum = 0;
-			}
-			g_set_error (err,
-				bayes_error_quark (),		/* error domain */
-				1,							/* error code */
-				"message contains too few tokens: %d, while min is %d",
-				nodes, (int)minnodes);
-			return FALSE;
-		}
-	}
-
-	data.pool = pool;
-	data.in_class = in_class;
-	data.now = time (NULL);
-	data.ctx = ctx;
-	data.processed_tokens = 0;
-	if (ctx->cfg->opts &&
-		(value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
-		/* minnodes is reused as scratch for the parsed limit */
-		minnodes = rspamd_config_parse_limit (value, -1);
-		data.max_tokens = minnodes;
-	}
-	else {
-		data.max_tokens = 0;
-	}
-	cur = ctx->cfg->statfiles;
-	while (cur) {
-		/* Select statfile to learn */
-		st = cur->data;
-		if (strcmp (st->symbol, symbol) == 0) {
-			sel_st = st;
-			break;
-		}
-		cur = g_list_next (cur);
-	}
-	if (sel_st == NULL) {
-		g_set_error (err,
-			bayes_error_quark (),		/* error domain */
-			1,							/* error code */
-			"cannot find statfile for symbol: %s",
-			symbol);
-		return FALSE;
-	}
-	if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
-		if ((to_learn =
-			statfile_pool_open (pool, sel_st->path, sel_st->size,
-			FALSE)) == NULL) {
-			/* Not openable: create the statfile and retry once */
-			msg_warn ("cannot open %s", sel_st->path);
-			if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
-				msg_err ("cannot create statfile %s", sel_st->path);
-				g_set_error (err,
-					bayes_error_quark (),		/* error domain */
-					1,							/* error code */
-					"cannot create statfile: %s",
-					sel_st->path);
-				return FALSE;
-			}
-			if ((to_learn =
-				statfile_pool_open (pool, sel_st->path, sel_st->size,
-				FALSE)) == NULL) {
-				g_set_error (err,
-					bayes_error_quark (),		/* error domain */
-					1,							/* error code */
-					"cannot open statfile %s after creation",
-					sel_st->path);
-				msg_err ("cannot open statfile %s after creation",
-					sel_st->path);
-				return FALSE;
-			}
-		}
-	}
-	data.file = to_learn;
-	/* The whole traversal and the revision bump happen under the file lock */
-	statfile_pool_lock_file (pool, data.file);
-	g_tree_foreach (input, bayes_learn_callback, &data);
-	statfile_inc_revision (to_learn);
-	statfile_pool_unlock_file (pool, data.file);
-
-	if (sum != NULL) {
-		*sum = data.processed_tokens;
-	}
-
-	return TRUE;
-}
-
-/*
- * Learn the token tree into every statfile of the requested class (spam or
- * ham). Statfiles come from the Lua pre-callbacks; when those return nothing
- * the configured list is used and statfiles carrying a label are skipped.
- * Returns FALSE with `err` set when the message has too few tokens or a
- * statfile can be neither opened nor created, TRUE otherwise.
- */
-gboolean
-bayes_learn_spam (struct classifier_ctx * ctx,
-	statfile_pool_t *pool,
-	GTree *input,
-	struct rspamd_task *task,
-	gboolean is_spam,
-	lua_State *L,
-	GError **err)
-{
-	struct bayes_callback_data cbdata;
-	struct rspamd_statfile_config *stcf;
-	stat_file_t *sfile;
-	GList *iter;
-	gchar *opt;
-	gint ntokens;
-	gint limit;
-	gboolean ignore_labelled;
-
-	g_assert (pool != NULL);
-	g_assert (ctx != NULL);
-
-	opt = NULL;
-	if (ctx->cfg->opts) {
-		opt = g_hash_table_lookup (ctx->cfg->opts, "min_tokens");
-	}
-	if (opt != NULL) {
-		limit = strtol (opt, NULL, 10);
-		ntokens = g_tree_nnodes (input);
-		if (ntokens > FEATURE_WINDOW_SIZE) {
-			ntokens = ntokens / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-		}
-		if (ntokens < limit) {
-			g_set_error (err,
-				bayes_error_quark (),
-				1,
-				"message contains too few tokens: %d, while min is %d",
-				ntokens, (int)limit);
-			return FALSE;
-		}
-	}
-
-	iter = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
-	/* Do not try to learn specific statfiles if pre callback returned nil */
-	ignore_labelled = (iter == NULL);
-	if (iter != NULL) {
-		rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t)g_list_free, iter);
-	}
-	else {
-		iter = ctx->cfg->statfiles;
-	}
-
-	cbdata.pool = pool;
-	cbdata.now = time (NULL);
-	cbdata.ctx = ctx;
-	cbdata.in_class = TRUE;
-	cbdata.processed_tokens = 0;
-
-	opt = NULL;
-	if (ctx->cfg->opts) {
-		opt = g_hash_table_lookup (ctx->cfg->opts, "max_tokens");
-	}
-	if (opt != NULL) {
-		limit = rspamd_config_parse_limit (opt, -1);
-		cbdata.max_tokens = limit;
-	}
-	else {
-		cbdata.max_tokens = 0;
-	}
-
-	for (; iter != NULL; iter = g_list_next (iter)) {
-		stcf = iter->data;
-
-		/* Only statfiles of the requested class are learned */
-		if (stcf->is_spam != is_spam || (ignore_labelled && stcf->label)) {
-			continue;
-		}
-
-		sfile = statfile_pool_is_open (pool, stcf->path);
-		if (sfile == NULL) {
-			sfile = statfile_pool_open (pool, stcf->path, stcf->size, FALSE);
-		}
-		if (sfile == NULL) {
-			/* Not openable: create the statfile and retry once */
-			msg_warn ("cannot open %s", stcf->path);
-			if (statfile_pool_create (pool, stcf->path, stcf->size) == -1) {
-				msg_err ("cannot create statfile %s", stcf->path);
-				g_set_error (err,
-					bayes_error_quark (),
-					1,
-					"cannot create statfile: %s",
-					stcf->path);
-				return FALSE;
-			}
-			sfile = statfile_pool_open (pool, stcf->path, stcf->size, FALSE);
-			if (sfile == NULL) {
-				g_set_error (err,
-					bayes_error_quark (),
-					1,
-					"cannot open statfile %s after creation",
-					stcf->path);
-				msg_err ("cannot open statfile %s after creation",
-					stcf->path);
-				return FALSE;
-			}
-		}
-
-		cbdata.file = sfile;
-		statfile_pool_lock_file (pool, cbdata.file);
-		g_tree_foreach (input, bayes_learn_callback, &cbdata);
-		statfile_inc_revision (sfile);
-		statfile_pool_unlock_file (pool, cbdata.file);
-		maybe_write_binlog (ctx->cfg, stcf, sfile, input);
-		msg_info ("increase revision for %s", stcf->path);
-	}
-
-	return TRUE;
-}
-
-/*
- * Weights are not implemented for the bayes classifier with the new
- * normalizer, so no list is ever produced and all arguments are ignored.
- */
-GList *
-bayes_weights (struct classifier_ctx * ctx,
-	statfile_pool_t *pool,
-	GTree *input,
-	struct rspamd_task *task)
-{
-	GList *weights = NULL;
-
-	return weights;
-}
+++ /dev/null
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Common classifier functions
- */
-
-#include "classifiers.h"
-
-struct classifier classifiers[] = { /* Table of built-in classifiers, searched by name in get_classifier () */
- {
- .name = "winnow",
- .init_func = winnow_init,
- .classify_func = winnow_classify,
- .learn_func = winnow_learn,
- .learn_spam_func = winnow_learn_spam,
- .weights_func = winnow_weights
- },
- {
- .name = "bayes",
- .init_func = bayes_init,
- .classify_func = bayes_classify,
- .learn_func = bayes_learn,
- .learn_spam_func = bayes_learn_spam,
- .weights_func = bayes_weights /* always returns NULL */
- }
-};
-
-/*
- * Look up a classifier in the classifiers[] table by exact name match.
- * Returns a pointer to the table entry, or NULL when `name` is NULL or
- * does not match any entry.
- */
-struct classifier *
-get_classifier (const char *name)
-{
-	guint i;
-
-	if (name == NULL) {
-		/* strcmp must not be called with a NULL argument */
-		return NULL;
-	}
-
-	for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) {
-		if (strcmp (classifiers[i].name, name) == 0) {
-			return &classifiers[i];
-		}
-	}
-
-	return NULL;
-}
-
-/*
- * vi:ts=4
- */
+++ /dev/null
-#ifndef CLASSIFIERS_H
-#define CLASSIFIERS_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "statfile.h"
-#include "tokenizers/tokenizers.h"
-#include <lua.h>
-
-/* Consider this value as 0 */
-#define ALPHA 0.0001
-
-struct rspamd_classifier_config;
-struct rspamd_task;
-
-struct classifier_ctx { /* Per-classifier context created by the init functions */
- rspamd_mempool_t *pool; /* memory pool the context itself was allocated from */
- GHashTable *results; /* not referenced by the code visible in this section */
- gboolean debug; /* set to FALSE by bayes_init */
- struct rspamd_classifier_config *cfg; /* classifier configuration: options and statfile list */
-};
-
-struct classify_weight { /* One element of the list returned by a weights function */
- const char *name; /* statfile symbol */
- long double weight; /* weight computed for that statfile */
-};
-
-/*
- * Common classifier structure: a named set of entry points. Instances live
- * in the classifiers[] table and are found with get_classifier ().
- */
-struct classifier {
- char *name; /* name matched by get_classifier () */
- struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
- struct rspamd_classifier_config *cf); /* creates the context from the pool */
- gboolean (*classify_func)(struct classifier_ctx * ctx,
- statfile_pool_t *pool, GTree *input, struct rspamd_task *task,
- lua_State *L); /* classifies a token tree and inserts a result into the task */
- gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
- const char *symbol, GTree *input, gboolean in_class,
- double *sum, double multiplier, GError **err); /* learns into the statfile selected by symbol */
- gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L,
- GError **err); /* learns into the statfiles of one class */
- GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task); /* returns a list of struct classify_weight, or NULL */
-};
-
-/*
- * Get classifier structure by name (exact strcmp match against the entries
- * of classifiers[]) or return NULL if this name is not found
- */
-struct classifier * get_classifier (const char *name);
-
-/*
- * Winnow algorithm: the entry points referenced by the "winnow" element of
- * classifiers[]
- */
-struct classifier_ctx * winnow_init (rspamd_mempool_t *pool,
- struct rspamd_classifier_config *cf);
-gboolean winnow_classify (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- lua_State *L);
-gboolean winnow_learn (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- const char *symbol,
- GTree *input,
- gboolean in_class,
- double *sum,
- double multiplier,
- GError **err);
-gboolean winnow_learn_spam (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- gboolean is_spam,
- lua_State *L,
- GError **err);
-GList * winnow_weights (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task);
-
-/*
- * Bayes algorithm: the entry points referenced by the "bayes" element of
- * classifiers[]
- */
-struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
- struct rspamd_classifier_config *cf);
-gboolean bayes_classify (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- lua_State *L);
-gboolean bayes_learn (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- const char *symbol,
- GTree *input,
- gboolean in_class,
- double *sum,
- double multiplier,
- GError **err);
-gboolean bayes_learn_spam (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- gboolean is_spam,
- lua_State *L,
- GError **err);
-GList * bayes_weights (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task);
-/* Array of all defined classifiers; get_classifier () searches it by name */
-extern struct classifier classifiers[];
-
-#endif
-/*
- * vi:ts=4
- */
+++ /dev/null
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Winnow classifier
- */
-
-#include "classifiers.h"
-#include "tokenizers/tokenizers.h"
-#include "main.h"
-#include "filter.h"
-#include "cfg_file.h"
-#include "lua/lua_common.h"
-
-/* Factor applied to a block when a token is learned in-class */
-#define WINNOW_PROMOTION 1.23
-/* Factor applied to a block when a token is learned out-of-class or demoted */
-#define WINNOW_DEMOTION 0.83
-
-#define MEDIAN_WINDOW_SIZE 5
-
-/*
- * Upper bound for a block weight. The expansion is parenthesized so that it
- * stays a single operand inside any surrounding expression.
- */
-#define MAX_WEIGHT (G_MAXDOUBLE / 2.)
-
-
-
-#define MAX_LEARN_ITERATIONS 100
-
-static inline GQuark
-winnow_error_quark (void)
-{
-	/* Error domain attached to every GError raised by the winnow classifier */
-	const GQuark domain = g_quark_from_static_string ("winnow-error");
-
-	return domain;
-}
-
-struct winnow_callback_data { /* State shared between the winnow entry points and their per-token tree callbacks */
- statfile_pool_t *pool; /* statfile pool passed to the block get/set calls */
- struct classifier_ctx *ctx; /* classifier context of the caller */
- stat_file_t *file; /* statfile currently being read or updated */
- stat_file_t *learn_file; /* target statfile; the learn callback compares it with file */
- long double sum; /* accumulated token weights */
- long double start; /* learn callback: accumulated weights as read before any update */
- double multiplier; /* scales promotion and divides demotion in the learn callback */
- guint32 count; /* number of tokens visited */
- guint32 new_blocks; /* number of tokens that had no block in the statfile */
- gboolean in_class; /* learn callback: TRUE promotes, FALSE demotes */
- gboolean do_demote; /* learn callback: demote blocks found in non-target statfiles */
- gboolean fresh_run; /* learn callback: reset node->extra to 0 instead of incrementing it */
- time_t now; /* timestamp passed to the block get/set calls */
-};
-
-static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION; /* winnow_learn_callback resets to 0 any repeatedly seen block whose weight exceeds this */
-
-
-
-/*
- * Tree traversal callback used while classifying: adds the weight of one
- * token in the current statfile to the running sum. Never stops the traversal.
- */
-static gboolean
-winnow_classify_callback (gpointer key, gpointer value, gpointer data)
-{
-	struct winnow_callback_data *cbdata = data;
-	token_node_t *token = key;
-	double weight;
-
-	weight = statfile_pool_get_block (cbdata->pool, cbdata->file,
-			token->h1, token->h2, cbdata->now);
-
-	if (!(fabs (weight) > ALPHA)) {
-		/* Consider that not found blocks have value 1 */
-		cbdata->sum += 1.0;
-		cbdata->new_blocks++;
-	}
-	else {
-		cbdata->sum += weight;
-	}
-
-	cbdata->count++;
-
-	return FALSE;
-}
-
-static gboolean
-winnow_learn_callback (gpointer key, gpointer value, gpointer data)
-{ /*
- * Tree traversal callback used while learning: updates the weight of one
- * token in cd->file and adds the resulting node->value to cd->sum.
- * Never stops the traversal.
- */
- token_node_t *node = key;
- struct winnow_callback_data *cd = data;
- double v, c;
-
- c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION /
- cd->multiplier; /* factor applied to blocks of the target statfile */
-
- /* Consider that not found blocks have value 1 */
- v =
- statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
- cd->now);
- if (fabs (v) < ALPHA) {
- /* Block not found, insert new */
- cd->start += 1;
- if (cd->file == cd->learn_file) { /* new blocks are only created in the target statfile */
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- c);
- node->value = c;
- cd->new_blocks++;
- } /* NOTE(review): otherwise node->value keeps its previous content and is still added to cd->sum below - confirm this is intended */
- }
- else {
- cd->start += v;
- /* Here we just increase the extra value of block */
- if (cd->fresh_run) {
- node->extra = 0;
- }
- else {
- node->extra++;
- }
- node->value = v;
-
- if (node->extra > 1) {
- /*
- * Assume that this node is common for several statfiles, so
- * decrease its weight proportionally
- */
- if (node->value > max_common_weight) {
- /* Static fluctuation */
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- 0.);
- node->value = 0.;
- }
- else if (node->value > WINNOW_PROMOTION * cd->multiplier) {
- /* Try to decrease its value */
- /* XXX: it is more intelligent to add some adaptive filter here */
- if (cd->file == cd->learn_file) {
- if (node->value > max_common_weight / 2.) {
- node->value *= c;
- }
- else {
- /*
- * Too high token value that exists also in other
- * statfiles, may be statistic error, so decrease it
- * slightly
- */
- node->value *= WINNOW_DEMOTION;
- }
- }
- else { /* non-target statfile: overwrite with the demotion factor */
- node->value = WINNOW_DEMOTION / cd->multiplier;
- }
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- node->value);
- }
- }
- else if (cd->file == cd->learn_file) {
- /* New block or block that is in only one statfile */
- /* Set some limit on growing */
- if (v > MAX_WEIGHT) { /* already above the cap: keep the stored weight */
- node->value = v;
- }
- else {
- node->value *= c;
- }
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- node->value);
- }
- else if (cd->do_demote) {
- /* Demote blocks in file */
- node->value *= WINNOW_DEMOTION / cd->multiplier;
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- node->value);
- }
- }
-
-
- cd->sum += node->value;
-
- cd->count++;
-
- return FALSE;
-}
-
-/*
- * Allocate a classifier context from `pool` and bind it to `cfg`.
- * Every field is set explicitly because the context is obtained with
- * rspamd_mempool_alloc, not the zeroing rspamd_mempool_alloc0 variant;
- * this also matches what bayes_init does for `debug`.
- */
-struct classifier_ctx *
-winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
-{
-	struct classifier_ctx *ctx =
-		rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
-
-	ctx->pool = pool;
-	ctx->cfg = cfg;
-	ctx->results = NULL;
-	ctx->debug = FALSE;
-
-	return ctx;
-}
-
-/*
- * Classify the token tree against every statfile returned by the Lua
- * pre-callbacks (or the configured ones) and insert the symbol of the
- * statfile with the highest mean token weight into the task result.
- * Returns FALSE only when the message has fewer tokens than "min_tokens".
- */
-gboolean
-winnow_classify (struct classifier_ctx *ctx,
-	statfile_pool_t * pool,
-	GTree * input,
-	struct rspamd_task *task,
-	lua_State *L)
-{
-	struct winnow_callback_data cbdata;
-	struct rspamd_statfile_config *stcf, *best_st = NULL;
-	long double score = 0., best = 0.;
-	GList *iter;
-	char *result_str, *opt;
-	int ntokens, min_tokens;
-
-	g_assert (pool != NULL);
-	g_assert (ctx != NULL);
-
-	cbdata.pool = pool;
-	cbdata.now = time (NULL);
-	cbdata.ctx = ctx;
-
-	opt = NULL;
-	if (ctx->cfg->opts) {
-		opt = g_hash_table_lookup (ctx->cfg->opts, "min_tokens");
-	}
-	if (opt != NULL) {
-		min_tokens = strtol (opt, NULL, 10);
-		ntokens = g_tree_nnodes (input);
-		if (ntokens > FEATURE_WINDOW_SIZE) {
-			ntokens = ntokens / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-		}
-		if (ntokens < min_tokens) {
-			msg_info (
-				"do not classify message as it has too few tokens: %d, while %d min",
-				ntokens,
-				min_tokens);
-			return FALSE;
-		}
-	}
-
-	iter = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
-	if (iter != NULL) {
-		rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t)g_list_free, iter);
-	}
-	else {
-		iter = ctx->cfg->statfiles;
-	}
-
-	for (; iter != NULL; iter = g_list_next (iter)) {
-		stcf = iter->data;
-		cbdata.sum = 0;
-		cbdata.count = 0;
-		cbdata.new_blocks = 0;
-
-		cbdata.file = statfile_pool_is_open (pool, stcf->path);
-		if (cbdata.file == NULL) {
-			cbdata.file = statfile_pool_open (pool, stcf->path, stcf->size,
-					FALSE);
-			if (cbdata.file == NULL) {
-				msg_warn ("cannot open %s, skip it", stcf->path);
-				continue;
-			}
-		}
-
-		/* The statfile is known to be open at this point */
-		g_tree_foreach (input, winnow_classify_callback, &cbdata);
-
-		if (cbdata.count != 0) {
-			score = cbdata.sum / (double)cbdata.count;
-		}
-		else {
-			score = 0;
-		}
-		if (score > best) {
-			best = score;
-			best_st = stcf;
-		}
-	}
-
-	if (best_st == NULL) {
-		/* No statfile scored above zero, nothing to insert */
-		return TRUE;
-	}
-
-#ifdef WITH_LUA
-	best = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, best, L);
-#endif
-#ifdef HAVE_TANHL
-	best = tanhl (best);
-#else
-	/*
-	 * Some implementations of libm do not support tanhl, so fall back to
-	 * tanh
-	 */
-	best = tanh ((double) best);
-#endif
-	result_str = rspamd_mempool_alloc (task->task_pool, 32);
-	rspamd_snprintf (result_str, 32, "%.2F", best);
-	iter = g_list_prepend (NULL, result_str);
-	rspamd_task_insert_result (task, best_st->symbol, best, iter);
-
-	return TRUE;
-}
-
-/*
- * Compute the mean token weight of `input` for every configured statfile
- * that can be opened.
- *
- * @param ctx classifier context, must not be NULL
- * @param pool statfile pool, must not be NULL
- * @param input tree of tokens
- * @param task task whose pool owns the returned elements and the list
- * @return list of struct classify_weight, built by prepending (so in reverse
- * configuration order), or NULL when the message has too few tokens or no
- * statfile could be opened
- */
-GList *
-winnow_weights (struct classifier_ctx *ctx,
-	statfile_pool_t * pool,
-	GTree * input,
-	struct rspamd_task *task)
-{
-	struct winnow_callback_data data;
-	long double res = 0.;
-	GList *cur, *resl = NULL;
-	struct rspamd_statfile_config *st;
-	struct classify_weight *w;
-	char *value;
-	int nodes, minnodes;
-
-	g_assert (pool != NULL);
-	g_assert (ctx != NULL);
-
-	data.pool = pool;
-	data.now = time (NULL);
-	data.ctx = ctx;
-
-	if (ctx->cfg->opts &&
-		(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
-		minnodes = strtol (value, NULL, 10);
-		nodes = g_tree_nnodes (input);
-		if (nodes > FEATURE_WINDOW_SIZE) {
-			nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
-		}
-		if (nodes < minnodes) {
-			msg_info (
-				"do not classify message as it has too few tokens: %d, while %d min",
-				nodes,
-				minnodes);
-			return NULL;
-		}
-	}
-
-	cur = ctx->cfg->statfiles;
-	while (cur) {
-		st = cur->data;
-		data.sum = 0;
-		data.count = 0;
-		/* winnow_classify_callback increments this counter, so it must start defined */
-		data.new_blocks = 0;
-		if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
-			if ((data.file =
-				statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
-				msg_warn ("cannot open %s, skip it", st->path);
-				cur = g_list_next (cur);
-				continue;
-			}
-		}
-
-		if (data.file != NULL) {
-			g_tree_foreach (input, winnow_classify_callback, &data);
-		}
-
-		w =
-			rspamd_mempool_alloc0 (task->task_pool,
-				sizeof (struct classify_weight));
-		if (data.count != 0) {
-			res = data.sum / (double)data.count;
-		}
-		else {
-			res = 0;
-		}
-		w->name = st->symbol;
-		w->weight = res;
-		resl = g_list_prepend (resl, w);
-		cur = g_list_next (cur);
-	}
-
-	if (resl != NULL) {
-		/* The list nodes are released together with the task pool */
-		rspamd_mempool_add_destructor (task->task_pool,
-			(rspamd_mempool_destruct_t)g_list_free, resl);
-	}
-
-	return resl;
-
-}
-
-
-gboolean
-winnow_learn (struct classifier_ctx *ctx,
- statfile_pool_t *pool,
- const char *symbol,
- GTree * input,
- int in_class,
- double *sum,
- double multiplier,
- GError **err)
-{
- struct winnow_callback_data data = {
- .file = NULL,
- .multiplier = multiplier
- };
- char *value;
- int nodes, minnodes, iterations = 0;
- struct rspamd_statfile_config *st, *sel_st = NULL;
- stat_file_t *sel = NULL, *to_learn;
- long double res = 0., max = 0., start_value = 0., end_value = 0.;
- double learn_threshold = 0.0;
- GList *cur, *to_demote = NULL;
- gboolean force_learn = FALSE;
-
- g_assert (pool != NULL);
- g_assert (ctx != NULL);
-
- data.pool = pool;
- data.in_class = in_class;
- data.now = time (NULL);
- data.ctx = ctx;
-
-
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
- minnodes = strtol (value, NULL, 10);
- nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) {
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
- if (nodes < minnodes) {
- msg_info (
- "do not learn message as it has too few tokens: %d, while %d min",
- nodes,
- minnodes);
- if (sum != NULL) {
- *sum = 0;
- }
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, minnodes);
- return FALSE;
- }
- }
- if (ctx->cfg->opts &&
- (value =
- g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
- learn_threshold = strtod (value, NULL);
- }
-
- if (learn_threshold <= 1.0 && learn_threshold >= 0) {
- /* Classify message and check target statfile score */
- cur = ctx->cfg->statfiles;
- while (cur) {
- /* Open or create all statfiles inside classifier */
- st = cur->data;
- if (statfile_pool_is_open (pool, st->path) == NULL) {
- if (statfile_pool_open (pool, st->path, st->size,
- FALSE) == NULL) {
- msg_warn ("cannot open %s", st->path);
- if (statfile_pool_create (pool, st->path, st->size) == -1) {
- msg_err ("cannot create statfile %s", st->path);
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- st->path);
- return FALSE;
- }
- if (statfile_pool_open (pool, st->path, st->size,
- FALSE) == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "open statfile %s after creation",
- st->path);
- msg_err ("cannot open statfile %s after creation",
- st->path);
- return FALSE;
- }
- }
- }
- if (strcmp (st->symbol, symbol) == 0) {
- sel_st = st;
-
- }
- cur = g_list_next (cur);
- }
-
- if (sel_st == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "cannot find statfile for symbol %s",
- symbol);
- msg_err ("cannot find statfile for symbol %s", symbol);
- return FALSE;
- }
-
- to_learn = statfile_pool_is_open (pool, sel_st->path);
- if (to_learn == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
- sel_st->path);
- return FALSE;
- }
- /* Check target statfile */
- data.file = to_learn;
- data.sum = 0;
- data.count = 0;
- data.new_blocks = 0;
- g_tree_foreach (input, winnow_classify_callback, &data);
- if (data.count > 0) {
- max = data.sum / (double)data.count;
- }
- else {
- max = 0;
- }
- /* If most of blocks are not presented in targeted statfile do forced learn */
- if (max < 1 + learn_threshold) {
- force_learn = TRUE;
- }
- /* Check other statfiles */
- while (cur) {
- st = cur->data;
- data.sum = 0;
- data.count = 0;
- if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
- st->path);
- return FALSE;
- }
- g_tree_foreach (input, winnow_classify_callback, &data);
- if (data.count != 0) {
- res = data.sum / data.count;
- }
- else {
- res = 0;
- }
- if (to_learn != data.file && res - max > 1 - learn_threshold) {
- /* Demote tokens in this statfile */
- to_demote = g_list_prepend (to_demote, data.file);
- }
- cur = g_list_next (cur);
- }
- }
- else {
- msg_err (
- "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "bad learn_threshold setting: %.2f",
- learn_threshold);
- return FALSE;
- }
- /* If to_demote list is empty this message is already classified correctly */
- if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
- msg_info (
- "this message is already of class %s with threshold %.2f and weight %.2F",
- sel_st->symbol,
- learn_threshold,
- max);
- goto end;
- }
- data.learn_file = to_learn;
- end_value = max;
- do {
- cur = ctx->cfg->statfiles;
- data.fresh_run = TRUE;
- while (cur) {
- st = cur->data;
- data.sum = 0;
- data.count = 0;
- data.new_blocks = 0;
- data.start = 0;
- if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- return FALSE;
- }
- if (to_demote != NULL &&
- g_list_find (to_demote, data.file) != NULL) {
- data.do_demote = TRUE;
- }
- else {
- data.do_demote = FALSE;
- }
-
- statfile_pool_lock_file (pool, data.file);
- g_tree_foreach (input, winnow_learn_callback, &data);
- statfile_pool_unlock_file (pool, data.file);
- if (data.count != 0) {
- res = data.sum / data.count;
- }
- else {
- res = 0;
- }
- if (res > max) {
- max = res;
- sel = data.file;
- }
- if (data.file == to_learn) {
- if (data.count > 0) {
- start_value = data.start / data.count;
- }
- end_value = res;
- }
- cur = g_list_next (cur);
- data.fresh_run = FALSE;
- }
-
- data.multiplier *= WINNOW_PROMOTION;
- msg_info (
- "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f",
- iterations + 1,
- symbol,
- start_value,
- end_value,
- data.multiplier);
- } while ((in_class ? sel != to_learn : sel ==
- to_learn) && iterations++ < MAX_LEARN_ITERATIONS);
-
- if (iterations >= MAX_LEARN_ITERATIONS) {
- msg_warn (
- "learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G",
- sel_st->symbol,
- MAX_LEARN_ITERATIONS,
- max);
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "learning statfile %s was not fully successfull: iterations count is limited to %d",
- sel_st->symbol, MAX_LEARN_ITERATIONS);
- return FALSE;
- }
- else {
- msg_info (
- "learned statfile %s successfully with %d iterations and sum %G",
- sel_st->symbol,
- iterations + 1,
- max);
- }
-
-
-end:
- if (sum) {
-#ifdef HAVE_TANHL
- *sum = (double)tanhl (max);
-#else
- /*
- * As some implementations of libm does not support tanhl, try to use
- * tanh
- */
- *sum = tanh ((double) max);
-#endif
- }
- return TRUE;
-}
-
-gboolean
-winnow_learn_spam (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- gboolean is_spam,
- lua_State *L,
- GError **err)
-{
- g_set_error (err,
- winnow_error_quark (), /* error domain */
- 1, /* error code */
- "learn spam is not supported for winnow"
- );
- return FALSE;
-}
#include "config.h"
-#include "tokenizers/tokenizers.h"
-#include "classifiers/classifiers.h"
+#include "tokenizers.h"
+#include "classifiers.h"
#include "libserver/dynamic_cfg.h"
#include "libutil/rrd.h"
#include "libutil/map.h"
#include "expressions.h"
#include "binlog.h"
#include "diff.h"
-#include "classifiers/classifiers.h"
-#include "tokenizers/tokenizers.h"
+#include "classifiers.h"
+#include "tokenizers.h"
#ifdef WITH_LUA
# include "lua/lua_common.h"
#include "html.h"
#include "images.h"
#include "utlist.h"
-#include "tokenizers/tokenizers.h"
+#include "tokenizers.h"
#include <iconv.h>
url.c
worker_util.c)
-SET(TOKENIZERSSRC ../tokenizers/tokenizers.c
- ../tokenizers/osb.c)
-
-SET(CLASSIFIERSSRC ../classifiers/classifiers.c
- ../classifiers/bayes.c
- ../classifiers/winnow.c)
-
# Librspamd-server
#IF(WITH_DB)
# LIST(APPEND LIBRSPAMDSERVERSRC kvstorage_sqlite.c)
#ENDIF(WITH_SQLITE)
-ADD_LIBRARY(rspamd-server ${LINK_TYPE} ${LIBRSPAMDSERVERSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC})
+ADD_LIBRARY(rspamd-server ${LINK_TYPE} ${LIBRSPAMDSERVERSRC})
IF(NOT DEBIAN_BUILD)
SET_TARGET_PROPERTIES(rspamd-server PROPERTIES VERSION ${RSPAMD_VERSION})
ENDIF(NOT DEBIAN_BUILD)
#include "config.h"
#include "binlog.h"
#include "cfg_file.h"
-#include "tokenizers/tokenizers.h"
+#include "tokenizers.h"
#define BINLOG_SUFFIX ".binlog"
#define BACKUP_SUFFIX ".old"
#include "cfg_file.h"
#include "lua/lua_common.h"
#include "expressions.h"
-#include "classifiers/classifiers.h"
-#include "tokenizers/tokenizers.h"
+#include "classifiers.h"
+#include "tokenizers.h"
struct rspamd_rcl_default_handler_data {
#include "main.h"
#include "uthash_strcase.h"
#include "filter.h"
-#include "classifiers/classifiers.h"
+#include "classifiers.h"
#include "lua/lua_common.h"
#include "kvstorage_config.h"
#include "map.h"
#include "config.h"
#include "cfg_file.h"
-#include "tokenizers/tokenizers.h"
-#include "classifiers/classifiers.h"
+#include "tokenizers.h"
+#include "classifiers.h"
#include "statfile.h"
#include "binlog.h"
#include "buffer.h"
--- /dev/null
+# Librspamd-stat: tokenizers and classifiers built as a separate library
+SET(LIBSTATSRC # no sources of its own listed here
+ )
+SET(TOKENIZERSSRC tokenizers/tokenizers.c
+ tokenizers/osb.c)
+
+SET(CLASSIFIERSSRC classifiers/classifiers.c
+ classifiers/bayes.c
+ classifiers/winnow.c)
+
+ADD_LIBRARY(rspamd-stat ${LINK_TYPE} ${LIBSTATSRC} ${TOKENIZERSSRC} ${CLASSIFIERSSRC})
+IF(NOT DEBIAN_BUILD) # library VERSION property is only set outside Debian builds
+ SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES VERSION ${RSPAMD_VERSION})
+ENDIF(NOT DEBIAN_BUILD)
+SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES LINKER_LANGUAGE C COMPILE_FLAGS "-DRSPAMD_LIB")
+TARGET_LINK_LIBRARIES(rspamd-stat rspamd-server)
+
+IF(CMAKE_COMPILER_IS_GNUCC) # GCC only: add -fno-strict-aliasing; -DRSPAMD_LIB is repeated because this sets COMPILE_FLAGS again
+SET_TARGET_PROPERTIES(rspamd-stat PROPERTIES COMPILE_FLAGS "-DRSPAMD_LIB -fno-strict-aliasing")
+ENDIF(CMAKE_COMPILER_IS_GNUCC)
+
+
+IF(NO_SHARED MATCHES "OFF") # install rule is emitted only when NO_SHARED matches "OFF"
+ INSTALL(TARGETS rspamd-stat
+ LIBRARY DESTINATION ${LIBDIR}
+ PUBLIC_HEADER DESTINATION ${INCLUDEDIR})
+ENDIF(NO_SHARED MATCHES "OFF")
--- /dev/null
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "statfile.h"
+#include "tokenizers.h"
+#include <lua.h>
+
+/* Consider this value as 0 */
+#define ALPHA 0.0001
+
+struct rspamd_classifier_config;
+struct rspamd_task;
+
+struct classifier_ctx { /* Classifier context returned by the init_func of a classifier */
+ rspamd_mempool_t *pool; /* memory pool handed to the init function */
+ GHashTable *results; /* NOTE(review): purpose not visible in this chunk - confirm */
+ gboolean debug; /* bayes_init sets this to FALSE */
+ struct rspamd_classifier_config *cfg; /* classifier configuration; its opts and statfiles are read by the classifiers */
+};
+
+struct classify_weight { /* Name/weight pair; NOTE(review): presumably the element type of the list returned by weights_func - confirm */
+ const char *name; /* name the weight belongs to */
+ long double weight; /* weight value */
+};
+
+/* Common classifier structure: one entry of the classifiers[] table, holding
+ * the classifier name and the entry points that implement it */
+struct classifier {
+ char *name; /* name matched by get_classifier () */
+ struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
+ struct rspamd_classifier_config *cf); /* creates the classifier context */
+ gboolean (*classify_func)(struct classifier_ctx * ctx,
+ statfile_pool_t *pool, GTree *input, struct rspamd_task *task,
+ lua_State *L); /* classifies the token tree of a task */
+ gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
+ const char *symbol, GTree *input, gboolean in_class,
+ double *sum, double multiplier, GError **err); /* learns tokens into the statfile selected by symbol */
+ gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L,
+ GError **err); /* learns tokens as spam or ham according to is_spam */
+ GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
+ GTree *input, struct rspamd_task *task); /* returns a list of weights, or NULL */
+};
+
+/* Get classifier structure by name or return NULL if this name is not found */
+struct classifier * get_classifier (const char *name);
+
+/* Winnow algorithm */
+struct classifier_ctx * winnow_init (rspamd_mempool_t *pool,
+ struct rspamd_classifier_config *cf);
+gboolean winnow_classify (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ lua_State *L);
+gboolean winnow_learn (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ const char *symbol,
+ GTree *input,
+ gboolean in_class,
+ double *sum,
+ double multiplier,
+ GError **err);
+gboolean winnow_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err);
+GList * winnow_weights (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task);
+
+/* Bayes algorithm */
+struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
+ struct rspamd_classifier_config *cf);
+gboolean bayes_classify (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ lua_State *L);
+gboolean bayes_learn (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ const char *symbol,
+ GTree *input,
+ gboolean in_class,
+ double *sum,
+ double multiplier,
+ GError **err);
+gboolean bayes_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err);
+GList * bayes_weights (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task);
+/* Array of all defined classifiers */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
--- /dev/null
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Bayesian classifier
+ */
+#include "classifiers.h"
+#include "tokenizers.h"
+#include "main.h"
+#include "filter.h"
+#include "cfg_file.h"
+#include "binlog.h"
+#include "lua/lua_common.h"
+
+#define LOCAL_PROB_DENOM 16.0
+
+static inline GQuark
+bayes_error_quark (void)
+{
+ return g_quark_from_static_string ("bayes-error");
+}
+
+struct bayes_statfile_data { /* Per-statfile state used while classifying one message */
+ guint64 hits; /* NOTE(review): not referenced in the code visible here - confirm it is still needed */
+ guint64 total_hits; /* sum of positive block values read from this statfile */
+ double value; /* block value read for the token currently being processed */
+ struct rspamd_statfile_config *st; /* configuration of this statfile (is_spam, symbol, ...) */
+ stat_file_t *file; /* opened statfile handle */
+};
+
+struct bayes_callback_data { /* State shared between a bayes entry point and its tree traversal callback */
+ statfile_pool_t *pool; /* statfile pool passed to the statfile_pool_* calls */
+ struct classifier_ctx *ctx; /* classifier context of the caller */
+ gboolean in_class; /* learning: TRUE increments token counters, FALSE decrements them */
+ time_t now; /* timestamp passed to block get/set calls */
+ stat_file_t *file; /* statfile being learned */
+ struct bayes_statfile_data *statfiles; /* classification: array of statfiles to query */
+ guint32 statfiles_num; /* number of entries used in statfiles */
+ guint64 total_spam; /* sum of revisions of the spam statfiles */
+ guint64 total_ham; /* sum of revisions of the ham statfiles */
+ guint64 processed_tokens; /* tokens counted so far by the callback */
+ gsize max_tokens; /* stop once processed_tokens exceeds this; 0 means no limit */
+ double spam_probability; /* running sum of log (p) over counted tokens */
+ double ham_probability; /* running sum of log (1 - p) over counted tokens */
+};
+
+/*
+ * Tree traversal callback used while learning: updates the counter stored
+ * in cd->file for a single token.
+ *
+ * When learning in class an existing counter is incremented and a missing
+ * one is created with value 1; when learning out of class an existing
+ * counter is decremented and a missing one is left alone. Every token that
+ * caused a write is counted in cd->processed_tokens.
+ *
+ * Returns TRUE (stop traversal) once cd->max_tokens is non-zero and has been
+ * exceeded, FALSE otherwise.
+ */
+static gboolean
+bayes_learn_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct bayes_callback_data *cd = data;
+	token_node_t *node = key;
+	guint64 hits;
+
+	/* Consider that not found blocks have value 0 */
+	hits = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
+			cd->now);
+
+	if (hits != 0) {
+		/* Known token: move the counter in the requested direction */
+		if (cd->in_class) {
+			hits++;
+		}
+		else {
+			hits--;
+		}
+		statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2,
+			cd->now, hits);
+		cd->processed_tokens++;
+	}
+	else if (cd->in_class) {
+		/* Unknown token: only a positive learn creates it */
+		statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2,
+			cd->now, 1);
+		cd->processed_tokens++;
+	}
+
+	/* Stop learning on max tokens */
+	return (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens);
+}
+
+/**
+ * Returns probability of chisquare > value with specified number of freedom
+ * degrees
+ * @param value value to test
+ * @param freedom_deg number of degrees of freedom, must be even
+ * @return the probability, clamped to at most 1.0; 0 when freedom_deg is odd
+ * or when the exponent calculation reports a range error
+ */
+static gdouble
+inv_chi_square (gdouble value, gint freedom_deg)
+{
+	long double prob, sum;
+	gint i;
+
+	/* The series below is only valid for an even number of degrees */
+	if ((freedom_deg & 1) != 0) {
+		msg_err ("odd freedom degrees count: %d", freedom_deg);
+		return 0;
+	}
+
+	value /= 2.;
+	/* Reset errno so that a range error of the exp call can be detected */
+	errno = 0;
+#ifdef HAVE_EXPL
+	prob = expl (-value);
+#elif defined(HAVE_EXP2L)
+	prob = exp2l (-value * log2 (M_E));
+#else
+	prob = exp (-value);
+#endif
+	if (errno == ERANGE) {
+		msg_err ("exp overflow");
+		return 0;
+	}
+	/* sum = exp (-value) * (1 + value / 1 + value^2 / 2! + ...) */
+	sum = prob;
+	for (i = 1; i < freedom_deg / 2; i++) {
+		prob *= value / (gdouble)i;
+		sum += prob;
+	}
+
+	return MIN (1.0, sum);
+}
+
+/*
+ * In this callback we calculate local probabilities for tokens
+ *
+ * For one token (key) the block value is read from every statfile in
+ * cd->statfiles; positive values are added to that statfile's total_hits and
+ * to the spam or ham count depending on st->is_spam. If the token was found
+ * anywhere, its spam probability is computed and log (p) / log (1 - p) are
+ * added to cd->spam_probability / cd->ham_probability.
+ *
+ * Returns TRUE to stop the traversal when cd->max_tokens is non-zero and has
+ * been exceeded, FALSE otherwise.
+ */
+static gboolean
+bayes_classify_callback (gpointer key, gpointer value, gpointer data)
+{
+
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ guint i;
+ struct bayes_statfile_data *cur;
+ guint64 spam_count = 0, ham_count = 0, total_count = 0;
+ double spam_prob, spam_freq, ham_freq, bayes_spam_prob;
+
+ for (i = 0; i < cd->statfiles_num; i++) {
+ cur = &cd->statfiles[i];
+ cur->value = statfile_pool_get_block (cd->pool,
+ cur->file,
+ node->h1,
+ node->h2,
+ cd->now);
+ if (cur->value > 0) {
+ cur->total_hits += cur->value; /* double value added to an integer counter */
+ if (cur->st->is_spam) {
+ spam_count += cur->value;
+ }
+ else {
+ ham_count += cur->value;
+ }
+ total_count += cur->value;
+ }
+ }
+
+ /* Probability for this token */
+ if (total_count > 0) {
+ spam_freq = ((double)spam_count / MAX (1., (double)cd->total_spam)); /* MAX (1., ...) avoids division by zero */
+ ham_freq = ((double)ham_count / MAX (1., (double)cd->total_ham));
+ spam_prob = spam_freq / (spam_freq + ham_freq);
+ bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count); /* pulls the estimate towards 0.5 when total_count is small */
+ cd->spam_probability += log (bayes_spam_prob);
+ cd->ham_probability += log (1. - bayes_spam_prob);
+ cd->processed_tokens++;
+ }
+
+ if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
+ /* Stop classifying on max tokens */
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+/*
+ * Create a bayes classifier context.
+ *
+ * The context is allocated from `pool` and bound to the classifier
+ * configuration `cfg`. Every field is given a defined value so that no
+ * member of the returned structure is left indeterminate.
+ *
+ * Returns the new context.
+ */
+struct classifier_ctx *
+bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
+{
+	struct classifier_ctx *ctx =
+		rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
+
+	ctx->pool = pool;
+	ctx->cfg = cfg;
+	ctx->debug = FALSE;
+	/* Do not rely on the allocator to clear the remaining member */
+	ctx->results = NULL;
+
+	return ctx;
+}
+
+gboolean
+bayes_classify (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ lua_State *L)
+{ /*
+ * Classify the token tree `input` of `task` with the bayes classifier.
+ *
+ * Returns FALSE only when the "min_tokens" option is set and the message
+ * has too few tokens; otherwise returns TRUE, whether or not a symbol was
+ * inserted into the task result. A symbol is inserted only when at least
+ * one token was counted and the final probability differs from 0.5 by
+ * more than 0.05.
+ */
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes, i = 0, selected_st = -1, cnt;
+ gint minnodes;
+ guint64 maxhits = 0, rev;
+ double final_prob, h, s;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
+ GList *cur;
+ char *sumbuf;
+
+ g_assert (pool != NULL);
+ g_assert (ctx != NULL);
+
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ minnodes = strtol (value, NULL, 10);
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; /* scale the tree size down before comparing with min_tokens */
+ }
+ if (nodes < minnodes) {
+ return FALSE;
+ }
+ }
+
+ cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+ if (cur) {
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t)g_list_free, cur); /* list from the pre callbacks is freed together with the task pool */
+ }
+ else {
+ cur = ctx->cfg->statfiles; /* no list from the pre callbacks: use all configured statfiles */
+ }
+
+ data.statfiles_num = g_list_length (cur);
+ data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num); /* released with g_free at the end of this function */
+ data.pool = pool;
+ data.now = time (NULL);
+ data.ctx = ctx;
+
+ data.processed_tokens = 0;
+ data.spam_probability = 0;
+ data.ham_probability = 0;
+ data.total_ham = 0;
+ data.total_spam = 0;
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ minnodes = rspamd_config_parse_limit (value, -1); /* NOTE(review): minnodes (gint) is reused as a temporary for the max_tokens limit */
+ data.max_tokens = minnodes;
+ }
+ else {
+ data.max_tokens = 0;
+ }
+
+ while (cur) {
+ /* Select statfile to classify */
+ st = cur->data;
+ if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((file =
+ statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s", st->path);
+ cur = g_list_next (cur);
+ data.statfiles_num--; /* skipped statfile takes no slot in data.statfiles */
+ continue;
+ }
+ }
+ data.statfiles[i].file = file;
+ data.statfiles[i].st = st;
+ statfile_get_revision (file, &rev, NULL);
+ if (st->is_spam) {
+ data.total_spam += rev;
+ }
+ else {
+ data.total_ham += rev;
+ }
+
+ cur = g_list_next (cur);
+ i++;
+ }
+
+ cnt = i; /* number of statfiles actually opened */
+
+ g_tree_foreach (input, bayes_classify_callback, &data);
+
+ if (data.processed_tokens == 0 || data.spam_probability == 0) {
+ final_prob = 0;
+ }
+ else {
+ h = 1 - inv_chi_square (-2. * data.spam_probability,
+ 2 * data.processed_tokens);
+ s = 1 - inv_chi_square (-2. * data.ham_probability,
+ 2 * data.processed_tokens);
+ final_prob = (s + 1 - h) / 2.;
+ }
+
+ if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
+
+ sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
+ for (i = 0; i < cnt; i++) {
+ if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) ||
+ (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
+ continue; /* statfile belongs to the class that lost */
+ }
+ if (data.statfiles[i].total_hits > maxhits) {
+ maxhits = data.statfiles[i].total_hits;
+ selected_st = i; /* statfile of the winning class with most hits so far */
+ }
+ }
+ if (selected_st == -1) {
+ msg_err (
+ "unexpected classifier error: cannot select desired statfile");
+ }
+ else {
+ /* Calculate ham probability correctly */
+ if (final_prob < 0.5) {
+ final_prob = 1. - final_prob;
+ }
+ rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
+ cur = g_list_prepend (NULL, sumbuf);
+ rspamd_task_insert_result (task,
+ data.statfiles[selected_st].st->symbol,
+ final_prob,
+ cur);
+ }
+ }
+
+ g_free (data.statfiles);
+
+ return TRUE;
+}
+
+/*
+ * Learn the token tree `input` into the statfile whose symbol is `symbol`.
+ *
+ * The statfile is opened, or created and then opened, when it is not yet
+ * open in `pool`. Tokens are learned with bayes_learn_callback under the
+ * statfile lock and the statfile revision is increased afterwards.
+ *
+ * ctx        classifier context, must not be NULL
+ * pool       statfile pool, must not be NULL
+ * symbol     symbol of the statfile to learn
+ * input      tree of tokens to learn
+ * in_class   TRUE to increase token counters, FALSE to decrease them
+ * sum        optional output (may be NULL): number of processed tokens, or 0
+ *            when the message has fewer tokens than "min_tokens"
+ * multiplier not used by this classifier
+ * err        filled on failure
+ *
+ * Returns TRUE on success, FALSE (with err set) when the message has too
+ * few tokens, the symbol is unknown or the statfile cannot be opened.
+ */
+gboolean
+bayes_learn (struct classifier_ctx * ctx,
+	statfile_pool_t *pool,
+	const char *symbol,
+	GTree *input,
+	gboolean in_class,
+	double *sum,
+	double multiplier,
+	GError **err)
+{
+	struct bayes_callback_data data;
+	gchar *value;
+	gint nodes;
+	gint minnodes;
+	struct rspamd_statfile_config *st, *sel_st = NULL;
+	stat_file_t *to_learn;
+	GList *cur;
+
+	g_assert (pool != NULL);
+	g_assert (ctx != NULL);
+
+	if (ctx->cfg->opts &&
+		(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+		minnodes = strtol (value, NULL, 10);
+		nodes = g_tree_nnodes (input);
+		if (nodes > FEATURE_WINDOW_SIZE) {
+			nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+		}
+		if (nodes < minnodes) {
+			msg_info (
+				"do not learn message as it has too few tokens: %d, while %d min",
+				nodes,
+				minnodes);
+			/* sum is optional, so it must be checked on this path as well */
+			if (sum != NULL) {
+				*sum = 0;
+			}
+			g_set_error (err,
+				bayes_error_quark (),           /* error domain */
+				1,                              /* error code */
+				"message contains too few tokens: %d, while min is %d",
+				nodes, (int)minnodes);
+			return FALSE;
+		}
+	}
+
+	data.pool = pool;
+	data.in_class = in_class;
+	data.now = time (NULL);
+	data.ctx = ctx;
+	data.processed_tokens = 0;
+	if (ctx->cfg->opts &&
+		(value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+		minnodes = rspamd_config_parse_limit (value, -1);
+		data.max_tokens = minnodes;
+	}
+	else {
+		/* 0 means that the number of learned tokens is not limited */
+		data.max_tokens = 0;
+	}
+	cur = ctx->cfg->statfiles;
+	while (cur) {
+		/* Select statfile to learn */
+		st = cur->data;
+		if (strcmp (st->symbol, symbol) == 0) {
+			sel_st = st;
+			break;
+		}
+		cur = g_list_next (cur);
+	}
+	if (sel_st == NULL) {
+		g_set_error (err,
+			bayes_error_quark (),               /* error domain */
+			1,                                  /* error code */
+			"cannot find statfile for symbol: %s",
+			symbol);
+		return FALSE;
+	}
+	if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
+		if ((to_learn =
+			statfile_pool_open (pool, sel_st->path, sel_st->size,
+			FALSE)) == NULL) {
+			/* Statfile cannot be opened: try to create it and open again */
+			msg_warn ("cannot open %s", sel_st->path);
+			if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
+				msg_err ("cannot create statfile %s", sel_st->path);
+				g_set_error (err,
+					bayes_error_quark (),       /* error domain */
+					1,                          /* error code */
+					"cannot create statfile: %s",
+					sel_st->path);
+				return FALSE;
+			}
+			if ((to_learn =
+				statfile_pool_open (pool, sel_st->path, sel_st->size,
+				FALSE)) == NULL) {
+				g_set_error (err,
+					bayes_error_quark (),       /* error domain */
+					1,                          /* error code */
+					"cannot open statfile %s after creation",
+					sel_st->path);
+				msg_err ("cannot open statfile %s after creation",
+					sel_st->path);
+				return FALSE;
+			}
+		}
+	}
+	data.file = to_learn;
+	statfile_pool_lock_file (pool, data.file);
+	g_tree_foreach (input, bayes_learn_callback, &data);
+	statfile_inc_revision (to_learn);
+	statfile_pool_unlock_file (pool, data.file);
+
+	if (sum != NULL) {
+		*sum = data.processed_tokens;
+	}
+
+	return TRUE;
+}
+
+gboolean
+bayes_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err)
+{ /*
+ * Learn the token tree `input` into every selected statfile whose is_spam
+ * flag equals `is_spam`.
+ *
+ * The candidate statfiles come from the lua pre callbacks; when those
+ * return nothing, all configured statfiles are used and the ones that
+ * have a label are skipped. Each statfile is opened (or created) if
+ * needed, learned under its lock, has its revision increased and is
+ * passed to maybe_write_binlog.
+ *
+ * Returns TRUE on success; FALSE with err set when the message has too
+ * few tokens or a statfile cannot be created or opened. NOTE(review):
+ * statfiles learned before such a failure are not rolled back.
+ */
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes;
+ gint minnodes;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
+ GList *cur;
+ gboolean skip_labels;
+
+ g_assert (pool != NULL);
+ g_assert (ctx != NULL);
+
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ minnodes = strtol (value, NULL, 10);
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; /* scale the tree size down before comparing with min_tokens */
+ }
+ if (nodes < minnodes) {
+ g_set_error (err,
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, (int)minnodes);
+ return FALSE;
+ }
+ }
+
+ cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
+ if (cur) {
+ skip_labels = FALSE;
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t)g_list_free, cur); /* list from the pre callbacks is freed together with the task pool */
+ }
+ else {
+ /* Do not try to learn specific statfiles if pre callback returned nil */
+ skip_labels = TRUE;
+ cur = ctx->cfg->statfiles;
+ }
+
+ data.pool = pool;
+ data.now = time (NULL);
+ data.ctx = ctx;
+ data.in_class = TRUE; /* token counters are always increased here */
+
+ data.processed_tokens = 0; /* NOTE(review): not reset between statfiles, so max_tokens applies to the running total */
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ minnodes = rspamd_config_parse_limit (value, -1); /* NOTE(review): minnodes (gint) is reused as a temporary for the max_tokens limit */
+ data.max_tokens = minnodes;
+ }
+ else {
+ data.max_tokens = 0;
+ }
+
+ while (cur) {
+ /* Select statfiles to learn */
+ st = cur->data;
+ if (st->is_spam != is_spam || (skip_labels && st->label)) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((file =
+ statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s", st->path);
+ if (statfile_pool_create (pool, st->path, st->size) == -1) {
+ msg_err ("cannot create statfile %s", st->path);
+ g_set_error (err,
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ st->path);
+ return FALSE;
+ }
+ if ((file =
+ statfile_pool_open (pool, st->path, st->size,
+ FALSE)) == NULL) {
+ g_set_error (err,
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot open statfile %s after creation",
+ st->path);
+ msg_err ("cannot open statfile %s after creation",
+ st->path);
+ return FALSE;
+ }
+ }
+ }
+ data.file = file;
+ statfile_pool_lock_file (pool, data.file);
+ g_tree_foreach (input, bayes_learn_callback, &data);
+ statfile_inc_revision (file);
+ statfile_pool_unlock_file (pool, data.file);
+ maybe_write_binlog (ctx->cfg, st, file, input);
+ msg_info ("increase revision for %s", st->path);
+
+ cur = g_list_next (cur);
+ }
+
+ return TRUE;
+}
+
+GList *
+bayes_weights (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task)
+{
+ /* This function is unimplemented with new normalizer: all arguments are
+ * ignored and NULL is returned instead of a weights list */
+ return NULL;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Common classifier functions
+ */
+
+#include "classifiers.h"
+
+struct classifier classifiers[] = { /* Table of built-in classifiers, searched by name in get_classifier () */
+ {
+ .name = "winnow",
+ .init_func = winnow_init,
+ .classify_func = winnow_classify,
+ .learn_func = winnow_learn,
+ .learn_spam_func = winnow_learn_spam,
+ .weights_func = winnow_weights
+ },
+ {
+ .name = "bayes",
+ .init_func = bayes_init,
+ .classify_func = bayes_classify,
+ .learn_func = bayes_learn,
+ .learn_spam_func = bayes_learn_spam,
+ .weights_func = bayes_weights
+ }
+};
+
+/*
+ * Find a classifier in the classifiers[] table by its name.
+ *
+ * name  classifier name to look for; the comparison is case sensitive
+ *
+ * Returns a pointer to the matching table entry, or NULL when `name` is
+ * NULL or no classifier with this name is defined.
+ */
+struct classifier *
+get_classifier (const char *name)
+{
+	guint i;
+
+	if (name == NULL) {
+		/* Passing NULL to strcmp is undefined, treat it as "not found" */
+		return NULL;
+	}
+
+	for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) {
+		if (strcmp (classifiers[i].name, name) == 0) {
+			return &classifiers[i];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
--- /dev/null
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Winnow classifier
+ */
+
+#include "classifiers.h"
+#include "tokenizers.h"
+#include "main.h"
+#include "filter.h"
+#include "cfg_file.h"
+#include "lua/lua_common.h"
+
+#define WINNOW_PROMOTION 1.23
+#define WINNOW_DEMOTION 0.83
+
+#define MEDIAN_WINDOW_SIZE 5
+
+/* Limit for token weights; parenthesised so that the expansion stays a single operand inside larger expressions */
+#define MAX_WEIGHT (G_MAXDOUBLE / 2.)
+
+
+
+#define MAX_LEARN_ITERATIONS 100
+
+static inline GQuark
+winnow_error_quark (void)
+{
+ return g_quark_from_static_string ("winnow-error");
+}
+
+struct winnow_callback_data { /* State shared between a winnow entry point and its tree traversal callbacks */
+ statfile_pool_t *pool; /* statfile pool passed to the statfile_pool_* calls */
+ struct classifier_ctx *ctx; /* classifier context of the caller */
+ stat_file_t *file; /* statfile currently being read or updated */
+ stat_file_t *learn_file; /* statfile that is the target of learning; compared with file in the learn callback */
+ long double sum; /* accumulated token weights */
+ long double start; /* learn callback: accumulated weights as read before any update */
+ double multiplier; /* scales the promotion and demotion factors */
+ guint32 count; /* number of tokens visited */
+ guint32 new_blocks; /* tokens for which no weight was stored */
+ gboolean in_class; /* selects the promotion (TRUE) or demotion (FALSE) factor in the learn callback */
+ gboolean do_demote; /* learn callback: demote weights in file when it is not the learn target */
+ gboolean fresh_run; /* learn callback: TRUE resets node->extra to 0, FALSE increments it */
+ time_t now; /* timestamp passed to block get/set calls */
+};
+
+static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION; /* winnow_learn_callback resets to 0 any larger weight of a token whose extra counter exceeds 1 */
+
+
+
+/*
+ * Tree traversal callback used for classification: adds the weight stored
+ * in cd->file for one token to cd->sum and counts the token in cd->count.
+ * A token whose stored weight is not above ALPHA in absolute value is
+ * treated as not found: it contributes 1.0 and is counted in
+ * cd->new_blocks.
+ *
+ * Always returns FALSE, so the traversal is never interrupted.
+ */
+static gboolean
+winnow_classify_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *cd = data;
+	token_node_t *node = key;
+	double weight;
+	gboolean missing;
+
+	weight = statfile_pool_get_block (cd->pool, cd->file, node->h1,
+			node->h2, cd->now);
+	missing = !(fabs (weight) > ALPHA);
+
+	if (missing) {
+		/* Consider that not found blocks have value 1 */
+		cd->new_blocks++;
+	}
+	cd->sum += missing ? 1.0 : weight;
+	cd->count++;
+
+	return FALSE;
+}
+
+static gboolean
+winnow_learn_callback (gpointer key, gpointer value, gpointer data)
+{ /*
+ * Tree traversal callback used for learning: updates the weight of one
+ * token in cd->file and accumulates statistics in cd.
+ *
+ * cd->start receives the weight as read before the update (1 for a missing
+ * block), cd->sum receives node->value after the update and cd->count is
+ * incremented for every token. Always returns FALSE, so the traversal is
+ * never interrupted.
+ */
+ token_node_t *node = key;
+ struct winnow_callback_data *cd = data;
+ double v, c;
+
+ c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION /
+ cd->multiplier; /* factor applied to weights in the statfile being learned */
+
+ /* Consider that not found blocks have value 1 */
+ v =
+ statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
+ cd->now);
+ if (fabs (v) < ALPHA) {
+ /* Block not found, insert new */
+ cd->start += 1;
+ if (cd->file == cd->learn_file) { /* new blocks are created only in the statfile being learned */
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ c);
+ node->value = c;
+ cd->new_blocks++;
+ }
+ }
+ else {
+ cd->start += v;
+ /* Here we just increase the extra value of block */
+ if (cd->fresh_run) {
+ node->extra = 0;
+ }
+ else {
+ node->extra++;
+ }
+ node->value = v;
+
+ if (node->extra > 1) {
+ /*
+ * Assume that this node is common for several statfiles, so
+ * decrease its weight proportionally
+ */
+ if (node->value > max_common_weight) {
+ /* Static fluctuation */
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ 0.);
+ node->value = 0.;
+ }
+ else if (node->value > WINNOW_PROMOTION * cd->multiplier) {
+ /* Try to decrease its value */
+ /* XXX: it is more intelligent to add some adaptive filter here */
+ if (cd->file == cd->learn_file) {
+ if (node->value > max_common_weight / 2.) {
+ node->value *= c;
+ }
+ else {
+ /*
+ * Too high token value that exists also in other
+ * statfiles, may be statistic error, so decrease it
+ * slightly
+ */
+ node->value *= WINNOW_DEMOTION;
+ }
+ }
+ else {
+ node->value = WINNOW_DEMOTION / cd->multiplier;
+ }
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ node->value);
+ }
+ }
+ else if (cd->file == cd->learn_file) {
+ /* New block or block that is in only one statfile */
+ /* Set some limit on growing */
+ if (v > MAX_WEIGHT) {
+ node->value = v;
+ }
+ else {
+ node->value *= c;
+ }
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ node->value);
+ }
+ else if (cd->do_demote) {
+ /* Demote blocks in file */
+ node->value *= WINNOW_DEMOTION / cd->multiplier;
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ node->value);
+ }
+ }
+
+
+ cd->sum += node->value; /* NOTE(review): for a missing block in a statfile other than learn_file, node->value is not assigned above, so the value left on the node is added - confirm this is intended */
+
+ cd->count++;
+
+ return FALSE;
+}
+
+/**
+ * Create a classifier context for the winnow classifier.
+ * @param pool memory pool the context is allocated from
+ * @param cfg classifier configuration stored in the context
+ * @return new context; only its `pool` and `cfg` members are assigned here
+ */
+struct classifier_ctx *
+winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
+{
+	struct classifier_ctx *new_ctx;
+
+	new_ctx = rspamd_mempool_alloc (pool, sizeof (*new_ctx));
+	new_ctx->cfg = cfg;
+	new_ctx->pool = pool;
+
+	return new_ctx;
+}
+
+/**
+ * Classify a token tree against the statfiles of a classifier and insert
+ * the symbol of the best matching statfile into the task result.
+ * @param ctx classifier context (must not be NULL)
+ * @param pool statfile pool (must not be NULL)
+ * @param input tree of token_node_t produced by a tokenizer
+ * @param task task that receives the result
+ * @param L lua state handed to the pre/post classification callbacks
+ * @return FALSE if the message has fewer tokens than the "min_tokens"
+ * option requires, TRUE otherwise (even if no statfile was selected)
+ */
+gboolean
+winnow_classify (struct classifier_ctx *ctx,
+	statfile_pool_t * pool,
+	GTree * input,
+	struct rspamd_task *task,
+	lua_State *L)
+{
+	struct winnow_callback_data data;
+	struct rspamd_statfile_config *st, *sel = NULL;
+	long double res, max = 0.;
+	GList *cur;
+	char *sumbuf, *value;
+	int nodes, minnodes;
+
+	g_assert (pool != NULL);
+	g_assert (ctx != NULL);
+
+	data.ctx = ctx;
+	data.pool = pool;
+	data.now = time (NULL);
+
+	value = ctx->cfg->opts ?
+		g_hash_table_lookup (ctx->cfg->opts, "min_tokens") : NULL;
+	if (value != NULL) {
+		minnodes = strtol (value, NULL, 10);
+		nodes = g_tree_nnodes (input);
+		if (nodes > FEATURE_WINDOW_SIZE) {
+			nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+		}
+		if (nodes < minnodes) {
+			msg_info (
+				"do not classify message as it has too few tokens: %d, while %d min",
+				nodes,
+				minnodes);
+			return FALSE;
+		}
+	}
+
+	/* Lua pre-callbacks may supply their own statfile list */
+	cur = rspamd_lua_call_cls_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+	if (cur == NULL) {
+		cur = ctx->cfg->statfiles;
+	}
+	else {
+		rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t)g_list_free, cur);
+	}
+
+	for (; cur != NULL; cur = g_list_next (cur)) {
+		st = cur->data;
+		data.sum = 0;
+		data.count = 0;
+		data.new_blocks = 0;
+
+		data.file = statfile_pool_is_open (pool, st->path);
+		if (data.file == NULL) {
+			data.file = statfile_pool_open (pool, st->path, st->size, FALSE);
+		}
+		if (data.file == NULL) {
+			msg_warn ("cannot open %s, skip it", st->path);
+			continue;
+		}
+
+		g_tree_foreach (input, winnow_classify_callback, &data);
+
+		/* Average token weight for this statfile */
+		res = (data.count != 0) ? data.sum / (double)data.count : 0;
+		if (res > max) {
+			max = res;
+			sel = st;
+		}
+	}
+
+	if (sel == NULL) {
+		return TRUE;
+	}
+
+#ifdef WITH_LUA
+	max = rspamd_lua_call_cls_post_callbacks (ctx->cfg, task, max, L);
+#endif
+#ifdef HAVE_TANHL
+	max = tanhl (max);
+#else
+	/*
+	 * Some libm implementations do not provide tanhl, so fall back to tanh
+	 */
+	max = tanh ((double) max);
+#endif
+	sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
+	rspamd_snprintf (sumbuf, 32, "%.2F", max);
+	cur = g_list_prepend (NULL, sumbuf);
+	rspamd_task_insert_result (task, sel->symbol, max, cur);
+
+	return TRUE;
+}
+
+/**
+ * Compute the average token weight of a token tree for every statfile of a
+ * classifier.
+ * @param ctx classifier context (must not be NULL)
+ * @param pool statfile pool (must not be NULL)
+ * @param input tree of token_node_t produced by a tokenizer
+ * @param task task whose memory pool owns the returned data
+ * @return list of struct classify_weight * (freed together with the task
+ * pool), or NULL if the message has fewer tokens than the "min_tokens"
+ * option requires or no statfile could be processed
+ */
+GList *
+winnow_weights (struct classifier_ctx *ctx,
+	statfile_pool_t * pool,
+	GTree * input,
+	struct rspamd_task *task)
+{
+	struct winnow_callback_data data;
+	long double res = 0.;
+	GList *cur, *resl = NULL;
+	struct rspamd_statfile_config *st;
+	struct classify_weight *w;
+	char *value;
+	int nodes, minnodes;
+
+	g_assert (pool != NULL);
+	g_assert (ctx != NULL);
+
+	data.pool = pool;
+	data.now = time (NULL);
+	data.ctx = ctx;
+
+	if (ctx->cfg->opts &&
+		(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+		minnodes = strtol (value, NULL, 10);
+		nodes = g_tree_nnodes (input);
+		if (nodes > FEATURE_WINDOW_SIZE) {
+			nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+		}
+		if (nodes < minnodes) {
+			msg_info (
+				"do not classify message as it has too few tokens: %d, while %d min",
+				nodes,
+				minnodes);
+			return NULL;
+		}
+	}
+
+	cur = ctx->cfg->statfiles;
+	while (cur) {
+		st = cur->data;
+		data.sum = 0;
+		data.count = 0;
+		/*
+		 * The classify callback increments new_blocks, so it must start
+		 * from a defined value (as winnow_classify already does)
+		 */
+		data.new_blocks = 0;
+		if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+			if ((data.file =
+				statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+				msg_warn ("cannot open %s, skip it", st->path);
+				cur = g_list_next (cur);
+				continue;
+			}
+		}
+
+		g_tree_foreach (input, winnow_classify_callback, &data);
+
+		w =
+			rspamd_mempool_alloc0 (task->task_pool,
+				sizeof (struct classify_weight));
+		if (data.count != 0) {
+			res = data.sum / (double)data.count;
+		}
+		else {
+			res = 0;
+		}
+		w->name = st->symbol;
+		w->weight = res;
+		resl = g_list_prepend (resl, w);
+		cur = g_list_next (cur);
+	}
+
+	if (resl != NULL) {
+		/* List cells are released together with the task pool */
+		rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t)g_list_free, resl);
+	}
+
+	return resl;
+
+}
+
+
+/**
+ * Learn a token tree into the statfile identified by `symbol`.
+ *
+ * All statfiles of the classifier are opened (and created if needed), the
+ * message is classified against the target statfile and the other ones,
+ * and then winnow_learn_callback is run over every statfile until the
+ * target statfile wins (or loses, when `in_class` is zero) or
+ * MAX_LEARN_ITERATIONS is exceeded.
+ *
+ * @param ctx classifier context (must not be NULL)
+ * @param pool statfile pool (must not be NULL)
+ * @param symbol symbol of the statfile to learn
+ * @param input tree of token_node_t produced by a tokenizer
+ * @param in_class non-zero to promote tokens, zero to demote them
+ * @param sum optional output: tanh of the final weight on success, 0 if the
+ * message has too few tokens
+ * @param multiplier initial multiplier applied to promotion/demotion
+ * @param err optional error location, filled on failure
+ * @return TRUE if the message was learned or is already classified
+ * correctly, FALSE on any error
+ */
+gboolean
+winnow_learn (struct classifier_ctx *ctx,
+	statfile_pool_t *pool,
+	const char *symbol,
+	GTree * input,
+	int in_class,
+	double *sum,
+	double multiplier,
+	GError **err)
+{
+	struct winnow_callback_data data = {
+		.file = NULL,
+		.multiplier = multiplier
+	};
+	char *value;
+	int nodes, minnodes, iterations = 0;
+	struct rspamd_statfile_config *st, *sel_st = NULL;
+	stat_file_t *sel = NULL, *to_learn;
+	long double res = 0., max = 0., start_value = 0., end_value = 0.;
+	double learn_threshold = 0.0;
+	GList *cur, *to_demote = NULL;
+	gboolean force_learn = FALSE;
+
+	g_assert (pool != NULL);
+	g_assert (ctx != NULL);
+
+	data.pool = pool;
+	data.in_class = in_class;
+	data.now = time (NULL);
+	data.ctx = ctx;
+
+
+	if (ctx->cfg->opts &&
+		(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+		minnodes = strtol (value, NULL, 10);
+		nodes = g_tree_nnodes (input);
+		if (nodes > FEATURE_WINDOW_SIZE) {
+			nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+		}
+		if (nodes < minnodes) {
+			msg_info (
+				"do not learn message as it has too few tokens: %d, while %d min",
+				nodes,
+				minnodes);
+			if (sum != NULL) {
+				*sum = 0;
+			}
+			g_set_error (err,
+				winnow_error_quark (),      /* error domain */
+				1,                          /* error code */
+				"message contains too few tokens: %d, while min is %d",
+				nodes, minnodes);
+			return FALSE;
+		}
+	}
+	if (ctx->cfg->opts &&
+		(value =
+		g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
+		learn_threshold = strtod (value, NULL);
+	}
+
+	if (learn_threshold <= 1.0 && learn_threshold >= 0) {
+		/* Classify message and check target statfile score */
+		cur = ctx->cfg->statfiles;
+		while (cur) {
+			/* Open or create all statfiles inside classifier */
+			st = cur->data;
+			if (statfile_pool_is_open (pool, st->path) == NULL) {
+				if (statfile_pool_open (pool, st->path, st->size,
+					FALSE) == NULL) {
+					msg_warn ("cannot open %s", st->path);
+					if (statfile_pool_create (pool, st->path, st->size) == -1) {
+						msg_err ("cannot create statfile %s", st->path);
+						g_set_error (err,
+							winnow_error_quark (),  /* error domain */
+							1,                      /* error code */
+							"cannot create statfile: %s",
+							st->path);
+						return FALSE;
+					}
+					if (statfile_pool_open (pool, st->path, st->size,
+						FALSE) == NULL) {
+						g_set_error (err,
+							winnow_error_quark (),  /* error domain */
+							1,                      /* error code */
+							"open statfile %s after creation",
+							st->path);
+						msg_err ("cannot open statfile %s after creation",
+							st->path);
+						return FALSE;
+					}
+				}
+			}
+			if (strcmp (st->symbol, symbol) == 0) {
+				sel_st = st;
+
+			}
+			cur = g_list_next (cur);
+		}
+
+		if (sel_st == NULL) {
+			g_set_error (err,
+				winnow_error_quark (),      /* error domain */
+				1,                          /* error code */
+				"cannot find statfile for symbol %s",
+				symbol);
+			msg_err ("cannot find statfile for symbol %s", symbol);
+			return FALSE;
+		}
+
+		to_learn = statfile_pool_is_open (pool, sel_st->path);
+		if (to_learn == NULL) {
+			g_set_error (err,
+				winnow_error_quark (),      /* error domain */
+				1,                          /* error code */
+				"statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+				sel_st->path);
+			return FALSE;
+		}
+		/* Check target statfile */
+		data.file = to_learn;
+		data.sum = 0;
+		data.count = 0;
+		data.new_blocks = 0;
+		g_tree_foreach (input, winnow_classify_callback, &data);
+		if (data.count > 0) {
+			max = data.sum / (double)data.count;
+		}
+		else {
+			max = 0;
+		}
+		/* If most of blocks are not presented in targeted statfile do forced learn */
+		if (max < 1 + learn_threshold) {
+			force_learn = TRUE;
+		}
+		/*
+		 * Check other statfiles. The list pointer was exhausted by the
+		 * loop above, so it has to be rewound before this pass
+		 */
+		cur = ctx->cfg->statfiles;
+		while (cur) {
+			st = cur->data;
+			data.sum = 0;
+			data.count = 0;
+			if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+				g_set_error (err,
+					winnow_error_quark (),      /* error domain */
+					1,                          /* error code */
+					"statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+					st->path);
+				g_list_free (to_demote);
+				return FALSE;
+			}
+			g_tree_foreach (input, winnow_classify_callback, &data);
+			if (data.count != 0) {
+				res = data.sum / data.count;
+			}
+			else {
+				res = 0;
+			}
+			if (to_learn != data.file && res - max > 1 - learn_threshold) {
+				/* Demote tokens in this statfile */
+				to_demote = g_list_prepend (to_demote, data.file);
+			}
+			cur = g_list_next (cur);
+		}
+	}
+	else {
+		msg_err (
+			"learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
+		g_set_error (err,
+			winnow_error_quark (),      /* error domain */
+			1,                          /* error code */
+			"bad learn_threshold setting: %.2f",
+			learn_threshold);
+		return FALSE;
+	}
+	/* If to_demote list is empty this message is already classified correctly */
+	if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
+		msg_info (
+			"this message is already of class %s with threshold %.2f and weight %.2F",
+			sel_st->symbol,
+			learn_threshold,
+			max);
+		goto end;
+	}
+	data.learn_file = to_learn;
+	end_value = max;
+	do {
+		cur = ctx->cfg->statfiles;
+		data.fresh_run = TRUE;
+		while (cur) {
+			st = cur->data;
+			data.sum = 0;
+			data.count = 0;
+			data.new_blocks = 0;
+			data.start = 0;
+			if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+				/* Report the failure instead of returning silently */
+				g_set_error (err,
+					winnow_error_quark (),      /* error domain */
+					1,                          /* error code */
+					"statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+					st->path);
+				g_list_free (to_demote);
+				return FALSE;
+			}
+			if (to_demote != NULL &&
+				g_list_find (to_demote, data.file) != NULL) {
+				data.do_demote = TRUE;
+			}
+			else {
+				data.do_demote = FALSE;
+			}
+
+			statfile_pool_lock_file (pool, data.file);
+			g_tree_foreach (input, winnow_learn_callback, &data);
+			statfile_pool_unlock_file (pool, data.file);
+			if (data.count != 0) {
+				res = data.sum / data.count;
+			}
+			else {
+				res = 0;
+			}
+			if (res > max) {
+				max = res;
+				sel = data.file;
+			}
+			if (data.file == to_learn) {
+				if (data.count > 0) {
+					start_value = data.start / data.count;
+				}
+				end_value = res;
+			}
+			cur = g_list_next (cur);
+			data.fresh_run = FALSE;
+		}
+
+		/* Learn more aggressively on every further iteration */
+		data.multiplier *= WINNOW_PROMOTION;
+		msg_info (
+			"learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f",
+			iterations + 1,
+			symbol,
+			start_value,
+			end_value,
+			data.multiplier);
+	} while ((in_class ? sel != to_learn : sel ==
+		to_learn) && iterations++ < MAX_LEARN_ITERATIONS);
+
+	if (iterations >= MAX_LEARN_ITERATIONS) {
+		msg_warn (
+			"learning statfile %s  was not fully successfull: iterations count is limited to %d, final sum is %G",
+			sel_st->symbol,
+			MAX_LEARN_ITERATIONS,
+			max);
+		g_set_error (err,
+			winnow_error_quark (),      /* error domain */
+			1,                          /* error code */
+			"learning statfile %s  was not fully successfull: iterations count is limited to %d",
+			sel_st->symbol, MAX_LEARN_ITERATIONS);
+		g_list_free (to_demote);
+		return FALSE;
+	}
+	else {
+		msg_info (
+			"learned statfile %s successfully with %d iterations and sum %G",
+			sel_st->symbol,
+			iterations + 1,
+			max);
+	}
+
+
+end:
+	/* The demotion list is only needed while learning */
+	g_list_free (to_demote);
+	if (sum) {
+#ifdef HAVE_TANHL
+		*sum = (double)tanhl (max);
+#else
+		/*
+		 * As some implementations of libm does not support tanhl, try to use
+		 * tanh
+		 */
+		*sum = tanh ((double) max);
+#endif
+	}
+	return TRUE;
+}
+
+/**
+ * Spam/ham learning entry point of the classifier interface; winnow does
+ * not implement it.
+ * @param err optional error location, always filled by this function
+ * @return always FALSE
+ */
+gboolean
+winnow_learn_spam (struct classifier_ctx * ctx,
+	statfile_pool_t *pool,
+	GTree *input,
+	struct rspamd_task *task,
+	gboolean is_spam,
+	lua_State *L,
+	GError **err)
+{
+	/* Error domain is the winnow quark, error code is 1 */
+	g_set_error (err, winnow_error_quark (), 1,
+		"learn spam is not supported for winnow");
+
+	return FALSE;
+}
--- /dev/null
+/* Copyright (c) 2015, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef STAT_API_H_
+#define STAT_API_H_
+
+#include "config.h"
+#include "task.h"
+
+/**
+ * @file stat_api.h
+ * High level statistics API
+ */
+
+/**
+ * Initialise statistics modules
+ * @param cfg rspamd configuration structure
+ */
+void rspamd_stat_init (struct rspamd_config *cfg);
+
+/**
+ * Classify the task specified and insert symbols if needed
+ * @param task
+ * @param err error location; presumably filled on failure - TODO confirm
+ * against the implementation
+ * @return TRUE if task has been classified
+ */
+gboolean rspamd_stat_classify (struct rspamd_task *task, GError **err);
+
+
+/**
+ * Learn task as spam or ham, task must be processed prior to this call
+ * @param task task to learn
+ * @param spam if TRUE learn spam, otherwise learn ham
+ * @param err error location; presumably filled on failure - TODO confirm
+ * against the implementation
+ * @return TRUE if task has been learned
+ */
+gboolean rspamd_stat_learn (struct rspamd_task *task, gboolean spam, GError **err);
+
+
+/**
+ * Unload statistics modules; presumably the counterpart of
+ * rspamd_stat_init() - TODO confirm against the implementation
+ */
+void rspamd_stat_unload (void);
+
+#endif /* STAT_API_H_ */
--- /dev/null
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "main.h"
+
+/* Size for features pipe */
+#define FEATURE_WINDOW_SIZE 5
+
+/*
+ * Single token produced by a tokenizer; nodes are stored in a GTree ordered
+ * by token_node_compare_func.
+ */
+typedef struct token_node_s {
+ /* First hash of the token; primary sort key */
+ guint32 h1;
+ /* Second hash of the token; secondary sort key */
+ guint32 h2;
+ /* Weight of the token as read from or written to a statfile */
+ double value;
+ /*
+  * Multi-purpose slot: the OSB tokenizer stores a pointer to a copy of the
+  * token text here when save_token is set, and the winnow classifier uses
+  * it as a counter while learning
+  */
+ uintptr_t extra;
+} token_node_t;
+
+/* Common tokenizer structure */
+struct tokenizer {
+ /* Name used by get_tokenizer() to look the tokenizer up */
+ gchar *name;
+ /* Converts an array of words into token nodes inserted into the tree *cur */
+ gint (*tokenize_func)(struct tokenizer *tokenizer,
+ rspamd_mempool_t *pool,
+ GArray *words,
+ GTree **cur,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions);
+ /* Extracts the next word of buf into token */
+ gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
+};
+
+/* Compare two token nodes */
+int token_node_compare_func (gconstpointer a, gconstpointer b);
+
+/* Get tokenizer structure by name or return NULL if this name is not found */
+struct tokenizer * get_tokenizer (const char *name);
+
+/* Get next word from specified rspamd_fstring_t buf */
+gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
+ rspamd_fstring_t *token, GList **exceptions);
+
+/* Tokenize text into array of words (rspamd_fstring_t type) */
+GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+ gsize min_len, GList **exceptions);
+
+/* OSB tokenize function */
+int osb_tokenize_text (struct tokenizer *tokenizer,
+ rspamd_mempool_t *pool,
+ GArray *input,
+ GTree **cur,
+ gboolean save_token,
+ gboolean is_utf,
+ GList *exceptions);
+
+/* Make tokens for a subject */
+void tokenize_subject (struct rspamd_task *task, GTree ** tree);
+
+/* Array of all defined tokenizers */
+extern struct tokenizer tokenizers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
--- /dev/null
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * OSB tokenizer
+ */
+
+#include <sys/types.h>
+#include "tokenizers.h"
+
+/* Minimum length of token */
+#define MIN_LEN 4
+
+extern const int primes[];
+
+/*
+ * Build the token node for the hash pair (pipe[0], pipe[idx]) and insert it
+ * into the tree unless a node with the same hashes is already present.
+ */
+static void
+osb_insert_pair (rspamd_mempool_t *pool,
+	GTree *tree,
+	const guint32 *pipe,
+	gint idx,
+	rspamd_fstring_t *token,
+	gboolean save_token)
+{
+	token_node_t *node;
+
+	node = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
+	node->h1 = pipe[0] * primes[0] + pipe[idx] * primes[idx << 1];
+	node->h2 = pipe[0] * primes[1] + pipe[idx] * primes[(idx << 1) - 1];
+
+	if (save_token) {
+		node->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
+	}
+
+	if (g_tree_lookup (tree, node) == NULL) {
+		g_tree_insert (tree, node, node);
+	}
+}
+
+/**
+ * OSB tokenizer: hashes every word, keeps the last FEATURE_WINDOW_SIZE
+ * hashes in a sliding window and inserts one token node per pair formed by
+ * the newest hash and each older hash of the window.
+ * @param tokenizer unused
+ * @param pool memory pool used for the nodes and, if needed, the tree
+ * @param input array of rspamd_fstring_t words
+ * @param tree in/out tree of tokens; created when *tree is NULL
+ * @param save_token if TRUE a copy of the current word is stored in the node
+ * @param is_utf passed to the word hash function
+ * @param exceptions unused
+ * @return FALSE if input is NULL, TRUE otherwise
+ */
+int
+osb_tokenize_text (struct tokenizer *tokenizer,
+	rspamd_mempool_t * pool,
+	GArray * input,
+	GTree ** tree,
+	gboolean save_token,
+	gboolean is_utf,
+	GList *exceptions)
+{
+	guint32 hashpipe[FEATURE_WINDOW_SIZE];
+	rspamd_fstring_t *token = NULL;
+	gint idx, processed = 0;
+	guint w;
+
+	if (input == NULL) {
+		return FALSE;
+	}
+
+	if (*tree == NULL) {
+		*tree = g_tree_new (token_node_compare_func);
+		rspamd_mempool_add_destructor (pool,
+			(rspamd_mempool_destruct_t) g_tree_destroy,
+			*tree);
+	}
+
+	memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
+
+	for (w = 0; w < input->len; w ++) {
+		token = &g_array_index (input, rspamd_fstring_t, w);
+
+		if (processed < FEATURE_WINDOW_SIZE) {
+			/* The window is not full yet: fill it from its end */
+			processed++;
+			hashpipe[FEATURE_WINDOW_SIZE - processed] =
+				rspamd_fstrhash_lc (token, is_utf);
+			continue;
+		}
+
+		/* Shift the window by one slot and put the new hash in front */
+		memmove (&hashpipe[1], &hashpipe[0],
+			(FEATURE_WINDOW_SIZE - 1) * sizeof (hashpipe[0]));
+		hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
+		processed++;
+
+		for (idx = 1; idx < FEATURE_WINDOW_SIZE; idx++) {
+			osb_insert_pair (pool, *tree, hashpipe, idx, token, save_token);
+		}
+	}
+
+	/* Inputs that never overflowed the window are handled once here */
+	if (processed <= FEATURE_WINDOW_SIZE) {
+		for (idx = 1; idx < processed; idx++) {
+			osb_insert_pair (pool, *tree, hashpipe, idx, token, save_token);
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */
--- /dev/null
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Common tokenization functions
+ */
+
+#include <sys/types.h>
+#include "main.h"
+#include "tokenizers.h"
+
+/* Table of available tokenizers, searched by name in get_tokenizer() */
+struct tokenizer tokenizers[] = {
+ {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
+};
+
+/*
+ * Multipliers used by the OSB tokenizer to combine two word hashes into the
+ * h1/h2 pair of a token. Despite the name not every entry is prime
+ * (e.g. 51 = 3 * 17).
+ */
+const int primes[] = {
+ 1, 7,
+ 3, 13,
+ 5, 29,
+ 11, 51,
+ 23, 101,
+ 47, 203,
+ 97, 407,
+ 197, 817,
+ 397, 1637,
+ 797, 3277,
+};
+
+/*
+ * Lookup table of word delimiters indexed by byte value: non-zero entries
+ * mark tab, LF, CR, space and the ASCII punctuation ranges 34-38, 40-47,
+ * 58-63, 91-96 and 123-126. The table has 256 entries because it is indexed
+ * with unsigned bytes, so the value 255 must be a valid index as well.
+ */
+const gchar t_delimiters[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+	1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
+	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+	1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0
+};
+
+/**
+ * Find a tokenizer by its name.
+ * @param name name to search for in the tokenizers table
+ * @return pointer to the matching table entry or NULL if there is none
+ */
+struct tokenizer *
+get_tokenizer (const char *name)
+{
+	struct tokenizer *cur;
+	struct tokenizer *const last =
+		tokenizers + sizeof (tokenizers) / sizeof (tokenizers[0]);
+
+	for (cur = tokenizers; cur != last; cur++) {
+		if (strcmp (cur->name, name) == 0) {
+			return cur;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Compare two token nodes by their (h1, h2) hash pair.
+ *
+ * The hashes are compared explicitly instead of being subtracted: the
+ * difference of two 32-bit unsigned values does not fit into an int, so a
+ * subtraction yields the wrong sign for distant values and breaks the
+ * ordering a GTree relies on.
+ *
+ * @param a first token_node_t
+ * @param b second token_node_t
+ * @return negative, zero or positive value if a sorts before, equal to or
+ * after b
+ */
+int
+token_node_compare_func (gconstpointer a, gconstpointer b)
+{
+	const token_node_t *aa = a, *bb = b;
+
+	if (aa->h1 != bb->h1) {
+		return aa->h1 < bb->h1 ? -1 : 1;
+	}
+
+	if (aa->h2 != bb->h2) {
+		return aa->h2 < bb->h2 ? -1 : 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Get next word from specified rspamd_fstring_t buf.
+ *
+ * token->begin == NULL starts a new scan at buf->begin (or just past an
+ * exception located at position 0); otherwise scanning resumes from
+ * token->begin, which rspamd_tokenize_text sets to the previous return
+ * value. On return token->begin/token->len describe the word found.
+ *
+ * Returns the position to continue from, or NULL if buf is NULL, the
+ * current position is at or past the end of the buffer, or the end of the
+ * buffer was reached while scanning.
+ *
+ * NOTE(review): because NULL is returned as soon as `remain` reaches zero,
+ * a word that ends exactly at the end of the buffer is stored in `token`
+ * but reported as NULL, so rspamd_tokenize_text never appends it - confirm
+ * this is intended.
+ */
+gchar *
+rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
+{
+ gsize remain, pos;
+ guchar *p;
+ struct process_exception *ex = NULL;
+
+ if (buf == NULL) {
+ return NULL;
+ }
+
+ /* Only the first pending exception is considered during one call */
+ if (exceptions != NULL && *exceptions != NULL) {
+ ex = (*exceptions)->data;
+ }
+
+ if (token->begin == NULL) {
+ if (ex != NULL) {
+ if (ex->pos == 0) {
+ token->begin = buf->begin + ex->len;
+ token->len = ex->len;
+ }
+ else {
+ token->begin = buf->begin;
+ token->len = 0;
+ }
+ }
+ else {
+ token->begin = buf->begin;
+ token->len = 0;
+ }
+ }
+
+ /* The length is recomputed from scratch on every call */
+ token->len = 0;
+
+ pos = token->begin - buf->begin;
+ if (pos >= buf->len) {
+ return NULL;
+ }
+
+ remain = buf->len - pos;
+ p = token->begin;
+ /* Advance at least one byte, then keep skipping while the current byte is a delimiter */
+ do {
+ if (ex != NULL && ex->pos == pos) {
+ /* Go to the next exception */
+ *exceptions = g_list_next (*exceptions);
+ return p + ex->len;
+ }
+ pos++;
+ p++;
+ remain--;
+ } while (remain > 0 && t_delimiters[*p]);
+
+ token->begin = p;
+
+ /* Consume the word itself: everything up to the next delimiter */
+ while (remain > 0 && !t_delimiters[*p]) {
+ if (ex != NULL && ex->pos == pos) {
+ *exceptions = g_list_next (*exceptions);
+ return p + ex->len;
+ }
+ token->len++;
+ pos++;
+ remain--;
+ p++;
+ }
+
+ if (remain == 0) {
+ return NULL;
+ }
+
+ return p;
+}
+
+/**
+ * Split a text into words.
+ * @param text text to split
+ * @param len length of the text in bytes
+ * @param is_utf if TRUE word lengths are measured in UTF-8 characters
+ * @param min_len minimum length of a word to keep, 0 keeps every word
+ * @param exceptions list of exceptions passed to the word extractor
+ * @return new array of rspamd_fstring_t (possibly empty), or NULL if the
+ * text is NULL or empty
+ */
+GArray *
+rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
+	gsize min_len, GList **exceptions)
+{
+	rspamd_fstring_t word, src;
+	GArray *words;
+	gchar *next;
+	gsize wlen;
+
+	if (len == 0 || text == NULL) {
+		return NULL;
+	}
+
+	src.begin = text;
+	src.len = len;
+	src.size = src.len;
+	word.begin = NULL;
+	word.len = 0;
+
+	words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+
+	for (;;) {
+		next = rspamd_tokenizer_get_word (&src, &word, exceptions);
+		if (next == NULL) {
+			break;
+		}
+
+		wlen = is_utf ? g_utf8_strlen (word.begin, word.len) : word.len;
+
+		/* Keep the word unless a minimum length is set and not reached */
+		if (min_len == 0 || wlen >= min_len) {
+			g_array_append_val (words, word);
+		}
+
+		/* The next scan resumes where this one stopped */
+		word.begin = next;
+	}
+
+	return words;
+}
+
+
+/**
+ * Tokenize the subject of a task with the "osb-text" tokenizer and add the
+ * tokens to the tree.
+ * @param task task providing the subject and the memory pool
+ * @param tree in/out tree of tokens; created when *tree is NULL
+ */
+void
+tokenize_subject (struct rspamd_task *task, GTree ** tree)
+{
+	struct tokenizer *osb_tokenizer;
+	GArray *words;
+	gchar *sub;
+
+	if (*tree == NULL) {
+		*tree = g_tree_new (token_node_compare_func);
+		rspamd_mempool_add_destructor (task->task_pool,
+			(rspamd_mempool_destruct_t) g_tree_destroy, *tree);
+	}
+
+	osb_tokenizer = get_tokenizer ("osb-text");
+
+	/* Prefer the pre-defined subject, fall back to the message header */
+	sub = task->subject;
+	if (sub == NULL) {
+		sub = (gchar *)g_mime_message_get_subject (task->message);
+	}
+	if (sub == NULL) {
+		return;
+	}
+
+	words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+	if (words == NULL) {
+		return;
+	}
+
+	osb_tokenizer->tokenize_func (osb_tokenizer,
+		task->task_pool,
+		words,
+		tree,
+		FALSE,
+		TRUE,
+		NULL);
+	g_array_free (words, TRUE);
+}
+
+/*
+ * vi:ts=4
+ */
#include "lua_common.h"
#include "cfg_file.h"
-#include "classifiers/classifiers.h"
+#include "classifiers.h"
/* Classifier methods */
LUA_FUNCTION_DEF (classifier, register_pre_callback);
#include "message.h"
#include "radix.h"
#include "trie.h"
-#include "classifiers/classifiers.h"
+#include "classifiers.h"
/***
* This module is used to configure rspamd and is normally available as global
#include "images.h"
#include "cfg_file.h"
#include "statfile.h"
-#include "tokenizers/tokenizers.h"
-#include "classifiers/classifiers.h"
+#include "tokenizers.h"
+#include "classifiers.h"
#include "binlog.h"
#include "statfile_sync.h"
#include "diff.h"
+++ /dev/null
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * OSB tokenizer
- */
-
-#include <sys/types.h>
-#include "tokenizers.h"
-
-/* Minimum length of token */
-#define MIN_LEN 4
-
-extern const int primes[];
-
-int
-osb_tokenize_text (struct tokenizer *tokenizer,
- rspamd_mempool_t * pool,
- GArray * input,
- GTree ** tree,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions)
-{
- token_node_t *new = NULL;
- rspamd_fstring_t *token;
- guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
- gint i, processed = 0;
- guint w;
-
- if (input == NULL) {
- return FALSE;
- }
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (pool,
- (rspamd_mempool_destruct_t) g_tree_destroy,
- *tree);
- }
-
- memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0]));
-
- for (w = 0; w < input->len; w ++) {
- token = &g_array_index (input, rspamd_fstring_t, w);
-
- if (processed < FEATURE_WINDOW_SIZE) {
- /* Just fill a hashpipe */
- hashpipe[FEATURE_WINDOW_SIZE - ++processed] =
- rspamd_fstrhash_lc (token, is_utf);
- }
- else {
- /* Shift hashpipe */
- for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
- hashpipe[i] = hashpipe[i - 1];
- }
- hashpipe[0] = rspamd_fstrhash_lc (token, is_utf);
- processed++;
-
- for (i = 1; i < FEATURE_WINDOW_SIZE; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] *
- primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra =
- (uintptr_t)rspamd_mempool_fstrdup (pool, token);
- }
-
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- }
- }
- }
-
- if (processed <= FEATURE_WINDOW_SIZE) {
- for (i = 1; i < processed; i++) {
- h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
- h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
- new->h1 = h1;
- new->h2 = h2;
- if (save_token) {
- new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token);
- }
-
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- }
- }
-
- return TRUE;
-}
-
-/*
- * vi:ts=4
- */
+++ /dev/null
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Common tokenization functions
- */
-
-#include <sys/types.h>
-#include "main.h"
-#include "tokenizers.h"
-
-struct tokenizer tokenizers[] = {
- {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word},
-};
-
-const int primes[] = {
- 1, 7,
- 3, 13,
- 5, 29,
- 11, 51,
- 23, 101,
- 47, 203,
- 97, 407,
- 197, 817,
- 397, 1637,
- 797, 3277,
-};
-
-const gchar t_delimiters[255] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
- 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
-};
-
-struct tokenizer *
-get_tokenizer (const char *name)
-{
- guint i;
-
- for (i = 0; i < sizeof (tokenizers) / sizeof (tokenizers[0]); i++) {
- if (strcmp (tokenizers[i].name, name) == 0) {
- return &tokenizers[i];
- }
- }
-
- return NULL;
-}
-
-int
-token_node_compare_func (gconstpointer a, gconstpointer b)
-{
- const token_node_t *aa = a, *bb = b;
-
- if (aa->h1 == bb->h1) {
- return aa->h2 - bb->h2;
- }
-
- return aa->h1 - bb->h1;
-}
-
-/* Get next word from specified f_str_t buf */
-gchar *
-rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
-{
- gsize remain, pos;
- guchar *p;
- struct process_exception *ex = NULL;
-
- if (buf == NULL) {
- return NULL;
- }
-
- if (exceptions != NULL && *exceptions != NULL) {
- ex = (*exceptions)->data;
- }
-
- if (token->begin == NULL) {
- if (ex != NULL) {
- if (ex->pos == 0) {
- token->begin = buf->begin + ex->len;
- token->len = ex->len;
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
- else {
- token->begin = buf->begin;
- token->len = 0;
- }
- }
-
- token->len = 0;
-
- pos = token->begin - buf->begin;
- if (pos >= buf->len) {
- return NULL;
- }
-
- remain = buf->len - pos;
- p = token->begin;
- /* Skip non delimiters symbols */
- do {
- if (ex != NULL && ex->pos == pos) {
- /* Go to the next exception */
- *exceptions = g_list_next (*exceptions);
- return p + ex->len;
- }
- pos++;
- p++;
- remain--;
- } while (remain > 0 && t_delimiters[*p]);
-
- token->begin = p;
-
- while (remain > 0 && !t_delimiters[*p]) {
- if (ex != NULL && ex->pos == pos) {
- *exceptions = g_list_next (*exceptions);
- return p + ex->len;
- }
- token->len++;
- pos++;
- remain--;
- p++;
- }
-
- if (remain == 0) {
- return NULL;
- }
-
- return p;
-}
-
-GArray *
-rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList **exceptions)
-{
- rspamd_fstring_t token, buf;
- gchar *pos;
- gsize l;
- GArray *res;
-
- if (len == 0 || text == NULL) {
- return NULL;
- }
-
- buf.begin = text;
- buf.len = len;
- buf.size = buf.len;
- token.begin = NULL;
- token.len = 0;
-
- res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
- while ((pos = rspamd_tokenizer_get_word (&buf,
- &token, exceptions)) != NULL) {
- if (is_utf) {
- l = g_utf8_strlen (token.begin, token.len);
- }
- else {
- l = token.len;
- }
- if (min_len > 0 && l < min_len) {
- token.begin = pos;
- continue;
- }
- g_array_append_val (res, token);
-
- token.begin = pos;
- }
-
- return res;
-}
-
-
-void
-tokenize_subject (struct rspamd_task *task, GTree ** tree)
-{
- gchar *sub;
- struct tokenizer *osb_tokenizer;
- GArray *words;
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t) g_tree_destroy, *tree);
- }
-
- osb_tokenizer = get_tokenizer ("osb-text");
-
- /* Try to use pre-defined subject */
- if (task->subject != NULL) {
- sub = task->subject;
- }
- else {
- sub = (gchar *)g_mime_message_get_subject (task->message);
- }
-
- if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
- if (words != NULL) {
- osb_tokenizer->tokenize_func (osb_tokenizer,
- task->task_pool,
- words,
- tree,
- FALSE,
- TRUE,
- NULL);
- g_array_free (words, TRUE);
- }
- }
-}
-
-/*
- * vi:ts=4
- */
+++ /dev/null
-#ifndef TOKENIZERS_H
-#define TOKENIZERS_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "main.h"
-
-/* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-
-typedef struct token_node_s {
- guint32 h1;
- guint32 h2;
- double value;
- uintptr_t extra;
-} token_node_t;
-
-/* Common tokenizer structure */
-struct tokenizer {
- gchar *name;
- gint (*tokenize_func)(struct tokenizer *tokenizer,
- rspamd_mempool_t *pool,
- GArray *words,
- GTree **cur,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions);
- gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions);
-};
-
-/* Compare two token nodes */
-int token_node_compare_func (gconstpointer a, gconstpointer b);
-
-/* Get tokenizer structure by name or return NULL if this name is not found */
-struct tokenizer * get_tokenizer (const char *name);
-
-/* Get next word from specified f_str_t buf */
-gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf,
- rspamd_fstring_t *token, GList **exceptions);
-
-/* Tokenize text into array of words (rspamd_fstring_t type) */
-GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
- gsize min_len, GList **exceptions);
-
-/* OSB tokenize function */
-int osb_tokenize_text (struct tokenizer *tokenizer,
- rspamd_mempool_t *pool,
- GArray *input,
- GTree **cur,
- gboolean save_token,
- gboolean is_utf,
- GList *exceptions);
-
-/* Make tokens for a subject */
-void tokenize_subject (struct rspamd_task *task, GTree ** tree);
-
-/* Array of all defined tokenizers */
-extern struct tokenizer tokenizers[];
-
-#endif
-/*
- * vi:ts=4
- */