aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime/filter.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libmime/filter.c')
-rw-r--r--src/libmime/filter.c1096
1 files changed, 1096 insertions, 0 deletions
diff --git a/src/libmime/filter.c b/src/libmime/filter.c
new file mode 100644
index 000000000..cb0630d9d
--- /dev/null
+++ b/src/libmime/filter.c
@@ -0,0 +1,1096 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "mem_pool.h"
+#include "filter.h"
+#include "main.h"
+#include "message.h"
+#include "cfg_file.h"
+#include "util.h"
+#include "expressions.h"
+#include "settings.h"
+#include "binlog.h"
+#include "diff.h"
+#include "classifiers/classifiers.h"
+#include "tokenizers/tokenizers.h"
+
+#ifdef WITH_LUA
+# include "lua/lua_common.h"
+#endif
+
+#define COMMON_PART_FACTOR 95
+
+#ifndef PARAM_H_HAS_BITSET
+/* Bit map related macros. */
+#define NBBY 8 /* number of bits in a byte */
+#define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
+#define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
+#define isset(a,i) \
+ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
+#define isclr(a,i) \
+ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
+#endif
+#define BITSPERBYTE (8*sizeof (gchar))
+#define NBYTES(nbits) (((nbits) + BITSPERBYTE - 1) / BITSPERBYTE)
+
+static inline GQuark
+filter_error_quark (void)
+{
+ return g_quark_from_static_string ("g-filter-error-quark");
+}
+
+static void
+insert_metric_result (struct rspamd_task *task, struct metric *metric, const gchar *symbol,
+ double flag, GList * opts, gboolean single)
+{
+ struct metric_result *metric_res;
+ struct symbol *s;
+ gdouble *weight, w;
+
+ metric_res = g_hash_table_lookup (task->results, metric->name);
+
+ if (metric_res == NULL) {
+ /* Create new metric chain */
+ metric_res = rspamd_mempool_alloc (task->task_pool, sizeof (struct metric_result));
+ metric_res->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ metric_res->checked = FALSE;
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) g_hash_table_unref, metric_res->symbols);
+ metric_res->metric = metric;
+ metric_res->grow_factor = 0;
+ metric_res->score = 0;
+ metric_res->domain_settings = NULL;
+ metric_res->user_settings = NULL;
+ apply_metric_settings (task, metric, metric_res);
+ g_hash_table_insert (task->results, (gpointer) metric->name, metric_res);
+ }
+
+ weight = g_hash_table_lookup (metric->symbols, symbol);
+ if (weight == NULL) {
+ w = 0.0;
+ }
+ else {
+ w = (*weight) * flag;
+ }
+
+
+ /* Add metric score */
+ if ((s = g_hash_table_lookup (metric_res->symbols, symbol)) != NULL) {
+ if (s->options && opts && opts != s->options) {
+ /* Append new options */
+ s->options = g_list_concat (s->options, g_list_copy(opts));
+ /*
+ * Note that there is no need to add new destructor of GList as elements of appended
+ * GList are used directly, so just free initial GList
+ */
+ }
+ else if (opts) {
+ s->options = g_list_copy (opts);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) g_list_free, s->options);
+ }
+ if (!single) {
+ /* Handle grow factor */
+ if (metric_res->grow_factor && w > 0) {
+ w *= metric_res->grow_factor;
+ metric_res->grow_factor *= metric->grow_factor;
+ }
+ s->score += w;
+ metric_res->score += w;
+ }
+ else {
+ if (fabs (s->score) < fabs (w)) {
+ /* Replace less weight with a bigger one */
+ metric_res->score = metric_res->score - s->score + w;
+ s->score = w;
+ }
+ }
+ }
+ else {
+ s = rspamd_mempool_alloc (task->task_pool, sizeof (struct symbol));
+
+ /* Handle grow factor */
+ if (metric_res->grow_factor && w > 0) {
+ w *= metric_res->grow_factor;
+ metric_res->grow_factor *= metric->grow_factor;
+ }
+ else if (w > 0) {
+ metric_res->grow_factor = metric->grow_factor;
+ }
+
+ s->score = w;
+ s->name = symbol;
+ metric_res->score += w;
+
+ if (opts) {
+ s->options = g_list_copy (opts);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) g_list_free, s->options);
+ }
+ else {
+ s->options = NULL;
+ }
+
+ g_hash_table_insert (metric_res->symbols, (gpointer) symbol, s);
+ }
+ debug_task ("symbol %s, score %.2f, metric %s, factor: %f", symbol, s->score, metric->name, w);
+
+}
+
+#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION <= 30))
+static GStaticMutex result_mtx = G_STATIC_MUTEX_INIT;
+#else
+G_LOCK_DEFINE (result_mtx);
+#endif
+
+static void
+insert_result_common (struct rspamd_task *task, const gchar *symbol, double flag, GList * opts, gboolean single)
+{
+ struct metric *metric;
+ struct cache_item *item;
+ GList *cur, *metric_list;
+
+ /* Avoid concurrenting inserting of results */
+#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION <= 30))
+ g_static_mutex_lock (&result_mtx);
+#else
+ G_LOCK (result_mtx);
+#endif
+ metric_list = g_hash_table_lookup (task->cfg->metrics_symbols, symbol);
+ if (metric_list) {
+ cur = metric_list;
+
+ while (cur) {
+ metric = cur->data;
+ insert_metric_result (task, metric, symbol, flag, opts, single);
+ cur = g_list_next (cur);
+ }
+ }
+ else {
+ /* Insert symbol to default metric */
+ insert_metric_result (task, task->cfg->default_metric, symbol, flag, opts, single);
+ }
+
+ /* Process cache item */
+ if (task->cfg->cache) {
+ item = g_hash_table_lookup (task->cfg->cache->items_by_symbol, symbol);
+ if (item != NULL) {
+ item->s->frequency++;
+ }
+ }
+
+ if (opts != NULL) {
+ /* XXX: it is not wise to destroy them here */
+ g_list_free (opts);
+ }
+#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION <= 30))
+ g_static_mutex_unlock (&result_mtx);
+#else
+ G_UNLOCK (result_mtx);
+#endif
+}
+
+/* Insert result that may be increased on next insertions */
+void
+insert_result (struct rspamd_task *task, const gchar *symbol, double flag, GList * opts)
+{
+ insert_result_common (task, symbol, flag, opts, task->cfg->one_shot_mode);
+}
+
+/* Insert result as a single option */
+void
+insert_result_single (struct rspamd_task *task, const gchar *symbol, double flag, GList * opts)
+{
+ insert_result_common (task, symbol, flag, opts, TRUE);
+}
+
+/* Return true if metric has score that is more than spam score for it */
+static gboolean
+check_metric_is_spam (struct rspamd_task *task, struct metric *metric)
+{
+ struct metric_result *res;
+ double ms, rs;
+
+ /* Avoid concurrency while checking results */
+#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION <= 30))
+ g_static_mutex_lock (&result_mtx);
+#else
+ G_LOCK (result_mtx);
+#endif
+ res = g_hash_table_lookup (task->results, metric->name);
+ if (res) {
+#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION <= 30))
+ g_static_mutex_unlock (&result_mtx);
+#else
+ G_UNLOCK (result_mtx);
+#endif
+ if (!check_metric_settings (res, &ms, &rs)) {
+ ms = metric->actions[METRIC_ACTION_REJECT].score;
+ }
+ return (ms > 0 && res->score >= ms);
+ }
+
+#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION <= 30))
+ g_static_mutex_unlock (&result_mtx);
+#else
+ G_UNLOCK (result_mtx);
+#endif
+
+ return FALSE;
+}
+
+gint
+process_filters (struct rspamd_task *task)
+{
+ GList *cur;
+ struct metric *metric;
+ gpointer item = NULL;
+
+ /* Process metrics symbols */
+ while (call_symbol_callback (task, task->cfg->cache, &item)) {
+ /* Check reject actions */
+ cur = task->cfg->metrics_list;
+ while (cur) {
+ metric = cur->data;
+ if (!task->pass_all_filters &&
+ metric->actions[METRIC_ACTION_REJECT].score > 0 &&
+ check_metric_is_spam (task, metric)) {
+ task->state = WRITE_REPLY;
+ return 1;
+ }
+ cur = g_list_next (cur);
+ }
+ }
+
+ task->state = WAIT_FILTER;
+
+ return 1;
+}
+
+
+struct composites_data {
+ struct rspamd_task *task;
+ struct metric_result *metric_res;
+ GTree *symbols_to_remove;
+ guint8 *checked;
+};
+
+struct symbol_remove_data {
+ struct symbol *ms;
+ gboolean remove_weight;
+ gboolean remove_symbol;
+};
+
+static gint
+remove_compare_data (gconstpointer a, gconstpointer b)
+{
+ const gchar *ca = a, *cb = b;
+
+ return strcmp (ca, cb);
+}
+
+static void
+composites_foreach_callback (gpointer key, gpointer value, void *data)
+{
+ struct composites_data *cd = (struct composites_data *)data;
+ struct rspamd_composite *composite = value, *ncomp;
+ struct expression *expr;
+ GQueue *stack;
+ GList *symbols = NULL, *s;
+ gsize cur, op1, op2;
+ gchar logbuf[256], *sym, *check_sym;
+ gint r;
+ struct symbol *ms;
+ struct symbol_remove_data *rd;
+
+
+ expr = composite->expr;
+ if (isset (cd->checked, composite->id)) {
+ /* Symbol was already checked */
+ return;
+ }
+
+ stack = g_queue_new ();
+
+ while (expr) {
+ if (expr->type == EXPR_STR) {
+ /* Find corresponding symbol */
+ sym = expr->content.operand;
+ if (*sym == '~' || *sym == '-') {
+ sym ++;
+ }
+ if (g_hash_table_lookup (cd->metric_res->symbols, sym) == NULL) {
+ cur = 0;
+ if ((ncomp = g_hash_table_lookup (cd->task->cfg->composite_symbols, sym)) != NULL) {
+ /* Set checked for this symbol to avoid cyclic references */
+ if (isclr (cd->checked, ncomp->id)) {
+ setbit (cd->checked, composite->id);
+ composites_foreach_callback (sym, ncomp, cd);
+ if (g_hash_table_lookup (cd->metric_res->symbols, sym) != NULL) {
+ cur = 1;
+ }
+ }
+ }
+ }
+ else {
+ cur = 1;
+ symbols = g_list_prepend (symbols, expr->content.operand);
+ }
+ g_queue_push_head (stack, GSIZE_TO_POINTER (cur));
+ }
+ else {
+ if (g_queue_is_empty (stack)) {
+ /* Queue has no operands for operation, exiting */
+ g_list_free (symbols);
+ g_queue_free (stack);
+ setbit (cd->checked, composite->id);
+ return;
+ }
+ switch (expr->content.operation) {
+ case '!':
+ op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+ op1 = !op1;
+ g_queue_push_head (stack, GSIZE_TO_POINTER (op1));
+ break;
+ case '&':
+ op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+ op2 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+ g_queue_push_head (stack, GSIZE_TO_POINTER (op1 && op2));
+ break;
+ case '|':
+ op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+ op2 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+ g_queue_push_head (stack, GSIZE_TO_POINTER (op1 || op2));
+ break;
+ default:
+ expr = expr->next;
+ continue;
+ }
+ }
+ expr = expr->next;
+ }
+ if (!g_queue_is_empty (stack)) {
+ op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+ if (op1) {
+ /* Remove all symbols that are in composite symbol */
+ s = g_list_first (symbols);
+ r = rspamd_snprintf (logbuf, sizeof (logbuf), "<%s>, insert symbol %s instead of symbols: ", cd->task->message_id, key);
+ while (s) {
+ sym = s->data;
+ if (*sym == '~' || *sym == '-') {
+ check_sym = sym + 1;
+ }
+ else {
+ check_sym = sym;
+ }
+ ms = g_hash_table_lookup (cd->metric_res->symbols, check_sym);
+
+ if (ms == NULL) {
+ /* Try to process other composites */
+ if ((ncomp = g_hash_table_lookup (cd->task->cfg->composite_symbols, check_sym)) != NULL) {
+ /* Set checked for this symbol to avoid cyclic references */
+ if (isclr (cd->checked, ncomp->id)) {
+ setbit (cd->checked, composite->id);
+ composites_foreach_callback (check_sym, ncomp, cd);
+ ms = g_hash_table_lookup (cd->metric_res->symbols, check_sym);
+ }
+ }
+ }
+
+ if (ms != NULL) {
+ rd = rspamd_mempool_alloc (cd->task->task_pool, sizeof (struct symbol_remove_data));
+ rd->ms = ms;
+ if (G_UNLIKELY (*sym == '~')) {
+ rd->remove_weight = FALSE;
+ rd->remove_symbol = TRUE;
+ }
+ else if (G_UNLIKELY (*sym == '-')) {
+ rd->remove_symbol = FALSE;
+ rd->remove_weight = FALSE;
+ }
+ else {
+ rd->remove_symbol = TRUE;
+ rd->remove_weight = TRUE;
+ }
+ if (!g_tree_lookup (cd->symbols_to_remove, rd)) {
+ g_tree_insert (cd->symbols_to_remove, (gpointer)ms->name, rd);
+ }
+ }
+ else {
+
+ }
+
+ if (s->next) {
+ r += rspamd_snprintf (logbuf + r, sizeof (logbuf) -r, "%s, ", s->data);
+ }
+ else {
+ r += rspamd_snprintf (logbuf + r, sizeof (logbuf) -r, "%s", s->data);
+ }
+ s = g_list_next (s);
+ }
+ /* Add new symbol */
+ insert_result_single (cd->task, key, 1.0, NULL);
+ msg_info ("%s", logbuf);
+ }
+ }
+
+ setbit (cd->checked, composite->id);
+ g_queue_free (stack);
+ g_list_free (symbols);
+
+ return;
+}
+
+static gboolean
+check_autolearn (struct statfile_autolearn_params *params, struct rspamd_task *task)
+{
+ gchar *metric_name = DEFAULT_METRIC;
+ struct metric_result *metric_res;
+ GList *cur;
+
+ if (params->metric != NULL) {
+ metric_name = (gchar *)params->metric;
+ }
+
+ /* First check threshold */
+ metric_res = g_hash_table_lookup (task->results, metric_name);
+ if (metric_res == NULL) {
+ if (params->symbols == NULL && params->threshold_max > 0) {
+ /* For ham messages */
+ return TRUE;
+ }
+ debug_task ("metric %s has no results", metric_name);
+ return FALSE;
+ }
+ else {
+ /* Process score of metric */
+ if ((params->threshold_min != 0 && metric_res->score > params->threshold_min) || (params->threshold_max != 0 && metric_res->score < params->threshold_max)) {
+ /* Now check for specific symbols */
+ if (params->symbols) {
+ cur = params->symbols;
+ while (cur) {
+ if (g_hash_table_lookup (metric_res->symbols, cur->data) == NULL) {
+ return FALSE;
+ }
+ cur = g_list_next (cur);
+ }
+ }
+ /* Now allow processing of actual autolearn */
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+void
+process_autolearn (struct statfile *st, struct rspamd_task *task, GTree * tokens, struct classifier *classifier, gchar *filename, struct classifier_ctx *ctx)
+{
+ stat_file_t *statfile;
+ struct statfile *unused;
+
+ if (check_autolearn (st->autolearn, task)) {
+ if (tokens) {
+ /* Take care of subject */
+ tokenize_subject (task, &tokens);
+ msg_info ("message with id <%s> autolearned statfile '%s'", task->message_id, filename);
+
+ /* Get or create statfile */
+ statfile = get_statfile_by_symbol (task->worker->srv->statfile_pool, ctx->cfg,
+ st->symbol, &unused, TRUE);
+
+ if (statfile == NULL) {
+ return;
+ }
+
+ classifier->learn_func (ctx, task->worker->srv->statfile_pool, st->symbol, tokens, TRUE, NULL, 1., NULL);
+ maybe_write_binlog (ctx->cfg, st, statfile, tokens);
+ statfile_pool_plan_invalidate (task->worker->srv->statfile_pool, DEFAULT_STATFILE_INVALIDATE_TIME, DEFAULT_STATFILE_INVALIDATE_JITTER);
+ }
+ }
+}
+
+static gboolean
+composites_remove_symbols (gpointer key, gpointer value, gpointer data)
+{
+ struct composites_data *cd = data;
+ struct symbol_remove_data *rd = value;
+
+ if (rd->remove_symbol) {
+ g_hash_table_remove (cd->metric_res->symbols, key);
+ }
+ if (rd->remove_weight) {
+ cd->metric_res->score -= rd->ms->score;
+ }
+
+ return FALSE;
+}
+
+static void
+composites_metric_callback (gpointer key, gpointer value, gpointer data)
+{
+ struct rspamd_task *task = (struct rspamd_task *)data;
+ struct composites_data *cd = rspamd_mempool_alloc (task->task_pool, sizeof (struct composites_data));
+ struct metric_result *metric_res = (struct metric_result *)value;
+
+ cd->task = task;
+ cd->metric_res = (struct metric_result *)metric_res;
+ cd->symbols_to_remove = g_tree_new (remove_compare_data);
+ cd->checked = rspamd_mempool_alloc0 (task->task_pool, NBYTES (g_hash_table_size (task->cfg->composite_symbols)));
+
+ /* Process hash table */
+ g_hash_table_foreach (task->cfg->composite_symbols, composites_foreach_callback, cd);
+
+ /* Remove symbols that are in composites */
+ g_tree_foreach (cd->symbols_to_remove, composites_remove_symbols, cd);
+ /* Free list */
+ g_tree_destroy (cd->symbols_to_remove);
+}
+
+void
+make_composites (struct rspamd_task *task)
+{
+ g_hash_table_foreach (task->results, composites_metric_callback, task);
+}
+
+struct classifiers_cbdata {
+ struct rspamd_task *task;
+ struct lua_locked_state *nL;
+};
+
+static void
+classifiers_callback (gpointer value, void *arg)
+{
+ struct classifiers_cbdata *cbdata = arg;
+ struct rspamd_task *task;
+ struct classifier_config *cl = value;
+ struct classifier_ctx *ctx;
+ struct mime_text_part *text_part, *p1, *p2;
+ struct statfile *st;
+ GTree *tokens = NULL;
+ GList *cur;
+ f_str_t c;
+ gchar *header = NULL;
+ gint *dist = NULL, diff;
+ gboolean is_twopart = FALSE;
+
+ task = cbdata->task;
+
+ if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) {
+ cur = message_get_header (task->task_pool, task->message, header, FALSE);
+ if (cur) {
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
+ }
+ }
+ else {
+ cur = g_list_first (task->text_parts);
+ dist = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
+ }
+ }
+ ctx = cl->classifier->init_func (task->task_pool, cl);
+
+ if ((tokens = g_hash_table_lookup (task->tokens, cl->tokenizer)) == NULL) {
+ while (cur != NULL) {
+ if (header) {
+ c.len = strlen (cur->data);
+ if (c.len > 0) {
+ c.begin = cur->data;
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
+ msg_info ("cannot tokenize input");
+ return;
+ }
+ }
+ }
+ else {
+ text_part = (struct mime_text_part *)cur->data;
+ if (text_part->is_empty) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ if (dist != NULL && cur->next == NULL) {
+ /* Compare part's content */
+
+ if (*dist >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
+ else if (cur->next == NULL && is_twopart) {
+ p1 = cur->prev->data;
+ p2 = text_part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
+ c.begin = (gchar *)text_part->content->data;
+ c.len = text_part->content->len;
+ /* Tree would be freed at task pool freeing */
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens,
+ FALSE, text_part->is_utf, text_part->urls_offset)) {
+ msg_info ("cannot tokenize input");
+ return;
+ }
+ }
+ cur = g_list_next (cur);
+ }
+ g_hash_table_insert (task->tokens, cl->tokenizer, tokens);
+ }
+
+ /* Take care of subject */
+ tokenize_subject (task, &tokens);
+
+ if (tokens == NULL) {
+ return;
+ }
+
+ if (cbdata->nL != NULL) {
+ rspamd_mutex_lock (cbdata->nL->m);
+ cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task, cbdata->nL->L);
+ rspamd_mutex_unlock (cbdata->nL->m);
+ }
+ else {
+ /* Non-threaded case */
+ cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task, task->cfg->lua_state);
+ }
+
+ /* Autolearning */
+ cur = g_list_first (cl->statfiles);
+ while (cur) {
+ st = cur->data;
+ if (st->autolearn) {
+ if (check_autolearn (st->autolearn, task)) {
+ /* Process autolearn */
+ process_autolearn (st, task, tokens, cl->classifier, st->path, ctx);
+ }
+ }
+ cur = g_list_next (cur);
+ }
+}
+
+
+void
+process_statfiles (struct rspamd_task *task)
+{
+ struct classifiers_cbdata cbdata;
+
+ if (task->is_skipped) {
+ return;
+ }
+
+ if (task->tokens == NULL) {
+ task->tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_hash_table_unref, task->tokens);
+ }
+ cbdata.task = task;
+ cbdata.nL = NULL;
+ g_list_foreach (task->cfg->classifiers, classifiers_callback, &cbdata);
+
+ /* Process results */
+ make_composites (task);
+}
+
+void
+process_statfiles_threaded (gpointer data, gpointer user_data)
+{
+ struct rspamd_task *task = (struct rspamd_task *)data;
+ struct lua_locked_state *nL = user_data;
+ struct classifiers_cbdata cbdata;
+
+ if (task->is_skipped) {
+ remove_async_thread (task->s);
+ return;
+ }
+
+ if (task->tokens == NULL) {
+ task->tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_hash_table_unref, task->tokens);
+ }
+
+ cbdata.task = task;
+ cbdata.nL = nL;
+ g_list_foreach (task->cfg->classifiers, classifiers_callback, &cbdata);
+ remove_async_thread (task->s);
+}
+
+static void
+insert_metric_header (gpointer metric_name, gpointer metric_value, gpointer data)
+{
+#ifndef GLIB_HASH_COMPAT
+ struct rspamd_task *task = (struct rspamd_task *)data;
+ gint r = 0;
+ /* Try to be rfc2822 compatible and avoid long headers with folding */
+ gchar header_name[128], outbuf[1000];
+ GList *symbols = NULL, *cur;
+ struct metric_result *metric_res = (struct metric_result *)metric_value;
+ double ms, rs;
+
+ rspamd_snprintf (header_name, sizeof (header_name), "X-Spam-%s", metric_res->metric->name);
+
+ if (!check_metric_settings (metric_res, &ms, &rs)) {
+ ms = metric_res->metric->actions[METRIC_ACTION_REJECT].score;
+ }
+ if (ms > 0 && metric_res->score >= ms) {
+ r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "yes; %.2f/%.2f/%.2f; ", metric_res->score, ms, rs);
+ }
+ else {
+ r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "no; %.2f/%.2f/%.2f; ", metric_res->score, ms, rs);
+ }
+
+ symbols = g_hash_table_get_keys (metric_res->symbols);
+ cur = symbols;
+ while (cur) {
+ if (g_list_next (cur) != NULL) {
+ r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s,", (gchar *)cur->data);
+ }
+ else {
+ r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s", (gchar *)cur->data);
+ }
+ cur = g_list_next (cur);
+ }
+ g_list_free (symbols);
+#ifdef GMIME24
+ g_mime_object_append_header (GMIME_OBJECT (task->message), header_name, outbuf);
+#else
+ g_mime_message_add_header (task->message, header_name, outbuf);
+#endif
+
+#endif /* GLIB_COMPAT */
+}
+
+void
+insert_headers (struct rspamd_task *task)
+{
+ g_hash_table_foreach (task->results, insert_metric_header, task);
+}
+
+gboolean
+check_action_str (const gchar *data, gint *result)
+{
+ if (g_ascii_strncasecmp (data, "reject", sizeof ("reject") - 1) == 0) {
+ *result = METRIC_ACTION_REJECT;
+ }
+ else if (g_ascii_strncasecmp (data, "greylist", sizeof ("greylist") - 1) == 0) {
+ *result = METRIC_ACTION_GREYLIST;
+ }
+ else if (g_ascii_strncasecmp (data, "add_header", sizeof ("add_header") - 1) == 0) {
+ *result = METRIC_ACTION_ADD_HEADER;
+ }
+ else if (g_ascii_strncasecmp (data, "rewrite_subject", sizeof ("rewrite_subject") - 1) == 0) {
+ *result = METRIC_ACTION_REWRITE_SUBJECT;
+ }
+ else {
+ return FALSE;
+ }
+ return TRUE;
+}
+
+const gchar *
+str_action_metric (enum rspamd_metric_action action)
+{
+ switch (action) {
+ case METRIC_ACTION_REJECT:
+ return "reject";
+ case METRIC_ACTION_SOFT_REJECT:
+ return "soft_reject";
+ case METRIC_ACTION_REWRITE_SUBJECT:
+ return "rewrite_subject";
+ case METRIC_ACTION_ADD_HEADER:
+ return "add_header";
+ case METRIC_ACTION_GREYLIST:
+ return "greylist";
+ case METRIC_ACTION_NOACTION:
+ return "no_action";
+ case METRIC_ACTION_MAX:
+ return "invalid max action";
+ }
+
+ return "unknown action";
+}
+
+gint
+check_metric_action (double score, double required_score, struct metric *metric)
+{
+ struct metric_action *action, *selected_action = NULL;
+ double max_score = 0;
+ int i;
+
+ if (score >= required_score) {
+ return METRIC_ACTION_REJECT;
+ }
+ else if (metric->actions == NULL) {
+ return METRIC_ACTION_NOACTION;
+ }
+ else {
+ for (i = METRIC_ACTION_REJECT; i < METRIC_ACTION_MAX; i ++) {
+ action = &metric->actions[i];
+ if (action->score < 0) {
+ continue;
+ }
+ if (score >= action->score && action->score > max_score) {
+ selected_action = action;
+ max_score = action->score;
+ }
+ }
+ if (selected_action) {
+ return selected_action->action;
+ }
+ else {
+ return METRIC_ACTION_NOACTION;
+ }
+ }
+}
+
+gboolean
+learn_task (const gchar *statfile, struct rspamd_task *task, GError **err)
+{
+ GList *cur, *ex;
+ struct classifier_config *cl;
+ struct classifier_ctx *cls_ctx;
+ gchar *s;
+ f_str_t c;
+ GTree *tokens = NULL;
+ struct statfile *st;
+ stat_file_t *stf;
+ gdouble sum;
+ struct mime_text_part *part, *p1, *p2;
+ gboolean is_utf = FALSE, is_twopart = FALSE;
+ gint diff;
+
+
+ /* Load classifier by symbol */
+ cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
+ if (cl == NULL) {
+ g_set_error (err, filter_error_quark(), 1, "Statfile %s is not configured in any classifier", statfile);
+ return FALSE;
+ }
+
+ /* If classifier has 'header' option just classify header of this type */
+ if ((s = g_hash_table_lookup (cl->opts, "header")) != NULL) {
+ cur = message_get_header (task->task_pool, task->message, s, FALSE);
+ if (cur) {
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
+ }
+ }
+ else {
+ /* Classify message otherwise */
+ cur = g_list_first (task->text_parts);
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
+ }
+ }
+
+ /* Get tokens from each element */
+ while (cur) {
+ if (s != NULL) {
+ c.len = strlen (cur->data);
+ c.begin = cur->data;
+ ex = NULL;
+ }
+ else {
+ part = cur->data;
+ /* Skip empty parts */
+ if (part->is_empty) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ c.begin = (gchar *)part->content->data;
+ c.len = part->content->len;
+ is_utf = part->is_utf;
+ ex = part->urls_offset;
+ if (is_twopart && cur->next == NULL) {
+ /* Compare part's content */
+ p1 = cur->prev->data;
+ p2 = part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
+ }
+ /* Get tokens */
+ if (!cl->tokenizer->tokenize_func (
+ cl->tokenizer, task->task_pool,
+ &c, &tokens, FALSE, is_utf, ex)) {
+ g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
+ return FALSE;
+ }
+ cur = g_list_next (cur);
+ }
+
+ /* Handle messages without text */
+ if (tokens == NULL) {
+ g_set_error (err, filter_error_quark(), 3, "Cannot tokenize message, no text data");
+ msg_info ("learn failed for message <%s>, no tokens to extract", task->message_id);
+ return FALSE;
+ }
+
+ /* Take care of subject */
+ tokenize_subject (task, &tokens);
+
+ /* Init classifier */
+ cls_ctx = cl->classifier->init_func (
+ task->task_pool, cl);
+ /* Get or create statfile */
+ stf = get_statfile_by_symbol (task->worker->srv->statfile_pool,
+ cl, statfile, &st, TRUE);
+
+ /* Learn */
+ if (stf== NULL || !cl->classifier->learn_func (
+ cls_ctx, task->worker->srv->statfile_pool,
+ statfile, tokens, TRUE, &sum,
+ 1.0, err)) {
+ if (*err) {
+ msg_info ("learn failed for message <%s>, learn error: %s", task->message_id, (*err)->message);
+ return FALSE;
+ }
+ else {
+ g_set_error (err, filter_error_quark(), 4, "Learn failed, unknown learn classifier error");
+ msg_info ("learn failed for message <%s>, unknown learn error", task->message_id);
+ return FALSE;
+ }
+ }
+ /* Increase statistics */
+ task->worker->srv->stat->messages_learned++;
+
+ maybe_write_binlog (cl, st, stf, tokens);
+ msg_info ("learn success for message <%s>, for statfile: %s, sum weight: %.2f",
+ task->message_id, statfile, sum);
+ statfile_pool_plan_invalidate (task->worker->srv->statfile_pool,
+ DEFAULT_STATFILE_INVALIDATE_TIME,
+ DEFAULT_STATFILE_INVALIDATE_JITTER);
+
+ return TRUE;
+}
+
+gboolean
+learn_task_spam (struct classifier_config *cl, struct rspamd_task *task, gboolean is_spam, GError **err)
+{
+ GList *cur, *ex;
+ struct classifier_ctx *cls_ctx;
+ f_str_t c;
+ GTree *tokens = NULL;
+ struct mime_text_part *part, *p1, *p2;
+ gboolean is_utf = FALSE, is_twopart = FALSE;
+ gint diff;
+
+ cur = g_list_first (task->text_parts);
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
+ }
+
+ /* Get tokens from each element */
+ while (cur) {
+ part = cur->data;
+ /* Skip empty parts */
+ if (part->is_empty) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ c.begin = (gchar *)part->content->data;
+ c.len = part->content->len;
+ is_utf = part->is_utf;
+ ex = part->urls_offset;
+ if (is_twopart && cur->next == NULL) {
+ /*
+ * Compare part's content
+ * Note: here we don't have filters proceeded this message, so using pool variable is a bad idea
+ */
+ p1 = cur->prev->data;
+ p2 = part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
+ /* Get tokens */
+ if (!cl->tokenizer->tokenize_func (
+ cl->tokenizer, task->task_pool,
+ &c, &tokens, FALSE, is_utf, ex)) {
+ g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
+ return FALSE;
+ }
+ cur = g_list_next (cur);
+ }
+
+ /* Handle messages without text */
+ if (tokens == NULL) {
+ g_set_error (err, filter_error_quark(), 3, "Cannot tokenize message, no text data");
+ msg_info ("learn failed for message <%s>, no tokens to extract", task->message_id);
+ return FALSE;
+ }
+
+ /* Take care of subject */
+ tokenize_subject (task, &tokens);
+
+ /* Init classifier */
+ cls_ctx = cl->classifier->init_func (
+ task->task_pool, cl);
+ /* Learn */
+ if (!cl->classifier->learn_spam_func (
+ cls_ctx, task->worker->srv->statfile_pool,
+ tokens, task, is_spam, task->cfg->lua_state, err)) {
+ if (*err) {
+ msg_info ("learn failed for message <%s>, learn error: %s", task->message_id, (*err)->message);
+ return FALSE;
+ }
+ else {
+ g_set_error (err, filter_error_quark(), 4, "Learn failed, unknown learn classifier error");
+ msg_info ("learn failed for message <%s>, unknown learn error", task->message_id);
+ return FALSE;
+ }
+ }
+ /* Increase statistics */
+ task->worker->srv->stat->messages_learned++;
+
+ msg_info ("learn success for message <%s>",
+ task->message_id);
+ statfile_pool_plan_invalidate (task->worker->srv->statfile_pool,
+ DEFAULT_STATFILE_INVALIDATE_TIME,
+ DEFAULT_STATFILE_INVALIDATE_JITTER);
+
+ return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */