/* Copyright (c) 2015, Vsevolod Stakhov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "stat_api.h" #include "main.h" #include "stat_internal.h" #include "message.h" #include "lua/lua_common.h" #include #define RSPAMD_CLASSIFY_OP 0 #define RSPAMD_LEARN_OP 1 #define RSPAMD_UNLEARN_OP 2 struct preprocess_cb_data { struct rspamd_task *task; GList *classifier_runtimes; struct rspamd_tokenizer_runtime *tok; guint results_count; gboolean unlearn; gboolean spam; }; static struct rspamd_tokenizer_runtime * rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf, rspamd_mempool_t *pool, struct rspamd_tokenizer_runtime **ls) { struct rspamd_tokenizer_runtime *tok = NULL, *cur; const gchar *name; if (cf == NULL || cf->name == NULL) { name = RSPAMD_DEFAULT_TOKENIZER; } else { name = cf->name; } LL_FOREACH (*ls, cur) { if (strcmp (cur->name, name) == 0) { tok = cur; break; } } if (tok == NULL) { tok = rspamd_mempool_alloc (pool, sizeof (*tok)); tok->tokenizer = rspamd_stat_get_tokenizer (name); if (tok->tokenizer == NULL) { return NULL; } tok->tokens = g_tree_new (token_node_compare_func); rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_tree_destroy, tok->tokens); tok->name = name; LL_PREPEND(*ls, tok); } return tok; } static gboolean preprocess_init_stat_token (gpointer k, gpointer v, gpointer d) { rspamd_token_t *t = (rspamd_token_t *)v; struct preprocess_cb_data *cbdata = (struct preprocess_cb_data *)d; struct rspamd_statfile_runtime *st_runtime; struct rspamd_classifier_runtime *cl_runtime; struct rspamd_token_result *res; GList *cur, *curst; gint i = 0; t->results = g_array_sized_new (FALSE, TRUE, sizeof (struct rspamd_token_result), cbdata->results_count); g_array_set_size (t->results, cbdata->results_count); cur = g_list_first (cbdata->classifier_runtimes); while (cur) { cl_runtime = (struct rspamd_classifier_runtime *)cur->data; if (cl_runtime->clcf->min_tokens > 0 && (guint32)g_tree_nnodes (cbdata->tok->tokens) < cl_runtime->clcf->min_tokens) { /* Skip this classifier */ msg_debug ("<%s> contains less tokens than required for %s classifier: " "%ud < %ud", cbdata->task->message_id, cl_runtime->clcf->name, g_tree_nnodes (cbdata->tok->tokens), cl_runtime->clcf->min_tokens); cur = g_list_next (cur); continue; } curst = cl_runtime->st_runtime; while (curst) { st_runtime = (struct rspamd_statfile_runtime *)curst->data; res = &g_array_index (t->results, struct rspamd_token_result, i); res->cl_runtime = cl_runtime; res->st_runtime = st_runtime; if (st_runtime->backend->process_token (t, res, st_runtime->backend->ctx)) { if (cl_runtime->clcf->max_tokens > 0 && cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) { msg_debug ("<%s> contains more tokens than allowed for %s classifier: " "%ud > %ud", cbdata->task, cl_runtime->clcf->name, cl_runtime->processed_tokens, cl_runtime->clcf->max_tokens); return TRUE; } } i ++; curst = g_list_next (curst); } cur = g_list_next (cur); } return FALSE; } static GList* rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task, struct rspamd_tokenizer_runtime *tklist, lua_State *L, gint op, gboolean spam, GError **err) { struct rspamd_classifier_config *clcf; struct rspamd_statfile_config *stcf; struct rspamd_classifier_runtime *cl_runtime; struct rspamd_statfile_runtime *st_runtime; struct rspamd_stat_backend *bk; gpointer backend_runtime; GList *cur, *st_list = NULL, *curst; GList *cl_runtimes = NULL; guint result_size = 0, start_pos = 0, end_pos = 0; struct preprocess_cb_data cbdata; cur = g_list_first (task->cfg->classifiers); while (cur) { clcf = (struct rspamd_classifier_config *)cur->data; if (clcf->pre_callbacks != NULL) { st_list = rspamd_lua_call_cls_pre_callbacks (clcf, task, FALSE, FALSE, L); } if (st_list != NULL) { rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, st_list); } else { st_list = clcf->statfiles; } /* Now init runtime values */ cl_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cl_runtime)); cl_runtime->cl = rspamd_stat_get_classifier (clcf->classifier); if (cl_runtime->cl == NULL) { g_set_error (err, rspamd_stat_quark(), 500, "classifier %s is not defined", clcf->classifier); g_list_free (cl_runtimes); return NULL; } cl_runtime->clcf = clcf; cl_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, task->task_pool, &tklist); curst = st_list; while (curst != NULL) { stcf = (struct rspamd_statfile_config *)curst->data; /* On learning skip statfiles that do not belong to class */ if (op == RSPAMD_LEARN_OP && (spam != stcf->is_spam)) { curst = g_list_next (curst); continue; } bk = rspamd_stat_get_backend (stcf->backend); if (bk == NULL) { msg_warn ("backend of type %s is not defined", stcf->backend); curst = g_list_next (curst); continue; } backend_runtime = bk->runtime (stcf, op != RSPAMD_CLASSIFY_OP, bk->ctx); st_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*st_runtime)); st_runtime->st = stcf; st_runtime->backend_runtime = backend_runtime; st_runtime->backend = bk; if (stcf->is_spam) { cl_runtime->total_spam += bk->total_learns (backend_runtime, bk->ctx); } else { cl_runtime->total_ham += bk->total_learns (backend_runtime, bk->ctx); } cl_runtime->st_runtime = g_list_prepend (cl_runtime->st_runtime, st_runtime); result_size ++; curst = g_list_next (curst); end_pos ++; } if (cl_runtime->st_runtime != NULL) { rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cl_runtime->st_runtime); cl_runtimes = g_list_prepend (cl_runtimes, cl_runtime); } /* Set positions in the results array */ cl_runtime->start_pos = start_pos; cl_runtime->end_pos = end_pos; msg_debug ("added runtime for %s classifier from %ud to %ud", clcf->name, start_pos, end_pos); start_pos = end_pos; /* Next classifier */ cur = g_list_next (cur); } if (cl_runtimes != NULL) { rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cl_runtimes); cbdata.results_count = result_size; cbdata.classifier_runtimes = cl_runtimes; cbdata.task = task; cbdata.tok = cl_runtime->tok; g_tree_foreach (cl_runtime->tok->tokens, preprocess_init_stat_token, &cbdata); } return cl_runtimes; } /* * Tokenize task using the tokenizer specified */ static void rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok) { struct mime_text_part *part; GArray *words; gchar *sub; GList *cur; cur = task->text_parts; while (cur != NULL) { part = (struct mime_text_part *)cur->data; if (!part->is_empty && part->words != NULL) { /* * XXX: Use normalized words if needed here */ tok->tokenizer->tokenize_func (cf, task->task_pool, part->words, tok->tokens, part->is_utf); } cur = g_list_next (cur); } if (task->subject != NULL) { sub = task->subject; } else { sub = (gchar *)g_mime_message_get_subject (task->message); } if (sub != NULL) { words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); if (words != NULL) { tok->tokenizer->tokenize_func (cf, task->task_pool, words, tok->tokens, TRUE); g_array_free (words, TRUE); } } } rspamd_stat_result_t rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err) { struct rspamd_stat_classifier *cls; struct rspamd_classifier_config *clcf; struct rspamd_stat_ctx *st_ctx; struct rspamd_tokenizer_runtime *tklist = NULL, *tok; struct rspamd_classifier_runtime *cl_run; struct classifier_ctx *cl_ctx; GList *cl_runtimes; GList *cur; gboolean ret = RSPAMD_STAT_PROCESS_ERROR; st_ctx = rspamd_stat_get_ctx (); g_assert (st_ctx != NULL); cur = g_list_first (task->cfg->classifiers); /* Tokenization */ while (cur) { clcf = (struct rspamd_classifier_config *)cur->data; cls = rspamd_stat_get_classifier (clcf->classifier); if (cls == NULL) { g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" "for classifiers", clcf->classifier); return RSPAMD_STAT_PROCESS_ERROR; } tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, task->task_pool, &tklist); if (tok == NULL) { g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" "for tokenizers", clcf->tokenizer ? clcf->tokenizer->name : "unknown"); return RSPAMD_STAT_PROCESS_ERROR; } rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok); cur = g_list_next (cur); } /* Initialize classifiers and statfiles runtime */ if ((cl_runtimes = rspamd_stat_preprocess (st_ctx, task, tklist, L, RSPAMD_CLASSIFY_OP, FALSE, err)) == NULL) { return RSPAMD_STAT_PROCESS_ERROR; } cur = cl_runtimes; while (cur) { cl_run = (struct rspamd_classifier_runtime *)cur->data; if (cl_run->cl) { cl_ctx = cl_run->cl->init_func (task->task_pool, cl_run->clcf); if (cl_ctx != NULL) { if (cl_run->cl->classify_func (cl_ctx, cl_run->tok->tokens, cl_run, task)) { ret = RSPAMD_STAT_PROCESS_OK; } } } cur = g_list_next (cur); } return ret; } static gboolean rspamd_stat_learn_token (gpointer k, gpointer v, gpointer d) { rspamd_token_t *t = (rspamd_token_t *)v; struct preprocess_cb_data *cbdata = (struct preprocess_cb_data *)d; struct rspamd_statfile_runtime *st_runtime; struct rspamd_classifier_runtime *cl_runtime; struct rspamd_token_result *res; GList *cur, *curst; gint i = 0; cur = g_list_first (cbdata->classifier_runtimes); while (cur) { cl_runtime = (struct rspamd_classifier_runtime *)cur->data; if (cl_runtime->clcf->min_tokens > 0 && (guint32)g_tree_nnodes (cbdata->tok->tokens) < cl_runtime->clcf->min_tokens) { /* Skip this classifier */ msg_debug ("<%s> contains less tokens than required for %s classifier: " "%ud < %ud", cbdata->task->message_id, cl_runtime->clcf->name, g_tree_nnodes (cbdata->tok->tokens), cl_runtime->clcf->min_tokens); cur = g_list_next (cur); continue; } curst = cl_runtime->st_runtime; while (curst) { res = &g_array_index (t->results, struct rspamd_token_result, i); st_runtime = (struct rspamd_statfile_runtime *)curst->data; if (st_runtime->backend->learn_token (t, res, st_runtime->backend->ctx)) { cl_runtime->processed_tokens ++; if (cl_runtime->clcf->max_tokens > 0 && cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) { msg_debug ("<%s> contains more tokens than allowed for %s classifier: " "%ud > %ud", cbdata->task, cl_runtime->clcf->name, cl_runtime->processed_tokens, cl_runtime->clcf->max_tokens); return TRUE; } } i ++; curst = g_list_next (curst); } cur = g_list_next (cur); } return FALSE; } rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L, GError **err) { struct rspamd_stat_classifier *cls; struct rspamd_classifier_config *clcf; struct rspamd_stat_ctx *st_ctx; struct rspamd_tokenizer_runtime *tklist = NULL, *tok; struct rspamd_classifier_runtime *cl_run; struct rspamd_statfile_runtime *st_run; struct classifier_ctx *cl_ctx; struct preprocess_cb_data cbdata; GList *cl_runtimes; GList *cur, *curst; gboolean ret = RSPAMD_STAT_PROCESS_ERROR, unlearn = FALSE; gulong nrev; rspamd_learn_t learn_res = RSPAMD_LEARN_OK; guint i; st_ctx = rspamd_stat_get_ctx (); g_assert (st_ctx != NULL); cur = g_list_first (task->cfg->classifiers); /* Tokenization */ while (cur) { clcf = (struct rspamd_classifier_config *)cur->data; cls = rspamd_stat_get_classifier (clcf->classifier); if (cls == NULL) { g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" "for classifiers", clcf->classifier); return RSPAMD_STAT_PROCESS_ERROR; } tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, task->task_pool, &tklist); if (tok == NULL) { g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined" "for tokenizers", clcf->tokenizer ? clcf->tokenizer->name : "unknown"); return RSPAMD_STAT_PROCESS_ERROR; } rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok); cur = g_list_next (cur); } /* Check whether we have learned that file */ for (i = 0; i < st_ctx->caches_count; i ++) { learn_res = st_ctx->caches[i].process (task, spam, st_ctx->caches[i].ctx); if (learn_res == RSPAMD_LEARN_INGORE) { /* Do not learn twice */ g_set_error (err, rspamd_stat_quark (), 404, "<%s> has been already " "learned as %s, ignore it", task->message_id, spam ? "spam" : "ham"); return RSPAMD_STAT_PROCESS_ERROR; } else if (learn_res == RSPAMD_LEARN_UNLEARN) { unlearn = TRUE; } } /* Initialize classifiers and statfiles runtime */ if ((cl_runtimes = rspamd_stat_preprocess (st_ctx, task, tklist, L, unlearn ? RSPAMD_UNLEARN_OP : RSPAMD_LEARN_OP, spam, err)) == NULL) { return RSPAMD_STAT_PROCESS_ERROR; } cur = cl_runtimes; while (cur) { cl_run = (struct rspamd_classifier_runtime *)cur->data; if (cl_run->cl) { cl_ctx = cl_run->cl->init_func (task->task_pool, cl_run->clcf); if (cl_ctx != NULL) { if (cl_run->cl->learn_spam_func (cl_ctx, cl_run->tok->tokens, cl_run, task, spam, err)) { msg_debug ("learned %s classifier %s", spam ? "spam" : "ham", cl_run->clcf->name); ret = RSPAMD_STAT_PROCESS_OK; cbdata.classifier_runtimes = cur; cbdata.task = task; cbdata.tok = cl_run->tok; cbdata.unlearn = unlearn; cbdata.spam = spam; g_tree_foreach (cl_run->tok->tokens, rspamd_stat_learn_token, &cbdata); curst = g_list_first (cl_run->st_runtime); while (curst) { st_run = (struct rspamd_statfile_runtime *)curst->data; if (unlearn && spam != st_run->st->is_spam) { nrev = st_run->backend->dec_learns (st_run->backend_runtime, st_run->backend->ctx); msg_debug ("unlearned %s, new revision: %ul", st_run->st->symbol, nrev); } else { nrev = st_run->backend->inc_learns (st_run->backend_runtime, st_run->backend->ctx); msg_debug ("learned %s, new revision: %ul", st_run->st->symbol, nrev); } st_run->backend->finalize_learn (st_run->backend_runtime, st_run->backend->ctx); curst = g_list_next (curst); } } else { return RSPAMD_STAT_PROCESS_ERROR; } } } cur = g_list_next (cur); } return ret; } ucl_object_t * rspamd_stat_statistics (struct rspamd_config *cfg, guint64 *total_learns) { struct rspamd_classifier_config *clcf; struct rspamd_statfile_config *stcf; struct rspamd_stat_backend *bk; gpointer backend_runtime; GList *cur, *st_list = NULL, *curst; ucl_object_t *res = NULL, *elt; guint64 learns = 0; if (cfg != NULL && cfg->classifiers != NULL) { res = ucl_object_typed_new (UCL_ARRAY); cur = g_list_first (cfg->classifiers); while (cur) { clcf = (struct rspamd_classifier_config *)cur->data; st_list = clcf->statfiles; curst = st_list; while (curst != NULL) { stcf = (struct rspamd_statfile_config *)curst->data; bk = rspamd_stat_get_backend (stcf->backend); if (bk == NULL) { msg_warn ("backend of type %s is not defined", stcf->backend); curst = g_list_next (curst); continue; } backend_runtime = bk->runtime (stcf, FALSE, bk->ctx); learns += bk->total_learns (backend_runtime, bk->ctx); elt = bk->get_stat (backend_runtime, bk->ctx); if (elt != NULL) { ucl_array_append (res, elt); } curst = g_list_next (curst); } /* Next classifier */ cur = g_list_next (cur); } if (total_learns != NULL) { *total_learns = learns; } } return res; }