diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-01-05 18:55:56 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-01-05 18:55:56 +0000 |
commit | e5676fe57ac78238817b64efa107e00d34485a9c (patch) | |
tree | 45ca0cbc7738f0aa488872b2120a7d56223a5fd3 /src/libstat/stat_process.c | |
parent | c3f06a6a9cadbbc8c852f6ebe361c917aff3e384 (diff) | |
download | rspamd-e5676fe57ac78238817b64efa107e00d34485a9c.tar.gz rspamd-e5676fe57ac78238817b64efa107e00d34485a9c.zip |
Fix statistics processing
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- | src/libstat/stat_process.c | 512 |
1 files changed, 129 insertions, 383 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 1506f4d48..8a4269727 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2015, Vsevolod Stakhov +/* Copyright (c) 2015-2016, Vsevolod Stakhov * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,19 +37,8 @@ static const gint similarity_treshold = 80; -#if 0 -struct preprocess_cb_data { - struct rspamd_task *task; - GList *classifier_runtimes; - struct rspamd_tokenizer_runtime *tok; - guint results_count; - gboolean unlearn; - gboolean spam; -}; - static void rspamd_stat_tokenize_header (struct rspamd_task *task, - struct rspamd_tokenizer_runtime *tok, const gchar *name, const gchar *prefix, GArray *ar) { struct raw_header *rh, *cur; @@ -82,8 +71,8 @@ rspamd_stat_tokenize_header (struct rspamd_task *task, } static void -rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task, - struct rspamd_tokenizer_runtime *tok) +rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) { struct rspamd_image *img; struct mime_part *part; @@ -165,16 +154,17 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task, cur = g_list_first (task->cfg->classify_headers); while (cur) { - rspamd_stat_tokenize_header (task, tok, cur->data, "UA:", ar); + rspamd_stat_tokenize_header (task, cur->data, "UA:", ar); cur = g_list_next (cur); } - tok->tokenizer->tokenize_func (tok, + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, ar, TRUE, - "META:"); + "META:", + task->tokens); g_array_free (ar, TRUE); } @@ -184,24 +174,36 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_task *task, */ static void rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, - struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok) + struct rspamd_task *task) { struct mime_text_part *part; GArray *words; gchar *sub; - guint i; + guint i, reserved_len = 0; gint *pdiff; - gboolean compat; - compat = tok->tokenizer->is_compat (tok); + for (i = 0; i < task->text_parts->len; i++) { + part = g_ptr_array_index (task->text_parts, i); + + if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { + reserved_len += part->normalized_words->len; + } + /* XXX: normal window size */ + reserved_len += 5; + } + + task->tokens = g_ptr_array_sized_new (reserved_len); + rspamd_mempool_add_destructor (task->task_pool, + rspamd_ptr_array_free_hard, task->tokens); pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance"); for (i = 0; i < task->text_parts->len; i ++) { part = g_ptr_array_index (task->text_parts, i); if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { - tok->tokenizer->tokenize_func (tok, task->task_pool, - part->normalized_words, IS_PART_UTF (part), NULL); + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, + part->normalized_words, IS_PART_UTF (part), + NULL, task->tokens); } @@ -220,324 +222,118 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, compat, + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, NULL, NULL, FALSE, NULL); if (words != NULL) { - tok->tokenizer->tokenize_func (tok, + st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, words, TRUE, - "SUBJECT"); + "SUBJECT", + task->tokens); g_array_free (words, TRUE); } } - rspamd_stat_tokenize_parts_metadata (task, tok); + rspamd_stat_tokenize_parts_metadata (st_ctx, task); } -static struct rspamd_tokenizer_runtime * -rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf, - struct rspamd_stat_ctx *st_ctx, - struct rspamd_task *task, - struct rspamd_classifier_runtime *cl_runtime, - gpointer conf, gsize conf_len) +static void +rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task, gboolean learn) { - struct rspamd_tokenizer_runtime *tok = NULL; - const gchar *name; - - if (cf == NULL || cf->name == NULL) { - name = RSPAMD_DEFAULT_TOKENIZER; - cf->name = name; - } - else { - name = cf->name; - } - - tok = rspamd_mempool_alloc (task->task_pool, sizeof (*tok)); - tok->tokenizer = rspamd_stat_get_tokenizer (name); - tok->tkcf = cf; - - if (tok->tokenizer == NULL) { - return NULL; - } - - if (!tok->tokenizer->load_config (task->task_pool, tok, conf, conf_len)) { - return NULL; - } + guint i; + struct rspamd_statfile *st; + gpointer bk_run; - tok->tokens = g_tree_new (token_node_compare_func); + rspamd_stat_process_tokenize (st_ctx, task); + task->stat_runtimes = g_ptr_array_sized_new (st_ctx->statfiles->len); rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_tree_destroy, tok->tokens); - tok->name = name; - rspamd_stat_process_tokenize (st_ctx, task, tok); - cl_runtime->tok = tok; - - return tok; -} + rspamd_ptr_array_free_hard, task->stat_runtimes); -static gboolean -preprocess_init_stat_token (gpointer k, gpointer v, gpointer d) -{ - rspamd_token_t *t = (rspamd_token_t *)v; - struct preprocess_cb_data *cbdata = (struct preprocess_cb_data *)d; - struct rspamd_statfile_runtime *st_runtime; - struct rspamd_classifier_runtime *cl_runtime; - struct rspamd_token_result *res; - GList *cur, *curst; - struct rspamd_task *task; - gint i = 0; + for (i = 0; i < st_ctx->statfiles->len; i ++) { + st = g_ptr_array_index (st_ctx->statfiles, i); + g_assert (st != NULL); - task = cbdata->task; - t->results = g_array_sized_new (FALSE, TRUE, - sizeof (struct rspamd_token_result), cbdata->results_count); - g_array_set_size (t->results, cbdata->results_count); - rspamd_mempool_add_destructor (cbdata->task->task_pool, - rspamd_array_free_hard, t->results); - - cur = g_list_first (cbdata->classifier_runtimes); + bk_run = st->backend->runtime (task, st->stcf, learn, st->bkcf); - while (cur) { - cl_runtime = (struct rspamd_classifier_runtime *)cur->data; - - if (cl_runtime->clcf->min_tokens > 0 && - (guint32)g_tree_nnodes (cbdata->tok->tokens) < cl_runtime->clcf->min_tokens) { - /* Skip this classifier */ - cur = g_list_next (cur); - cl_runtime->skipped = TRUE; - continue; + if (bk_run == NULL) { + msg_err_task ("cannot init backend %s for statfile %s", + st->backend->name, st->stcf->symbol); } - curst = cl_runtime->st_runtime; - - while (curst) { - - st_runtime = (struct rspamd_statfile_runtime *)curst->data; - res = &g_array_index (t->results, struct rspamd_token_result, i); - res->cl_runtime = cl_runtime; - res->st_runtime = st_runtime; - - if (cl_runtime->backend->process_token (cbdata->task, t, res, - cl_runtime->backend->ctx)) { - - if (cl_runtime->clcf->max_tokens > 0 && - cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) { - msg_debug_task ("message contains more tokens than allowed for %s classifier: " - "%uL > %ud", cl_runtime->clcf->name, - cl_runtime->processed_tokens, - cl_runtime->clcf->max_tokens); - - return TRUE; - } - } - - i ++; - curst = g_list_next (curst); - } - - cur = g_list_next (cur); + g_ptr_array_add (task->stat_runtimes, bk_run); } - - - return FALSE; } -static GList* -rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, - struct rspamd_task *task, - lua_State *L, - gint op, - gboolean spam, - const gchar *classifier, - GError **err) +static void +rspamd_stat_backends_process (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) { - struct rspamd_classifier_config *clcf; - struct rspamd_statfile_config *stcf; - struct rspamd_classifier_runtime *cl_runtime; - struct rspamd_statfile_runtime *st_runtime; - struct rspamd_stat_backend *bk; - gpointer backend_runtime, tok_config; - GList *cur, *st_list = NULL, *curst; - GList *cl_runtimes = NULL; - guint result_size = 0, start_pos = 0, end_pos = 0; - gsize conf_len; - struct preprocess_cb_data cbdata; - - cur = g_list_first (task->cfg->classifiers); - - while (cur) { - clcf = (struct rspamd_classifier_config *)cur->data; - st_list = NULL; - - if (classifier != NULL && - (clcf->name == NULL || strcmp (clcf->name, classifier) != 0)) { - /* Skip this classifier */ - msg_debug_task ("skip classifier %s, as we are requested to check %s only", - clcf->name, classifier); - cur = g_list_next (cur); - continue; - } - - if (clcf->pre_callbacks != NULL) { - st_list = rspamd_lua_call_cls_pre_callbacks (clcf, task, FALSE, - FALSE, L); - } - if (st_list != NULL) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, st_list); - } - else { - st_list = clcf->statfiles; - } - - /* Now init runtime values */ - cl_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cl_runtime)); - cl_runtime->cl = rspamd_stat_get_classifier (clcf->classifier); - - if (cl_runtime->cl == NULL) { - g_set_error (err, rspamd_stat_quark(), 500, - "classifier %s is not defined", clcf->classifier); - g_list_free (cl_runtimes); - return NULL; - } - - cl_runtime->clcf = clcf; - - bk = rspamd_stat_get_backend (clcf->backend); - if (bk == NULL) { - g_set_error (err, rspamd_stat_quark(), 500, - "backend %s is not defined", clcf->backend); - g_list_free (cl_runtimes); - return NULL; - } - - cl_runtime->backend = bk; - - curst = st_list; - while (curst != NULL) { - stcf = (struct rspamd_statfile_config *)curst->data; - - /* On learning skip statfiles that do not belong to class */ - if (op == RSPAMD_LEARN_OP && (spam != stcf->is_spam)) { - curst = g_list_next (curst); - continue; - } - - backend_runtime = bk->runtime (task, stcf, op != RSPAMD_CLASSIFY_OP, - bk->ctx); - - if (backend_runtime == NULL) { - if (op != RSPAMD_CLASSIFY_OP) { - /* Assume backend absence as fatal error */ - g_set_error (err, rspamd_stat_quark(), 500, - "cannot open backend for statfile %s", stcf->symbol); - g_list_free (cl_runtimes); - - return NULL; - } - else { - /* Just skip this element */ - msg_warn ("backend of type %s does not exist: %s", - clcf->backend, stcf->symbol); - curst = g_list_next (curst); - continue; - } - } - - tok_config = bk->load_tokenizer_config (backend_runtime, - &conf_len); - - if (cl_runtime->tok == NULL) { - cl_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer, - st_ctx, task, cl_runtime, tok_config, conf_len); - - if (cl_runtime->tok == NULL) { - g_set_error (err, rspamd_stat_quark(), 500, - "cannot initialize tokenizer for statfile %s", stcf->symbol); - g_list_free (cl_runtimes); + guint i; + struct rspamd_statfile *st; + struct rspamd_classifier *cl; + gpointer bk_run; - return NULL; - } - } + g_assert (task->stat_runtimes != NULL); - if (!cl_runtime->tok->tokenizer->compatible_config ( - cl_runtime->tok, tok_config, conf_len)) { - g_set_error (err, rspamd_stat_quark(), 500, - "incompatible tokenizer for statfile %s", stcf->symbol); - g_list_free (cl_runtimes); + for (i = 0; i < st_ctx->statfiles->len; i++) { + st = g_ptr_array_index (st_ctx->statfiles, i); + bk_run = g_ptr_array_index (task->stat_runtimes, i); + cl = st->classifier; + g_assert (st != NULL); - return NULL; - } + if (bk_run != NULL) { + st->backend->process_tokens (task, task->tokens, i, bk_run); - st_runtime = rspamd_mempool_alloc0 (task->task_pool, - sizeof (*st_runtime)); - st_runtime->st = stcf; - st_runtime->backend_runtime = backend_runtime; - - if (stcf->is_spam) { - cl_runtime->total_spam += bk->total_learns (task, backend_runtime, - bk->ctx); + if (st->stcf->is_spam) { + cl->spam_learns = st->backend->total_learns (task, + bk_run, + st_ctx); } else { - cl_runtime->total_ham += bk->total_learns (task, backend_runtime, - bk->ctx); + cl->ham_learns = st->backend->total_learns (task, + bk_run, + st_ctx); } - - cl_runtime->st_runtime = g_list_prepend (cl_runtime->st_runtime, - st_runtime); - result_size ++; - - curst = g_list_next (curst); - end_pos ++; - } - - if (cl_runtime->st_runtime != NULL) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, - cl_runtime->st_runtime); - cl_runtimes = g_list_prepend (cl_runtimes, cl_runtime); } + } +} - /* Set positions in the results array */ - cl_runtime->start_pos = start_pos; - cl_runtime->end_pos = end_pos; +static void +rspamd_stat_backends_post_process (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) +{ + guint i; + struct rspamd_statfile *st; + gpointer bk_run; - msg_debug_task ("added runtime for %s classifier from %ud to %ud", - clcf->name, start_pos, end_pos); + g_assert (task->stat_runtimes != NULL); - start_pos = end_pos; + for (i = 0; i < st_ctx->statfiles->len; i++) { + st = g_ptr_array_index (st_ctx->statfiles, i); + bk_run = g_ptr_array_index (task->stat_runtimes, i); + g_assert (st != NULL); - /* Next classifier */ - cur = g_list_next (cur); + if (bk_run != NULL) { + st->backend->finalize_process (task, bk_run, st_ctx); + } } +} - if (cl_runtimes != NULL) { - /* Reverse list as we have used g_list_prepend */ - cl_runtimes = g_list_reverse (cl_runtimes); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_list_free, - cl_runtimes); - cur = g_list_first (cl_runtimes); - - while (cur) { - cl_runtime = cur->data; +static void +rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx, + struct rspamd_task *task) +{ + guint i; + struct rspamd_classifier *cl; - cbdata.results_count = result_size; - cbdata.classifier_runtimes = cl_runtimes; - cbdata.task = task; - cbdata.tok = cl_runtime->tok; - g_tree_foreach (cbdata.tok->tokens, preprocess_init_stat_token, - &cbdata); + for (i = 0; i < st_ctx->classifiers->len; i++) { + cl = g_ptr_array_index (st_ctx->classifiers, i); + g_assert (cl != NULL); - cur = g_list_next (cur); - } + cl->subrs->classify_func (cl, task->tokens, task); } - else if (classifier != NULL) { - /* We likely cannot find any classifier with this name */ - g_set_error (err, rspamd_stat_quark (), 404, - "cannot find classifier %s", classifier); - } - - return cl_runtimes; } rspamd_stat_result_t @@ -545,102 +341,30 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, guint stage, GError **err) { struct rspamd_stat_ctx *st_ctx; - struct rspamd_statfile_runtime *st_run; - struct rspamd_classifier_runtime *cl_run; - GList *cl_runtimes; - GList *cur, *curst; - gboolean ret = RSPAMD_STAT_PROCESS_OK; + rspamd_stat_result_t ret = RSPAMD_STAT_PROCESS_OK; st_ctx = rspamd_stat_get_ctx (); g_assert (st_ctx != NULL); - cl_runtimes = task->cl_runtimes; - if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) { - /* Initialize classifiers and statfiles runtime */ - if (task->cl_runtimes == NULL) { - if ((cl_runtimes = rspamd_stat_preprocess (st_ctx, task, L, - RSPAMD_CLASSIFY_OP, FALSE, NULL, err)) == NULL) { - return RSPAMD_STAT_PROCESS_OK; - } - - task->cl_runtimes = cl_runtimes; - cur = cl_runtimes; - - /* Finalize backend so it can load tokens delayed if needed */ - while (cur) { - cl_run = (struct rspamd_classifier_runtime *) cur->data; - curst = cl_run->st_runtime; - - while (curst) { - st_run = curst->data; - cl_run->backend->finalize_process (task, - st_run->backend_runtime, - cl_run->backend->ctx); - curst = g_list_next (curst); - } - cur = g_list_next (cur); - } - } + if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) { + /* Preprocess tokens */ + rspamd_stat_preprocess (st_ctx, task, FALSE); } else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS) { - cur = cl_runtimes; - - /* The first stage of classification */ - while (cur) { - cl_run = (struct rspamd_classifier_runtime *) cur->data; - cl_run->stage = RSPAMD_STAT_STAGE_PRE; - - if (cl_run->cl) { - cl_run->clctx = cl_run->cl->init_func (task->task_pool, - cl_run->clcf); - - if (cl_run->clctx != NULL) { - cl_run->cl->classify_func (cl_run->clctx, cl_run->tok->tokens, - cl_run, task); - } - } - - cur = g_list_next (cur); - } + /* Process backends */ + rspamd_stat_backends_process (st_ctx, task); } else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_POST) { - cur = cl_runtimes; - /* The second stage of classification */ - while (cur) { - cl_run = (struct rspamd_classifier_runtime *) cur->data; - cl_run->stage = RSPAMD_STAT_STAGE_POST; - - if (cl_run->skipped) { - cur = g_list_next (cur); - continue; - } - - cl_run = (struct rspamd_classifier_runtime *) cur->data; - cl_run->stage = RSPAMD_STAT_STAGE_POST; - - if (cl_run->skipped) { - cur = g_list_next (cur); - continue; - } - - if (cl_run->cl) { - if (cl_run->clctx != NULL) { - if (cl_run->cl->classify_func (cl_run->clctx, - cl_run->tok->tokens, - cl_run, task)) { - ret = RSPAMD_STAT_PROCESS_OK; - } - } - } - - cur = g_list_next (cur); - } + /* Process classifiers */ + rspamd_stat_backends_post_process (st_ctx, task); + rspamd_stat_classifiers_process (st_ctx, task); } return ret; } +#if 0 static gboolean rspamd_stat_learn_token (gpointer k, gpointer v, gpointer d) { @@ -911,4 +635,26 @@ rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task, return RSPAMD_STAT_PROCESS_OK; } +#else +/* TODO: finish learning */ +rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task, + gboolean spam, lua_State *L, const gchar *classifier, + GError **err) +{ + return RSPAMD_STAT_PROCESS_ERROR; +} + +/** + * Get the overall statistics for all statfile backends + * @param cfg configuration + * @param total_learns the total number of learns is stored here + * @return array of statistical information + */ +rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task, + struct rspamd_config *cfg, + guint64 *total_learns, + ucl_object_t **res) +{ + return RSPAMD_STAT_PROCESS_ERROR; +} #endif |