From 32a96e82d075bdba6e9e567080977a76830cbce2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 29 Jan 2009 17:46:26 +0300 Subject: [PATCH] * Do another rework of filters/metrics and statfiles processing * Add 'probability' normalizing to winnow algorithm and calculate not pure weight but normalized probability --- rspamd.conf.sample | 9 +- src/cfg_file.h | 1 - src/cfg_file.y | 32 +++---- src/cfg_utils.c | 2 + src/classifiers/classifiers.c | 8 +- src/classifiers/classifiers.h | 21 +++-- src/classifiers/winnow.c | 71 +++++++++++++--- src/controller.c | 16 +++- src/filter.c | 156 ++++++++++++++++------------------ src/filter.h | 3 +- src/plugins/regexp.c | 4 +- src/plugins/surbl.c | 1 - 12 files changed, 198 insertions(+), 126 deletions(-) diff --git a/rspamd.conf.sample b/rspamd.conf.sample index 794638a5e..f0e2a9411 100644 --- a/rspamd.conf.sample +++ b/rspamd.conf.sample @@ -62,9 +62,6 @@ statfile { # Tokenizer for this statfile # Deafault: osb-text tokenizer = "osb-text"; - # Classifier for this statfile - # Default: winnow - classifier = "winnow"; }; statfile { alias = "test.ham"; @@ -73,4 +70,10 @@ statfile { size = 10M; }; +# Factors coefficients +factors { + "SURBL_DNS" = 10.5; + "winnow" = 5.5; +}; + url_filters = "surbl"; diff --git a/src/cfg_file.h b/src/cfg_file.h index bb934134c..11d008dbd 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -103,7 +103,6 @@ struct statfile { char *metric; size_t size; struct tokenizer *tokenizer; - struct classifier *classifier; }; struct config_scalar { diff --git a/src/cfg_file.y b/src/cfg_file.y index 641220870..e842c560e 100644 --- a/src/cfg_file.y +++ b/src/cfg_file.y @@ -297,6 +297,9 @@ metric: yyerror ("yyparse: not enough arguments in metric definition"); YYERROR; } + if (cur_metric->classifier == NULL) { + cur_metric->classifier = get_classifier ("winnow"); + } g_hash_table_insert (cfg->metrics, cur_metric->name, cur_metric); cur_metric = NULL; } @@ -310,6 +313,7 @@ metriccmd: | metricname | metricfunction | metricscore + | metricclassifier ; metricname: @@ -345,6 +349,18 @@ metricscore: } ; +metricclassifier: + CLASSIFIER EQSIGN QUOTEDSTRING { + if (cur_metric == NULL) { + cur_metric = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct metric)); + } + if ((cur_metric->classifier = get_classifier ($3)) == NULL) { + yyerror ("yyparse: unknown classifier %s", $3); + YYERROR; + } + } + ; + factors: FACTORS OBRACE factorsbody EBRACE ; @@ -555,9 +571,6 @@ statfile: if (cur_statfile->metric == NULL) { cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default"); } - if (cur_statfile->classifier == NULL) { - cur_statfile->classifier = get_classifier ("winnow"); - } if (cur_statfile->tokenizer == NULL) { cur_statfile->tokenizer = get_tokenizer ("osb-text"); } @@ -578,7 +591,6 @@ statfilecmd: | statfilesize | statfilemetric | statfiletokenizer - | statfileclassifier ; statfilealias: @@ -650,17 +662,7 @@ statfiletokenizer: } ; -statfileclassifier: - CLASSIFIER EQSIGN QUOTEDSTRING { - if (cur_statfile == NULL) { - cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); - } - if ((cur_statfile->classifier = get_classifier ($3)) == NULL) { - yyerror ("yyparse: unknown classifier %s", $3); - YYERROR; - } - } - ; + statfile_pool_size: STATFILE_POOL_SIZE EQSIGN SIZELIMIT { diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 99e7881b8..68c1a1174 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -19,6 +19,7 @@ #include "cfg_file.h" #include "main.h" #include "filter.h" +#include "classifiers/classifiers.h" #ifndef HAVE_OWN_QUEUE_H #include #else @@ -189,6 +190,7 @@ init_defaults (struct config_file *cfg) def_metric->func_name = "factors"; def_metric->func = factor_consolidation_func; def_metric->required_score = DEFAULT_SCORE; + def_metric->classifier = get_classifier ("winnow"); g_hash_table_insert (cfg->metrics, "default", def_metric); LIST_INIT (&cfg->perl_modules); diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c index aabdd7590..162b201c8 100644 --- a/src/classifiers/classifiers.c +++ b/src/classifiers/classifiers.c @@ -6,7 +6,13 @@ #include "classifiers.h" struct classifier classifiers[] = { - {"winnow", winnow_classify, winnow_learn, winnow_add_result }, + { + .name = "winnow", + .init_func = winnow_init, + .classify_func = winnow_classify, + .learn_func = winnow_learn, + .result_file_func = winnow_result_file + }, }; struct classifier* diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index 71a08c684..2a2929b7d 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -11,20 +11,29 @@ #include "../statfile.h" #include "../tokenizers/tokenizers.h" +struct classifier_ctx { + memory_pool_t *pool; + GHashTable *results; +}; /* Common classifier structure */ struct classifier { char *name; - double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input); - void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class); - double (*add_result_func)(double result, double new); + struct classifier_ctx* (*init_func)(memory_pool_t *pool); + void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, + char *statfile, GTree *input, double scale); + void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, + char *statfile, GTree *input, int in_class); + char* (*result_file_func)(struct classifier_ctx *ctx, double *probability); }; /* Get classifier structure by name or return NULL if this name is not found */ struct classifier* get_classifier (char *name); + /* Winnow algorithm */ -double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input); -void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class); -double winnow_add_result (double result, double new); +struct classifier_ctx* winnow_init (memory_pool_t *pool); +void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale); +void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class); +char* winnow_result_file (struct classifier_ctx* ctx, double *probability); /* Array of all defined classifiers */ extern struct classifier classifiers[]; diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index 29aa94899..5775a7997 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -3,6 +3,7 @@ */ #include +#include #include "classifiers.h" #define WINNOW_PROMOTION 1.23 @@ -10,6 +11,7 @@ struct winnow_callback_data { statfile_pool_t *pool; + struct classifier_ctx *ctx; char *filename; double sum; int count; @@ -30,6 +32,7 @@ classify_callback (gpointer key, gpointer value, gpointer data) } else { cd->sum += v; + cd->in_class ++; } cd->count ++; @@ -59,33 +62,51 @@ learn_callback (gpointer key, gpointer value, gpointer data) return FALSE; } +struct classifier_ctx* +winnow_init (memory_pool_t *pool) +{ + struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx)); + + ctx->pool = pool; + ctx->results = g_hash_table_new (g_str_hash, g_str_equal); + memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results); -double -winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input) + return ctx; +} +void +winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale) { struct winnow_callback_data data; + double *res = memory_pool_alloc (ctx->pool, sizeof (double)); + + g_assert (pool != NULL); + g_assert (ctx != NULL); data.pool = pool; data.filename = statfile; data.sum = 0; data.count = 0; data.now = time (NULL); + data.ctx = ctx; if (!statfile_pool_is_open (pool, statfile)) { if (statfile_pool_open (pool, statfile) == -1) { - return 0; + return; } } g_tree_foreach (input, classify_callback, &data); - - return data.sum / data.count; + *res = scale * (data.sum / data.count); + g_hash_table_insert (ctx->results, statfile, res); } void -winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class) +winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class) { struct winnow_callback_data data; + + g_assert (pool != NULL); + g_assert (ctx != NULL); data.pool = pool; data.filename = statfile; @@ -93,6 +114,7 @@ winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class) data.count = 0; data.in_class = in_class; data.now = time (NULL); + data.ctx = ctx; if (!statfile_pool_is_open (pool, statfile)) { if (statfile_pool_open (pool, statfile) == -1) { @@ -103,11 +125,40 @@ winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class) statfile_pool_lock_file (pool, statfile); g_tree_foreach (input, learn_callback, &data); statfile_pool_unlock_file (pool, statfile); - } -double -winnow_add_result (double result, double new) +struct winnow_result_data { + char *filename; + double max_score; + double sum; +}; + +static void +result_file_callback (gpointer key, gpointer value, gpointer data) +{ + struct winnow_result_data *d = (struct winnow_result_data *)data; + double w = *((double *)value); + + if (fabs (w) > fabs (d->max_score)) { + d->filename = (char *)key; + d->max_score = w; + } + d->sum += fabs (w); +} + +char* +winnow_result_file (struct classifier_ctx* ctx, double *probability) { - return result + new; + struct winnow_result_data data = { NULL, 0, 0 }; + g_assert (ctx != NULL); + + g_hash_table_foreach (ctx->results, result_file_callback, &data); + if (data.sum != 0) { + *probability = data.max_score / data.sum; + } + else { + *probability = 1; + } + + return data.filename; } diff --git a/src/controller.c b/src/controller.c index 8a3377c44..104b8d7ca 100644 --- a/src/controller.c +++ b/src/controller.c @@ -140,6 +140,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control time_t uptime; unsigned long size = 0; struct statfile *statfile; + struct metric *metric; memory_pool_stat_t mem_st; switch (cmd->type) { @@ -270,11 +271,19 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control return; } + + metric = g_hash_table_lookup (session->cfg->metrics, statfile->metric); + session->learn_rcpt = NULL; session->learn_from = NULL; session->learn_filename = NULL; session->learn_tokenizer = statfile->tokenizer; - session->learn_classifier = statfile->classifier; + if (metric != NULL) { + session->learn_classifier = metric->classifier; + } + else { + session->learn_classifier = get_classifier ("winnow"); + } /* By default learn positive */ session->in_class = 1; /* Get all arguments */ @@ -348,6 +357,7 @@ static void read_socket (struct bufferevent *bev, void *arg) { struct controller_session *session = (struct controller_session *)arg; + struct classifier_ctx *cls_ctx; int len, i; char *s, **params, *cmd, out_buf[128]; GList *comp_list, *cur = NULL; @@ -424,7 +434,9 @@ read_socket (struct bufferevent *bev, void *arg) return; } } - session->learn_classifier->learn_func (session->worker->srv->statfile_pool, session->learn_filename, tokens, session->in_class); + cls_ctx = session->learn_classifier->init_func (session->session_pool); + session->learn_classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool, + session->learn_filename, tokens, session->in_class); session->worker->srv->stat->messages_learned ++; i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF); bufferevent_write (bev, out_buf, i); diff --git a/src/filter.c b/src/filter.c index b847831d7..8e1362428 100644 --- a/src/filter.c +++ b/src/filter.c @@ -14,10 +14,12 @@ #include "tokenizers/tokenizers.h" void -insert_result (struct worker_task *task, const char *metric_name, const char *symbol, u_char flag) +insert_result (struct worker_task *task, const char *metric_name, const char *symbol, double flag) { struct metric *metric; struct metric_result *metric_res; + double *fl = memory_pool_alloc (task->task_pool, sizeof (double)); + *fl = flag; metric = g_hash_table_lookup (task->worker->srv->cfg->metrics, metric_name); if (metric == NULL) { @@ -35,42 +37,50 @@ insert_result (struct worker_task *task, const char *metric_name, const char *sy g_hash_table_insert (task->results, (gpointer)metric_name, metric_res); } - g_hash_table_insert (metric_res->symbols, (gpointer)symbol, GSIZE_TO_POINTER (flag)); + g_hash_table_insert (metric_res->symbols, (gpointer)symbol, fl); } /* * Default consolidation function based on factors in config file */ +struct consolidation_callback_data { + struct worker_task *task; + double score; +}; + +static void +consolidation_callback (gpointer key, gpointer value, gpointer arg) +{ + double *factor; + double val = *(double *)value; + struct consolidation_callback_data *data = (struct consolidation_callback_data *)arg; + + factor = g_hash_table_lookup (data->task->worker->srv->cfg->factors, key); + if (factor == NULL) { + msg_debug ("consolidation_callback: got %.2f score for metric %s, factor: 1", val, (char *)key); + data->score += val; + } + else { + data->score += *factor * val; + msg_debug ("consolidation_callback: got %.2f score for metric %s, factor: %.2f", val, (char *)key, *factor); + } +} + double factor_consolidation_func (struct worker_task *task, const char *metric_name) { struct metric_result *metric_res; - double *factor; double res = 0.; - GList *symbols = NULL, *cur; + struct consolidation_callback_data data = { task, 0 }; metric_res = g_hash_table_lookup (task->results, metric_name); if (metric_res == NULL) { return res; } - symbols = g_hash_table_get_keys (metric_res->symbols); - cur = g_list_first (symbols); - while (cur) { - factor = g_hash_table_lookup (task->worker->srv->cfg->factors, cur->data); - if (factor == NULL) { - /* Default multiplier is 1 */ - res ++; - } - else { - res += *factor; - } - cur = g_list_next (cur); - } + g_hash_table_foreach (metric_res->symbols, consolidation_callback, &data); - g_list_free (symbols); - - return res; + return data.score; } /* @@ -273,6 +283,7 @@ composites_foreach_callback (gpointer key, gpointer value, void *data) GQueue *stack; GList *symbols = NULL, *s; gsize cur, op1, op2; + double *res; stack = g_queue_new (); @@ -326,7 +337,9 @@ composites_foreach_callback (gpointer key, gpointer value, void *data) s = g_list_next (s); } /* Add new symbol */ - g_hash_table_insert (cd->metric_res->symbols, key, GSIZE_TO_POINTER (op1)); + res = memory_pool_alloc (cd->task->task_pool, sizeof (double)); + *res = 1; + g_hash_table_insert (cd->metric_res->symbols, key, res); } } @@ -355,30 +368,32 @@ make_composites (struct worker_task *task) g_hash_table_foreach (task->results, composites_metric_callback, task); } +struct statfile_result_data { + struct metric *metric; + struct classifier_ctx *ctx; +}; + struct statfile_callback_data { - GHashTable *metrics; GHashTable *tokens; + GHashTable *classifiers; struct worker_task *task; }; -struct statfile_result { - double weight; - GList *symbols; - struct classifier *classifier; -}; - static void statfiles_callback (gpointer key, gpointer value, void *arg) { struct statfile_callback_data *data= (struct statfile_callback_data *)arg; struct worker_task *task = data->task; struct statfile *st = (struct statfile *)value; + struct classifier *classifier; + struct statfile_result_data *res_data; + struct metric *metric; + GTree *tokens = NULL; - char *filename; - double weight; - struct statfile_result *res; GList *cur = NULL; GByteArray *content; + + char *filename; f_str_t c; if (g_list_length (task->rcpt) == 1) { @@ -406,65 +421,37 @@ statfiles_callback (gpointer key, gpointer value, void *arg) g_hash_table_insert (data->tokens, st->tokenizer, tokens); } - weight = st->classifier->classify_func (task->worker->srv->statfile_pool, filename, tokens); - - msg_debug ("process_statfiles: got classify weight: %.2f", weight); - - if (weight > 0.000001) { - - if ((res = g_hash_table_lookup (data->metrics, st->metric)) == NULL) { - res = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result)); - res->symbols = g_list_prepend (NULL, st->alias); - res->weight = st->classifier->add_result_func (0, weight * st->weight); - g_hash_table_insert (data->metrics, st->metric, res); - } - else { - res->symbols = g_list_prepend (NULL, st->alias); - res->weight = st->classifier->add_result_func (res->weight, weight * st->weight); - } - msg_debug ("process_statfiles: result weight: %.2f", res->weight); + metric = g_hash_table_lookup (task->cfg->metrics, st->metric); + if (metric == NULL) { + classifier = get_classifier ("winnow"); + } + else { + classifier = metric->classifier; + } + if ((res_data = g_hash_table_lookup (data->classifiers, classifier)) == NULL) { + res_data = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result_data)); + res_data->ctx = classifier->init_func (task->task_pool); + res_data->metric = metric; + g_hash_table_insert (data->classifiers, classifier, res_data); } + classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight); + } static void statfiles_results_callback (gpointer key, gpointer value, void *arg) { struct worker_task *task = (struct worker_task *)arg; - struct metric_result *metric_res; - struct metric *metric; - struct statfile_result *res = (struct statfile_result *)value; - GList *cur_symbol; - - metric_res = g_hash_table_lookup (task->results, (char *)key); - - metric = g_hash_table_lookup (task->worker->srv->cfg->metrics, (char *)key); - if (metric == NULL) { - return; - } - - if (metric_res == NULL) { - /* Create new metric chain */ - metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result)); - metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal); - memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_hash_table_destroy, metric_res->symbols); - metric_res->metric = metric; - metric_res->score = res->weight; - g_hash_table_insert (task->results, metric->name, metric_res); - } - else { - metric_res->score += res->weight; - } - msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", metric_res->score, metric->name); - - cur_symbol = g_list_first (res->symbols); - while (cur_symbol) { - msg_debug ("statfiles_results_callback: insert symbol %s to metric %s", (char *)cur_symbol->data, metric->name); - g_hash_table_insert (metric_res->symbols, (char *)cur_symbol->data, GSIZE_TO_POINTER (1)); - cur_symbol = g_list_next (cur_symbol); - } + struct statfile_result_data *res = (struct statfile_result_data *)value; + struct classifier *classifier = (struct classifier *)key; + double *w; + char *filename; - g_list_free (res->symbols); + w = memory_pool_alloc (task->task_pool, sizeof (double)); + filename = classifier->result_file_func (res->ctx, w); + insert_result (task, res->metric->name, classifier->name, *w); + msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name); } @@ -476,13 +463,14 @@ process_statfiles (struct worker_task *task) cd.task = task; cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal); - cd.metrics = g_hash_table_new (g_str_hash, g_str_equal); + cd.classifiers = g_hash_table_new (g_str_hash, g_str_equal); g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd); - g_hash_table_foreach (cd.metrics, statfiles_results_callback, task); + g_hash_table_foreach (cd.classifiers, statfiles_results_callback, task); g_hash_table_destroy (cd.tokens); - g_hash_table_destroy (cd.metrics); + g_hash_table_destroy (cd.classifiers); + g_hash_table_foreach (task->results, metric_process_callback, task); task->state = WRITE_REPLY; } diff --git a/src/filter.h b/src/filter.h index dec13fd72..33c55b162 100644 --- a/src/filter.h +++ b/src/filter.h @@ -27,6 +27,7 @@ struct metric { char *func_name; metric_cons_func func; double required_score; + struct classifier *classifier; }; struct metric_result { @@ -37,7 +38,7 @@ struct metric_result { int process_filters (struct worker_task *task); void process_statfiles (struct worker_task *task); -void insert_result (struct worker_task *task, const char *metric_name, const char *symbol, u_char flag); +void insert_result (struct worker_task *task, const char *metric_name, const char *symbol, double flag); void make_composites (struct worker_task *task); double factor_consolidation_func (struct worker_task *task, const char *metric_name); diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 6722e7b92..6213f778b 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -167,13 +167,13 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) } return 0; case REGEXP_MESSAGE: - msg_debug ("process_message: checking mime regexp: /%s/", re->regexp_text); + msg_debug ("process_message: checking message regexp: /%s/", re->regexp_text); if (g_regex_match_full (re->regexp, task->msg->buf->begin, task->msg->buf->len, 0, 0, NULL, NULL) == TRUE) { return 1; } return 0; case REGEXP_URL: - msg_debug ("process_url: checking mime regexp: /%s/", re->regexp_text); + msg_debug ("process_url: checking url regexp: /%s/", re->regexp_text); TAILQ_FOREACH (url, &task->urls, next) { if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) { return 1; diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index dc790f1bf..1a0377233 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -313,7 +313,6 @@ dns_callback (int result, char type, int count, int ttl, void *addresses, void * } else { msg_debug ("surbl_check: url %s is not in surbl %s", param->url->host, surbl_module_ctx->suffix); - insert_result (param->task, surbl_module_ctx->metric, surbl_module_ctx->symbol, 0); } param->task->save.saved --; -- 2.39.5