diff options
Diffstat (limited to 'src/classifiers/bayes.c')
-rw-r--r-- | src/classifiers/bayes.c | 322 |
1 files changed, 130 insertions, 192 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index 2d3fca084..a8a18f5ff 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -25,13 +25,13 @@ /* * Bayesian classifier */ -#include "binlog.h" -#include "cfg_file.h" #include "classifiers.h" +#include "tokenizers/tokenizers.h" +#include "main.h" #include "filter.h" +#include "cfg_file.h" +#include "binlog.h" #include "lua/lua_common.h" -#include "main.h" -#include "tokenizers/tokenizers.h" #define LOCAL_PROB_DENOM 16.0 @@ -42,68 +42,56 @@ bayes_error_quark (void) } struct bayes_statfile_data { - guint64 hits; - guint64 total_hits; - double value; - struct rspamd_statfile_config *st; - stat_file_t *file; + guint64 hits; + guint64 total_hits; + double value; + struct rspamd_statfile_config *st; + stat_file_t *file; }; struct bayes_callback_data { - statfile_pool_t *pool; - struct classifier_ctx *ctx; - gboolean in_class; - time_t now; - stat_file_t *file; - struct bayes_statfile_data *statfiles; - guint32 statfiles_num; - guint64 total_spam; - guint64 total_ham; - guint64 processed_tokens; - gsize max_tokens; - double spam_probability; - double ham_probability; + statfile_pool_t *pool; + struct classifier_ctx *ctx; + gboolean in_class; + time_t now; + stat_file_t *file; + struct bayes_statfile_data *statfiles; + guint32 statfiles_num; + guint64 total_spam; + guint64 total_ham; + guint64 processed_tokens; + gsize max_tokens; + double spam_probability; + double ham_probability; }; -static gboolean +static gboolean bayes_learn_callback (gpointer key, gpointer value, gpointer data) { - token_node_t *node = key; - struct bayes_callback_data *cd = data; - gint c; - guint64 v; + token_node_t *node = key; + struct bayes_callback_data *cd = data; + gint c; + guint64 v; c = (cd->in_class) ? 1 : -1; /* Consider that not found blocks have value 1 */ - v = - statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, - cd->now); + v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); if (v == 0 && c > 0) { - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - c); - cd->processed_tokens++; + statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c); + cd->processed_tokens ++; } else if (v != 0) { if (G_LIKELY (c > 0)) { - v++; + v ++; } - else if (c < 0) { + else if (c < 0){ if (v != 0) { - v--; + v --; } } - statfile_pool_set_block (cd->pool, - cd->file, - node->h1, - node->h2, - cd->now, - v); - cd->processed_tokens++; + statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v); + cd->processed_tokens ++; } if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) { @@ -145,7 +133,7 @@ inv_chi_square (gdouble value, gint freedom_deg) return 0; } sum = prob; - for (i = 1; i < freedom_deg / 2; i++) { + for (i = 1; i < freedom_deg / 2; i ++) { prob *= value / (gdouble)i; sum += prob; } @@ -160,20 +148,16 @@ static gboolean bayes_classify_callback (gpointer key, gpointer value, gpointer data) { - token_node_t *node = key; - struct bayes_callback_data *cd = data; - guint i; - struct bayes_statfile_data *cur; - guint64 spam_count = 0, ham_count = 0, total_count = 0; - double spam_prob, spam_freq, ham_freq, bayes_spam_prob; + token_node_t *node = key; + struct bayes_callback_data *cd = data; + guint i; + struct bayes_statfile_data *cur; + guint64 spam_count = 0, ham_count = 0, total_count = 0; + double spam_prob, spam_freq, ham_freq, bayes_spam_prob; - for (i = 0; i < cd->statfiles_num; i++) { + for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; - cur->value = statfile_pool_get_block (cd->pool, - cur->file, - node->h1, - node->h2, - cd->now); + cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now); if (cur->value > 0) { cur->total_hits += cur->value; if (cur->st->is_spam) { @@ -194,7 +178,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count); cd->spam_probability += log (bayes_spam_prob); cd->ham_probability += log (1. - bayes_spam_prob); - cd->processed_tokens++; + cd->processed_tokens ++; } if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) { @@ -205,11 +189,10 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) return FALSE; } -struct classifier_ctx * +struct classifier_ctx* bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg) { - struct classifier_ctx *ctx = - rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); + struct classifier_ctx *ctx = rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx)); ctx->pool = pool; ctx->cfg = cfg; @@ -219,28 +202,23 @@ bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg) } gboolean -bayes_classify (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - lua_State *L) +bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task, lua_State *L) { - struct bayes_callback_data data; - gchar *value; - gint nodes, i = 0, selected_st = -1, cnt; - gint minnodes; - guint64 maxhits = 0, rev; - double final_prob, h, s; - struct rspamd_statfile_config *st; - stat_file_t *file; - GList *cur; - char *sumbuf; + struct bayes_callback_data data; + gchar *value; + gint nodes, i = 0, selected_st = -1, cnt; + gint minnodes; + guint64 maxhits = 0, rev; + double final_prob, h, s; + struct rspamd_statfile_config *st; + stat_file_t *file; + GList *cur; + char *sumbuf; g_assert (pool != NULL); g_assert (ctx != NULL); - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); nodes = g_tree_nnodes (input); if (nodes > FEATURE_WINDOW_SIZE) { @@ -253,8 +231,7 @@ bayes_classify (struct classifier_ctx * ctx, cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); if (cur) { - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, cur); + rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur); } else { cur = ctx->cfg->statfiles; @@ -271,8 +248,7 @@ bayes_classify (struct classifier_ctx * ctx, data.ham_probability = 0; data.total_ham = 0; data.total_spam = 0; - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { minnodes = rspamd_config_parse_limit (value, -1); data.max_tokens = minnodes; } @@ -284,11 +260,10 @@ bayes_classify (struct classifier_ctx * ctx, /* Select statfile to classify */ st = cur->data; if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { msg_warn ("cannot open %s", st->path); cur = g_list_next (cur); - data.statfiles_num--; + data.statfiles_num --; continue; } } @@ -303,7 +278,7 @@ bayes_classify (struct classifier_ctx * ctx, } cur = g_list_next (cur); - i++; + i ++; } cnt = i; @@ -314,19 +289,17 @@ bayes_classify (struct classifier_ctx * ctx, final_prob = 0; } else { - h = 1 - inv_chi_square (-2. * data.spam_probability, - 2 * data.processed_tokens); - s = 1 - inv_chi_square (-2. * data.ham_probability, - 2 * data.processed_tokens); + h = 1 - inv_chi_square (-2. * data.spam_probability, 2 * data.processed_tokens); + s = 1 - inv_chi_square (-2. * data.ham_probability, 2 * data.processed_tokens); final_prob = (s + 1 - h) / 2.; } if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { sumbuf = rspamd_mempool_alloc (task->task_pool, 32); - for (i = 0; i < cnt; i++) { + for (i = 0; i < cnt; i ++) { if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) || - (final_prob < 0.5 && data.statfiles[i].st->is_spam)) { + (final_prob < 0.5 && data.statfiles[i].st->is_spam)) { continue; } if (data.statfiles[i].total_hits > maxhits) { @@ -335,8 +308,7 @@ bayes_classify (struct classifier_ctx * ctx, } } if (selected_st == -1) { - msg_err ( - "unexpected classifier error: cannot select desired statfile"); + msg_err ("unexpected classifier error: cannot select desired statfile"); } else { /* Calculate ham probability correctly */ @@ -345,10 +317,7 @@ bayes_classify (struct classifier_ctx * ctx, } rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.); cur = g_list_prepend (NULL, sumbuf); - insert_result (task, - data.statfiles[selected_st].st->symbol, - final_prob, - cur); + insert_result (task, data.statfiles[selected_st].st->symbol, final_prob, cur); } } @@ -358,44 +327,34 @@ bayes_classify (struct classifier_ctx * ctx, } gboolean -bayes_learn (struct classifier_ctx * ctx, - statfile_pool_t *pool, - const char *symbol, - GTree *input, - gboolean in_class, - double *sum, - double multiplier, - GError **err) +bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input, + gboolean in_class, double *sum, double multiplier, GError **err) { - struct bayes_callback_data data; - gchar *value; - gint nodes; - gint minnodes; - struct rspamd_statfile_config *st, *sel_st = NULL; - stat_file_t *to_learn; - GList *cur; + struct bayes_callback_data data; + gchar *value; + gint nodes; + gint minnodes; + struct rspamd_statfile_config *st, *sel_st = NULL; + stat_file_t *to_learn; + GList *cur; g_assert (pool != NULL); g_assert (ctx != NULL); - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); nodes = g_tree_nnodes (input); if (nodes > FEATURE_WINDOW_SIZE) { nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; } if (nodes < minnodes) { - msg_info ( - "do not learn message as it has too few tokens: %d, while %d min", - nodes, - minnodes); + msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes); *sum = 0; g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "message contains too few tokens: %d, while min is %d", - nodes, (int)minnodes); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "message contains too few tokens: %d, while min is %d", + nodes, (int)minnodes); return FALSE; } } @@ -406,8 +365,7 @@ bayes_learn (struct classifier_ctx * ctx, data.ctx = ctx; data.processed_tokens = 0; data.processed_tokens = 0; - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { minnodes = rspamd_config_parse_limit (value, -1); data.max_tokens = minnodes; } @@ -426,36 +384,31 @@ bayes_learn (struct classifier_ctx * ctx, } if (sel_st == NULL) { g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot find statfile for symbol: %s", - symbol); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "cannot find statfile for symbol: %s", + symbol); return FALSE; } if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) { - if ((to_learn = - statfile_pool_open (pool, sel_st->path, sel_st->size, - FALSE)) == NULL) { + if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) { msg_warn ("cannot open %s", sel_st->path); if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) { msg_err ("cannot create statfile %s", sel_st->path); g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot create statfile: %s", - sel_st->path); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "cannot create statfile: %s", + sel_st->path); return FALSE; } - if ((to_learn = - statfile_pool_open (pool, sel_st->path, sel_st->size, - FALSE)) == NULL) { + if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) { g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot open statfile %s after creation", - sel_st->path); - msg_err ("cannot open statfile %s after creation", - sel_st->path); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "cannot open statfile %s after creation", + sel_st->path); + msg_err ("cannot open statfile %s after creation", sel_st->path); return FALSE; } } @@ -474,28 +427,22 @@ bayes_learn (struct classifier_ctx * ctx, } gboolean -bayes_learn_spam (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task, - gboolean is_spam, - lua_State *L, - GError **err) +bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, + GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err) { - struct bayes_callback_data data; - gchar *value; - gint nodes; - gint minnodes; - struct rspamd_statfile_config *st; - stat_file_t *file; - GList *cur; - gboolean skip_labels; + struct bayes_callback_data data; + gchar *value; + gint nodes; + gint minnodes; + struct rspamd_statfile_config *st; + stat_file_t *file; + GList *cur; + gboolean skip_labels; g_assert (pool != NULL); g_assert (ctx != NULL); - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); nodes = g_tree_nnodes (input); if (nodes > FEATURE_WINDOW_SIZE) { @@ -503,10 +450,10 @@ bayes_learn_spam (struct classifier_ctx * ctx, } if (nodes < minnodes) { g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "message contains too few tokens: %d, while min is %d", - nodes, (int)minnodes); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "message contains too few tokens: %d, while min is %d", + nodes, (int)minnodes); return FALSE; } } @@ -514,8 +461,7 @@ bayes_learn_spam (struct classifier_ctx * ctx, cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L); if (cur) { skip_labels = FALSE; - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_list_free, cur); + rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur); } else { /* Do not try to learn specific statfiles if pre callback returned nil */ @@ -529,8 +475,7 @@ bayes_learn_spam (struct classifier_ctx * ctx, data.in_class = TRUE; data.processed_tokens = 0; - if (ctx->cfg->opts && - (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) { minnodes = rspamd_config_parse_limit (value, -1); data.max_tokens = minnodes; } @@ -546,28 +491,24 @@ bayes_learn_spam (struct classifier_ctx * ctx, continue; } if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { - if ((file = - statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { msg_warn ("cannot open %s", st->path); if (statfile_pool_create (pool, st->path, st->size) == -1) { msg_err ("cannot create statfile %s", st->path); g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot create statfile: %s", - st->path); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "cannot create statfile: %s", + st->path); return FALSE; } - if ((file = - statfile_pool_open (pool, st->path, st->size, - FALSE)) == NULL) { + if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { g_set_error (err, - bayes_error_quark (), /* error domain */ - 1, /* error code */ - "cannot open statfile %s after creation", - st->path); - msg_err ("cannot open statfile %s after creation", - st->path); + bayes_error_quark(), /* error domain */ + 1, /* error code */ + "cannot open statfile %s after creation", + st->path); + msg_err ("cannot open statfile %s after creation", st->path); return FALSE; } } @@ -587,10 +528,7 @@ bayes_learn_spam (struct classifier_ctx * ctx, } GList * -bayes_weights (struct classifier_ctx * ctx, - statfile_pool_t *pool, - GTree *input, - struct rspamd_task *task) +bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task) { /* This function is unimplemented with new normalizer */ return NULL; |