diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-28 19:07:26 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-28 19:07:26 +0400 |
commit | b3c36d4946f675619b81c9223f5ac1a86c55c55c (patch) | |
tree | 6cdd79cae18ce387f6c00f8ce23aef65b4a5c02b /src/classifiers/bayes.c | |
parent | 0e6a4235b1794a61d12fcde33cffaf8dd83c51f0 (diff) | |
download | rspamd-b3c36d4946f675619b81c9223f5ac1a86c55c55c.tar.gz rspamd-b3c36d4946f675619b81c9223f5ac1a86c55c55c.zip |
* Add correcting factor to statistics.
Now learning increments version of a statfile.
Avoid learning and classifying of similar text parts if a message has 2 text parts.
Several fixes to statistics.
Diffstat (limited to 'src/classifiers/bayes.c')
-rw-r--r-- | src/classifiers/bayes.c | 48 |
1 files changed, 33 insertions, 15 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index af79e0eaa..7363df522 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -47,7 +47,8 @@ struct bayes_statfile_data { guint64 total_hits; double local_probability; double post_probability; - guint64 value; + double corr; + double value; struct statfile *st; stat_file_t *file; }; @@ -60,6 +61,7 @@ struct bayes_callback_data { stat_file_t *file; struct bayes_statfile_data *statfiles; guint32 statfiles_num; + guint64 learned_tokens; }; static gboolean @@ -67,7 +69,8 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) { token_node_t *node = key; struct bayes_callback_data *cd = data; - gint v, c; + gint c; + guint64 v; c = (cd->in_class) ? 1 : -1; @@ -75,8 +78,9 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); if (v == 0 && c > 0) { statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c); + cd->learned_tokens ++; } - else { + else if (v != 0) { if (G_LIKELY (c > 0)) { v ++; } @@ -86,6 +90,7 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) } } statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v); + cd->learned_tokens ++; } return FALSE; @@ -102,24 +107,21 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) struct bayes_callback_data *cd = data; double renorm = 0; gint i; - guint64 local_hits = 0; + double local_hits = 0; struct bayes_statfile_data *cur; for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; - cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now); + cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now) * cur->corr; if (cur->value > 0) { - cur->total_hits += cur->value; + cur->total_hits ++; cur->hits = cur->value; local_hits += cur->value; } - else { - cur->value = 0; - } } for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; - cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) / + cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (1.0 + local_hits)); renorm += cur->post_probability * cur->local_probability; } @@ -145,7 +147,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) cur->post_probability = G_MINDOUBLE * 100; } if (cd->ctx->debug) { - msg_info ("token: %s, statfile: %s, probability: %uL, post_probability: %.4f", + msg_info ("token: %s, statfile: %s, probability: %.4f, post_probability: %.4f", node->extra, cur->st->symbol, cur->value, cur->post_probability); } } @@ -169,8 +171,9 @@ gboolean bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task) { struct bayes_callback_data data; - char *value; - int nodes, minnodes, i, cnt, best_num = 0; + gchar *value; + gint nodes, minnodes, i = 0, cnt, best_num = 0; + guint64 rev, total_learns = 0; double best = 0; struct statfile *st; stat_file_t *file; @@ -198,7 +201,6 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, data.ctx = ctx; cur = ctx->cfg->statfiles; - i = 0; while (cur) { /* Select statfile to learn */ st = cur->data; @@ -214,11 +216,21 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, data.statfiles[i].st = st; data.statfiles[i].post_probability = 0.5; data.statfiles[i].local_probability = 0.5; - i ++; + statfile_get_revision (file, &rev, NULL); + total_learns += rev; + cur = g_list_next (cur); + i ++; } + cnt = i; + /* Calculate correction factor */ + for (i = 0; i < cnt; i ++) { + statfile_get_revision (data.statfiles[i].file, &rev, NULL); + data.statfiles[i].corr = ((double)rev / cnt) / (double)total_learns; + } + g_tree_foreach (input, bayes_classify_callback, &data); for (i = 0; i < cnt; i ++) { @@ -277,6 +289,7 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb data.in_class = in_class; data.now = time (NULL); data.ctx = ctx; + data.learned_tokens = 0; cur = ctx->cfg->statfiles; while (cur) { /* Select statfile to learn */ @@ -321,8 +334,13 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb data.file = to_learn; statfile_pool_lock_file (pool, data.file); g_tree_foreach (input, bayes_learn_callback, &data); + statfile_inc_revision (to_learn); statfile_pool_unlock_file (pool, data.file); + if (sum != NULL) { + *sum = data.learned_tokens; + } + return TRUE; } |