diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2013-05-23 17:20:02 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2013-05-23 17:20:02 +0100 |
commit | 6713fce35d73a13a49bc2650bd6c22b7f9a170a8 (patch) | |
tree | de5ee3cf53e0f6a11668fd44e6ba2e4b2a8d8345 /src/classifiers/bayes.c | |
parent | cac53229174befe479e48b7e0d5cb1d81c46c223 (diff) | |
download | rspamd-6713fce35d73a13a49bc2650bd6c22b7f9a170a8.tar.gz rspamd-6713fce35d73a13a49bc2650bd6c22b7f9a170a8.zip |
Fix calculations.
Diffstat (limited to 'src/classifiers/bayes.c')
-rw-r--r-- | src/classifiers/bayes.c | 11 |
1 file changed, 8 insertions, 3 deletions
```diff
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index f3ad36558..e701fe2a0 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -62,6 +62,7 @@ struct bayes_callback_data {
 	guint64 processed_tokens;
 	gsize max_tokens;
 	double spam_probability;
+	double ham_probability;
 };
 
 static gboolean
@@ -170,6 +171,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
 		spam_prob = spam_freq / (spam_freq + ham_freq);
 		bayes_spam_prob = (0.5 + spam_prob * total_count) / (double)total_count;
 		cd->spam_probability += log (bayes_spam_prob);
+		cd->ham_probability += log (1. - bayes_spam_prob);
 		cd->processed_tokens ++;
 	}
 
@@ -201,7 +203,7 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 	gint nodes, i = 0, selected_st = -1, cnt;
 	gint minnodes;
 	guint64 maxhits = 0;
-	double final_prob;
+	double final_prob, h, s;
 	struct statfile *st;
 	stat_file_t *file;
 	GList *cur;
@@ -237,6 +239,7 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 
 	data.processed_tokens = 0;
 	data.spam_probability = 0;
+	data.ham_probability = 0;
 	data.total_ham = 0;
 	data.total_spam = 0;
 	if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
@@ -279,10 +282,12 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 		final_prob = 0;
 	}
 	else {
-		final_prob = inv_chi_square (-2. * data.spam_probability, 2 * data.processed_tokens);
+		h = 1 - inv_chi_square (-2. * data.spam_probability, 2 * data.processed_tokens);
+		s = 1 - inv_chi_square (-2. * data.ham_probability, 2 * data.processed_tokens);
+		final_prob = (s + 1 - h) / 2.;
 	}
 
-	if (final_prob > 0 && fabs (final_prob - 0.5) > 0.0001) {
+	if (final_prob > 0 && fabs (final_prob - 0.5) > 0.1) {
 		sumbuf = memory_pool_alloc (task->task_pool, 32);
 
 		for (i = 0; i < cnt; i ++) {
```