diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-07 12:46:27 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-07 12:46:27 +0100 |
commit | 9a891cf26de7a8cc943f4ff3ad64eb3590eb880c (patch) | |
tree | 9491ea2e7de24f43bfb17a4128ba7fed2c32dace /src/libstat/classifiers | |
parent | de12aa88fbcf67aacec46bd1324f92e24e02163a (diff) | |
download | rspamd-9a891cf26de7a8cc943f4ff3ad64eb3590eb880c.tar.gz rspamd-9a891cf26de7a8cc943f4ff3ad64eb3590eb880c.zip |
Fix some extreme cases in bayes.
Diffstat (limited to 'src/libstat/classifiers')
-rw-r--r-- | src/libstat/classifiers/bayes.c | 63 |
1 files changed, 29 insertions, 34 deletions
```diff
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 6c17e90ca..03512f771 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -167,43 +167,37 @@ bayes_classify (struct classifier_ctx * ctx,
 		g_tree_foreach (input, bayes_classify_callback, rt);
 	}
 	else {
-
-		if (rt->spam_prob == 0) {
-			final_prob = 0;
+		h = 1 - inv_chi_square (-2. * rt->spam_prob,
+			2 * rt->processed_tokens);
+		s = 1 - inv_chi_square (-2. * rt->ham_prob,
+			2 * rt->processed_tokens);
+
+		if (isfinite (s) && isfinite (h)) {
+			final_prob = (s + 1.0 - h) / 2.;
+			msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
+				" %L tokens processed of %ud total tokens",
+				task->message_id, rt->ham_prob, h, rt->spam_prob, s,
+				rt->processed_tokens, g_tree_nnodes (input));
 		}
 		else {
-			h = 1 - inv_chi_square (-2. * rt->spam_prob,
-				2 * rt->processed_tokens);
-			s = 1 - inv_chi_square (-2. * rt->ham_prob,
-				2 * rt->processed_tokens);
-
-			if (isfinite (s) && isfinite (h)) {
-				final_prob = (s + 1.0 - h) / 2.;
-				msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
-					" %L tokens processed of %ud total tokens",
-					task->message_id, rt->ham_prob, h, rt->spam_prob, s,
-					rt->processed_tokens, g_tree_nnodes (input));
+			/*
+			 * We have some overflow, hence we need to check which class
+			 * is NaN
+			 */
+			if (isfinite (h)) {
+				final_prob = 1.0;
+				msg_debug ("<%s> spam class is overflowed, as we have no"
+					" ham samples", task->message_id);
+			}
+			else if (isfinite (s)){
+				final_prob = 0.0;
+				msg_debug ("<%s> ham class is overflowed, as we have no"
+					" spam samples", task->message_id);
 			}
 			else {
-				/*
-				 * We have some overflow, hence we need to check which class
-				 * is NaN
-				 */
-				if (isfinite (h)) {
-					final_prob = 1.0;
-					msg_debug ("<%s> spam class is overflowed, as we have no"
-						" ham samples", task->message_id);
-				}
-				else if (isfinite (s)){
-					final_prob = 0.0;
-					msg_debug ("<%s> spam class is overflowed, as we have no"
-						" spam samples", task->message_id);
-				}
-				else {
-					final_prob = 0.5;
-					msg_warn ("<%s> spam and ham classes are both overflowed",
-						task->message_id);
-				}
+				final_prob = 0.5;
+				msg_warn ("<%s> spam and ham classes are both overflowed",
+					task->message_id);
 			}
 		}
 
@@ -228,7 +222,8 @@ bayes_classify (struct classifier_ctx * ctx,
 
 		if (selected_st == NULL) {
 			msg_err (
-				"unexpected classifier error: cannot select desired statfile");
+				"unexpected classifier error: cannot select desired statfile, "
+				"prob: %.4f", final_prob);
 		}
 		else {
 			/* Correctly scale HAM */
```