summary refs log tree commit diff stats
path: root/src/libstat/classifiers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-07 12:46:27 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-07 12:46:27 +0100
commit 9a891cf26de7a8cc943f4ff3ad64eb3590eb880c (patch)
tree 9491ea2e7de24f43bfb17a4128ba7fed2c32dace /src/libstat/classifiers
parent de12aa88fbcf67aacec46bd1324f92e24e02163a (diff)
download rspamd-9a891cf26de7a8cc943f4ff3ad64eb3590eb880c.tar.gz
rspamd-9a891cf26de7a8cc943f4ff3ad64eb3590eb880c.zip
Fix some extreme cases in bayes.
Diffstat (limited to 'src/libstat/classifiers')
-rw-r--r-- src/libstat/classifiers/bayes.c 63
1 files changed, 29 insertions, 34 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 6c17e90ca..03512f771 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -167,43 +167,37 @@ bayes_classify (struct classifier_ctx * ctx,
g_tree_foreach (input, bayes_classify_callback, rt);
}
else {
-
- if (rt->spam_prob == 0) {
- final_prob = 0;
+ h = 1 - inv_chi_square (-2. * rt->spam_prob,
+ 2 * rt->processed_tokens);
+ s = 1 - inv_chi_square (-2. * rt->ham_prob,
+ 2 * rt->processed_tokens);
+
+ if (isfinite (s) && isfinite (h)) {
+ final_prob = (s + 1.0 - h) / 2.;
+ msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
+ " %L tokens processed of %ud total tokens",
+ task->message_id, rt->ham_prob, h, rt->spam_prob, s,
+ rt->processed_tokens, g_tree_nnodes (input));
}
else {
- h = 1 - inv_chi_square (-2. * rt->spam_prob,
- 2 * rt->processed_tokens);
- s = 1 - inv_chi_square (-2. * rt->ham_prob,
- 2 * rt->processed_tokens);
-
- if (isfinite (s) && isfinite (h)) {
- final_prob = (s + 1.0 - h) / 2.;
- msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
- " %L tokens processed of %ud total tokens",
- task->message_id, rt->ham_prob, h, rt->spam_prob, s,
- rt->processed_tokens, g_tree_nnodes (input));
+ /*
+ * We have some overflow, hence we need to check which class
+ * is NaN
+ */
+ if (isfinite (h)) {
+ final_prob = 1.0;
+ msg_debug ("<%s> spam class is overflowed, as we have no"
+ " ham samples", task->message_id);
+ }
+ else if (isfinite (s)){
+ final_prob = 0.0;
+ msg_debug ("<%s> ham class is overflowed, as we have no"
+ " spam samples", task->message_id);
}
else {
- /*
- * We have some overflow, hence we need to check which class
- * is NaN
- */
- if (isfinite (h)) {
- final_prob = 1.0;
- msg_debug ("<%s> spam class is overflowed, as we have no"
- " ham samples", task->message_id);
- }
- else if (isfinite (s)){
- final_prob = 0.0;
- msg_debug ("<%s> spam class is overflowed, as we have no"
- " spam samples", task->message_id);
- }
- else {
- final_prob = 0.5;
- msg_warn ("<%s> spam and ham classes are both overflowed",
- task->message_id);
- }
+ final_prob = 0.5;
+ msg_warn ("<%s> spam and ham classes are both overflowed",
+ task->message_id);
}
}
@@ -228,7 +222,8 @@ bayes_classify (struct classifier_ctx * ctx,
if (selected_st == NULL) {
msg_err (
- "unexpected classifier error: cannot select desired statfile");
+ "unexpected classifier error: cannot select desired statfile, "
+ "prob: %.4f", final_prob);
}
else {
/* Correctly scale HAM */