Fix extreme cases in bayes classifier.

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 20 Jun 2015 21:17:46 +0000 (22:17 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 23 Jun 2015 09:57:20 +0000 (10:57 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 20 Jun 2015 21:17:46 +0000 (22:17 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 23 Jun 2015 09:57:20 +0000 (10:57 +0100)
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c

index d04341c3c6d68893439e8e273f38819eb2381033..87cc6e464ce8cabed492d535a52304fdb795799d 100644 (file)
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -176,11 +176,35 @@ bayes_classify (struct classifier_ctx * ctx,
                                         2 * rt->processed_tokens);
                         s = 1 - inv_chi_square (-2. * rt->ham_prob,
                                         2 * rt->processed_tokens);
-                       final_prob = (s + 1.0 - h) / 2.;
-                       msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
-                                       " %L tokens processed of %ud total tokens",
-                                       task->message_id, rt->ham_prob, h, rt->spam_prob, s,
-                                       rt->processed_tokens, g_tree_nnodes (input));
+
+                       if (isnormal (s) && isnormal (h)) {
+                               final_prob = (s + 1.0 - h) / 2.;
+                               msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
+                                               " %L tokens processed of %ud total tokens",
+                                               task->message_id, rt->ham_prob, h, rt->spam_prob, s,
+                                               rt->processed_tokens, g_tree_nnodes (input));
+                       }
+                       else {
+                               /*
+                                * At least one of s and h is not a normal floating-point value
+                                * (e.g. NaN after overflow), so check which class is affected
+                                */
+                               if (isnormal (h)) {
+                                       final_prob = 1.0;
+                                       msg_debug ("<%s> spam class is overflowed, as we have no"
+                                                       " ham samples", task->message_id);
+                               }
+                               else if (isnormal (s)){
+                                       final_prob = 0.0;
+                                       msg_debug ("<%s> spam class is overflowed, as we have no"
+                                                       " spam samples", task->message_id);
+                               }
+                               else {
+                                       final_prob = 0.5;
+                                       msg_warn ("<%s> spam and ham classes are both overflowed",
+                                                       task->message_id);
+                               }
+                       }
                 }
  
                 if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 20 Jun 2015 21:17:46 +0000 (22:17 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 23 Jun 2015 09:57:20 +0000 (10:57 +0100)