[Feature] Filter tokens in bayes

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 15 Nov 2018 13:35:19 +0000 (13:35 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 15 Nov 2018 13:35:19 +0000 (13:35 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 15 Nov 2018 13:35:19 +0000 (13:35 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 15 Nov 2018 13:35:19 +0000 (13:35 +0000)
diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c

index 45f289e8964ca17fb9c096231d6f66b3a7b4a94d..b874d439e685253e66bfa7f57b8c4144866d911f 100644 (file)
--- a/src/libserver/cfg_utils.c
+++ b/src/libserver/cfg_utils.c
@@ -971,7 +971,7 @@ rspamd_config_new_classifier (struct rspamd_config *cfg,
                 c =
                         rspamd_mempool_alloc0 (cfg->cfg_pool,
                                 sizeof (struct rspamd_classifier_config));
-               c->min_prob_strength = 0.343;
+               c->min_prob_strength = 0.05;
                 c->min_token_hits = 2;
         }
  
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c

index c7621cb776251a8334502318e8877698e489536f..2e494e5267ca872ae9c04bd662650bdb6ed1eca2 100644 (file)
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -123,7 +123,7 @@ bayes_classify_token (struct rspamd_classifier *ctx,
  {
         guint i;
         gint id;
-       guint64 spam_count = 0, ham_count = 0, total_count = 0;
+       guint spam_count = 0, ham_count = 0, total_count = 0;
         struct rspamd_statfile *st;
         struct rspamd_task *task;
         const gchar *token_type = "txt";
@@ -175,7 +175,7 @@ bayes_classify_token (struct rspamd_classifier *ctx,
         }
  
         /* Probability for this token */
-       if (total_count > 0) {
+       if (total_count > ctx->cfg->min_token_hits) {
                 spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns));
                 ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
                 spam_prob = spam_freq / (spam_freq + ham_freq);
@@ -193,6 +193,18 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                 w = (fw * total_count) / (1.0 + fw * total_count);
  
                 bayes_spam_prob = PROB_COMBINE (spam_prob, total_count, w, 0.5);
+
+               if ((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + ctx->cfg->min_prob_strength) ||
+                       (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - ctx->cfg->min_prob_strength)) {
+                       msg_debug_bayes (
+                                       "token %uL <%*s:%*s> skipped, prob not in range: %f",
+                                       tok->data,
+                                       (int) tok->t1->len, tok->t1->begin,
+                                       (int) tok->t2->len, tok->t2->begin, bayes_spam_prob);
+
+                       return;
+               }
+
                 bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5);
  
                 cl->spam_prob += log (bayes_spam_prob);
@@ -207,8 +219,9 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                 }
  
                 if (tok->t1 && tok->t2) {
-                       msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, total_count: %L, "
-                                       "spam_count: %L, ham_count: %L,"
+                       msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, "
+                                       "total_count: %ud, "
+                                       "spam_count: %ud, ham_count: %ud,"
                                         "spam_prob: %.3f, ham_prob: %.3f, "
                                         "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
                                         "current spam prob: %.3f, current ham prob: %.3f",
@@ -222,8 +235,9 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                                         cl->spam_prob, cl->ham_prob);
                 }
                 else {
-                       msg_debug_bayes ("token(%s) %uL <?:?>: weight: %f, cf: %f, total_count: %L, "
-                                       "spam_count: %L, ham_count: %L,"
+                       msg_debug_bayes ("token(%s) %uL <?:?>: weight: %f, cf: %f, "
+                                       "total_count: %ud, "
+                                       "spam_count: %ud, ham_count: %ud,"
                                         "spam_prob: %.3f, ham_prob: %.3f, "
                                         "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
                                         "current spam prob: %.3f, current ham prob: %.3f",
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 15 Nov 2018 13:35:19 +0000 (13:35 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 15 Nov 2018 13:35:19 +0000 (13:35 +0000)
src/libserver/cfg_utils.c		patch | blob | history
src/libstat/classifiers/bayes.c		patch | blob | history