From 3dadbb5159db3a59342834082144690588aa61db Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 15 Nov 2018 13:35:19 +0000 Subject: [PATCH] [Feature] Filter tokens in bayes --- src/libserver/cfg_utils.c | 2 +- src/libstat/classifiers/bayes.c | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c index 45f289e89..b874d439e 100644 --- a/src/libserver/cfg_utils.c +++ b/src/libserver/cfg_utils.c @@ -971,7 +971,7 @@ rspamd_config_new_classifier (struct rspamd_config *cfg, c = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (struct rspamd_classifier_config)); - c->min_prob_strength = 0.343; + c->min_prob_strength = 0.05; c->min_token_hits = 2; } diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index c7621cb77..2e494e526 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -123,7 +123,7 @@ bayes_classify_token (struct rspamd_classifier *ctx, { guint i; gint id; - guint64 spam_count = 0, ham_count = 0, total_count = 0; + guint spam_count = 0, ham_count = 0, total_count = 0; struct rspamd_statfile *st; struct rspamd_task *task; const gchar *token_type = "txt"; @@ -175,7 +175,7 @@ bayes_classify_token (struct rspamd_classifier *ctx, } /* Probability for this token */ - if (total_count > 0) { + if (total_count > ctx->cfg->min_token_hits) { spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns)); ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns)); spam_prob = spam_freq / (spam_freq + ham_freq); @@ -193,6 +193,18 @@ bayes_classify_token (struct rspamd_classifier *ctx, w = (fw * total_count) / (1.0 + fw * total_count); bayes_spam_prob = PROB_COMBINE (spam_prob, total_count, w, 0.5); + + if ((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + ctx->cfg->min_prob_strength) || + (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - ctx->cfg->min_prob_strength)) { + msg_debug_bayes ( + "token %uL <%*s:%*s> skipped, prob not in range: %f", + tok->data, + (int) tok->t1->len, tok->t1->begin, + (int) tok->t2->len, tok->t2->begin, bayes_spam_prob); + + return; + } + bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5); cl->spam_prob += log (bayes_spam_prob); @@ -207,8 +219,9 @@ bayes_classify_token (struct rspamd_classifier *ctx, } if (tok->t1 && tok->t2) { - msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, total_count: %L, " - "spam_count: %L, ham_count: %L," + msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, " + "total_count: %ud, " + "spam_count: %ud, ham_count: %ud," "spam_prob: %.3f, ham_prob: %.3f, " "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " "current spam prob: %.3f, current ham prob: %.3f", @@ -222,8 +235,9 @@ bayes_classify_token (struct rspamd_classifier *ctx, cl->spam_prob, cl->ham_prob); } else { - msg_debug_bayes ("token(%s) %uL : weight: %f, cf: %f, total_count: %L, " - "spam_count: %L, ham_count: %L," + msg_debug_bayes ("token(%s) %uL : weight: %f, cf: %f, " + "total_count: %ud, " + "spam_count: %ud, ham_count: %ud," "spam_prob: %.3f, ham_prob: %.3f, " "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " "current spam prob: %.3f, current ham prob: %.3f", -- 2.39.5