From d07c0aa5edf25f4b98c4d20639b9c501164806bf Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 4 Apr 2017 17:38:12 +0100 Subject: [PATCH] [Feature] Ignore bayes with mostly metatokens or with too few text --- src/libstat/classifiers/bayes.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index c9faae6bd..f836f5472 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -83,6 +83,7 @@ struct bayes_task_closure { double spam_prob; guint64 processed_tokens; guint64 total_hits; + guint64 text_tokens; struct rspamd_task *task; }; @@ -158,6 +159,10 @@ bayes_classify_token (struct rspamd_classifier *ctx, cl->ham_prob += log2 (bayes_ham_prob); cl->processed_tokens ++; + if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) { + cl->text_tokens ++; + } + if (tok->t1 && tok->t2) { msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, " "spam_count: %L, ham_count: %L," @@ -247,14 +252,15 @@ bayes_classify (struct rspamd_classifier * ctx, final_prob = (s + 1.0 - h) / 2.; msg_debug_bayes ( "<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f," - " %L tokens processed of %ud total tokens", + " %L tokens processed of %ud total tokens (%uL text tokens)", task->message_id, cl.ham_prob, h, cl.spam_prob, s, cl.processed_tokens, - tokens->len); + tokens->len, + cl.text_tokens); } else { /* @@ -282,6 +288,26 @@ bayes_classify (struct rspamd_classifier * ctx, *pprob = final_prob; rspamd_mempool_set_variable (task->task_pool, "bayes_prob", pprob, NULL); + if (cl.text_tokens <= (cl.processed_tokens - cl.text_tokens) / 2) { + msg_info_bayes ("ignore bayes probability %.2f since we have " + "much more metatokens (%uL) than text tokens (%uL)", + final_prob, + cl.processed_tokens - cl.text_tokens, cl.text_tokens); + + return TRUE; + } + + if (ctx->cfg->min_tokens > 0 && + cl.text_tokens < ctx->cfg->min_tokens * 0.1) {
msg_info_bayes ("ignore bayes probability %.2f since we have " + "too few text tokens: %uL, at least %.0f is required", + final_prob, + cl.text_tokens, + ctx->cfg->min_tokens * 0.1); + + return TRUE; + } + if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { /* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */ for (i = 0; i < ctx->statfiles_ids->len; i++) { -- 2.39.5