From d07c0aa5edf25f4b98c4d20639b9c501164806bf Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Tue, 4 Apr 2017 17:38:12 +0100
Subject: [PATCH] [Feature] Ignore bayes with mostly metatokens or with too few
 text tokens

---
 src/libstat/classifiers/bayes.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index c9faae6bd..f836f5472 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -83,6 +83,7 @@ struct bayes_task_closure {
 	double spam_prob;
 	guint64 processed_tokens;
 	guint64 total_hits;
+	guint64 text_tokens;
 	struct rspamd_task *task;
 };
 
@@ -158,6 +159,10 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 		cl->ham_prob += log2 (bayes_ham_prob);
 		cl->processed_tokens ++;
 
+		if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
+			cl->text_tokens ++;
+		}
+
 		if (tok->t1 && tok->t2) {
 			msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
 					"spam_count: %L, ham_count: %L,"
@@ -247,14 +252,15 @@ bayes_classify (struct rspamd_classifier * ctx,
 		final_prob = (s + 1.0 - h) / 2.;
 		msg_debug_bayes (
 				"<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
-						" %L tokens processed of %ud total tokens",
+						" %L tokens processed of %ud total tokens (%uL text tokens)",
 				task->message_id,
 				cl.ham_prob,
 				h,
 				cl.spam_prob,
 				s,
 				cl.processed_tokens,
-				tokens->len);
+				tokens->len,
+				cl.text_tokens);
 	}
 	else {
 		/*
@@ -282,6 +288,26 @@ bayes_classify (struct rspamd_classifier * ctx,
 	*pprob = final_prob;
 	rspamd_mempool_set_variable (task->task_pool, "bayes_prob", pprob, NULL);
 
+	/* Bail out early when metatokens are at least twice the text tokens */
+	if (cl.text_tokens <= (cl.processed_tokens - cl.text_tokens) / 2) {
+		msg_info_bayes ("ignore bayes probability %.2f since we have "
+				"much more metatokens (%uL) than text tokens (%uL)",
+				final_prob,
+				cl.processed_tokens - cl.text_tokens, cl.text_tokens);
+		return TRUE;
+	}
+
+	/* Bail out early when text tokens are below 10% of cfg->min_tokens */
+	if (ctx->cfg->min_tokens > 0 &&
+			cl.text_tokens < ctx->cfg->min_tokens * 0.1) {
+		msg_info_bayes ("ignore bayes probability %.2f since we have "
+				"too few text tokens: %uL, at least %.0f is required",
+				final_prob,
+				cl.text_tokens,
+				ctx->cfg->min_tokens * 0.1);
+		return TRUE;
+	}
+
 	if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
 		/* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */
 		for (i = 0; i < ctx->statfiles_ids->len; i++) {
-- 
2.39.5