[Feature] Probabilistically skip metatokens

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-05 15:00:24 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-05 15:00:24 +0100
commit: e8a0388b7d3b70144e49b6b5bd89a480f6781659 (patch)
tree: 789fa88a9d95c702d18dcf4d0415aca2eb044992
parent: 7c4ea9f998b92416e41cad22e8f7f15d9e07be3b (diff)
download: rspamd-e8a0388b7d3b70144e49b6b5bd89a480f6781659.tar.gz
rspamd-e8a0388b7d3b70144e49b6b5bd89a480f6781659.zip
1 files changed, 51 insertions, 3 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index f836f5472..7441153ff 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -81,6 +81,7 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg)
 struct bayes_task_closure {
 	double ham_prob;
 	double spam_prob;
+	gdouble meta_skip_prob;
 	guint64 processed_tokens;
 	guint64 total_hits;
 	guint64 text_tokens;
@@ -106,6 +107,7 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 	guint64 spam_count = 0, ham_count = 0, total_count = 0;
 	struct rspamd_statfile *st;
 	struct rspamd_task *task;
+	const gchar *token_type = "txt";
 	double spam_prob, spam_freq, ham_freq, bayes_spam_prob, bayes_ham_prob,
 		ham_prob, fw, w, norm_sum, norm_sub, val;
 
@@ -118,6 +120,22 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 	}
 #endif
 
+	if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_META && cl->meta_skip_prob > 0) {
+		val = rspamd_random_double_fast ();
+
+		if (val <= cl->meta_skip_prob) {
+			if (tok->t1 && tok->t2) {
+				msg_debug_bayes (
+						"token(meta) %uL <%*s:%*s> probabilistically skipped",
+						tok->data,
+						(int) tok->t1->len, tok->t1->begin,
+						(int) tok->t2->len, tok->t2->begin);
+			}
+
+			return;
+		}
+	}
+
 	for (i = 0; i < ctx->statfiles_ids->len; i++) {
 		id = g_array_index (ctx->statfiles_ids, gint, i);
 		st = g_ptr_array_index (ctx->ctx->statfiles, id);
@@ -162,13 +180,17 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 		if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
 			cl->text_tokens ++;
 		}
+		else {
+			token_type = "meta";
+		}
 
 		if (tok->t1 && tok->t2) {
-			msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
+			msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, total_count: %L, "
 					"spam_count: %L, ham_count: %L,"
 					"spam_prob: %.3f, ham_prob: %.3f, "
 					"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
 					"current spam prob: %.3f, current ham prob: %.3f",
+					token_type,
 					tok->data,
 					(int) tok->t1->len, tok->t1->begin,
 					(int) tok->t2->len, tok->t2->begin,
@@ -178,11 +200,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 					cl->spam_prob, cl->ham_prob);
 		}
 		else {
-			msg_debug_bayes ("token %uL <?:?>: weight: %f, total_count: %L, "
+			msg_debug_bayes ("token(%s) %uL <?:?>: weight: %f, total_count: %L, "
 					"spam_count: %L, ham_count: %L,"
 					"spam_prob: %.3f, ham_prob: %.3f, "
 					"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
 					"current spam prob: %.3f, current ham prob: %.3f",
+					token_type,
 					tok->data,
 					fw, total_count, spam_count, ham_count,
 					spam_prob, ham_prob,
@@ -212,7 +235,7 @@ bayes_classify (struct rspamd_classifier * ctx,
 	struct rspamd_statfile *st = NULL;
 	struct bayes_task_closure cl;
 	rspamd_token_t *tok;
-	guint i;
+	guint i, text_tokens = 0;
 	gint id;
 
 	g_assert (ctx != NULL);
@@ -241,6 +264,31 @@ bayes_classify (struct rspamd_classifier * ctx,
 
 	for (i = 0; i < tokens->len; i ++) {
 		tok = g_ptr_array_index (tokens, i);
+		if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
+			text_tokens ++;
+		}
+	}
+
+	if (text_tokens == 0) {
+		msg_info_task ("skip classification as there are no text tokens, "
+				"%ud total tokens",
+				tokens->len);
+
+		return TRUE;
+	}
+
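+	/* When text tokens do not outnumber metatokens, skip each metatoken with
+	 * probability meta_skip_prob. NOTE(review): text_tokens / tokens->len looks like
+	 * an integer division (always 0 here, so prob = 1.0) - confirm a cast to gdouble. */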
+	if (text_tokens > tokens->len - text_tokens) {
+		cl.meta_skip_prob = 0.0;
+	}
+	else {
+		cl.meta_skip_prob = 1.0 - text_tokens / tokens->len;
+	}
+
+	for (i = 0; i < tokens->len; i ++) {
+		tok = g_ptr_array_index (tokens, i);
 
 		bayes_classify_token (ctx, tok, &cl);
 	}
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2017-04-05 15:00:24 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2017-04-05 15:00:24 +0100
commit	e8a0388b7d3b70144e49b6b5bd89a480f6781659 (patch)
tree	789fa88a9d95c702d18dcf4d0415aca2eb044992
parent	7c4ea9f998b92416e41cad22e8f7f15d9e07be3b (diff)
download	rspamd-e8a0388b7d3b70144e49b6b5bd89a480f6781659.tar.gz rspamd-e8a0388b7d3b70144e49b6b5bd89a480f6781659.zip