summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/classifiers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-04 17:38:12 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-04 17:38:12 +0100
commit: d07c0aa5edf25f4b98c4d20639b9c501164806bf (patch)
tree: 1ab5fd4b7e6df271ea657cdd8c7ca6c7276e6ef9 /src/libstat/classifiers
parent: ecd34e6a20d460311ba83ddb159b42bc85506fdc (diff)
download: rspamd-d07c0aa5edf25f4b98c4d20639b9c501164806bf.tar.gz
download: rspamd-d07c0aa5edf25f4b98c4d20639b9c501164806bf.zip
[Feature] Ignore bayes with mostly metatokens or with too few text
Diffstat (limited to 'src/libstat/classifiers')
-rw-r--r-- src/libstat/classifiers/bayes.c 30
1 file changed, 28 insertions, 2 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index c9faae6bd..f836f5472 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -83,6 +83,7 @@ struct bayes_task_closure {
double spam_prob;
guint64 processed_tokens;
guint64 total_hits;
+ guint64 text_tokens;
struct rspamd_task *task;
};
@@ -158,6 +159,10 @@ bayes_classify_token (struct rspamd_classifier *ctx,
cl->ham_prob += log2 (bayes_ham_prob);
cl->processed_tokens ++;
+ if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
+ cl->text_tokens ++;
+ }
+
if (tok->t1 && tok->t2) {
msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
"spam_count: %L, ham_count: %L,"
@@ -247,14 +252,15 @@ bayes_classify (struct rspamd_classifier * ctx,
final_prob = (s + 1.0 - h) / 2.;
msg_debug_bayes (
"<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
- " %L tokens processed of %ud total tokens",
+ " %L tokens processed of %ud total tokens (%uL text tokens)",
task->message_id,
cl.ham_prob,
h,
cl.spam_prob,
s,
cl.processed_tokens,
- tokens->len);
+ tokens->len,
+ cl.text_tokens);
}
else {
/*
@@ -282,6 +288,26 @@ bayes_classify (struct rspamd_classifier * ctx,
*pprob = final_prob;
rspamd_mempool_set_variable (task->task_pool, "bayes_prob", pprob, NULL);
+ if (cl.text_tokens <= (cl.processed_tokens - cl.text_tokens) / 2) {
+ msg_info_bayes ("ignore bayes probability %.2f since we have "
+ "much more metatokens (%d) than text tokens (%d)",
+ final_prob,
+ cl.processed_tokens - cl.text_tokens, cl.text_tokens);
+
+ return TRUE;
+ }
+
+ if (ctx->cfg->min_tokens > 0 &&
+ cl.text_tokens < ctx->cfg->min_tokens * 0.1) {
+ msg_info_bayes ("ignore bayes probability %.2f since we have "
+ "too few text tokens: %d, at least %.0f is required",
+ final_prob,
+ cl.text_tokens,
+ ctx->cfg->min_tokens * 0.1);
+
+ return TRUE;
+ }
+
if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
/* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */
for (i = 0; i < ctx->statfiles_ids->len; i++) {