source.dussan.org Git - rspamd.git/commitdiff
[Feature] Ignore bayes with mostly metatokens or with too few text tokens
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 4 Apr 2017 16:38:12 +0000 (17:38 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 4 Apr 2017 16:38:12 +0000 (17:38 +0100)
src/libstat/classifiers/bayes.c

index c9faae6bd32280eb375cc40bcc437bc859e4e269..f836f54722d6bfeae085669d2b9b448d905e3e2a 100644 (file)
@@ -83,6 +83,7 @@ struct bayes_task_closure {
        double spam_prob;
        guint64 processed_tokens;
        guint64 total_hits;
+       guint64 text_tokens;
        struct rspamd_task *task;
 };
 
@@ -158,6 +159,10 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                cl->ham_prob += log2 (bayes_ham_prob);
                cl->processed_tokens ++;
 
+               if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
+                       cl->text_tokens ++;
+               }
+
                if (tok->t1 && tok->t2) {
                        msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
                                        "spam_count: %L, ham_count: %L,"
@@ -247,14 +252,15 @@ bayes_classify (struct rspamd_classifier * ctx,
                final_prob = (s + 1.0 - h) / 2.;
                msg_debug_bayes (
                                "<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f,"
-                                               " %L tokens processed of %ud total tokens",
+                                               " %L tokens processed of %ud total tokens (%uL text tokens)",
                                task->message_id,
                                cl.ham_prob,
                                h,
                                cl.spam_prob,
                                s,
                                cl.processed_tokens,
-                               tokens->len);
+                               tokens->len,
+                               cl.text_tokens);
        }
        else {
                /*
@@ -282,6 +288,26 @@ bayes_classify (struct rspamd_classifier * ctx,
        *pprob = final_prob;
        rspamd_mempool_set_variable (task->task_pool, "bayes_prob", pprob, NULL);
 
+       if (cl.text_tokens <= (cl.processed_tokens - cl.text_tokens) / 2) {
+               msg_info_bayes ("ignore bayes probability %.2f since we have "
+                               "much more metatokens (%d) than text tokens (%d)",
+                               final_prob,
+                               cl.processed_tokens - cl.text_tokens, cl.text_tokens);
+
+               return TRUE;
+       }
+
+       if (ctx->cfg->min_tokens > 0 &&
+                       cl.text_tokens < ctx->cfg->min_tokens * 0.1) {
+               msg_info_bayes ("ignore bayes probability %.2f since we have "
+                               "too few text tokens: %d, at least %.0f is required",
+                               final_prob,
+                               cl.text_tokens,
+                               ctx->cfg->min_tokens * 0.1);
+
+               return TRUE;
+       }
+
        if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
                /* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */
                for (i = 0; i < ctx->statfiles_ids->len; i++) {