source.dussan.org Git - rspamd.git/commitdiff
[Feature] Add unigram support in bayes
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 13 Apr 2017 16:47:57 +0000 (17:47 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 13 Apr 2017 16:48:13 +0000 (17:48 +0100)
src/libstat/classifiers/bayes.c
src/libstat/stat_api.h
src/libstat/tokenizers/osb.c

index 2aa15eee9211d0558e16997c351536d17f0bebff..ea9269a127f4f700506f95c84a37cc5a8311d1a0 100644 (file)
@@ -161,7 +161,15 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
                spam_prob = spam_freq / (spam_freq + ham_freq);
                ham_prob = ham_freq / (spam_freq + ham_freq);
-               fw = feature_weight[tok->window_idx % G_N_ELEMENTS (feature_weight)];
+
+               if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
+                       fw = 1.0;
+               }
+               else {
+                       fw = feature_weight[tok->window_idx %
+                                       G_N_ELEMENTS (feature_weight)];
+               }
+
                norm_sum = (spam_freq + ham_freq) * (spam_freq + ham_freq);
                norm_sub = (spam_freq - ham_freq) * (spam_freq - ham_freq);
 
index 6c2604e8952b768bd8f20ef03a167013d63de26f..a4e02a591c767410a4963572ab645f3a785217be 100644 (file)
@@ -31,6 +31,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2)
 #define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3)
 #define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4)
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1 << 5)
 
 typedef struct rspamd_stat_token_s {
        const gchar *begin;
index f0e351bb50ed960aa34ff725acfa89662e830835..54668e758b92fb08367969818ee39e9fb298c233 100644 (file)
@@ -326,6 +326,18 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
                        }
                }
 
+               if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
+                       new_tok = rspamd_mempool_alloc0 (pool, token_size);
+                       new_tok->flags = token_flags;
+                       new_tok->t1 = token;
+                       new_tok->t2 = token;
+                       new_tok->data = cur;
+                       new_tok->window_idx = 0;
+                       g_ptr_array_add (result, new_tok);
+
+                       continue;
+               }
+
 #define ADD_TOKEN do {\
     new_tok = rspamd_mempool_alloc0 (pool, token_size); \
     new_tok->flags = token_flags; \