From 23d9a026aa817ebce23cd28cb50be559e6f85896 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 13 Apr 2017 17:47:57 +0100 Subject: [PATCH] [Feature] Add unigrams support in bayes --- src/libstat/classifiers/bayes.c | 10 +++++++++- src/libstat/stat_api.h | 1 + src/libstat/tokenizers/osb.c | 12 ++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 2aa15eee9..ea9269a12 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -161,7 +161,15 @@ bayes_classify_token (struct rspamd_classifier *ctx, ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns)); spam_prob = spam_freq / (spam_freq + ham_freq); ham_prob = ham_freq / (spam_freq + ham_freq); - fw = feature_weight[tok->window_idx % G_N_ELEMENTS (feature_weight)]; + + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) { + fw = 1.0; + } + else { + fw = feature_weight[tok->window_idx % + G_N_ELEMENTS (feature_weight)]; + } + norm_sum = (spam_freq + ham_freq) * (spam_freq + ham_freq); norm_sub = (spam_freq - ham_freq) * (spam_freq - ham_freq); diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 6c2604e89..a4e02a591 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -31,6 +31,7 @@ #define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2) #define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3) #define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4) +#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1 << 5) typedef struct rspamd_stat_token_s { const gchar *begin; diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index f0e351bb5..54668e758 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -326,6 +326,18 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, } } + if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) { + new_tok = rspamd_mempool_alloc0 (pool, token_size); + new_tok->flags = token_flags; + new_tok->t1 = token; + new_tok->t2 = 
token; + new_tok->data = cur; + new_tok->window_idx = 0; + g_ptr_array_add (result, new_tok); + + continue; + } + #define ADD_TOKEN do {\ new_tok = rspamd_mempool_alloc0 (pool, token_size); \ new_tok->flags = token_flags; \ -- 2.39.5