aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-13 17:47:57 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-13 17:48:13 +0100
commit 23d9a026aa817ebce23cd28cb50be559e6f85896 (patch)
tree 571544e530935276fa938a1ee49dad599b66e9dc /src/libstat
parent dbff4c2f3c72e8946f79bbcbc2e1e2262b3ce458 (diff)
download rspamd-23d9a026aa817ebce23cd28cb50be559e6f85896.tar.gz
rspamd-23d9a026aa817ebce23cd28cb50be559e6f85896.zip
[Feature] Add unigrams support in bayes
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/classifiers/bayes.c 10
-rw-r--r-- src/libstat/stat_api.h 1
-rw-r--r-- src/libstat/tokenizers/osb.c 12
3 files changed, 22 insertions, 1 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 2aa15eee9..ea9269a12 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -161,7 +161,15 @@ bayes_classify_token (struct rspamd_classifier *ctx,
ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
spam_prob = spam_freq / (spam_freq + ham_freq);
ham_prob = ham_freq / (spam_freq + ham_freq);
- fw = feature_weight[tok->window_idx % G_N_ELEMENTS (feature_weight)];
+
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
+ fw = 1.0;
+ }
+ else {
+ fw = feature_weight[tok->window_idx %
+ G_N_ELEMENTS (feature_weight)];
+ }
+
norm_sum = (spam_freq + ham_freq) * (spam_freq + ham_freq);
norm_sub = (spam_freq - ham_freq) * (spam_freq - ham_freq);
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 6c2604e89..a4e02a591 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -31,6 +31,7 @@
#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2)
#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3)
#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4)
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1 << 5)
typedef struct rspamd_stat_token_s {
const gchar *begin;
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index f0e351bb5..54668e758 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -326,6 +326,18 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
}
}
+ if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
+ new_tok = rspamd_mempool_alloc0 (pool, token_size);
+ new_tok->flags = token_flags;
+ new_tok->t1 = token;
+ new_tok->t2 = token;
+ new_tok->data = cur;
+ new_tok->window_idx = 0;
+ g_ptr_array_add (result, new_tok);
+
+ continue;
+ }
+
#define ADD_TOKEN do {\
new_tok = rspamd_mempool_alloc0 (pool, token_size); \
new_tok->flags = token_flags; \