about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/classifiers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-05-05 18:41:13 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-05-05 18:41:13 +0100
commit: 726d1ab9f1d8f1d00347dada9c14072e639b444d (patch)
tree: 9539d4ef2f3e5bb9c0e13ef06e4f852bf22d0096 /src/libstat/classifiers
parent: 8d19881b31eaf63b1356fffbadb66215f8c7d0ad (diff)
download: rspamd-726d1ab9f1d8f1d00347dada9c14072e639b444d.tar.gz
rspamd-726d1ab9f1d8f1d00347dada9c14072e639b444d.zip
Add another normalization argument.
Diffstat (limited to 'src/libstat/classifiers')
-rw-r--r-- src/libstat/classifiers/bayes.c 10
1 file changed, 8 insertions, 2 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 78d112dd0..abb02bd90 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -94,7 +94,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
struct rspamd_token_result *res;
guint64 spam_count = 0, ham_count = 0, total_count = 0;
double spam_prob, spam_freq, ham_freq, bayes_spam_prob, bayes_ham_prob,
- ham_prob, fw, w;
+ ham_prob, fw, w, norm_sum, norm_sub;
for (i = rt->start_pos; i < rt->end_pos; i++) {
res = &g_array_index (node->results, struct rspamd_token_result, i);
@@ -118,8 +118,14 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
spam_prob = spam_freq / (spam_freq + ham_freq);
ham_prob = ham_freq / (spam_freq + ham_freq);
fw = feature_weight[node->window_idx % G_N_ELEMENTS (feature_weight)];
- w = (fw * total_count) / (4.0 * (1.0 + fw * total_count));
+ norm_sum = (spam_freq + ham_freq) * (spam_freq + ham_freq);
+ norm_sub = (spam_freq - ham_freq) * (spam_freq - ham_freq);
+ w = (norm_sub) / (norm_sum) *
+ (fw * total_count) / (4.0 * (1.0 + fw * total_count));
bayes_spam_prob = PROB_COMBINE (spam_prob, total_count, w, 0.5);
+ norm_sub = (ham_freq - spam_freq) * (ham_freq - spam_freq);
+ w = (norm_sub) / (norm_sum) *
+ (fw * total_count) / (4.0 * (1.0 + fw * total_count));
bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5);
rt->spam_prob += log (bayes_spam_prob);
rt->ham_prob += log (bayes_ham_prob);