From bfeff0ab44bf0062a6ad3083b3becde22f08d4cf Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 23 Nov 2015 10:04:00 +0000 Subject: [PATCH] Some more fixes to OSB algorithm --- src/libstat/classifiers/bayes.c | 15 ++++++++++----- src/libstat/tokenizers/osb.c | 5 ++++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 966d5b458..3d16c05eb 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -94,7 +94,11 @@ struct bayes_task_closure { struct rspamd_task *task; }; -static const double feature_weight[] = { 0, 3125, 256, 27, 4, 1 }; +/* + * Mathematically we use pow(complexity, complexity), where complexity is the + * window index + */ +static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 }; #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))) /* @@ -151,11 +155,12 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) rt->ham_prob += log (bayes_ham_prob); res->cl_runtime->processed_tokens ++; - msg_debug_bayes ("token: total_count: %L, spam_count: %L, ham_count: %L," - " spam_prob: %.3f, " - "ham_prob: %.3f, bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " + msg_debug_bayes ("token: weight: %f, total_count: %L, " + "spam_count: %L, ham_count: %L," + "spam_prob: %.3f, ham_prob: %.3f, " + "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " "current spam prob: %.3f, current ham prob: %.3f", - total_count, spam_count, ham_count, + fw, total_count, spam_count, ham_count, spam_prob, ham_prob, bayes_spam_prob, bayes_ham_prob, rt->spam_prob, rt->ham_prob); diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index cac8f7071..2d1b3bb3e 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -308,7 +308,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, memcpy (new->data, &cur, sizeof (cur)); } - new->window_idx = i; + new->window_idx = i + 1; if (g_tree_lookup (tree, new) == NULL) { g_tree_insert (tree, new, new); @@ -318,6 +318,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, } if (processed <= window_size) { + memmove (hashpipe, hashpipe + (window_size - processed + 1), processed); for (i = 1; i < processed; i++) { new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); new->datalen = sizeof (gint64); @@ -335,6 +336,8 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt, memcpy (new->data, &cur, sizeof (cur)); } + new->window_idx = i + 1; + if (g_tree_lookup (tree, new) == NULL) { g_tree_insert (tree, new, new); } -- 2.39.5