diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-11-23 10:04:00 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-11-23 10:04:00 +0000 |
commit | bfeff0ab44bf0062a6ad3083b3becde22f08d4cf (patch) | |
tree | af92cb21f25c3a739dfca82d0140c74b194ff336 | |
parent | 62fee4b415f9c4e738f7fd5dea441dabe1244d2d (diff) | |
download | rspamd-bfeff0ab44bf0062a6ad3083b3becde22f08d4cf.tar.gz rspamd-bfeff0ab44bf0062a6ad3083b3becde22f08d4cf.zip |
Some more fixes to OSB algorithm
-rw-r--r-- | src/libstat/classifiers/bayes.c | 15 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 5 |
2 files changed, 14 insertions, 6 deletions
```diff
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 966d5b458..3d16c05eb 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -94,7 +94,11 @@ struct bayes_task_closure {
 	struct rspamd_task *task;
 };
 
-static const double feature_weight[] = { 0, 3125, 256, 27, 4, 1 };
+/*
+ * Mathematically we use pow(complexity, complexity), where complexity is the
+ * window index
+ */
+static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 };
 
 #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
 /*
@@ -151,11 +155,12 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
 		rt->ham_prob += log (bayes_ham_prob);
 		res->cl_runtime->processed_tokens ++;
 
-		msg_debug_bayes ("token: total_count: %L, spam_count: %L, ham_count: %L,"
-				" spam_prob: %.3f, "
-				"ham_prob: %.3f, bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
+		msg_debug_bayes ("token: weight: %f, total_count: %L, "
+				"spam_count: %L, ham_count: %L,"
+				"spam_prob: %.3f, ham_prob: %.3f, "
+				"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
 				"current spam prob: %.3f, current ham prob: %.3f",
-				total_count, spam_count, ham_count,
+				fw, total_count, spam_count, ham_count,
 				spam_prob, ham_prob,
 				bayes_spam_prob, bayes_ham_prob,
 				rt->spam_prob, rt->ham_prob);
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index cac8f7071..2d1b3bb3e 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -308,7 +308,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 				memcpy (new->data, &cur, sizeof (cur));
 			}
 
-			new->window_idx = i;
+			new->window_idx = i + 1;
 
 			if (g_tree_lookup (tree, new) == NULL) {
 				g_tree_insert (tree, new, new);
@@ -318,6 +318,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 	}
 
 	if (processed <= window_size) {
+		memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
 		for (i = 1; i < processed; i++) {
 			new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
 			new->datalen = sizeof (gint64);
@@ -335,6 +336,8 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
 				memcpy (new->data, &cur, sizeof (cur));
 			}
 
+			new->window_idx = i + 1;
+
 			if (g_tree_lookup (tree, new) == NULL) {
 				g_tree_insert (tree, new, new);
 			}
```