source.dussan.org Git - rspamd.git/commitdiff
Some more fixes to OSB algorithm
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Nov 2015 10:04:00 +0000 (10:04 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 21 Dec 2015 18:22:52 +0000 (18:22 +0000)
src/libstat/classifiers/bayes.c
src/libstat/tokenizers/osb.c

index 966d5b458fea6657c49188495918c716821c31b6..3d16c05ebfb7eefb389c54a4a147211463592946 100644 (file)
@@ -94,7 +94,11 @@ struct bayes_task_closure {
        struct rspamd_task *task;
 };
 
-static const double feature_weight[] = { 0, 3125, 256, 27, 4, 1 };
+/*
+ * Mathematically we use pow(complexity, complexity), where complexity is the
+ * window index
+ */
+static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 };
 
 #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
 /*
@@ -151,11 +155,12 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
                rt->ham_prob += log (bayes_ham_prob);
                res->cl_runtime->processed_tokens ++;
 
-               msg_debug_bayes ("token: total_count: %L, spam_count: %L, ham_count: %L,"
-                               " spam_prob: %.3f, "
-                               "ham_prob: %.3f, bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
+               msg_debug_bayes ("token: weight: %f, total_count: %L, "
+                               "spam_count: %L, ham_count: %L,"
+                               "spam_prob: %.3f, ham_prob: %.3f, "
+                               "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
                                "current spam prob: %.3f, current ham prob: %.3f",
-                               total_count, spam_count, ham_count,
+                               fw, total_count, spam_count, ham_count,
                                spam_prob, ham_prob,
                                bayes_spam_prob, bayes_ham_prob,
                                rt->spam_prob, rt->ham_prob);
index 20fc6ece843859106422a84daabd40037361bfb6..975933fef4ce4bbb19fdb03b50331256d0f5853c 100644 (file)
@@ -308,7 +308,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
                                        memcpy (new->data, &cur, sizeof (cur));
                                }
 
-                               new->window_idx = i;
+                               new->window_idx = i + 1;
 
                                if (g_tree_lookup (tree, new) == NULL) {
                                        g_tree_insert (tree, new, new);
@@ -318,6 +318,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
        }
 
        if (processed <= window_size) {
+               memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
                for (i = 1; i < processed; i++) {
                        new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
                        new->datalen = sizeof (gint64);
@@ -335,6 +336,8 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
                                memcpy (new->data, &cur, sizeof (cur));
                        }
 
+                       new->window_idx = i + 1;
+
                        if (g_tree_lookup (tree, new) == NULL) {
                                g_tree_insert (tree, new, new);
                        }