Some more fixes to OSB algorithm

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 23 Nov 2015 10:04:00 +0000 (10:04 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 21 Dec 2015 18:22:52 +0000 (18:22 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Nov 2015 10:04:00 +0000 (10:04 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 21 Dec 2015 18:22:52 +0000 (18:22 +0000)
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c

index 966d5b458fea6657c49188495918c716821c31b6..3d16c05ebfb7eefb389c54a4a147211463592946 100644 (file)
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -94,7 +94,11 @@ struct bayes_task_closure {
         struct rspamd_task *task;
  };
  
-static const double feature_weight[] = { 0, 3125, 256, 27, 4, 1 };
+/*
+ * Mathematically we use pow(complexity, complexity), where complexity is the
+ * window index (1..7); entry 0 is 0 rather than pow(0, 0) == 1
+ */
+static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 };
  
  #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
  /*
@@ -151,11 +155,12 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
                 rt->ham_prob += log (bayes_ham_prob);
                 res->cl_runtime->processed_tokens ++;
  
-               msg_debug_bayes ("token: total_count: %L, spam_count: %L, ham_count: %L,"
-                               " spam_prob: %.3f, "
-                               "ham_prob: %.3f, bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
+               msg_debug_bayes ("token: weight: %f, total_count: %L, "
+                               "spam_count: %L, ham_count: %L,"
+                               "spam_prob: %.3f, ham_prob: %.3f, "
+                               "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
                                 "current spam prob: %.3f, current ham prob: %.3f",
-                               total_count, spam_count, ham_count,
+                               fw, total_count, spam_count, ham_count,
                                 spam_prob, ham_prob,
                                 bayes_spam_prob, bayes_ham_prob,
                                 rt->spam_prob, rt->ham_prob);
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c

index 20fc6ece843859106422a84daabd40037361bfb6..975933fef4ce4bbb19fdb03b50331256d0f5853c 100644 (file)
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -308,7 +308,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
                                         memcpy (new->data, &cur, sizeof (cur));
                                 }
  
-                               new->window_idx = i;
+                               new->window_idx = i + 1;
  
                                 if (g_tree_lookup (tree, new) == NULL) {
                                         g_tree_insert (tree, new, new);
@@ -318,6 +318,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
         }
  
         if (processed <= window_size) {
+               memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
                 for (i = 1; i < processed; i++) {
                         new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
                         new->datalen = sizeof (gint64);
@@ -335,6 +336,8 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
                                 memcpy (new->data, &cur, sizeof (cur));
                         }
  
+                       new->window_idx = i + 1;
+
                         if (g_tree_lookup (tree, new) == NULL) {
                                 g_tree_insert (tree, new, new);
                         }
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 23 Nov 2015 10:04:00 +0000 (10:04 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 21 Dec 2015 18:22:52 +0000 (18:22 +0000)
src/libstat/classifiers/bayes.c		patch | blob | history
src/libstat/tokenizers/osb.c		patch | blob | history