about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-11-23 10:04:00 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-11-23 10:04:00 +0000
commit bfeff0ab44bf0062a6ad3083b3becde22f08d4cf (patch)
tree af92cb21f25c3a739dfca82d0140c74b194ff336
parent 62fee4b415f9c4e738f7fd5dea441dabe1244d2d (diff)
download rspamd-bfeff0ab44bf0062a6ad3083b3becde22f08d4cf.tar.gz
rspamd-bfeff0ab44bf0062a6ad3083b3becde22f08d4cf.zip
Some more fixes to OSB algorithm
-rw-r--r-- src/libstat/classifiers/bayes.c 15
-rw-r--r-- src/libstat/tokenizers/osb.c 5
2 files changed, 14 insertions, 6 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 966d5b458..3d16c05eb 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -94,7 +94,11 @@ struct bayes_task_closure {
struct rspamd_task *task;
};
-static const double feature_weight[] = { 0, 3125, 256, 27, 4, 1 };
+/*
+ * Mathematically we use pow(complexity, complexity), where complexity is the
+ * window index
+ */
+static const double feature_weight[] = { 0, 1, 4, 27, 256, 3125, 46656, 823543 };
#define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
/*
@@ -151,11 +155,12 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
rt->ham_prob += log (bayes_ham_prob);
res->cl_runtime->processed_tokens ++;
- msg_debug_bayes ("token: total_count: %L, spam_count: %L, ham_count: %L,"
- " spam_prob: %.3f, "
- "ham_prob: %.3f, bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
+ msg_debug_bayes ("token: weight: %f, total_count: %L, "
+ "spam_count: %L, ham_count: %L,"
+ "spam_prob: %.3f, ham_prob: %.3f, "
+ "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
"current spam prob: %.3f, current ham prob: %.3f",
- total_count, spam_count, ham_count,
+ fw, total_count, spam_count, ham_count,
spam_prob, ham_prob,
bayes_spam_prob, bayes_ham_prob,
rt->spam_prob, rt->ham_prob);
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index cac8f7071..2d1b3bb3e 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -308,7 +308,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
memcpy (new->data, &cur, sizeof (cur));
}
- new->window_idx = i;
+ new->window_idx = i + 1;
if (g_tree_lookup (tree, new) == NULL) {
g_tree_insert (tree, new, new);
@@ -318,6 +318,7 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
}
if (processed <= window_size) {
+ memmove (hashpipe, hashpipe + (window_size - processed + 1), processed);
for (i = 1; i < processed; i++) {
new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
new->datalen = sizeof (gint64);
@@ -335,6 +336,8 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_runtime *rt,
memcpy (new->data, &cur, sizeof (cur));
}
+ new->window_idx = i + 1;
+
if (g_tree_lookup (tree, new) == NULL) {
g_tree_insert (tree, new, new);
}