summary | refs | log | tree | commit | diff | stats
path: root/src/classifiers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-24 20:25:54 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-24 20:25:54 +0400
commit: a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch)
tree: 352c634bbbc74cf17644545ace66a8feedc841c3 /src/classifiers
parent: 63725086863e4f422340479f83dd7ef374613e76 (diff)
download: rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz
rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip
* Welcome 0.4.0
Incompatible changes: - Statistics are incompatible in utf8 mode Major changes: - Improved utf8 mode - Convert all characters to lowercase in statistics - Skip URLs in statistics - Improve speed of bayes classifier by using integer arithmetic - Fixed statfiles synchronization that was broken for a long time - Synchronization is now configurable Minor changes: - Bugfixes - Removed some of the legacy code - Types polishing
Diffstat (limited to 'src/classifiers')
-rw-r--r-- src/classifiers/bayes.c 31
1 files changed, 15 insertions, 16 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index b4f7826e5..dadd33e5e 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -43,11 +43,11 @@ bayes_error_quark (void)
}
struct bayes_statfile_data {
- double hits;
- double total_hits;
+ guint64 hits;
+ guint64 total_hits;
double local_probability;
double post_probability;
- double value;
+ guint value;
struct statfile *st;
stat_file_t *file;
};
@@ -67,25 +67,22 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data)
{
token_node_t *node = key;
struct bayes_callback_data *cd = data;
- double v, c;
+ gint v, c;
c = (cd->in_class) ? 1 : -1;
/* Consider that not found blocks have value 1 */
v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
- if (fabs (v) < ALPHA && c > 0) {
+ if (v == 0 && c > 0) {
statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
}
else {
- if (G_LIKELY (c > 0 && c < G_MAXDOUBLE)) {
- v += c;
+ if (G_LIKELY (c > 0)) {
+ v ++;
}
else if (c < 0){
- if (v > -c) {
- v -= c;
- }
- else {
- v = 0;
+ if (v != 0) {
+ v --;
}
}
statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v);
@@ -103,14 +100,15 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
token_node_t *node = key;
struct bayes_callback_data *cd = data;
- double local_hits = 0, renorm = 0;
- int i;
+ double renorm = 0;
+ gint i;
+ guint64 local_hits = 0;
struct bayes_statfile_data *cur;
for (i = 0; i < cd->statfiles_num; i ++) {
cur = &cd->statfiles[i];
cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now);
- if (cur->value > ALPHA) {
+ if (cur->value > 0) {
cur->total_hits += cur->value;
cur->hits = cur->value;
local_hits += cur->value;
@@ -121,7 +119,8 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
}
for (i = 0; i < cd->statfiles_num; i ++) {
cur = &cd->statfiles[i];
- cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (local_hits + 1.0));
+ cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) /
+ (LOCAL_PROB_DENOM * (1.0 + local_hits));
renorm += cur->post_probability * cur->local_probability;
}