author     Vsevolod Stakhov <vsevolod@rambler-co.ru>  2010-05-27 17:33:31 +0400
committer  Vsevolod Stakhov <vsevolod@rambler-co.ru>  2010-05-27 17:33:31 +0400
commit     ac8249b6ee746f022b0753789e6e2b46ab842abc (patch)
tree       0aa5fd13aa5df6a9f72b46295abff33670d211da /src/classifiers
parent     41503710ed4ae1d52d98cb2f0686a19fc814f25c (diff)
download   rspamd-ac8249b6ee746f022b0753789e6e2b46ab842abc.tar.gz
           rspamd-ac8249b6ee746f022b0753789e6e2b46ab842abc.zip
* Implement a new learning system; rspamd should now be much more intelligent when learning messages
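
As far as the patch shows, the new learning system turns winnow_learn into an iterative procedure: instead of a single pass over the target statfile, it re-weights the message tokens against every configured statfile, compares the resulting average weights, and keeps promoting with a growing multiplier until the statfile being learned actually scores highest, bounded by MAX_LEARN_ITERATIONS. The standalone C sketch below models only that convergence loop under simplified assumptions: the statfile_pool API is replaced with plain weight arrays, the per-token demotion of common tokens is ignored, and the names train_until_dominant, average_weight and files are hypothetical, not part of rspamd.

#include <stdio.h>

#define WINNOW_PROMOTION     1.23
#define WINNOW_DEMOTION      0.83
#define MAX_LEARN_ITERATIONS 100

/*
 * Simplified model: each "statfile" is just an array of per-token weights.
 * Promote tokens in the target statfile, demote them in the others, then
 * check whether the target now has the highest average weight; if not,
 * repeat with a stronger multiplier, as the patched winnow_learn does.
 */
static double
average_weight (const double *w, int ntokens)
{
	double sum = 0.;
	int    i;

	for (i = 0; i < ntokens; i++) {
		sum += w[i];
	}
	return ntokens != 0 ? sum / ntokens : 0.;
}

static int
train_until_dominant (double files[][4], int nfiles, int ntokens, int target)
{
	double multiplier = 1.0;
	int    iterations = 0, best, f, t;

	do {
		best = 0;
		for (f = 0; f < nfiles; f++) {
			double c = (f == target ? WINNOW_PROMOTION : WINNOW_DEMOTION) * multiplier;

			for (t = 0; t < ntokens; t++) {
				files[f][t] *= c;
			}
			if (average_weight (files[f], ntokens) > average_weight (files[best], ntokens)) {
				best = f;
			}
		}
		/* Same escalation rule as the patch: square the multiplier once it
		 * exceeds 1, otherwise bump it by the promotion constant. */
		if (multiplier > 1) {
			multiplier *= multiplier;
		}
		else {
			multiplier *= WINNOW_PROMOTION;
		}
	} while (best != target && iterations++ < MAX_LEARN_ITERATIONS);

	return iterations;
}

int
main (void)
{
	/* Two hypothetical statfiles with four token weights each; the second
	 * one starts out stronger, so a single learning pass would not make
	 * the target dominate. */
	double files[2][4] = {
		{ 1.0, 1.1, 0.9, 1.0 },	/* target class */
		{ 2.0, 2.2, 1.8, 2.1 }	/* competing class */
	};

	printf ("target dominated after %d extra iteration(s)\n",
		train_until_dominant (files, 2, 4, 0));
	return 0;
}

Built with any C99 compiler, the example prints how many extra learning passes were needed before the target statfile dominated; in the real code the equivalent condition is the (in_class ? sel != file : sel == file) test at the end of winnow_learn.
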
Diffstat (limited to 'src/classifiers')
-rw-r--r--  src/classifiers/winnow.c  157
1 file changed, 125 insertions(+), 32 deletions(-)
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index a5e7b3cf8..637be759d 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -38,17 +38,29 @@
#define WINNOW_PROMOTION 1.23
#define WINNOW_DEMOTION 0.83
+#define MEDIAN_WINDOW_SIZE 5
+
+#define MAX_WEIGHT G_MAXDOUBLE / 2.
+
+#define ALPHA 0.001
+
+#define MAX_LEARN_ITERATIONS 100
+
struct winnow_callback_data {
statfile_pool_t *pool;
struct classifier_ctx *ctx;
stat_file_t *file;
+ stat_file_t *learn_file;
double sum;
double multiplier;
int count;
- int in_class;
+ gboolean in_class;
+ gboolean fresh_run;
time_t now;
};
+static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION;
+
static gboolean
classify_callback (gpointer key, gpointer value, gpointer data)
{
@@ -58,9 +70,9 @@ classify_callback (gpointer key, gpointer value, gpointer data)
/* Consider that not found blocks have value 1 */
v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
- if (fabs (v) > 0.00001) {
- if (cd->sum + v > G_MAXDOUBLE / 2.) {
- cd->sum = G_MAXDOUBLE / 2.;
+ if (fabs (v) > ALPHA) {
+ if (cd->sum + v > MAX_WEIGHT) {
+ cd->sum = MAX_WEIGHT;
}
else {
cd->sum += v;
@@ -78,31 +90,78 @@ learn_callback (gpointer key, gpointer value, gpointer data)
{
token_node_t *node = key;
struct winnow_callback_data *cd = data;
- double v, c;
+ double v, c;
c = (cd->in_class) ? WINNOW_PROMOTION : WINNOW_DEMOTION;
c *= cd->multiplier;
/* Consider that not found blocks have value 1 */
v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
- if (fabs (v) < 0.00001) {
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
- node->value = c;
+ if (fabs (v) < ALPHA) {
+ /* Block not found, insert new */
+ if (cd->file == cd->learn_file) {
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
+ node->value = c;
+ }
}
else {
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v * c);
- /* Set some limit on growing */
- if (v > G_MAXDOUBLE / 2.) {
- node->value = v;
- }
- else {
- node->value = v * c;
- }
+ /* Here we just increase the block's 'extra' counter */
+ if (cd->fresh_run) {
+ node->extra = 0;
+ }
+ else {
+ node->extra ++;
+ }
+ node->value = v;
+
+ if (node->extra > 1) {
+ /*
+ * Assume that this node is common for several statfiles, so
+ * decrease its weight proportionally
+ */
+ if (node->value > max_common_weight) {
+ /* Statistical fluctuation */
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, 0.);
+ node->value = 0.;
+ }
+ else if (node->value > WINNOW_PROMOTION) {
+ /* Try to decrease its value */
+ /* XXX: it would be more intelligent to add some adaptive filter here */
+ if (cd->file == cd->learn_file) {
+ if (node->value > max_common_weight / 2.) {
+ node->value *= c;
+ }
+ else {
+ /*
+ * The token value is too high and the token also exists in
+ * other statfiles; this may be a statistical error, so
+ * decrease it slightly
+ */
+ node->value *= WINNOW_DEMOTION * cd->multiplier;
+ }
+ }
+ else {
+ node->value = sqrt (node->value);
+ }
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, node->value);
+ }
+ }
+ else if (cd->file == cd->learn_file) {
+ /* New block or block that is in only one statfile */
+ /* Set some limit on growing */
+ if (v > MAX_WEIGHT) {
+ node->value = v;
+ }
+ else {
+ node->value *= c;
+ }
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, node->value);
+ }
}
- if (cd->sum + node->value > G_MAXDOUBLE / 2.) {
- cd->sum = G_MAXDOUBLE / 2.;
+ if (cd->sum + node->value > MAX_WEIGHT) {
+ cd->sum = MAX_WEIGHT;
}
else {
cd->sum += node->value;
@@ -223,8 +282,6 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
g_assert (ctx != NULL);
data.pool = pool;
- data.sum = 0;
- data.count = 0;
data.now = time (NULL);
data.ctx = ctx;
@@ -240,6 +297,8 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
cur = ctx->cfg->statfiles;
while (cur) {
st = cur->data;
+ data.sum = 0;
+ data.count = 0;
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
if ((data.file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s, skip it", st->path);
@@ -254,7 +313,7 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
statfile_pool_unlock_file (pool, data.file);
}
- w = memory_pool_alloc (task->task_pool, sizeof (struct classify_weight));
+ w = memory_pool_alloc0 (task->task_pool, sizeof (struct classify_weight));
if (data.count != 0) {
res = data.sum / data.count;
}
@@ -281,12 +340,14 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *fi
{
struct winnow_callback_data data = {
.file = NULL,
- .sum = 0,
- .count = 0,
.multiplier = multiplier
};
char *value;
- int nodes, minnodes;
+ int nodes, minnodes, iterations = 0;
+ struct statfile *st;
+ stat_file_t *sel;
+ double res = 0., max = 0.;
+ GList *cur;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -295,8 +356,7 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *fi
data.in_class = in_class;
data.now = time (NULL);
data.ctx = ctx;
-
- data.file = file;
+ data.learn_file = file;
if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
@@ -307,12 +367,45 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *fi
return;
}
}
-
- if (data.file != NULL) {
- statfile_pool_lock_file (pool, data.file);
- g_tree_foreach (input, learn_callback, &data);
- statfile_pool_unlock_file (pool, data.file);
- }
+
+ do {
+ cur = ctx->cfg->statfiles;
+ data.fresh_run = TRUE;
+ while (cur) {
+ st = cur->data;
+ data.sum = 0;
+ data.count = 0;
+ if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((data.file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s, skip it", st->path);
+ cur = g_list_next (cur);
+ continue;
+ }
+ }
+ statfile_pool_lock_file (pool, data.file);
+ g_tree_foreach (input, learn_callback, &data);
+ statfile_pool_unlock_file (pool, data.file);
+ if (data.count != 0) {
+ res = data.sum / data.count;
+ }
+ else {
+ res = 0;
+ }
+ if (res > max) {
+ max = res;
+ sel = data.file;
+ }
+ cur = g_list_next (cur);
+ data.fresh_run = FALSE;
+ }
+
+ if (data.multiplier > 1) {
+ data.multiplier *= data.multiplier;
+ }
+ else {
+ data.multiplier *= WINNOW_PROMOTION;
+ }
+ } while ((in_class ? sel != file : sel == file) && iterations ++ < MAX_LEARN_ITERATIONS);
if (sum) {
if (data.count != 0) {