diff options
author | cebka@lenovo-laptop <cebka@lenovo-laptop> | 2010-03-01 18:37:06 +0300 |
---|---|---|
committer | cebka@lenovo-laptop <cebka@lenovo-laptop> | 2010-03-01 18:37:06 +0300 |
commit | 74cf00015278784d04d26b44bcf326f9493f7d62 (patch) | |
tree | afe42b0a47504bf383d755e1b3254e1164944712 /src/classifiers | |
parent | 0fec0b5c2d20fb505f1b8345d48b5098fe5598f6 (diff) | |
download | rspamd-74cf00015278784d04d26b44bcf326f9493f7d62.tar.gz rspamd-74cf00015278784d04d26b44bcf326f9493f7d62.zip |
* Add weights command for getting the weight of a message for each statfile
* Add ability to specify multiplier when learning
* Add statistics about spam and ham messages
Diffstat (limited to 'src/classifiers')
-rw-r--r-- | src/classifiers/classifiers.c | 5 | ||||
-rw-r--r-- | src/classifiers/classifiers.h | 14 | ||||
-rw-r--r-- | src/classifiers/winnow.c | 67 |
3 files changed, 80 insertions, 6 deletions
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c index 566cf2b75..219576870 100644 --- a/src/classifiers/classifiers.c +++ b/src/classifiers/classifiers.c @@ -30,12 +30,13 @@ #include "classifiers.h" struct classifier classifiers[] = { - { + { .name = "winnow", .init_func = winnow_init, .classify_func = winnow_classify, .learn_func = winnow_learn, - }, + .weights_func = winnow_weights + } }; struct classifier * diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index 12787f049..de937bc3f 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -14,13 +14,20 @@ struct classifier_ctx { GHashTable *results; struct classifier_config *cfg; }; + +struct classify_weight { + const char *name; + double weight; +}; + /* Common classifier structure */ struct classifier { char *name; struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf); void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, - stat_file_t *file, GTree *input, gboolean in_class, double *sum); + stat_file_t *file, GTree *input, gboolean in_class, double *sum, double multiplier); + GList* (*weights_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); }; /* Get classifier structure by name or return NULL if this name is not found */ @@ -29,7 +36,10 @@ struct classifier* get_classifier (char *name); /* Winnow algorithm */ struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf); void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); -void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, stat_file_t *file, GTree *input, gboolean in_class, double *sum); +void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, stat_file_t 
*file, GTree *input, + gboolean in_class, double *sum, double multiplier); +GList *winnow_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); + /* Array of all defined classifiers */ extern struct classifier classifiers[]; diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index e103dd50d..af370bd38 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -42,6 +42,7 @@ struct winnow_callback_data { struct classifier_ctx *ctx; stat_file_t *file; double sum; + double multiplier; int count; int in_class; time_t now; @@ -77,8 +78,9 @@ learn_callback (gpointer key, gpointer value, gpointer data) token_node_t *node = key; struct winnow_callback_data *cd = data; double v, c; - + c = (cd->in_class) ? WINNOW_PROMOTION : WINNOW_DEMOTION; + c *= cd->multiplier; /* Consider that not found blocks have value 1 */ v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); @@ -195,13 +197,74 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp } } +GList * +winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * input, struct worker_task *task) +{ + struct winnow_callback_data data; + double res = 0.; + GList *cur, *resl = NULL; + struct statfile *st; + struct classify_weight *w; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + data.pool = pool; + data.sum = 0; + data.count = 0; + data.now = time (NULL); + data.ctx = ctx; + + cur = ctx->cfg->statfiles; + while (cur) { + st = cur->data; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((data.file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + msg_warn ("cannot open %s, skip it", st->path); + cur = g_list_next (cur); + continue; + } + } + + if (data.file != NULL) { + statfile_pool_lock_file (pool, data.file); + g_tree_foreach (input, classify_callback, &data); + statfile_pool_unlock_file (pool, data.file); + } + + w = 
memory_pool_alloc (task->task_pool, sizeof (struct classify_weight)); + if (data.count != 0) { + res = data.sum / data.count; + w->name = st->symbol; + w->weight = res; + resl = g_list_prepend (resl, w); + } + else { + res = 0; + w->name = st->symbol; + w->weight = res; + resl = g_list_prepend (resl, w); + } + cur = g_list_next (cur); + } + + if (resl != NULL) { + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, resl); + } + + return resl; + +} + + void -winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *file, GTree * input, int in_class, double *sum) +winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *file, GTree * input, int in_class, double *sum, double multiplier) { struct winnow_callback_data data = { .file = NULL, .sum = 0, .count = 0, + .multiplier = multiplier }; g_assert (pool != NULL); |