summaryrefslogtreecommitdiffstats
path: root/src/classifiers
diff options
context:
space:
mode:
authorcebka@lenovo-laptop <cebka@lenovo-laptop>2010-03-01 18:37:06 +0300
committercebka@lenovo-laptop <cebka@lenovo-laptop>2010-03-01 18:37:06 +0300
commit74cf00015278784d04d26b44bcf326f9493f7d62 (patch)
treeafe42b0a47504bf383d755e1b3254e1164944712 /src/classifiers
parent0fec0b5c2d20fb505f1b8345d48b5098fe5598f6 (diff)
downloadrspamd-74cf00015278784d04d26b44bcf326f9493f7d62.tar.gz
rspamd-74cf00015278784d04d26b44bcf326f9493f7d62.zip
* Add weights command for getting weights of each message by each statfile
* Add ability to specify multiplier when learning
* Add statistics about spam and ham messages
Diffstat (limited to 'src/classifiers')
-rw-r--r--src/classifiers/classifiers.c5
-rw-r--r--src/classifiers/classifiers.h14
-rw-r--r--src/classifiers/winnow.c67
3 files changed, 80 insertions, 6 deletions
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
index 566cf2b75..219576870 100644
--- a/src/classifiers/classifiers.c
+++ b/src/classifiers/classifiers.c
@@ -30,12 +30,13 @@
#include "classifiers.h"
struct classifier classifiers[] = {
- {
+ {
.name = "winnow",
.init_func = winnow_init,
.classify_func = winnow_classify,
.learn_func = winnow_learn,
- },
+ .weights_func = winnow_weights
+ }
};
struct classifier *
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
index 12787f049..de937bc3f 100644
--- a/src/classifiers/classifiers.h
+++ b/src/classifiers/classifiers.h
@@ -14,13 +14,20 @@ struct classifier_ctx {
GHashTable *results;
struct classifier_config *cfg;
};
+
+struct classify_weight {	/* one entry of the list built by winnow_weights */
+ const char *name;	/* winnow_weights stores the statfile's symbol here */
+ double weight;	/* winnow_weights stores sum / count here, or 0 when count is 0 */
+};
+
/* Common classifier structure */
struct classifier {
char *name;
struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf);
void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- stat_file_t *file, GTree *input, gboolean in_class, double *sum);
+ stat_file_t *file, GTree *input, gboolean in_class, double *sum, double multiplier);
+ GList* (*weights_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
};
/* Get classifier structure by name or return NULL if this name is not found */
@@ -29,7 +36,10 @@ struct classifier* get_classifier (char *name);
/* Winnow algorithm */
struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf);
void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
-void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, stat_file_t *file, GTree *input, gboolean in_class, double *sum);
+void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, stat_file_t *file, GTree *input,
+ gboolean in_class, double *sum, double multiplier);
+GList *winnow_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
+
/* Array of all defined classifiers */
extern struct classifier classifiers[];
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index e103dd50d..af370bd38 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -42,6 +42,7 @@ struct winnow_callback_data {
struct classifier_ctx *ctx;
stat_file_t *file;
double sum;
+ double multiplier;
int count;
int in_class;
time_t now;
@@ -77,8 +78,9 @@ learn_callback (gpointer key, gpointer value, gpointer data)
token_node_t *node = key;
struct winnow_callback_data *cd = data;
double v, c;
-
+
c = (cd->in_class) ? WINNOW_PROMOTION : WINNOW_DEMOTION;
+ c *= cd->multiplier;
/* Consider that not found blocks have value 1 */
v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
@@ -195,13 +197,74 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp
}
}
+/*
+ * Compute the averaged winnow weight of the tokens in `input' against every
+ * statfile configured for this classifier.
+ *
+ * Returns a list of struct classify_weight, one per statfile that could be
+ * opened (name is the statfile symbol). Entries are prepended, so the list is
+ * in reverse configuration order. Entries are allocated from task->task_pool
+ * and g_list_free is registered on that pool for the list itself. NULL is
+ * returned when no entry was produced.
+ */
+GList *
+winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * input, struct worker_task *task)
+{
+	struct winnow_callback_data data = {	/* unnamed fields are zero-initialised */
+		.pool = pool,
+		.ctx = ctx
+	};
+	GList *cur, *resl = NULL;
+	struct statfile *st;
+	struct classify_weight *w;
+
+	g_assert (pool != NULL);
+	g_assert (ctx != NULL);
+
+	data.now = time (NULL);
+
+	for (cur = ctx->cfg->statfiles; cur != NULL; cur = g_list_next (cur)) {
+		st = cur->data;
+		/* Each statfile gets its own average: restart the accumulators */
+		data.sum = 0;
+		data.count = 0;
+
+		data.file = statfile_pool_is_open (pool, st->path);
+		if (data.file == NULL) {
+			data.file = statfile_pool_open (pool, st->path, st->size, FALSE);
+		}
+		if (data.file == NULL) {
+			msg_warn ("cannot open %s, skip it", st->path);
+			continue;
+		}
+
+		statfile_pool_lock_file (pool, data.file);
+		g_tree_foreach (input, classify_callback, &data);
+		statfile_pool_unlock_file (pool, data.file);
+
+		w = memory_pool_alloc (task->task_pool, sizeof (struct classify_weight));
+		w->name = st->symbol;
+		/* No tokens visited: report 0 rather than dividing by zero */
+		w->weight = (data.count != 0) ? data.sum / data.count : 0;
+		resl = g_list_prepend (resl, w);
+	}
+
+	if (resl != NULL) {
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, resl);
+	}
+
+	return resl;
+}
+
+
void
-winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *file, GTree * input, int in_class, double *sum)
+winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *file, GTree * input, int in_class, double *sum, double multiplier)
{
struct winnow_callback_data data = {
.file = NULL,
.sum = 0,
.count = 0,
+ .multiplier = multiplier
};
g_assert (pool != NULL);