diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-09-14 19:11:19 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-09-14 19:11:19 +0400 |
commit | a0f41f7c5712e73e8aa521f2064bc53be3315d0a (patch) | |
tree | 147e4d8956a5a3b85e0ecc15b9fcbe29742e4e5c /src/classifiers | |
parent | a90c7d7a12561845e3371efc6803b1ecf6ad7d89 (diff) | |
download | rspamd-a0f41f7c5712e73e8aa521f2064bc53be3315d0a.tar.gz rspamd-a0f41f7c5712e73e8aa521f2064bc53be3315d0a.zip |
* New system of classifiers interface and statfiles processing
* Fix sample config
* Fix compile warnings
* Fix building without lua support
* Fix bugs with nrcpt header parsing and symbols cache loading (by Anton Nekhoroshikh)
Diffstat (limited to 'src/classifiers')
-rw-r--r-- | src/classifiers/classifiers.c | 1 | ||||
-rw-r--r-- | src/classifiers/classifiers.h | 19 | ||||
-rw-r--r-- | src/classifiers/winnow.c | 113 |
3 files changed, 66 insertions, 67 deletions
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c index 283350972..482d111b0 100644 --- a/src/classifiers/classifiers.c +++ b/src/classifiers/classifiers.c @@ -35,7 +35,6 @@ struct classifier classifiers[] = { .init_func = winnow_init, .classify_func = winnow_classify, .learn_func = winnow_learn, - .result_file_func = winnow_result_file }, }; diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index 13a295724..fcb251da1 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -6,29 +6,30 @@ #include "../statfile.h" #include "../tokenizers/tokenizers.h" +struct classifier_config; +struct worker_task; + struct classifier_ctx { memory_pool_t *pool; GHashTable *results; + struct classifier_config *cfg; }; /* Common classifier structure */ struct classifier { char *name; - struct classifier_ctx* (*init_func)(memory_pool_t *pool); - void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, - char *statfile, GTree *input, double scale); + struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf); + void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, - char *statfile, GTree *input, int in_class); - char* (*result_file_func)(struct classifier_ctx *ctx, double *probability); + char *symbol, GTree *input, gboolean in_class); }; /* Get classifier structure by name or return NULL if this name is not found */ struct classifier* get_classifier (char *name); /* Winnow algorithm */ -struct classifier_ctx* winnow_init (memory_pool_t *pool); -void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale); -void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class); -char* winnow_result_file (struct classifier_ctx* ctx, double *probability); +struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf); +void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); +void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class); /* Array of all defined classifiers */ extern struct classifier classifiers[]; diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index edd929af0..88298faf4 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -27,6 +27,9 @@ */ #include "classifiers.h" +#include "../main.h" +#include "../filter.h" +#include "../cfg_file.h" #define WINNOW_PROMOTION 1.23 #define WINNOW_DEMOTION 0.83 @@ -85,21 +88,23 @@ learn_callback (gpointer key, gpointer value, gpointer data) } struct classifier_ctx* -winnow_init (memory_pool_t *pool) +winnow_init (memory_pool_t *pool, struct classifier_config *cfg) { struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx)); ctx->pool = pool; - ctx->results = g_hash_table_new (g_str_hash, g_str_equal); - memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results); + ctx->cfg = cfg; return ctx; } void -winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale) +winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task) { struct winnow_callback_data data; double *res = memory_pool_alloc (ctx->pool, sizeof (double)); + double max = 0; + GList *cur; + struct statfile *st, *sel = NULL; g_assert (pool != NULL); g_assert (ctx != NULL); @@ -109,29 +114,44 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfi data.count = 0; data.now = time (NULL); data.ctx = ctx; - - if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) { - if ((data.file = statfile_pool_open (pool, statfile)) == NULL) { - return; + + cur = ctx->cfg->statfiles; + while (cur) { + st = cur->data; + if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { + msg_warn ("winnow_classify: cannot open %s, skip it", st->path); + cur = g_list_next (cur); + continue; + } } - } - g_tree_foreach (input, classify_callback, &data); + g_tree_foreach (input, classify_callback, &data); - if (data.count != 0) { - *res = scale * (data.sum / data.count); + if (data.count != 0) { + *res = (data.sum / data.count); + } + else { + *res = 0; + } + if (*res > max) { + max = *res; + sel = st; + } + cur = g_list_next (cur); } - else { - *res = 0; + + if (sel != NULL) { + insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL); } - - g_hash_table_insert (ctx->results, statfile, res); } void -winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class) +winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class) { struct winnow_callback_data data; + GList *cur; + struct statfile *st; g_assert (pool != NULL); g_assert (ctx != NULL); @@ -142,50 +162,29 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, data.in_class = in_class; data.now = time (NULL); data.ctx = ctx; - - if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) { - if ((data.file = statfile_pool_open (pool, statfile)) == NULL) { - return; + + cur = g_list_first (ctx->cfg->statfiles); + while (cur) { + st = cur->data; + if (strcmp (symbol, st->symbol) == 0) { + if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { + /* Try to create statfile */ + if (statfile_pool_create (pool, + st->path, st->size / sizeof (struct stat_file_block)) == -1) { + msg_err ("winnow_learn: cannot create statfile %s", st->path); + return; + } + if ((data.file = statfile_pool_open (pool, st->path)) == NULL) { + msg_err ("winnow_learn: cannot create statfile %s", st->path); + return; + } + } + break; } + cur = g_list_next (cur); } statfile_pool_lock_file (pool, data.file); g_tree_foreach (input, learn_callback, &data); statfile_pool_unlock_file (pool, data.file); } - -struct winnow_result_data { - char *filename; - double max_score; - double sum; -}; - -static void -result_file_callback (gpointer key, gpointer value, gpointer data) -{ - struct winnow_result_data *d = (struct winnow_result_data *)data; - double w = *((double *)value); - - if (fabs (w) > fabs (d->max_score)) { - d->filename = (char *)key; - d->max_score = w; - } - d->sum += fabs (w); -} - -char* -winnow_result_file (struct classifier_ctx* ctx, double *probability) -{ - struct winnow_result_data data = { NULL, 0, 0 }; - g_assert (ctx != NULL); - - g_hash_table_foreach (ctx->results, result_file_callback, &data); - if (data.sum != 0) { - *probability = data.max_score / data.sum; - } - else { - *probability = 1; - } - - return data.filename; -} |