diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-07-12 20:46:55 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-07-12 20:46:55 +0400 |
commit | ff4871310ff5b269dcd02ea300cf78092860e1d4 (patch) | |
tree | cfa435f5de1dc8efc646a0ca1fc6fd261b2c1aa6 /src/classifiers | |
parent | c4105fc43199d51af271bc24d3345aa57906d973 (diff) | |
download | rspamd-ff4871310ff5b269dcd02ea300cf78092860e1d4.tar.gz rspamd-ff4871310ff5b269dcd02ea300cf78092860e1d4.zip |
* First commit to implement multi-statfile filter system with new learning mechanism (untested yet)
Diffstat (limited to 'src/classifiers')
-rw-r--r-- | src/classifiers/bayes.c | 80 | ||||
-rw-r--r-- | src/classifiers/classifiers.c | 2 | ||||
-rw-r--r-- | src/classifiers/classifiers.h | 6 | ||||
-rw-r--r-- | src/classifiers/winnow.c | 33 |
4 files changed, 103 insertions, 18 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index 7363df522..44e9323a2 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -30,9 +30,8 @@ #include "../main.h" #include "../filter.h" #include "../cfg_file.h" -#ifdef WITH_LUA +#include "../binlog.h" #include "../lua/lua_common.h" -#endif #define LOCAL_PROB_DENOM 16.0 @@ -194,15 +193,22 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, } } - data.statfiles_num = g_list_length (ctx->cfg->statfiles); + cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE); + if (cur) { + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur); + } + else { + cur = ctx->cfg->statfiles; + } + + data.statfiles_num = g_list_length (cur); data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num); data.pool = pool; data.now = time (NULL); data.ctx = ctx; - cur = ctx->cfg->statfiles; while (cur) { - /* Select statfile to learn */ + /* Select statfile to classify */ st = cur->data; if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { @@ -344,6 +350,70 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb return TRUE; } +gboolean +bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, + GTree *input, struct worker_task *task, gboolean is_spam, GError **err) +{ + struct bayes_callback_data data; + gchar *value; + gint nodes, minnodes; + struct statfile *st; + stat_file_t *file; + GList *cur; + + g_assert (pool != NULL); + g_assert (ctx != NULL); + + if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { + minnodes = strtol (value, NULL, 10); + nodes = g_tree_nnodes (input); + if (nodes > FEATURE_WINDOW_SIZE) { + nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; + } + if (nodes < minnodes) { + return FALSE; + } + } + + cur = 
call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE); + if (cur) { + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur); + } + else { + cur = ctx->cfg->statfiles; + } + + data.pool = pool; + data.now = time (NULL); + data.ctx = ctx; + + while (cur) { + /* Select statfiles to learn */ + st = cur->data; + if (st->is_spam != is_spam) { + cur = g_list_next (cur); + continue; + } + if ((file = statfile_pool_is_open (pool, st->path)) == NULL) { + if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) { + msg_warn ("cannot open %s", st->path); + cur = g_list_next (cur); + continue; + } + } + data.file = file; + statfile_pool_lock_file (pool, data.file); + g_tree_foreach (input, bayes_learn_callback, &data); + statfile_inc_revision (file); + statfile_pool_unlock_file (pool, data.file); + maybe_write_binlog (ctx->cfg, st, file, input); + + cur = g_list_next (cur); + } + + return TRUE; +} + GList * bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task) { diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c index 6b0554e1b..5e2b9ea88 100644 --- a/src/classifiers/classifiers.c +++ b/src/classifiers/classifiers.c @@ -35,6 +35,7 @@ struct classifier classifiers[] = { .init_func = winnow_init, .classify_func = winnow_classify, .learn_func = winnow_learn, + .learn_spam_func = winnow_learn_spam, .weights_func = winnow_weights }, { @@ -42,6 +43,7 @@ struct classifier classifiers[] = { .init_func = bayes_init, .classify_func = bayes_classify, .learn_func = bayes_learn, + .learn_spam_func = bayes_learn_spam, .weights_func = bayes_weights } }; diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index 601db0205..78ceb196e 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -32,6 +32,8 @@ struct classifier { gboolean (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, 
GTree *input, gboolean in_class, double *sum, double multiplier, GError **err); + gboolean (*learn_spam_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, + GTree *input, struct worker_task *task, gboolean is_spam, GError **err); GList* (*weights_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); }; @@ -43,6 +45,8 @@ struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_confi gboolean winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); gboolean winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input, gboolean in_class, double *sum, double multiplier, GError **err); +gboolean winnow_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, + GTree *input, struct worker_task *task, gboolean is_spam, GError **err); GList *winnow_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); /* Bayes algorithm */ @@ -50,6 +54,8 @@ struct classifier_ctx* bayes_init (memory_pool_t *pool, struct classifier_config gboolean bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); gboolean bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input, gboolean in_class, double *sum, double multiplier, GError **err); +gboolean bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, + GTree *input, struct worker_task *task, gboolean is_spam, GError **err); GList *bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task); /* Array of all defined classifiers */ extern struct classifier classifiers[]; diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index 2e8b98423..b123ce3e5 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -223,19 +223,14 @@ winnow_classify (struct classifier_ctx 
*ctx, statfile_pool_t * pool, GTree * inp } } - if (ctx->cfg->pre_callbacks) { -#ifdef WITH_LUA - cur = call_classifier_pre_callbacks (ctx->cfg, task); - if (cur) { - memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur); - } -#else - cur = ctx->cfg->statfiles; -#endif - } - else { - cur = ctx->cfg->statfiles; - } + cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE); + if (cur) { + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur); + } + else { + cur = ctx->cfg->statfiles; + } + while (cur) { st = cur->data; data.sum = 0; @@ -597,3 +592,15 @@ end: } return TRUE; } + +gboolean +winnow_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, + GTree *input, struct worker_task *task, gboolean is_spam, GError **err) +{ + g_set_error (err, + winnow_error_quark(), /* error domain */ + 1, /* error code */ + "learn spam is not supported for winnow" + ); + return FALSE; +} |