aboutsummaryrefslogtreecommitdiffstats
path: root/src/classifiers/bayes.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-12 20:46:55 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-12 20:46:55 +0400
commitff4871310ff5b269dcd02ea300cf78092860e1d4 (patch)
treecfa435f5de1dc8efc646a0ca1fc6fd261b2c1aa6 /src/classifiers/bayes.c
parentc4105fc43199d51af271bc24d3345aa57906d973 (diff)
downloadrspamd-ff4871310ff5b269dcd02ea300cf78092860e1d4.tar.gz
rspamd-ff4871310ff5b269dcd02ea300cf78092860e1d4.zip
* First commit to implement multi-statfile filter system with new learning mechanizm (untested yet)
Diffstat (limited to 'src/classifiers/bayes.c')
-rw-r--r--src/classifiers/bayes.c80
1 files changed, 75 insertions, 5 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index 7363df522..44e9323a2 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -30,9 +30,8 @@
#include "../main.h"
#include "../filter.h"
#include "../cfg_file.h"
-#ifdef WITH_LUA
+#include "../binlog.h"
#include "../lua/lua_common.h"
-#endif
#define LOCAL_PROB_DENOM 16.0
@@ -194,15 +193,22 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
}
}
- data.statfiles_num = g_list_length (ctx->cfg->statfiles);
+ cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE);
+ if (cur) {
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur);
+ }
+ else {
+ cur = ctx->cfg->statfiles;
+ }
+
+ data.statfiles_num = g_list_length (cur);
data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
data.pool = pool;
data.now = time (NULL);
data.ctx = ctx;
- cur = ctx->cfg->statfiles;
while (cur) {
- /* Select statfile to learn */
+ /* Select statfile to classify */
st = cur->data;
if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
@@ -344,6 +350,70 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
return TRUE;
}
+gboolean
+bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
+ GTree *input, struct worker_task *task, gboolean is_spam, GError **err)
+{
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes, minnodes;
+ struct statfile *st;
+ stat_file_t *file;
+ GList *cur;
+
+ g_assert (pool != NULL);
+ g_assert (ctx != NULL);
+
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ minnodes = strtol (value, NULL, 10);
+ nodes = g_tree_nnodes (input);
+ if (nodes > FEATURE_WINDOW_SIZE) {
+ nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
+ }
+ if (nodes < minnodes) {
+ return FALSE;
+ }
+ }
+
+ cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE);
+ if (cur) {
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur);
+ }
+ else {
+ cur = ctx->cfg->statfiles;
+ }
+
+ data.pool = pool;
+ data.now = time (NULL);
+ data.ctx = ctx;
+
+ while (cur) {
+ /* Select statfiles to learn */
+ st = cur->data;
+ if (st->is_spam != is_spam) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s", st->path);
+ cur = g_list_next (cur);
+ continue;
+ }
+ }
+ data.file = file;
+ statfile_pool_lock_file (pool, data.file);
+ g_tree_foreach (input, bayes_learn_callback, &data);
+ statfile_inc_revision (file);
+ statfile_pool_unlock_file (pool, data.file);
+ maybe_write_binlog (ctx->cfg, st, file, input);
+
+ cur = g_list_next (cur);
+ }
+
+ return TRUE;
+}
+
GList *
bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
{