aboutsummaryrefslogtreecommitdiffstats
path: root/src/classifiers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-09-14 19:11:19 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-09-14 19:11:19 +0400
commit: a0f41f7c5712e73e8aa521f2064bc53be3315d0a (patch)
tree: 147e4d8956a5a3b85e0ecc15b9fcbe29742e4e5c /src/classifiers
parent: a90c7d7a12561845e3371efc6803b1ecf6ad7d89 (diff)
download: rspamd-a0f41f7c5712e73e8aa521f2064bc53be3315d0a.tar.gz
download: rspamd-a0f41f7c5712e73e8aa521f2064bc53be3315d0a.zip
* New system for the classifier interface and statfile processing
* Fix sample config
* Fix compile warnings
* Fix building without lua support
* Fix bugs with nrcpt header parsing and symbols cache loading (by Anton Nekhoroshikh)
Diffstat (limited to 'src/classifiers')
-rw-r--r-- src/classifiers/classifiers.c 1
-rw-r--r-- src/classifiers/classifiers.h 19
-rw-r--r-- src/classifiers/winnow.c 113
3 files changed, 66 insertions, 67 deletions
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
index 283350972..482d111b0 100644
--- a/src/classifiers/classifiers.c
+++ b/src/classifiers/classifiers.c
@@ -35,7 +35,6 @@ struct classifier classifiers[] = {
.init_func = winnow_init,
.classify_func = winnow_classify,
.learn_func = winnow_learn,
- .result_file_func = winnow_result_file
},
};
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
index 13a295724..fcb251da1 100644
--- a/src/classifiers/classifiers.h
+++ b/src/classifiers/classifiers.h
@@ -6,29 +6,30 @@
#include "../statfile.h"
#include "../tokenizers/tokenizers.h"
+struct classifier_config;
+struct worker_task;
+
struct classifier_ctx {
memory_pool_t *pool;
GHashTable *results;
+ struct classifier_config *cfg;
};
/* Common classifier structure */
struct classifier {
char *name;
- struct classifier_ctx* (*init_func)(memory_pool_t *pool);
- void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- char *statfile, GTree *input, double scale);
+ struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf);
+ void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- char *statfile, GTree *input, int in_class);
- char* (*result_file_func)(struct classifier_ctx *ctx, double *probability);
+ char *symbol, GTree *input, gboolean in_class);
};
/* Get classifier structure by name or return NULL if this name is not found */
struct classifier* get_classifier (char *name);
/* Winnow algorithm */
-struct classifier_ctx* winnow_init (memory_pool_t *pool);
-void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale);
-void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
-char* winnow_result_file (struct classifier_ctx* ctx, double *probability);
+struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf);
+void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
+void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class);
/* Array of all defined classifiers */
extern struct classifier classifiers[];
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index edd929af0..88298faf4 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -27,6 +27,9 @@
*/
#include "classifiers.h"
+#include "../main.h"
+#include "../filter.h"
+#include "../cfg_file.h"
#define WINNOW_PROMOTION 1.23
#define WINNOW_DEMOTION 0.83
@@ -85,21 +88,23 @@ learn_callback (gpointer key, gpointer value, gpointer data)
}
struct classifier_ctx*
-winnow_init (memory_pool_t *pool)
+winnow_init (memory_pool_t *pool, struct classifier_config *cfg)
{
struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx));
ctx->pool = pool;
- ctx->results = g_hash_table_new (g_str_hash, g_str_equal);
- memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results);
+ ctx->cfg = cfg;
return ctx;
}
void
-winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale)
+winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
{
struct winnow_callback_data data;
double *res = memory_pool_alloc (ctx->pool, sizeof (double));
+ double max = 0;
+ GList *cur;
+ struct statfile *st, *sel = NULL;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -109,29 +114,44 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfi
data.count = 0;
data.now = time (NULL);
data.ctx = ctx;
-
- if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
- if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
- return;
+
+ cur = ctx->cfg->statfiles;
+ while (cur) {
+ st = cur->data;
+ if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ msg_warn ("winnow_classify: cannot open %s, skip it", st->path);
+ cur = g_list_next (cur);
+ continue;
+ }
}
- }
- g_tree_foreach (input, classify_callback, &data);
+ g_tree_foreach (input, classify_callback, &data);
- if (data.count != 0) {
- *res = scale * (data.sum / data.count);
+ if (data.count != 0) {
+ *res = (data.sum / data.count);
+ }
+ else {
+ *res = 0;
+ }
+ if (*res > max) {
+ max = *res;
+ sel = st;
+ }
+ cur = g_list_next (cur);
}
- else {
- *res = 0;
+
+ if (sel != NULL) {
+ insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL);
}
-
- g_hash_table_insert (ctx->results, statfile, res);
}
void
-winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
{
struct winnow_callback_data data;
+ GList *cur;
+ struct statfile *st;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -142,50 +162,29 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile,
data.in_class = in_class;
data.now = time (NULL);
data.ctx = ctx;
-
- if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
- if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
- return;
+
+ cur = g_list_first (ctx->cfg->statfiles);
+ while (cur) {
+ st = cur->data;
+ if (strcmp (symbol, st->symbol) == 0) {
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ /* Try to create statfile */
+ if (statfile_pool_create (pool,
+ st->path, st->size / sizeof (struct stat_file_block)) == -1) {
+ msg_err ("winnow_learn: cannot create statfile %s", st->path);
+ return;
+ }
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ msg_err ("winnow_learn: cannot create statfile %s", st->path);
+ return;
+ }
+ }
+ break;
}
+ cur = g_list_next (cur);
}
statfile_pool_lock_file (pool, data.file);
g_tree_foreach (input, learn_callback, &data);
statfile_pool_unlock_file (pool, data.file);
}
-
-struct winnow_result_data {
- char *filename;
- double max_score;
- double sum;
-};
-
-static void
-result_file_callback (gpointer key, gpointer value, gpointer data)
-{
- struct winnow_result_data *d = (struct winnow_result_data *)data;
- double w = *((double *)value);
-
- if (fabs (w) > fabs (d->max_score)) {
- d->filename = (char *)key;
- d->max_score = w;
- }
- d->sum += fabs (w);
-}
-
-char*
-winnow_result_file (struct classifier_ctx* ctx, double *probability)
-{
- struct winnow_result_data data = { NULL, 0, 0 };
- g_assert (ctx != NULL);
-
- g_hash_table_foreach (ctx->results, result_file_callback, &data);
- if (data.sum != 0) {
- *probability = data.max_score / data.sum;
- }
- else {
- *probability = 1;
- }
-
- return data.filename;
-}