diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-12-04 19:33:26 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2008-12-04 19:33:26 +0300 |
commit | 1162170387a535c21a63777c5d73ecbf706d0e02 (patch) | |
tree | e09861309abc11030df40987ed30227b71308676 /src/classifiers | |
parent | 249c0583d2a12ddde67e05251e47f256a58cfd05 (diff) | |
download | rspamd-1162170387a535c21a63777c5d73ecbf706d0e02.tar.gz rspamd-1162170387a535c21a63777c5d73ecbf706d0e02.zip |
* Add simple implementation of classifiers abstraction and winnow classifier
* Force statfile to work with float values
Diffstat (limited to 'src/classifiers')
-rw-r--r-- | src/classifiers/classifiers.c | 28 | ||||
-rw-r--r-- | src/classifiers/classifiers.h | 33 | ||||
-rw-r--r-- | src/classifiers/winnow.c | 107 |
3 files changed, 168 insertions, 0 deletions
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
new file mode 100644
index 000000000..5dab03122
--- /dev/null
+++ b/src/classifiers/classifiers.c
@@ -0,0 +1,38 @@
+/*
+ * Common classifier functions
+ */
+
+#include <sys/types.h>
+#include <string.h>
+#include "classifiers.h"
+
+/* Table of every classifier that get_classifier () can return */
+struct classifier classifiers[] = {
+	{"winnow", winnow_classify, winnow_learn },
+};
+
+/*
+ * Look up a classifier by name.
+ * Returns a pointer into classifiers[] or NULL when name is NULL or unknown.
+ */
+struct classifier*
+get_classifier (char *name)
+{
+	size_t i;
+
+	if (name == NULL) {
+		return NULL;
+	}
+
+	for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i ++) {
+		if (strcmp (classifiers[i].name, name) == 0) {
+			return &classifiers[i];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
new file mode 100644
index 000000000..eb5e5de4e
--- /dev/null
+++ b/src/classifiers/classifiers.h
@@ -0,0 +1,43 @@
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../statfile.h"
+#include "../tokenizers/tokenizers.h"
+
+/* Common classifier structure */
+struct classifier {
+	/* Name used to look the classifier up via get_classifier () */
+	char *name;
+	/* Score the tokens in input against statfile and return the result */
+	double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input);
+	/* Update statfile from the tokens in input; in_class selects the direction */
+	void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+};
+
+/* Get classifier structure by name or return NULL if this name is not found */
+struct classifier* get_classifier (char *name);
+
+/*
+ * Winnow algorithm.
+ * winnow_classify returns the mean weight of the input tokens, or 0 when
+ * the statfile cannot be opened or input holds no tokens.
+ * winnow_learn multiplies each token weight by the promotion factor when
+ * in_class is non-zero and by the demotion factor otherwise.
+ */
+double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input);
+void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+
+/* Array of all defined classifiers */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
new file mode 100644
index 000000000..552f054b2
--- /dev/null
+++ b/src/classifiers/winnow.c
@@ -0,0 +1,141 @@
+/*
+ * Winnow classifier
+ */
+
+#include <sys/types.h>
+#include <time.h>
+#include "classifiers.h"
+
+/* Factor applied to a token weight when learning with in_class set */
+#define WINNOW_PROMOTION 1.23
+/* Factor applied to a token weight when learning with in_class unset */
+#define WINNOW_DEMOTION 0.83
+/* Block values below this threshold are treated as "not found" */
+#define WINNOW_MIN_WEIGHT 0.00001
+
+/* State shared between the entry points and the g_tree_foreach callbacks */
+struct winnow_callback_data {
+	statfile_pool_t *pool;	/* statfile pool to read from / write to */
+	char *filename;	/* name of the statfile */
+	double sum;	/* accumulated token weights (classify) */
+	int count;	/* number of tokens visited */
+	int in_class;	/* non-zero: promote, zero: demote (learn) */
+	time_t now;	/* timestamp passed to the block accessors */
+};
+
+/*
+ * g_tree_foreach callback: add the weight stored for one token to cd->sum.
+ * Returns FALSE so that the traversal continues over every token.
+ */
+static gboolean
+classify_callback (gpointer key, gpointer value, gpointer data)
+{
+	token_node_t *node = key;
+	struct winnow_callback_data *cd = data;
+	float v;
+
+	(void)value;
+
+	/* Consider that not found blocks have value 1 */
+	v = statfile_pool_get_block (cd->pool, cd->filename, node->h1, node->h2, cd->now);
+	if (v < WINNOW_MIN_WEIGHT) {
+		cd->sum += 1;
+	}
+	else {
+		cd->sum += v;
+	}
+
+	cd->count ++;
+
+	return FALSE;
+}
+
+/*
+ * g_tree_foreach callback: multiply the weight stored for one token by the
+ * promotion or demotion factor and write it back.
+ * Returns FALSE so that the traversal continues over every token.
+ */
+static gboolean
+learn_callback (gpointer key, gpointer value, gpointer data)
+{
+	token_node_t *node = key;
+	struct winnow_callback_data *cd = data;
+	float v, c;
+
+	(void)value;
+
+	c = (cd->in_class) ? WINNOW_PROMOTION : WINNOW_DEMOTION;
+
+	/* Consider that not found blocks have value 1 */
+	v = statfile_pool_get_block (cd->pool, cd->filename, node->h1, node->h2, cd->now);
+	if (v < WINNOW_MIN_WEIGHT) {
+		v = 1;
+	}
+	statfile_pool_set_block (cd->pool, cd->filename, node->h1, node->h2, cd->now, v * c);
+
+	cd->count ++;
+
+	return FALSE;
+}
+
+/*
+ * Classify the tokens in input against statfile.
+ * Returns the mean token weight, or 0 if the statfile cannot be opened or
+ * input contains no tokens.
+ */
+double
+winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	data.in_class = 0;
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile)) {
+		if (statfile_pool_open (pool, statfile) == -1) {
+			return 0;
+		}
+	}
+
+	g_tree_foreach (input, classify_callback, &data);
+
+	/* No tokens visited: avoid returning 0 / 0 (NaN) */
+	if (data.count == 0) {
+		return 0;
+	}
+
+	return data.sum / data.count;
+}
+
+/*
+ * Update the weights of the tokens in input inside statfile.
+ * Weights are promoted when in_class is non-zero and demoted otherwise.
+ * Returns silently if the statfile cannot be opened.
+ */
+void
+winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	data.in_class = in_class;
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile)) {
+		if (statfile_pool_open (pool, statfile) == -1) {
+			return;
+		}
+	}
+
+	/* Hold the statfile lock for the whole traversal */
+	statfile_pool_lock_file (pool, statfile);
+	g_tree_foreach (input, learn_callback, &data);
+	statfile_pool_unlock_file (pool, statfile);
+}