From 1162170387a535c21a63777c5d73ecbf706d0e02 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Thu, 4 Dec 2008 19:33:26 +0300
Subject: [PATCH] * Add simple implementation of classifiers abstraction and
 winnow classifier * Force statfile to work with float values

---
 configure                     |   2 +-
 src/classifiers/classifiers.c |  28 +++++++++
 src/classifiers/classifiers.h |  33 +++++++++++
 src/classifiers/winnow.c      | 107 ++++++++++++++++++++++++++++++++++
 src/statfile.c                |   4 +-
 src/statfile.h                |   6 +-
 test/rspamd_statfile_test.c   |   4 +-
 7 files changed, 176 insertions(+), 8 deletions(-)
 create mode 100644 src/classifiers/classifiers.c
 create mode 100644 src/classifiers/classifiers.h
 create mode 100644 src/classifiers/winnow.c

diff --git a/configure b/configure
index 08d83c713..76b2bbb60 100755
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
 
 SRCDIR="src"
 OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
 MODULES="surbl regexp"
 
 CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
new file mode 100644
index 000000000..5dab03122
--- /dev/null
+++ b/src/classifiers/classifiers.c
@@ -0,0 +1,28 @@
+/*
+ * Common classifier functions
+ */
+
+#include <string.h>
+#include "classifiers.h"
+
+struct classifier classifiers[] = {
+	{"winnow", winnow_classify, winnow_learn },
+};
+
+struct classifier*
+get_classifier (char *name)
+{
+	size_t i;	/* unsigned, to match the sizeof-based element count below */
+
+	for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i ++) {
+		if (strcmp (classifiers[i].name, name) == 0) {
+			return &classifiers[i];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
new file mode 100644
index 000000000..eb5e5de4e
--- /dev/null
+++ b/src/classifiers/classifiers.h
@@ -0,0 +1,33 @@
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../statfile.h"
+#include "../tokenizers/tokenizers.h"
+
+/* Common classifier structure */
+struct classifier {
+	char *name;
+	double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input);
+	void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+};
+
+/* Get classifier structure by name or return NULL if this name is not found */
+struct classifier* get_classifier (char *name);
+/* Winnow algorithm */
+double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input);
+void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+
+/* Array of all defined classifiers */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
new file mode 100644
index 000000000..552f054b2
--- /dev/null
+++ b/src/classifiers/winnow.c
@@ -0,0 +1,107 @@
+/*
+ * Winnow classifier
+ */
+
+#include <time.h>
+#include "classifiers.h"
+
+#define WINNOW_PROMOTION 1.23
+#define WINNOW_DEMOTION 0.83
+
+struct winnow_callback_data {
+	statfile_pool_t *pool;
+	char *filename;
+	double sum;
+	int count;
+	int in_class;
+	time_t now;
+};
+
+/* g_tree_foreach callback: add the stored value of one token to cd->sum */
+static gboolean
+classify_callback (gpointer key, gpointer value, gpointer data)
+{
+	token_node_t *node = key;
+	struct winnow_callback_data *cd = data;
+	float v;
+
+	/* Consider that not found blocks have value 1 */
+	if ((v = statfile_pool_get_block (cd->pool, cd->filename, node->h1, node->h2, cd->now)) < 0.00001) {
+		cd->sum += 1;
+	}
+	else {
+		cd->sum += v;
+	}
+
+	cd->count ++;
+	return FALSE;	/* FALSE keeps the tree traversal going */
+}
+
+/* g_tree_foreach callback: scale the stored value of one token by the promotion or demotion factor */
+static gboolean
+learn_callback (gpointer key, gpointer value, gpointer data)
+{
+	token_node_t *node = key;
+	struct winnow_callback_data *cd = data;
+	float v, c;
+
+	c = (cd->in_class) ? WINNOW_PROMOTION : WINNOW_DEMOTION;
+
+	/* Consider that not found blocks have value 1 */
+	if ((v = statfile_pool_get_block (cd->pool, cd->filename, node->h1, node->h2, cd->now)) < 0.00001) {
+		statfile_pool_set_block (cd->pool, cd->filename, node->h1, node->h2, cd->now, c);
+	}
+	else {
+		statfile_pool_set_block (cd->pool, cd->filename, node->h1, node->h2, cd->now, v * c);
+	}
+
+	cd->count ++;
+	return FALSE;	/* FALSE keeps the tree traversal going */
+}
+
+/* Return the mean stored value over the tokens of input; 0 if statfile cannot be opened or input is empty */
+double
+winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile)) {
+		if (statfile_pool_open (pool, statfile) == -1) {
+			return 0;
+		}
+	}
+
+	g_tree_foreach (input, classify_callback, &data);
+	/* An empty tree leaves count at 0; return 0 rather than the NaN of 0.0 / 0 */
+	return data.count > 0 ? data.sum / data.count : 0;
+}
+
+/* Scale every token of input by WINNOW_PROMOTION (in_class != 0) or WINNOW_DEMOTION, holding the statfile lock */
+void
+winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	data.in_class = in_class;
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile)) {
+		if (statfile_pool_open (pool, statfile) == -1) {
+			return;
+		}
+	}
+
+	statfile_pool_lock_file (pool, statfile);
+	g_tree_foreach (input, learn_callback, &data);
+	statfile_pool_unlock_file (pool, statfile);
+}
diff --git a/src/statfile.c b/src/statfile.c
index 3f68a1de2..ca78b79e9 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -267,7 +267,7 @@ statfile_pool_unlock_file (statfile_pool_t *pool, char *filename)
 	memory_pool_unlock_mutex (file->lock);
 }
 
-uint32_t
+float
 statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now)
 {
 	stat_file_t *file;
@@ -311,7 +311,7 @@ statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uin
 }
 
 void
-statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value)
+statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value)
 {
 	stat_file_t *file;
 	struct stat_file_block *block, *to_expire = NULL;
diff --git a/src/statfile.h b/src/statfile.h
index 624ea64ee..ea0dc9965 100644
--- a/src/statfile.h
+++ b/src/statfile.h
@@ -26,7 +26,7 @@ struct stat_file_header {
 struct stat_file_block {
 	uint32_t hash1;
 	uint32_t hash2;
-	uint32_t value; /* In fact this is float */
+	float value; /* Value stored for the (hash1, hash2) pair */
 	uint32_t last_access;
 };
 
@@ -62,8 +62,8 @@ int statfile_pool_close (statfile_pool_t *pool, char *filename);
 void statfile_pool_delete (statfile_pool_t *pool);
 void statfile_pool_lock_file (statfile_pool_t *pool, char *filename);
 void statfile_pool_unlock_file (statfile_pool_t *pool, char *filename);
-uint32_t statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
-void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value);
+float statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
+void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value);
 int statfile_pool_is_open (statfile_pool_t *pool, char *filename);
 
 #endif
diff --git a/test/rspamd_statfile_test.c b/test/rspamd_statfile_test.c
index 6537bcfaf..19a6cf7ab 100644
--- a/test/rspamd_statfile_test.c
+++ b/test/rspamd_statfile_test.c
@@ -45,13 +45,13 @@ rspamd_statfile_test_func ()
 	/* Get and set random blocks */
 	statfile_pool_lock_file (pool, TEST_FILENAME);
 	for (i = 0; i < HASHES_NUM; i ++) {
-		statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, random_hashes[i]);
+		statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
 	}
 	statfile_pool_unlock_file (pool, TEST_FILENAME);
 
 	for (i = 0; i < HASHES_NUM; i ++) {
 		v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
-		g_assert(v == random_hashes[i]);
+		g_assert(v == 1.0);
 	}
 
 	statfile_pool_delete (pool);
-- 
2.39.5