aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-12-04 19:33:26 +0300
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-12-04 19:33:26 +0300
commit 1162170387a535c21a63777c5d73ecbf706d0e02 (patch)
tree e09861309abc11030df40987ed30227b71308676
parent 249c0583d2a12ddde67e05251e47f256a58cfd05 (diff)
download rspamd-1162170387a535c21a63777c5d73ecbf706d0e02.tar.gz
rspamd-1162170387a535c21a63777c5d73ecbf706d0e02.zip
* Add simple implementation of classifiers abstraction and winnow classifier
* Force statfile to work with float values
-rwxr-xr-x configure 2
-rw-r--r-- src/classifiers/classifiers.c 28
-rw-r--r-- src/classifiers/classifiers.h 33
-rw-r--r-- src/classifiers/winnow.c 107
-rw-r--r-- src/statfile.c 4
-rw-r--r-- src/statfile.h 6
-rw-r--r-- test/rspamd_statfile_test.c 4
7 files changed, 176 insertions, 8 deletions
diff --git a/configure b/configure
index 08d83c713..76b2bbb60 100755
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
SRCDIR="src"
OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
MODULES="surbl regexp"
CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
new file mode 100644
index 000000000..5dab03122
--- /dev/null
+++ b/src/classifiers/classifiers.c
@@ -0,0 +1,28 @@
+/*
+ * Common classifier functions
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+/* Table of every classifier known to rspamd; get_classifier () searches it by name */
+struct classifier classifiers[] = {
+	{
+		"winnow",			/* name */
+		winnow_classify,	/* classify_func */
+		winnow_learn		/* learn_func */
+	},
+};
+
+/*
+ * Look up a classifier by its name.
+ *
+ * name: NUL-terminated classifier name, compared case-sensitively with strcmp
+ *
+ * Returns a pointer to the matching entry of classifiers[], or NULL when
+ * name is NULL or no entry carries that name.
+ */
+struct classifier*
+get_classifier (char *name)
+{
+	size_t i;
+
+	/* strcmp on a NULL pointer is undefined behaviour; treat it as "not found" */
+	if (name == NULL) {
+		return NULL;
+	}
+
+	/* size_t index: sizeof yields an unsigned value, so avoid a signed/unsigned comparison */
+	for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i ++) {
+		if (strcmp (classifiers[i].name, name) == 0) {
+			return &classifiers[i];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
new file mode 100644
index 000000000..eb5e5de4e
--- /dev/null
+++ b/src/classifiers/classifiers.h
@@ -0,0 +1,33 @@
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../statfile.h"
+#include "../tokenizers/tokenizers.h"
+
+/* Common classifier structure
+ * Describes one classification algorithm: a name used for lookup plus the
+ * two entry points that implement it. Instances are initialised positionally,
+ * so the member order below must not change.
+ */
+struct classifier {
+ char *name; /* name matched by get_classifier () */
+ double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input); /* scores the tokens in input against statfile */
+ void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class); /* updates statfile from the tokens in input */
+};
+
+/* Get classifier structure by name or return NULL if this name is not found */
+struct classifier* get_classifier (char *name);
+/* Winnow algorithm.
+ * winnow_classify averages the stored block values of the tokens in input,
+ * counting blocks whose value is below 0.00001 as 1, and returns 0 when the
+ * statfile cannot be opened.
+ * winnow_learn multiplies the block value of every token in input by 1.23
+ * when in_class is non-zero and by 0.83 otherwise; blocks whose value is
+ * below 0.00001 are set to that factor itself.
+ */
+double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input);
+void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+
+/* Array of all defined classifiers */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
new file mode 100644
index 000000000..552f054b2
--- /dev/null
+++ b/src/classifiers/winnow.c
@@ -0,0 +1,107 @@
+/*
+ * Winnow classifier
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+#define WINNOW_PROMOTION 1.23
+#define WINNOW_DEMOTION 0.83
+
+struct winnow_callback_data { /* state handed to the g_tree_foreach callbacks of the winnow classifier */
+ statfile_pool_t *pool; /* pool passed to the statfile_pool_* calls */
+ char *filename; /* statfile name passed to the statfile_pool_* calls */
+ double sum; /* running total of block values, filled by classify_callback */
+ int count; /* number of tokens visited so far */
+ int in_class; /* non-zero selects WINNOW_PROMOTION, zero WINNOW_DEMOTION; read only by learn_callback */
+ time_t now; /* timestamp taken once per run and passed to every block get/set */
+};
+
+/*
+ * Tree traversal callback for winnow_classify: adds the weight of one token
+ * to the running sum and bumps the token counter.
+ *
+ * key:   token_node_t describing the token (its h1/h2 hashes select the block)
+ * value: unused
+ * data:  struct winnow_callback_data holding the accumulators
+ *
+ * Always returns FALSE.
+ */
+static gboolean
+classify_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *state = data;
+	token_node_t *token = key;
+	float weight;
+
+	weight = statfile_pool_get_block (state->pool, state->filename, token->h1, token->h2, state->now);
+
+	/* Blocks that were not found (value below the threshold) weigh 1 */
+	state->sum += (weight < 0.00001) ? 1 : weight;
+	state->count ++;
+
+	/* FALSE tells g_tree_foreach to continue with the next node */
+	return FALSE;
+}
+
+/*
+ * Tree traversal callback for winnow_learn: rescales the stored weight of one
+ * token by the promotion or demotion factor.
+ *
+ * key:   token_node_t describing the token (its h1/h2 hashes select the block)
+ * value: unused
+ * data:  struct winnow_callback_data; in_class picks the factor
+ *
+ * Always returns FALSE.
+ */
+static gboolean
+learn_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *state = data;
+	token_node_t *token = key;
+	float factor, current, updated;
+
+	if (state->in_class) {
+		factor = WINNOW_PROMOTION;
+	}
+	else {
+		factor = WINNOW_DEMOTION;
+	}
+
+	current = statfile_pool_get_block (state->pool, state->filename, token->h1, token->h2, state->now);
+
+	/* Blocks that were not found are treated as 1, so they become the factor itself */
+	updated = (current < 0.00001) ? factor : current * factor;
+	statfile_pool_set_block (state->pool, state->filename, token->h1, token->h2, state->now, updated);
+
+	state->count ++;
+
+	/* FALSE tells g_tree_foreach to continue with the next node */
+	return FALSE;
+}
+
+
+/*
+ * Score a set of tokens against a statfile with the winnow algorithm.
+ *
+ * pool:     statfile pool used for all statfile operations
+ * statfile: name of the statfile to consult; opened here if it is not open yet
+ * input:    tree whose keys are token_node_t tokens
+ *
+ * Returns the mean weight of the tokens, where tokens without a stored block
+ * count as 1. Returns 0 when the statfile cannot be opened or when input
+ * contains no tokens.
+ */
+double
+winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	/* Not read while classifying, but do not leave the field uninitialised */
+	data.in_class = 0;
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile)) {
+		if (statfile_pool_open (pool, statfile) == -1) {
+			return 0;
+		}
+	}
+
+	g_tree_foreach (input, classify_callback, &data);
+
+	/* An empty tree would otherwise evaluate 0.0 / 0 and hand NaN to the caller */
+	if (data.count == 0) {
+		return 0;
+	}
+
+	return data.sum / data.count;
+}
+
+/*
+ * Train a statfile on a set of tokens with the winnow algorithm.
+ *
+ * pool:     statfile pool used for all statfile operations
+ * statfile: name of the statfile to update; opened here if it is not open yet
+ * input:    tree whose keys are token_node_t tokens
+ * in_class: non-zero promotes the tokens' weights, zero demotes them
+ *
+ * Returns silently without touching anything when the statfile cannot be opened.
+ */
+void
+winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+{
+	struct winnow_callback_data ctx;
+
+	ctx.pool = pool;
+	ctx.filename = statfile;
+	ctx.in_class = in_class;
+	ctx.sum = 0;
+	ctx.count = 0;
+	ctx.now = time (NULL);
+
+	/* Open the statfile on demand; give up if that fails */
+	if (!statfile_pool_is_open (pool, statfile) && statfile_pool_open (pool, statfile) == -1) {
+		return;
+	}
+
+	/* The file stays locked for the whole traversal */
+	statfile_pool_lock_file (pool, statfile);
+	g_tree_foreach (input, learn_callback, &ctx);
+	statfile_pool_unlock_file (pool, statfile);
+}
diff --git a/src/statfile.c b/src/statfile.c
index 3f68a1de2..ca78b79e9 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -267,7 +267,7 @@ statfile_pool_unlock_file (statfile_pool_t *pool, char *filename)
memory_pool_unlock_mutex (file->lock);
}
-uint32_t
+float
statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now)
{
stat_file_t *file;
@@ -311,7 +311,7 @@ statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uin
}
void
-statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value)
+statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value)
{
stat_file_t *file;
struct stat_file_block *block, *to_expire = NULL;
diff --git a/src/statfile.h b/src/statfile.h
index 624ea64ee..ea0dc9965 100644
--- a/src/statfile.h
+++ b/src/statfile.h
@@ -26,7 +26,7 @@ struct stat_file_header {
struct stat_file_block {
uint32_t hash1;
uint32_t hash2;
- uint32_t value; /* In fact this is float */
+ float value; /* In fact this is float */
uint32_t last_access;
};
@@ -62,8 +62,8 @@ int statfile_pool_close (statfile_pool_t *pool, char *filename);
void statfile_pool_delete (statfile_pool_t *pool);
void statfile_pool_lock_file (statfile_pool_t *pool, char *filename);
void statfile_pool_unlock_file (statfile_pool_t *pool, char *filename);
-uint32_t statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
-void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value);
+float statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
+void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value);
int statfile_pool_is_open (statfile_pool_t *pool, char *filename);
#endif
diff --git a/test/rspamd_statfile_test.c b/test/rspamd_statfile_test.c
index 6537bcfaf..19a6cf7ab 100644
--- a/test/rspamd_statfile_test.c
+++ b/test/rspamd_statfile_test.c
@@ -45,13 +45,13 @@ rspamd_statfile_test_func ()
/* Get and set random blocks */
statfile_pool_lock_file (pool, TEST_FILENAME);
for (i = 0; i < HASHES_NUM; i ++) {
- statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, random_hashes[i]);
+ statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
}
statfile_pool_unlock_file (pool, TEST_FILENAME);
for (i = 0; i < HASHES_NUM; i ++) {
v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
- g_assert(v == random_hashes[i]);
+ g_assert(v == 1.0);
}
statfile_pool_delete (pool);