]> source.dussan.org Git - rspamd.git/commitdiff
* Add simple implementation of classifiers abstraction and winnow classifier
author: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Thu, 4 Dec 2008 16:33:26 +0000 (19:33 +0300)
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Thu, 4 Dec 2008 16:33:26 +0000 (19:33 +0300)
* Force statfile to work with float values

configure
src/classifiers/classifiers.c [new file with mode: 0644]
src/classifiers/classifiers.h [new file with mode: 0644]
src/classifiers/winnow.c [new file with mode: 0644]
src/statfile.c
src/statfile.h
test/rspamd_statfile_test.c

index 08d83c713c93b04af4a86d47ca3ea88a58af9895..76b2bbb608128985d2d95db5a0923fa82e989934 100755 (executable)
--- a/configure
+++ b/configure
@@ -24,7 +24,7 @@ CACHE="config.cache"
 
 SRCDIR="src"
 OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
 MODULES="surbl regexp"
 
 CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
new file mode 100644 (file)
index 0000000..5dab031
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Common classifier functions
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+/*
+ * Table of all classifiers compiled in; get_classifier () searches it by name.
+ * Each entry is { name, classify function, learn function }.
+ */
+struct classifier classifiers[] = {
+       {"winnow", winnow_classify, winnow_learn },
+};
+
+/*
+ * Look up a classifier by name.
+ *
+ * name: NUL-terminated classifier name; NULL is treated as "not found".
+ *
+ * Returns a pointer to the matching entry of the classifiers[] table,
+ * or NULL when no entry carries that name.
+ */
+struct classifier*
+get_classifier (char *name)
+{
+       size_t i;
+       const size_t total = sizeof (classifiers) / sizeof (classifiers[0]);
+
+       /* strcmp () on a NULL pointer is undefined behaviour, so bail out early */
+       if (name == NULL) {
+               return NULL;
+       }
+
+       /* Unsigned index: avoids comparing a signed int with a sizeof quotient */
+       for (i = 0; i < total; i ++) {
+               if (strcmp (classifiers[i].name, name) == 0) {
+                       return &classifiers[i];
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
new file mode 100644 (file)
index 0000000..eb5e5de
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../statfile.h"
+#include "../tokenizers/tokenizers.h"
+
+/*
+ * Common classifier structure: binds a classifier name to its
+ * classify and learn entry points.
+ */
+struct classifier {
+       /* Name that get_classifier () compares against when looking an entry up */
+       char *name;
+       /* Computes a double-valued result for the tokens in input using statfile from pool */
+       double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input);
+       /*
+        * Updates statfile from the tokens in input; in_class selects the direction
+        * of the update (winnow promotes weights when it is non-zero, demotes otherwise)
+        */
+       void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+};
+
+/* Get classifier structure by name or return NULL if this name is not found */
+struct classifier* get_classifier (char *name);
+/* Winnow algorithm */
+double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input);
+void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+
+/* Array of all defined classifiers */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
new file mode 100644 (file)
index 0000000..552f054
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * Winnow classifier
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+#define WINNOW_PROMOTION 1.23
+#define WINNOW_DEMOTION 0.83
+
+/* State handed to the g_tree_foreach () callbacks while walking the token tree */
+struct winnow_callback_data {
+       /* Statfile pool passed to the statfile_pool_* block accessors */
+       statfile_pool_t *pool;
+       /* Name of the statfile whose blocks are read or written */
+       char *filename;
+       /* Running total of token weights (accumulated by classify_callback) */
+       double sum;
+       /* Number of tokens visited so far */
+       int count;
+       /* Read by learn_callback: non-zero selects WINNOW_PROMOTION, zero WINNOW_DEMOTION */
+       int in_class;
+       /* Timestamp taken once per run and passed to every block get/set call */
+       time_t now;
+};
+
+/*
+ * g_tree_foreach () visitor used by winnow_classify ().
+ *
+ * key is the token_node_t being visited, value is unused and data points to
+ * the struct winnow_callback_data accumulator. Adds the token's stored weight
+ * to the running sum, counting weights below 0.00001 (treated as "block not
+ * found") as 1. Always returns FALSE so the traversal continues.
+ */
+static gboolean
+classify_callback (gpointer key, gpointer value, gpointer data) 
+{
+       struct winnow_callback_data *ctx = data;
+       token_node_t *token = key;
+       float weight;
+
+       weight = statfile_pool_get_block (ctx->pool, ctx->filename, token->h1, token->h2, ctx->now);
+
+       /* Consider that not found blocks have value 1 */
+       ctx->sum += (weight < 0.00001) ? 1 : weight;
+       ctx->count ++;
+
+       return FALSE;
+}
+
+/*
+ * g_tree_foreach () visitor used by winnow_learn ().
+ *
+ * key is the token_node_t being visited, value is unused and data points to
+ * the struct winnow_callback_data context. Multiplies the token's stored
+ * weight by WINNOW_PROMOTION (in_class set) or WINNOW_DEMOTION (in_class
+ * clear) and writes it back; weights below 0.00001 (treated as "block not
+ * found") start from 1, so the factor itself is stored. Always returns FALSE
+ * so the traversal continues.
+ */
+static gboolean
+learn_callback (gpointer key, gpointer value, gpointer data) 
+{
+       struct winnow_callback_data *ctx = data;
+       token_node_t *token = key;
+       float factor, current, updated;
+
+       if (ctx->in_class) {
+               factor = WINNOW_PROMOTION;
+       }
+       else {
+               factor = WINNOW_DEMOTION;
+       }
+
+       current = statfile_pool_get_block (ctx->pool, ctx->filename, token->h1, token->h2, ctx->now);
+
+       /* Consider that not found blocks have value 1 */
+       updated = (current < 0.00001) ? factor : current * factor;
+       statfile_pool_set_block (ctx->pool, ctx->filename, token->h1, token->h2, ctx->now, updated);
+
+       ctx->count ++;
+
+       return FALSE;
+}
+
+
+/*
+ * Classify a token tree with the winnow algorithm.
+ *
+ * pool:     statfile pool that owns the statfile
+ * statfile: name of the statfile holding token weights; opened on demand
+ * input:    tree whose keys are token_node_t tokens
+ *
+ * Returns the average weight of the tokens in input, where tokens without a
+ * stored weight count as 1. Returns 0 when the statfile cannot be opened or
+ * when input holds no tokens.
+ */
+double
+winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input)
+{
+       struct winnow_callback_data data;
+
+       data.pool = pool;
+       data.filename = statfile;
+       data.sum = 0;
+       data.count = 0;
+       /* Not read by classify_callback; set so that no field stays uninitialized */
+       data.in_class = 0;
+       data.now = time (NULL);
+
+       if (!statfile_pool_is_open (pool, statfile)) {
+               if (statfile_pool_open (pool, statfile) == -1) {
+                       return 0;
+               }
+       }
+
+       g_tree_foreach (input, classify_callback, &data);
+
+       /* An empty tree never invokes the callback; do not compute 0.0 / 0 (NaN) */
+       if (data.count == 0) {
+               return 0;
+       }
+
+       return data.sum / data.count;
+}
+
+/*
+ * Train a statfile on a token tree with the winnow algorithm.
+ *
+ * pool:     statfile pool that owns the statfile
+ * statfile: name of the statfile to update; opened on demand
+ * input:    tree whose keys are token_node_t tokens
+ * in_class: non-zero promotes the weight of every token, zero demotes it
+ *
+ * Returns silently without learning when the statfile cannot be opened.
+ * The statfile is locked for the duration of the tree walk.
+ */
+void
+winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+{
+       struct winnow_callback_data ctx;
+
+       ctx.now = time (NULL);
+       ctx.in_class = in_class;
+       ctx.count = 0;
+       ctx.sum = 0;
+       ctx.filename = statfile;
+       ctx.pool = pool;
+
+       /* Open the statfile on demand; give up if that fails */
+       if (!statfile_pool_is_open (pool, statfile) && statfile_pool_open (pool, statfile) == -1) {
+               return;
+       }
+
+       statfile_pool_lock_file (pool, statfile);
+       g_tree_foreach (input, learn_callback, &ctx);
+       statfile_pool_unlock_file (pool, statfile);
+}
index 3f68a1de20f04655c4b3e4b1c401fcf567401c50..ca78b79e9d499e622c3c95f71e9d4f450fb1bff2 100644 (file)
@@ -267,7 +267,7 @@ statfile_pool_unlock_file (statfile_pool_t *pool, char *filename)
        memory_pool_unlock_mutex (file->lock);
 }
 
-uint32_t 
+float
 statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now)
 {
        stat_file_t *file;
@@ -311,7 +311,7 @@ statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uin
 }
 
 void
-statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value)
+statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value)
 {
        stat_file_t *file;
        struct stat_file_block *block, *to_expire = NULL;
index 624ea64ee62d9e4ff0a93bc32ea6f0adb868e237..ea0dc9965b9caed98e187814ed230e4d5c46ac3f 100644 (file)
@@ -26,7 +26,7 @@ struct stat_file_header {
 struct stat_file_block {
        uint32_t hash1;
        uint32_t hash2;
-       uint32_t value; /* In fact this is float */
+       float value; /* In fact this is float */
        uint32_t last_access;
 };
 
@@ -62,8 +62,8 @@ int statfile_pool_close (statfile_pool_t *pool, char *filename);
 void statfile_pool_delete (statfile_pool_t *pool);
 void statfile_pool_lock_file (statfile_pool_t *pool, char *filename);
 void statfile_pool_unlock_file (statfile_pool_t *pool, char *filename);
-uint32_t statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
-void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value);
+float statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
+void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value);
 int statfile_pool_is_open (statfile_pool_t *pool, char *filename);
 
 #endif
index 6537bcfafe541671d7e09f80c4e6e61ecf03342b..19a6cf7abd1374093247aef7c05f07bae48e218f 100644 (file)
@@ -45,13 +45,13 @@ rspamd_statfile_test_func ()
        /* Get and set random blocks */
        statfile_pool_lock_file (pool, TEST_FILENAME);
        for (i = 0; i < HASHES_NUM; i ++) {
-               statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, random_hashes[i]);
+               statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
        }
        statfile_pool_unlock_file (pool, TEST_FILENAME);
 
        for (i = 0; i < HASHES_NUM; i ++) {
                v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
-               g_assert(v == random_hashes[i]);
+               g_assert(v == 1.0);
        }
 
        statfile_pool_delete (pool);