SRCDIR="src"
OBJDIR="src/.obj"
-SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}"
+SOURCES="upstream.c cfg_utils.c memcached.c main.c util.c controller.c worker.c fstring.c url.c perl.c protocol.c mem_pool.c filter.c plugins/regexp.c plugins/surbl.c tokenizers/tokenizers.c tokenizers/osb.c classifiers/classifiers.c classifiers/winnow.c statfile.c ${LEX_OUTPUT} ${YACC_OUTPUT}" # classifiers/*.c: classifier table and winnow algorithm
MODULES="surbl regexp"
CFLAGS="$CFLAGS -W -Wpointer-arith -Wno-unused-parameter"
--- /dev/null
+/*
+ * Common classifier functions
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+struct classifier classifiers[] = {	/* table scanned by get_classifier () */
+ {"winnow", winnow_classify, winnow_learn },	/* name, classify_func, learn_func */
+};
+
+/*
+ * Look up a classifier in the classifiers[] table by its name.
+ *
+ * name - NUL-terminated classifier name, compared with strcmp ()
+ *
+ * Returns a pointer to the matching table entry, or NULL when name is NULL
+ * or no entry carries that name.
+ */
+struct classifier*
+get_classifier (char *name)
+{
+	/* size_t index: the element count below is an unsigned size_t expression */
+	size_t i;
+	const size_t total = sizeof (classifiers) / sizeof (classifiers[0]);
+
+	/* strcmp () on a NULL pointer is undefined, treat it as "not found" */
+	if (name == NULL) {
+		return NULL;
+	}
+
+	for (i = 0; i < total; i ++) {
+		if (strcmp (classifiers[i].name, name) == 0) {
+			return &classifiers[i];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
--- /dev/null
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../statfile.h"
+#include "../tokenizers/tokenizers.h"
+
+/*
+ * Common classifier structure: one entry of the classifiers[] table,
+ * binding an algorithm name to its classify and learn callbacks
+ */
+struct classifier {
+ char *name; /* key compared by get_classifier () */
+ double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input); /* e.g. winnow_classify */
+ void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class); /* e.g. winnow_learn */
+};
+
+/* Get classifier structure by name or return NULL if this name is not found */
+struct classifier* get_classifier (char *name);
+/*
+ * Winnow algorithm.
+ * winnow_classify returns the average block value over the tokens of input
+ * (blocks below 0.00001 count as 1), or 0 when the statfile cannot be opened.
+ * winnow_learn multiplies the block of every token of input by the promotion
+ * factor when in_class is non-zero and by the demotion factor otherwise.
+ */
+double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input);
+void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
+
+/* Array of all defined classifiers */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
--- /dev/null
+/*
+ * Winnow classifier
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+#define WINNOW_PROMOTION 1.23
+#define WINNOW_DEMOTION 0.83
+
+struct winnow_callback_data {	/* state handed to the g_tree_foreach () callbacks */
+ statfile_pool_t *pool;	/* pool passed to the statfile block accessors */
+ char *filename;	/* statfile whose blocks are read or written */
+ double sum;	/* running total of block values (classify only) */
+ int count;	/* number of tokens visited so far */
+ int in_class;	/* non-zero: promote blocks, zero: demote (learn only) */
+ time_t now;	/* timestamp forwarded to the block accessors */
+};
+
+/*
+ * Tree traversal callback used while classifying: adds the statfile block
+ * value of one token to the running sum and bumps the token counter.
+ * Always returns FALSE so that the traversal visits every node.
+ */
+static gboolean
+classify_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *state = data;
+	token_node_t *token = key;
+	float weight;
+
+	weight = statfile_pool_get_block (state->pool, state->filename, token->h1, token->h2, state->now);
+
+	/* Consider that not found blocks have value 1 */
+	state->sum += (weight < 0.00001) ? 1 : weight;
+	state->count ++;
+
+	return FALSE;
+}
+
+/*
+ * Tree traversal callback used while learning: scales the statfile block of
+ * one token by the promotion or demotion factor and writes it back.
+ * Always returns FALSE so that the traversal visits every node.
+ */
+static gboolean
+learn_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *state = data;
+	token_node_t *token = key;
+	float factor, current, updated;
+
+	if (state->in_class) {
+		factor = WINNOW_PROMOTION;
+	}
+	else {
+		factor = WINNOW_DEMOTION;
+	}
+
+	current = statfile_pool_get_block (state->pool, state->filename, token->h1, token->h2, state->now);
+
+	/* Consider that not found blocks have value 1 */
+	updated = (current < 0.00001) ? factor : current * factor;
+	statfile_pool_set_block (state->pool, state->filename, token->h1, token->h2, state->now, updated);
+
+	state->count ++;
+
+	return FALSE;
+}
+
+
+/*
+ * Classify a token tree with the Winnow algorithm.
+ *
+ * pool     - statfile pool holding the statistics
+ * statfile - name of the statfile to read, opened on demand
+ * input    - tree whose keys are token_node_t entries
+ *
+ * Returns the average block value over all tokens (blocks below 0.00001
+ * count as 1), or 0 if the statfile cannot be opened or input has no tokens.
+ */
+double
+winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	/* Not read while classifying, set only to keep the struct fully initialized */
+	data.in_class = 0;
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile)) {
+		if (statfile_pool_open (pool, statfile) == -1) {
+			return 0;
+		}
+	}
+
+	g_tree_foreach (input, classify_callback, &data);
+
+	/* An empty tree leaves count at 0; avoid 0/0 which would yield NaN */
+	if (data.count == 0) {
+		return 0;
+	}
+
+	return data.sum / data.count;
+}
+
+/*
+ * Train the Winnow statistics with a token tree.
+ *
+ * Every token of input has its statfile block promoted when in_class is
+ * non-zero and demoted otherwise. The statfile is opened on demand and is
+ * kept locked for the whole traversal. Nothing is done if it cannot be opened.
+ */
+void
+winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+{
+	struct winnow_callback_data ctx;
+
+	ctx.in_class = in_class;
+	ctx.count = 0;
+	ctx.sum = 0;
+	ctx.filename = statfile;
+	ctx.pool = pool;
+	ctx.now = time (NULL);
+
+	/* Open lazily; give up silently when the statfile is not available */
+	if (!statfile_pool_is_open (pool, statfile) && statfile_pool_open (pool, statfile) == -1) {
+		return;
+	}
+
+	statfile_pool_lock_file (pool, statfile);
+	g_tree_foreach (input, learn_callback, &ctx);
+	statfile_pool_unlock_file (pool, statfile);
+}
memory_pool_unlock_mutex (file->lock);
}
-uint32_t
+float
statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now)
{
stat_file_t *file;
}
void
-statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value)
+statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value)
{
stat_file_t *file;
struct stat_file_block *block, *to_expire = NULL;
struct stat_file_block {
uint32_t hash1;
uint32_t hash2;
- uint32_t value; /* In fact this is float */
+ float value; /* Block value as passed to statfile_pool_set_block () */
uint32_t last_access;
};
void statfile_pool_delete (statfile_pool_t *pool);
void statfile_pool_lock_file (statfile_pool_t *pool, char *filename);
void statfile_pool_unlock_file (statfile_pool_t *pool, char *filename);
-uint32_t statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
-void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, uint32_t value);
+float statfile_pool_get_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now);
+void statfile_pool_set_block (statfile_pool_t *pool, char *filename, uint32_t h1, uint32_t h2, time_t now, float value);
int statfile_pool_is_open (statfile_pool_t *pool, char *filename);
#endif
/* Get and set random blocks */
statfile_pool_lock_file (pool, TEST_FILENAME);
for (i = 0; i < HASHES_NUM; i ++) {
- statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, random_hashes[i]);
+ statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
}
statfile_pool_unlock_file (pool, TEST_FILENAME);
for (i = 0; i < HASHES_NUM; i ++) {
v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
- g_assert(v == random_hashes[i]);
+ g_assert(v == 1.0);
}
statfile_pool_delete (pool);