summaryrefslogtreecommitdiffstats
path: root/src/classifiers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-12-04 19:33:26 +0300
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-12-04 19:33:26 +0300
commit 1162170387a535c21a63777c5d73ecbf706d0e02 (patch)
tree e09861309abc11030df40987ed30227b71308676 /src/classifiers
parent 249c0583d2a12ddde67e05251e47f256a58cfd05 (diff)
download rspamd-1162170387a535c21a63777c5d73ecbf706d0e02.tar.gz
rspamd-1162170387a535c21a63777c5d73ecbf706d0e02.zip
* Add simple implementation of classifiers abstraction and winnow classifier
* Force statfile to work with float values
Diffstat (limited to 'src/classifiers')
-rw-r--r-- src/classifiers/classifiers.c 28
-rw-r--r-- src/classifiers/classifiers.h 33
-rw-r--r-- src/classifiers/winnow.c 107
3 files changed, 168 insertions, 0 deletions
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
new file mode 100644
index 000000000..5dab03122
--- /dev/null
+++ b/src/classifiers/classifiers.c
@@ -0,0 +1,28 @@
+/*
+ * Common classifier functions
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+struct classifier classifiers[] = { /* Table scanned by get_classifier (); fields are name, classify_func, learn_func */
+ {"winnow", winnow_classify, winnow_learn },
+};
+
+/* Return the classifiers[] entry whose name equals name, or NULL if name is NULL or unknown */
+struct classifier*
+get_classifier (char *name)
+{
+	size_t i, total = sizeof (classifiers) / sizeof (classifiers[0]);
+	/* size_t index matches sizeof; a NULL name is rejected because strcmp (NULL) is undefined */
+	for (i = 0; name != NULL && i < total; i ++) {
+		if (strcmp (classifiers[i].name, name) == 0) {
+			return &classifiers[i];
+		}
+	}
+	return NULL;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
new file mode 100644
index 000000000..eb5e5de4e
--- /dev/null
+++ b/src/classifiers/classifiers.h
@@ -0,0 +1,33 @@
+#ifndef CLASSIFIERS_H
+#define CLASSIFIERS_H
+
+#include <sys/types.h>
+#include "../config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include "../mem_pool.h"
+#include "../statfile.h"
+#include "../tokenizers/tokenizers.h"
+
+/* Common classifier structure: one entry per algorithm in the classifiers[] table */
+struct classifier {
+ char *name; /* Key compared with strcmp () by get_classifier () */
+ double (*classify_func)(statfile_pool_t *pool, char *statfile, GTree *input); /* Scores input against a statfile */
+ void (*learn_func)(statfile_pool_t *pool, char *statfile, GTree *input, int in_class); /* Updates a statfile from input */
+};
+
+/* Get classifier structure by name; returns NULL if no classifier with this name is defined */
+struct classifier* get_classifier (char *name);
+/* Winnow algorithm */
+double winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input); /* Mean weight of the input tokens; 0 if the statfile cannot be opened */
+void winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class); /* Promotes (in_class != 0) or demotes (in_class == 0) each input token's weight */
+
+/* Array of all defined classifiers; get_classifier () searches it by name */
+extern struct classifier classifiers[];
+
+#endif
+/*
+ * vi:ts=4
+ */
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
new file mode 100644
index 000000000..552f054b2
--- /dev/null
+++ b/src/classifiers/winnow.c
@@ -0,0 +1,107 @@
+/*
+ * Winnow classifier
+ */
+
+#include <sys/types.h>
+#include "classifiers.h"
+
+#define WINNOW_PROMOTION 1.23 /* Weight multiplier used by learn_callback () when in_class is non-zero */
+#define WINNOW_DEMOTION 0.83 /* Weight multiplier used by learn_callback () when in_class is zero */
+
+struct winnow_callback_data { /* State shared between winnow_classify ()/winnow_learn () and their tree callbacks */
+ statfile_pool_t *pool; /* Pool handed to statfile_pool_get_block ()/statfile_pool_set_block () */
+ char *filename; /* Statfile name the blocks are read from or written to */
+ double sum; /* Running total of token weights; accumulated by classify_callback () only */
+ int count; /* Number of tokens visited so far */
+ int in_class; /* Non-zero selects promotion, zero demotion; read by learn_callback () only */
+ time_t now; /* Timestamp taken once with time (NULL) and passed to every block access */
+};
+
+/* GTree traversal callback: add the weight of one token to the running sum */
+static gboolean
+classify_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *ctx = data;
+	token_node_t *token = key;
+	float weight;
+
+	weight = statfile_pool_get_block (ctx->pool, ctx->filename, token->h1, token->h2, ctx->now);
+	/* Consider that not found blocks have value 1 */
+	if (weight < 0.00001) {
+		weight = 1;
+	}
+	ctx->sum += weight;
+	ctx->count ++;
+
+	/* FALSE tells g_tree_foreach () to continue with the next node */
+	return FALSE;
+}
+
+/* GTree traversal callback: multiply the stored weight of one token by the learning factor */
+static gboolean
+learn_callback (gpointer key, gpointer value, gpointer data)
+{
+	struct winnow_callback_data *ctx = data;
+	token_node_t *token = key;
+	float factor = (ctx->in_class) ? WINNOW_PROMOTION : WINNOW_DEMOTION;
+	float weight;
+
+	weight = statfile_pool_get_block (ctx->pool, ctx->filename, token->h1, token->h2, ctx->now);
+	/* Consider that not found blocks have value 1 */
+	if (weight < 0.00001) {
+		weight = 1;
+	}
+	/* Promotion and demotion are the same multiplication with a different factor */
+	statfile_pool_set_block (ctx->pool, ctx->filename, token->h1, token->h2, ctx->now, weight * factor);
+
+	ctx->count ++;
+
+	return FALSE;
+}
+
+
+/* Classify input against statfile: returns the mean token weight, or 0 on open failure or empty input */
+double
+winnow_classify (statfile_pool_t *pool, char *statfile, GTree *input)
+{
+	struct winnow_callback_data data;
+
+	data.pool = pool;
+	data.filename = statfile;
+	data.sum = 0;
+	data.count = 0;
+	data.in_class = 0; /* not read by classify_callback (); set so no field stays indeterminate */
+	data.now = time (NULL);
+
+	if (!statfile_pool_is_open (pool, statfile) && statfile_pool_open (pool, statfile) == -1) {
+		return 0;
+	}
+	g_tree_foreach (input, classify_callback, &data);
+
+	/* An empty tree leaves count at 0 and 0.0 / 0 is NaN, so report 0 as the failure path does */
+	return (data.count > 0) ? data.sum / data.count : 0;
+}
+
+/* Train statfile on input: each token weight is promoted when in_class is non-zero, demoted otherwise */
+void
+winnow_learn (statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+{
+	struct winnow_callback_data ctx;
+
+	ctx.now = time (NULL);
+	ctx.in_class = in_class;
+	ctx.count = 0;
+	ctx.sum = 0;
+	ctx.filename = statfile;
+	ctx.pool = pool;
+
+	/* Open the statfile on demand; nothing is learned if that fails */
+	if (!statfile_pool_is_open (pool, statfile) && statfile_pool_open (pool, statfile) == -1) {
+		return;
+	}
+
+	/* The file lock is held for the whole traversal */
+	statfile_pool_lock_file (pool, statfile);
+	g_tree_foreach (input, learn_callback, &ctx);
+	statfile_pool_unlock_file (pool, statfile);
+}