aboutsummaryrefslogtreecommitdiffstats
path: root/src/classifiers/bayes.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2010-08-13 18:50:29 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2010-08-13 18:50:29 +0400
commit868842d9c133e8823365386200f5c48442a068b5 (patch)
tree9a6961134b6d5612e870142069166cdbb5ed2ca8 /src/classifiers/bayes.c
parent9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c (diff)
downloadrspamd-868842d9c133e8823365386200f5c48442a068b5.tar.gz
rspamd-868842d9c133e8823365386200f5c48442a068b5.zip
* Add bayesian classifier (initial version)
Diffstat (limited to 'src/classifiers/bayes.c')
-rw-r--r--src/classifiers/bayes.c383
1 files changed, 383 insertions, 0 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
new file mode 100644
index 000000000..1ce36e0bd
--- /dev/null
+++ b/src/classifiers/bayes.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Bayesian classifier
+ */
+#include "classifiers.h"
+#include "../tokenizers/tokenizers.h"
+#include "../main.h"
+#include "../filter.h"
+#include "../cfg_file.h"
+#ifdef WITH_LUA
+#include "../lua/lua_common.h"
+#endif
+
+#define LOCAL_PROB_DENOM 16.0
+
+G_INLINE_FUNC GQuark
+bayes_error_quark (void)
+{
+ return g_quark_from_static_string ("bayes-error");
+}
+
+struct bayes_statfile_data {
+ double hits;
+ double total_hits;
+ double local_probability;
+ double post_probability;
+ double value;
+ struct statfile *st;
+ stat_file_t *file;
+};
+
+struct bayes_callback_data {
+ statfile_pool_t *pool;
+ struct classifier_ctx *ctx;
+ gboolean in_class;
+ time_t now;
+ stat_file_t *file;
+ struct bayes_statfile_data *statfiles;
+ guint32 statfiles_num;
+};
+
+static gboolean
+bayes_learn_callback (gpointer key, gpointer value, gpointer data)
+{
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ double v, c;
+
+ c = (cd->in_class) ? 1 : -1;
+
+ /* Consider that not found blocks have value 1 */
+ v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
+ if (fabs (v) < ALPHA && c > 0) {
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
+ }
+ else {
+ if (G_LIKELY (c > 0 && c < G_MAXDOUBLE)) {
+ v += c;
+ }
+ else if (c < 0){
+ if (v > -c) {
+ v -= c;
+ }
+ else {
+ v = 0;
+ }
+ }
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v);
+ }
+
+ return FALSE;
+}
+
+/*
+ * In this callback we calculate local probabilities for tokens
+ */
+static gboolean
+bayes_classify_callback (gpointer key, gpointer value, gpointer data)
+{
+
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ double local_hits = 0, renorm = 0;
+ int i;
+ struct bayes_statfile_data *cur;
+
+ for (i = 0; i < cd->statfiles_num; i ++) {
+ cur = &cd->statfiles[i];
+ cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now);
+ if (cur->value > ALPHA) {
+ cur->total_hits += cur->value;
+ cur->hits = cur->value;
+ local_hits += cur->value;
+ }
+ else {
+ cur->value = 0;
+ }
+ }
+ for (i = 0; i < cd->statfiles_num; i ++) {
+ cur = &cd->statfiles[i];
+ cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (local_hits + 1.0));
+ renorm += cur->post_probability * cur->local_probability;
+ }
+
+ for (i = 0; i < cd->statfiles_num; i ++) {
+ cur = &cd->statfiles[i];
+ cur->post_probability = (cur->post_probability * cur->local_probability) / renorm;
+ if (cur->post_probability < G_MINDOUBLE * 100) {
+ cur->post_probability = G_MINDOUBLE * 100;
+ }
+ }
+ renorm = 0;
+ for (i = 0; i < cd->statfiles_num; i ++) {
+ cur = &cd->statfiles[i];
+ renorm += cur->post_probability;
+ }
+ /* Renormalize to form sum of probabilities equal to 1 */
+ for (i = 0; i < cd->statfiles_num; i ++) {
+ cur = &cd->statfiles[i];
+ cur->post_probability /= renorm;
+ if (cur->post_probability < G_MINDOUBLE * 10) {
+ cur->post_probability = G_MINDOUBLE * 100;
+ }
+ }
+
+ return FALSE;
+}
+
+struct classifier_ctx*
+bayes_init (memory_pool_t *pool, struct classifier_config *cfg)
+{
+ struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx));
+
+ ctx->pool = pool;
+ ctx->cfg = cfg;
+
+ return ctx;
+}
+
+gboolean
+bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
+{
+ struct bayes_callback_data data;
+ char *value;
+ int nodes, minnodes, i, cnt, best_num = 0;
+ double best = 0;
+ struct statfile *st;
+ stat_file_t *file;
+ GList *cur;
+ char *sumbuf;
+
+ g_assert (pool != NULL);
+ g_assert (ctx != NULL);
+
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ minnodes = strtol (value, NULL, 10);
+ nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ if (nodes < minnodes) {
+ return FALSE;
+ }
+ }
+
+ data.statfiles_num = g_list_length (ctx->cfg->statfiles);
+ data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
+ data.pool = pool;
+ data.now = time (NULL);
+ data.ctx = ctx;
+
+ cur = ctx->cfg->statfiles;
+ i = 0;
+ while (cur) {
+ /* Select statfile to learn */
+ st = cur->data;
+ if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s", st->path);
+ cur = g_list_next (cur);
+ data.statfiles_num --;
+ continue;
+ }
+ }
+ data.statfiles[i].file = file;
+ data.statfiles[i].st = st;
+ data.statfiles[i].post_probability = 0.5;
+ data.statfiles[i].local_probability = 0.5;
+ i ++;
+ cur = g_list_next (cur);
+ }
+ cnt = i;
+
+ g_tree_foreach (input, bayes_classify_callback, &data);
+
+ for (i = 0; i < cnt; i ++) {
+ debug_task ("got probability for symbol %s: %.2f", data.statfiles[i].st->symbol, data.statfiles[i].post_probability);
+ if (data.statfiles[i].post_probability > best) {
+ best = data.statfiles[i].post_probability;
+ best_num = i;
+ }
+ }
+
+ if (best > 0.5) {
+ sumbuf = memory_pool_alloc (task->task_pool, 32);
+ rspamd_snprintf (sumbuf, 32, "%.2f", best);
+ cur = g_list_prepend (NULL, sumbuf);
+ insert_result (task, data.statfiles[best_num].st->symbol, best, cur);
+ }
+
+ g_free (data.statfiles);
+
+ return TRUE;
+}
+
+gboolean
+bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input,
+ gboolean in_class, double *sum, double multiplier, GError **err)
+{
+ struct bayes_callback_data data;
+ char *value;
+ int nodes, minnodes;
+ struct statfile *st, *sel_st = NULL;
+ stat_file_t *to_learn;
+ GList *cur;
+
+ g_assert (pool != NULL);
+ g_assert (ctx != NULL);
+
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ minnodes = strtol (value, NULL, 10);
+ nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ if (nodes < minnodes) {
+ msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes);
+ *sum = 0;
+ g_set_error (err,
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, minnodes);
+ return FALSE;
+ }
+ }
+
+ data.pool = pool;
+ data.in_class = in_class;
+ data.now = time (NULL);
+ data.ctx = ctx;
+ cur = ctx->cfg->statfiles;
+ while (cur) {
+ /* Select statfile to learn */
+ st = cur->data;
+ if (strcmp (st->symbol, symbol) == 0) {
+ sel_st = st;
+ break;
+ }
+ cur = g_list_next (cur);
+ }
+ if (sel_st == NULL) {
+ g_set_error (err,
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot find statfile for symbol: %s",
+ symbol);
+ }
+ if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
+ if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s", sel_st->path);
+ if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
+ msg_err ("cannot create statfile %s", sel_st->path);
+ g_set_error (err,
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ sel_st->path);
+ return FALSE;
+ }
+ if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) {
+ g_set_error (err,
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot open statfile %s after creation",
+ sel_st->path);
+ msg_err ("cannot open statfile %s after creation", sel_st->path);
+ return FALSE;
+ }
+ }
+ }
+ data.file = to_learn;
+ statfile_pool_lock_file (pool, data.file);
+ g_tree_foreach (input, bayes_learn_callback, &data);
+ statfile_pool_unlock_file (pool, data.file);
+
+ return TRUE;
+}
+
+GList *
+bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
+{
+ struct bayes_callback_data data;
+ char *value;
+ int nodes, minnodes, i, cnt;
+ struct classify_weight *w;
+ struct statfile *st;
+ stat_file_t *file;
+ GList *cur, *resl = NULL;
+
+ g_assert (pool != NULL);
+ g_assert (ctx != NULL);
+
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ minnodes = strtol (value, NULL, 10);
+ nodes = g_tree_nnodes (input) / FEATURE_WINDOW_SIZE;
+ if (nodes < minnodes) {
+ return NULL;
+ }
+ }
+
+ data.statfiles_num = g_list_length (ctx->cfg->statfiles);
+ data.statfiles = g_new0 (struct bayes_statfile_data, data.statfiles_num);
+ data.pool = pool;
+ data.now = time (NULL);
+ data.ctx = ctx;
+
+ cur = ctx->cfg->statfiles;
+ i = 0;
+ while (cur) {
+ /* Select statfile to learn */
+ st = cur->data;
+ if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ msg_warn ("cannot open %s", st->path);
+ cur = g_list_next (cur);
+ data.statfiles_num --;
+ continue;
+ }
+ }
+ data.statfiles[i].file = file;
+ data.statfiles[i].st = st;
+ data.statfiles[i].post_probability = 0.5;
+ data.statfiles[i].local_probability = 0.5;
+ i ++;
+ cur = g_list_next (cur);
+ }
+ cnt = i;
+
+ g_tree_foreach (input, bayes_classify_callback, &data);
+
+ for (i = 0; i < cnt; i ++) {
+ w = memory_pool_alloc0 (task->task_pool, sizeof (struct classify_weight));
+ w->name = data.statfiles[i].st->symbol;
+ w->weight = data.statfiles[i].post_probability;
+ resl = g_list_prepend (resl, w);
+ }
+
+ g_free (data.statfiles);
+
+ if (resl != NULL) {
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, resl);
+ }
+
+ return resl;
+}