aboutsummaryrefslogtreecommitdiffstats
path: root/src/classifiers/bayes.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/classifiers/bayes.c')
-rw-r--r--src/classifiers/bayes.c322
1 files changed, 130 insertions, 192 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index 2d3fca084..a8a18f5ff 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -25,13 +25,13 @@
/*
* Bayesian classifier
*/
-#include "binlog.h"
-#include "cfg_file.h"
#include "classifiers.h"
+#include "tokenizers/tokenizers.h"
+#include "main.h"
#include "filter.h"
+#include "cfg_file.h"
+#include "binlog.h"
#include "lua/lua_common.h"
-#include "main.h"
-#include "tokenizers/tokenizers.h"
#define LOCAL_PROB_DENOM 16.0
@@ -42,68 +42,56 @@ bayes_error_quark (void)
}
struct bayes_statfile_data {
- guint64 hits;
- guint64 total_hits;
- double value;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
+ guint64 hits;
+ guint64 total_hits;
+ double value;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
};
struct bayes_callback_data {
- statfile_pool_t *pool;
- struct classifier_ctx *ctx;
- gboolean in_class;
- time_t now;
- stat_file_t *file;
- struct bayes_statfile_data *statfiles;
- guint32 statfiles_num;
- guint64 total_spam;
- guint64 total_ham;
- guint64 processed_tokens;
- gsize max_tokens;
- double spam_probability;
- double ham_probability;
+ statfile_pool_t *pool;
+ struct classifier_ctx *ctx;
+ gboolean in_class;
+ time_t now;
+ stat_file_t *file;
+ struct bayes_statfile_data *statfiles;
+ guint32 statfiles_num;
+ guint64 total_spam;
+ guint64 total_ham;
+ guint64 processed_tokens;
+ gsize max_tokens;
+ double spam_probability;
+ double ham_probability;
};
-static gboolean
+static gboolean
bayes_learn_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
- struct bayes_callback_data *cd = data;
- gint c;
- guint64 v;
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ gint c;
+ guint64 v;
c = (cd->in_class) ? 1 : -1;
/* Consider that not found blocks have value 1 */
- v =
- statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
- cd->now);
+ v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
if (v == 0 && c > 0) {
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- c);
- cd->processed_tokens++;
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
+ cd->processed_tokens ++;
}
else if (v != 0) {
if (G_LIKELY (c > 0)) {
- v++;
+ v ++;
}
- else if (c < 0) {
+ else if (c < 0){
if (v != 0) {
- v--;
+ v --;
}
}
- statfile_pool_set_block (cd->pool,
- cd->file,
- node->h1,
- node->h2,
- cd->now,
- v);
- cd->processed_tokens++;
+ statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v);
+ cd->processed_tokens ++;
}
if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
@@ -145,7 +133,7 @@ inv_chi_square (gdouble value, gint freedom_deg)
return 0;
}
sum = prob;
- for (i = 1; i < freedom_deg / 2; i++) {
+ for (i = 1; i < freedom_deg / 2; i ++) {
prob *= value / (gdouble)i;
sum += prob;
}
@@ -160,20 +148,16 @@ static gboolean
bayes_classify_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
- struct bayes_callback_data *cd = data;
- guint i;
- struct bayes_statfile_data *cur;
- guint64 spam_count = 0, ham_count = 0, total_count = 0;
- double spam_prob, spam_freq, ham_freq, bayes_spam_prob;
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ guint i;
+ struct bayes_statfile_data *cur;
+ guint64 spam_count = 0, ham_count = 0, total_count = 0;
+ double spam_prob, spam_freq, ham_freq, bayes_spam_prob;
- for (i = 0; i < cd->statfiles_num; i++) {
+ for (i = 0; i < cd->statfiles_num; i ++) {
cur = &cd->statfiles[i];
- cur->value = statfile_pool_get_block (cd->pool,
- cur->file,
- node->h1,
- node->h2,
- cd->now);
+ cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now);
if (cur->value > 0) {
cur->total_hits += cur->value;
if (cur->st->is_spam) {
@@ -194,7 +178,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count);
cd->spam_probability += log (bayes_spam_prob);
cd->ham_probability += log (1. - bayes_spam_prob);
- cd->processed_tokens++;
+ cd->processed_tokens ++;
}
if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
@@ -205,11 +189,10 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
return FALSE;
}
-struct classifier_ctx *
+struct classifier_ctx*
bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
{
- struct classifier_ctx *ctx =
- rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
+ struct classifier_ctx *ctx = rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
ctx->pool = pool;
ctx->cfg = cfg;
@@ -219,28 +202,23 @@ bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
}
gboolean
-bayes_classify (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- lua_State *L)
+bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task, lua_State *L)
{
- struct bayes_callback_data data;
- gchar *value;
- gint nodes, i = 0, selected_st = -1, cnt;
- gint minnodes;
- guint64 maxhits = 0, rev;
- double final_prob, h, s;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
- GList *cur;
- char *sumbuf;
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes, i = 0, selected_st = -1, cnt;
+ gint minnodes;
+ guint64 maxhits = 0, rev;
+ double final_prob, h, s;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
+ GList *cur;
+ char *sumbuf;
g_assert (pool != NULL);
g_assert (ctx != NULL);
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
@@ -253,8 +231,7 @@ bayes_classify (struct classifier_ctx * ctx,
cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
if (cur) {
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free, cur);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
}
else {
cur = ctx->cfg->statfiles;
@@ -271,8 +248,7 @@ bayes_classify (struct classifier_ctx * ctx,
data.ham_probability = 0;
data.total_ham = 0;
data.total_spam = 0;
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
minnodes = rspamd_config_parse_limit (value, -1);
data.max_tokens = minnodes;
}
@@ -284,11 +260,10 @@ bayes_classify (struct classifier_ctx * ctx,
/* Select statfile to classify */
st = cur->data;
if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((file =
- statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s", st->path);
cur = g_list_next (cur);
- data.statfiles_num--;
+ data.statfiles_num --;
continue;
}
}
@@ -303,7 +278,7 @@ bayes_classify (struct classifier_ctx * ctx,
}
cur = g_list_next (cur);
- i++;
+ i ++;
}
cnt = i;
@@ -314,19 +289,17 @@ bayes_classify (struct classifier_ctx * ctx,
final_prob = 0;
}
else {
- h = 1 - inv_chi_square (-2. * data.spam_probability,
- 2 * data.processed_tokens);
- s = 1 - inv_chi_square (-2. * data.ham_probability,
- 2 * data.processed_tokens);
+ h = 1 - inv_chi_square (-2. * data.spam_probability, 2 * data.processed_tokens);
+ s = 1 - inv_chi_square (-2. * data.ham_probability, 2 * data.processed_tokens);
final_prob = (s + 1 - h) / 2.;
}
if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
- for (i = 0; i < cnt; i++) {
+ for (i = 0; i < cnt; i ++) {
if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) ||
- (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
+ (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
continue;
}
if (data.statfiles[i].total_hits > maxhits) {
@@ -335,8 +308,7 @@ bayes_classify (struct classifier_ctx * ctx,
}
}
if (selected_st == -1) {
- msg_err (
- "unexpected classifier error: cannot select desired statfile");
+ msg_err ("unexpected classifier error: cannot select desired statfile");
}
else {
/* Calculate ham probability correctly */
@@ -345,10 +317,7 @@ bayes_classify (struct classifier_ctx * ctx,
}
rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
cur = g_list_prepend (NULL, sumbuf);
- insert_result (task,
- data.statfiles[selected_st].st->symbol,
- final_prob,
- cur);
+ insert_result (task, data.statfiles[selected_st].st->symbol, final_prob, cur);
}
}
@@ -358,44 +327,34 @@ bayes_classify (struct classifier_ctx * ctx,
}
gboolean
-bayes_learn (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- const char *symbol,
- GTree *input,
- gboolean in_class,
- double *sum,
- double multiplier,
- GError **err)
+bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input,
+ gboolean in_class, double *sum, double multiplier, GError **err)
{
- struct bayes_callback_data data;
- gchar *value;
- gint nodes;
- gint minnodes;
- struct rspamd_statfile_config *st, *sel_st = NULL;
- stat_file_t *to_learn;
- GList *cur;
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes;
+ gint minnodes;
+ struct rspamd_statfile_config *st, *sel_st = NULL;
+ stat_file_t *to_learn;
+ GList *cur;
g_assert (pool != NULL);
g_assert (ctx != NULL);
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
}
if (nodes < minnodes) {
- msg_info (
- "do not learn message as it has too few tokens: %d, while %d min",
- nodes,
- minnodes);
+ msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes);
*sum = 0;
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, (int)minnodes);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, (int)minnodes);
return FALSE;
}
}
@@ -406,8 +365,7 @@ bayes_learn (struct classifier_ctx * ctx,
data.ctx = ctx;
data.processed_tokens = 0;
data.processed_tokens = 0;
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
minnodes = rspamd_config_parse_limit (value, -1);
data.max_tokens = minnodes;
}
@@ -426,36 +384,31 @@ bayes_learn (struct classifier_ctx * ctx,
}
if (sel_st == NULL) {
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "cannot find statfile for symbol: %s",
- symbol);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot find statfile for symbol: %s",
+ symbol);
return FALSE;
}
if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
- if ((to_learn =
- statfile_pool_open (pool, sel_st->path, sel_st->size,
- FALSE)) == NULL) {
+ if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s", sel_st->path);
if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
msg_err ("cannot create statfile %s", sel_st->path);
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- sel_st->path);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ sel_st->path);
return FALSE;
}
- if ((to_learn =
- statfile_pool_open (pool, sel_st->path, sel_st->size,
- FALSE)) == NULL) {
+ if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) {
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "cannot open statfile %s after creation",
- sel_st->path);
- msg_err ("cannot open statfile %s after creation",
- sel_st->path);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot open statfile %s after creation",
+ sel_st->path);
+ msg_err ("cannot open statfile %s after creation", sel_st->path);
return FALSE;
}
}
@@ -474,28 +427,22 @@ bayes_learn (struct classifier_ctx * ctx,
}
gboolean
-bayes_learn_spam (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task,
- gboolean is_spam,
- lua_State *L,
- GError **err)
+bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
+ GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err)
{
- struct bayes_callback_data data;
- gchar *value;
- gint nodes;
- gint minnodes;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
- GList *cur;
- gboolean skip_labels;
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes;
+ gint minnodes;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
+ GList *cur;
+ gboolean skip_labels;
g_assert (pool != NULL);
g_assert (ctx != NULL);
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
@@ -503,10 +450,10 @@ bayes_learn_spam (struct classifier_ctx * ctx,
}
if (nodes < minnodes) {
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, (int)minnodes);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, (int)minnodes);
return FALSE;
}
}
@@ -514,8 +461,7 @@ bayes_learn_spam (struct classifier_ctx * ctx,
cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
if (cur) {
skip_labels = FALSE;
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_list_free, cur);
+ rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
}
else {
/* Do not try to learn specific statfiles if pre callback returned nil */
@@ -529,8 +475,7 @@ bayes_learn_spam (struct classifier_ctx * ctx,
data.in_class = TRUE;
data.processed_tokens = 0;
- if (ctx->cfg->opts &&
- (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
minnodes = rspamd_config_parse_limit (value, -1);
data.max_tokens = minnodes;
}
@@ -546,28 +491,24 @@ bayes_learn_spam (struct classifier_ctx * ctx,
continue;
}
if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((file =
- statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s", st->path);
if (statfile_pool_create (pool, st->path, st->size) == -1) {
msg_err ("cannot create statfile %s", st->path);
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- st->path);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ st->path);
return FALSE;
}
- if ((file =
- statfile_pool_open (pool, st->path, st->size,
- FALSE)) == NULL) {
+ if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
g_set_error (err,
- bayes_error_quark (), /* error domain */
- 1, /* error code */
- "cannot open statfile %s after creation",
- st->path);
- msg_err ("cannot open statfile %s after creation",
- st->path);
+ bayes_error_quark(), /* error domain */
+ 1, /* error code */
+ "cannot open statfile %s after creation",
+ st->path);
+ msg_err ("cannot open statfile %s after creation", st->path);
return FALSE;
}
}
@@ -587,10 +528,7 @@ bayes_learn_spam (struct classifier_ctx * ctx,
}
GList *
-bayes_weights (struct classifier_ctx * ctx,
- statfile_pool_t *pool,
- GTree *input,
- struct rspamd_task *task)
+bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task)
{
/* This function is unimplemented with new normalizer */
return NULL;