aboutsummaryrefslogtreecommitdiffstats
path: root/src/classifiers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-07-23 12:45:28 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-07-23 12:45:28 +0100
commit e0483657ff6cf1adc828ccce457814d61fe90a0d (patch)
tree 5183e4163f40b81b3e7d5f51488d360883782154 /src/classifiers
parent 7962087e808fb824aa3af6d41d02abc92916ba1e (diff)
download rspamd-e0483657ff6cf1adc828ccce457814d61fe90a0d.tar.gz
download rspamd-e0483657ff6cf1adc828ccce457814d61fe90a0d.zip
Unify code style.
Diffstat (limited to 'src/classifiers')
-rw-r--r-- src/classifiers/bayes.c 322
-rw-r--r-- src/classifiers/classifiers.c 38
-rw-r--r-- src/classifiers/classifiers.h 90
-rw-r--r-- src/classifiers/winnow.c 388
4 files changed, 517 insertions, 321 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index a8a18f5ff..2d3fca084 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -25,13 +25,13 @@
/*
* Bayesian classifier
*/
+#include "binlog.h"
+#include "cfg_file.h"
#include "classifiers.h"
-#include "tokenizers/tokenizers.h"
-#include "main.h"
#include "filter.h"
-#include "cfg_file.h"
-#include "binlog.h"
#include "lua/lua_common.h"
+#include "main.h"
+#include "tokenizers/tokenizers.h"
#define LOCAL_PROB_DENOM 16.0
@@ -42,56 +42,68 @@ bayes_error_quark (void)
}
struct bayes_statfile_data {
- guint64 hits;
- guint64 total_hits;
- double value;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
+ guint64 hits;
+ guint64 total_hits;
+ double value;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
};
struct bayes_callback_data {
- statfile_pool_t *pool;
- struct classifier_ctx *ctx;
- gboolean in_class;
- time_t now;
- stat_file_t *file;
- struct bayes_statfile_data *statfiles;
- guint32 statfiles_num;
- guint64 total_spam;
- guint64 total_ham;
- guint64 processed_tokens;
- gsize max_tokens;
- double spam_probability;
- double ham_probability;
+ statfile_pool_t *pool;
+ struct classifier_ctx *ctx;
+ gboolean in_class;
+ time_t now;
+ stat_file_t *file;
+ struct bayes_statfile_data *statfiles;
+ guint32 statfiles_num;
+ guint64 total_spam;
+ guint64 total_ham;
+ guint64 processed_tokens;
+ gsize max_tokens;
+ double spam_probability;
+ double ham_probability;
};
-static gboolean
+static gboolean
bayes_learn_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
- struct bayes_callback_data *cd = data;
- gint c;
- guint64 v;
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ gint c;
+ guint64 v;
c = (cd->in_class) ? 1 : -1;
/* Consider that not found blocks have value 1 */
- v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
+ v =
+ statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
+ cd->now);
if (v == 0 && c > 0) {
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
- cd->processed_tokens ++;
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ c);
+ cd->processed_tokens++;
}
else if (v != 0) {
if (G_LIKELY (c > 0)) {
- v ++;
+ v++;
}
- else if (c < 0){
+ else if (c < 0) {
if (v != 0) {
- v --;
+ v--;
}
}
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v);
- cd->processed_tokens ++;
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ v);
+ cd->processed_tokens++;
}
if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
@@ -133,7 +145,7 @@ inv_chi_square (gdouble value, gint freedom_deg)
return 0;
}
sum = prob;
- for (i = 1; i < freedom_deg / 2; i ++) {
+ for (i = 1; i < freedom_deg / 2; i++) {
prob *= value / (gdouble)i;
sum += prob;
}
@@ -148,16 +160,20 @@ static gboolean
bayes_classify_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
- struct bayes_callback_data *cd = data;
- guint i;
- struct bayes_statfile_data *cur;
- guint64 spam_count = 0, ham_count = 0, total_count = 0;
- double spam_prob, spam_freq, ham_freq, bayes_spam_prob;
+ token_node_t *node = key;
+ struct bayes_callback_data *cd = data;
+ guint i;
+ struct bayes_statfile_data *cur;
+ guint64 spam_count = 0, ham_count = 0, total_count = 0;
+ double spam_prob, spam_freq, ham_freq, bayes_spam_prob;
- for (i = 0; i < cd->statfiles_num; i ++) {
+ for (i = 0; i < cd->statfiles_num; i++) {
cur = &cd->statfiles[i];
- cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now);
+ cur->value = statfile_pool_get_block (cd->pool,
+ cur->file,
+ node->h1,
+ node->h2,
+ cd->now);
if (cur->value > 0) {
cur->total_hits += cur->value;
if (cur->st->is_spam) {
@@ -178,7 +194,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
bayes_spam_prob = (0.5 + spam_prob * total_count) / (1. + total_count);
cd->spam_probability += log (bayes_spam_prob);
cd->ham_probability += log (1. - bayes_spam_prob);
- cd->processed_tokens ++;
+ cd->processed_tokens++;
}
if (cd->max_tokens != 0 && cd->processed_tokens > cd->max_tokens) {
@@ -189,10 +205,11 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
return FALSE;
}
-struct classifier_ctx*
+struct classifier_ctx *
bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
{
- struct classifier_ctx *ctx = rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
+ struct classifier_ctx *ctx =
+ rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
ctx->pool = pool;
ctx->cfg = cfg;
@@ -202,23 +219,28 @@ bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cfg)
}
gboolean
-bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task, lua_State *L)
+bayes_classify (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ lua_State *L)
{
- struct bayes_callback_data data;
- gchar *value;
- gint nodes, i = 0, selected_st = -1, cnt;
- gint minnodes;
- guint64 maxhits = 0, rev;
- double final_prob, h, s;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
- GList *cur;
- char *sumbuf;
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes, i = 0, selected_st = -1, cnt;
+ gint minnodes;
+ guint64 maxhits = 0, rev;
+ double final_prob, h, s;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
+ GList *cur;
+ char *sumbuf;
g_assert (pool != NULL);
g_assert (ctx != NULL);
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
@@ -231,7 +253,8 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
if (cur) {
- rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t)g_list_free, cur);
}
else {
cur = ctx->cfg->statfiles;
@@ -248,7 +271,8 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
data.ham_probability = 0;
data.total_ham = 0;
data.total_spam = 0;
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
minnodes = rspamd_config_parse_limit (value, -1);
data.max_tokens = minnodes;
}
@@ -260,10 +284,11 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
/* Select statfile to classify */
st = cur->data;
if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((file =
+ statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s", st->path);
cur = g_list_next (cur);
- data.statfiles_num --;
+ data.statfiles_num--;
continue;
}
}
@@ -278,7 +303,7 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
}
cur = g_list_next (cur);
- i ++;
+ i++;
}
cnt = i;
@@ -289,17 +314,19 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
final_prob = 0;
}
else {
- h = 1 - inv_chi_square (-2. * data.spam_probability, 2 * data.processed_tokens);
- s = 1 - inv_chi_square (-2. * data.ham_probability, 2 * data.processed_tokens);
+ h = 1 - inv_chi_square (-2. * data.spam_probability,
+ 2 * data.processed_tokens);
+ s = 1 - inv_chi_square (-2. * data.ham_probability,
+ 2 * data.processed_tokens);
final_prob = (s + 1 - h) / 2.;
}
if (data.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
- for (i = 0; i < cnt; i ++) {
+ for (i = 0; i < cnt; i++) {
if ((final_prob > 0.5 && !data.statfiles[i].st->is_spam) ||
- (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
+ (final_prob < 0.5 && data.statfiles[i].st->is_spam)) {
continue;
}
if (data.statfiles[i].total_hits > maxhits) {
@@ -308,7 +335,8 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
}
}
if (selected_st == -1) {
- msg_err ("unexpected classifier error: cannot select desired statfile");
+ msg_err (
+ "unexpected classifier error: cannot select desired statfile");
}
else {
/* Calculate ham probability correctly */
@@ -317,7 +345,10 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
}
rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
cur = g_list_prepend (NULL, sumbuf);
- insert_result (task, data.statfiles[selected_st].st->symbol, final_prob, cur);
+ insert_result (task,
+ data.statfiles[selected_st].st->symbol,
+ final_prob,
+ cur);
}
}
@@ -327,34 +358,44 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
}
gboolean
-bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input,
- gboolean in_class, double *sum, double multiplier, GError **err)
+bayes_learn (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ const char *symbol,
+ GTree *input,
+ gboolean in_class,
+ double *sum,
+ double multiplier,
+ GError **err)
{
- struct bayes_callback_data data;
- gchar *value;
- gint nodes;
- gint minnodes;
- struct rspamd_statfile_config *st, *sel_st = NULL;
- stat_file_t *to_learn;
- GList *cur;
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes;
+ gint minnodes;
+ struct rspamd_statfile_config *st, *sel_st = NULL;
+ stat_file_t *to_learn;
+ GList *cur;
g_assert (pool != NULL);
g_assert (ctx != NULL);
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
}
if (nodes < minnodes) {
- msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes);
+ msg_info (
+ "do not learn message as it has too few tokens: %d, while %d min",
+ nodes,
+ minnodes);
*sum = 0;
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, (int)minnodes);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, (int)minnodes);
return FALSE;
}
}
@@ -365,7 +406,8 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
data.ctx = ctx;
data.processed_tokens = 0;
data.processed_tokens = 0;
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
minnodes = rspamd_config_parse_limit (value, -1);
data.max_tokens = minnodes;
}
@@ -384,31 +426,36 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
}
if (sel_st == NULL) {
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "cannot find statfile for symbol: %s",
- symbol);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot find statfile for symbol: %s",
+ symbol);
return FALSE;
}
if ((to_learn = statfile_pool_is_open (pool, sel_st->path)) == NULL) {
- if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) {
+ if ((to_learn =
+ statfile_pool_open (pool, sel_st->path, sel_st->size,
+ FALSE)) == NULL) {
msg_warn ("cannot open %s", sel_st->path);
if (statfile_pool_create (pool, sel_st->path, sel_st->size) == -1) {
msg_err ("cannot create statfile %s", sel_st->path);
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- sel_st->path);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ sel_st->path);
return FALSE;
}
- if ((to_learn = statfile_pool_open (pool, sel_st->path, sel_st->size, FALSE)) == NULL) {
+ if ((to_learn =
+ statfile_pool_open (pool, sel_st->path, sel_st->size,
+ FALSE)) == NULL) {
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "cannot open statfile %s after creation",
- sel_st->path);
- msg_err ("cannot open statfile %s after creation", sel_st->path);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot open statfile %s after creation",
+ sel_st->path);
+ msg_err ("cannot open statfile %s after creation",
+ sel_st->path);
return FALSE;
}
}
@@ -427,22 +474,28 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
}
gboolean
-bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err)
+bayes_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err)
{
- struct bayes_callback_data data;
- gchar *value;
- gint nodes;
- gint minnodes;
- struct rspamd_statfile_config *st;
- stat_file_t *file;
- GList *cur;
- gboolean skip_labels;
+ struct bayes_callback_data data;
+ gchar *value;
+ gint nodes;
+ gint minnodes;
+ struct rspamd_statfile_config *st;
+ stat_file_t *file;
+ GList *cur;
+ gboolean skip_labels;
g_assert (pool != NULL);
g_assert (ctx != NULL);
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
@@ -450,10 +503,10 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
}
if (nodes < minnodes) {
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, (int)minnodes);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, (int)minnodes);
return FALSE;
}
}
@@ -461,7 +514,8 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
if (cur) {
skip_labels = FALSE;
- rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t)g_list_free, cur);
}
else {
/* Do not try to learn specific statfiles if pre callback returned nil */
@@ -475,7 +529,8 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
data.in_class = TRUE;
data.processed_tokens = 0;
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
minnodes = rspamd_config_parse_limit (value, -1);
data.max_tokens = minnodes;
}
@@ -491,24 +546,28 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
continue;
}
if ((file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((file =
+ statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s", st->path);
if (statfile_pool_create (pool, st->path, st->size) == -1) {
msg_err ("cannot create statfile %s", st->path);
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- st->path);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ st->path);
return FALSE;
}
- if ((file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((file =
+ statfile_pool_open (pool, st->path, st->size,
+ FALSE)) == NULL) {
g_set_error (err,
- bayes_error_quark(), /* error domain */
- 1, /* error code */
- "cannot open statfile %s after creation",
- st->path);
- msg_err ("cannot open statfile %s after creation", st->path);
+ bayes_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot open statfile %s after creation",
+ st->path);
+ msg_err ("cannot open statfile %s after creation",
+ st->path);
return FALSE;
}
}
@@ -528,7 +587,10 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
}
GList *
-bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task)
+bayes_weights (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task)
{
/* This function is unimplemented with new normalizer */
return NULL;
diff --git a/src/classifiers/classifiers.c b/src/classifiers/classifiers.c
index fb294379c..95dd52c44 100644
--- a/src/classifiers/classifiers.c
+++ b/src/classifiers/classifiers.c
@@ -28,29 +28,29 @@
#include "classifiers.h"
-struct classifier classifiers[] = {
- {
- .name = "winnow",
- .init_func = winnow_init,
- .classify_func = winnow_classify,
- .learn_func = winnow_learn,
- .learn_spam_func = winnow_learn_spam,
- .weights_func = winnow_weights
- },
- {
- .name = "bayes",
- .init_func = bayes_init,
- .classify_func = bayes_classify,
- .learn_func = bayes_learn,
- .learn_spam_func = bayes_learn_spam,
- .weights_func = bayes_weights
- }
+struct classifier classifiers[] = {
+ {
+ .name = "winnow",
+ .init_func = winnow_init,
+ .classify_func = winnow_classify,
+ .learn_func = winnow_learn,
+ .learn_spam_func = winnow_learn_spam,
+ .weights_func = winnow_weights
+ },
+ {
+ .name = "bayes",
+ .init_func = bayes_init,
+ .classify_func = bayes_classify,
+ .learn_func = bayes_learn,
+ .learn_spam_func = bayes_learn_spam,
+ .weights_func = bayes_weights
+ }
};
-struct classifier *
+struct classifier *
get_classifier (const char *name)
{
- guint i;
+ guint i;
for (i = 0; i < sizeof (classifiers) / sizeof (classifiers[0]); i++) {
if (strcmp (classifiers[i].name, name) == 0) {
diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h
index 2b36d8c02..8e59fc555 100644
--- a/src/classifiers/classifiers.h
+++ b/src/classifiers/classifiers.h
@@ -28,36 +28,80 @@ struct classify_weight {
/* Common classifier structure */
struct classifier {
char *name;
- struct classifier_ctx* (*init_func)(rspamd_mempool_t *pool, struct rspamd_classifier_config *cf);
- gboolean (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task, lua_State *L);
- gboolean (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- const char *symbol, GTree *input, gboolean in_class,
- double *sum, double multiplier, GError **err);
- gboolean (*learn_spam_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err);
- GList* (*weights_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task);
+ struct classifier_ctx * (*init_func)(rspamd_mempool_t *pool,
+ struct rspamd_classifier_config *cf);
+ gboolean (*classify_func)(struct classifier_ctx * ctx,
+ statfile_pool_t *pool, GTree *input, struct rspamd_task *task,
+ lua_State *L);
+ gboolean (*learn_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
+ const char *symbol, GTree *input, gboolean in_class,
+ double *sum, double multiplier, GError **err);
+ gboolean (*learn_spam_func)(struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L,
+ GError **err);
+ GList * (*weights_func)(struct classifier_ctx * ctx, statfile_pool_t *pool,
+ GTree *input, struct rspamd_task *task);
};
/* Get classifier structure by name or return NULL if this name is not found */
-struct classifier* get_classifier (const char *name);
+struct classifier * get_classifier (const char *name);
/* Winnow algorithm */
-struct classifier_ctx* winnow_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cf);
-gboolean winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task, lua_State *L);
-gboolean winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input,
- gboolean in_class, double *sum, double multiplier, GError **err);
-gboolean winnow_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err);
-GList *winnow_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task);
+struct classifier_ctx * winnow_init (rspamd_mempool_t *pool,
+ struct rspamd_classifier_config *cf);
+gboolean winnow_classify (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ lua_State *L);
+gboolean winnow_learn (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ const char *symbol,
+ GTree *input,
+ gboolean in_class,
+ double *sum,
+ double multiplier,
+ GError **err);
+gboolean winnow_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err);
+GList * winnow_weights (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task);
/* Bayes algorithm */
-struct classifier_ctx* bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier_config *cf);
-gboolean bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task, lua_State *L);
-gboolean bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symbol, GTree *input,
- gboolean in_class, double *sum, double multiplier, GError **err);
-gboolean bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err);
-GList *bayes_weights (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct rspamd_task *task);
+struct classifier_ctx * bayes_init (rspamd_mempool_t *pool,
+ struct rspamd_classifier_config *cf);
+gboolean bayes_classify (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ lua_State *L);
+gboolean bayes_learn (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ const char *symbol,
+ GTree *input,
+ gboolean in_class,
+ double *sum,
+ double multiplier,
+ GError **err);
+gboolean bayes_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err);
+GList * bayes_weights (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task);
/* Array of all defined classifiers */
extern struct classifier classifiers[];
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index 1fe8f16d2..85d8cfa20 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -26,12 +26,12 @@
* Winnow classifier
*/
+#include "cfg_file.h"
#include "classifiers.h"
-#include "tokenizers/tokenizers.h"
-#include "main.h"
#include "filter.h"
-#include "cfg_file.h"
#include "lua/lua_common.h"
+#include "main.h"
+#include "tokenizers/tokenizers.h"
#define WINNOW_PROMOTION 1.23
#define WINNOW_DEMOTION 0.83
@@ -51,40 +51,42 @@ winnow_error_quark (void)
}
struct winnow_callback_data {
- statfile_pool_t *pool;
- struct classifier_ctx *ctx;
- stat_file_t *file;
- stat_file_t *learn_file;
- long double sum;
- long double start;
- double multiplier;
- guint32 count;
- guint32 new_blocks;
- gboolean in_class;
- gboolean do_demote;
- gboolean fresh_run;
- time_t now;
+ statfile_pool_t *pool;
+ struct classifier_ctx *ctx;
+ stat_file_t *file;
+ stat_file_t *learn_file;
+ long double sum;
+ long double start;
+ double multiplier;
+ guint32 count;
+ guint32 new_blocks;
+ gboolean in_class;
+ gboolean do_demote;
+ gboolean fresh_run;
+ time_t now;
};
static const double max_common_weight = MAX_WEIGHT * WINNOW_DEMOTION;
-static gboolean
+static gboolean
winnow_classify_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
- struct winnow_callback_data *cd = data;
- double v;
+ token_node_t *node = key;
+ struct winnow_callback_data *cd = data;
+ double v;
/* Consider that not found blocks have value 1 */
- v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
+ v =
+ statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
+ cd->now);
if (fabs (v) > ALPHA) {
cd->sum += v;
}
else {
cd->sum += 1.0;
- cd->new_blocks ++;
+ cd->new_blocks++;
}
cd->count++;
@@ -92,24 +94,32 @@ winnow_classify_callback (gpointer key, gpointer value, gpointer data)
return FALSE;
}
-static gboolean
+static gboolean
winnow_learn_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
- struct winnow_callback_data *cd = data;
- double v, c;
-
- c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION / cd->multiplier;
+ token_node_t *node = key;
+ struct winnow_callback_data *cd = data;
+ double v, c;
+
+ c = (cd->in_class) ? WINNOW_PROMOTION * cd->multiplier : WINNOW_DEMOTION /
+ cd->multiplier;
/* Consider that not found blocks have value 1 */
- v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
+ v =
+ statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2,
+ cd->now);
if (fabs (v) < ALPHA) {
/* Block not found, insert new */
cd->start += 1;
if (cd->file == cd->learn_file) {
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ c);
node->value = c;
- cd->new_blocks ++;
+ cd->new_blocks++;
}
}
else {
@@ -119,18 +129,23 @@ winnow_learn_callback (gpointer key, gpointer value, gpointer data)
node->extra = 0;
}
else {
- node->extra ++;
+ node->extra++;
}
node->value = v;
-
+
if (node->extra > 1) {
- /*
+ /*
* Assume that this node is common for several statfiles, so
* decrease its weight proportianally
*/
if (node->value > max_common_weight) {
/* Static fluctuation */
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, 0.);
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ 0.);
node->value = 0.;
}
else if (node->value > WINNOW_PROMOTION * cd->multiplier) {
@@ -141,7 +156,7 @@ winnow_learn_callback (gpointer key, gpointer value, gpointer data)
node->value *= c;
}
else {
- /*
+ /*
* Too high token value that exists also in other
* statfiles, may be statistic error, so decrease it
* slightly
@@ -152,8 +167,13 @@ winnow_learn_callback (gpointer key, gpointer value, gpointer data)
else {
node->value = WINNOW_DEMOTION / cd->multiplier;
}
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, node->value);
- }
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ node->value);
+ }
}
else if (cd->file == cd->learn_file) {
/* New block or block that is in only one statfile */
@@ -164,12 +184,22 @@ winnow_learn_callback (gpointer key, gpointer value, gpointer data)
else {
node->value *= c;
}
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, node->value);
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ node->value);
}
else if (cd->do_demote) {
/* Demote blocks in file */
node->value *= WINNOW_DEMOTION / cd->multiplier;
- statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, node->value);
+ statfile_pool_set_block (cd->pool,
+ cd->file,
+ node->h1,
+ node->h2,
+ cd->now,
+ node->value);
}
}
@@ -181,10 +211,11 @@ winnow_learn_callback (gpointer key, gpointer value, gpointer data)
return FALSE;
}
-struct classifier_ctx *
+struct classifier_ctx *
winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
{
- struct classifier_ctx *ctx = rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
+ struct classifier_ctx *ctx =
+ rspamd_mempool_alloc (pool, sizeof (struct classifier_ctx));
ctx->pool = pool;
ctx->cfg = cfg;
@@ -193,14 +224,18 @@ winnow_init (rspamd_mempool_t * pool, struct rspamd_classifier_config *cfg)
}
gboolean
-winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * input, struct rspamd_task *task, lua_State *L)
+winnow_classify (struct classifier_ctx *ctx,
+ statfile_pool_t * pool,
+ GTree * input,
+ struct rspamd_task *task,
+ lua_State *L)
{
- struct winnow_callback_data data;
- char *sumbuf, *value;
- long double res = 0., max = 0.;
- GList *cur;
- struct rspamd_statfile_config *st, *sel = NULL;
- int nodes, minnodes;
+ struct winnow_callback_data data;
+ char *sumbuf, *value;
+ long double res = 0., max = 0.;
+ GList *cur;
+ struct rspamd_statfile_config *st, *sel = NULL;
+ int nodes, minnodes;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -208,22 +243,27 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp
data.pool = pool;
data.now = time (NULL);
data.ctx = ctx;
-
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
}
if (nodes < minnodes) {
- msg_info ("do not classify message as it has too few tokens: %d, while %d min", nodes, minnodes);
+ msg_info (
+ "do not classify message as it has too few tokens: %d, while %d min",
+ nodes,
+ minnodes);
return FALSE;
}
}
cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
if (cur) {
- rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, cur);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t)g_list_free, cur);
}
else {
cur = ctx->cfg->statfiles;
@@ -235,7 +275,8 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp
data.count = 0;
data.new_blocks = 0;
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((data.file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((data.file =
+ statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s, skip it", st->path);
cur = g_list_next (cur);
continue;
@@ -261,16 +302,16 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp
if (sel != NULL) {
#ifdef WITH_LUA
- max = call_classifier_post_callbacks (ctx->cfg, task, max, L);
+ max = call_classifier_post_callbacks (ctx->cfg, task, max, L);
#endif
#ifdef HAVE_TANHL
- max = tanhl (max);
+ max = tanhl (max);
#else
- /*
- * As some implementations of libm does not support tanhl, try to use
- * tanh
- */
- max = tanh ((double) max);
+ /*
+ * As some implementations of libm does not support tanhl, try to use
+ * tanh
+ */
+ max = tanh ((double) max);
#endif
sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
rspamd_snprintf (sumbuf, 32, "%.2F", max);
@@ -282,15 +323,18 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp
}
GList *
-winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * input, struct rspamd_task *task)
+winnow_weights (struct classifier_ctx *ctx,
+ statfile_pool_t * pool,
+ GTree * input,
+ struct rspamd_task *task)
{
- struct winnow_callback_data data;
- long double res = 0.;
- GList *cur, *resl = NULL;
- struct rspamd_statfile_config *st;
- struct classify_weight *w;
- char *value;
- int nodes, minnodes;
+ struct winnow_callback_data data;
+ long double res = 0.;
+ GList *cur, *resl = NULL;
+ struct rspamd_statfile_config *st;
+ struct classify_weight *w;
+ char *value;
+ int nodes, minnodes;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -299,25 +343,30 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
data.now = time (NULL);
data.ctx = ctx;
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
}
if (nodes < minnodes) {
- msg_info ("do not classify message as it has too few tokens: %d, while %d min", nodes, minnodes);
+ msg_info (
+ "do not classify message as it has too few tokens: %d, while %d min",
+ nodes,
+ minnodes);
return NULL;
}
}
-
+
cur = ctx->cfg->statfiles;
while (cur) {
st = cur->data;
data.sum = 0;
data.count = 0;
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
- if ((data.file = statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
+ if ((data.file =
+ statfile_pool_open (pool, st->path, st->size, FALSE)) == NULL) {
msg_warn ("cannot open %s, skip it", st->path);
cur = g_list_next (cur);
continue;
@@ -328,7 +377,9 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
g_tree_foreach (input, winnow_classify_callback, &data);
}
- w = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct classify_weight));
+ w =
+ rspamd_mempool_alloc0 (task->task_pool,
+ sizeof (struct classify_weight));
if (data.count != 0) {
res = data.sum / (double)data.count;
}
@@ -340,9 +391,10 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
resl = g_list_prepend (resl, w);
cur = g_list_next (cur);
}
-
+
if (resl != NULL) {
- rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t)g_list_free, resl);
+ rspamd_mempool_add_destructor (task->task_pool,
+ (rspamd_mempool_destruct_t)g_list_free, resl);
}
return resl;
@@ -351,21 +403,27 @@ winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inpu
gboolean
-winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *symbol,
- GTree * input, int in_class, double *sum, double multiplier, GError **err)
+winnow_learn (struct classifier_ctx *ctx,
+ statfile_pool_t *pool,
+ const char *symbol,
+ GTree * input,
+ int in_class,
+ double *sum,
+ double multiplier,
+ GError **err)
{
- struct winnow_callback_data data = {
+ struct winnow_callback_data data = {
.file = NULL,
.multiplier = multiplier
};
- char *value;
- int nodes, minnodes, iterations = 0;
- struct rspamd_statfile_config *st, *sel_st = NULL;
- stat_file_t *sel = NULL, *to_learn;
- long double res = 0., max = 0., start_value = 0., end_value = 0.;
- double learn_threshold = 0.0;
- GList *cur, *to_demote = NULL;
- gboolean force_learn = FALSE;
+ char *value;
+ int nodes, minnodes, iterations = 0;
+ struct rspamd_statfile_config *st, *sel_st = NULL;
+ stat_file_t *sel = NULL, *to_learn;
+ long double res = 0., max = 0., start_value = 0., end_value = 0.;
+ double learn_threshold = 0.0;
+ GList *cur, *to_demote = NULL;
+ gboolean force_learn = FALSE;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -376,29 +434,35 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
data.ctx = ctx;
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
if (nodes > FEATURE_WINDOW_SIZE) {
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
}
if (nodes < minnodes) {
- msg_info ("do not learn message as it has too few tokens: %d, while %d min", nodes, minnodes);
+ msg_info (
+ "do not learn message as it has too few tokens: %d, while %d min",
+ nodes,
+ minnodes);
if (sum != NULL) {
*sum = 0;
}
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "message contains too few tokens: %d, while min is %d",
- nodes, minnodes);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "message contains too few tokens: %d, while min is %d",
+ nodes, minnodes);
return FALSE;
}
}
- if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
+ if (ctx->cfg->opts &&
+ (value =
+ g_hash_table_lookup (ctx->cfg->opts, "learn_threshold")) != NULL) {
learn_threshold = strtod (value, NULL);
}
-
+
if (learn_threshold <= 1.0 && learn_threshold >= 0) {
/* Classify message and check target statfile score */
cur = ctx->cfg->statfiles;
@@ -406,24 +470,27 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
/* Open or create all statfiles inside classifier */
st = cur->data;
if (statfile_pool_is_open (pool, st->path) == NULL) {
- if (statfile_pool_open (pool, st->path, st->size, FALSE) == NULL) {
+ if (statfile_pool_open (pool, st->path, st->size,
+ FALSE) == NULL) {
msg_warn ("cannot open %s", st->path);
if (statfile_pool_create (pool, st->path, st->size) == -1) {
msg_err ("cannot create statfile %s", st->path);
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "cannot create statfile: %s",
- st->path);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot create statfile: %s",
+ st->path);
return FALSE;
}
- if (statfile_pool_open (pool, st->path, st->size, FALSE) == NULL) {
+ if (statfile_pool_open (pool, st->path, st->size,
+ FALSE) == NULL) {
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "open statfile %s after creation",
- st->path);
- msg_err ("cannot open statfile %s after creation", st->path);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "open statfile %s after creation",
+ st->path);
+ msg_err ("cannot open statfile %s after creation",
+ st->path);
return FALSE;
}
}
@@ -437,10 +504,10 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
if (sel_st == NULL) {
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "cannot find statfile for symbol %s",
- symbol);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "cannot find statfile for symbol %s",
+ symbol);
msg_err ("cannot find statfile for symbol %s", symbol);
return FALSE;
}
@@ -448,10 +515,10 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
to_learn = statfile_pool_is_open (pool, sel_st->path);
if (to_learn == NULL) {
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
- sel_st->path);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+ sel_st->path);
return FALSE;
}
/* Check target statfile */
@@ -477,10 +544,10 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
data.count = 0;
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
- st->path);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "statfile %s is not opened this maybe if your statfile pool is too small to handle all statfiles",
+ st->path);
return FALSE;
}
g_tree_foreach (input, winnow_classify_callback, &data);
@@ -498,18 +565,22 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
}
}
else {
- msg_err ("learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
+ msg_err (
+ "learn threshold is more than 1 or less than 0, so cannot do learn, please check your configuration");
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "bad learn_threshold setting: %.2f",
- learn_threshold);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "bad learn_threshold setting: %.2f",
+ learn_threshold);
return FALSE;
}
/* If to_demote list is empty this message is already classified correctly */
if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
- msg_info ("this message is already of class %s with threshold %.2f and weight %.2F",
- sel_st->symbol, learn_threshold, max);
+ msg_info (
+ "this message is already of class %s with threshold %.2f and weight %.2F",
+ sel_st->symbol,
+ learn_threshold,
+ max);
goto end;
}
data.learn_file = to_learn;
@@ -526,7 +597,8 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
return FALSE;
}
- if (to_demote != NULL && g_list_find (to_demote, data.file) != NULL) {
+ if (to_demote != NULL &&
+ g_list_find (to_demote, data.file) != NULL) {
data.do_demote = TRUE;
}
else {
@@ -557,48 +629,66 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
}
data.multiplier *= WINNOW_PROMOTION;
- msg_info ("learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f", iterations + 1, symbol,
- start_value, end_value, data.multiplier);
- } while ((in_class ? sel != to_learn : sel == to_learn) && iterations ++ < MAX_LEARN_ITERATIONS);
-
+ msg_info (
+ "learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f",
+ iterations + 1,
+ symbol,
+ start_value,
+ end_value,
+ data.multiplier);
+ } while ((in_class ? sel != to_learn : sel ==
+ to_learn) && iterations++ < MAX_LEARN_ITERATIONS);
+
if (iterations >= MAX_LEARN_ITERATIONS) {
- msg_warn ("learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G",
- sel_st->symbol, MAX_LEARN_ITERATIONS, max);
+ msg_warn (
+ "learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G",
+ sel_st->symbol,
+ MAX_LEARN_ITERATIONS,
+ max);
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "learning statfile %s was not fully successfull: iterations count is limited to %d",
- sel_st->symbol, MAX_LEARN_ITERATIONS);
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "learning statfile %s was not fully successfull: iterations count is limited to %d",
+ sel_st->symbol, MAX_LEARN_ITERATIONS);
return FALSE;
}
else {
- msg_info ("learned statfile %s successfully with %d iterations and sum %G", sel_st->symbol, iterations + 1, max);
+ msg_info (
+ "learned statfile %s successfully with %d iterations and sum %G",
+ sel_st->symbol,
+ iterations + 1,
+ max);
}
end:
if (sum) {
#ifdef HAVE_TANHL
- *sum = (double)tanhl (max);
+ *sum = (double)tanhl (max);
#else
- /*
- * As some implementations of libm does not support tanhl, try to use
- * tanh
- */
- *sum = tanh ((double) max);
+ /*
+ * As some implementations of libm does not support tanhl, try to use
+ * tanh
+ */
+ *sum = tanh ((double) max);
#endif
}
return TRUE;
}
gboolean
-winnow_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
- GTree *input, struct rspamd_task *task, gboolean is_spam, lua_State *L, GError **err)
+winnow_learn_spam (struct classifier_ctx * ctx,
+ statfile_pool_t *pool,
+ GTree *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ lua_State *L,
+ GError **err)
{
g_set_error (err,
- winnow_error_quark(), /* error domain */
- 1, /* error code */
- "learn spam is not supported for winnow"
- );
+ winnow_error_quark (), /* error domain */
+ 1, /* error code */
+ "learn spam is not supported for winnow"
+ );
return FALSE;
}