diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-11-23 01:23:50 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-11-23 01:23:50 +0000 |
commit | 70e79ce3e6885074a22d1526d9a2bfbbb6e5a5ea (patch) | |
tree | fb89d04a01d3bceb13d80af1f77e4ae0e6171d42 /src/libstat | |
parent | 7cc0bdd4c447238451b9ffb8e16f5d97a2b7e21e (diff) | |
download | rspamd-70e79ce3e6885074a22d1526d9a2bfbbb6e5a5ea.tar.gz rspamd-70e79ce3e6885074a22d1526d9a2bfbbb6e5a5ea.zip |
Improve debugging for bayes.
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/classifiers/bayes.c | 62 |
1 files changed, 51 insertions, 11 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 2a7ea2b89..966d5b458 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -32,6 +32,24 @@ #include "stat_internal.h" #include "math.h" +#define msg_err_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \ + "bayes", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_warn_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \ + "bayes", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_info_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \ + "bayes", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \ + "bayes", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + + static inline GQuark bayes_error_quark (void) { @@ -46,7 +64,7 @@ bayes_error_quark (void) * @return */ static gdouble -inv_chi_square (gdouble value, gint freedom_deg) +inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg) { double prob, sum, m; gint i; @@ -56,7 +74,7 @@ inv_chi_square (gdouble value, gint freedom_deg) prob = exp (value); if (errno == ERANGE) { - msg_err ("exp overflow"); + msg_err_bayes ("exp overflow"); return 0; } @@ -71,6 +89,11 @@ inv_chi_square (gdouble value, gint freedom_deg) return MIN (1.0, sum); } +struct bayes_task_closure { + struct rspamd_classifier_runtime *rt; + struct rspamd_task *task; +}; + static const double feature_weight[] = { 0, 3125, 256, 27, 4, 1 }; #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))) @@ -81,13 +104,18 @@ static gboolean bayes_classify_callback (gpointer key, gpointer value, gpointer data) { rspamd_token_t *node = value; - struct rspamd_classifier_runtime *rt = (struct rspamd_classifier_runtime *)data; + struct bayes_task_closure *cl = data; + struct rspamd_classifier_runtime *rt; guint i; struct rspamd_token_result *res; guint64 spam_count = 0, ham_count = 0, total_count = 0; + struct rspamd_task *task; double spam_prob, spam_freq, ham_freq, bayes_spam_prob, bayes_ham_prob, ham_prob, fw, w, norm_sum, norm_sub; + rt = cl->rt; + task = cl->task; + for (i = rt->start_pos; i < rt->end_pos; i++) { res = &g_array_index (node->results, struct rspamd_token_result, i); @@ -122,6 +150,15 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) rt->spam_prob += log (bayes_spam_prob); rt->ham_prob += log (bayes_ham_prob); res->cl_runtime->processed_tokens ++; + + msg_debug_bayes ("token: total_count: %L, spam_count: %L, ham_count: %L," + " spam_prob: %.3f, " + "ham_prob: %.3f, bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " + "current spam prob: %.3f, current ham prob: %.3f", + total_count, spam_count, ham_count, + spam_prob, ham_prob, + bayes_spam_prob, bayes_ham_prob, + rt->spam_prob, rt->ham_prob); } return FALSE; @@ -151,6 +188,7 @@ bayes_classify (struct classifier_ctx * ctx, struct rspamd_statfile_runtime *st, *selected_st = NULL; GList *cur; char *sumbuf; + struct bayes_task_closure cl; g_assert (ctx != NULL); g_assert (input != NULL); @@ -158,15 +196,17 @@ bayes_classify (struct classifier_ctx * ctx, g_assert (rt->end_pos > rt->start_pos); if (rt->stage == RSPAMD_STAT_STAGE_PRE) { - g_tree_foreach (input, bayes_classify_callback, rt); + cl.rt = rt; + cl.task = task; + g_tree_foreach (input, bayes_classify_callback, &cl); } else { - h = 1 - inv_chi_square (rt->spam_prob, rt->processed_tokens); - s = 1 - inv_chi_square (rt->ham_prob, rt->processed_tokens); + h = 1 - inv_chi_square (task, rt->spam_prob, rt->processed_tokens); + s = 1 - inv_chi_square (task, rt->ham_prob, rt->processed_tokens); if (isfinite (s) && isfinite (h)) { final_prob = (s + 1.0 - h) / 2.; - msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f," + msg_debug_bayes ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f," " %L tokens processed of %ud total tokens", task->message_id, rt->ham_prob, h, rt->spam_prob, s, rt->processed_tokens, g_tree_nnodes (input)); @@ -178,17 +218,17 @@ bayes_classify (struct classifier_ctx * ctx, */ if (isfinite (h)) { final_prob = 1.0; - msg_debug ("<%s> spam class is overflowed, as we have no" + msg_debug_bayes ("<%s> spam class is overflowed, as we have no" " ham samples", task->message_id); } else if (isfinite (s)){ final_prob = 0.0; - msg_debug ("<%s> ham class is overflowed, as we have no" + msg_debug_bayes ("<%s> ham class is overflowed, as we have no" " spam samples", task->message_id); } else { final_prob = 0.5; - msg_warn ("<%s> spam and ham classes are both overflowed", + msg_warn_bayes ("<%s> spam and ham classes are both overflowed", task->message_id); } } @@ -213,7 +253,7 @@ bayes_classify (struct classifier_ctx * ctx, } if (selected_st == NULL) { - msg_err ( + msg_err_bayes ( "unexpected classifier error: cannot select desired statfile, " "prob: %.4f", final_prob); } |