Start splitting statistics processing into separate stages.

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
diff --git a/src/libserver/task.h b/src/libserver/task.h

index ded241b317eb8fe17ec1f380168b66fbaf50705e..135e8bf9269ad9bc6d451463f344cd1bcff13a79 100644 (file)
--- a/src/libserver/task.h
+++ b/src/libserver/task.h
@@ -143,6 +143,7 @@ struct rspamd_task {
         struct event_base *ev_base;                                 /**< Event base                                                                             */
  
         GThreadPool *classify_pool;                                 /**< A pool of classify threads                     */
+       gpointer classify_data;                                                                         /**< Opaque classifiers data                                            */
  
         struct {
                 enum rspamd_metric_action action;                       /**< Action of pre filters                                                      */
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c

index 7932ceb9e45d49d52a63c63a904fe9616f7ad94a..823f5eff9ad1baafc89430199e83d8354589990d 100644 (file)
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -151,55 +151,59 @@ bayes_classify (struct classifier_ctx * ctx,
         g_assert (rt != NULL);
         g_assert (rt->end_pos > rt->start_pos);
  
-       g_tree_foreach (input, bayes_classify_callback, rt);
-
-       if (rt->spam_prob == 0) {
-               final_prob = 0;
+       if (rt->stage == RSPAMD_STAT_STAGE_PRE) {
+               g_tree_foreach (input, bayes_classify_callback, rt);
         }
         else {
-               h = 1 - inv_chi_square (-2. * rt->spam_prob,
-                               2 * rt->processed_tokens);
-               s = 1 - inv_chi_square (-2. * rt->ham_prob,
-                               2 * rt->processed_tokens);
-               final_prob = (s + 1 - h) / 2.;
-               msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f",
-                               task->message_id, rt->ham_prob, h, rt->spam_prob, s);
-       }
  
-       if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
+               if (rt->spam_prob == 0) {
+                       final_prob = 0;
+               }
+               else {
+                       h = 1 - inv_chi_square (-2. * rt->spam_prob,
+                                       2 * rt->processed_tokens);
+                       s = 1 - inv_chi_square (-2. * rt->ham_prob,
+                                       2 * rt->processed_tokens);
+                       final_prob = (s + 1 - h) / 2.;
+                       msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f",
+                                       task->message_id, rt->ham_prob, h, rt->spam_prob, s);
+               }
+
+               if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
  
-               sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
-               cur = g_list_first (rt->st_runtime);
+                       sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
+                       cur = g_list_first (rt->st_runtime);
  
-               while (cur) {
-                       st = (struct rspamd_statfile_runtime *)cur->data;
+                       while (cur) {
+                               st = (struct rspamd_statfile_runtime *)cur->data;
  
-                       if ((final_prob < 0.5 && !st->st->is_spam) ||
-                               (final_prob > 0.5 && st->st->is_spam)) {
-                               if (st->total_hits > maxhits) {
-                                       maxhits = st->total_hits;
-                                       selected_st = st;
+                               if ((final_prob < 0.5 && !st->st->is_spam) ||
+                                               (final_prob > 0.5 && st->st->is_spam)) {
+                                       if (st->total_hits > maxhits) {
+                                               maxhits = st->total_hits;
+                                               selected_st = st;
+                                       }
                                 }
-                       }
  
-                       cur = g_list_next (cur);
-               }
+                               cur = g_list_next (cur);
+                       }
  
-               if (selected_st == NULL) {
-                       msg_err (
-                               "unexpected classifier error: cannot select desired statfile");
-               }
-               else {
-                       /* Calculate ham probability correctly */
-                       if (final_prob < 0.5) {
-                               final_prob = 1. - final_prob;
+                       if (selected_st == NULL) {
+                               msg_err (
+                                       "unexpected classifier error: cannot select desired statfile");
+                       }
+                       else {
+                               /* Calculate ham probability correctly */
+                               if (final_prob < 0.5) {
+                                       final_prob = 1. - final_prob;
+                               }
+                               rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
+                               cur = g_list_prepend (NULL, sumbuf);
+                               rspamd_task_insert_result (task,
+                                               selected_st->st->symbol,
+                                               final_prob,
+                                               cur);
                         }
-                       rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
-                       cur = g_list_prepend (NULL, sumbuf);
-                       rspamd_task_insert_result (task,
-                               selected_st->st->symbol,
-                               final_prob,
-                               cur);
                 }
         }
  
diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h

index 29bd937fbc42491b3cb8c6b1b0f196b3f9cd854c..0514048148998d0dea6cdda562fa99d988f3f1d6 100644 (file)
--- a/src/libstat/stat_internal.h
+++ b/src/libstat/stat_internal.h
@@ -30,6 +30,11 @@
  #include "backends/backends.h"
  #include "learn_cache/learn_cache.h"
  
+enum stat_process_stage {
+       RSPAMD_STAT_STAGE_PRE = 0,
+       RSPAMD_STAT_STAGE_POST
+};
+
  struct rspamd_tokenizer_runtime {
         GTree *tokens;
         const gchar *name;
@@ -51,6 +56,7 @@ struct rspamd_classifier_runtime {
         struct rspamd_tokenizer_runtime *tok;
         double ham_prob;
         double spam_prob;
+       enum stat_process_stage stage;
         guint64 total_spam;
         guint64 total_ham;
         guint64 processed_tokens;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index 311eaa0ea5e0b8ad87c5f02980fd813e3071d48f..4cb0f42bb1885c3ce34fb12b3876c5e37d5928eb 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -400,10 +400,27 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
  
         while (cur) {
                 cl_run = (struct rspamd_classifier_runtime *)cur->data;
+               cl_run->stage = RSPAMD_STAT_STAGE_PRE;
  
                 if (cl_run->cl) {
                         cl_ctx = cl_run->cl->init_func (task->task_pool, cl_run->clcf);
  
+                       if (cl_ctx != NULL) {
+                               cl_run->cl->classify_func (cl_ctx, cl_run->tok->tokens,
+                                               cl_run, task);
+                       }
+               }
+
+               cur = g_list_next (cur);
+       }
+
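+       /* XXX: backend runtime post-processing. NOTE(review): cur is NULL once the loop above finishes, so it must be rewound to the list head or the loop below never runs */
+       /* Post-processing */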
+       while (cur) {
+               cl_run = (struct rspamd_classifier_runtime *)cur->data;
+               cl_run->stage = RSPAMD_STAT_STAGE_POST;
+
+               if (cl_run->cl) {
                         if (cl_ctx != NULL) {
                                 if (cl_run->cl->classify_func (cl_ctx, cl_run->tok->tokens,
                                                 cl_run, task)) {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
src/libserver/task.h		patch \| blob \| history
src/libstat/classifiers/bayes.c		patch \| blob \| history
src/libstat/stat_internal.h		patch \| blob \| history
src/libstat/stat_process.c		patch \| blob \| history