source.dussan.org Git - rspamd.git/commitdiff
Start splitting statistic processing to separate stages.
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 2 Mar 2015 17:35:34 +0000 (17:35 +0000)
src/libserver/task.h
src/libstat/classifiers/bayes.c
src/libstat/stat_internal.h
src/libstat/stat_process.c

index ded241b317eb8fe17ec1f380168b66fbaf50705e..135e8bf9269ad9bc6d451463f344cd1bcff13a79 100644 (file)
@@ -143,6 +143,7 @@ struct rspamd_task {
        struct event_base *ev_base;                                 /**< Event base                                                                             */
 
        GThreadPool *classify_pool;                                 /**< A pool of classify threads                     */
+       gpointer classify_data;                                                                         /**< Opaque classifiers data                                            */
 
        struct {
                enum rspamd_metric_action action;                       /**< Action of pre filters                                                      */
index 7932ceb9e45d49d52a63c63a904fe9616f7ad94a..823f5eff9ad1baafc89430199e83d8354589990d 100644 (file)
@@ -151,55 +151,59 @@ bayes_classify (struct classifier_ctx * ctx,
        g_assert (rt != NULL);
        g_assert (rt->end_pos > rt->start_pos);
 
-       g_tree_foreach (input, bayes_classify_callback, rt);
-
-       if (rt->spam_prob == 0) {
-               final_prob = 0;
+       if (rt->stage == RSPAMD_STAT_STAGE_PRE) {
+               g_tree_foreach (input, bayes_classify_callback, rt);
        }
        else {
-               h = 1 - inv_chi_square (-2. * rt->spam_prob,
-                               2 * rt->processed_tokens);
-               s = 1 - inv_chi_square (-2. * rt->ham_prob,
-                               2 * rt->processed_tokens);
-               final_prob = (s + 1 - h) / 2.;
-               msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f",
-                               task->message_id, rt->ham_prob, h, rt->spam_prob, s);
-       }
 
-       if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
+               if (rt->spam_prob == 0) {
+                       final_prob = 0;
+               }
+               else {
+                       h = 1 - inv_chi_square (-2. * rt->spam_prob,
+                                       2 * rt->processed_tokens);
+                       s = 1 - inv_chi_square (-2. * rt->ham_prob,
+                                       2 * rt->processed_tokens);
+                       final_prob = (s + 1 - h) / 2.;
+                       msg_debug ("<%s> got ham prob %.2f -> %.2f and spam prob %.2f -> %.2f",
+                                       task->message_id, rt->ham_prob, h, rt->spam_prob, s);
+               }
+
+               if (rt->processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
 
-               sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
-               cur = g_list_first (rt->st_runtime);
+                       sumbuf = rspamd_mempool_alloc (task->task_pool, 32);
+                       cur = g_list_first (rt->st_runtime);
 
-               while (cur) {
-                       st = (struct rspamd_statfile_runtime *)cur->data;
+                       while (cur) {
+                               st = (struct rspamd_statfile_runtime *)cur->data;
 
-                       if ((final_prob < 0.5 && !st->st->is_spam) ||
-                               (final_prob > 0.5 && st->st->is_spam)) {
-                               if (st->total_hits > maxhits) {
-                                       maxhits = st->total_hits;
-                                       selected_st = st;
+                               if ((final_prob < 0.5 && !st->st->is_spam) ||
+                                               (final_prob > 0.5 && st->st->is_spam)) {
+                                       if (st->total_hits > maxhits) {
+                                               maxhits = st->total_hits;
+                                               selected_st = st;
+                                       }
                                }
-                       }
 
-                       cur = g_list_next (cur);
-               }
+                               cur = g_list_next (cur);
+                       }
 
-               if (selected_st == NULL) {
-                       msg_err (
-                               "unexpected classifier error: cannot select desired statfile");
-               }
-               else {
-                       /* Calculate ham probability correctly */
-                       if (final_prob < 0.5) {
-                               final_prob = 1. - final_prob;
+                       if (selected_st == NULL) {
+                               msg_err (
+                                       "unexpected classifier error: cannot select desired statfile");
+                       }
+                       else {
+                               /* Calculate ham probability correctly */
+                               if (final_prob < 0.5) {
+                                       final_prob = 1. - final_prob;
+                               }
+                               rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
+                               cur = g_list_prepend (NULL, sumbuf);
+                               rspamd_task_insert_result (task,
+                                               selected_st->st->symbol,
+                                               final_prob,
+                                               cur);
                        }
-                       rspamd_snprintf (sumbuf, 32, "%.2f%%", final_prob * 100.);
-                       cur = g_list_prepend (NULL, sumbuf);
-                       rspamd_task_insert_result (task,
-                               selected_st->st->symbol,
-                               final_prob,
-                               cur);
                }
        }
 
index 29bd937fbc42491b3cb8c6b1b0f196b3f9cd854c..0514048148998d0dea6cdda562fa99d988f3f1d6 100644 (file)
 #include "backends/backends.h"
 #include "learn_cache/learn_cache.h"
 
+enum stat_process_stage {            /* Stage stored in rspamd_classifier_runtime.stage before classify_func is called */
+       RSPAMD_STAT_STAGE_PRE = 0,    /**< First pass: bayes_classify only walks the input tokens with bayes_classify_callback */
+       RSPAMD_STAT_STAGE_POST        /**< Second pass: bayes_classify computes the final probability and inserts the result symbol */
+};
+
 struct rspamd_tokenizer_runtime {
        GTree *tokens;
        const gchar *name;
@@ -51,6 +56,7 @@ struct rspamd_classifier_runtime {
        struct rspamd_tokenizer_runtime *tok;
        double ham_prob;
        double spam_prob;
+       enum stat_process_stage stage;
        guint64 total_spam;
        guint64 total_ham;
        guint64 processed_tokens;
index 311eaa0ea5e0b8ad87c5f02980fd813e3071d48f..4cb0f42bb1885c3ce34fb12b3876c5e37d5928eb 100644 (file)
@@ -400,10 +400,27 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
 
        while (cur) {
                cl_run = (struct rspamd_classifier_runtime *)cur->data;
+               cl_run->stage = RSPAMD_STAT_STAGE_PRE;
 
                if (cl_run->cl) {
                        cl_ctx = cl_run->cl->init_func (task->task_pool, cl_run->clcf);
 
+                       if (cl_ctx != NULL) {
+                               cl_run->cl->classify_func (cl_ctx, cl_run->tok->tokens,
+                                               cl_run, task);
+                       }
+               }
+
+               cur = g_list_next (cur);
+       }
+
+       /* XXX: backend runtime post-processing */
+       /* Post-processing */
+       while (cur) {
+               cl_run = (struct rspamd_classifier_runtime *)cur->data;
+               cl_run->stage = RSPAMD_STAT_STAGE_POST;
+
+               if (cl_run->cl) {
                        if (cl_ctx != NULL) {
                                if (cl_run->cl->classify_func (cl_ctx, cl_run->tok->tokens,
                                                cl_run, task)) {