From 8f5509c65dc6907a7581518246a200236088423c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 23 Jan 2015 12:18:04 +0000 Subject: Rework statistics runtime structures. --- src/libserver/protocol.c | 2 ++ src/libstat/classifiers/bayes.c | 12 ++++-------- src/libstat/stat_api.h | 29 +++++++++++++++++++++++++++++ src/libstat/tokenizers.h | 19 +------------------ src/libstat/tokenizers/osb.c | 9 ++++++--- src/libstat/tokenizers/tokenizers.c | 2 +- 6 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index b3feda154..44bfe5a4e 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -773,6 +773,8 @@ rspamd_ucl_tolegacy_output (struct rspamd_task *task, ucl_object_tostring (elt)); } + g_assert (ucl_object_todouble (score) < 1000.0); + iter = NULL; while ((elt = ucl_iterate_object (metric, &iter, true)) != NULL) { if (elt->type == UCL_OBJECT) { diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 06c549292..54db73d9e 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -64,7 +64,7 @@ struct bayes_callback_data { static gboolean bayes_learn_callback (gpointer key, gpointer value, gpointer data) { - token_node_t *node = key; + rspamd_token_t *node = key; struct bayes_callback_data *cd = data; gint c; guint64 v; @@ -144,7 +144,7 @@ static gboolean bayes_classify_callback (gpointer key, gpointer value, gpointer data) { - token_node_t *node = key; + rspamd_token_t *node = key; struct bayes_callback_data *cd = data; guint i; struct bayes_statfile_data *cur; @@ -222,9 +222,7 @@ bayes_classify (struct classifier_ctx * ctx, (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } + if (nodes < minnodes) { return FALSE; } @@ -331,9 +329,7 @@ 
bayes_learn_spam (struct classifier_ctx * ctx, (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { minnodes = strtol (value, NULL, 10); nodes = g_tree_nnodes (input); - if (nodes > FEATURE_WINDOW_SIZE) { - nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; - } + if (nodes < minnodes) { g_set_error (err, bayes_error_quark (), /* error domain */ diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 0e2bf86b8..64b3f0b92 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -31,6 +31,35 @@ * High level statistics API */ +struct rspamd_statfile_runtime { + struct rspamd_statfile_config *st; + guint64 hits; + guint64 total_hits; +}; + +struct rspamd_classifier_runtime { + double ham_prob; + double spam_prob; + guint64 total_spam; + guint64 total_ham; + guint64 processed_tokens; + gsize max_tokens; +}; + +struct rspamd_token_result { + double value; + struct rspamd_statfile_runtime *st_runtime; + + struct rspamd_classifier_runtime *cl_runtime; +}; + +#define RSPAMD_MAX_TOKEN_LEN 64 +typedef struct token_node_s { + guchar data[RSPAMD_MAX_TOKEN_LEN]; + guint datalen; + GArray *results; +} rspamd_token_t; + /** * Initialise statistics modules * @param cfg diff --git a/src/libstat/tokenizers.h b/src/libstat/tokenizers.h index 48f9b6e56..73d07a5c4 100644 --- a/src/libstat/tokenizers.h +++ b/src/libstat/tokenizers.h @@ -5,24 +5,7 @@ #include "mem_pool.h" #include "fstring.h" #include "main.h" - -/* Size for features pipe */ -#define FEATURE_WINDOW_SIZE 5 -#define MAX_DATA_LEN 64 -#define MAX_VALUES 32 - -struct token_result { - double value; - struct rspamd_statfile_config *st; - double *consolidated_value; -}; - -typedef struct token_node_s { - guchar data[MAX_DATA_LEN]; - guint datalen; - struct token_result *results; - guint results_len; -} token_node_t; +#include "stat_api.h" /* Common tokenizer structure */ struct tokenizer { diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 
4016842b6..abf547f43 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -29,6 +29,9 @@ #include <sys/types.h> #include "tokenizers.h" +/* Size for features pipe */ +#define FEATURE_WINDOW_SIZE 5 + /* Minimum length of token */ #define MIN_LEN 4 @@ -43,7 +46,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, gboolean is_utf, GList *exceptions) { - token_node_t *new = NULL; + rspamd_token_t *new = NULL; rspamd_fstring_t *token; guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; gint i, processed = 0; @@ -82,7 +85,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); new->datalen = sizeof(gint32) * 2; memcpy(new->data, &h1, sizeof(h1)); memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); @@ -98,7 +101,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, for (i = 1; i < processed; i++) { h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); + new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); new->datalen = sizeof(gint32) * 2; memcpy(new->data, &h1, sizeof(h1)); memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 7d00f693a..10e4b92d5 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -92,7 +92,7 @@ rspamd_stat_get_tokenizer (const char *name) int token_node_compare_func (gconstpointer a, gconstpointer b) { - const token_node_t *aa = a, *bb = b; + const rspamd_token_t *aa = a, *bb = b; if (aa->datalen != bb->datalen) { return aa->datalen - bb->datalen; -- cgit v1.2.3