summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-23 12:18:04 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-01-23 12:18:04 +0000
commit 8f5509c65dc6907a7581518246a200236088423c (patch)
tree 448c0233a83ee25855999033c22d78b2297b64f5
parent 8969605e58c22b95ac4eb8783b6c4d045732810e (diff)
download rspamd-8f5509c65dc6907a7581518246a200236088423c.tar.gz
download rspamd-8f5509c65dc6907a7581518246a200236088423c.zip
Rework statistics runtime structures.
-rw-r--r-- src/libserver/protocol.c 2
-rw-r--r-- src/libstat/classifiers/bayes.c 12
-rw-r--r-- src/libstat/stat_api.h 29
-rw-r--r-- src/libstat/tokenizers.h 19
-rw-r--r-- src/libstat/tokenizers/osb.c 9
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 2
6 files changed, 43 insertions, 30 deletions
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index b3feda154..44bfe5a4e 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -773,6 +773,8 @@ rspamd_ucl_tolegacy_output (struct rspamd_task *task,
ucl_object_tostring (elt));
}
+ g_assert (ucl_object_todouble (score) < 1000.0);
+
iter = NULL;
while ((elt = ucl_iterate_object (metric, &iter, true)) != NULL) {
if (elt->type == UCL_OBJECT) {
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 06c549292..54db73d9e 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -64,7 +64,7 @@ struct bayes_callback_data {
static gboolean
bayes_learn_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
+ rspamd_token_t *node = key;
struct bayes_callback_data *cd = data;
gint c;
guint64 v;
@@ -144,7 +144,7 @@ static gboolean
bayes_classify_callback (gpointer key, gpointer value, gpointer data)
{
- token_node_t *node = key;
+ rspamd_token_t *node = key;
struct bayes_callback_data *cd = data;
guint i;
struct bayes_statfile_data *cur;
@@ -222,9 +222,7 @@ bayes_classify (struct classifier_ctx * ctx,
(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) {
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
+
if (nodes < minnodes) {
return FALSE;
}
@@ -331,9 +329,7 @@ bayes_learn_spam (struct classifier_ctx * ctx,
(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) {
minnodes = strtol (value, NULL, 10);
nodes = g_tree_nnodes (input);
- if (nodes > FEATURE_WINDOW_SIZE) {
- nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE;
- }
+
if (nodes < minnodes) {
g_set_error (err,
bayes_error_quark (), /* error domain */
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 0e2bf86b8..64b3f0b92 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -31,6 +31,35 @@
* High level statistics API
*/
+struct rspamd_statfile_runtime {
+ struct rspamd_statfile_config *st;
+ guint64 hits;
+ guint64 total_hits;
+};
+
+struct rspamd_classifier_runtime {
+ double ham_prob;
+ double spam_prob;
+ guint64 total_spam;
+ guint64 total_ham;
+ guint64 processed_tokens;
+ gsize max_tokens;
+};
+
+struct rspamd_token_result {
+ double value;
+ struct rspamd_statfile_runtime *st_runtime;
+
+ struct rspamd_classifier_runtime *cl_runtime;
+};
+
+#define RSPAMD_MAX_TOKEN_LEN 64
+typedef struct token_node_s {
+ guchar data[RSPAMD_MAX_TOKEN_LEN];
+ guint datalen;
+ GArray *results;
+} rspamd_token_t;
+
/**
* Initialise statistics modules
* @param cfg
diff --git a/src/libstat/tokenizers.h b/src/libstat/tokenizers.h
index 48f9b6e56..73d07a5c4 100644
--- a/src/libstat/tokenizers.h
+++ b/src/libstat/tokenizers.h
@@ -5,24 +5,7 @@
#include "mem_pool.h"
#include "fstring.h"
#include "main.h"
-
-/* Size for features pipe */
-#define FEATURE_WINDOW_SIZE 5
-#define MAX_DATA_LEN 64
-#define MAX_VALUES 32
-
-struct token_result {
- double value;
- struct rspamd_statfile_config *st;
- double *consolidated_value;
-};
-
-typedef struct token_node_s {
- guchar data[MAX_DATA_LEN];
- guint datalen;
- struct token_result *results;
- guint results_len;
-} token_node_t;
+#include "stat_api.h"
/* Common tokenizer structure */
struct tokenizer {
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 4016842b6..abf547f43 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -29,6 +29,9 @@
#include <sys/types.h>
#include "tokenizers.h"
+/* Size for features pipe */
+#define FEATURE_WINDOW_SIZE 5
+
/* Minimum length of token */
#define MIN_LEN 4
@@ -43,7 +46,7 @@ osb_tokenize_text (struct tokenizer *tokenizer,
gboolean is_utf,
GList *exceptions)
{
- token_node_t *new = NULL;
+ rspamd_token_t *new = NULL;
rspamd_fstring_t *token;
guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
gint i, processed = 0;
@@ -82,7 +85,7 @@ osb_tokenize_text (struct tokenizer *tokenizer,
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
h2 = hashpipe[0] * primes[1] + hashpipe[i] *
primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
+ new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
new->datalen = sizeof(gint32) * 2;
memcpy(new->data, &h1, sizeof(h1));
memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
@@ -98,7 +101,7 @@ osb_tokenize_text (struct tokenizer *tokenizer,
for (i = 1; i < processed; i++) {
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1];
- new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t));
+ new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t));
new->datalen = sizeof(gint32) * 2;
memcpy(new->data, &h1, sizeof(h1));
memcpy(new->data + sizeof(h1), &h2, sizeof(h2));
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 7d00f693a..10e4b92d5 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -92,7 +92,7 @@ rspamd_stat_get_tokenizer (const char *name)
int
token_node_compare_func (gconstpointer a, gconstpointer b)
{
- const token_node_t *aa = a, *bb = b;
+ const rspamd_token_t *aa = a, *bb = b;
if (aa->datalen != bb->datalen) {
return aa->datalen - bb->datalen;