ucl_object_tostring (elt)); | ucl_object_tostring (elt)); | ||||
} | } | ||||
g_assert (ucl_object_todouble (score) < 1000.0); | |||||
iter = NULL; | iter = NULL; | ||||
while ((elt = ucl_iterate_object (metric, &iter, true)) != NULL) { | while ((elt = ucl_iterate_object (metric, &iter, true)) != NULL) { | ||||
if (elt->type == UCL_OBJECT) { | if (elt->type == UCL_OBJECT) { |
static gboolean | static gboolean | ||||
bayes_learn_callback (gpointer key, gpointer value, gpointer data) | bayes_learn_callback (gpointer key, gpointer value, gpointer data) | ||||
{ | { | ||||
token_node_t *node = key; | |||||
rspamd_token_t *node = key; | |||||
struct bayes_callback_data *cd = data; | struct bayes_callback_data *cd = data; | ||||
gint c; | gint c; | ||||
guint64 v; | guint64 v; | ||||
bayes_classify_callback (gpointer key, gpointer value, gpointer data) | bayes_classify_callback (gpointer key, gpointer value, gpointer data) | ||||
{ | { | ||||
token_node_t *node = key; | |||||
rspamd_token_t *node = key; | |||||
struct bayes_callback_data *cd = data; | struct bayes_callback_data *cd = data; | ||||
guint i; | guint i; | ||||
struct bayes_statfile_data *cur; | struct bayes_statfile_data *cur; | ||||
(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { | (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { | ||||
minnodes = strtol (value, NULL, 10); | minnodes = strtol (value, NULL, 10); | ||||
nodes = g_tree_nnodes (input); | nodes = g_tree_nnodes (input); | ||||
if (nodes > FEATURE_WINDOW_SIZE) { | |||||
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; | |||||
} | |||||
if (nodes < minnodes) { | if (nodes < minnodes) { | ||||
return FALSE; | return FALSE; | ||||
} | } | ||||
(value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { | (value = g_hash_table_lookup (ctx->cfg->opts, "min_tokens")) != NULL) { | ||||
minnodes = strtol (value, NULL, 10); | minnodes = strtol (value, NULL, 10); | ||||
nodes = g_tree_nnodes (input); | nodes = g_tree_nnodes (input); | ||||
if (nodes > FEATURE_WINDOW_SIZE) { | |||||
nodes = nodes / FEATURE_WINDOW_SIZE + FEATURE_WINDOW_SIZE; | |||||
} | |||||
if (nodes < minnodes) { | if (nodes < minnodes) { | ||||
g_set_error (err, | g_set_error (err, | ||||
bayes_error_quark (), /* error domain */ | bayes_error_quark (), /* error domain */ |
* High level statistics API | * High level statistics API | ||||
*/ | */ | ||||
/*
 * Runtime record for a single statfile: a pointer to the statfile's
 * configuration plus two 64-bit counters.
 * NOTE(review): how `hits` and `total_hits` are updated is not visible in
 * this chunk - confirm against the classifier code.
 */
struct rspamd_statfile_runtime { | |||||
struct rspamd_statfile_config *st; | |||||
guint64 hits; | |||||
guint64 total_hits; | |||||
}; | |||||
/*
 * Runtime record for a classifier: two double-precision probabilities
 * (ham and spam), two 64-bit totals (spam and ham), a 64-bit counter of
 * processed tokens and a `max_tokens` size value.
 * NOTE(review): presumably `max_tokens` caps `processed_tokens`; the code
 * that enforces this is not visible here - confirm.
 */
struct rspamd_classifier_runtime { | |||||
double ham_prob; | |||||
double spam_prob; | |||||
guint64 total_spam; | |||||
guint64 total_ham; | |||||
guint64 processed_tokens; | |||||
gsize max_tokens; | |||||
}; | |||||
/*
 * Result attached to a token: a double value together with pointers to
 * the statfile runtime and the classifier runtime records.
 * NOTE(review): ownership of the two pointers is not shown in this chunk.
 */
struct rspamd_token_result { | |||||
double value; | |||||
struct rspamd_statfile_runtime *st_runtime; | |||||
struct rspamd_classifier_runtime *cl_runtime; | |||||
}; | |||||
/* Capacity, in bytes, of the inline `data` buffer of rspamd_token_t */
#define RSPAMD_MAX_TOKEN_LEN 64 | |||||
/*
 * A single token.
 * `data` is a fixed-size inline buffer and `datalen` is the number of
 * bytes of it in use; the tokenizer code visible in this file stores two
 * 32-bit hashes there and sets `datalen` to sizeof(gint32) * 2.
 * token_node_compare_func compares `datalen` first.
 * NOTE(review): `results` is presumably a GArray of
 * struct rspamd_token_result - confirm against the code that fills it.
 */
typedef struct token_node_s { | |||||
guchar data[RSPAMD_MAX_TOKEN_LEN]; | |||||
guint datalen; | |||||
GArray *results; | |||||
} rspamd_token_t; | |||||
/** | /** | ||||
* Initialise statistics modules | * Initialise statistics modules | ||||
* @param cfg | * @param cfg |
#include "mem_pool.h" | #include "mem_pool.h" | ||||
#include "fstring.h" | #include "fstring.h" | ||||
#include "main.h" | #include "main.h" | ||||
/* Size for features pipe */ | |||||
#define FEATURE_WINDOW_SIZE 5 | |||||
#define MAX_DATA_LEN 64 | |||||
#define MAX_VALUES 32 | |||||
/*
 * Result attached to a token: a double value, a pointer to a statfile
 * configuration and a pointer to a double named `consolidated_value`.
 * NOTE(review): what `consolidated_value` points at is not visible in
 * this chunk - confirm.
 */
struct token_result { | |||||
double value; | |||||
struct rspamd_statfile_config *st; | |||||
double *consolidated_value; | |||||
}; | |||||
/*
 * A single token.
 * `data` is a fixed-size inline buffer of MAX_DATA_LEN bytes and
 * `datalen` is the number of bytes of it in use; the tokenizer code
 * visible in this file stores two 32-bit hashes there and sets `datalen`
 * to sizeof(gint32) * 2.
 * NOTE(review): `results_len` is presumably the element count of
 * `results` - confirm against the code that allocates it.
 */
typedef struct token_node_s { | |||||
guchar data[MAX_DATA_LEN]; | |||||
guint datalen; | |||||
struct token_result *results; | |||||
guint results_len; | |||||
} token_node_t; | |||||
#include "stat_api.h" | |||||
/* Common tokenizer structure */ | /* Common tokenizer structure */ | ||||
struct tokenizer { | struct tokenizer { |
#include <sys/types.h> | #include <sys/types.h> | ||||
#include "tokenizers.h" | #include "tokenizers.h" | ||||
/* Size for features pipe */ | |||||
#define FEATURE_WINDOW_SIZE 5 | |||||
/* Minimum length of token */ | /* Minimum length of token */ | ||||
#define MIN_LEN 4 | #define MIN_LEN 4 | ||||
gboolean is_utf, | gboolean is_utf, | ||||
GList *exceptions) | GList *exceptions) | ||||
{ | { | ||||
token_node_t *new = NULL; | |||||
rspamd_token_t *new = NULL; | |||||
rspamd_fstring_t *token; | rspamd_fstring_t *token; | ||||
guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; | guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; | ||||
gint i, processed = 0; | gint i, processed = 0; | ||||
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; | h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; | ||||
h2 = hashpipe[0] * primes[1] + hashpipe[i] * | h2 = hashpipe[0] * primes[1] + hashpipe[i] * | ||||
primes[(i << 1) - 1]; | primes[(i << 1) - 1]; | ||||
new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); | |||||
new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); | |||||
new->datalen = sizeof(gint32) * 2; | new->datalen = sizeof(gint32) * 2; | ||||
memcpy(new->data, &h1, sizeof(h1)); | memcpy(new->data, &h1, sizeof(h1)); | ||||
memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); | memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); | ||||
for (i = 1; i < processed; i++) { | for (i = 1; i < processed; i++) { | ||||
h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; | h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; | ||||
h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; | h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; | ||||
new = rspamd_mempool_alloc0 (pool, sizeof (token_node_t)); | |||||
new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_token_t)); | |||||
new->datalen = sizeof(gint32) * 2; | new->datalen = sizeof(gint32) * 2; | ||||
memcpy(new->data, &h1, sizeof(h1)); | memcpy(new->data, &h1, sizeof(h1)); | ||||
memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); | memcpy(new->data + sizeof(h1), &h2, sizeof(h2)); |
int | int | ||||
token_node_compare_func (gconstpointer a, gconstpointer b) | token_node_compare_func (gconstpointer a, gconstpointer b) | ||||
{ | { | ||||
const token_node_t *aa = a, *bb = b; | |||||
const rspamd_token_t *aa = a, *bb = b; | |||||
if (aa->datalen != bb->datalen) { | if (aa->datalen != bb->datalen) { | ||||
return aa->datalen - bb->datalen; | return aa->datalen - bb->datalen; |