From 683b90f4c6c744557f7429ce6ff77c0f7d2175e1 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 6 May 2011 19:18:40 +0400 Subject: * Major cleanup of cmake build system * Add initial version of statshow utility for statfiles debugging * Add debugging for statistics * Remove unused utilities --- src/classifiers/bayes.c | 7 ++++++- src/classifiers/classifiers.h | 1 + src/controller.c | 3 ++- src/filter.c | 9 +++------ src/tokenizers/osb.c | 5 ++++- src/tokenizers/tokenizers.c | 4 ++-- src/tokenizers/tokenizers.h | 4 ++-- 7 files changed, 20 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index 9ef2544b0..b4f7826e5 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -131,6 +131,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) if (cur->post_probability < G_MINDOUBLE * 100) { cur->post_probability = G_MINDOUBLE * 100; } + } renorm = 0; for (i = 0; i < cd->statfiles_num; i ++) { @@ -144,6 +145,10 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) if (cur->post_probability < G_MINDOUBLE * 10) { cur->post_probability = G_MINDOUBLE * 100; } + if (cd->ctx->debug) { + msg_info ("token: %s, statfile: %s, probability: %.4f, post_probability: %.4f", + node->extra, cur->st->symbol, cur->value, cur->post_probability); + } } return FALSE; @@ -156,7 +161,7 @@ bayes_init (memory_pool_t *pool, struct classifier_config *cfg) ctx->pool = pool; ctx->cfg = cfg; - + ctx->debug = FALSE; return ctx; } diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index 0e6df173a..601db0205 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -15,6 +15,7 @@ struct worker_task; struct classifier_ctx { memory_pool_t *pool; GHashTable *results; + gboolean debug; struct classifier_config *cfg; }; diff --git a/src/controller.c b/src/controller.c index 9504d3b1f..a06351bb6 100644 --- a/src/controller.c +++ b/src/controller.c @@ -850,7 +850,8 @@ controller_read_socket (f_str_t * in, void *arg) c.begin = part->content->data; c.len = part->content->len; - if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, session->session_pool, &c, &tokens)) { + if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, + session->session_pool, &c, &tokens, FALSE)) { i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END); free_task (task, FALSE); if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { diff --git a/src/filter.c b/src/filter.c index df8e1a9e0..2f8b27060 100644 --- a/src/filter.c +++ b/src/filter.c @@ -36,9 +36,6 @@ #include "classifiers/classifiers.h" #include "tokenizers/tokenizers.h" -#ifndef WITHOUT_PERL -# include "perl.h" -#endif #ifdef WITH_LUA # include "lua/lua_common.h" #endif @@ -615,7 +612,7 @@ classifiers_callback (gpointer value, void *arg) c.len = strlen (cur->data); if (c.len > 0) { c.begin = cur->data; - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) { msg_info ("cannot tokenize input"); return; } @@ -630,7 +627,7 @@ classifiers_callback (gpointer value, void *arg) c.begin = text_part->content->data; c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) { msg_info ("cannot tokenize input"); return; } @@ -857,7 +854,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) /* Get tokens */ if (!cl->tokenizer->tokenize_func ( cl->tokenizer, task->task_pool, - &c, &tokens)) { + &c, &tokens, FALSE)) { g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message"); return FALSE; } diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index ae59cf8ea..41bcce737 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -35,7 +35,7 @@ extern const int primes[]; int -osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree) +osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree, gboolean save_token) { token_node_t *new = NULL; f_str_t token = { NULL, 0, 0 }, *res; @@ -69,6 +69,9 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * new = memory_pool_alloc0 (pool, sizeof (token_node_t)); new->h1 = h1; new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); + } if (g_tree_lookup (*tree, new) == NULL) { g_tree_insert (*tree, new, new); diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index b7318bdfc..5af3fe6d5 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree) new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = task->subject; subject.len = strlen (task->subject); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE); } if ((sub = g_mime_message_get_subject (task->message)) != NULL) { new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = (gchar *)sub; subject.len = strlen (sub); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE); } } diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 59a2684d0..741753328 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -24,7 +24,7 @@ typedef struct token_node_s { /* Common tokenizer structure */ struct tokenizer { char *name; - int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); + int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token); f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); }; @@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ f_str_t *get_next_word (f_str_t *buf, f_str_t *token); /* OSB tokenize function */ -int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); +int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token); /* Common tokenizer for headers */ int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Make tokens for a subject */ -- cgit v1.2.3