From 92de380c2c5e8ce7073ce979df4e5c7868e52bb6 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 3 Jun 2011 20:23:13 +0400 Subject: * Skip short utf words in statistics --- src/controller.c | 2 +- src/filter.c | 8 +++++--- src/tokenizers/osb.c | 13 ++++++++++--- src/tokenizers/tokenizers.c | 4 ++-- src/tokenizers/tokenizers.h | 4 ++-- 5 files changed, 20 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/controller.c b/src/controller.c index b29af9ee1..f69a90f8b 100644 --- a/src/controller.c +++ b/src/controller.c @@ -851,7 +851,7 @@ controller_read_socket (f_str_t * in, void *arg) c.begin = part->content->data; c.len = part->content->len; if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, - session->session_pool, &c, &tokens, FALSE)) { + session->session_pool, &c, &tokens, FALSE, part->is_utf)) { i = rspamd_snprintf (out_buf, sizeof (out_buf), "weights failed, tokenizer error" CRLF END); free_task (task, FALSE); if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { diff --git a/src/filter.c b/src/filter.c index 2f8b27060..753c17952 100644 --- a/src/filter.c +++ b/src/filter.c @@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg) c.len = strlen (cur->data); if (c.len > 0) { c.begin = cur->data; - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) { msg_info ("cannot tokenize input"); return; } @@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg) c.begin = text_part->content->data; c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) { msg_info ("cannot tokenize input"); return; } @@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) stat_file_t *stf; gdouble sum; struct mime_text_part *part; + gboolean is_utf = FALSE; /* Load classifier by symbol */ cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile); @@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) } c.begin = part->content->data; c.len = part->content->len; + is_utf = part->is_utf; } /* Get tokens */ if (!cl->tokenizer->tokenize_func ( cl->tokenizer, task->task_pool, - &c, &tokens, FALSE)) { + &c, &tokens, FALSE, is_utf)) { g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message"); return FALSE; } diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 1a04f3464..5f5dfcdcd 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -36,7 +36,7 @@ extern const int primes[]; int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree, - gboolean save_token) + gboolean save_token, gboolean is_utf) { token_node_t *new = NULL; f_str_t token = { NULL, 0, 0 }, *res; @@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * while ((res = tokenizer->get_next_word (input, &token)) != NULL) { /* Skip small words */ - if (token.len < MIN_LEN) { - continue; + if (is_utf) { + if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) { + continue; + } + } + else { + if (token.len < MIN_LEN) { + continue; + } } /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 5af3fe6d5..9e41a9101 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -239,13 +239,13 @@ tokenize_subject (struct worker_task *task, GTree ** tree) new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = task->subject; subject.len = strlen (task->subject); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); } if ((sub = g_mime_message_get_subject (task->message)) != NULL) { new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); subject.begin = (gchar *)sub; subject.len = strlen (sub); - osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE); + osb_tokenizer->tokenize_func (osb_tokenizer, task->task_pool, &subject, tree, FALSE, TRUE); } } diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 741753328..df5481a1f 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -24,7 +24,7 @@ typedef struct token_node_s { /* Common tokenizer structure */ struct tokenizer { char *name; - int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token); + int (*tokenize_func)(struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); f_str_t* (*get_next_word)(f_str_t *buf, f_str_t *token); }; @@ -35,7 +35,7 @@ struct tokenizer* get_tokenizer (char *name); /* Get next word from specified f_str_t buf */ f_str_t *get_next_word (f_str_t *buf, f_str_t *token); /* OSB tokenize function */ -int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token); +int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur, gboolean save_token, gboolean is_utf); /* Common tokenizer for headers */ int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Make tokens for a subject */ -- cgit v1.2.3