diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-03 20:23:13 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-03 20:23:13 +0400 |
commit | 92de380c2c5e8ce7073ce979df4e5c7868e52bb6 (patch) | |
tree | 27be3202d27f129f3d94d90298a4d1e0ecf2c281 /src/filter.c | |
parent | 83a9452974ec2f9c7be262a77e54a1ea2557c795 (diff) | |
download | rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.tar.gz rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.zip |
* Skip short UTF words in statistics
Diffstat (limited to 'src/filter.c')
-rw-r--r-- | src/filter.c | 8 |
1 file changed, 5 insertions, 3 deletions
diff --git a/src/filter.c b/src/filter.c index 2f8b27060..753c17952 100644 --- a/src/filter.c +++ b/src/filter.c @@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg) c.len = strlen (cur->data); if (c.len > 0) { c.begin = cur->data; - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) { msg_info ("cannot tokenize input"); return; } @@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg) c.begin = text_part->content->data; c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) { msg_info ("cannot tokenize input"); return; } @@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) stat_file_t *stf; gdouble sum; struct mime_text_part *part; + gboolean is_utf = FALSE; /* Load classifier by symbol */ cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile); @@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) } c.begin = part->content->data; c.len = part->content->len; + is_utf = part->is_utf; } /* Get tokens */ if (!cl->tokenizer->tokenize_func ( cl->tokenizer, task->task_pool, - &c, &tokens, FALSE)) { + &c, &tokens, FALSE, is_utf)) { g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message"); return FALSE; } |