summary | refs | log | tree | commit | diff | stats
path: root/src/filter.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-03 20:23:13 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-06-03 20:23:13 +0400
commit: 92de380c2c5e8ce7073ce979df4e5c7868e52bb6 (patch)
tree: 27be3202d27f129f3d94d90298a4d1e0ecf2c281 /src/filter.c
parent: 83a9452974ec2f9c7be262a77e54a1ea2557c795 (diff)
download: rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.tar.gz
download: rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.zip
* Skip short utf words in statistics
Diffstat (limited to 'src/filter.c')
-rw-r--r-- src/filter.c 8
1 files changed, 5 insertions, 3 deletions
diff --git a/src/filter.c b/src/filter.c
index 2f8b27060..753c17952 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
c.len = strlen (cur->data);
if (c.len > 0) {
c.begin = cur->data;
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
msg_info ("cannot tokenize input");
return;
}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
msg_info ("cannot tokenize input");
return;
}
@@ -815,6 +815,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
stat_file_t *stf;
gdouble sum;
struct mime_text_part *part;
+ gboolean is_utf = FALSE;
/* Load classifier by symbol */
cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -850,11 +851,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
}
c.begin = part->content->data;
c.len = part->content->len;
+ is_utf = part->is_utf;
}
/* Get tokens */
if (!cl->tokenizer->tokenize_func (
cl->tokenizer, task->task_pool,
- &c, &tokens, FALSE)) {
+ &c, &tokens, FALSE, is_utf)) {
g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
return FALSE;
}