author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
commit | a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch) | |
tree | 352c634bbbc74cf17644545ace66a8feedc841c3 /src/filter.c | |
parent | 63725086863e4f422340479f83dd7ef374613e76 (diff) | |
download | rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip |
* Welcome 0.4.0
Incompatible changes:
- Statistics are incompatible in UTF-8 mode
Major changes:
- Improved UTF-8 mode
- Convert all characters to lowercase in statistics
- Skip URLs in statistics
- Improve the speed of the Bayes classifier by using integer arithmetic (see the sketch after this list)
- Fixed statfile synchronization, which had been broken for a long time
- Synchronization is now configurable
Minor changes:
- Bugfixes
- Removed some legacy code
- Polished types
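The integer-arithmetic item is only named in the changelog; the sketch below is a rough, self-contained illustration of the technique rather than rspamd's actual classifier code. It combines hypothetical per-token spam/ham counts with 16.16 fixed-point multiplication and converts to a floating-point score only once at the end; all names, counts, and the +1 smoothing are assumptions made for the example.

```c
/*
 * Minimal sketch (not rspamd's code) of scoring tokens with integer
 * arithmetic: counts stay integral, probabilities are 16.16 fixed point,
 * and a double is produced only for the final result.
 */
#include <stdio.h>
#include <stdint.h>

#define FP_SCALE 65536ULL   /* 16.16 fixed point: 1.0 == 65536 */

struct token_stat {
    uint32_t spam_hits;   /* occurrences of the token in the spam statfile */
    uint32_t ham_hits;    /* occurrences of the token in the ham statfile  */
};

/* p(spam|token) in 16.16 fixed point, with +1 smoothing to avoid zeros */
static uint64_t
token_spam_prob_fp (const struct token_stat *t)
{
    uint64_t spam = t->spam_hits + 1;
    uint64_t total = t->spam_hits + t->ham_hits + 2;

    return (spam * FP_SCALE) / total;
}

int
main (void)
{
    /* hypothetical per-token statistics for one message */
    struct token_stat tokens[] = {
        { 40, 2 }, { 5, 30 }, { 17, 1 }
    };
    uint64_t acc = FP_SCALE;  /* running product, fixed point, starts at 1.0 */
    size_t i;

    for (i = 0; i < sizeof (tokens) / sizeof (tokens[0]); i ++) {
        /* fixed-point multiply: (a * b) / SCALE keeps the result in 16.16 */
        acc = (acc * token_spam_prob_fp (&tokens[i])) / FP_SCALE;
    }

    printf ("combined spam score: %.4f\n", (double) acc / FP_SCALE);
    return 0;
}
```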
Diffstat (limited to 'src/filter.c')
-rw-r--r-- | src/filter.c | 10 |
1 file changed, 6 insertions, 4 deletions
```diff
diff --git a/src/filter.c b/src/filter.c
index 2c094fda8..797b4f6fe 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
 		c.len = strlen (cur->data);
 		if (c.len > 0) {
 			c.begin = cur->data;
-			if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
+			if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
 				msg_info ("cannot tokenize input");
 				return;
 			}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
 		c.begin = text_part->content->data;
 		c.len = text_part->content->len;
 		/* Tree would be freed at task pool freeing */
-		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
+		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) {
 			msg_info ("cannot tokenize input");
 			return;
 		}
@@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric)
 gboolean
 learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 {
-	GList *cur;
+	GList *cur, *ex;
 	struct classifier_config *cl;
 	struct classifier_ctx *cls_ctx;
 	gchar *s;
@@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 		if (s != NULL) {
 			c.len = strlen (cur->data);
 			c.begin = cur->data;
+			ex = NULL;
 		}
 		else {
 			part = cur->data;
@@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 			c.begin = part->content->data;
 			c.len = part->content->len;
 			is_utf = part->is_utf;
+			ex = part->urls_offset;
 		}
 		/* Get tokens */
 		if (!cl->tokenizer->tokenize_func (
 				cl->tokenizer, task->task_pool,
-				&c, &tokens, FALSE, is_utf)) {
+				&c, &tokens, FALSE, is_utf, ex)) {
 			g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
 			return FALSE;
 		}
```
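The new last argument to tokenize_func is a list of URL offsets (text_part->urls_offset, bound to ex in learn_task, or NULL when learning from raw text) that the tokenizer can treat as exceptions. The sketch below is a minimal, self-contained illustration of that idea, assuming exceptions are (offset, length) byte ranges in the text; struct url_exception, in_exception, and the whitespace splitting are hypothetical and are not the actual rspamd tokenizer.

```c
/*
 * Sketch of skipping "exception" ranges (e.g. URL positions) while
 * tokenizing: any word that starts inside an exception range is dropped,
 * so URLs never contribute tokens to the statistics.
 */
#include <glib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

struct url_exception {
    gsize pos;   /* byte offset of the URL in the text */
    gsize len;   /* length of the URL                  */
};

static gboolean
in_exception (gsize pos, GList *exceptions)
{
    GList *cur;

    for (cur = exceptions; cur != NULL; cur = g_list_next (cur)) {
        struct url_exception *ex = cur->data;

        if (pos >= ex->pos && pos < ex->pos + ex->len) {
            return TRUE;
        }
    }

    return FALSE;
}

int
main (void)
{
    const gchar *text = "check http://example.com for details";
    struct url_exception url = { 6, strlen ("http://example.com") };
    GList *exceptions = g_list_prepend (NULL, &url);
    gsize i = 0, len = strlen (text);

    while (i < len) {
        /* skip separators */
        while (i < len && isspace ((guchar) text[i])) {
            i ++;
        }
        gsize start = i;
        while (i < len && !isspace ((guchar) text[i])) {
            i ++;
        }
        /* emit the word only if it does not start inside an exception */
        if (i > start && !in_exception (start, exceptions)) {
            printf ("token: %.*s\n", (int) (i - start), text + start);
        }
    }

    g_list_free (exceptions);
    return 0;
}
```

Built with e.g. `gcc demo.c $(pkg-config --cflags --libs glib-2.0)`, this prints the tokens check, for, and details while the URL is skipped, mirroring the "Skip URLs in statistics" item above.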