author     Vsevolod Stakhov <vsevolod@rambler-co.ru>    2011-06-24 20:25:54 +0400
committer  Vsevolod Stakhov <vsevolod@rambler-co.ru>    2011-06-24 20:25:54 +0400
commit     a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch)
tree       352c634bbbc74cf17644545ace66a8feedc841c3 /src/filter.c
parent     63725086863e4f422340479f83dd7ef374613e76 (diff)
* Welcome 0.4.0
Incompatible changes:
- Statistics are incompatible in UTF-8 mode

Major changes:
- Improved UTF-8 mode
- Convert all characters to lowercase in statistics
- Skip URLs in statistics
- Improve the speed of the Bayes classifier by using integer arithmetic
- Fixed statfile synchronization, which had been broken for a long time
- Synchronization is now configurable

Minor changes:
- Bugfixes
- Removed some legacy code
- Type polishing
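One item above, moving the Bayes classifier to integer arithmetic, is easy to picture with a small stand-alone sketch. The code below is not rspamd's classifier; fp_ratio, looks_spammy and the FP_SHIFT scale are hypothetical illustrations of how fixed-point weights can stand in for doubles in the scoring loop:

#include <glib.h>

#define FP_SHIFT 16  /* 16 fractional bits, so 1.0 == (1 << FP_SHIFT) */

/* Convert a frequency ratio (hits / total) to fixed point without
 * touching floating point: (hits << FP_SHIFT) / total. */
static guint32
fp_ratio (guint32 hits, guint32 total)
{
	if (total == 0) {
		return 0;
	}

	return (guint32) (((guint64) hits << FP_SHIFT) / total);
}

/* Accumulate per-token spam and ham weights as integers; the final
 * comparison needs no conversion back to floating point at all. */
static gboolean
looks_spammy (const guint32 *spam_hits, const guint32 *ham_hits,
	gsize n_tokens, guint32 spam_total, guint32 ham_total)
{
	guint64 spam_sum = 0, ham_sum = 0;
	gsize i;

	for (i = 0; i < n_tokens; i++) {
		spam_sum += fp_ratio (spam_hits[i], spam_total);
		ham_sum += fp_ratio (ham_hits[i], ham_total);
	}

	return spam_sum > ham_sum;
}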
Diffstat (limited to 'src/filter.c')
-rw-r--r--  src/filter.c  10
1 file changed, 6 insertions, 4 deletions
diff --git a/src/filter.c b/src/filter.c
index 2c094fda8..797b4f6fe 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
c.len = strlen (cur->data);
if (c.len > 0) {
c.begin = cur->data;
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
msg_info ("cannot tokenize input");
return;
}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) {
msg_info ("cannot tokenize input");
return;
}
@@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric)
gboolean
learn_task (const gchar *statfile, struct worker_task *task, GError **err)
{
- GList *cur;
+ GList *cur, *ex;
struct classifier_config *cl;
struct classifier_ctx *cls_ctx;
gchar *s;
@@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
if (s != NULL) {
c.len = strlen (cur->data);
c.begin = cur->data;
+ ex = NULL;
}
else {
part = cur->data;
@@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
c.begin = part->content->data;
c.len = part->content->len;
is_utf = part->is_utf;
+ ex = part->urls_offset;
}
/* Get tokens */
if (!cl->tokenizer->tokenize_func (
cl->tokenizer, task->task_pool,
- &c, &tokens, FALSE, is_utf)) {
+ &c, &tokens, FALSE, is_utf, ex)) {
g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
return FALSE;
}
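The hunks above thread one new trailing argument through every tokenize_func call: NULL when learning from raw text supplied by the caller, and text_part->urls_offset when the input is a parsed MIME text part, so that URLs found during parsing can be excluded from the statistics (the "Skip URLs in statistics" item in the changelog). A rough sketch of what a tokenizer can do with such an exceptions list is shown below; struct uri_range, offset_is_excepted and tokenize_with_exceptions are hypothetical stand-ins, not rspamd's actual tokenizer API:

#include <glib.h>

/* Hypothetical stand-in for the elements of text_part->urls_offset:
 * a byte range inside the text part that is covered by a URL. */
struct uri_range {
	gsize begin;
	gsize len;
};

static gboolean
offset_is_excepted (gsize off, GList *exceptions)
{
	GList *cur;

	for (cur = exceptions; cur != NULL; cur = g_list_next (cur)) {
		struct uri_range *r = cur->data;

		if (off >= r->begin && off < r->begin + r->len) {
			return TRUE;
		}
	}

	return FALSE;
}

/* Split the text on non-alphanumeric bytes and keep only words that do
 * not start inside an excepted range, so URL fragments never become
 * statistics tokens. */
static GList *
tokenize_with_exceptions (const gchar *text, gsize len, GList *exceptions)
{
	GList *tokens = NULL;
	gsize i = 0, start;

	while (i < len) {
		while (i < len && !g_ascii_isalnum (text[i])) {
			i++;
		}
		start = i;
		while (i < len && g_ascii_isalnum (text[i])) {
			i++;
		}
		if (i > start && !offset_is_excepted (start, exceptions)) {
			tokens = g_list_prepend (tokens,
					g_strndup (text + start, i - start));
		}
	}

	return g_list_reverse (tokens);
}

One plausible reason to pass exceptions into the tokenizer instead of stripping URLs from the text beforehand is that the original byte offsets stay valid and no copy of the part's content is needed.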