author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-24 20:25:54 +0400 |
commit | a3fa4d672341fd2f1888d3a2f2ed85ae57913b78 (patch) | |
tree | 352c634bbbc74cf17644545ace66a8feedc841c3 /src/filter.c | |
parent | 63725086863e4f422340479f83dd7ef374613e76 (diff) | |
download | rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.tar.gz rspamd-a3fa4d672341fd2f1888d3a2f2ed85ae57913b78.zip |
* Welcome 0.4.0
Incompatible changes:
- Statistics are incompatible in UTF-8 mode
Major changes:
- Improved UTF-8 mode
- Convert all characters to lowercase in statistics
- Skip URLs in statistics
- Improve the speed of the Bayes classifier by using integer arithmetic (see the sketch after this list)
- Fixed statfile synchronization, which had been broken for a long time
- Synchronization is now configurable
Minor changes:
- Bugfixes
- Removed some legacy code
- Polished types
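The integer-arithmetic item is only named in the changelog; the sketch below is a rough, self-contained illustration of the technique rather than rspamd's actual classifier code. It combines hypothetical per-token spam/ham counts with 16.16 fixed-point multiplication and converts to a floating-point score only once at the end; all names, counts, and the +1 smoothing are assumptions made for the example.

```c
/*
 * Minimal sketch (not rspamd's code) of scoring tokens with integer
 * arithmetic: counts stay integral, probabilities are 16.16 fixed point,
 * and a double is produced only for the final result.
 */
#include <stdio.h>
#include <stdint.h>

#define FP_SCALE 65536ULL   /* 16.16 fixed point: 1.0 == 65536 */

struct token_stat {
    uint32_t spam_hits;   /* occurrences of the token in the spam statfile */
    uint32_t ham_hits;    /* occurrences of the token in the ham statfile  */
};

/* p(spam|token) in 16.16 fixed point, with +1 smoothing to avoid zeros */
static uint64_t
token_spam_prob_fp (const struct token_stat *t)
{
    uint64_t spam = t->spam_hits + 1;
    uint64_t total = t->spam_hits + t->ham_hits + 2;

    return (spam * FP_SCALE) / total;
}

int
main (void)
{
    /* hypothetical per-token statistics for one message */
    struct token_stat tokens[] = {
        { 40, 2 }, { 5, 30 }, { 17, 1 }
    };
    uint64_t acc = FP_SCALE;  /* running product, fixed point, starts at 1.0 */
    size_t i;

    for (i = 0; i < sizeof (tokens) / sizeof (tokens[0]); i ++) {
        /* fixed-point multiply: (a * b) / SCALE keeps the result in 16.16 */
        acc = (acc * token_spam_prob_fp (&tokens[i])) / FP_SCALE;
    }

    printf ("combined spam score: %.4f\n", (double) acc / FP_SCALE);
    return 0;
}
```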
Diffstat (limited to 'src/filter.c')
-rw-r--r-- | src/filter.c | 10 |
1 file changed, 6 insertions, 4 deletions
```diff
diff --git a/src/filter.c b/src/filter.c
index 2c094fda8..797b4f6fe 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -612,7 +612,7 @@ classifiers_callback (gpointer value, void *arg)
 		c.len = strlen (cur->data);
 		if (c.len > 0) {
 			c.begin = cur->data;
-			if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE)) {
+			if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) {
 				msg_info ("cannot tokenize input");
 				return;
 			}
@@ -627,7 +627,7 @@ classifiers_callback (gpointer value, void *arg)
 		c.begin = text_part->content->data;
 		c.len = text_part->content->len;
 		/* Tree would be freed at task pool freeing */
-		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf)) {
+		if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) {
 			msg_info ("cannot tokenize input");
 			return;
 		}
@@ -805,7 +805,7 @@ check_metric_action (double score, double required_score, struct metric *metric)
 gboolean
 learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 {
-	GList *cur;
+	GList *cur, *ex;
 	struct classifier_config *cl;
 	struct classifier_ctx *cls_ctx;
 	gchar *s;
@@ -841,6 +841,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 		if (s != NULL) {
 			c.len = strlen (cur->data);
 			c.begin = cur->data;
+			ex = NULL;
 		}
 		else {
 			part = cur->data;
@@ -852,11 +853,12 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 			c.begin = part->content->data;
 			c.len = part->content->len;
 			is_utf = part->is_utf;
+			ex = part->urls_offset;
 		}
 		/* Get tokens */
 		if (!cl->tokenizer->tokenize_func (
 				cl->tokenizer, task->task_pool,
-				&c, &tokens, FALSE, is_utf)) {
+				&c, &tokens, FALSE, is_utf, ex)) {
 			g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
 			return FALSE;
 		}
```
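The new last argument to tokenize_func is a list of URL offsets (text_part->urls_offset, bound to ex in learn_task, or NULL when learning from raw text) that the tokenizer can treat as exceptions. The sketch below is a minimal, self-contained illustration of that idea, assuming exceptions are (offset, length) byte ranges in the text; struct url_exception, in_exception, and the whitespace splitting are hypothetical and are not the actual rspamd tokenizer.

```c
/*
 * Sketch of skipping "exception" ranges (e.g. URL positions) while
 * tokenizing: any word that starts inside an exception range is dropped,
 * so URLs never contribute tokens to the statistics.
 */
#include <glib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

struct url_exception {
    gsize pos;   /* byte offset of the URL in the text */
    gsize len;   /* length of the URL                  */
};

static gboolean
in_exception (gsize pos, GList *exceptions)
{
    GList *cur;

    for (cur = exceptions; cur != NULL; cur = g_list_next (cur)) {
        struct url_exception *ex = cur->data;

        if (pos >= ex->pos && pos < ex->pos + ex->len) {
            return TRUE;
        }
    }

    return FALSE;
}

int
main (void)
{
    const gchar *text = "check http://example.com for details";
    struct url_exception url = { 6, strlen ("http://example.com") };
    GList *exceptions = g_list_prepend (NULL, &url);
    gsize i = 0, len = strlen (text);

    while (i < len) {
        /* skip separators */
        while (i < len && isspace ((guchar) text[i])) {
            i ++;
        }
        gsize start = i;
        while (i < len && !isspace ((guchar) text[i])) {
            i ++;
        }
        /* emit the word only if it does not start inside an exception */
        if (i > start && !in_exception (start, exceptions)) {
            printf ("token: %.*s\n", (int) (i - start), text + start);
        }
    }

    g_list_free (exceptions);
    return 0;
}
```

Built with e.g. `gcc demo.c $(pkg-config --cflags --libs glib-2.0)`, this prints the tokens check, for, and details while the URL is skipped, mirroring the "Skip URLs in statistics" item above.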