* Make autolearn work

author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-09 20:45:11 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-09 20:45:11 +0400
commit: 2234daebbb352b444b322d43cc6c1093f0ce949c (patch)
tree: 320131facabccd4f5aa3eddc465bc50a707b2b00 /src/tokenizers
parent: 19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff)
download: rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz
rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip
2 files changed, 11 insertions, 7 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 32d6b902a..d2a1fe22f 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -29,6 +29,8 @@
 #include <sys/types.h>
 #include "tokenizers.h"
 
+/* Minimum length of token */
+#define MIN_LEN 4
 
 extern const int primes[];
 
@@ -36,7 +38,7 @@ int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
 {
 	token_node_t *new = NULL;
-	f_str_t token = { NULL, 0, 0 };
+	f_str_t token = { NULL, 0, 0 }, *res;
 	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
 	int i;
 
@@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
 
 	msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
 
-	while (tokenizer->get_next_word (input, &token)) {
+	while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+		/* Skip small words */
+		if (token.len < MIN_LEN) {
+			continue;
+		}
 		/* Shift hashpipe */
 		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
 			hashpipe[i] = hashpipe[i - 1];
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 4527e699c..7db1af12c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -78,12 +78,11 @@ f_str_t *
 get_next_word (f_str_t *buf, f_str_t *token)
 {
 	size_t remain;
-	char *pos;
+	unsigned char *pos;
 	
 	if (buf == NULL) {
 		return NULL;
 	}
-
 	if (token->begin == NULL) {
 		token->begin = buf->begin;
 	}
@@ -95,15 +94,14 @@ get_next_word (f_str_t *buf, f_str_t *token)
 	if (remain <= 0) {
 		return NULL;
 	}
-
 	pos = token->begin;
 	/* Skip non graph symbols */
-	while (remain > 0 && !g_ascii_isgraph (*pos)) {
+	while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
 		token->begin ++;
 		pos ++;
 		remain --;
 	}
-	while (remain > 0 && g_ascii_isgraph (*pos)) {
+	while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
 		token->len ++;
 		pos ++;
 		remain --;
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-07-09 20:45:11 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-07-09 20:45:11 +0400
commit	2234daebbb352b444b322d43cc6c1093f0ce949c (patch)
tree	320131facabccd4f5aa3eddc465bc50a707b2b00 /src/tokenizers
parent	19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff)
download	rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip