diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-07-09 20:45:11 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-07-09 20:45:11 +0400 |
commit | 2234daebbb352b444b322d43cc6c1093f0ce949c (patch) | |
tree | 320131facabccd4f5aa3eddc465bc50a707b2b00 /src/tokenizers/osb.c | |
parent | 19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff) | |
download | rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip |
* Make autolearn work
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r-- | src/tokenizers/osb.c | 10 |
1 file changed, 8 insertions, 2 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 32d6b902a..d2a1fe22f 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -29,6 +29,8 @@ #include <sys/types.h> #include "tokenizers.h" +/* Minimum length of token */ +#define MIN_LEN 4 extern const int primes[]; @@ -36,7 +38,7 @@ int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree) { token_node_t *new = NULL; - f_str_t token = { NULL, 0, 0 }; + f_str_t token = { NULL, 0, 0 }, *res; uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; int i; @@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in msg_debug ("osb_tokenize_text: got input length: %zd", input->len); - while (tokenizer->get_next_word (input, &token)) { + while ((res = tokenizer->get_next_word (input, &token)) != NULL) { + /* Skip small words */ + if (token.len < MIN_LEN) { + continue; + } /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { hashpipe[i] = hashpipe[i - 1]; |