aboutsummaryrefslogtreecommitdiffstats
path: root/src/tokenizers/osb.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-07-09 20:45:11 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-07-09 20:45:11 +0400
commit2234daebbb352b444b322d43cc6c1093f0ce949c (patch)
tree320131facabccd4f5aa3eddc465bc50a707b2b00 /src/tokenizers/osb.c
parent19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff)
downloadrspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz
rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip
* Make autolearn work
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r--src/tokenizers/osb.c10
1 files changed, 8 insertions, 2 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 32d6b902a..d2a1fe22f 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -29,6 +29,8 @@
#include <sys/types.h>
#include "tokenizers.h"
+/* Minimum length of token */
+#define MIN_LEN 4
extern const int primes[];
@@ -36,7 +38,7 @@ int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
{
token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 };
+ f_str_t token = { NULL, 0, 0 }, *res;
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
- while (tokenizer->get_next_word (input, &token)) {
+ while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+ /* Skip small words */
+ if (token.len < MIN_LEN) {
+ continue;
+ }
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
hashpipe[i] = hashpipe[i - 1];