diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-07-09 20:45:11 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-07-09 20:45:11 +0400 |
commit | 2234daebbb352b444b322d43cc6c1093f0ce949c (patch) | |
tree | 320131facabccd4f5aa3eddc465bc50a707b2b00 /src/tokenizers | |
parent | 19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff) | |
download | rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip |
* Make autolearn work
Diffstat (limited to 'src/tokenizers')
-rw-r--r-- | src/tokenizers/osb.c | 10 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 8 |
2 files changed, 11 insertions, 7 deletions
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 32d6b902a..d2a1fe22f 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -29,6 +29,8 @@
 #include <sys/types.h>
 #include "tokenizers.h"
 
+/* Minimum length of token */
+#define MIN_LEN 4
 
 extern const int primes[];
 
@@ -36,7 +38,7 @@ int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
 {
 	token_node_t *new = NULL;
-	f_str_t token = { NULL, 0, 0 };
+	f_str_t token = { NULL, 0, 0 }, *res;
 	uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
 	int i;
 
@@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
 
 	msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
 
-	while (tokenizer->get_next_word (input, &token)) {
+	while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+		/* Skip small words */
+		if (token.len < MIN_LEN) {
+			continue;
+		}
 		/* Shift hashpipe */
 		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
 			hashpipe[i] = hashpipe[i - 1];
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 4527e699c..7db1af12c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -78,12 +78,11 @@ f_str_t *
 get_next_word (f_str_t *buf, f_str_t *token)
 {
 	size_t remain;
-	char *pos;
+	unsigned char *pos;
 
 	if (buf == NULL) {
 		return NULL;
 	}
-
 	if (token->begin == NULL) {
 		token->begin = buf->begin;
 	}
@@ -95,15 +94,14 @@ get_next_word (f_str_t *buf, f_str_t *token)
 	if (remain <= 0) {
 		return NULL;
 	}
-
 	pos = token->begin;
 	/* Skip non graph symbols */
-	while (remain > 0 && !g_ascii_isgraph (*pos)) {
+	while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
 		token->begin ++;
 		pos ++;
 		remain --;
 	}
-	while (remain > 0 && g_ascii_isgraph (*pos)) {
+	while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
 		token->len ++;
 		pos ++;
 		remain --;