diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-03 20:23:13 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-06-03 20:23:13 +0400 |
commit | 92de380c2c5e8ce7073ce979df4e5c7868e52bb6 (patch) | |
tree | 27be3202d27f129f3d94d90298a4d1e0ecf2c281 /src/tokenizers/osb.c | |
parent | 83a9452974ec2f9c7be262a77e54a1ea2557c795 (diff) | |
download | rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.tar.gz rspamd-92de380c2c5e8ce7073ce979df4e5c7868e52bb6.zip |
* Skip short utf words in statistics
Diffstat (limited to 'src/tokenizers/osb.c')
-rw-r--r-- | src/tokenizers/osb.c | 13 |
1 file changed, 10 insertions, 3 deletions
```diff
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 1a04f3464..5f5dfcdcd 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -36,7 +36,7 @@ extern const int primes[];
 
 int
 osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * input, GTree ** tree,
-		gboolean save_token)
+		gboolean save_token, gboolean is_utf)
 {
 	token_node_t *new = NULL;
 	f_str_t token = { NULL, 0, 0 }, *res;
@@ -55,8 +55,15 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t *
 
 	while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
 		/* Skip small words */
-		if (token.len < MIN_LEN) {
-			continue;
+		if (is_utf) {
+			if (g_utf8_strlen (token.begin, token.len) < MIN_LEN) {
+				continue;
+			}
+		}
+		else {
+			if (token.len < MIN_LEN) {
+				continue;
+			}
 		}
 		/* Shift hashpipe */
 		for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) {
```