diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-04-02 12:35:19 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-04-02 12:35:19 +0100 |
commit | eee2f2adaf4590013a09730f07afafc8a3a149cd (patch) | |
tree | 5eb6f4fa7034ca8067eeb6f00e367015d310da86 /src/libstat/tokenizers | |
parent | 063498ae63348de0c27eb0260cc7633bbc822df9 (diff) | |
download | rspamd-eee2f2adaf4590013a09730f07afafc8a3a149cd.tar.gz rspamd-eee2f2adaf4590013a09730f07afafc8a3a149cd.zip |
Fix normalization and tokenization.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 4 |
1 file changed, 3 insertions, 1 deletion
```diff
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index c9b65e343..eebc57c22 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -110,6 +110,7 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
 			token->begin = buf->begin;
 			token->len = 0;
 		}
+		*cur = token->begin;
 	}
 
 	token->len = 0;
@@ -223,6 +224,7 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
 				token->begin = "exception";
 				token->len = sizeof ("exception") - 1;
 				state = skip_exception;
+				continue;
 			}
 			else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
 				state = feed_token;
@@ -290,7 +292,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
 		func = rspamd_tokenizer_get_word;
 	}
 
-	res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+	res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
 
 	while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
 		if (min_len > 0 && l < min_len) {
```