about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-04-02 12:35:19 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-04-02 12:35:19 +0100
commit: eee2f2adaf4590013a09730f07afafc8a3a149cd (patch)
tree: 5eb6f4fa7034ca8067eeb6f00e367015d310da86 /src/libstat/tokenizers
parent: 063498ae63348de0c27eb0260cc7633bbc822df9 (diff)
download: rspamd-eee2f2adaf4590013a09730f07afafc8a3a149cd.tar.gz
download: rspamd-eee2f2adaf4590013a09730f07afafc8a3a149cd.zip
Fix normalization and tokenization.
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 4
1 file changed, 3 insertions, 1 deletion
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index c9b65e343..eebc57c22 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -110,6 +110,7 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
token->begin = buf->begin;
token->len = 0;
}
+ *cur = token->begin;
}
token->len = 0;
@@ -223,6 +224,7 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
token->begin = "exception";
token->len = sizeof ("exception") - 1;
state = skip_exception;
+ continue;
}
else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
state = feed_token;
@@ -290,7 +292,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
func = rspamd_tokenizer_get_word;
}
- res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+ res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
if (min_len > 0 && l < min_len) {