From 05f17c6cd61546712e5f213dc624b693e2b0dfa4 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 26 Nov 2018 17:42:43 +0000
Subject: [Feature] Ignore bogus whitespaces in the words

Issue: #2649
---
 src/libstat/tokenizers/tokenizers.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'src/libstat/tokenizers')

diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 19a5dba98..c62718278 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 
 	while (i < srclen) {
 		U16_NEXT_UNSAFE (src, i, t);
-		*d++ = u_tolower (t);
+
+		if (u_isgraph (t)) {
+			*d++ = u_tolower (t);
+		}
+		else {
+			/* Invisible spaces ! */
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+		}
 	}
 
 	tok->unicode.begin = dest;
-- 
cgit v1.2.3