aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-26 17:42:43 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-26 17:42:43 +0000
commit 05f17c6cd61546712e5f213dc624b693e2b0dfa4 (patch)
tree cfae46d7d7fd61f091142c1a45d231b4af804400 /src/libstat/tokenizers
parent c9f44a6f359770bb7c273df4b52e5a62af3845a2 (diff)
download rspamd-05f17c6cd61546712e5f213dc624b693e2b0dfa4.tar.gz
rspamd-05f17c6cd61546712e5f213dc624b693e2b0dfa4.zip
[Feature] Ignore bogus whitespaces in the words
Issue: #2649
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 9
1 files changed, 8 insertions, 1 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 19a5dba98..c62718278 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
while (i < srclen) {
U16_NEXT_UNSAFE (src, i, t);
- *d++ = u_tolower (t);
+
+ if (u_isgraph (t)) {
+ *d++ = u_tolower (t);
+ }
+ else {
+ /* Invisible spaces ! */
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+ }
}
tok->unicode.begin = dest;