diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-26 17:42:43 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-26 17:42:43 +0000 |
commit | 05f17c6cd61546712e5f213dc624b693e2b0dfa4 (patch) | |
tree | cfae46d7d7fd61f091142c1a45d231b4af804400 /src | |
parent | c9f44a6f359770bb7c273df4b52e5a62af3845a2 (diff) | |
download | rspamd-05f17c6cd61546712e5f213dc624b693e2b0dfa4.tar.gz rspamd-05f17c6cd61546712e5f213dc624b693e2b0dfa4.zip |
[Feature] Ignore bogus whitespaces in the words
Issue: #2649
Diffstat (limited to 'src')
-rw-r--r-- | src/libstat/stat_api.h | 1 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 9 |
2 files changed, 9 insertions, 1 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index ee8db8af2..9dcd6f8e8 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -38,6 +38,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 11)
 
 typedef struct rspamd_stat_token_s {
 	rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 19a5dba98..c62718278 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 
 	while (i < srclen) {
 		U16_NEXT_UNSAFE (src, i, t);
-		*d++ = u_tolower (t);
+
+		if (u_isgraph (t)) {
+			*d++ = u_tolower (t);
+		}
+		else {
+			/* Invisible spaces ! */
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+		}
 	}
 
 	tok->unicode.begin = dest;