aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-11-26 17:42:43 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-11-26 17:42:43 +0000
commit05f17c6cd61546712e5f213dc624b693e2b0dfa4 (patch)
treecfae46d7d7fd61f091142c1a45d231b4af804400 /src
parentc9f44a6f359770bb7c273df4b52e5a62af3845a2 (diff)
downloadrspamd-05f17c6cd61546712e5f213dc624b693e2b0dfa4.tar.gz
rspamd-05f17c6cd61546712e5f213dc624b693e2b0dfa4.zip
[Feature] Ignore bogus whitespaces in the words
Issue: #2649
Diffstat (limited to 'src')
-rw-r--r--src/libstat/stat_api.h1
-rw-r--r--src/libstat/tokenizers/tokenizers.c9
2 files changed, 9 insertions, 1 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index ee8db8af2..9dcd6f8e8 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -38,6 +38,7 @@
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
/*
 * NOTE(review): STOP_WORD below has the same value (1u << 9) as
 * BROKEN_UNICODE above, so setting or testing one of them in a flags mask
 * also sets or tests the other. Confirm whether this aliasing is intended.
 */
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 11)
typedef struct rspamd_stat_token_s {
rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 19a5dba98..c62718278 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
while (i < srclen) {
U16_NEXT_UNSAFE (src, i, t);
- *d++ = u_tolower (t);
+
+ if (u_isgraph (t)) {
+ *d++ = u_tolower (t);
+ }
+ else {
+ /* Invisible spaces ! */
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+ }
}
tok->unicode.begin = dest;