about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/libstat/stat_api.h 1
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 9
2 files changed, 9 insertions, 1 deletion
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index ee8db8af2..9dcd6f8e8 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -38,6 +38,7 @@
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 11)
typedef struct rspamd_stat_token_s {
rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 19a5dba98..c62718278 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
while (i < srclen) {
U16_NEXT_UNSAFE (src, i, t);
- *d++ = u_tolower (t);
+
+ if (u_isgraph (t)) {
+ *d++ = u_tolower (t);
+ }
+ else {
+ /* Invisible spaces ! */
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+ }
}
tok->unicode.begin = dest;