source.dussan.org Git - rspamd.git/commitdiff
[Feature] Ignore bogus whitespaces in the words
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Nov 2018 17:42:43 +0000 (17:42 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Nov 2018 17:42:43 +0000 (17:42 +0000)
Issue: #2649

src/libstat/stat_api.h
src/libstat/tokenizers/tokenizers.c

index ee8db8af2e452ee6993aa758879abbbd1e3ee5e4..9dcd6f8e87b771a889d4ddac409677c13affb422 100644 (file)
@@ -38,6 +38,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 11)
 
 typedef struct rspamd_stat_token_s {
        rspamd_ftok_t original; /* utf8 raw */
index 19a5dba98c63fd49f72e05ac182cf08294e4d087..c62718278bc04fea9f42c8f946ccbac3dd02a397 100644 (file)
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 
        while (i < srclen) {
                U16_NEXT_UNSAFE (src, i, t);
-               *d++ = u_tolower (t);
+
+               if (u_isgraph (t)) {
+                       *d++ = u_tolower (t);
+               }
+               else {
+                       /* Invisible spaces ! */
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+               }
        }
 
        tok->unicode.begin = dest;