@@ -38,6 +38,7 @@ | |||
/*
 * Token flag bits. Every flag must own a distinct bit so that flags can be
 * OR-ed into and tested in a token's flags field independently.
 */
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
/* Previously shared bit 9 with BROKEN_UNICODE; moved to the next unused bit */
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 12)
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 11)
typedef struct rspamd_stat_token_s { | |||
rspamd_ftok_t original; /* utf8 raw */ |
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, | |||
/*
 * Walk the UTF-16 input one code point at a time; U16_NEXT_UNSAFE reads
 * the next code point into t and advances i past it.
 */
while (i < srclen) {
	U16_NEXT_UNSAFE (src, i, t);

	if (u_isgraph (t)) {
		/* Store each graphic code point exactly once, lowercased */
		*d++ = u_tolower (t);
	}
	else {
		/* Invisible spaces: drop the code point and mark the token */
		tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
	}
}
tok->unicode.begin = dest; |