diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-02-25 18:19:51 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-02-25 18:19:51 +0000 |
commit | ab43e080ebc5fea5a2c54bcad9180202b1a38711 (patch) | |
tree | dd1499a0523014fcef95a7a5ce1d10d9bdaeebf3 | |
parent | 129ff2943985d3df1fba06c2bc6e721ed62e0201 (diff) | |
download | rspamd-ab43e080ebc5fea5a2c54bcad9180202b1a38711.tar.gz rspamd-ab43e080ebc5fea5a2c54bcad9180202b1a38711.zip |
[Feature] Try to filter bad unicode types during normalisation
-rw-r--r-- | src/libstat/stat_api.h | 1 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 20 |
2 files changed, 20 insertions, 1 deletion
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 533c42948..f9d1aab5a 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -39,6 +39,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
 #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
 
 typedef struct rspamd_stat_token_s {
 	rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index acbbcf2f0..caa4a48a5 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -610,7 +610,25 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 		U16_NEXT_UNSAFE (src, i, t);
 
 		if (u_isgraph (t)) {
-			*d++ = u_tolower (t);
+			UCharCategory cat;
+
+			cat = u_charType (t);
+#if U_ICU_VERSION_MAJOR_NUM >= 57
+			if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
+				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
+			}
+#endif
+
+			if (cat == U_UPPERCASE_LETTER ||
+				cat == U_LOWERCASE_LETTER ||
+				cat == U_DECIMAL_DIGIT_NUMBER ||
+				cat == U_CONNECTOR_PUNCTUATION ||
+				cat == U_MATH_SYMBOL ||
+				cat == U_CURRENCY_SYMBOL ||
+				cat == U_INITIAL_PUNCTUATION ||
+				cat == U_FINAL_PUNCTUATION) {
+				*d++ = u_tolower (t);
+			}
 		}
 		else {
 			/* Invisible spaces ! */