From: Vsevolod Stakhov
Date: Mon, 25 Feb 2019 18:19:51 +0000 (+0000)
Subject: [Feature] Try to filter bad unicode types during normalisation
X-Git-Tag: 1.9.0~87
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=ab43e080ebc5fea5a2c54bcad9180202b1a38711;p=rspamd.git

[Feature] Try to filter bad unicode types during normalisation
---

diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 533c42948..f9d1aab5a 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -39,6 +39,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
 #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
 
 typedef struct rspamd_stat_token_s {
 	rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index acbbcf2f0..caa4a48a5 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -610,7 +610,25 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 		U16_NEXT_UNSAFE (src, i, t);
 
 		if (u_isgraph (t)) {
-			*d++ = u_tolower (t);
+			UCharCategory cat;
+
+			cat = u_charType (t);
+#if U_ICU_VERSION_MAJOR_NUM >= 57
+			if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
+				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
+			}
+#endif
+
+			if (cat == U_UPPERCASE_LETTER ||
+				cat == U_LOWERCASE_LETTER ||
+				cat == U_DECIMAL_DIGIT_NUMBER ||
+				cat == U_CONNECTOR_PUNCTUATION ||
+				cat == U_MATH_SYMBOL ||
+				cat == U_CURRENCY_SYMBOL ||
+				cat == U_INITIAL_PUNCTUATION ||
+				cat == U_FINAL_PUNCTUATION) {
+				*d++ = u_tolower (t);
+			}
 		}
 		else {
 			/* Invisible spaces ! */