]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Try to filter bad unicode types during normalisation
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 25 Feb 2019 18:19:51 +0000 (18:19 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 25 Feb 2019 18:19:51 +0000 (18:19 +0000)
src/libstat/stat_api.h
src/libstat/tokenizers/tokenizers.c

index 533c4294864e4af77e93b9cd8169626dff8e2fa1..f9d1aab5a9d0b23c8da036c24d7f4f79ceb2d0cf 100644 (file)
@@ -39,6 +39,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
 #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
 
 typedef struct rspamd_stat_token_s {
        rspamd_ftok_t original; /* utf8 raw */
index acbbcf2f019652f16f5e6b348c23d93d19af8e22..caa4a48a5d1ea603f4f34a3ceb7bce6f1a1871c0 100644 (file)
@@ -610,7 +610,25 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
                U16_NEXT_UNSAFE (src, i, t);
 
                if (u_isgraph (t)) {
-                       *d++ = u_tolower (t);
+                       UCharCategory cat;
+
+                       cat = u_charType (t);
+#if U_ICU_VERSION_MAJOR_NUM >= 57
+                       if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
+                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
+                       }
+#endif
+
+                       if (cat == U_UPPERCASE_LETTER ||
+                                       cat == U_LOWERCASE_LETTER ||
+                                       cat == U_DECIMAL_DIGIT_NUMBER ||
+                                       cat == U_CONNECTOR_PUNCTUATION ||
+                                       cat == U_MATH_SYMBOL ||
+                                       cat == U_CURRENCY_SYMBOL ||
+                                       cat == U_INITIAL_PUNCTUATION ||
+                                       cat == U_FINAL_PUNCTUATION) {
+                               *d++ = u_tolower (t);
+                       }
                }
                else {
                        /* Invisible spaces ! */