source.dussan.org Git - rspamd.git/commitdiff
[Project] Another try to normalize unicode properly
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 16:33:33 +0000 (16:33 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 16:33:33 +0000 (16:33 +0000)
src/libserver/html.c
src/libserver/url.c
src/libstat/stat_api.h
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/libutil/fstring.h
src/rspamd.h

index ff745f80d7df6e3d9862f267a857d2c6e5512302..2568d4c2a98730df61656fc740f642cfbf9fc59d 100644 (file)
@@ -2177,6 +2177,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
                ex->pos = href_offset;
                ex->len = dest->len - href_offset;
                ex->type = RSPAMD_EXCEPTION_URL;
+               ex->ptr = url;
 
                *exceptions = g_list_prepend (*exceptions,
                                ex);
index 9e6ab72dbe3ed20a69c07c94230d52790cc7bacd..e27a2c39b14892031f2ac4e10c330e534c684f45 100644 (file)
@@ -2546,6 +2546,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
        ex->pos = start_offset;
        ex->len = end_offset - start_offset;
        ex->type = RSPAMD_EXCEPTION_URL;
+       ex->ptr = url;
 
        if (url->protocol == PROTOCOL_MAILTO) {
                if (url->userlen > 0) {
index 8ab3332b90347e65d66685c0fd855e225c74f732..b912f8d203dc424b57ab5cc690c642d0d14f90b9 100644 (file)
 #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
 #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
 
 typedef struct rspamd_stat_token_s {
-       rspamd_ftok_t original;
-       rspamd_ftok_unicode_t unicode;
-       rspamd_ftok_t normalized;
-       rspamd_ftok_t stemmed;
+       rspamd_ftok_t original; /* utf8 raw, input for the fields below */
+       rspamd_ftok_unicode_t unicode; /* array of UChar32 code points, normalized, lowercased */
+       rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
+       rspamd_ftok_t stemmed; /* stemmed utf8 */
        guint flags;
 } rspamd_stat_token_t;
 
index d27d9bc58b09af8c9fa9be3edb9ad742b86d3f56..32d9ba0df3b7e2ebf019a6ea8f636b99d47678e7 100644 (file)
@@ -447,7 +447,7 @@ start_over:
                                        if (!decay) {
                                                decay = TRUE;
                                        } else {
-                                               token.original.len = 0;
+                                               token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
                                        }
                                }
                        }
@@ -541,130 +541,164 @@ rspamd_tokenize_subject (struct rspamd_task *task)
        return words;
 }
 
+/**
+ * Convert a UTF-16 buffer into a lowercased UChar32 array and store it in
+ * tok->unicode.
+ * @param src UTF-16 code units to convert
+ * @param srclen number of code units in src
+ * @param tok token whose unicode.begin and unicode.len are overwritten
+ * @param pool memory pool the destination array is allocated from
+ */
+static inline void
+rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
+                                               rspamd_stat_token_t *tok,
+                                               rspamd_mempool_t *pool)
+{
+       UChar32 *dest, t, *d;
+       /* Same signed type for index and bound, as the U16_* macros expect */
+       gint32 i = 0, len = (gint32)srclen;
+
+       /* One UChar32 per code unit is an upper bound: surrogate pairs collapse */
+       dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32));
+       d = dest;
+
+       while (i < len) {
+               /* Bounded variant: never reads past len on a trailing lead surrogate */
+               U16_NEXT (src, i, len, t);
+
+               if (U_IS_SURROGATE (t)) {
+                       /* Unpaired surrogate has no utf8 form, use U+FFFD instead */
+                       t = 0xFFFD;
+               }
+
+               *d++ = u_tolower (t);
+       }
+
+       tok->unicode.begin = dest;
+       tok->unicode.len = d - dest;
+}
+
+/**
+ * Encode tok->unicode as a NUL-terminated utf8 string allocated from pool
+ * and store it in tok->normalized (len excludes the terminator).
+ * @param tok token with unicode.begin and unicode.len already filled
+ * @param pool memory pool the utf8 buffer is allocated from
+ */
+static inline void
+rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok,
+                                                       rspamd_mempool_t *pool)
+{
+       guint i, doff = 0;
+       gsize utflen = 0;
+       gchar *dest;
+       UChar32 t;
+
+       /* First pass: exact number of utf8 bytes required */
+       for (i = 0; i < tok->unicode.len; i ++) {
+               utflen += U8_LENGTH (tok->unicode.begin[i]);
+       }
+
+       dest = rspamd_mempool_alloc (pool, utflen + 1);
+
+       for (i = 0; i < tok->unicode.len; i ++) {
+               t = tok->unicode.begin[i];
+
+               if (U8_LENGTH (t) == 0) {
+                       /*
+                        * Not encodable (surrogate or out of range): it was counted
+                        * as zero bytes above, so appending it would overrun dest
+                        */
+                       continue;
+               }
+
+               U8_APPEND_UNSAFE (dest, doff, t);
+       }
+
+       g_assert (doff <= utflen);
+       dest[doff] = '\0';
+
+       tok->normalized.len = doff;
+       tok->normalized.begin = dest;
+}
+
 void
-rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
+rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
 {
-       rspamd_stat_token_t *tok;
-       guint i;
        UErrorCode uc_err = U_ZERO_ERROR;
-       guint clen, dlen;
-       gint r;
        UConverter *utf8_converter;
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
-       gint32 end;
-       UChar *src = NULL, *dest = NULL;
-#endif
+       UChar tmpbuf[1024]; /* Assume that we have no longer words... */
+       gsize ulen;
 
        utf8_converter = rspamd_get_utf8_converter ();
 
-       for (i = 0; i < words->len; i++) {
-               tok = &g_array_index (words, rspamd_stat_token_t, i);
-
-               if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
-                       UChar *unicode;
-                       gchar *utf8;
-                       gsize ulen;
-
-                       uc_err = U_ZERO_ERROR;
-                       ulen = tok->original.len;
-                       unicode = rspamd_mempool_alloc (pool, sizeof (UChar) * (ulen + 1));
-                       ulen = ucnv_toUChars (utf8_converter,
-                                       unicode,
-                                       tok->original.len + 1,
-                                       tok->original.begin,
-                                       tok->original.len,
-                                       &uc_err);
+       if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+               ulen = ucnv_toUChars (utf8_converter,
+                               tmpbuf,
+                               G_N_ELEMENTS (tmpbuf),
+                               tok->original.begin,
+                               tok->original.len,
+                               &uc_err);
+
+               /* Now, we need to understand if we need to normalise the word */
+               if (!U_SUCCESS (uc_err)) {
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                       tok->unicode.begin = NULL;
+                       tok->unicode.len = 0;
+                       tok->normalized.begin = NULL;
+                       tok->normalized.len = 0;
+               }
+               else {
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+                       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
+                       gint32 end;
 
+                       /* We can now check if we need to decompose */
+                       end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err);
 
                        if (!U_SUCCESS (uc_err)) {
-                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
-                               tok->unicode.begin = NULL;
-                               tok->unicode.len = 0;
+                               rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
                                tok->normalized.begin = NULL;
                                tok->normalized.len = 0;
+                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
                        }
                        else {
-                               /* Perform normalization if available and needed */
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-                               /* We can now check if we need to decompose */
-                               end = unorm2_spanQuickCheckYes (norm, src, ulen, &uc_err);
-
-                               if (!U_SUCCESS (uc_err)) {
-                                       tok->unicode.begin = unicode;
-                                       tok->unicode.len = ulen;
-                                       tok->normalized.begin = NULL;
-                                       tok->normalized.len = 0;
-                                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                               if (end == ulen) {
+                                       /* Already normalised, just lowercase */
+                                       rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
+                                       rspamd_ucs32_to_normalised (tok, pool);
                                }
                                else {
-                                       if (end == ulen) {
-                                               /* Already normalised */
-                                               tok->unicode.begin = unicode;
-                                               tok->unicode.len = ulen;
-                                               tok->normalized.begin = tok->original.begin;
-                                               tok->normalized.len = tok->original.len;
+                                       /* Perform normalization */
+                                       UChar normbuf[1024];
+
+                                       g_assert (end < G_N_ELEMENTS (normbuf));
+                                       /* First part */
+                                       memcpy (normbuf, tmpbuf, end * sizeof (UChar));
+                                       /* Second part */
+                                       ulen = unorm2_normalizeSecondAndAppend (norm,
+                                                       normbuf, end,
+                                                       G_N_ELEMENTS (normbuf),
+                                                       tmpbuf + end,
+                                                       ulen - end,
+                                                       &uc_err);
+
+                                       if (!U_SUCCESS (uc_err)) {
+                                               if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+                                                       msg_warn_pool_check ("cannot normalise text '%*s': %s",
+                                                                       (gint)tok->original.len, tok->original.begin,
+                                                                       u_errorName (uc_err));
+                                                       rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
+                                                       rspamd_ucs32_to_normalised (tok, pool);
+                                                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                                               }
                                        }
                                        else {
-                                               /* Perform normalization */
-
-                                               dest = rspamd_mempool_alloc (pool, ulen * sizeof (UChar));
-                                               /* First part */
-                                               memcpy (dest, src, end * sizeof (*dest));
-                                               /* Second part */
-                                               ulen = unorm2_normalizeSecondAndAppend (norm, dest, end,
-                                                               ulen,
-                                                               src + end, ulen - end, &uc_err);
-
-                                               if (!U_SUCCESS (uc_err)) {
-                                                       if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
-                                                               msg_warn_pool_check ("cannot normalise text '%*s': %s",
-                                                                               (gint)tok->original.len, tok->original.begin,
-                                                                               u_errorName (uc_err));
-                                                               tok->unicode.begin = unicode;
-                                                               tok->unicode.len = ulen;
-                                                               tok->normalized.begin = NULL;
-                                                               tok->normalized.len = 0;
-                                                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
-                                                       }
-                                               }
-                                               else {
-                                                       /* Copy normalised back */
-                                                       tok->unicode.begin = dest;
-                                                       tok->unicode.len = ulen;
-                                                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
-
-                                                       /* Convert utf8 to produce normalized part */
-                                                       clen = ucnv_getMaxCharSize (utf8_converter);
-                                                       dlen = UCNV_GET_MAX_BYTES_FOR_STRING (ulen, clen);
-
-                                                       utf8 = rspamd_mempool_alloc (pool,
-                                                                       sizeof (*utf8) * dlen + 1);
-                                                       r = ucnv_fromUChars (utf8_converter,
-                                                                       utf8,
-                                                                       dlen,
-                                                                       dest,
-                                                                       ulen,
-                                                                       &uc_err);
-                                                       utf8[r] = '\0';
-
-                                                       tok->normalized.begin = utf8;
-                                                       tok->normalized.len = r;
-                                               }
+                                               /* Copy normalised back */
+                                               rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool);
+                                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
+                                               rspamd_ucs32_to_normalised (tok, pool);
                                        }
                                }
+                       }
 #else
-                               /* Legacy libicu path */
-                               tok->unicode.begin = unicode;
-                               tok->unicode.len = ulen;
-                               tok->normalized.begin = tok->original.begin;
-                               tok->normalized.len = tok->original.len;
+                       /* Legacy version with no unorm2 interface */
+                       rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
+                       rspamd_ucs32_to_normalised (tok, pool);
 #endif
-                       }
+               }
+       }
+       else {
+               if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                       /* Simple lowercase */
+                       gchar *dest;
+
+                       dest = rspamd_mempool_alloc (pool, tok->original.len + 1);
+                       rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1);
+                       rspamd_str_lc (dest, tok->original.len);
+                       tok->normalized.len = tok->original.len;
                }
        }
 }
 
+/**
+ * Normalise every token of the array in place.
+ * @param words array of rspamd_stat_token_t elements
+ * @param pool memory pool passed to rspamd_normalize_single_word
+ */
+void
+rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
+{
+       guint idx = 0;
+
+       while (idx < words->len) {
+               rspamd_normalize_single_word (
+                               &g_array_index (words, rspamd_stat_token_t, idx), pool);
+               idx ++;
+       }
+}
+
 void
 rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
                                   const gchar *language,
@@ -736,12 +770,8 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
                                }
                        }
                        else {
-                               /* No stemmer, utf8 lowercase */
-                               dest = rspamd_mempool_alloc (pool, tok->normalized.len);
-                               memcpy (dest, tok->normalized.begin, tok->normalized.len);
-                               rspamd_str_lc_utf8 (dest, tok->normalized.len);
                                tok->stemmed.len = tok->normalized.len;
-                               tok->stemmed.begin = dest;
+                               tok->stemmed.begin = tok->normalized.begin;
                        }
 
                        if (tok->stemmed.len > 0 && rspamd_language_detector_is_stop_word (d,
@@ -752,11 +782,8 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
                else {
                        if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
                                /* Raw text, lowercase */
-                               dest = rspamd_mempool_alloc (pool, tok->original.len);
-                               memcpy (dest, tok->original.begin, tok->original.len);
-                               rspamd_str_lc (dest, tok->original.len);
-                               tok->stemmed.len = tok->original.len;
-                               tok->stemmed.begin = dest;
+                               tok->stemmed.len = tok->normalized.len;
+                               tok->stemmed.begin = tok->normalized.begin;
                        }
                }
        }
index eb4a285de1aa57197345a4a06be3f812cf904b71..683d728ed0f3c79a48fd76f1bcbf1990d207fd56 100644 (file)
@@ -58,6 +58,7 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
                                                                                  gsize *len);
 
 struct rspamd_lang_detector;
+/* Normalise a single token in place; output buffers are allocated from pool */
+void rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
 void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
 void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
                                                const gchar *language,
index 88e41b47ad70df3bf3d027f435c935bd948a019f..96749052ceca4d0057293b6826f27da918af6ed8 100644 (file)
@@ -40,8 +40,8 @@ typedef struct f_str_tok {
 } rspamd_ftok_t;
 
 typedef struct f_str_unicode_tok {
-       gsize len; /* in uchars */
-       const UChar *begin;
+       gsize len; /* in UChar32 */
+       const UChar32 *begin; /* first element; the length is carried in len */
 } rspamd_ftok_unicode_t;
 
 /**
index c6d4c209fbd14761ccd18e398fca36b173d0b14c..80149a8e0be9c4078706d5e386ff48a9878b2d6e 100644 (file)
@@ -289,6 +289,7 @@ enum rspamd_exception_type {
 struct rspamd_process_exception {
        goffset pos;
        guint len;
+       gpointer ptr; /* URL exceptions store their struct rspamd_url here */
        enum rspamd_exception_type type;
 };