source.dussan.org Git - rspamd.git/commitdiff
[Project] Add function to normalize unicode on per words basis
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 24 Nov 2018 16:26:01 +0000 (16:26 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 24 Nov 2018 16:26:01 +0000 (16:26 +0000)
src/libmime/lang_detection.c
src/libmime/mime_encoding.c
src/libstat/stat_api.h
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/libutil/str_util.c
src/libutil/str_util.h

index 102117b214604fe7d4bc741e83bc16f857b83654..e2651b63c9f8c421664860a3649468d185b46ea2 100644 (file)
@@ -801,7 +801,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 
        ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret));
        ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
-       ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
+       ret->uchar_converter = rspamd_get_utf8_converter ();
        ret->short_text_limit = short_text_limit;
        ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
 
index c30cbe3e3b8f4b529181a5094a86e490cef996a4..269166344215b55dce96be4aa21c155ad69dec83 100644 (file)
 #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
 
 static rspamd_regexp_t *utf_compatible_re = NULL;
-UConverter *utf8_converter = NULL;
-
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-static const UNormalizer2 *norm = NULL;
-#endif
 
 struct rspamd_charset_substitution {
        const gchar *input;
@@ -101,36 +96,6 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
        return conv;
 }
 
-static inline void
-rspamd_mime_utf8_conv_init (void)
-{
-       if (utf8_converter == NULL) {
-               UErrorCode uc_err = U_ZERO_ERROR;
-
-               utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
-
-               if (!U_SUCCESS (uc_err)) {
-                       msg_err ("FATAL error: cannot open converter for utf8: %s",
-                                       u_errorName (uc_err));
-
-                       g_assert_not_reached ();
-               }
-
-               ucnv_setFromUCallBack (utf8_converter,
-                               UCNV_FROM_U_CALLBACK_SUBSTITUTE,
-                               NULL,
-                               NULL,
-                               NULL,
-                               &uc_err);
-               ucnv_setToUCallBack (utf8_converter,
-                               UCNV_TO_U_CALLBACK_SUBSTITUTE,
-                               NULL,
-                               NULL,
-                               NULL,
-                               &uc_err);
-       }
-}
-
 static void
 rspamd_mime_encoding_substitute_init (void)
 {
@@ -224,10 +189,10 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        UChar *tmp_buf;
 
        UErrorCode uc_err = U_ZERO_ERROR;
-       UConverter *conv;
+       UConverter *conv, *utf8_converter;
 
-       rspamd_mime_utf8_conv_init ();
        conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
+       utf8_converter = rspamd_get_utf8_converter ();
 
        if (conv == NULL) {
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -282,8 +247,8 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
 {
        GByteArray *utf;
        UErrorCode uc_err = U_ZERO_ERROR;
+       UConverter *utf8_converter = rspamd_get_utf8_converter ();
 
-       rspamd_mime_utf8_conv_init ();
        utf = text_part->utf_raw_content;
        text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
                        sizeof (UChar), utf->len + 1);
@@ -308,10 +273,7 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
        UErrorCode uc_err = U_ZERO_ERROR;
        gint32 nsym, end;
        UChar *src = NULL, *dest = NULL;
-
-       if (norm == NULL) {
-               norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
-       }
+       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
 
        if (!text_part->unicode_raw_content) {
                return;
@@ -367,8 +329,9 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
        UErrorCode uc_err = U_ZERO_ERROR;
        guint clen, dlen;
        gint r;
+       UConverter *utf8_converter;
 
-       rspamd_mime_utf8_conv_init ();
+       utf8_converter = rspamd_get_utf8_converter ();
 
        if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
                text_part->unicode_raw_content) {
@@ -398,10 +361,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
        gint32 r, clen, dlen;
 
        UErrorCode uc_err = U_ZERO_ERROR;
-       UConverter *conv;
+       UConverter *conv, *utf8_converter;
 
-       rspamd_mime_utf8_conv_init ();
        conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+       utf8_converter = rspamd_get_utf8_converter ();
 
        if (conv == NULL) {
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -464,7 +427,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
        gint32 r, clen, dlen;
        UChar *tmp_buf;
        UErrorCode uc_err = U_ZERO_ERROR;
-       UConverter *conv;
+       UConverter *conv, *utf8_converter;
        rspamd_ftok_t charset_tok;
 
        RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
@@ -477,7 +440,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
                return TRUE;
        }
 
-       rspamd_mime_utf8_conv_init ();
+       utf8_converter = rspamd_get_utf8_converter ();
        conv = rspamd_mime_get_converter_cached (enc, &uc_err);
 
        if (conv == NULL) {
@@ -763,6 +726,7 @@ void
 rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
 {
        UErrorCode uc_err = U_ZERO_ERROR;
+       UConverter *utf8_converter = rspamd_get_utf8_converter ();
 
        g_array_set_size (dest, in->len + 1);
        dest->len = ucnv_toUChars (utf8_converter,
index 645e1f1aa71cbcadd2fcb123d859363134931996..c046dd227c356679a6788fc8553a530ca8fd0308 100644 (file)
 #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
 #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
 #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 
 typedef struct rspamd_stat_token_s {
        rspamd_ftok_t original;
        rspamd_ftok_unicode_t unicode;
-       rspamd_ftok_t normalised;
+       rspamd_ftok_t normalized;
        rspamd_ftok_t stemmed;
        guint flags;
 } rspamd_stat_token_t;
index 8664b9e19fae5d04cadfa354d3683475f21ebddb..247c24dbd624bb60aa3cc47dadf404e412089251 100644 (file)
 #include "rspamd.h"
 #include "tokenizers.h"
 #include "stat_internal.h"
-#include "../../../contrib/mumhash/mum.h"
+#include "contrib/mumhash/mum.h"
+
 #include <unicode/utf8.h>
 #include <unicode/uchar.h>
 #include <unicode/uiter.h>
 #include <unicode/ubrk.h>
+#include <unicode/ucnv.h>
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+#include <unicode/unorm2.h>
+#endif
+
 #include <math.h>
 
 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
@@ -534,3 +540,129 @@ rspamd_tokenize_subject (struct rspamd_task *task)
        return words;
 }
 
+/**
+ * Fills the `unicode` and `normalized` parts of every token in `words` that
+ * carries RSPAMD_STAT_TOKEN_FLAG_UTF; other tokens are left untouched.
+ *
+ * For each such token the original UTF-8 text is converted to UChars using
+ * the shared UTF-8 converter. When libicu provides UNormalizer2 the text is
+ * then checked against the shared normalizer and, if needed, normalized and
+ * converted back to UTF-8. All buffers are allocated from `pool`.
+ *
+ * Resulting token state:
+ *  - conversion to UChars failed: BROKEN_UNICODE is set, `unicode` and
+ *    `normalized` are both empty;
+ *  - normalizer check/normalization failed: BROKEN_UNICODE is set, `unicode`
+ *    holds the raw converted text and `normalized` is empty;
+ *  - text was already normalized (or libicu is too old): `unicode` holds the
+ *    converted text and `normalized` aliases `original`;
+ *  - text was normalized: NORMALISED is set, `unicode` holds the normalized
+ *    UChars and `normalized` holds a NUL-terminated UTF-8 copy.
+ *
+ * @param words array of rspamd_stat_token_t, updated in place
+ * @param pool memory pool used for all allocations (and for logging)
+ */
+void
+rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
+{
+       rspamd_stat_token_t *tok;
+       guint i;
+       UErrorCode uc_err;
+       UConverter *utf8_converter = rspamd_get_utf8_converter ();
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
+#endif
+
+       for (i = 0; i < words->len; i++) {
+               UChar *unicode;
+               gint32 ulen;
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+               UChar *dest;
+               gchar *utf8;
+               gint32 end, nlen, dcap, r;
+               guint clen, dlen;
+#endif
+
+               tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+               if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+                       continue;
+               }
+
+               uc_err = U_ZERO_ERROR;
+               /* One UChar per input byte is always enough for UTF-8 -> UTF-16 */
+               unicode = rspamd_mempool_alloc (pool,
+                               sizeof (UChar) * (tok->original.len + 1));
+               ulen = ucnv_toUChars (utf8_converter,
+                               unicode,
+                               tok->original.len + 1,
+                               tok->original.begin,
+                               tok->original.len,
+                               &uc_err);
+
+               if (!U_SUCCESS (uc_err)) {
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                       tok->unicode.begin = NULL;
+                       tok->unicode.len = 0;
+                       tok->normalized.begin = NULL;
+                       tok->normalized.len = 0;
+                       continue;
+               }
+
+               /* Raw converted text is the default unless normalization replaces it */
+               tok->unicode.begin = unicode;
+               tok->unicode.len = ulen;
+
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+               /* Find the prefix that is already in normalized form */
+               end = unorm2_spanQuickCheckYes (norm, unicode, ulen, &uc_err);
+
+               if (!U_SUCCESS (uc_err)) {
+                       tok->normalized.begin = NULL;
+                       tok->normalized.len = 0;
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                       continue;
+               }
+
+               if (end == ulen) {
+                       /* Already normalised: reuse the original UTF-8 bytes */
+                       tok->normalized.begin = tok->original.begin;
+                       tok->normalized.len = tok->original.len;
+                       continue;
+               }
+
+               /* Copy the normalized prefix and normalize/append the remainder */
+               dcap = ulen + 1;
+               dest = rspamd_mempool_alloc (pool, dcap * sizeof (UChar));
+               memcpy (dest, unicode, end * sizeof (*dest));
+               nlen = unorm2_normalizeSecondAndAppend (norm, dest, end,
+                               dcap,
+                               unicode + end, ulen - end, &uc_err);
+
+               if (uc_err == U_BUFFER_OVERFLOW_ERROR && nlen > 0) {
+                       /*
+                        * Normalization can expand the text; on overflow ICU reports
+                        * the required length, so retry once with a fitting buffer.
+                        */
+                       uc_err = U_ZERO_ERROR;
+                       dcap = nlen + 1;
+                       dest = rspamd_mempool_alloc (pool, dcap * sizeof (UChar));
+                       memcpy (dest, unicode, end * sizeof (*dest));
+                       nlen = unorm2_normalizeSecondAndAppend (norm, dest, end,
+                                       dcap,
+                                       unicode + end, ulen - end, &uc_err);
+               }
+
+               if (!U_SUCCESS (uc_err)) {
+                       msg_warn_pool_check ("cannot normalise text '%*s': %s",
+                                       (gint)tok->original.len, tok->original.begin,
+                                       u_errorName (uc_err));
+                       tok->normalized.begin = NULL;
+                       tok->normalized.len = 0;
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                       continue;
+               }
+
+               /* Publish the normalised UChars */
+               tok->unicode.begin = dest;
+               tok->unicode.len = nlen;
+               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
+
+               /* Convert back to utf8 to produce the normalized part */
+               clen = ucnv_getMaxCharSize (utf8_converter);
+               dlen = UCNV_GET_MAX_BYTES_FOR_STRING (nlen, clen);
+
+               utf8 = rspamd_mempool_alloc (pool, sizeof (*utf8) * dlen + 1);
+               r = ucnv_fromUChars (utf8_converter,
+                               utf8,
+                               dlen,
+                               dest,
+                               nlen,
+                               &uc_err);
+
+               if (!U_SUCCESS (uc_err) || r < 0) {
+                       /* Do not index utf8[] with a length from a failed conversion */
+                       tok->normalized.begin = NULL;
+                       tok->normalized.len = 0;
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+                       continue;
+               }
+
+               utf8[r] = '\0';
+               tok->normalized.begin = utf8;
+               tok->normalized.len = r;
+#else
+               /* Legacy libicu path: no normalizer, treat the text as is */
+               tok->normalized.begin = tok->original.begin;
+               tok->normalized.len = tok->original.len;
+#endif
+       }
+}
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+                                               const gchar *language);
\ No newline at end of file
index 668f08cdc829ac1f94091b8f1c516e2e4445c225..9a55616714827b672037552f6f2ac9c7de4091a4 100644 (file)
@@ -57,6 +57,10 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
                struct rspamd_tokenizer_config *cf,
                gsize *len);
 
+void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+               const gchar *language);
 
 GArray * rspamd_tokenize_subject (struct rspamd_task *task);
 #endif
index be7323df3079f916fd7787d1f422861345832c2a..d8b17e3c369f5eb66a704ed6ad40f11b96651028 100644 (file)
@@ -2237,25 +2237,71 @@ rspamd_memrchr (const void *m, gint c, gsize len)
        return NULL;
 }
 
+/*
+ * Returns the UTF-8 converter kept in a function-local static.
+ * It is opened on the first call, with both the from-Unicode and the
+ * to-Unicode callbacks set to the ICU "substitute" callbacks; later calls
+ * return the same pointer. Failure to open the converter is logged and
+ * treated as fatal.
+ */
+struct UConverter *
+rspamd_get_utf8_converter (void)
+{
+       static UConverter *cached_conv = NULL;
+       UErrorCode status = U_ZERO_ERROR;
+
+       if (cached_conv != NULL) {
+               /* Already opened by an earlier call */
+               return cached_conv;
+       }
+
+       cached_conv = ucnv_open ("UTF-8", &status);
+
+       if (!U_SUCCESS (status)) {
+               msg_err ("FATAL error: cannot open converter for utf8: %s",
+                               u_errorName (status));
+
+               g_assert_not_reached ();
+       }
+
+       ucnv_setFromUCallBack (cached_conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+                       NULL, NULL, NULL, &status);
+       ucnv_setToUCallBack (cached_conv, UCNV_TO_U_CALLBACK_SUBSTITUTE,
+                       NULL, NULL, NULL, &status);
+
+       return cached_conv;
+}
+
+
+/*
+ * Returns the "nfkc" compose-mode normalizer kept in a function-local
+ * static; the instance is requested from ICU on the first call and that
+ * request is asserted to succeed. Builds against libicu older than 44
+ * always get NULL.
+ */
+const struct UNormalizer2 *
+rspamd_get_unicode_normalizer (void)
+{
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+       static const UNormalizer2 *cached_norm = NULL;
+
+       if (cached_norm == NULL) {
+               UErrorCode status = U_ZERO_ERROR;
+
+               cached_norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE,
+                               &status);
+               g_assert (U_SUCCESS (status));
+       }
+
+       return cached_norm;
+#else
+       /* Old libicu: no UNormalizer2 API */
+       return NULL;
+#endif
+}
+
+
 gboolean
 rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
                guint *len)
 {
 #if U_ICU_VERSION_MAJOR_NUM >= 44
        UErrorCode uc_err = U_ZERO_ERROR;
-       static UConverter *utf8_conv = NULL;
-       static const UNormalizer2 *norm = NULL;
+       UConverter *utf8_conv = rspamd_get_utf8_converter ();
+       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
        gint32 nsym, end;
        UChar *src = NULL, *dest = NULL;
        gboolean ret = FALSE;
 
-       if (utf8_conv == NULL) {
-               utf8_conv = ucnv_open ("UTF-8", &uc_err);
-               g_assert (U_SUCCESS (uc_err));
-               norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
-               g_assert (U_SUCCESS (uc_err));
-       }
-
        /* We first need to convert data to UChars :( */
        src = g_malloc ((*len + 1) * sizeof (*src));
        nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
index ffcc691972dc612d5a04ed2ae2af561755e9fb76..688034ec69e7ee64d32571ca06291295aaaf1260 100644 (file)
@@ -386,6 +386,12 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
        return FALSE;
 }
 
+struct UConverter;
+struct UConverter *rspamd_get_utf8_converter (void);
+
+struct UNormalizer2;
+const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
+
 /**
  * Gets a string in UTF8 and normalises it to NFKC_Casefold form
  * @param pool optional memory pool used for logging purposes