From: Vsevolod Stakhov Date: Sat, 24 Nov 2018 16:26:01 +0000 (+0000) Subject: [Project] Add function to normalize unicode on per words basis X-Git-Tag: 1.8.3~58 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=abd5300a45ff290656926b61603a65e9621e090f;p=rspamd.git [Project] Add function to normalize unicode on per words basis --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 102117b21..e2651b63c 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -801,7 +801,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg) ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret)); ret->languages = g_ptr_array_sized_new (gl.gl_pathc); - ret->uchar_converter = ucnv_open ("UTF-8", &uc_err); + ret->uchar_converter = rspamd_get_utf8_converter (); ret->short_text_limit = short_text_limit; ret->stop_words_norm = kh_init (rspamd_stopwords_hash); diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index c30cbe3e3..269166344 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -40,11 +40,6 @@ #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) static rspamd_regexp_t *utf_compatible_re = NULL; -UConverter *utf8_converter = NULL; - -#if U_ICU_VERSION_MAJOR_NUM >= 44 -static const UNormalizer2 *norm = NULL; -#endif struct rspamd_charset_substitution { const gchar *input; @@ -101,36 +96,6 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) return conv; } -static inline void -rspamd_mime_utf8_conv_init (void) -{ - if (utf8_converter == NULL) { - UErrorCode uc_err = U_ZERO_ERROR; - - utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err); - - if (!U_SUCCESS (uc_err)) { - msg_err ("FATAL error: cannot open converter for utf8: %s", - u_errorName (uc_err)); - - g_assert_not_reached (); - } - - ucnv_setFromUCallBack (utf8_converter, - UCNV_FROM_U_CALLBACK_SUBSTITUTE, - NULL, - NULL, - NULL, - &uc_err); - ucnv_setToUCallBack (utf8_converter, - UCNV_TO_U_CALLBACK_SUBSTITUTE, - NULL, - NULL, - NULL, - &uc_err); - } -} - static void rspamd_mime_encoding_substitute_init (void) { @@ -224,10 +189,10 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, UChar *tmp_buf; UErrorCode uc_err = U_ZERO_ERROR; - UConverter *conv; + UConverter *conv, *utf8_converter; - rspamd_mime_utf8_conv_init (); conv = rspamd_mime_get_converter_cached (in_enc, &uc_err); + utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, @@ -282,8 +247,8 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task, { GByteArray *utf; UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter = rspamd_get_utf8_converter (); - rspamd_mime_utf8_conv_init (); utf = text_part->utf_raw_content; text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, sizeof (UChar), utf->len + 1); @@ -308,10 +273,7 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, UErrorCode uc_err = U_ZERO_ERROR; gint32 nsym, end; UChar *src = NULL, *dest = NULL; - - if (norm == NULL) { - norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); - } + const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); if (!text_part->unicode_raw_content) { return; @@ -367,8 +329,9 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task, UErrorCode uc_err = U_ZERO_ERROR; guint clen, dlen; gint r; + UConverter *utf8_converter; - rspamd_mime_utf8_conv_init (); + utf8_converter = rspamd_get_utf8_converter (); if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) && text_part->unicode_raw_content) { @@ -398,10 +361,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, gint32 r, clen, dlen; UErrorCode uc_err = U_ZERO_ERROR; - UConverter *conv; + UConverter *conv, *utf8_converter; - rspamd_mime_utf8_conv_init (); conv = rspamd_mime_get_converter_cached (charset, &uc_err); + utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, @@ -464,7 +427,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, gint32 r, clen, dlen; UChar *tmp_buf; UErrorCode uc_err = U_ZERO_ERROR; - UConverter *conv; + UConverter *conv, *utf8_converter; rspamd_ftok_t charset_tok; RSPAMD_FTOK_FROM_STR (&charset_tok, enc); @@ -477,7 +440,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, return TRUE; } - rspamd_mime_utf8_conv_init (); + utf8_converter = rspamd_get_utf8_converter (); conv = rspamd_mime_get_converter_cached (enc, &uc_err); if (conv == NULL) { @@ -763,6 +726,7 @@ void rspamd_utf_to_unicode (GByteArray *in, GArray *dest) { UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter = rspamd_get_utf8_converter (); g_array_set_size (dest, in->len + 1); dest->len = ucnv_toUChars (utf8_converter, diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 645e1f1aa..c046dd227 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -35,11 +35,12 @@ #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6) #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7) #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8) +#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9) typedef struct rspamd_stat_token_s { rspamd_ftok_t original; rspamd_ftok_unicode_t unicode; - rspamd_ftok_t normalised; + rspamd_ftok_t normalized; rspamd_ftok_t stemmed; guint flags; } rspamd_stat_token_t; diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 8664b9e19..247c24dbd 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -20,11 +20,17 @@ #include "rspamd.h" #include "tokenizers.h" #include "stat_internal.h" -#include "../../../contrib/mumhash/mum.h" +#include "contrib/mumhash/mum.h" + #include #include #include #include +#include +#if U_ICU_VERSION_MAJOR_NUM >= 44 +#include +#endif + #include typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, @@ -534,3 +540,129 @@ rspamd_tokenize_subject (struct rspamd_task *task) return words; } +void +rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool) +{ + rspamd_stat_token_t *tok; + guint i; + UErrorCode uc_err = U_ZERO_ERROR; + guint clen, dlen; + gint r; + UConverter *utf8_converter; +#if U_ICU_VERSION_MAJOR_NUM >= 44 + const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); + gint32 end; + UChar *src = NULL, *dest = NULL; +#endif + + utf8_converter = rspamd_get_utf8_converter (); + + for (i = 0; i < words->len; i++) { + tok = &g_array_index (words, rspamd_stat_token_t, i); + + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { + UChar *unicode; + gchar *utf8; + gsize ulen; + + uc_err = U_ZERO_ERROR; + ulen = tok->original.len; + unicode = rspamd_mempool_alloc (pool, sizeof (UChar) * (ulen + 1)); + ulen = ucnv_toUChars (utf8_converter, + unicode, + tok->original.len + 1, + tok->original.begin, + tok->original.len, + &uc_err); + + + if (!U_SUCCESS (uc_err)) { + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; + tok->unicode.begin = NULL; + tok->unicode.len = 0; + tok->normalized.begin = NULL; + tok->normalized.len = 0; + } + else { + /* Perform normalization if available and needed */ +#if U_ICU_VERSION_MAJOR_NUM >= 44 + /* We can now check if we need to decompose */ + end = unorm2_spanQuickCheckYes (norm, src, ulen, &uc_err); + + if (!U_SUCCESS (uc_err)) { + tok->unicode.begin = unicode; + tok->unicode.len = ulen; + tok->normalized.begin = NULL; + tok->normalized.len = 0; + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; + } + else { + if (end == ulen) { + /* Already normalised */ + tok->unicode.begin = unicode; + tok->unicode.len = ulen; + tok->normalized.begin = tok->original.begin; + tok->normalized.len = tok->original.len; + } + else { + /* Perform normalization */ + + dest = rspamd_mempool_alloc (pool, ulen * sizeof (UChar)); + /* First part */ + memcpy (dest, src, end * sizeof (*dest)); + /* Second part */ + ulen = unorm2_normalizeSecondAndAppend (norm, dest, end, + ulen, + src + end, ulen - end, &uc_err); + + if (!U_SUCCESS (uc_err)) { + if (uc_err != U_BUFFER_OVERFLOW_ERROR) { + msg_warn_pool_check ("cannot normalise text '%*s': %s", + (gint)tok->original.len, tok->original.begin, + u_errorName (uc_err)); + tok->unicode.begin = unicode; + tok->unicode.len = ulen; + tok->normalized.begin = NULL; + tok->normalized.len = 0; + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; + } + } + else { + /* Copy normalised back */ + tok->unicode.begin = dest; + tok->unicode.len = ulen; + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED; + + /* Convert utf8 to produce normalized part */ + clen = ucnv_getMaxCharSize (utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING (ulen, clen); + + utf8 = rspamd_mempool_alloc (pool, + sizeof (*utf8) * dlen + 1); + r = ucnv_fromUChars (utf8_converter, + utf8, + dlen, + dest, + ulen, + &uc_err); + utf8[r] = '\0'; + + tok->normalized.begin = utf8; + tok->normalized.len = r; + } + } + } +#else + /* Legacy libicu path */ + tok->unicode.begin = unicode; + tok->unicode.len = ulen; + tok->normalized.begin = tok->original.begin; + tok->normalized.len = tok->original.len; +#endif + } + } + } +} + +void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, + const gchar *language); \ No newline at end of file diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 668f08cdc..9a5561671 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -57,6 +57,10 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, struct rspamd_tokenizer_config *cf, gsize *len); +void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool); + +void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, + const gchar *language); GArray * rspamd_tokenize_subject (struct rspamd_task *task); #endif diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index be7323df3..d8b17e3c3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2237,25 +2237,71 @@ rspamd_memrchr (const void *m, gint c, gsize len) return NULL; } +struct UConverter * +rspamd_get_utf8_converter (void) +{ + static UConverter *utf8_conv = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + + if (utf8_conv == NULL) { + utf8_conv = ucnv_open ("UTF-8", &uc_err); + if (!U_SUCCESS (uc_err)) { + msg_err ("FATAL error: cannot open converter for utf8: %s", + u_errorName (uc_err)); + + g_assert_not_reached (); + } + + ucnv_setFromUCallBack (utf8_conv, + UCNV_FROM_U_CALLBACK_SUBSTITUTE, + NULL, + NULL, + NULL, + &uc_err); + ucnv_setToUCallBack (utf8_conv, + UCNV_TO_U_CALLBACK_SUBSTITUTE, + NULL, + NULL, + NULL, + &uc_err); + } + + return utf8_conv; +} + + +const struct UNormalizer2 * +rspamd_get_unicode_normalizer (void) +{ +#if U_ICU_VERSION_MAJOR_NUM >= 44 + UErrorCode uc_err = U_ZERO_ERROR; + static const UNormalizer2 *norm = NULL; + + if (norm == NULL) { + norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); + g_assert (U_SUCCESS (uc_err)); + } + + return norm; +#else + /* Old libicu */ + return NULL; +#endif +} + + gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, guint *len) { #if U_ICU_VERSION_MAJOR_NUM >= 44 UErrorCode uc_err = U_ZERO_ERROR; - static UConverter *utf8_conv = NULL; - static const UNormalizer2 *norm = NULL; + UConverter *utf8_conv = rspamd_get_utf8_converter (); + const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); gint32 nsym, end; UChar *src = NULL, *dest = NULL; gboolean ret = FALSE; - if (utf8_conv == NULL) { - utf8_conv = ucnv_open ("UTF-8", &uc_err); - g_assert (U_SUCCESS (uc_err)); - norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); - g_assert (U_SUCCESS (uc_err)); - } - /* We first need to convert data to UChars :( */ src = g_malloc ((*len + 1) * sizeof (*src)); nsym = ucnv_toUChars (utf8_conv, src, *len + 1, diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index ffcc69197..688034ec6 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -386,6 +386,12 @@ rspamd_str_has_8bit (const guchar *beg, gsize len) return FALSE; } +struct UConverter; +struct UConverter *rspamd_get_utf8_converter (void); + +struct UNormalizer2; +const struct UNormalizer2 *rspamd_get_unicode_normalizer (void); + /** * Gets a string in UTF8 and normalises it to NFKC_Casefold form * @param pool optional memory pool used for logging purposes