diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 16:26:01 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 16:26:01 +0000 |
commit | abd5300a45ff290656926b61603a65e9621e090f (patch) | |
tree | e3d350cca3ecbac3a41fcf96ad2a9dc5f9e48d75 /src/libmime/mime_encoding.c | |
parent | b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (diff) | |
download | rspamd-abd5300a45ff290656926b61603a65e9621e090f.tar.gz rspamd-abd5300a45ff290656926b61603a65e9621e090f.zip |
[Project] Add function to normalize unicode on per words basis
Diffstat (limited to 'src/libmime/mime_encoding.c')
-rw-r--r-- | src/libmime/mime_encoding.c | 58 |
1 files changed, 11 insertions, 47 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index c30cbe3e3..269166344 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -40,11 +40,6 @@ #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) static rspamd_regexp_t *utf_compatible_re = NULL; -UConverter *utf8_converter = NULL; - -#if U_ICU_VERSION_MAJOR_NUM >= 44 -static const UNormalizer2 *norm = NULL; -#endif struct rspamd_charset_substitution { const gchar *input; @@ -101,36 +96,6 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) return conv; } -static inline void -rspamd_mime_utf8_conv_init (void) -{ - if (utf8_converter == NULL) { - UErrorCode uc_err = U_ZERO_ERROR; - - utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err); - - if (!U_SUCCESS (uc_err)) { - msg_err ("FATAL error: cannot open converter for utf8: %s", - u_errorName (uc_err)); - - g_assert_not_reached (); - } - - ucnv_setFromUCallBack (utf8_converter, - UCNV_FROM_U_CALLBACK_SUBSTITUTE, - NULL, - NULL, - NULL, - &uc_err); - ucnv_setToUCallBack (utf8_converter, - UCNV_TO_U_CALLBACK_SUBSTITUTE, - NULL, - NULL, - NULL, - &uc_err); - } -} - static void rspamd_mime_encoding_substitute_init (void) { @@ -224,10 +189,10 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, UChar *tmp_buf; UErrorCode uc_err = U_ZERO_ERROR; - UConverter *conv; + UConverter *conv, *utf8_converter; - rspamd_mime_utf8_conv_init (); conv = rspamd_mime_get_converter_cached (in_enc, &uc_err); + utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, @@ -282,8 +247,8 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task, { GByteArray *utf; UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter = rspamd_get_utf8_converter (); - rspamd_mime_utf8_conv_init (); utf = text_part->utf_raw_content; text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, sizeof (UChar), utf->len + 1); @@ -308,10 +273,7 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, UErrorCode uc_err = U_ZERO_ERROR; gint32 nsym, end; UChar *src = NULL, *dest = NULL; - - if (norm == NULL) { - norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); - } + const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); if (!text_part->unicode_raw_content) { return; @@ -367,8 +329,9 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task, UErrorCode uc_err = U_ZERO_ERROR; guint clen, dlen; gint r; + UConverter *utf8_converter; - rspamd_mime_utf8_conv_init (); + utf8_converter = rspamd_get_utf8_converter (); if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) && text_part->unicode_raw_content) { @@ -398,10 +361,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, gint32 r, clen, dlen; UErrorCode uc_err = U_ZERO_ERROR; - UConverter *conv; + UConverter *conv, *utf8_converter; - rspamd_mime_utf8_conv_init (); conv = rspamd_mime_get_converter_cached (charset, &uc_err); + utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, @@ -464,7 +427,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, gint32 r, clen, dlen; UChar *tmp_buf; UErrorCode uc_err = U_ZERO_ERROR; - UConverter *conv; + UConverter *conv, *utf8_converter; rspamd_ftok_t charset_tok; RSPAMD_FTOK_FROM_STR (&charset_tok, enc); @@ -477,7 +440,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, return TRUE; } - rspamd_mime_utf8_conv_init (); + utf8_converter = rspamd_get_utf8_converter (); conv = rspamd_mime_get_converter_cached (enc, &uc_err); if (conv == NULL) { @@ -763,6 +726,7 @@ void rspamd_utf_to_unicode (GByteArray *in, GArray *dest) { UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter = rspamd_get_utf8_converter (); g_array_set_size (dest, in->len + 1); dest->len = ucnv_toUChars (utf8_converter, |