diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 16:26:01 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 16:26:01 +0000 |
commit | abd5300a45ff290656926b61603a65e9621e090f (patch) | |
tree | e3d350cca3ecbac3a41fcf96ad2a9dc5f9e48d75 /src/libutil/str_util.c | |
parent | b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (diff) | |
download | rspamd-abd5300a45ff290656926b61603a65e9621e090f.tar.gz rspamd-abd5300a45ff290656926b61603a65e9621e090f.zip |
[Project] Add function to normalize unicode on per words basis
Diffstat (limited to 'src/libutil/str_util.c')
-rw-r--r-- | src/libutil/str_util.c | 64 |
1 files changed, 55 insertions, 9 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index be7323df3..d8b17e3c3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2237,25 +2237,71 @@ rspamd_memrchr (const void *m, gint c, gsize len) return NULL; } +struct UConverter * +rspamd_get_utf8_converter (void) +{ + static UConverter *utf8_conv = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + + if (utf8_conv == NULL) { + utf8_conv = ucnv_open ("UTF-8", &uc_err); + if (!U_SUCCESS (uc_err)) { + msg_err ("FATAL error: cannot open converter for utf8: %s", + u_errorName (uc_err)); + + g_assert_not_reached (); + } + + ucnv_setFromUCallBack (utf8_conv, + UCNV_FROM_U_CALLBACK_SUBSTITUTE, + NULL, + NULL, + NULL, + &uc_err); + ucnv_setToUCallBack (utf8_conv, + UCNV_TO_U_CALLBACK_SUBSTITUTE, + NULL, + NULL, + NULL, + &uc_err); + } + + return utf8_conv; +} + + +const struct UNormalizer2 * +rspamd_get_unicode_normalizer (void) +{ +#if U_ICU_VERSION_MAJOR_NUM >= 44 + UErrorCode uc_err = U_ZERO_ERROR; + static const UNormalizer2 *norm = NULL; + + if (norm == NULL) { + norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); + g_assert (U_SUCCESS (uc_err)); + } + + return norm; +#else + /* Old libicu */ + return NULL; +#endif +} + + gboolean rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, guint *len) { #if U_ICU_VERSION_MAJOR_NUM >= 44 UErrorCode uc_err = U_ZERO_ERROR; - static UConverter *utf8_conv = NULL; - static const UNormalizer2 *norm = NULL; + UConverter *utf8_conv = rspamd_get_utf8_converter (); + const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); gint32 nsym, end; UChar *src = NULL, *dest = NULL; gboolean ret = FALSE; - if (utf8_conv == NULL) { - utf8_conv = ucnv_open ("UTF-8", &uc_err); - g_assert (U_SUCCESS (uc_err)); - norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); - g_assert (U_SUCCESS (uc_err)); - } - /* We first need to convert data to UChars :( */ src = g_malloc ((*len + 1) * sizeof (*src)); nsym = ucnv_toUChars (utf8_conv, src, *len + 1, |