about summary refs log tree commit diff stats
path: root/src/libutil/str_util.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-24 16:26:01 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-24 16:26:01 +0000
commit abd5300a45ff290656926b61603a65e9621e090f (patch)
tree e3d350cca3ecbac3a41fcf96ad2a9dc5f9e48d75 /src/libutil/str_util.c
parent b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (diff)
download rspamd-abd5300a45ff290656926b61603a65e9621e090f.tar.gz
rspamd-abd5300a45ff290656926b61603a65e9621e090f.zip
[Project] Add function to normalize unicode on per words basis
Diffstat (limited to 'src/libutil/str_util.c')
-rw-r--r-- src/libutil/str_util.c 64
1 files changed, 55 insertions, 9 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index be7323df3..d8b17e3c3 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2237,25 +2237,71 @@ rspamd_memrchr (const void *m, gint c, gsize len)
return NULL;
}
+struct UConverter * /* Returns the "UTF-8" converter kept in a function-local static; it is opened on the first call and the same pointer is returned afterwards. */
+rspamd_get_utf8_converter (void)
+{
+ static UConverter *utf8_conv = NULL; /* cached handle: NULL until the first call assigns it below */
+ UErrorCode uc_err = U_ZERO_ERROR;
+
+ if (utf8_conv == NULL) {
+ utf8_conv = ucnv_open ("UTF-8", &uc_err);
+ if (!U_SUCCESS (uc_err)) {
+ msg_err ("FATAL error: cannot open converter for utf8: %s",
+ u_errorName (uc_err));
+
+ g_assert_not_reached (); /* open failure is treated as fatal: it is logged above, then the assertion fires */
+ }
+
+ ucnv_setFromUCallBack (utf8_conv, /* register the SUBSTITUTE callback for the from-Unicode direction */
+ UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+ NULL,
+ NULL,
+ NULL,
+ &uc_err); /* NOTE(review): uc_err is not inspected after this call */
+ ucnv_setToUCallBack (utf8_conv, /* register the SUBSTITUTE callback for the to-Unicode direction */
+ UCNV_TO_U_CALLBACK_SUBSTITUTE,
+ NULL,
+ NULL,
+ NULL,
+ &uc_err); /* NOTE(review): uc_err is not inspected after this call either */
+ }
+
+ return utf8_conv;
+}
+
+
+const struct UNormalizer2 * /* Returns the cached "nfkc" / UNORM2_COMPOSE normalizer instance, or NULL when U_ICU_VERSION_MAJOR_NUM is below 44. */
+rspamd_get_unicode_normalizer (void)
+{
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+ UErrorCode uc_err = U_ZERO_ERROR;
+ static const UNormalizer2 *norm = NULL; /* fetched on the first call and reused on later calls */
+
+ if (norm == NULL) {
+ norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+ g_assert (U_SUCCESS (uc_err)); /* no fallback path: an ICU error here trips the assertion */
+ }
+
+ return norm;
+#else
+ /* Old libicu */
+ return NULL; /* callers get NULL on this branch; no normalizer is available here */
+#endif
+}
+
+
gboolean
rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
guint *len)
{
#if U_ICU_VERSION_MAJOR_NUM >= 44
UErrorCode uc_err = U_ZERO_ERROR;
- static UConverter *utf8_conv = NULL;
- static const UNormalizer2 *norm = NULL;
+ UConverter *utf8_conv = rspamd_get_utf8_converter ();
+ const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
gint32 nsym, end;
UChar *src = NULL, *dest = NULL;
gboolean ret = FALSE;
- if (utf8_conv == NULL) {
- utf8_conv = ucnv_open ("UTF-8", &uc_err);
- g_assert (U_SUCCESS (uc_err));
- norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
- g_assert (U_SUCCESS (uc_err));
- }
-
/* We first need to convert data to UChars :( */
src = g_malloc ((*len + 1) * sizeof (*src));
nsym = ucnv_toUChars (utf8_conv, src, *len + 1,