summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-24 16:26:01 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-24 16:26:01 +0000
commit: abd5300a45ff290656926b61603a65e9621e090f (patch)
tree: e3d350cca3ecbac3a41fcf96ad2a9dc5f9e48d75 /src
parent: b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (diff)
download: rspamd-abd5300a45ff290656926b61603a65e9621e090f.tar.gz
download: rspamd-abd5300a45ff290656926b61603a65e9621e090f.zip
[Project] Add function to normalize unicode on per words basis
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/lang_detection.c 2
-rw-r--r-- src/libmime/mime_encoding.c 58
-rw-r--r-- src/libstat/stat_api.h 3
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 134
-rw-r--r-- src/libstat/tokenizers/tokenizers.h 4
-rw-r--r-- src/libutil/str_util.c 64
-rw-r--r-- src/libutil/str_util.h 6
7 files changed, 212 insertions, 59 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 102117b21..e2651b63c 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -801,7 +801,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret));
ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
- ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
+ ret->uchar_converter = rspamd_get_utf8_converter ();
ret->short_text_limit = short_text_limit;
ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index c30cbe3e3..269166344 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -40,11 +40,6 @@
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
static rspamd_regexp_t *utf_compatible_re = NULL;
-UConverter *utf8_converter = NULL;
-
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-static const UNormalizer2 *norm = NULL;
-#endif
struct rspamd_charset_substitution {
const gchar *input;
@@ -101,36 +96,6 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
return conv;
}
-static inline void
-rspamd_mime_utf8_conv_init (void)
-{
- if (utf8_converter == NULL) {
- UErrorCode uc_err = U_ZERO_ERROR;
-
- utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- msg_err ("FATAL error: cannot open converter for utf8: %s",
- u_errorName (uc_err));
-
- g_assert_not_reached ();
- }
-
- ucnv_setFromUCallBack (utf8_converter,
- UCNV_FROM_U_CALLBACK_SUBSTITUTE,
- NULL,
- NULL,
- NULL,
- &uc_err);
- ucnv_setToUCallBack (utf8_converter,
- UCNV_TO_U_CALLBACK_SUBSTITUTE,
- NULL,
- NULL,
- NULL,
- &uc_err);
- }
-}
-
static void
rspamd_mime_encoding_substitute_init (void)
{
@@ -224,10 +189,10 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
UChar *tmp_buf;
UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *conv;
+ UConverter *conv, *utf8_converter;
- rspamd_mime_utf8_conv_init ();
conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter ();
if (conv == NULL) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -282,8 +247,8 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
{
GByteArray *utf;
UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *utf8_converter = rspamd_get_utf8_converter ();
- rspamd_mime_utf8_conv_init ();
utf = text_part->utf_raw_content;
text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
sizeof (UChar), utf->len + 1);
@@ -308,10 +273,7 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
UErrorCode uc_err = U_ZERO_ERROR;
gint32 nsym, end;
UChar *src = NULL, *dest = NULL;
-
- if (norm == NULL) {
- norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
- }
+ const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
if (!text_part->unicode_raw_content) {
return;
@@ -367,8 +329,9 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
UErrorCode uc_err = U_ZERO_ERROR;
guint clen, dlen;
gint r;
+ UConverter *utf8_converter;
- rspamd_mime_utf8_conv_init ();
+ utf8_converter = rspamd_get_utf8_converter ();
if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
text_part->unicode_raw_content) {
@@ -398,10 +361,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
gint32 r, clen, dlen;
UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *conv;
+ UConverter *conv, *utf8_converter;
- rspamd_mime_utf8_conv_init ();
conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter ();
if (conv == NULL) {
g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -464,7 +427,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
gint32 r, clen, dlen;
UChar *tmp_buf;
UErrorCode uc_err = U_ZERO_ERROR;
- UConverter *conv;
+ UConverter *conv, *utf8_converter;
rspamd_ftok_t charset_tok;
RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
@@ -477,7 +440,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
return TRUE;
}
- rspamd_mime_utf8_conv_init ();
+ utf8_converter = rspamd_get_utf8_converter ();
conv = rspamd_mime_get_converter_cached (enc, &uc_err);
if (conv == NULL) {
@@ -763,6 +726,7 @@ void
rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
{
UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *utf8_converter = rspamd_get_utf8_converter ();
g_array_set_size (dest, in->len + 1);
dest->len = ucnv_toUChars (utf8_converter,
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 645e1f1aa..c046dd227 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -35,11 +35,12 @@
#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
typedef struct rspamd_stat_token_s {
rspamd_ftok_t original;
rspamd_ftok_unicode_t unicode;
- rspamd_ftok_t normalised;
+ rspamd_ftok_t normalized;
rspamd_ftok_t stemmed;
guint flags;
} rspamd_stat_token_t;
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 8664b9e19..247c24dbd 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -20,11 +20,17 @@
#include "rspamd.h"
#include "tokenizers.h"
#include "stat_internal.h"
-#include "../../../contrib/mumhash/mum.h"
+#include "contrib/mumhash/mum.h"
+
#include <unicode/utf8.h>
#include <unicode/uchar.h>
#include <unicode/uiter.h>
#include <unicode/ubrk.h>
+#include <unicode/ucnv.h>
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+#include <unicode/unorm2.h>
+#endif
+
#include <math.h>
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
@@ -534,3 +540,129 @@ rspamd_tokenize_subject (struct rspamd_task *task)
return words;
}
+/*
+ * Marks a token as carrying unusable unicode data: stores whatever UTF-16
+ * text is available (may be NULL/0), clears the normalized form and raises
+ * RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE.
+ */
+static void
+rspamd_normalize_word_set_broken (rspamd_stat_token_t *tok,
+		UChar *unicode, gsize ulen)
+{
+	tok->unicode.begin = unicode;
+	tok->unicode.len = ulen;
+	tok->normalized.begin = NULL;
+	tok->normalized.len = 0;
+	tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+}
+
+/*
+ * Fills the `unicode` and `normalized` members of every token in `words`
+ * that has RSPAMD_STAT_TOKEN_FLAG_UTF set; other tokens are left untouched.
+ *
+ * For each such token the original UTF-8 bytes are converted to UChars.
+ * With ICU >= 44 the text is then checked against the shared normalizer:
+ *  - already normalized text: `normalized` aliases `original`;
+ *  - otherwise a normalized copy is produced, converted back to UTF-8,
+ *    stored in `normalized`, and RSPAMD_STAT_TOKEN_FLAG_NORMALISED is set.
+ * With older ICU no normalization is done and `normalized` aliases
+ * `original`.
+ *
+ * Any conversion/normalization failure sets
+ * RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE and leaves `normalized` empty.
+ *
+ * @param words array of rspamd_stat_token_t, modified in place
+ * @param pool memory pool that owns every buffer allocated here
+ */
+void
+rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
+{
+	rspamd_stat_token_t *tok;
+	guint i;
+	UErrorCode uc_err;
+	UConverter *utf8_converter = rspamd_get_utf8_converter ();
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+	const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
+#endif
+
+	for (i = 0; i < words->len; i++) {
+		UChar *unicode;
+		gint32 ulen;
+
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+			continue;
+		}
+
+		uc_err = U_ZERO_ERROR;
+		/* UTF-16 never needs more code units than UTF-8 has bytes */
+		unicode = rspamd_mempool_alloc (pool,
+				sizeof (UChar) * (tok->original.len + 1));
+		ulen = ucnv_toUChars (utf8_converter,
+				unicode,
+				tok->original.len + 1,
+				tok->original.begin,
+				tok->original.len,
+				&uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			rspamd_normalize_word_set_broken (tok, NULL, 0);
+			continue;
+		}
+
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+		{
+			UChar *dest;
+			gchar *utf8;
+			gint32 end, nlen, dcap;
+			guint clen, dlen;
+			gint r;
+
+			/* Length of the prefix that is already in normal form */
+			end = unorm2_spanQuickCheckYes (norm, unicode, ulen, &uc_err);
+
+			if (!U_SUCCESS (uc_err)) {
+				rspamd_normalize_word_set_broken (tok, unicode, ulen);
+				continue;
+			}
+
+			if (end == ulen) {
+				/* Already normalised */
+				tok->unicode.begin = unicode;
+				tok->unicode.len = ulen;
+				tok->normalized.begin = tok->original.begin;
+				tok->normalized.len = tok->original.len;
+				continue;
+			}
+
+			/* Keep the normalized prefix, normalize and append the rest */
+			dcap = ulen;
+			dest = rspamd_mempool_alloc (pool, dcap * sizeof (UChar));
+			memcpy (dest, unicode, end * sizeof (*dest));
+			nlen = unorm2_normalizeSecondAndAppend (norm, dest, end,
+					dcap,
+					unicode + end, ulen - end, &uc_err);
+
+			if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+				/*
+				 * Normalization may expand the text; the call has
+				 * reported the required size, so retry from a fresh
+				 * copy of the prefix with enough room.
+				 */
+				uc_err = U_ZERO_ERROR;
+				dcap = nlen;
+				dest = rspamd_mempool_alloc (pool, dcap * sizeof (UChar));
+				memcpy (dest, unicode, end * sizeof (*dest));
+				nlen = unorm2_normalizeSecondAndAppend (norm, dest, end,
+						dcap,
+						unicode + end, ulen - end, &uc_err);
+			}
+
+			if (!U_SUCCESS (uc_err)) {
+				msg_warn_pool_check ("cannot normalise text '%*s': %s",
+						(gint)tok->original.len, tok->original.begin,
+						u_errorName (uc_err));
+				rspamd_normalize_word_set_broken (tok, unicode, ulen);
+				continue;
+			}
+
+			/* Convert back to utf8 to produce the normalized part */
+			clen = ucnv_getMaxCharSize (utf8_converter);
+			dlen = UCNV_GET_MAX_BYTES_FOR_STRING (nlen, clen);
+			utf8 = rspamd_mempool_alloc (pool, sizeof (*utf8) * dlen + 1);
+			r = ucnv_fromUChars (utf8_converter,
+					utf8,
+					dlen,
+					dest,
+					nlen,
+					&uc_err);
+
+			if (!U_SUCCESS (uc_err)) {
+				/* Do not trust `r` as an index into utf8 on failure */
+				rspamd_normalize_word_set_broken (tok, unicode, ulen);
+				continue;
+			}
+
+			utf8[r] = '\0';
+
+			tok->unicode.begin = dest;
+			tok->unicode.len = nlen;
+			tok->normalized.begin = utf8;
+			tok->normalized.len = r;
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
+		}
+#else
+		/* Legacy libicu path */
+		tok->unicode.begin = unicode;
+		tok->unicode.len = ulen;
+		tok->normalized.begin = tok->original.begin;
+		tok->normalized.len = tok->original.len;
+#endif
+	}
+}
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+ const gchar *language);
\ No newline at end of file
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 668f08cdc..9a5561671 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -57,6 +57,10 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,
gsize *len);
+/**
+ * Fills the unicode and normalized members of the tokens in `words` that
+ * are flagged RSPAMD_STAT_TOKEN_FLAG_UTF; tokens whose text cannot be
+ * processed get RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE.
+ * @param words array of rspamd_stat_token_t, modified in place
+ * @param pool memory pool used for the buffers attached to the tokens
+ */
+void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
+
+/**
+ * NOTE(review): only declared in this change, no implementation is visible;
+ * presumably fills the stemmed member of each token for the given
+ * language - confirm once the definition lands.
+ */
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+ const gchar *language);
GArray * rspamd_tokenize_subject (struct rspamd_task *task);
#endif
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index be7323df3..d8b17e3c3 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2237,25 +2237,71 @@ rspamd_memrchr (const void *m, gint c, gsize len)
return NULL;
}
+/*
+ * Returns the process-wide UTF-8 converter, opening it on first use.
+ * Both conversion directions are configured to substitute characters
+ * that cannot be converted. Failure to open the converter is fatal.
+ */
+struct UConverter *
+rspamd_get_utf8_converter (void)
+{
+	static UConverter *cached_conv = NULL;
+	UErrorCode status = U_ZERO_ERROR;
+
+	if (cached_conv != NULL) {
+		/* Already opened by an earlier call */
+		return cached_conv;
+	}
+
+	cached_conv = ucnv_open ("UTF-8", &status);
+
+	if (U_FAILURE (status)) {
+		msg_err ("FATAL error: cannot open converter for utf8: %s",
+				u_errorName (status));
+
+		g_assert_not_reached ();
+	}
+
+	ucnv_setFromUCallBack (cached_conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+			NULL, NULL, NULL, &status);
+	ucnv_setToUCallBack (cached_conv, UCNV_TO_U_CALLBACK_SUBSTITUTE,
+			NULL, NULL, NULL, &status);
+
+	return cached_conv;
+}
+
+
+/*
+ * Returns the shared "nfkc" compose-mode normalizer, fetching it from ICU
+ * on first use. When built against ICU older than 44 there is no
+ * normalizer and NULL is returned.
+ */
+const struct UNormalizer2 *
+rspamd_get_unicode_normalizer (void)
+{
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+	static const UNormalizer2 *cached_norm = NULL;
+	UErrorCode status = U_ZERO_ERROR;
+
+	if (cached_norm != NULL) {
+		return cached_norm;
+	}
+
+	cached_norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &status);
+	g_assert (U_SUCCESS (status));
+
+	return cached_norm;
+#else
+	/* Old libicu */
+	return NULL;
+#endif
+}
+
+
gboolean
rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start,
guint *len)
{
#if U_ICU_VERSION_MAJOR_NUM >= 44
UErrorCode uc_err = U_ZERO_ERROR;
- static UConverter *utf8_conv = NULL;
- static const UNormalizer2 *norm = NULL;
+ UConverter *utf8_conv = rspamd_get_utf8_converter ();
+ const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
gint32 nsym, end;
UChar *src = NULL, *dest = NULL;
gboolean ret = FALSE;
- if (utf8_conv == NULL) {
- utf8_conv = ucnv_open ("UTF-8", &uc_err);
- g_assert (U_SUCCESS (uc_err));
- norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
- g_assert (U_SUCCESS (uc_err));
- }
-
/* We first need to convert data to UChars :( */
src = g_malloc ((*len + 1) * sizeof (*src));
nsym = ucnv_toUChars (utf8_conv, src, *len + 1,
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index ffcc69197..688034ec6 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -386,6 +386,12 @@ rspamd_str_has_8bit (const guchar *beg, gsize len)
return FALSE;
}
+/* Forward declaration so that users of this header need no ICU includes */
+struct UConverter;
+/**
+ * Returns the shared UTF-8 converter (opened on the first call)
+ * @return converter object; the same pointer is returned on every call
+ */
+struct UConverter *rspamd_get_utf8_converter (void);
+
+/* Forward declaration so that users of this header need no ICU includes */
+struct UNormalizer2;
+/**
+ * Returns the shared "nfkc" unicode normalizer (obtained on the first call)
+ * @return normalizer object or NULL when built against ICU older than 44
+ */
+const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
+
/**
* Gets a string in UTF8 and normalises it to NFKC_Casefold form
* @param pool optional memory pool used for logging purposes