diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 16:26:01 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 16:26:01 +0000 |
commit | abd5300a45ff290656926b61603a65e9621e090f (patch) | |
tree | e3d350cca3ecbac3a41fcf96ad2a9dc5f9e48d75 /src/libstat | |
parent | b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (diff) | |
download | rspamd-abd5300a45ff290656926b61603a65e9621e090f.tar.gz rspamd-abd5300a45ff290656926b61603a65e9621e090f.zip |
[Project] Add function to normalize unicode on per words basis
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/stat_api.h | 3 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 134 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 4 |
3 files changed, 139 insertions, 2 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index 645e1f1aa..c046dd227 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -35,11 +35,12 @@
 #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
 #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
 #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 
 typedef struct rspamd_stat_token_s {
 	rspamd_ftok_t original;
 	rspamd_ftok_unicode_t unicode;
-	rspamd_ftok_t normalised;
+	rspamd_ftok_t normalized;
 	rspamd_ftok_t stemmed;
 	guint flags;
 } rspamd_stat_token_t;
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 8664b9e19..247c24dbd 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -20,11 +20,17 @@
 #include "rspamd.h"
 #include "tokenizers.h"
 #include "stat_internal.h"
-#include "../../../contrib/mumhash/mum.h"
+#include "contrib/mumhash/mum.h"
+
 #include <unicode/utf8.h>
 #include <unicode/uchar.h>
 #include <unicode/uiter.h>
 #include <unicode/ubrk.h>
+#include <unicode/ucnv.h>
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+#include <unicode/unorm2.h>
+#endif
+
 #include <math.h>
 
 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
@@ -534,3 +540,112 @@ rspamd_tokenize_subject (struct rspamd_task *task)
 
 	return words;
 }
+/*
+ * Fill the `unicode` (UTF-16) and `normalized` (UTF-8) views of every token
+ * in `words` that has RSPAMD_STAT_TOKEN_FLAG_UTF set; new buffers are taken
+ * from `pool`. Tokens that cannot be decoded or normalised are marked with
+ * RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE and get an empty `normalized` view.
+ */
+void
+rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
+{
+	rspamd_stat_token_t *tok;
+	guint i;
+	UErrorCode uc_err;
+	UConverter *utf8_converter = rspamd_get_utf8_converter ();
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+	const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
+	gint32 end, cap, nlen, dlen, r;
+	UChar *dest;
+	gchar *utf8;
+#endif
+
+	for (i = 0; i < words->len; i++) {
+		UChar *unicode;
+		gint32 ulen;
+
+		tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+		if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+			continue;
+		}
+
+		uc_err = U_ZERO_ERROR;
+		unicode = rspamd_mempool_alloc (pool,
+				sizeof (UChar) * (tok->original.len + 1));
+		ulen = ucnv_toUChars (utf8_converter,
+				unicode,
+				tok->original.len + 1,
+				tok->original.begin,
+				tok->original.len,
+				&uc_err);
+
+		if (!U_SUCCESS (uc_err)) {
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+			tok->unicode.begin = NULL;
+			tok->unicode.len = 0;
+			tok->normalized.begin = NULL;
+			tok->normalized.len = 0;
+			continue;
+		}
+
+		/* Until proven otherwise the token is already in normal form */
+		tok->unicode.begin = unicode;
+		tok->unicode.len = ulen;
+		tok->normalized.begin = tok->original.begin;
+		tok->normalized.len = tok->original.len;
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+		/* `end` is the length of the prefix that is already normalised */
+		end = unorm2_spanQuickCheckYes (norm, unicode, ulen, &uc_err);
+
+		if (U_SUCCESS (uc_err) && end < ulen) {
+			/* Keep the prefix, normalise the tail and append it */
+			cap = ulen + 1;
+			dest = rspamd_mempool_alloc (pool, cap * sizeof (UChar));
+			memcpy (dest, unicode, end * sizeof (*dest));
+			nlen = unorm2_normalizeSecondAndAppend (norm, dest, end, cap,
+					unicode + end, ulen - end, &uc_err);
+
+			if (uc_err == U_BUFFER_OVERFLOW_ERROR) {
+				/* The text grew: retry once with the size ICU asked for */
+				uc_err = U_ZERO_ERROR;
+				cap = nlen + 1;
+				dest = rspamd_mempool_alloc (pool, cap * sizeof (UChar));
+				memcpy (dest, unicode, end * sizeof (*dest));
+				nlen = unorm2_normalizeSecondAndAppend (norm, dest, end, cap,
+						unicode + end, ulen - end, &uc_err);
+			}
+
+			if (U_SUCCESS (uc_err)) {
+				/* Convert the normalised UTF-16 back to UTF-8 */
+				dlen = UCNV_GET_MAX_BYTES_FOR_STRING (nlen,
+						ucnv_getMaxCharSize (utf8_converter));
+				utf8 = rspamd_mempool_alloc (pool, dlen + 1);
+				r = ucnv_fromUChars (utf8_converter, utf8, dlen,
+						dest, nlen, &uc_err);
+
+				if (U_SUCCESS (uc_err)) {
+					utf8[r] = '\0';
+					tok->unicode.begin = dest;
+					tok->unicode.len = nlen;
+					tok->normalized.begin = utf8;
+					tok->normalized.len = r;
+					tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
+				}
+			}
+		}
+
+		if (!U_SUCCESS (uc_err)) {
+			msg_warn_pool_check ("cannot normalise text '%*s': %s",
+					(gint)tok->original.len, tok->original.begin,
+					u_errorName (uc_err));
+			tok->normalized.begin = NULL;
+			tok->normalized.len = 0;
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+		}
+#endif
+	}
+}
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+		const gchar *language);
\ No newline at end of file
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 668f08cdc..9a5561671 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -57,6 +57,10 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
 		struct rspamd_tokenizer_config *cf,
 		gsize *len);
 
+void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool);
+
+void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
+		const gchar *language);
 GArray * rspamd_tokenize_subject (struct rspamd_task *task);
 
 #endif