diff options
| field | value | date |
|---|---|---|
| author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 16:33:33 +0000 |
| committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 16:33:33 +0000 |
| commit | 8adf20f620fa9737666044de7c712eac2174b1c4 (patch) | |
| tree | dae0d228d312e67f11468118c39b21ded046f530 /src/libstat | |
| parent | 0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c (diff) | |
| download | rspamd-8adf20f620fa9737666044de7c712eac2174b1c4.tar.gz rspamd-8adf20f620fa9737666044de7c712eac2174b1c4.zip | |
[Project] Another try to normalize unicode properly
Diffstat (limited to 'src/libstat')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/libstat/stat_api.h | 9 |
| -rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 245 |
| -rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 1 |

3 files changed, 142 insertions, 113 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 8ab3332b9..b912f8d20 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -37,12 +37,13 @@ #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8) #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9) #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9) +#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10) typedef struct rspamd_stat_token_s { - rspamd_ftok_t original; - rspamd_ftok_unicode_t unicode; - rspamd_ftok_t normalized; - rspamd_ftok_t stemmed; + rspamd_ftok_t original; /* utf8 raw */ + rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */ + rspamd_ftok_t normalized; /* normalized and lowercased utf8 */ + rspamd_ftok_t stemmed; /* stemmed utf8 */ guint flags; } rspamd_stat_token_t; diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index d27d9bc58..32d9ba0df 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -447,7 +447,7 @@ start_over: if (!decay) { decay = TRUE; } else { - token.original.len = 0; + token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED; } } } @@ -541,131 +541,165 @@ rspamd_tokenize_subject (struct rspamd_task *task) return words; } +static inline void +rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, + rspamd_stat_token_t *tok, + rspamd_mempool_t *pool) +{ + UChar32 *dest, t, *d; + gint32 i = 0; + + dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32)); + d = dest; + + while (i < srclen) { + U16_NEXT_UNSAFE (src, i, t); + *d++ = u_tolower (t); + } + + tok->unicode.begin = dest; + tok->unicode.len = d - dest; +} + +static inline void +rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok, + rspamd_mempool_t *pool) +{ + guint i, doff = 0; + gsize utflen = 0; + gchar *dest; + UChar32 t; + + for (i = 0; i < tok->unicode.len; i ++) { + utflen += U8_LENGTH (tok->unicode.begin[i]); + } + + dest = rspamd_mempool_alloc (pool, utflen + 1); + + for (i = 0; i < 
tok->unicode.len; i ++) { + t = tok->unicode.begin[i]; + U8_APPEND_UNSAFE (dest, doff, t); + } + + g_assert (doff <= utflen); + dest[doff] = '\0'; + + tok->normalized.len = doff; + tok->normalized.begin = dest; +} + void -rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool) +rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool) { - rspamd_stat_token_t *tok; - guint i; UErrorCode uc_err = U_ZERO_ERROR; - guint clen, dlen; - gint r; UConverter *utf8_converter; -#if U_ICU_VERSION_MAJOR_NUM >= 44 - const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); - gint32 end; - UChar *src = NULL, *dest = NULL; -#endif + UChar tmpbuf[1024]; /* Assume that we have no longer words... */ + gsize ulen; utf8_converter = rspamd_get_utf8_converter (); - for (i = 0; i < words->len; i++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); - - if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - UChar *unicode; - gchar *utf8; - gsize ulen; - - uc_err = U_ZERO_ERROR; - ulen = tok->original.len; - unicode = rspamd_mempool_alloc (pool, sizeof (UChar) * (ulen + 1)); - ulen = ucnv_toUChars (utf8_converter, - unicode, - tok->original.len + 1, - tok->original.begin, - tok->original.len, - &uc_err); + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { + ulen = ucnv_toUChars (utf8_converter, + tmpbuf, + G_N_ELEMENTS (tmpbuf), + tok->original.begin, + tok->original.len, + &uc_err); + + /* Now, we need to understand if we need to normalise the word */ + if (!U_SUCCESS (uc_err)) { + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; + tok->unicode.begin = NULL; + tok->unicode.len = 0; + tok->normalized.begin = NULL; + tok->normalized.len = 0; + } + else { +#if U_ICU_VERSION_MAJOR_NUM >= 44 + const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); + gint32 end; + /* We can now check if we need to decompose */ + end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err); if (!U_SUCCESS (uc_err)) { - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; - 
tok->unicode.begin = NULL; - tok->unicode.len = 0; + rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); tok->normalized.begin = NULL; tok->normalized.len = 0; + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; } else { - /* Perform normalization if available and needed */ -#if U_ICU_VERSION_MAJOR_NUM >= 44 - /* We can now check if we need to decompose */ - end = unorm2_spanQuickCheckYes (norm, src, ulen, &uc_err); - - if (!U_SUCCESS (uc_err)) { - tok->unicode.begin = unicode; - tok->unicode.len = ulen; - tok->normalized.begin = NULL; - tok->normalized.len = 0; - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; + if (end == ulen) { + /* Already normalised, just lowercase */ + rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); + rspamd_ucs32_to_normalised (tok, pool); } else { - if (end == ulen) { - /* Already normalised */ - tok->unicode.begin = unicode; - tok->unicode.len = ulen; - tok->normalized.begin = tok->original.begin; - tok->normalized.len = tok->original.len; + /* Perform normalization */ + UChar normbuf[1024]; + + g_assert (end < G_N_ELEMENTS (normbuf)); + /* First part */ + memcpy (normbuf, tmpbuf, end * sizeof (UChar)); + /* Second part */ + ulen = unorm2_normalizeSecondAndAppend (norm, + normbuf, end, + G_N_ELEMENTS (normbuf), + tmpbuf + end, + ulen - end, + &uc_err); + + if (!U_SUCCESS (uc_err)) { + if (uc_err != U_BUFFER_OVERFLOW_ERROR) { + msg_warn_pool_check ("cannot normalise text '%*s': %s", + (gint)tok->original.len, tok->original.begin, + u_errorName (uc_err)); + rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); + rspamd_ucs32_to_normalised (tok, pool); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; + } } else { - /* Perform normalization */ - - dest = rspamd_mempool_alloc (pool, ulen * sizeof (UChar)); - /* First part */ - memcpy (dest, src, end * sizeof (*dest)); - /* Second part */ - ulen = unorm2_normalizeSecondAndAppend (norm, dest, end, - ulen, - src + end, ulen - end, &uc_err); - - if (!U_SUCCESS (uc_err)) { - if (uc_err != 
U_BUFFER_OVERFLOW_ERROR) { - msg_warn_pool_check ("cannot normalise text '%*s': %s", - (gint)tok->original.len, tok->original.begin, - u_errorName (uc_err)); - tok->unicode.begin = unicode; - tok->unicode.len = ulen; - tok->normalized.begin = NULL; - tok->normalized.len = 0; - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE; - } - } - else { - /* Copy normalised back */ - tok->unicode.begin = dest; - tok->unicode.len = ulen; - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED; - - /* Convert utf8 to produce normalized part */ - clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (ulen, clen); - - utf8 = rspamd_mempool_alloc (pool, - sizeof (*utf8) * dlen + 1); - r = ucnv_fromUChars (utf8_converter, - utf8, - dlen, - dest, - ulen, - &uc_err); - utf8[r] = '\0'; - - tok->normalized.begin = utf8; - tok->normalized.len = r; - } + /* Copy normalised back */ + rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED; + rspamd_ucs32_to_normalised (tok, pool); } } + } #else - /* Legacy libicu path */ - tok->unicode.begin = unicode; - tok->unicode.len = ulen; - tok->normalized.begin = tok->original.begin; - tok->normalized.len = tok->original.len; + /* Legacy version with no unorm2 interface */ + rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool); + rspamd_ucs32_to_normalised (tok, pool); #endif - } + } + } + else { + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + /* Simple lowercase */ + gchar *dest; + + dest = rspamd_mempool_alloc (pool, tok->original.len + 1); + rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1); + rspamd_str_lc (dest, tok->original.len); + tok->normalized.len = tok->original.len; } } } void +rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool) +{ + rspamd_stat_token_t *tok; + guint i; + + for (i = 0; i < words->len; i++) { + tok = &g_array_index (words, rspamd_stat_token_t, i); + rspamd_normalize_single_word (tok, pool); + } +} + +void 
rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, const gchar *language, struct rspamd_lang_detector *d) @@ -736,12 +770,8 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, } } else { - /* No stemmer, utf8 lowercase */ - dest = rspamd_mempool_alloc (pool, tok->normalized.len); - memcpy (dest, tok->normalized.begin, tok->normalized.len); - rspamd_str_lc_utf8 (dest, tok->normalized.len); tok->stemmed.len = tok->normalized.len; - tok->stemmed.begin = dest; + tok->stemmed.begin = tok->normalized.begin; } if (tok->stemmed.len > 0 && rspamd_language_detector_is_stop_word (d, @@ -752,11 +782,8 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, else { if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { /* Raw text, lowercase */ - dest = rspamd_mempool_alloc (pool, tok->original.len); - memcpy (dest, tok->original.begin, tok->original.len); - rspamd_str_lc (dest, tok->original.len); - tok->stemmed.len = tok->original.len; - tok->stemmed.begin = dest; + tok->stemmed.len = tok->normalized.len; + tok->stemmed.begin = tok->normalized.begin; } } } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index eb4a285de..683d728ed 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -58,6 +58,7 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool, gsize *len); struct rspamd_lang_detector; +void rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool); void rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool); void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, const gchar *language, |