diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-06 19:49:44 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-06 19:50:18 +0100 |
commit | c31f8bf12bff61c9422de9eeff0292c6ac339c5e (patch) | |
tree | 224c38634f5d6f45218752ca3abb1b39bc7e4093 /src/libstat/tokenizers | |
parent | af5f57916e4345d988802794c84460960ee47d0c (diff) | |
download | rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.tar.gz rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.zip |
[Feature] Implement new text tokenizer based on libicu
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 418 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 3 |
2 files changed, 218 insertions, 203 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 5436430fe..9babfc8a1 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -21,8 +21,10 @@ #include "tokenizers.h" #include "stat_internal.h" #include "../../../contrib/mumhash/mum.h" -#include "unicode/utf8.h" -#include "unicode/uchar.h" +#include <unicode/utf8.h> +#include <unicode/uchar.h> +#include <unicode/uiter.h> +#include <unicode/ubrk.h> typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos, rspamd_stat_token_t * token, @@ -148,187 +150,88 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, return TRUE; } -static gboolean -rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf, - gchar const **cur, rspamd_stat_token_t * token, - GList **exceptions, gsize *rl, - gboolean check_signature) +static inline gboolean +rspamd_tokenize_check_limit (gboolean decay, + guint word_decay, + guint nwords, + guint64 *hv, + guint64 *prob, + const rspamd_stat_token_t *token, + gssize remain, + gssize total) { - gint32 i, siglen = 0, remain; - goffset pos; - const gchar *p, *s, *sig = NULL; - UChar32 uc; - guint processed = 0; - struct rspamd_process_exception *ex = NULL; - enum { - skip_delimiters = 0, - feed_token, - process_signature - } state = skip_delimiters; - - if (buf == NULL) { - return FALSE; - } - - if (exceptions != NULL && *exceptions != NULL) { - ex = (*exceptions)->data; - } - - g_assert (cur != NULL); - - if (*cur == NULL) { - *cur = buf->begin; - } + static const gdouble avg_word_len = 6.0; - token->len = 0; + if (!decay) { + if (token->len >= sizeof (guint64)) { +#ifdef _MUM_UNALIGNED_ACCESS + *hv = mum_hash_step (*hv, *(guint64 *)token->begin); +#else + guint64 tmp; + memcpy (&tmp, token->begin, sizeof (tmp)); + *hv = mum_hash_step (*hv, tmp); +#endif + } - pos = *cur - buf->begin; - if (pos >= buf->len) { - return FALSE; - } + /* Check for decay */ + if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) { + /* Start decay */ + gdouble decay_prob; - remain = buf->len - pos; - s = *cur; - p = s; - token->begin = s; + *hv = mum_hash_finish (*hv); - for (i = 0; i < remain; ) { - p = &s[i]; - U8_NEXT (s, i, remain, uc); /* This also advances i */ + /* We assume that word is 6 symbols length in average */ + decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len); - if (uc < 0) { - if (i < remain) { - uc = 0xFFFD; + if (decay_prob >= 1.0) { + *prob = G_MAXUINT64; } else { - return FALSE; + *prob = decay_prob * G_MAXUINT64; } - } - switch (state) { - case skip_delimiters: - if (ex != NULL && p - buf->begin == ex->pos) { - goto process_exception; - } - else if (u_isgraph (uc)) { - if (u_isalnum (uc)) { - state = feed_token; - token->begin = p; - continue; - } - else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) { - sig = p; - siglen = remain - i; - state = process_signature; - continue; - } - } - break; - case feed_token: - if (ex != NULL && p - buf->begin == (gint)ex->pos) { - token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; - goto process_exception; - } - else if (!u_isalnum (uc)) { - token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; - goto set_token; - } - processed ++; - break; - case process_signature: - if (*p == '\r' || *p == '\n') { - msg_debug ("signature found: %*s", (gint)siglen, sig); - return FALSE; - } - else if (*p != ' ' && *p != '-' && *p != '_') { - state = skip_delimiters; - continue; - } - break; + return TRUE; } } + else { + /* Decaying probability */ + /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */ + *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL; - /* Last character */ - if (state == feed_token) { - p = &s[i]; - goto set_token; + if (*hv > *prob) { + return TRUE; + } } return FALSE; +} -set_token: - if (rl) { - *rl = processed; - } +static inline gboolean +rspamd_utf_word_valid (const gchar *text, const gchar *end, + gint32 start, gint32 finish) +{ + const gchar *st = text + start, *fin = text + finish; + UChar32 c; - if (token->len == 0 && processed > 0) { - token->len = p - token->begin; - g_assert (token->len > 0); + if (st >= end || fin > end || st >= fin) { + return FALSE; } - *cur = &s[i]; - - return TRUE; - -process_exception: - if (token->len == 0 && processed > 0) { - /* - * We have processed something before the next exception, so - * continue processing on next iteration of this function call - */ - token->len = p - token->begin; - g_assert (token->len > 0); - - *cur = p; + U8_NEXT (text, start, finish, c); + if (u_isalnum (c)) { return TRUE; } - if (ex->type == RSPAMD_EXCEPTION_URL) { - token->begin = "!!EX!!"; - token->len = sizeof ("!!EX!!") - 1; - token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - processed = token->len; - } - - p += ex->len; - - /* We need to skip all exceptions that are within this exception */ - *exceptions = g_list_next (*exceptions); - - while (*exceptions) { - ex = (*exceptions)->data; - - if (ex->pos < p - buf->begin) { - /* Nested exception */ - if (ex->pos + ex->len > p - buf->begin) { - /* - * We have somehow overlapping nesting exception, - * extend current offset - */ - p = buf->begin + ex->pos + ex->len; - } - - *exceptions = g_list_next (*exceptions); - } - else { - break; - } - } - - *cur = p; - - if (rl) { - *rl = processed; - } - - return TRUE; + return FALSE; } GArray * rspamd_tokenize_text (const gchar *text, gsize len, + const UText *utxt, enum rspamd_tokenize_type how, - struct rspamd_config *cfg, GList *exceptions, + struct rspamd_config *cfg, + GList *exceptions, guint64 *hash) { rspamd_stat_token_t token, buf; @@ -336,11 +239,11 @@ rspamd_tokenize_text (const gchar *text, gsize len, gsize l = 0; GArray *res; GList *cur = exceptions; - token_get_function func; guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; guint64 hv = 0; gboolean decay = FALSE; guint64 prob; + static UBreakIterator* bi = NULL; if (text == NULL) { return NULL; @@ -353,18 +256,6 @@ rspamd_tokenize_text (const gchar *text, gsize len, token.len = 0; token.flags = 0; - switch (how) { - case RSPAMD_TOKENIZE_RAW: - func = rspamd_tokenizer_get_word_raw; - break; - case RSPAMD_TOKENIZE_UTF: - func = rspamd_tokenizer_get_word_utf8; - break; - default: - g_assert_not_reached (); - break; - } - if (cfg != NULL) { min_len = cfg->min_word_len; max_len = cfg->max_word_len; @@ -375,56 +266,177 @@ rspamd_tokenize_text (const gchar *text, gsize len, res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), initial_size); - while (func (&buf, &pos, &token, &cur, &l, FALSE)) { - if (l == 0 || (min_len > 0 && l < min_len) || - (max_len > 0 && l > max_len)) { + if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { + while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { + if (l == 0 || (min_len > 0 && l < min_len) || + (max_len > 0 && l > max_len)) { + token.begin = pos; + continue; + } + + if (rspamd_tokenize_check_limit (decay, word_decay, res->len, + &hv, &prob, &token, pos - text, len)) { + if (!decay) { + decay = TRUE; + } else { + token.begin = pos; + continue; + } + } + + g_array_append_val (res, token); token.begin = pos; - continue; } + } + else { + /* UTF8 boundaries */ + UErrorCode uc_err = U_ZERO_ERROR; + int32_t last, p; + struct rspamd_process_exception *ex = NULL; - if (!decay) { - if (token.len >= sizeof (guint64)) { -#ifdef _MUM_UNALIGNED_ACCESS - hv = mum_hash_step (hv, *(guint64 *)token.begin); -#else - guint64 tmp; - memcpy (&tmp, token.begin, sizeof (tmp)); - hv = mum_hash_step (hv, tmp); -#endif - } + if (bi == NULL) { + bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err); - /* Check for decay */ - if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) { - /* Start decay */ - gdouble decay_prob; + g_assert (U_SUCCESS (uc_err)); + } - decay = TRUE; - hv = mum_hash_finish (hv); + ubrk_setUText (bi, (UText*)utxt, &uc_err); + last = ubrk_first (bi); + p = last; - /* We assume that word is 6 symbols length in average */ - decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0); + if (cur) { + ex = (struct rspamd_process_exception *)cur->data; + } - if (decay_prob >= 1.0) { - prob = G_MAXUINT64; + while (p != UBRK_DONE) { +start_over: + token.len = 0; + + if (p > last) { + if (ex && cur) { + /* Check exception */ + if (ex->pos >= last && ex->pos <= p) { + /* We have an exception within boundary */ + /* First, start to drain exceptions from the start */ + while (cur && ex->pos <= last) { + /* We have an exception at the beginning, skip those */ + last += ex->len; + + if (last > p) { + /* Exception spread over the boundaries */ + while (last > p && p != UBRK_DONE) { + p = ubrk_next (bi); + } + + /* We need to reset our scan with new p and last */ + goto start_over; + } + + if (ex->type == RSPAMD_EXCEPTION_URL) { + token.begin = "!!EX!!"; + token.len = sizeof ("!!EX!!") - 1; + token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + + g_array_append_val (res, token); + token.flags = 0; + } + + cur = g_list_next (cur); + + if (cur) { + ex = (struct rspamd_process_exception *) cur->data; + } + } + + /* Now, we can have an exception within boundary again */ + if (cur && ex->pos >= last && ex->pos <= p) { + /* Append the first part */ + if (rspamd_utf_word_valid (text, text + len, last, + ex->pos)) { + token.begin = text + last; + token.len = ex->pos - last; + token.flags = 0; + g_array_append_val (res, token); + } + + /* Process the current exception */ + last += ex->len + token.len; + + if (ex->type == RSPAMD_EXCEPTION_URL) { + token.begin = "!!EX!!"; + token.len = sizeof ("!!EX!!") - 1; + token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; + + g_array_append_val (res, token); + } + + if (last > p) { + /* Exception spread over the boundaries */ + while (last > p && p != UBRK_DONE) { + p = ubrk_next (bi); + } + /* We need to reset our scan with new p and last */ + goto start_over; + } + } + else if (p > last) { + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = 0; + } + } + } + else if (ex->pos < last) { + /* Forward exceptions list */ + while (cur && ex->pos <= last) { + /* We have an exception at the beginning, skip those */ + cur = g_list_next (cur); + + if (cur) { + ex = (struct rspamd_process_exception *) cur->data; + } + } + + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = 0; + } + } + else { + /* No exceptions within boundary */ + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + token.flags = 0; + } + } } else { - prob = decay_prob * G_MAXUINT64; + if (rspamd_utf_word_valid (text, text + len, last, p)) { + token.begin = text + last; + token.len = p - last; + } + } + + if (rspamd_tokenize_check_limit (decay, word_decay, res->len, + &hv, &prob, &token, pos - text, len)) { + if (!decay) { + decay = TRUE; + } else { + token.len = 0; + } } } - } - else { - /* Decaying probability */ - /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */ - hv = 2862933555777941757ULL * hv + 3037000493ULL; - if (hv > prob) { - token.begin = pos; - continue; + if (token.len > 0) { + g_array_append_val (res, token); } - } - g_array_append_val (res, token); - token.begin = pos; + last = p; + p = ubrk_next (bi); + } } if (!decay) { diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 16ab142fd..6c538eafc 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -7,6 +7,8 @@ #include "rspamd.h" #include "stat_api.h" +#include <unicode/utext.h> + #define RSPAMD_DEFAULT_TOKENIZER "osb" struct rspamd_tokenizer_runtime; @@ -37,6 +39,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_stat_token_t type) */ GArray * rspamd_tokenize_text (const gchar *text, gsize len, + const UText *utxt, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, |