diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 14:43:36 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-24 14:43:36 +0000 |
commit | b522caaf83b4a3f16246bdc38d0f7ce866cdc660 (patch) | |
tree | 5c42b7bbf7a274aa65a682bda9dbf07512865bbb | |
parent | d01688d6aabc2d51fd52c640c21265a7fd8e3bdc (diff) | |
download | rspamd-b522caaf83b4a3f16246bdc38d0f7ce866cdc660.tar.gz rspamd-b522caaf83b4a3f16246bdc38d0f7ce866cdc660.zip |
[Project] Start words unicode structure rework
-rw-r--r-- | src/libstat/stat_api.h | 21 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 100 | ||||
-rw-r--r-- | src/libutil/fstring.h | 6 |
3 files changed, 71 insertions, 56 deletions
diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 84db8ee01..645e1f1aa 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -26,16 +26,21 @@ * High level statistics API */ -#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1 << 0) -#define RSPAMD_STAT_TOKEN_FLAG_META (1 << 1) -#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1 << 2) -#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1 << 3) -#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1 << 4) -#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1 << 5) +#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0) +#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1) +#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2) +#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3) +#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1u << 4) +#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5) +#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6) +#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7) +#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8) typedef struct rspamd_stat_token_s { - const gchar *begin; - gsize len; + rspamd_ftok_t original; + rspamd_ftok_unicode_t unicode; + rspamd_ftok_t normalised; + rspamd_ftok_t stemmed; guint flags; } rspamd_stat_token_t; diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 2ef5c08fb..8664b9e19 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -80,33 +80,33 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, ex = (*exceptions)->data; } - if (token->begin == NULL || *cur == NULL) { + if (token->original.begin == NULL || *cur == NULL) { if (ex != NULL) { if (ex->pos == 0) { - token->begin = buf->begin + ex->len; - token->len = ex->len; + token->original.begin = buf->original.begin + ex->len; + token->original.len = ex->len; token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; } else { - token->begin = buf->begin; - token->len = 0; + token->original.begin = buf->original.begin; + token->original.len = 0; } } else { - token->begin = buf->begin; - token->len = 0; + token->original.begin = buf->original.begin; + token->original.len = 0; } - *cur = token->begin; + *cur = token->original.begin; } - token->len = 0; + token->original.len = 0; - pos = *cur - buf->begin; - if (pos >= buf->len) { + pos = *cur - buf->original.begin; + if (pos >= buf->original.len) { return FALSE; } - remain = buf->len - pos; + remain = buf->original.len - pos; p = *cur; /* Skip non delimiters symbols */ @@ -122,7 +122,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, remain--; } while (remain > 0 && t_delimiters[(guchar)*p]); - token->begin = p; + token->original.begin = p; while (remain > 0 && !t_delimiters[(guchar)*p]) { if (ex != NULL && ex->pos == pos) { @@ -130,7 +130,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, *cur = p + ex->len; return TRUE; } - token->len++; + token->original.len++; pos++; remain--; p++; @@ -141,7 +141,7 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, } if (rl) { - *rl = token->len; + *rl = token->original.len; } token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; @@ -164,12 +164,12 @@ rspamd_tokenize_check_limit (gboolean decay, static const gdouble avg_word_len = 6.0; if (!decay) { - if (token->len >= sizeof (guint64)) { + if (token->original.len >= sizeof (guint64)) { #ifdef _MUM_UNALIGNED_ACCESS - *hv = mum_hash_step (*hv, *(guint64 *)token->begin); + *hv = mum_hash_step (*hv, *(guint64 *)token->original.begin); #else guint64 tmp; - memcpy (&tmp, token->begin, sizeof (tmp)); + memcpy (&tmp, token->original.begin, sizeof (tmp)); *hv = mum_hash_step (*hv, tmp); #endif } @@ -260,11 +260,11 @@ rspamd_tokenize_text (const gchar *text, gsize len, return NULL; } - buf.begin = text; - buf.len = len; + buf.original.begin = text; + buf.original.len = len; buf.flags = 0; - token.begin = NULL; - token.len = 0; + token.original.begin = NULL; + token.original.len = 0; token.flags = 0; if (cfg != NULL) { @@ -281,24 +281,24 @@ rspamd_tokenize_text (const gchar *text, gsize len, while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || (max_len > 0 && l > max_len)) { - token.begin = pos; + token.original.begin = pos; continue; } - if (token.len > 0 && + if (token.original.len > 0 && rspamd_tokenize_check_limit (decay, word_decay, res->len, &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; } else { - token.begin = pos; + token.original.begin = pos; continue; } } g_array_append_val (res, token); - token.begin = pos; + token.original.begin = pos; } } else { @@ -323,7 +323,7 @@ rspamd_tokenize_text (const gchar *text, gsize len, while (p != UBRK_DONE) { start_over: - token.len = 0; + token.original.len = 0; if (p > last) { if (ex && cur) { @@ -336,8 +336,8 @@ start_over: last += ex->len; if (ex->type == RSPAMD_EXCEPTION_URL) { - token.begin = "!!EX!!"; - token.len = sizeof ("!!EX!!") - 1; + token.original.begin = "!!EX!!"; + token.original.len = sizeof ("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; g_array_append_val (res, token); @@ -363,8 +363,8 @@ start_over: /* Append the first part */ if (rspamd_utf_word_valid (text, text + len, last, ex->pos)) { - token.begin = text + last; - token.len = ex->pos - last; + token.original.begin = text + last; + token.original.len = ex->pos - last; token.flags = 0; g_array_append_val (res, token); } @@ -373,8 +373,8 @@ start_over: last += ex->len + (ex->pos - last); if (ex->type == RSPAMD_EXCEPTION_URL) { - token.begin = "!!EX!!"; - token.len = sizeof ("!!EX!!") - 1; + token.original.begin = "!!EX!!"; + token.original.len = sizeof ("!!EX!!") - 1; token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; g_array_append_val (res, token); @@ -394,9 +394,10 @@ start_over: } else if (p > last) { if (rspamd_utf_word_valid (text, text + len, last, p)) { - token.begin = text + last; - token.len = p - last; - token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + token.original.begin = text + last; + token.original.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | + RSPAMD_STAT_TOKEN_FLAG_UTF; } } } @@ -408,40 +409,43 @@ start_over: } if (rspamd_utf_word_valid (text, text + len, last, p)) { - token.begin = text + last; - token.len = p - last; - token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + token.original.begin = text + last; + token.original.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | + RSPAMD_STAT_TOKEN_FLAG_UTF; } } else { /* No exceptions within boundary */ if (rspamd_utf_word_valid (text, text + len, last, p)) { - token.begin = text + last; - token.len = p - last; - token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + token.original.begin = text + last; + token.original.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | + RSPAMD_STAT_TOKEN_FLAG_UTF; } } } else { if (rspamd_utf_word_valid (text, text + len, last, p)) { - token.begin = text + last; - token.len = p - last; - token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; + token.original.begin = text + last; + token.original.len = p - last; + token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | + RSPAMD_STAT_TOKEN_FLAG_UTF; } } - if (token.len > 0 && + if (token.original.len > 0 && rspamd_tokenize_check_limit (decay, word_decay, res->len, &hv, &prob, &token, p, len)) { if (!decay) { decay = TRUE; } else { - token.len = 0; + token.original.len = 0; } } } - if (token.len > 0) { + if (token.original.len > 0) { g_array_append_val (res, token); } diff --git a/src/libutil/fstring.h b/src/libutil/fstring.h index 1f194827c..88e41b47a 100644 --- a/src/libutil/fstring.h +++ b/src/libutil/fstring.h @@ -18,6 +18,7 @@ #include "config.h" #include "mem_pool.h" +#include <unicode/uchar.h> /** * Fixed strings library @@ -38,6 +39,11 @@ typedef struct f_str_tok { const gchar *begin; } rspamd_ftok_t; +typedef struct f_str_unicode_tok { + gsize len; /* in uchars */ + const UChar *begin; +} rspamd_ftok_unicode_t; + /** * Create new fixed length string */ |