From 5bc3b26c98812d5a1bc1c4753ad656b403bf1e3a Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 1 Apr 2015 14:54:57 +0100 Subject: [PATCH] Add new UTF8 tokenizer. --- src/libmime/message.c | 9 +- src/libstat/stat_process.c | 2 +- src/libstat/tokenizers/tokenizers.c | 163 ++++++++++++++++++++++++---- src/libstat/tokenizers/tokenizers.h | 2 +- 4 files changed, 149 insertions(+), 27 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index ebf12b413..8f7a9d5c8 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1190,8 +1190,11 @@ rspamd_normalize_text_part (struct rspamd_task *task, } } - part->normalized_words = g_array_sized_new (FALSE, FALSE, - sizeof (rspamd_fstring_t), part->words->len); + /* Ugly workaround */ + part->normalized_words = rspamd_tokenize_text (part->content->data, + part->content->len, part->is_utf, task->cfg->min_word_len, + part->urls_offset, FALSE); + for (i = 0; i < part->words->len; i ++) { w = &g_array_index (part->words, rspamd_fstring_t, i); if (stem) { @@ -1324,7 +1327,7 @@ process_text_part (struct rspamd_task *task, detect_text_language (text_part); text_part->words = rspamd_tokenize_text (text_part->content->data, text_part->content->len, text_part->is_utf, task->cfg->min_word_len, - &text_part->urls_offset); + text_part->urls_offset, TRUE); rspamd_normalize_text_part (task, text_part); } diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 24481ee0c..7587baec1 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -316,7 +316,7 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, } if (sub != NULL) { - words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat); if (words != NULL) { tok->tokenizer->tokenize_func (cf, task->task_pool, diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index b9a4bd68b..744e6707e 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -30,6 +30,10 @@ #include "tokenizers.h" #include "stat_internal.h" +typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos, + rspamd_fstring_t * token, + GList **exceptions, gboolean is_utf, gsize *rl); + const gchar t_delimiters[255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, @@ -72,22 +76,26 @@ token_node_compare_func (gconstpointer a, gconstpointer b) } /* Get next word from specified f_str_t buf */ -static gchar * -rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) +static gboolean +rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf, + gchar **cur, rspamd_fstring_t * token, + GList **exceptions, gboolean is_utf, gsize *rl) { gsize remain, pos; guchar *p; struct process_exception *ex = NULL; if (buf == NULL) { - return NULL; + return FALSE; } + g_assert (cur != NULL); + if (exceptions != NULL && *exceptions != NULL) { ex = (*exceptions)->data; } - if (token->begin == NULL) { + if (token->begin == NULL || *cur == NULL) { if (ex != NULL) { if (ex->pos == 0) { token->begin = buf->begin + ex->len; @@ -106,19 +114,21 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi token->len = 0; - pos = token->begin - buf->begin; + pos = *cur - buf->begin; if (pos >= buf->len) { - return NULL; + return FALSE; } remain = buf->len - pos; - p = token->begin; + p = *cur; + /* Skip non delimiters symbols */ do { if (ex != NULL && ex->pos == pos) { /* Go to the next exception */ *exceptions = g_list_next (*exceptions); - return p + ex->len; + *cur = p + ex->len; + return TRUE; } pos++; p++; @@ -130,7 +140,8 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi while (remain > 0 && !t_delimiters[*p]) { if (ex != NULL && ex->pos == pos) { *exceptions = g_list_next (*exceptions); - return p + ex->len; + *cur = p + ex->len; + return TRUE; } token->len++; pos++; @@ -139,20 +150,127 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi } if (remain == 0) { - return NULL; + return FALSE; + } + + if (rl) { + if (is_utf) { + *rl = g_utf8_strlen (token->begin, token->len); + } + else { + *rl = token->len; + } + } + + *cur = p; + + return TRUE; +} + +static gboolean +rspamd_tokenizer_get_word (rspamd_fstring_t * buf, + gchar **cur, rspamd_fstring_t * token, + GList **exceptions, gboolean is_utf, gsize *rl) +{ + gsize remain, pos; + gchar *p, *next_p; + gunichar uc; + guint processed = 0; + struct process_exception *ex = NULL; + enum { + skip_delimiters = 0, + feed_token, + skip_exception + } state = skip_delimiters; + + if (buf == NULL) { + return FALSE; + } + + if (exceptions != NULL && *exceptions != NULL) { + ex = (*exceptions)->data; + } + + g_assert (is_utf); + g_assert (cur != NULL); + + if (*cur == NULL) { + *cur = buf->begin; + } + + token->len = 0; + + pos = *cur - buf->begin; + if (pos >= buf->len) { + return FALSE; } - return p; + remain = buf->len - pos; + p = *cur; + token->begin = p; + + while (remain > 0) { + uc = g_utf8_get_char (p); + next_p = g_utf8_next_char (p); + + if (next_p - p > (gint)remain) { + return FALSE; + } + + switch (state) { + case skip_delimiters: + if (ex != NULL && p - buf->begin == (gint)ex->pos) { + token->begin = "exception"; + token->len = sizeof ("exception") - 1; + state = skip_exception; + } + else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) { + state = feed_token; + token->begin = p; + continue; + } + break; + case feed_token: + if (ex != NULL && p - buf->begin == (gint)ex->pos) { + goto set_token; + } + else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) { + goto set_token; + } + processed ++; + break; + case skip_exception: + *cur = p + ex->len; + *exceptions = g_list_next (*exceptions); + goto set_token; + break; + } + + p = next_p; + } + +set_token: + if (rl) { + *rl = processed; + } + + token->len = p - *cur; + g_assert (token->len > 0); + *cur = p; + + return TRUE; } GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList **exceptions) + gsize min_len, GList *exceptions, gboolean compat) { rspamd_fstring_t token, buf; - gchar *pos; + gchar *pos = NULL; gsize l; GArray *res; + GList *cur = exceptions; + token_get_function func; if (len == 0 || text == NULL) { return NULL; @@ -164,21 +282,22 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, token.begin = NULL; token.len = 0; + if (compat || !is_utf) { + func = rspamd_tokenizer_get_word_compat; + } + else { + func = rspamd_tokenizer_get_word; + } + res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); - while ((pos = rspamd_tokenizer_get_word (&buf, - &token, exceptions)) != NULL) { - if (is_utf) { - l = g_utf8_strlen (token.begin, token.len); - } - else { - l = token.len; - } + + while (func (&buf, &pos, &token, &cur, is_utf, &l)) { if (min_len > 0 && l < min_len) { token.begin = pos; continue; } - g_array_append_val (res, token); + g_array_append_val (res, token); token.begin = pos; } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 1cf3a1589..fb4b42a96 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -28,7 +28,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b); /* Tokenize text into array of words (rspamd_fstring_t type) */ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, - gsize min_len, GList **exceptions); + gsize min_len, GList *exceptions, gboolean compat); /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, -- 2.39.5