source.dussan.org Git - rspamd.git/commitdiff
Add new UTF8 tokenizer.
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
src/libmime/message.c
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h

index ebf12b41374d94ea5f00f1f866d26578659c5641..8f7a9d5c8ffff64928dce8bff5f2a1104b53bc17 100644 (file)
@@ -1190,8 +1190,11 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                }
        }
 
-       part->normalized_words = g_array_sized_new (FALSE, FALSE,
-                       sizeof (rspamd_fstring_t), part->words->len);
+       /* Ugly workaround */
+       part->normalized_words = rspamd_tokenize_text (part->content->data,
+                       part->content->len, part->is_utf, task->cfg->min_word_len,
+                       part->urls_offset, FALSE);
+
        for (i = 0; i < part->words->len; i ++) {
                w = &g_array_index (part->words, rspamd_fstring_t, i);
                if (stem) {
@@ -1324,7 +1327,7 @@ process_text_part (struct rspamd_task *task,
        detect_text_language (text_part);
        text_part->words = rspamd_tokenize_text (text_part->content->data,
                        text_part->content->len, text_part->is_utf, task->cfg->min_word_len,
-                       &text_part->urls_offset);
+                       text_part->urls_offset, TRUE);
        rspamd_normalize_text_part (task, text_part);
 }
 
index 24481ee0c1768383cf80ae369e93c08b9b069c6b..7587baec1e4a7e8c95af6dd7f9579238c9a67b68 100644 (file)
@@ -316,7 +316,7 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
        }
 
        if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat);
                if (words != NULL) {
                        tok->tokenizer->tokenize_func (cf,
                                        task->task_pool,
index b9a4bd68b9f6d40f1e5239b3850ab535b4e19e10..744e6707e43ab2500323d8a38ddb14476dabbedd 100644 (file)
 #include "tokenizers.h"
 #include "stat_internal.h"
 
+typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos,
+               rspamd_fstring_t * token,
+               GList **exceptions, gboolean is_utf, gsize *rl); /*
+ * Common signature of the word extractors defined in this file:
+ * buf is the text to scan, pos the in/out scan position, token the
+ * extracted word, exceptions an in/out list of ranges to skip, is_utf the
+ * encoding flag and rl the optional out length of the word.
+ */
+
 const gchar t_delimiters[255] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
@@ -72,22 +76,26 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
 }
 
 /* Get next word from specified f_str_t buf */
-static gchar *
-rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
+static gboolean
+rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
+               gchar **cur, rspamd_fstring_t * token,
+               GList **exceptions, gboolean is_utf, gsize *rl)
 {
        gsize remain, pos;
        guchar *p;
        struct process_exception *ex = NULL;
 
        if (buf == NULL) {
-               return NULL;
+               return FALSE;
        }
 
+       g_assert (cur != NULL);
+
        if (exceptions != NULL && *exceptions != NULL) {
                ex = (*exceptions)->data;
        }
 
-       if (token->begin == NULL) {
+       if (token->begin == NULL || *cur == NULL) {
                if (ex != NULL) {
                        if (ex->pos == 0) {
                                token->begin = buf->begin + ex->len;
@@ -106,19 +114,21 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
 
        token->len = 0;
 
-       pos = token->begin - buf->begin;
+       pos = *cur - buf->begin;
        if (pos >= buf->len) {
-               return NULL;
+               return FALSE;
        }
 
        remain = buf->len - pos;
-       p = token->begin;
+       p = *cur;
+
        /* Skip non delimiters symbols */
        do {
                if (ex != NULL && ex->pos == pos) {
                        /* Go to the next exception */
                        *exceptions = g_list_next (*exceptions);
-                       return p + ex->len;
+                       *cur = p + ex->len;
+                       return TRUE;
                }
                pos++;
                p++;
@@ -130,7 +140,8 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
        while (remain > 0 && !t_delimiters[*p]) {
                if (ex != NULL && ex->pos == pos) {
                        *exceptions = g_list_next (*exceptions);
-                       return p + ex->len;
+                       *cur = p + ex->len;
+                       return TRUE;
                }
                token->len++;
                pos++;
@@ -139,20 +150,127 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
        }
 
        if (remain == 0) {
-               return NULL;
+               return FALSE;
+       }
+
+       if (rl) {
+               if (is_utf) {
+                       *rl = g_utf8_strlen (token->begin, token->len);
+               }
+               else {
+                       *rl = token->len;
+               }
+       }
+
+       *cur = p;
+
+       return TRUE;
+}
+/*
+ * Get the next word from buf using Unicode character classes.
+ *
+ * Scanning starts at *cur (or at buf->begin when *cur is NULL). A character
+ * belongs to a word when g_unichar_isgraph() is true and g_unichar_ispunct()
+ * is false; every other character is treated as a delimiter.
+ *
+ * buf        - text to scan; FALSE is returned when it is NULL
+ * cur        - in/out scan position, must not be NULL (asserted)
+ * token      - out: begin/len of the extracted word
+ * exceptions - optional in/out list of struct process_exception; when the
+ *              scan offset equals ex->pos while skipping delimiters, the
+ *              literal "exception" is stored as the token and the list is
+ *              advanced to its next element
+ * is_utf     - must be TRUE (asserted)
+ * rl         - optional out: number of characters counted for the word
+ *
+ * Returns FALSE when buf is NULL, when the scan offset is at or past
+ * buf->len, or when the next character is longer than remain bytes;
+ * returns TRUE otherwise (token->len is asserted to be non-zero).
+ *
+ * NOTE(review): remain is not updated inside the loop below, so the loop
+ * condition and the length check keep using the initial value - confirm
+ * whether it should shrink as p advances.
+ * NOTE(review): at set_token the length is computed as p - *cur, not
+ * p - token->begin, so skipped delimiters are counted and the placeholder
+ * length set for an exception is overwritten; in the skip_exception case
+ * *cur has already been moved to p + ex->len - confirm this is intended.
+ */
+static gboolean
+rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
+               gchar **cur, rspamd_fstring_t * token,
+               GList **exceptions, gboolean is_utf, gsize *rl)
+{
+       gsize remain, pos;
+       gchar *p, *next_p;
+       gunichar uc;
+       guint processed = 0; /* characters counted while in feed_token state */
+       struct process_exception *ex = NULL;
+       enum {
+               skip_delimiters = 0,
+               feed_token,
+               skip_exception
+       } state = skip_delimiters; /* scanning always starts by skipping delimiters */
+
+       if (buf == NULL) {
+               return FALSE;
+       }
+
+       if (exceptions != NULL && *exceptions != NULL) {
+               ex = (*exceptions)->data; /* head of the exceptions list, if any */
+       }
+
+       g_assert (is_utf);
+       g_assert (cur != NULL);
+
+       if (*cur == NULL) {
+               *cur = buf->begin; /* no position yet: start from the beginning of buf */
+       }
+
+       token->len = 0;
+
+       pos = *cur - buf->begin;
+       if (pos >= buf->len) {
+               return FALSE;
        }
 
-       return p;
+       remain = buf->len - pos; /* bytes left from the starting position */
+       p = *cur;
+       token->begin = p;
+
+       while (remain > 0) {
+               uc = g_utf8_get_char (p);
+               next_p = g_utf8_next_char (p);
+
+               if (next_p - p > (gint)remain) { /* character is longer than remain bytes */
+                       return FALSE;
+               }
+
+               switch (state) {
+               case skip_delimiters:
+                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
+                               token->begin = "exception"; /* placeholder token for the excepted range */
+                               token->len = sizeof ("exception") - 1;
+                               state = skip_exception;
+                       }
+                       else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
+                               state = feed_token;
+                               token->begin = p;
+                               continue; /* p is not advanced: the same character is re-read in feed_token */
+                       }
+                       break;
+               case feed_token:
+                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
+                               goto set_token; /* the word ends where the exception starts */
+                       }
+                       else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
+                               goto set_token; /* delimiter reached: the word is complete */
+                       }
+                       processed ++;
+                       break;
+               case skip_exception:
+                       *cur = p + ex->len;
+                       *exceptions = g_list_next (*exceptions); /* move on to the next exception */
+                       goto set_token;
+                       break;
+               }
+
+               p = next_p;
+       }
+
+set_token:
+       if (rl) {
+               *rl = processed;
+       }
+
+       token->len = p - *cur; /* NOTE(review): measured from *cur, not token->begin - confirm */
+       g_assert (token->len > 0);
+       *cur = p;
+
+       return TRUE;
 }
 
 GArray *
 rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList **exceptions)
+               gsize min_len, GList *exceptions, gboolean compat)
 {
        rspamd_fstring_t token, buf;
-       gchar *pos;
+       gchar *pos = NULL;
        gsize l;
        GArray *res;
+       GList *cur = exceptions;
+       token_get_function func;
 
        if (len == 0 || text == NULL) {
                return NULL;
@@ -164,21 +282,22 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
        token.begin = NULL;
        token.len = 0;
 
+       if (compat || !is_utf) {
+               func = rspamd_tokenizer_get_word_compat;
+       }
+       else {
+               func = rspamd_tokenizer_get_word;
+       }
+
        res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
-       while ((pos = rspamd_tokenizer_get_word (&buf,
-                       &token, exceptions)) != NULL) {
-               if (is_utf) {
-                       l = g_utf8_strlen (token.begin, token.len);
-               }
-               else {
-                       l = token.len;
-               }
+
+       while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
                if (min_len > 0 && l < min_len) {
                        token.begin = pos;
                        continue;
                }
-               g_array_append_val (res, token);
 
+               g_array_append_val (res, token);
                token.begin = pos;
        }
 
index 1cf3a1589e62c0637bcfba06de256acab9e5b331..fb4b42a965efb733ac0fc34dde96386c7ceb755e 100644 (file)
@@ -28,7 +28,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 /* Tokenize text into array of words (rspamd_fstring_t type) */
 GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList **exceptions);
+               gsize min_len, GList *exceptions, gboolean compat);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,