Add new UTF8 tokenizer.

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index ebf12b41374d94ea5f00f1f866d26578659c5641..8f7a9d5c8ffff64928dce8bff5f2a1104b53bc17 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1190,8 +1190,11 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                 }
         }
  
-       part->normalized_words = g_array_sized_new (FALSE, FALSE,
-                       sizeof (rspamd_fstring_t), part->words->len);
+       /* Ugly workaround */
+       part->normalized_words = rspamd_tokenize_text (part->content->data,
+                       part->content->len, part->is_utf, task->cfg->min_word_len,
+                       part->urls_offset, FALSE);
+
         for (i = 0; i < part->words->len; i ++) {
                 w = &g_array_index (part->words, rspamd_fstring_t, i);
                 if (stem) {
@@ -1324,7 +1327,7 @@ process_text_part (struct rspamd_task *task,
         detect_text_language (text_part);
         text_part->words = rspamd_tokenize_text (text_part->content->data,
                         text_part->content->len, text_part->is_utf, task->cfg->min_word_len,
-                       &text_part->urls_offset);
+                       text_part->urls_offset, TRUE);
         rspamd_normalize_text_part (task, text_part);
  }
  
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index 24481ee0c1768383cf80ae369e93c08b9b069c6b..7587baec1e4a7e8c95af6dd7f9579238c9a67b68 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -316,7 +316,7 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
         }
  
         if (sub != NULL) {
-               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
+               words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL, compat);
                 if (words != NULL) {
                         tok->tokenizer->tokenize_func (cf,
                                         task->task_pool,
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index b9a4bd68b9f6d40f1e5239b3850ab535b4e19e10..744e6707e43ab2500323d8a38ddb14476dabbedd 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -30,6 +30,10 @@
  #include "tokenizers.h"
  #include "stat_internal.h"
  
+typedef gboolean (*token_get_function) (rspamd_fstring_t * buf, gchar **pos, /* Signature shared by the word extractors; rspamd_tokenize_text selects one and calls it repeatedly */
+               rspamd_fstring_t * token, /* token: filled in by the callee with the extracted word */
+               GList **exceptions, gboolean is_utf, gsize *rl); /* pos and exceptions are in/out cursors; rl receives the word length reported by the callee */
+
  const gchar t_delimiters[255] = {
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
@@ -72,22 +76,26 @@ token_node_compare_func (gconstpointer a, gconstpointer b)
  }
  
  /* Get next word from specified f_str_t buf */
-static gchar *
-rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions)
+static gboolean
+rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
+               gchar **cur, rspamd_fstring_t * token,
+               GList **exceptions, gboolean is_utf, gsize *rl)
  {
         gsize remain, pos;
         guchar *p;
         struct process_exception *ex = NULL;
  
         if (buf == NULL) {
-               return NULL;
+               return FALSE;
         }
  
+       g_assert (cur != NULL);
+
         if (exceptions != NULL && *exceptions != NULL) {
                 ex = (*exceptions)->data;
         }
  
-       if (token->begin == NULL) {
+       if (token->begin == NULL || *cur == NULL) {
                 if (ex != NULL) {
                         if (ex->pos == 0) {
                                 token->begin = buf->begin + ex->len;
@@ -106,19 +114,21 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
  
         token->len = 0;
  
-       pos = token->begin - buf->begin;
+       pos = *cur - buf->begin;
         if (pos >= buf->len) {
-               return NULL;
+               return FALSE;
         }
  
         remain = buf->len - pos;
-       p = token->begin;
+       p = *cur;
+
         /* Skip non delimiters symbols */
         do {
                 if (ex != NULL && ex->pos == pos) {
                         /* Go to the next exception */
                         *exceptions = g_list_next (*exceptions);
-                       return p + ex->len;
+                       *cur = p + ex->len;
+                       return TRUE;
                 }
                 pos++;
                 p++;
@@ -130,7 +140,8 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
         while (remain > 0 && !t_delimiters[*p]) {
                 if (ex != NULL && ex->pos == pos) {
                         *exceptions = g_list_next (*exceptions);
-                       return p + ex->len;
+                       *cur = p + ex->len;
+                       return TRUE;
                 }
                 token->len++;
                 pos++;
@@ -139,20 +150,127 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi
         }
  
         if (remain == 0) {
-               return NULL;
+               return FALSE;
+       }
+
+       if (rl) {
+               if (is_utf) {
+                       *rl = g_utf8_strlen (token->begin, token->len);
+               }
+               else {
+                       *rl = token->len;
+               }
+       }
+
+       *cur = p;
+
+       return TRUE;
+}
+
+static gboolean /* UTF-8 word extractor: scans buf from *cur and reports the next word through token */
+rspamd_tokenizer_get_word (rspamd_fstring_t * buf, /* buf: text being tokenized */
+               gchar **cur, rspamd_fstring_t * token, /* cur: in/out scan position, NULL means start of buf; token: receives the word */
+               GList **exceptions, gboolean is_utf, gsize *rl) /* exceptions: in/out list cursor; is_utf: asserted TRUE; rl: optional out, characters counted */
+{ /* Returns FALSE if buf is NULL, *cur is at/after the end of buf, or the next character exceeds the remaining length; TRUE otherwise */
+       gsize remain, pos;
+       gchar *p, *next_p;
+       gunichar uc;
+       guint processed = 0; /* characters consumed while in feed_token state */
+       struct process_exception *ex = NULL;
+       enum {
+               skip_delimiters = 0,
+               feed_token,
+               skip_exception
+       } state = skip_delimiters;
+
+       if (buf == NULL) {
+               return FALSE;
+       }
+
+       if (exceptions != NULL && *exceptions != NULL) {
+               ex = (*exceptions)->data; /* only the head of the list is considered during this call */
+       }
+
+       g_assert (is_utf);
+       g_assert (cur != NULL);
+
+       if (*cur == NULL) {
+               *cur = buf->begin; /* first call: start scanning at the beginning of the buffer */
+       }
+
+       token->len = 0;
+
+       pos = *cur - buf->begin;
+       if (pos >= buf->len) {
+               return FALSE;
         }
  
-       return p;
+       remain = buf->len - pos;
+       p = *cur;
+       token->begin = p;
+
+       while (remain > 0) { /* NOTE(review): remain is never updated inside this loop, so this condition cannot become false; nothing visible here stops p at the end of buf */
+               uc = g_utf8_get_char (p); /* NOTE(review): presumably the input is already valid UTF-8 - confirm against callers */
+               next_p = g_utf8_next_char (p);
+
+               if (next_p - p > (gint)remain) { /* NOTE(review): compares against the initial remaining length, since remain is not decremented */
+                       return FALSE;
+               }
+
+               switch (state) {
+               case skip_delimiters:
+                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
+                               token->begin = "exception"; /* placeholder token: points at a string literal, not into buf */
+                               token->len = sizeof ("exception") - 1;
+                               state = skip_exception; /* the break below still advances p by one character before skip_exception runs */
+                       }
+                       else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
+                               state = feed_token;
+                               token->begin = p;
+                               continue; /* p is not advanced: the same character is re-examined and counted in feed_token */
+                       }
+                       break;
+               case feed_token:
+                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
+                               goto set_token; /* word ends where the exception starts */
+                       }
+                       else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
+                               goto set_token; /* word ends at the first non-graphic or punctuation character */
+                       }
+                       processed ++;
+                       break;
+               case skip_exception:
+                       *cur = p + ex->len; /* NOTE(review): this value is overwritten by *cur = p after set_token */
+                       *exceptions = g_list_next (*exceptions);
+                       goto set_token;
+                       break;
+               }
+
+               p = next_p;
+       }
+
+set_token:
+       if (rl) {
+               *rl = processed;
+       }
+
+       token->len = p - *cur; /* NOTE(review): measured from *cur, not token->begin, so delimiters skipped before the word are included; on the exception path it also replaces the length set for "exception" and p - *cur looks negative there */
+       g_assert (token->len > 0);
+       *cur = p; /* next call resumes from the character that ended this token */
+
+       return TRUE;
  }
  
  GArray *
  rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList **exceptions)
+               gsize min_len, GList *exceptions, gboolean compat)
  {
         rspamd_fstring_t token, buf;
-       gchar *pos;
+       gchar *pos = NULL;
         gsize l;
         GArray *res;
+       GList *cur = exceptions;
+       token_get_function func;
  
         if (len == 0 || text == NULL) {
                 return NULL;
@@ -164,21 +282,22 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
         token.begin = NULL;
         token.len = 0;
  
+       if (compat || !is_utf) {
+               func = rspamd_tokenizer_get_word_compat;
+       }
+       else {
+               func = rspamd_tokenizer_get_word;
+       }
+
         res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
-       while ((pos = rspamd_tokenizer_get_word (&buf,
-                       &token, exceptions)) != NULL) {
-               if (is_utf) {
-                       l = g_utf8_strlen (token.begin, token.len);
-               }
-               else {
-                       l = token.len;
-               }
+
+       while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
                 if (min_len > 0 && l < min_len) {
                         token.begin = pos;
                         continue;
                 }
-               g_array_append_val (res, token);
  
+               g_array_append_val (res, token);
                 token.begin = pos;
         }
  
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h

index 1cf3a1589e62c0637bcfba06de256acab9e5b331..fb4b42a965efb733ac0fc34dde96386c7ceb755e 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
  
  /* Tokenize text into array of words (rspamd_fstring_t type) */
  GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
-               gsize min_len, GList **exceptions);
+               gsize min_len, GList *exceptions, gboolean compat); /* compat (or !is_utf) selects rspamd_tokenizer_get_word_compat instead of the UTF-8 word extractor */
  
  /* OSB tokenize function */
  gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 1 Apr 2015 13:54:57 +0000 (14:54 +0100)
src/libmime/message.c		patch \| blob \| history
src/libstat/stat_process.c		patch \| blob \| history
src/libstat/tokenizers/tokenizers.c		patch \| blob \| history
src/libstat/tokenizers/tokenizers.h		patch \| blob \| history