Fix normalization and tokenization.

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 2 Apr 2015 11:35:19 +0000 (12:35 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 2 Apr 2015 11:35:19 +0000 (12:35 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 2 Apr 2015 11:35:19 +0000 (12:35 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 2 Apr 2015 11:35:19 +0000 (12:35 +0100)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 2eac86ed299ed9e36a96974559901550d9efb027..dfef04ce80c5554757f6df14c62a5b39cfae91e1 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1180,7 +1180,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
         struct sb_stemmer *stem = NULL;
         rspamd_fstring_t *w, stw;
         const guchar *r;
-       guint i;
+       guint i, nlen;
         GArray *tmp;
  
         if (part->language && part->language[0] != '\0' && part->is_utf) {
@@ -1203,13 +1203,11 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                                 r = sb_stemmer_stem (stem, w->begin, w->len);
                         }
  
-                       if (stem == NULL || r == NULL) {
-                               stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
-                               stw.len = w->len;
-                       }
-                       else {
-                               stw.begin = rspamd_mempool_strdup (task->task_pool, r);
-                               stw.len = strlen (r);
+                       if (stem != NULL && r != NULL) {
+                               nlen = strlen (r);
+                               nlen = MIN (nlen, stw.len);
+                               memcpy (stw.begin, r, nlen);
+                               stw.len = nlen;
                         }
  
                         if (part->is_utf) {
@@ -1218,9 +1216,8 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                         else {
                                 rspamd_str_lc (stw.begin, stw.len);
                         }
-                       g_array_append_val (part->normalized_words, stw);
                 }
-               g_array_free (tmp, TRUE);
+               part->normalized_words = tmp;
         }
  
         if (stem != NULL) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index c9b65e343ea4839f73b7b912a87b26688998f840..eebc57c22db8e39bf47f01947424f533d67cb8df 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -110,6 +110,7 @@ rspamd_tokenizer_get_word_compat (rspamd_fstring_t * buf,
                         token->begin = buf->begin;
                         token->len = 0;
                 }
+               *cur = token->begin;
         }
  
         token->len = 0;
@@ -223,6 +224,7 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf,
                                 token->begin = "exception";
                                 token->len = sizeof ("exception") - 1;
                                 state = skip_exception;
+                               continue;
                         }
                         else if (g_unichar_isgraph (uc) && !g_unichar_ispunct (uc)) {
                                 state = feed_token;
@@ -290,7 +292,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                 func = rspamd_tokenizer_get_word;
         }
  
-       res = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), 128);
  
         while (func (&buf, &pos, &token, &cur, is_utf, &l)) {
                 if (min_len > 0 && l < min_len) {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 2 Apr 2015 11:35:19 +0000 (12:35 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 2 Apr 2015 11:35:19 +0000 (12:35 +0100)
src/libmime/message.c		patch | blob | history
src/libstat/tokenizers/tokenizers.c		patch | blob | history