[Rework] Rework exceptions and newlines processing

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 13 Jul 2016 16:03:27 +0000 (17:03 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 13 Jul 2016 16:07:23 +0000 (17:07 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 13 Jul 2016 16:03:27 +0000 (17:03 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 13 Jul 2016 16:07:23 +0000 (17:07 +0100)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index f6c023294b87e0856834f430ea70c1fe95bfcecd..4605d1c6944c08699a8d2d3aa8a09e7acf42c019 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part)
  }
  
  static void
-rspamd_normalize_text_part (struct rspamd_task *task,
+rspamd_extract_words (struct rspamd_task *task,
                 struct rspamd_mime_text_part *part)
  {
  #ifdef WITH_SNOWBALL
         struct sb_stemmer *stem = NULL;
  #endif
         rspamd_ftok_t *w;
-       const guchar *r, *p, *c, *end;
         gchar *temp_word;
+       const guchar *r;
         guint i, nlen;
  
  #ifdef WITH_SNOWBALL
@@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                 stem = sb_stemmer_new (part->language, "UTF_8");
                 if (stem == NULL) {
                         msg_info_task ("<%s> cannot create lemmatizer for %s language",
-                               task->message_id, part->language);
+                                       task->message_id, part->language);
                 }
         }
  #endif
-       /* Strip newlines */
-       part->stripped_content = g_byte_array_sized_new (part->content->len);
-       part->newlines = g_ptr_array_sized_new (128);
-       p = part->content->data;
-       c = p;
-       end = p + part->content->len;
-
-       while (p < end) {
-               p = memchr (c, '\n', end - c);
-
-               if (p) {
-                       if (*(p - 1) == '\r') {
-                               p --;
-                       }
-
-                       if (p > c) {
-                               g_byte_array_append (part->stripped_content, c, p - c);
-                       }
-
-                       /* As it could cause reallocation, we initially store offsets */
-                       g_ptr_array_add (part->newlines,
-                                       GUINT_TO_POINTER (part->stripped_content->len));
-                       part->nlines ++;
-                       p ++;
-
-                       while (p < end && (*p == '\r' || *p == '\n')) {
-                               if (*p == '\n') {
-                                       part->nlines ++;
-                               }
-
-                               p ++;
-                       }
-                       c = p;
-               }
-               else {
-                       p = end;
-                       break;
-               }
-       }
-
-       if (p > c) {
-               g_byte_array_append (part->stripped_content, c, p - c);
-       }
-
-       /* Now convert offsets to real pointers for convenience */
-       for (i = 0; i < part->newlines->len; i ++) {
-               guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
-               g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
-       }
-
-       rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) free_byte_array_callback,
-                       part->stripped_content);
-       rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
-                       part->newlines);
-
         /* Ugly workaround */
         part->normalized_words = rspamd_tokenize_text (part->content->data,
                         part->content->len, IS_PART_UTF (part), task->cfg,
-                       part->urls_offset, FALSE,
+                       part->exceptions, FALSE,
                         NULL);
  
         if (part->normalized_words) {
@@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task,
  #endif
  }
  
+/**
+ * Build `part->stripped_content` as a copy of `part->content` with line
+ * breaks removed: every '\n', the '\r' directly preceding it, and any run
+ * of '\r'/'\n' characters that follows.
+ *
+ * For every such run the offset inside the stripped buffer is stored in
+ * `part->newlines` (converted to a pointer into the stripped buffer once the
+ * buffer is complete) and a zero-length RSPAMD_EXCEPTION_NEWLINE exception
+ * with the same offset is prepended to `part->exceptions`.
+ * `part->nlines` is incremented once for each '\n' seen.
+ *
+ * @param task task whose pool provides the exceptions and runs the destructors
+ * @param part text part to process; `part->content` is only read
+ */
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+               struct rspamd_mime_text_part *part)
+{
+       const guchar *p, *c, *end, *line_end;
+       guint i;
+       struct rspamd_process_exception *ex;
+
+       /* Strip newlines */
+       part->stripped_content = g_byte_array_sized_new (part->content->len);
+       part->newlines = g_ptr_array_sized_new (128);
+       p = part->content->data;
+       c = p;
+       end = p + part->content->len;
+
+       while (p < end) {
+               p = memchr (c, '\n', end - c);
+
+               if (p) {
+                       /*
+                        * Exclude the '\r' of a "\r\n" pair from the copied chunk.
+                        * The bound check avoids reading before the buffer when the
+                        * very first byte is '\n'; `p` itself stays on the '\n' so
+                        * that this newline is counted exactly once below.
+                        */
+                       line_end = p;
+
+                       if (line_end > c && *(line_end - 1) == '\r') {
+                               line_end --;
+                       }
+
+                       if (line_end > c) {
+                               g_byte_array_append (part->stripped_content, c, line_end - c);
+                       }
+
+                       /* As it could cause reallocation, we initially store offsets */
+                       g_ptr_array_add (part->newlines,
+                                       GUINT_TO_POINTER (part->stripped_content->len));
+                       ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+                       ex->pos = part->stripped_content->len;
+                       ex->len = 0;
+                       ex->type = RSPAMD_EXCEPTION_NEWLINE;
+                       part->exceptions = g_list_prepend (part->exceptions, ex);
+                       part->nlines ++;
+                       p ++;
+
+                       /* Swallow the rest of the line break run, counting extra '\n' */
+                       while (p < end && (*p == '\r' || *p == '\n')) {
+                               if (*p == '\n') {
+                                       part->nlines ++;
+                               }
+
+                               p ++;
+                       }
+                       c = p;
+               }
+               else {
+                       p = end;
+                       break;
+               }
+       }
+
+       /* Copy the tail that follows the last line break (if any) */
+       if (p > c) {
+               g_byte_array_append (part->stripped_content, c, p - c);
+       }
+
+       /* Now convert offsets to real pointers for convenience */
+       for (i = 0; i < part->newlines->len; i ++) {
+               guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+               g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+       }
+
+       rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) free_byte_array_callback,
+                       part->stripped_content);
+       rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+                       part->newlines);
+}
+
  #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
  
  static guint
@@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
         return FALSE;
  }
  
+/**
+ * Comparator for g_list_sort () that orders processing exceptions by
+ * ascending `pos`.
+ *
+ * @param a pointer to the first struct rspamd_process_exception
+ * @param b pointer to the second struct rspamd_process_exception
+ * @return negative, zero or positive if `a` is before, at or after `b`
+ */
+static gint
+exceptions_compare_func (gconstpointer a, gconstpointer b)
+{
+       const struct rspamd_process_exception *ea = a, *eb = b;
+
+       /* `pos` is a goffset: a plain difference may not fit into gint */
+       return (ea->pos > eb->pos) - (ea->pos < eb->pos);
+}
+
  static void
  process_text_part (struct rspamd_task *task,
         GByteArray *part_content,
@@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task,
                                 task->task_pool,
                                 text_part->html,
                                 part_content,
-                               &text_part->urls_offset,
+                               &text_part->exceptions,
                                 task->urls,
                                 task->emails);
  
@@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task,
                 }
  
                 /* Handle offsets of this part */
-               if (text_part->urls_offset != NULL) {
-                       text_part->urls_offset = g_list_reverse (text_part->urls_offset);
+               if (text_part->exceptions != NULL) {
+                       text_part->exceptions = g_list_reverse (text_part->exceptions);
                         rspamd_mempool_add_destructor (task->task_pool,
-                                       (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
+                                       (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions);
                 }
  
                 rspamd_mempool_add_destructor (task->task_pool,
@@ -1006,6 +1029,11 @@ process_text_part (struct rspamd_task *task,
         if (!IS_PART_HTML (text_part)) {
                 rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
         }
+
+       text_part->exceptions = g_list_sort (text_part->exceptions,
+                       exceptions_compare_func);
+
+       rspamd_extract_words (task, text_part);
  }
  
  struct mime_foreach_data {
diff --git a/src/libmime/message.h b/src/libmime/message.h

index 0d2ae74b4ab704a866d514229e01b3aea070984d..3fe26e685650a8f8ef1a2734c9fbe0efcd8e0d97 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -57,7 +57,7 @@ struct rspamd_mime_text_part {
         GByteArray *stripped_content; /**< no newlines or html tags                     */
         GPtrArray *newlines;    /**< positions of newlines in text                                      */
         struct html_content *html;
-       GList *urls_offset;     /**< list of offsets of urls                                            */
+       GList *exceptions;      /**< list of processing exceptions (url spans, newline positions) */
         GMimeObject *parent;
         struct rspamd_mime_part *mime_part;
         GArray *normalized_words;
diff --git a/src/libserver/html.c b/src/libserver/html.c

index 0a25e488a67bc6f65be2442e16bc6b7bb17b5bca..1188515c59c66aa79e3ef45bf6faf6e6a59f3837 100644 (file)
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1601,7 +1601,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
         gint substate = 0, len, href_offset = -1;
         struct html_tag *cur_tag = NULL;
         struct rspamd_url *url = NULL, *turl;
-       struct process_exception *ex;
+       struct rspamd_process_exception *ex;
         enum {
                 parse_start = 0,
                 tag_begin,
@@ -1977,6 +1977,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                                 ex = rspamd_mempool_alloc (pool, sizeof (*ex));
                                                                 ex->pos = href_offset;
                                                                 ex->len = dest->len - href_offset;
+                                                               ex->type = RSPAMD_EXCEPTION_URL;
  
                                                                 *exceptions = g_list_prepend (*exceptions, ex);
                                                         }
diff --git a/src/libserver/url.c b/src/libserver/url.c

index 823e32a434cf2503217e1f531b363f729f67a2a0..1ccc91a2720085f7be63268d5c80bc9540dcae87 100644 (file)
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2294,17 +2294,18 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
                 gsize end_offset, gpointer ud)
  {
         struct rspamd_url_mimepart_cbdata *cbd = ud;
-       struct process_exception *ex;
+       struct rspamd_process_exception *ex;
         struct rspamd_task *task;
         gchar *url_str = NULL;
         struct rspamd_url *query_url;
         gint rc;
  
         task = cbd->task;
-       ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+       ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_process_exception));
  
         ex->pos = start_offset;
         ex->len = end_offset - start_offset;
+       ex->type = RSPAMD_EXCEPTION_URL;
  
         if (url->protocol == PROTOCOL_MAILTO) {
                 if (url->userlen > 0) {
@@ -2320,8 +2321,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
                 }
         }
  
-       cbd->part->urls_offset = g_list_prepend (
-                       cbd->part->urls_offset,
+       cbd->part->exceptions = g_list_prepend (
+                       cbd->part->exceptions,
                         ex);
  
         /* We also search the query for additional url inside */
@@ -2376,10 +2377,10 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
                         rspamd_url_text_part_callback, &mcbd);
  
         /* Handle offsets of this part */
-       if (part->urls_offset != NULL) {
-               part->urls_offset = g_list_reverse (part->urls_offset);
+       if (part->exceptions != NULL) {
+               part->exceptions = g_list_reverse (part->exceptions);
                 rspamd_mempool_add_destructor (task->task_pool,
-                               (rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+                               (rspamd_mempool_destruct_t) g_list_free, part->exceptions);
         }
  }
  
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index 4e0e4b75dcf6b22a100839298f98442f4dd5c22f..6eab11f98ecbf6bc49ee79e28cffb40a5d19c785 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -75,7 +75,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
  {
         gsize remain, pos;
         const gchar *p;
-       struct process_exception *ex = NULL;
+       struct rspamd_process_exception *ex = NULL;
  
         if (buf == NULL) {
                 return FALSE;
@@ -166,11 +166,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
                 GList **exceptions, gboolean is_utf, gsize *rl,
                 gboolean check_signature)
  {
-       gsize remain, pos, siglen = 0;
+       gsize remain, siglen = 0;
+       goffset pos;
         const gchar *p, *next_p, *sig = NULL;
         gunichar uc;
         guint processed = 0;
-       struct process_exception *ex = NULL;
+       struct rspamd_process_exception *ex = NULL;
         enum {
                 skip_delimiters = 0,
                 feed_token,
@@ -214,10 +215,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
  
                 switch (state) {
                 case skip_delimiters:
-                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
-                               token->begin = "!!EX!!";
-                               token->len = sizeof ("!!EX!!") - 1;
-                               processed = token->len;
+                       if (ex != NULL && p - buf->begin == ex->pos) {
+                               if (ex->type == RSPAMD_EXCEPTION_URL) {
+                                       token->begin = "!!EX!!";
+                                       token->len = sizeof ("!!EX!!") - 1;
+                                       processed = token->len;
+                               }
                                 state = skip_exception;
                                 continue;
                         }
@@ -270,12 +273,13 @@ set_token:
                 *rl = processed;
         }
  
-       if (token->len == 0) {
+       if (token->len == 0 && processed > 0) {
                 token->len = p - token->begin;
                 g_assert (token->len > 0);
-               *cur = p;
         }
  
+       *cur = p;
+
         return TRUE;
  }
  
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c

index ccbcec6e6ad17b086f791b06c2f696c42e1d1b1a..6ce4179f1e1f714b8934538d5066ea542d13f804 100644 (file)
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -733,7 +733,7 @@ lua_util_tokenize_text (lua_State *L)
         gsize len, pos, ex_len, i;
         GList *exceptions = NULL, *cur;
         struct rspamd_lua_text *t;
-       struct process_exception *ex;
+       struct rspamd_process_exception *ex;
         GArray *res;
         rspamd_ftok_t *w;
         gboolean compat = FALSE;
diff --git a/src/rspamd.h b/src/rspamd.h

index 5626337fc0115f87896e42cd29a122a3e8261bec..ffebfe387a179c5dfd51609b64ac507bdaaeaf00 100644 (file)
--- a/src/rspamd.h
+++ b/src/rspamd.h
@@ -265,12 +265,17 @@ struct rspamd_main {
         struct event_base *ev_base;
  };
  
+/** Kind of a processing exception stored in struct rspamd_process_exception */
+enum rspamd_exception_type {
+       RSPAMD_EXCEPTION_NEWLINE = 0, /**< zero-length marker at a stripped line break */
+       RSPAMD_EXCEPTION_URL, /**< span occupied by an url; the tokenizer emits "!!EX!!" for it */
+};
  /**
   * Structure to point exception in text from processing
   */
-struct process_exception {
-       gsize pos;
-       gsize len;
+struct rspamd_process_exception {
+       goffset pos; /**< offset of the exception from the beginning of the text */
+       guint len; /**< length of the excepted span (0 for newline exceptions) */
+       enum rspamd_exception_type type; /**< what the exception stands for */
  };
  
  /**
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 13 Jul 2016 16:03:27 +0000 (17:03 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 13 Jul 2016 16:07:23 +0000 (17:07 +0100)
src/libmime/message.c		patch \| blob \| history
src/libmime/message.h		patch \| blob \| history
src/libserver/html.c		patch \| blob \| history
src/libserver/url.c		patch \| blob \| history
src/libstat/tokenizers/tokenizers.c		patch \| blob \| history
src/lua/lua_util.c		patch \| blob \| history
src/rspamd.h		patch \| blob \| history