]> source.dussan.org Git - rspamd.git/commitdiff
[Rework] Rework exceptions and newlines processing
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 13 Jul 2016 16:03:27 +0000 (17:03 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 13 Jul 2016 16:07:23 +0000 (17:07 +0100)
src/libmime/message.c
src/libmime/message.h
src/libserver/html.c
src/libserver/url.c
src/libstat/tokenizers/tokenizers.c
src/lua/lua_util.c
src/rspamd.h

index f6c023294b87e0856834f430ea70c1fe95bfcecd..4605d1c6944c08699a8d2d3aa8a09e7acf42c019 100644 (file)
@@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part)
 }
 
 static void
-rspamd_normalize_text_part (struct rspamd_task *task,
+rspamd_extract_words (struct rspamd_task *task,
                struct rspamd_mime_text_part *part)
 {
 #ifdef WITH_SNOWBALL
        struct sb_stemmer *stem = NULL;
 #endif
        rspamd_ftok_t *w;
-       const guchar *r, *p, *c, *end;
        gchar *temp_word;
+       const guchar *r;
        guint i, nlen;
 
 #ifdef WITH_SNOWBALL
@@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                stem = sb_stemmer_new (part->language, "UTF_8");
                if (stem == NULL) {
                        msg_info_task ("<%s> cannot create lemmatizer for %s language",
-                               task->message_id, part->language);
+                                       task->message_id, part->language);
                }
        }
 #endif
-       /* Strip newlines */
-       part->stripped_content = g_byte_array_sized_new (part->content->len);
-       part->newlines = g_ptr_array_sized_new (128);
-       p = part->content->data;
-       c = p;
-       end = p + part->content->len;
-
-       while (p < end) {
-               p = memchr (c, '\n', end - c);
-
-               if (p) {
-                       if (*(p - 1) == '\r') {
-                               p --;
-                       }
-
-                       if (p > c) {
-                               g_byte_array_append (part->stripped_content, c, p - c);
-                       }
-
-                       /* As it could cause reallocation, we initially store offsets */
-                       g_ptr_array_add (part->newlines,
-                                       GUINT_TO_POINTER (part->stripped_content->len));
-                       part->nlines ++;
-                       p ++;
-
-                       while (p < end && (*p == '\r' || *p == '\n')) {
-                               if (*p == '\n') {
-                                       part->nlines ++;
-                               }
-
-                               p ++;
-                       }
-                       c = p;
-               }
-               else {
-                       p = end;
-                       break;
-               }
-       }
-
-       if (p > c) {
-               g_byte_array_append (part->stripped_content, c, p - c);
-       }
-
-       /* Now convert offsets to real pointers for convenience */
-       for (i = 0; i < part->newlines->len; i ++) {
-               guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
-               g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
-       }
-
-       rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) free_byte_array_callback,
-                       part->stripped_content);
-       rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
-                       part->newlines);
-
        /* Ugly workaround */
        part->normalized_words = rspamd_tokenize_text (part->content->data,
                        part->content->len, IS_PART_UTF (part), task->cfg,
-                       part->urls_offset, FALSE,
+                       part->exceptions, FALSE,
                        NULL);
 
        if (part->normalized_words) {
@@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 #endif
 }
 
+/**
+ * Build the newline-stripped copy of a text part.
+ *
+ * Copies part->content into part->stripped_content, leaving out line breaks
+ * (a '\n', an optional '\r' directly before it, and any '\r'/'\n' characters
+ * that directly follow). For every stripped break the offset inside the
+ * stripped buffer is remembered in part->newlines (converted to pointers into
+ * part->stripped_content->data once the buffer is complete) and a zero-length
+ * RSPAMD_EXCEPTION_NEWLINE entry is prepended to part->exceptions.
+ *
+ * part->nlines is incremented once per stripped break and once more for each
+ * '\n' met while skipping the remainder of the break.
+ * NOTE(review): for a "\r\n" pair the skip loop sees the '\n' again, so such
+ * a line ending is counted twice - confirm whether that is intended.
+ *
+ * Destructors for stripped_content and newlines are registered on the task
+ * pool; exception structures are allocated from the same pool.
+ *
+ * @param task task providing the memory pool
+ * @param part text part to process, part->content is read
+ */
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+               struct rspamd_mime_text_part *part)
+{
+
+       const guchar *p, *c, *end;
+       guint i;
+       struct rspamd_process_exception *ex;
+
+       /* Strip newlines */
+       part->stripped_content = g_byte_array_sized_new (part->content->len);
+       part->newlines = g_ptr_array_sized_new (128);
+       p = part->content->data;
+       c = p;
+       end = p + part->content->len;
+
+       while (p < end) {
+               p = memchr (c, '\n', end - c);
+
+               if (p) {
+                       /*
+                        * Only look at the previous byte when there is one in the
+                        * current chunk: if the part starts with '\n' then p == c
+                        * and p - 1 would point before the buffer.
+                        */
+                       if (p > c && *(p - 1) == '\r') {
+                               p --;
+                       }
+
+                       if (p > c) {
+                               g_byte_array_append (part->stripped_content, c, p - c);
+                       }
+
+                       /* As it could cause reallocation, we initially store offsets */
+                       g_ptr_array_add (part->newlines,
+                                       GUINT_TO_POINTER (part->stripped_content->len));
+                       ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+                       ex->pos = part->stripped_content->len;
+                       ex->len = 0;
+                       ex->type = RSPAMD_EXCEPTION_NEWLINE;
+                       part->exceptions = g_list_prepend (part->exceptions, ex);
+                       part->nlines ++;
+                       p ++;
+
+                       /* Swallow the rest of the line break sequence */
+                       while (p < end && (*p == '\r' || *p == '\n')) {
+                               if (*p == '\n') {
+                                       part->nlines ++;
+                               }
+
+                               p ++;
+                       }
+                       c = p;
+               }
+               else {
+                       /* No more newlines: the tail is flushed after the loop */
+                       p = end;
+                       break;
+               }
+       }
+
+       if (p > c) {
+               g_byte_array_append (part->stripped_content, c, p - c);
+       }
+
+       /* Now convert offsets to real pointers for convenience */
+       for (i = 0; i < part->newlines->len; i ++) {
+               guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+               g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+       }
+
+       rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) free_byte_array_callback,
+                       part->stripped_content);
+       rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+                       part->newlines);
+}
+
 #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
 
 static guint
@@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
        return FALSE;
 }
 
+/**
+ * Comparison callback ordering process exceptions by ascending position.
+ *
+ * @param a pointer to a struct rspamd_process_exception
+ * @param b pointer to a struct rspamd_process_exception
+ * @return negative, zero or positive if a->pos is less than, equal to or
+ * greater than b->pos
+ */
+static gint
+exceptions_compare_func (gconstpointer a, gconstpointer b)
+{
+       const struct rspamd_process_exception *ea = a, *eb = b;
+
+       /*
+        * pos is a goffset, so returning the raw difference would truncate it
+        * to gint and could flip the sign or yield 0 for distinct positions.
+        */
+       return (ea->pos > eb->pos) - (ea->pos < eb->pos);
+}
+
 static void
 process_text_part (struct rspamd_task *task,
        GByteArray *part_content,
@@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task,
                                task->task_pool,
                                text_part->html,
                                part_content,
-                               &text_part->urls_offset,
+                               &text_part->exceptions,
                                task->urls,
                                task->emails);
 
@@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task,
                }
 
                /* Handle offsets of this part */
-               if (text_part->urls_offset != NULL) {
-                       text_part->urls_offset = g_list_reverse (text_part->urls_offset);
+               if (text_part->exceptions != NULL) {
+                       text_part->exceptions = g_list_reverse (text_part->exceptions);
                        rspamd_mempool_add_destructor (task->task_pool,
-                                       (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
+                                       (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions);
                }
 
                rspamd_mempool_add_destructor (task->task_pool,
@@ -1006,6 +1029,11 @@ process_text_part (struct rspamd_task *task,
        if (!IS_PART_HTML (text_part)) {
                rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
        }
+
+       text_part->exceptions = g_list_sort (text_part->exceptions,
+                       exceptions_compare_func);
+
+       rspamd_extract_words (task, text_part);
 }
 
 struct mime_foreach_data {
index 0d2ae74b4ab704a866d514229e01b3aea070984d..3fe26e685650a8f8ef1a2734c9fbe0efcd8e0d97 100644 (file)
@@ -57,7 +57,7 @@ struct rspamd_mime_text_part {
        GByteArray *stripped_content; /**< no newlines or html tags                     */
        GPtrArray *newlines;    /**< positions of newlines in text                                      */
        struct html_content *html;
-       GList *urls_offset;     /**< list of offsets of urls                                            */
+       GList *exceptions;      /**< list of offsets of urls                                            */
        GMimeObject *parent;
        struct rspamd_mime_part *mime_part;
        GArray *normalized_words;
index 0a25e488a67bc6f65be2442e16bc6b7bb17b5bca..1188515c59c66aa79e3ef45bf6faf6e6a59f3837 100644 (file)
@@ -1601,7 +1601,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
        gint substate = 0, len, href_offset = -1;
        struct html_tag *cur_tag = NULL;
        struct rspamd_url *url = NULL, *turl;
-       struct process_exception *ex;
+       struct rspamd_process_exception *ex;
        enum {
                parse_start = 0,
                tag_begin,
@@ -1977,6 +1977,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                                ex = rspamd_mempool_alloc (pool, sizeof (*ex));
                                                                ex->pos = href_offset;
                                                                ex->len = dest->len - href_offset;
+                                                               ex->type = RSPAMD_EXCEPTION_URL;
 
                                                                *exceptions = g_list_prepend (*exceptions, ex);
                                                        }
index 823e32a434cf2503217e1f531b363f729f67a2a0..1ccc91a2720085f7be63268d5c80bc9540dcae87 100644 (file)
@@ -2294,17 +2294,18 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
                gsize end_offset, gpointer ud)
 {
        struct rspamd_url_mimepart_cbdata *cbd = ud;
-       struct process_exception *ex;
+       struct rspamd_process_exception *ex;
        struct rspamd_task *task;
        gchar *url_str = NULL;
        struct rspamd_url *query_url;
        gint rc;
 
        task = cbd->task;
-       ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception));
+       ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_process_exception));
 
        ex->pos = start_offset;
        ex->len = end_offset - start_offset;
+       ex->type = RSPAMD_EXCEPTION_URL;
 
        if (url->protocol == PROTOCOL_MAILTO) {
                if (url->userlen > 0) {
@@ -2320,8 +2321,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
                }
        }
 
-       cbd->part->urls_offset = g_list_prepend (
-                       cbd->part->urls_offset,
+       cbd->part->exceptions = g_list_prepend (
+                       cbd->part->exceptions,
                        ex);
 
        /* We also search the query for additional url inside */
@@ -2376,10 +2377,10 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
                        rspamd_url_text_part_callback, &mcbd);
 
        /* Handle offsets of this part */
-       if (part->urls_offset != NULL) {
-               part->urls_offset = g_list_reverse (part->urls_offset);
+       if (part->exceptions != NULL) {
+               part->exceptions = g_list_reverse (part->exceptions);
                rspamd_mempool_add_destructor (task->task_pool,
-                               (rspamd_mempool_destruct_t) g_list_free, part->urls_offset);
+                               (rspamd_mempool_destruct_t) g_list_free, part->exceptions);
        }
 }
 
index 4e0e4b75dcf6b22a100839298f98442f4dd5c22f..6eab11f98ecbf6bc49ee79e28cffb40a5d19c785 100644 (file)
@@ -75,7 +75,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf,
 {
        gsize remain, pos;
        const gchar *p;
-       struct process_exception *ex = NULL;
+       struct rspamd_process_exception *ex = NULL;
 
        if (buf == NULL) {
                return FALSE;
@@ -166,11 +166,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
                GList **exceptions, gboolean is_utf, gsize *rl,
                gboolean check_signature)
 {
-       gsize remain, pos, siglen = 0;
+       gsize remain, siglen = 0;
+       goffset pos;
        const gchar *p, *next_p, *sig = NULL;
        gunichar uc;
        guint processed = 0;
-       struct process_exception *ex = NULL;
+       struct rspamd_process_exception *ex = NULL;
        enum {
                skip_delimiters = 0,
                feed_token,
@@ -214,10 +215,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
 
                switch (state) {
                case skip_delimiters:
-                       if (ex != NULL && p - buf->begin == (gint)ex->pos) {
-                               token->begin = "!!EX!!";
-                               token->len = sizeof ("!!EX!!") - 1;
-                               processed = token->len;
+                       if (ex != NULL && p - buf->begin == ex->pos) {
+                               if (ex->type == RSPAMD_EXCEPTION_URL) {
+                                       token->begin = "!!EX!!";
+                                       token->len = sizeof ("!!EX!!") - 1;
+                                       processed = token->len;
+                               }
                                state = skip_exception;
                                continue;
                        }
@@ -270,12 +273,13 @@ set_token:
                *rl = processed;
        }
 
-       if (token->len == 0) {
+       if (token->len == 0 && processed > 0) {
                token->len = p - token->begin;
                g_assert (token->len > 0);
-               *cur = p;
        }
 
+       *cur = p;
+
        return TRUE;
 }
 
index ccbcec6e6ad17b086f791b06c2f696c42e1d1b1a..6ce4179f1e1f714b8934538d5066ea542d13f804 100644 (file)
@@ -733,7 +733,7 @@ lua_util_tokenize_text (lua_State *L)
        gsize len, pos, ex_len, i;
        GList *exceptions = NULL, *cur;
        struct rspamd_lua_text *t;
-       struct process_exception *ex;
+       struct rspamd_process_exception *ex;
        GArray *res;
        rspamd_ftok_t *w;
        gboolean compat = FALSE;
index 5626337fc0115f87896e42cd29a122a3e8261bec..ffebfe387a179c5dfd51609b64ac507bdaaeaf00 100644 (file)
@@ -265,12 +265,17 @@ struct rspamd_main {
        struct event_base *ev_base;
 };
 
+/**
+ * Kind of region described by a struct rspamd_process_exception
+ */
+enum rspamd_exception_type {
+       RSPAMD_EXCEPTION_NEWLINE = 0, /**< place where a line break was stripped */
+       RSPAMD_EXCEPTION_URL, /**< span occupied by an extracted url */
+};
 /**
  * Structure to point exception in text from processing
  */
-struct process_exception {
-       gsize pos;
-       gsize len;
+struct rspamd_process_exception {
+       goffset pos; /**< offset of the excepted region in the text */
+       guint len; /**< length of the region, 0 for newline exceptions */
+       enum rspamd_exception_type type; /**< what kind of region this is */
 };
 
 /**