From 70cbb6d39a06eb6f71832517bfd788ad217b6965 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 13 Jul 2016 17:03:27 +0100 Subject: [PATCH] [Rework] Rework exceptions and newlines processing --- src/libmime/message.c | 158 ++++++++++++++++------------ src/libmime/message.h | 2 +- src/libserver/html.c | 3 +- src/libserver/url.c | 15 +-- src/libstat/tokenizers/tokenizers.c | 22 ++-- src/lua/lua_util.c | 2 +- src/rspamd.h | 11 +- 7 files changed, 126 insertions(+), 87 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index f6c023294..4605d1c69 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part) } static void -rspamd_normalize_text_part (struct rspamd_task *task, +rspamd_extract_words (struct rspamd_task *task, struct rspamd_mime_text_part *part) { #ifdef WITH_SNOWBALL struct sb_stemmer *stem = NULL; #endif rspamd_ftok_t *w; - const guchar *r, *p, *c, *end; gchar *temp_word; + const guchar *r; guint i, nlen; #ifdef WITH_SNOWBALL @@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task, stem = sb_stemmer_new (part->language, "UTF_8"); if (stem == NULL) { msg_info_task ("<%s> cannot create lemmatizer for %s language", - task->message_id, part->language); + task->message_id, part->language); } } #endif - /* Strip newlines */ - part->stripped_content = g_byte_array_sized_new (part->content->len); - part->newlines = g_ptr_array_sized_new (128); - p = part->content->data; - c = p; - end = p + part->content->len; - - while (p < end) { - p = memchr (c, '\n', end - c); - - if (p) { - if (*(p - 1) == '\r') { - p --; - } - - if (p > c) { - g_byte_array_append (part->stripped_content, c, p - c); - } - - /* As it could cause reallocation, we initially store offsets */ - g_ptr_array_add (part->newlines, - GUINT_TO_POINTER (part->stripped_content->len)); - part->nlines ++; - p ++; - - while (p < end && (*p == '\r' || *p == '\n')) { - if (*p == 
'\n') { - part->nlines ++; - } - - p ++; - } - c = p; - } - else { - p = end; - break; - } - } - - if (p > c) { - g_byte_array_append (part->stripped_content, c, p - c); - } - - /* Now convert offsets to real pointers for convenience */ - for (i = 0; i < part->newlines->len; i ++) { - guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i)); - g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off; - } - - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) free_byte_array_callback, - part->stripped_content); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, - part->newlines); - /* Ugly workaround */ part->normalized_words = rspamd_tokenize_text (part->content->data, part->content->len, IS_PART_UTF (part), task->cfg, - part->urls_offset, FALSE, + part->exceptions, FALSE, NULL); if (part->normalized_words) { @@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task, #endif } +static void +rspamd_normalize_text_part (struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + + const guchar *p, *c, *end; + guint i; + struct rspamd_process_exception *ex; + + /* Strip newlines */ + part->stripped_content = g_byte_array_sized_new (part->content->len); + part->newlines = g_ptr_array_sized_new (128); + p = part->content->data; + c = p; + end = p + part->content->len; + + while (p < end) { + p = memchr (c, '\n', end - c); + + if (p) { + if (*(p - 1) == '\r') { + p --; + } + + if (p > c) { + g_byte_array_append (part->stripped_content, c, p - c); + } + + /* As it could cause reallocation, we initially store offsets */ + g_ptr_array_add (part->newlines, + GUINT_TO_POINTER (part->stripped_content->len)); + ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); + ex->pos = part->stripped_content->len; + ex->len = 0; + ex->type = RSPAMD_EXCEPTION_NEWLINE; + part->exceptions = g_list_prepend (part->exceptions, ex); + part->nlines ++; + 
p ++; + + while (p < end && (*p == '\r' || *p == '\n')) { + if (*p == '\n') { + part->nlines ++; + } + + p ++; + } + c = p; + } + else { + p = end; + break; + } + } + + if (p > c) { + g_byte_array_append (part->stripped_content, c, p - c); + } + + /* Now convert offsets to real pointers for convenience */ + for (i = 0; i < part->newlines->len; i ++) { + guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i)); + g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off; + } + + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) free_byte_array_callback, + part->stripped_content); + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, + part->newlines); +} + #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) static guint @@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part return FALSE; } +static gint +exceptions_compare_func (gconstpointer a, gconstpointer b) +{ + const struct rspamd_process_exception *ea = a, *eb = b; + + return ea->pos - eb->pos; +} + static void process_text_part (struct rspamd_task *task, GByteArray *part_content, @@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task, task->task_pool, text_part->html, part_content, - &text_part->urls_offset, + &text_part->exceptions, task->urls, task->emails); @@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task, } /* Handle offsets of this part */ - if (text_part->urls_offset != NULL) { - text_part->urls_offset = g_list_reverse (text_part->urls_offset); + if (text_part->exceptions != NULL) { + text_part->exceptions = g_list_reverse (text_part->exceptions); rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset); + (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions); } rspamd_mempool_add_destructor (task->task_pool, @@ -1006,6 
+1029,11 @@ process_text_part (struct rspamd_task *task, if (!IS_PART_HTML (text_part)) { rspamd_url_text_extract (task->task_pool, task, text_part, FALSE); } + + text_part->exceptions = g_list_sort (text_part->exceptions, + exceptions_compare_func); + + rspamd_extract_words (task, text_part); } struct mime_foreach_data { diff --git a/src/libmime/message.h b/src/libmime/message.h index 0d2ae74b4..3fe26e685 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -57,7 +57,7 @@ struct rspamd_mime_text_part { GByteArray *stripped_content; /**< no newlines or html tags */ GPtrArray *newlines; /**< positions of newlines in text */ struct html_content *html; - GList *urls_offset; /**< list of offsets of urls */ + GList *exceptions; /**< list of processing exceptions (urls, newlines) */ GMimeObject *parent; struct rspamd_mime_part *mime_part; GArray *normalized_words; diff --git a/src/libserver/html.c b/src/libserver/html.c index 0a25e488a..1188515c5 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1601,7 +1601,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, gint substate = 0, len, href_offset = -1; struct html_tag *cur_tag = NULL; struct rspamd_url *url = NULL, *turl; - struct process_exception *ex; + struct rspamd_process_exception *ex; enum { parse_start = 0, tag_begin, @@ -1977,6 +1977,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, ex = rspamd_mempool_alloc (pool, sizeof (*ex)); ex->pos = href_offset; ex->len = dest->len - href_offset; + ex->type = RSPAMD_EXCEPTION_URL; *exceptions = g_list_prepend (*exceptions, ex); } diff --git a/src/libserver/url.c b/src/libserver/url.c index 823e32a43..1ccc91a27 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2294,17 +2294,18 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, gsize end_offset, gpointer ud) { struct rspamd_url_mimepart_cbdata *cbd = ud; - struct process_exception *ex; + struct
rspamd_process_exception *ex; struct rspamd_task *task; gchar *url_str = NULL; struct rspamd_url *query_url; gint rc; task = cbd->task; - ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct process_exception)); + ex = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_process_exception)); ex->pos = start_offset; ex->len = end_offset - start_offset; + ex->type = RSPAMD_EXCEPTION_URL; if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen > 0) { @@ -2320,8 +2321,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, } } - cbd->part->urls_offset = g_list_prepend ( - cbd->part->urls_offset, + cbd->part->exceptions = g_list_prepend ( + cbd->part->exceptions, ex); /* We also search the query for additional url inside */ @@ -2376,10 +2377,10 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, rspamd_url_text_part_callback, &mcbd); /* Handle offsets of this part */ - if (part->urls_offset != NULL) { - part->urls_offset = g_list_reverse (part->urls_offset); + if (part->exceptions != NULL) { + part->exceptions = g_list_reverse (part->exceptions); rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_list_free, part->urls_offset); + (rspamd_mempool_destruct_t) g_list_free, part->exceptions); } } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 4e0e4b75d..6eab11f98 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -75,7 +75,7 @@ rspamd_tokenizer_get_word_compat (rspamd_ftok_t * buf, { gsize remain, pos; const gchar *p; - struct process_exception *ex = NULL; + struct rspamd_process_exception *ex = NULL; if (buf == NULL) { return FALSE; @@ -166,11 +166,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, GList **exceptions, gboolean is_utf, gsize *rl, gboolean check_signature) { - gsize remain, pos, siglen = 0; + gsize remain, siglen = 0; + goffset pos; const gchar *p, *next_p, *sig = NULL; gunichar uc; guint processed = 
0; - struct process_exception *ex = NULL; + struct rspamd_process_exception *ex = NULL; enum { skip_delimiters = 0, feed_token, @@ -214,10 +215,12 @@ rspamd_tokenizer_get_word (rspamd_ftok_t * buf, switch (state) { case skip_delimiters: - if (ex != NULL && p - buf->begin == (gint)ex->pos) { - token->begin = "!!EX!!"; - token->len = sizeof ("!!EX!!") - 1; - processed = token->len; + if (ex != NULL && p - buf->begin == ex->pos) { + if (ex->type == RSPAMD_EXCEPTION_URL) { + token->begin = "!!EX!!"; + token->len = sizeof ("!!EX!!") - 1; + processed = token->len; + } state = skip_exception; continue; } @@ -270,12 +273,13 @@ set_token: *rl = processed; } - if (token->len == 0) { + if (token->len == 0 && processed > 0) { token->len = p - token->begin; g_assert (token->len > 0); - *cur = p; } + *cur = p; + return TRUE; } diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index ccbcec6e6..6ce4179f1 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -733,7 +733,7 @@ lua_util_tokenize_text (lua_State *L) gsize len, pos, ex_len, i; GList *exceptions = NULL, *cur; struct rspamd_lua_text *t; - struct process_exception *ex; + struct rspamd_process_exception *ex; GArray *res; rspamd_ftok_t *w; gboolean compat = FALSE; diff --git a/src/rspamd.h b/src/rspamd.h index 5626337fc..ffebfe387 100644 --- a/src/rspamd.h +++ b/src/rspamd.h @@ -265,12 +265,17 @@ struct rspamd_main { struct event_base *ev_base; }; +enum rspamd_exception_type { + RSPAMD_EXCEPTION_NEWLINE = 0, + RSPAMD_EXCEPTION_URL, +}; /** * Structure to point exception in text from processing */ -struct process_exception { - gsize pos; - gsize len; +struct rspamd_process_exception { + goffset pos; + guint len; + enum rspamd_exception_type type; }; /** -- 2.39.5