diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-07-13 17:03:27 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-07-13 17:07:23 +0100 |
commit | 70cbb6d39a06eb6f71832517bfd788ad217b6965 (patch) | |
tree | 5e0e41033565b271021072aa5c2455f0e79a91a7 /src/libmime | |
parent | d2af2a1d52a8f9b26b7c77b12ce555db24f07df4 (diff) | |
download | rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.tar.gz rspamd-70cbb6d39a06eb6f71832517bfd788ad217b6965.zip |
[Rework] Rework exceptions and newlines processing
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/message.c | 158 | ||||
-rw-r--r-- | src/libmime/message.h | 2 |
2 files changed, 94 insertions, 66 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index f6c023294..4605d1c69 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -658,15 +658,15 @@ detect_text_language (struct rspamd_mime_text_part *part) } static void -rspamd_normalize_text_part (struct rspamd_task *task, +rspamd_extract_words (struct rspamd_task *task, struct rspamd_mime_text_part *part) { #ifdef WITH_SNOWBALL struct sb_stemmer *stem = NULL; #endif rspamd_ftok_t *w; - const guchar *r, *p, *c, *end; gchar *temp_word; + const guchar *r; guint i, nlen; #ifdef WITH_SNOWBALL @@ -674,71 +674,14 @@ rspamd_normalize_text_part (struct rspamd_task *task, stem = sb_stemmer_new (part->language, "UTF_8"); if (stem == NULL) { msg_info_task ("<%s> cannot create lemmatizer for %s language", - task->message_id, part->language); + task->message_id, part->language); } } #endif - /* Strip newlines */ - part->stripped_content = g_byte_array_sized_new (part->content->len); - part->newlines = g_ptr_array_sized_new (128); - p = part->content->data; - c = p; - end = p + part->content->len; - - while (p < end) { - p = memchr (c, '\n', end - c); - - if (p) { - if (*(p - 1) == '\r') { - p --; - } - - if (p > c) { - g_byte_array_append (part->stripped_content, c, p - c); - } - - /* As it could cause reallocation, we initially store offsets */ - g_ptr_array_add (part->newlines, - GUINT_TO_POINTER (part->stripped_content->len)); - part->nlines ++; - p ++; - - while (p < end && (*p == '\r' || *p == '\n')) { - if (*p == '\n') { - part->nlines ++; - } - - p ++; - } - c = p; - } - else { - p = end; - break; - } - } - - if (p > c) { - g_byte_array_append (part->stripped_content, c, p - c); - } - - /* Now convert offsets to real pointers for convenience */ - for (i = 0; i < part->newlines->len; i ++) { - guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i)); - g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off; - } - - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) free_byte_array_callback, - part->stripped_content); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, - part->newlines); - /* Ugly workaround */ part->normalized_words = rspamd_tokenize_text (part->content->data, part->content->len, IS_PART_UTF (part), task->cfg, - part->urls_offset, FALSE, + part->exceptions, FALSE, NULL); if (part->normalized_words) { @@ -798,6 +741,78 @@ rspamd_normalize_text_part (struct rspamd_task *task, #endif } +static void +rspamd_normalize_text_part (struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + + const guchar *p, *c, *end; + guint i; + struct rspamd_process_exception *ex; + + /* Strip newlines */ + part->stripped_content = g_byte_array_sized_new (part->content->len); + part->newlines = g_ptr_array_sized_new (128); + p = part->content->data; + c = p; + end = p + part->content->len; + + while (p < end) { + p = memchr (c, '\n', end - c); + + if (p) { + if (*(p - 1) == '\r') { + p --; + } + + if (p > c) { + g_byte_array_append (part->stripped_content, c, p - c); + } + + /* As it could cause reallocation, we initially store offsets */ + g_ptr_array_add (part->newlines, + GUINT_TO_POINTER (part->stripped_content->len)); + ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); + ex->pos = part->stripped_content->len; + ex->len = 0; + ex->type = RSPAMD_EXCEPTION_NEWLINE; + part->exceptions = g_list_prepend (part->exceptions, ex); + part->nlines ++; + p ++; + + while (p < end && (*p == '\r' || *p == '\n')) { + if (*p == '\n') { + part->nlines ++; + } + + p ++; + } + c = p; + } + else { + p = end; + break; + } + } + + if (p > c) { + g_byte_array_append (part->stripped_content, c, p - c); + } + + /* Now convert offsets to real pointers for convenience */ + for (i = 0; i < part->newlines->len; i ++) { + guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i)); + g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off; + } + + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) free_byte_array_callback, + part->stripped_content); + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, + part->newlines); +} + #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) static guint @@ -872,6 +887,14 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part return FALSE; } +static gint +exceptions_compare_func (gconstpointer a, gconstpointer b) +{ + const struct rspamd_process_exception *ea = a, *eb = b; + + return ea->pos - eb->pos; +} + static void process_text_part (struct rspamd_task *task, GByteArray *part_content, @@ -932,7 +955,7 @@ process_text_part (struct rspamd_task *task, task->task_pool, text_part->html, part_content, - &text_part->urls_offset, + &text_part->exceptions, task->urls, task->emails); @@ -941,10 +964,10 @@ process_text_part (struct rspamd_task *task, } /* Handle offsets of this part */ - if (text_part->urls_offset != NULL) { - text_part->urls_offset = g_list_reverse (text_part->urls_offset); + if (text_part->exceptions != NULL) { + text_part->exceptions = g_list_reverse (text_part->exceptions); rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset); + (rspamd_mempool_destruct_t) g_list_free, text_part->exceptions); } rspamd_mempool_add_destructor (task->task_pool, @@ -1006,6 +1029,11 @@ process_text_part (struct rspamd_task *task, if (!IS_PART_HTML (text_part)) { rspamd_url_text_extract (task->task_pool, task, text_part, FALSE); } + + text_part->exceptions = g_list_sort (text_part->exceptions, + exceptions_compare_func); + + rspamd_extract_words (task, text_part); } struct mime_foreach_data { diff --git a/src/libmime/message.h b/src/libmime/message.h index 0d2ae74b4..3fe26e685 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -57,7 +57,7 @@ struct rspamd_mime_text_part { GByteArray *stripped_content; /**< no newlines or html tags */ GPtrArray *newlines; /**< positions of newlines in text */ struct html_content *html; - GList *urls_offset; /**< list of offsets of urls */ + GList *exceptions; /**< list of offsets of urls */ GMimeObject *parent; struct rspamd_mime_part *mime_part; GArray *normalized_words; |