From 253bad00ac438c3683f971b94c9a11bd76e06594 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 16 Jun 2016 15:11:41 +0100 Subject: [PATCH] [Feature] Use one pass to remove newlines and store their positions --- src/libmime/message.c | 45 +++++++++++++++++++++++++------------------ src/libmime/message.h | 1 + 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 978ab566f..db7cc6a53 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -670,23 +670,37 @@ rspamd_normalize_text_part (struct rspamd_task *task, #endif /* Strip newlines */ part->stripped_content = g_byte_array_sized_new (part->content->len); + part->newlines = g_ptr_array_sized_new (128); p = part->content->data; c = p; end = p + part->content->len; while (p < end) { - if (*p == '\r' || *p == '\n') { + p = memchr (c, '\n', end - c); + + if (p) { if (p > c) { g_byte_array_append (part->stripped_content, c, p - c); } + /* As it could cause reallocation, we initially store offsets */ + g_ptr_array_add (part->newlines, + GUINT_TO_POINTER (part->stripped_content->len)); + part->nlines ++; + p ++; + while (p < end && (*p == '\r' || *p == '\n')) { + if (*p == '\n') { + part->nlines ++; + } + p ++; } c = p; } else { - p ++; + p = end; + break; } } @@ -694,9 +708,18 @@ rspamd_normalize_text_part (struct rspamd_task *task, g_byte_array_append (part->stripped_content, c, p - c); } + /* Now convert offsets to real pointers for convenience */ + for (i = 0; i < part->newlines->len; i ++) { + guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i)); + g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off; + } + rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) free_byte_array_callback, part->stripped_content); + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, + part->newlines); /* Ugly workaround */ part->normalized_words = rspamd_tokenize_text (part->content->data, @@ -844,8 +867,7 @@ process_text_part (struct rspamd_task *task, gboolean is_empty) { struct mime_text_part *text_part; - const gchar *cd, *p, *c; - guint remain; + const gchar *cd; /* Skip attachments */ #ifndef GMIME24 @@ -964,21 +986,6 @@ process_text_part (struct rspamd_task *task, detect_text_language (text_part); rspamd_normalize_text_part (task, text_part); - /* Calculate number of lines */ - p = text_part->content->data; - remain = text_part->content->len; - c = p; - - while (p != NULL && remain > 0) { - p = memchr (c, '\n', remain); - - if (p != NULL) { - text_part->nlines ++; - remain -= p - c + 1; - c = p + 1; - } - } - if (!IS_PART_HTML (text_part)) { rspamd_url_text_extract (task->task_pool, task, text_part, FALSE); } diff --git a/src/libmime/message.h b/src/libmime/message.h index 619154c6b..8017514f3 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -46,6 +46,7 @@ struct mime_text_part { GByteArray *orig; GByteArray *content; GByteArray *stripped_content; /**< no newlines or html tags */ + GPtrArray *newlines; /**< positions of newlines in text */ struct html_content *html; GList *urls_offset; /**< list of offsets of urls */ GMimeObject *parent; -- 2.39.5