source.dussan.org Git - rspamd.git/commitdiff
[Feature] Use one pass to remove newlines and store their positions
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jun 2016 14:11:41 +0000 (15:11 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jun 2016 14:11:41 +0000 (15:11 +0100)
src/libmime/message.c
src/libmime/message.h

index 978ab566f4261b240f17a9f6c3f536603b0141c9..db7cc6a5391c6281e743d315bd41483ff474f2ca 100644 (file)
@@ -670,23 +670,37 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 #endif
        /* Strip newlines */
        part->stripped_content = g_byte_array_sized_new (part->content->len);
+       part->newlines = g_ptr_array_sized_new (128);
        p = part->content->data;
        c = p;
        end = p + part->content->len;
 
        while (p < end) {
-               if (*p == '\r' || *p == '\n') {
+               p = memchr (c, '\n', end - c);
+
+               if (p) {
                        if (p > c) {
                                g_byte_array_append (part->stripped_content, c, p - c);
                        }
 
+                       /* As it could cause reallocation, we initially store offsets */
+                       g_ptr_array_add (part->newlines,
+                                       GUINT_TO_POINTER (part->stripped_content->len));
+                       part->nlines ++;
+                       p ++;
+
                        while (p < end && (*p == '\r' || *p == '\n')) {
+                               if (*p == '\n') {
+                                       part->nlines ++;
+                               }
+
                                p ++;
                        }
                        c = p;
                }
                else {
-                       p ++;
+                       p = end;
+                       break;
                }
        }
 
@@ -694,9 +708,18 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                g_byte_array_append (part->stripped_content, c, p - c);
        }
 
+       /* Now convert offsets to real pointers for convenience */
+       for (i = 0; i < part->newlines->len; i ++) {
+               guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
+               g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+       }
+
        rspamd_mempool_add_destructor (task->task_pool,
                        (rspamd_mempool_destruct_t) free_byte_array_callback,
                        part->stripped_content);
+       rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+                       part->newlines);
 
        /* Ugly workaround */
        part->normalized_words = rspamd_tokenize_text (part->content->data,
@@ -844,8 +867,7 @@ process_text_part (struct rspamd_task *task,
        gboolean is_empty)
 {
        struct mime_text_part *text_part;
-       const gchar *cd, *p, *c;
-       guint remain;
+       const gchar *cd;
 
        /* Skip attachments */
 #ifndef GMIME24
@@ -964,21 +986,6 @@ process_text_part (struct rspamd_task *task,
        detect_text_language (text_part);
        rspamd_normalize_text_part (task, text_part);
 
-       /* Calculate number of lines */
-       p = text_part->content->data;
-       remain = text_part->content->len;
-       c = p;
-
-       while (p != NULL && remain > 0) {
-               p = memchr (c, '\n', remain);
-
-               if (p != NULL) {
-                       text_part->nlines ++;
-                       remain -= p - c + 1;
-                       c = p + 1;
-               }
-       }
-
        if (!IS_PART_HTML (text_part)) {
                rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
        }
index 619154c6b8a5e2f0b6d3c0b2f3326a2e31bfd7f8..8017514f306e439ef695822c919ecc30d231614d 100644 (file)
@@ -46,6 +46,7 @@ struct mime_text_part {
        GByteArray *orig;
        GByteArray *content;
        GByteArray *stripped_content; /**< no newlines or html tags                     */
+       GPtrArray *newlines;    /**< positions of newlines in text                                      */
        struct html_content *html;
        GList *urls_offset;     /**< list of offsets of urls                                            */
        GMimeObject *parent;