[Feature] Use one pass to remove newlines and store their positions

This commit is contained in:
Vsevolod Stakhov 2016-06-16 15:11:41 +01:00
parent 1a5e9eb0d7
commit 253bad00ac
2 changed files with 27 additions and 19 deletions

View File

@ -670,23 +670,37 @@ rspamd_normalize_text_part (struct rspamd_task *task,
#endif
/* Strip newlines */
part->stripped_content = g_byte_array_sized_new (part->content->len);
part->newlines = g_ptr_array_sized_new (128);
p = part->content->data;
c = p;
end = p + part->content->len;
while (p < end) {
if (*p == '\r' || *p == '\n') {
p = memchr (c, '\n', end - c);
if (p) {
if (p > c) {
g_byte_array_append (part->stripped_content, c, p - c);
}
/* Appending to stripped_content may reallocate its buffer, so store offsets here and convert them to pointers afterwards */
g_ptr_array_add (part->newlines,
GUINT_TO_POINTER (part->stripped_content->len));
part->nlines ++;
p ++;
while (p < end && (*p == '\r' || *p == '\n')) {
if (*p == '\n') {
part->nlines ++;
}
p ++;
}
c = p;
}
else {
p ++;
p = end;
break;
}
}
@ -694,9 +708,18 @@ rspamd_normalize_text_part (struct rspamd_task *task,
g_byte_array_append (part->stripped_content, c, p - c);
}
/* Now convert offsets to real pointers for convenience */
for (i = 0; i < part->newlines->len; i ++) {
guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
}
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) free_byte_array_callback,
part->stripped_content);
rspamd_mempool_add_destructor (task->task_pool,
(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
part->newlines);
/* Ugly workaround */
part->normalized_words = rspamd_tokenize_text (part->content->data,
@ -844,8 +867,7 @@ process_text_part (struct rspamd_task *task,
gboolean is_empty)
{
struct mime_text_part *text_part;
const gchar *cd, *p, *c;
guint remain;
const gchar *cd;
/* Skip attachments */
#ifndef GMIME24
@ -964,21 +986,6 @@ process_text_part (struct rspamd_task *task,
detect_text_language (text_part);
rspamd_normalize_text_part (task, text_part);
/* Calculate number of lines */
p = text_part->content->data;
remain = text_part->content->len;
c = p;
while (p != NULL && remain > 0) {
p = memchr (c, '\n', remain);
if (p != NULL) {
text_part->nlines ++;
remain -= p - c + 1;
c = p + 1;
}
}
if (!IS_PART_HTML (text_part)) {
rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
}

View File

@ -46,6 +46,7 @@ struct mime_text_part {
GByteArray *orig;
GByteArray *content;
GByteArray *stripped_content; /**< no newlines or html tags */
GPtrArray *newlines; /**< positions of newlines in text */
struct html_content *html;
GList *urls_offset; /**< list of offsets of urls */
GMimeObject *parent;