From f5dcf4b8a4a6a9881d95e4d4b1edd4c27c077d08 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 23 Jul 2016 17:13:36 +0100 Subject: [PATCH] [Feature] Create a dedicated parser to strip newlines Issue: #744 --- src/CMakeLists.txt | 8 +++- src/libmime/message.c | 68 ++++-------------------------- src/libmime/smtp_parsers.h | 4 ++ src/ragel/newlines_strip.rl | 82 +++++++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 61 deletions(-) create mode 100644 src/ragel/newlines_strip.rl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 817a927b8..d0512da83 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -117,6 +117,11 @@ RAGEL_TARGET(ragel_smtp_received DEPENDS ${RAGEL_DEPENDS} COMPILE_FLAGS -T1 OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c) +RAGEL_TARGET(ragel_newlines_strip + INPUTS ragel/newlines_strip.rl + DEPENDS ${RAGEL_DEPENDS} + COMPILE_FLAGS -G2 + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c) ######################### LINK SECTION ############################### ADD_LIBRARY(rspamd-server STATIC @@ -129,7 +134,8 @@ ADD_LIBRARY(rspamd-server STATIC ${CMAKE_CURRENT_BINARY_DIR}/modules.c ${PLUGINSSRC} "${RAGEL_ragel_smtp_addr_OUTPUTS}" - "${RAGEL_ragel_smtp_received_OUTPUTS}") + "${RAGEL_ragel_smtp_received_OUTPUTS}" + "${RAGEL_ragel_newlines_strip_OUTPUTS}") TARGET_LINK_LIBRARIES(rspamd-server rspamd-http-parser) TARGET_LINK_LIBRARIES(rspamd-server rspamd-cdb) TARGET_LINK_LIBRARIES(rspamd-server rspamd-lpeg) diff --git a/src/libmime/message.c b/src/libmime/message.c index 6e4e69597..89ccff68b 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -794,68 +794,16 @@ rspamd_normalize_text_part (struct rspamd_task *task, c = p; end = p + part->content->len; - while (p < end) { - p = memchr (c, '\n', end - c); - - if (p) { - if (*(p - 1) == '\r') { - p --; - } - - if (p > c) { - g_byte_array_append (part->stripped_content, c, p - c); - } - - /* - * Now we need to decide, maybe we have the 
following cases: - * 1. Multiple newlines must be replaced by one newline - * 2. If a line is finished with punctuation character, then insert - * one newline - * 3. In HTML parts we have to insert newlines as well - */ - - if (p > part->content->data && - (IS_PART_HTML (part) || - *(p - 1) == '\n' || - g_ascii_ispunct (*(p - 1)) - )) { - g_byte_array_append (part->stripped_content, "\n", 1); - } - - /* As it could cause reallocation, we initially store offsets */ - g_ptr_array_add (part->newlines, - GUINT_TO_POINTER (part->stripped_content->len)); - ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); - ex->pos = part->stripped_content->len; - ex->len = 0; - ex->type = RSPAMD_EXCEPTION_NEWLINE; - part->exceptions = g_list_prepend (part->exceptions, ex); - part->nlines ++; - p ++; - - while (p < end && (*p == '\r' || *p == '\n')) { - if (*p == '\n') { - part->nlines ++; - } - - p ++; - } - c = p; - } - else { - p = end; - break; - } - } - - if (p > c) { - g_byte_array_append (part->stripped_content, c, p - c); - } + rspamd_strip_newlines_parse (p, end, part->stripped_content, + IS_PART_HTML (part), &part->nlines, part->newlines); - /* Now convert offsets to real pointers for convenience */ for (i = 0; i < part->newlines->len; i ++) { - guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i)); - g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off; + ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); + p = g_ptr_array_index (part->newlines, i); + ex->pos = p - c; + ex->len = 0; + ex->type = RSPAMD_EXCEPTION_NEWLINE; + part->exceptions = g_list_prepend (part->exceptions, ex); } rspamd_mempool_add_destructor (task->task_pool, diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h index 62e7738e3..07bd24688 100644 --- a/src/libmime/smtp_parsers.h +++ b/src/libmime/smtp_parsers.h @@ -26,4 +26,8 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task, int rspamd_smtp_addr_parse (const char *data, size_t 
len, struct rspamd_email_address *addr); +void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, + GByteArray *data, gboolean is_html, guint *newlines_count, + GPtrArray *newlines); + #endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */ diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl new file mode 100644 index 000000000..a2f5620bd --- /dev/null +++ b/src/ragel/newlines_strip.rl @@ -0,0 +1,82 @@ +%%{ + machine newlines_strip; + + action Double_CRLF { + if (!crlf_added) { + g_byte_array_append (data, (const guint8 *)"\n", 1); + c = p; + } + + crlf_added = TRUE; + c = p; + } + + action WSP { + g_byte_array_append (data, (const guint8 *)" ", 1); + c = p; + } + + action Text_Start { + crlf_added = FALSE; + c = p; + } + + action Text_End { + if (p > c) { + g_byte_array_append (data, (const guint8 *)c, p - c); + last_c = *(p - 1); + } + + c = p; + } + + action Line_CRLF { + if (!crlf_added) { + if (is_html || g_ascii_ispunct (last_c)) { + g_byte_array_append (data, (const guint8 *)"\n", 1); + crlf_added = TRUE; + } + } + + (*newlines_count)++; + g_ptr_array_add (newlines, (gpointer)p); + c = p; + } + + + WSP = " " | "\t" | "\v"; + CRLF = ("\r" . "\n") | ( "\r" ) | ("\n"); + DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF; + ANY_CRLF = CRLF | DOUBLE_CRLF; + LINE_ELT = ((WSP+ %WSP)** :> ((^space)+) >Text_Start %Text_End <: (WSP+ %WSP)**); + LINE = LINE_ELT+; + TEXT = ANY_CRLF** . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF; + + main := TEXT; +}%% + +#include <glib.h> + +%% write data; + +void +rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, + GByteArray *data, gboolean is_html, guint *newlines_count, + GPtrArray *newlines) +{ + const gchar *c, *p, *eof; + gint last_c = -1; + gint cs = 0; + gboolean crlf_added = FALSE; + + c = begin; + p = begin; + eof = pe; + + %% write init; + %% write exec; + + if (p > c) { + g_byte_array_append (data, (const guint8 *)c, p - c); + } +} -- 2.39.5