From fa4f75e41ebcbf9a45c0077ca040db3df9dc0936 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 21 Jun 2017 08:59:05 +0100 Subject: [PATCH] [Feature] Rework newlines strip parser one more time Issue: #1687 --- src/CMakeLists.txt | 5 -- src/libmime/message.c | 146 ++++++++++++++++++++++++++++++++++++ src/libmime/smtp_parsers.h | 4 - src/ragel/newlines_strip.rl | 82 -------------------- 4 files changed, 146 insertions(+), 91 deletions(-) delete mode 100644 src/ragel/newlines_strip.rl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a318af59e..a637d3bdb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -118,11 +118,6 @@ RAGEL_TARGET(ragel_smtp_received DEPENDS ${RAGEL_DEPENDS} COMPILE_FLAGS -T1 OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c) -RAGEL_TARGET(ragel_newlines_strip - INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/newlines_strip.rl - DEPENDS ${RAGEL_DEPENDS} - COMPILE_FLAGS -G2 - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c) RAGEL_TARGET(ragel_content_type INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl DEPENDS ${RAGEL_DEPENDS} diff --git a/src/libmime/message.c b/src/libmime/message.c index 647cacdf0..503ec5126 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -336,6 +336,152 @@ rspamd_extract_words (struct rspamd_task *task, } +static void +rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, + GByteArray *data, gboolean is_html, guint *newlines_count, + GPtrArray *newlines) +{ + const gchar *p = begin, *c = begin; + gchar last_c = '\0'; + gboolean crlf_added = FALSE; + enum { + normal_char, + seen_cr, + seen_lf, + } state = normal_char; + + while (p < pe) { + if (G_UNLIKELY (*p) == '\r') { + switch (state) { + case normal_char: + state = seen_cr; + if (p > c) { + last_c = *(p - 1); + g_byte_array_append (data, (const guint8 *)c, p - c); + } + + crlf_added = FALSE; + c = p + 1; + break; + case seen_cr: + /* Double \r\r */ + if (!crlf_added) { + g_byte_array_append (data, (const guint8 *)" ", 1); + crlf_added = TRUE; + g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len)))); + } + + (*newlines_count)++; + c = p + 1; + break; + case seen_lf: + /* Likely \r\n\r...*/ + state = seen_cr; + c = p + 1; + break; + } + + p ++; + } + else if (G_UNLIKELY (*p == '\n')) { + switch (state) { + case normal_char: + state = seen_lf; + + if (p > c) { + last_c = *(p - 1); + g_byte_array_append (data, (const guint8 *)c, p - c); + } + + c = p + 1; + + if (is_html || g_ascii_ispunct (last_c)) { + g_byte_array_append (data, (const guint8 *)" ", 1); + crlf_added = TRUE; + } + else { + crlf_added = FALSE; + } + + break; + case seen_cr: + /* \r\n */ + if (!crlf_added) { + if (is_html || g_ascii_ispunct (last_c)) { + g_byte_array_append (data, (const guint8 *) " ", 1); + crlf_added = TRUE; + } + + g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len)))); + } + + c = p + 1; + state = seen_lf; + + break; + case seen_lf: + /* Double \n\n */ + if (!crlf_added) { + g_byte_array_append (data, (const guint8 *)" ", 1); + crlf_added = TRUE; + g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len)))); + } + + (*newlines_count)++; + + c = p + 1; + break; + } + + p ++; + } + else { + switch (state) { + case normal_char: + break; + case seen_cr: + case seen_lf: + (*newlines_count)++; + + /* Skip initial spaces */ + if (G_UNLIKELY (*p == ' ')) { + if (!crlf_added) { + g_byte_array_append (data, (const guint8 *)" ", 1); + } + + while (p < pe && *p == ' ') { + p ++; + c ++; + } + } + + state = normal_char; + break; + } + + p ++; + } + } + + /* Leftover */ + if (p > c) { + switch (state) { + case normal_char: + g_byte_array_append (data, (const guint8 *)c, p - c); + break; + default: + + if (!crlf_added) { + g_byte_array_append (data, (const guint8 *)" ", 1); + g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len)))); + } + + (*newlines_count)++; + break; + } + } +} + static void rspamd_normalize_text_part (struct rspamd_task *task, struct rspamd_mime_text_part *part) diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h index 3f13abb59..57fb5d552 100644 --- a/src/libmime/smtp_parsers.h +++ b/src/libmime/smtp_parsers.h @@ -27,10 +27,6 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task, int rspamd_smtp_addr_parse (const char *data, size_t len, struct rspamd_email_address *addr); -void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, - GByteArray *data, gboolean is_html, guint *newlines_count, - GPtrArray *newlines); - gboolean rspamd_content_type_parser (const char *data, size_t len, struct rspamd_content_type *ct, rspamd_mempool_t *pool); gboolean rspamd_content_disposition_parser (const char *data, size_t len, diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl deleted file mode 100644 index d5de198a5..000000000 --- a/src/ragel/newlines_strip.rl +++ /dev/null @@ -1,82 +0,0 @@ -%%{ - machine newlines_strip; - - action Double_CRLF { - if (!crlf_added && p > c) { - (*newlines_count)++; - g_byte_array_append (data, (const guint8 *)" ", 1); - c = p; - } - - crlf_added = TRUE; - c = p; - } - - action WSP { - g_byte_array_append (data, (const guint8 *)" ", 1); - c = p; - } - - action Text_Start { - crlf_added = FALSE; - c = p; - } - - action Text_End { - if (p > c) { - g_byte_array_append (data, (const guint8 *)c, p - c); - last_c = *(p - 1); - } - - c = p; - } - - action Line_CRLF { - if (!crlf_added) { - if (is_html || g_ascii_ispunct (last_c)) { - g_byte_array_append (data, (const guint8 *)" ", 1); - crlf_added = TRUE; - } - } - - (*newlines_count)++; - g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len)))); - c = p; - } - - - WSP = " " | "\t" | "\v"; - CRLF = ("\r" . "\n") | ( "\r" ) | ("\n"); - DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF; - ANY_CRLF = CRLF | DOUBLE_CRLF; - LINE = (([^\r\n]+) >Text_Start %Text_End); - TEXT = ANY_CRLF* . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF; - - main := TEXT; -}%% - -#include - -%% write data; - -void -rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, - GByteArray *data, gboolean is_html, guint *newlines_count, - GPtrArray *newlines) -{ - const gchar *c, *p, *eof; - gint last_c = -1; - gint cs = 0; - gboolean crlf_added = FALSE; - - c = begin; - p = begin; - eof = pe; - - %% write init; - %% write exec; - - if (p > c) { - g_byte_array_append (data, (const guint8 *)c, p - c); - } -} -- 2.39.5