From f5dcf4b8a4a6a9881d95e4d4b1edd4c27c077d08 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Sat, 23 Jul 2016 17:13:36 +0100
Subject: [PATCH] [Feature] Create a dedicated parser to strip newlines

Issue: #744
---
 src/CMakeLists.txt          |  8 +++-
 src/libmime/message.c       | 68 ++++--------------------------
 src/libmime/smtp_parsers.h  |  4 ++
 src/ragel/newlines_strip.rl | 82 +++++++++++++++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 61 deletions(-)
 create mode 100644 src/ragel/newlines_strip.rl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 817a927b8..d0512da83 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -117,6 +117,11 @@ RAGEL_TARGET(ragel_smtp_received
 	DEPENDS ${RAGEL_DEPENDS}
 	COMPILE_FLAGS -T1
 	OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c)
+RAGEL_TARGET(ragel_newlines_strip
+	INPUTS ragel/newlines_strip.rl
+	DEPENDS ${RAGEL_DEPENDS}
+	COMPILE_FLAGS -G2 # Ragel code style: goto-driven FSM
+	OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
 ######################### LINK SECTION ###############################
 
 ADD_LIBRARY(rspamd-server STATIC
@@ -129,7 +134,8 @@ ADD_LIBRARY(rspamd-server STATIC
 		${CMAKE_CURRENT_BINARY_DIR}/modules.c
 		${PLUGINSSRC}
 		"${RAGEL_ragel_smtp_addr_OUTPUTS}"
-		"${RAGEL_ragel_smtp_received_OUTPUTS}")
+		"${RAGEL_ragel_smtp_received_OUTPUTS}"
+		"${RAGEL_ragel_newlines_strip_OUTPUTS}")
 TARGET_LINK_LIBRARIES(rspamd-server rspamd-http-parser)
 TARGET_LINK_LIBRARIES(rspamd-server rspamd-cdb)
 TARGET_LINK_LIBRARIES(rspamd-server rspamd-lpeg)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 6e4e69597..89ccff68b 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -794,68 +794,16 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 	c = p;
 	end = p + part->content->len;
 
-	while (p < end) {
-		p = memchr (c, '\n', end - c);
-
-		if (p) {
-			if (*(p - 1) == '\r') {
-				p --;
-			}
-
-			if (p > c) {
-				g_byte_array_append (part->stripped_content, c, p - c);
-			}
-
-			/*
-			 * Now we need to decide, maybe we have the following cases:
-			 * 1. Multiple newlines must be replaced by one newline
-			 * 2. If a line is finished with punctuation character, then insert
-			 * one newline
-			 * 3. In HTML parts we have to insert newlines as well
-			 */
-
-			if (p > part->content->data &&
-					(IS_PART_HTML (part) ||
-					*(p - 1) == '\n' ||
-					g_ascii_ispunct (*(p - 1))
-					)) {
-				g_byte_array_append (part->stripped_content, "\n", 1);
-			}
-
-			/* As it could cause reallocation, we initially store offsets */
-			g_ptr_array_add (part->newlines,
-					GUINT_TO_POINTER (part->stripped_content->len));
-			ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
-			ex->pos = part->stripped_content->len;
-			ex->len = 0;
-			ex->type = RSPAMD_EXCEPTION_NEWLINE;
-			part->exceptions = g_list_prepend (part->exceptions, ex);
-			part->nlines ++;
-			p ++;
-
-			while (p < end && (*p == '\r' || *p == '\n')) {
-				if (*p == '\n') {
-					part->nlines ++;
-				}
-
-				p ++;
-			}
-			c = p;
-		}
-		else {
-			p = end;
-			break;
-		}
-	}
-
-	if (p > c) {
-		g_byte_array_append (part->stripped_content, c, p - c);
-	}
+	rspamd_strip_newlines_parse (p, end, part->stripped_content,
+			IS_PART_HTML (part), &part->nlines, part->newlines);
 
-	/* Now convert offsets to real pointers for convenience */
 	for (i = 0; i < part->newlines->len; i ++) {
-		guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
-		g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+		ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+		p = g_ptr_array_index (part->newlines, i); /* pointer recorded by the parser while scanning the input */
+		ex->pos = p - c; /* NOTE(review): offset in the original content; the removed code used the stripped_content offset -- confirm */
+		ex->len = 0;
+		ex->type = RSPAMD_EXCEPTION_NEWLINE;
+		part->exceptions = g_list_prepend (part->exceptions, ex);
 	}
 
 	rspamd_mempool_add_destructor (task->task_pool,
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h
index 62e7738e3..07bd24688 100644
--- a/src/libmime/smtp_parsers.h
+++ b/src/libmime/smtp_parsers.h
@@ -26,4 +26,8 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task,
 int rspamd_smtp_addr_parse (const char *data, size_t len,
 		struct rspamd_email_address *addr);
 
+void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
+		GByteArray *data, gboolean is_html, guint *newlines_count,
+		GPtrArray *newlines); /* defined in src/ragel/newlines_strip.rl */
+
 #endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */
diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl
new file mode 100644
index 000000000..a2f5620bd
--- /dev/null
+++ b/src/ragel/newlines_strip.rl
@@ -0,0 +1,82 @@
+%%{
+  machine newlines_strip;
+
+  action Double_CRLF {
+    /* Blank-line run: emit a single newline unless one was just written */
+    if (!crlf_added) {
+      g_byte_array_append (data, (const guint8 *)"\n", 1);
+    }
+
+    crlf_added = TRUE;
+    c = p;
+  }
+
+  action WSP {
+    g_byte_array_append (data, (const guint8 *)" ", 1); /* fired on leaving a WSP+ run: output a plain space */
+    c = p; /* nothing before p is pending any more */
+  }
+
+  action Text_Start {
+    crlf_added = FALSE; /* text seen: a later line break may emit a newline again */
+    c = p; /* start of the non-space run that Text_End will copy */
+  }
+
+  action Text_End {
+    if (p > c) {
+      g_byte_array_append (data, (const guint8 *)c, p - c); /* copy the pending run [c, p) */
+      last_c = *(p - 1); /* remembered for the punctuation test in Line_CRLF */
+    }
+
+    c = p;
+  }
+
+  action Line_CRLF {
+    if (!crlf_added) {
+      if (is_html || g_ascii_ispunct (last_c)) { /* keep the break for HTML or after punctuation */
+         g_byte_array_append (data, (const guint8 *)"\n", 1);
+         crlf_added = TRUE;
+      }
+    }
+
+    (*newlines_count)++;
+    g_ptr_array_add (newlines, (gpointer)p); /* NOTE(review): p points into the input buffer, not into data */
+    c = p;
+  }
+
+
+  WSP   = " " | "\t" | "\v";  # blanks handled here; form feed is not part of this set
+  CRLF  = ("\r" . "\n") | ( "\r" ) | ("\n");  # CRLF pair, bare CR or bare LF
+  DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF;  # two or more line breaks, blanks allowed in between
+  ANY_CRLF = CRLF | DOUBLE_CRLF;
+  LINE_ELT = ((WSP+ %WSP)** :> ((^space)+) >Text_Start %Text_End <: (WSP+ %WSP)**);  # blanks, a non-space run, blanks
+  LINE = LINE_ELT+;
+  TEXT  = ANY_CRLF** . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF;  # Line_CRLF fires on leaving each line break
+
+  main := TEXT;
+}%%
+
+#include <glib.h>
+
+%% write data;
+
+/*
+ * Append [begin, pe) to data with whitespace and line breaks normalised;
+ * line breaks are counted in *newlines_count and recorded in newlines.
+ */
+void
+rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
+    GByteArray *data, gboolean is_html, guint *newlines_count,
+    GPtrArray *newlines)
+{
+  const gchar *c = begin, *p = begin, *eof = pe;
+  gint last_c = -1, cs = 0;
+  gboolean crlf_added = FALSE;
+
+  %% write init;
+  %% write exec;
+
+  /* Flush through pe, not p: exec stops early on a byte no rule accepts */
+  if (pe > c) {
+     g_byte_array_append (data, (const guint8 *)c, pe - c);
+  }
+}
-- 
2.39.5