summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/CMakeLists.txt 8
-rw-r--r-- src/libmime/message.c 68
-rw-r--r-- src/libmime/smtp_parsers.h 4
-rw-r--r-- src/ragel/newlines_strip.rl 82
4 files changed, 101 insertions, 61 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 817a927b8..d0512da83 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -117,6 +117,11 @@ RAGEL_TARGET(ragel_smtp_received
DEPENDS ${RAGEL_DEPENDS}
COMPILE_FLAGS -T1
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c)
+RAGEL_TARGET(ragel_newlines_strip
+ INPUTS ragel/newlines_strip.rl
+ DEPENDS ${RAGEL_DEPENDS}
+ COMPILE_FLAGS -G2
+ OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
######################### LINK SECTION ###############################
ADD_LIBRARY(rspamd-server STATIC
@@ -129,7 +134,8 @@ ADD_LIBRARY(rspamd-server STATIC
${CMAKE_CURRENT_BINARY_DIR}/modules.c
${PLUGINSSRC}
"${RAGEL_ragel_smtp_addr_OUTPUTS}"
- "${RAGEL_ragel_smtp_received_OUTPUTS}")
+ "${RAGEL_ragel_smtp_received_OUTPUTS}"
+ "${RAGEL_ragel_newlines_strip_OUTPUTS}")
TARGET_LINK_LIBRARIES(rspamd-server rspamd-http-parser)
TARGET_LINK_LIBRARIES(rspamd-server rspamd-cdb)
TARGET_LINK_LIBRARIES(rspamd-server rspamd-lpeg)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 6e4e69597..89ccff68b 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -794,68 +794,16 @@ rspamd_normalize_text_part (struct rspamd_task *task,
c = p;
end = p + part->content->len;
- while (p < end) {
- p = memchr (c, '\n', end - c);
-
- if (p) {
- if (*(p - 1) == '\r') {
- p --;
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
-
- /*
- * Now we need to decide, maybe we have the following cases:
- * 1. Multiple newlines must be replaced by one newline
- * 2. If a line is finished with punctuation character, then insert
- * one newline
- * 3. In HTML parts we have to insert newlines as well
- */
-
- if (p > part->content->data &&
- (IS_PART_HTML (part) ||
- *(p - 1) == '\n' ||
- g_ascii_ispunct (*(p - 1))
- )) {
- g_byte_array_append (part->stripped_content, "\n", 1);
- }
-
- /* As it could cause reallocation, we initially store offsets */
- g_ptr_array_add (part->newlines,
- GUINT_TO_POINTER (part->stripped_content->len));
- ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
- ex->pos = part->stripped_content->len;
- ex->len = 0;
- ex->type = RSPAMD_EXCEPTION_NEWLINE;
- part->exceptions = g_list_prepend (part->exceptions, ex);
- part->nlines ++;
- p ++;
-
- while (p < end && (*p == '\r' || *p == '\n')) {
- if (*p == '\n') {
- part->nlines ++;
- }
-
- p ++;
- }
- c = p;
- }
- else {
- p = end;
- break;
- }
- }
-
- if (p > c) {
- g_byte_array_append (part->stripped_content, c, p - c);
- }
+ rspamd_strip_newlines_parse (p, end, part->stripped_content,
+ IS_PART_HTML (part), &part->nlines, part->newlines);
- /* Now convert offsets to real pointers for convenience */
for (i = 0; i < part->newlines->len; i ++) {
- guint off = GPOINTER_TO_UINT (g_ptr_array_index (part->newlines, i));
- g_ptr_array_index (part->newlines, i) = part->stripped_content->data + off;
+ ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
+ p = g_ptr_array_index (part->newlines, i);
+ ex->pos = p - c;
+ ex->len = 0;
+ ex->type = RSPAMD_EXCEPTION_NEWLINE;
+ part->exceptions = g_list_prepend (part->exceptions, ex);
}
rspamd_mempool_add_destructor (task->task_pool,
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h
index 62e7738e3..07bd24688 100644
--- a/src/libmime/smtp_parsers.h
+++ b/src/libmime/smtp_parsers.h
@@ -26,4 +26,8 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task,
int rspamd_smtp_addr_parse (const char *data, size_t len,
struct rspamd_email_address *addr);
+void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, /* input range: begin is the first byte, pe is one past the last */
+ GByteArray *data, gboolean is_html, guint *newlines_count, /* data: output buffer appended to; newlines_count: incremented per detected line break */
+ GPtrArray *newlines); /* newlines: receives one pointer per detected line break */
+
#endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */
diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl
new file mode 100644
index 000000000..a2f5620bd
--- /dev/null
+++ b/src/ragel/newlines_strip.rl
@@ -0,0 +1,82 @@
+%%{
+ machine newlines_strip;
+
+ action Double_CRLF { /* leaving action of DOUBLE_CRLF: a CRLF followed by one or more (WSP* CRLF) */
+ if (!crlf_added) { /* emit at most one "\n" since the last Text_Start */
+ g_byte_array_append (data, (const guint8 *)"\n", 1);
+ c = p; /* redundant: c is assigned again unconditionally below */
+ }
+
+ crlf_added = TRUE;
+ c = p; /* skip the consumed newline run when copying */
+ }
+
+ action WSP { /* leaving action of a WSP+ run inside LINE_ELT */
+ g_byte_array_append (data, (const guint8 *)" ", 1); /* the whole run (spaces, tabs, \v) becomes a single space */
+ c = p;
+ }
+
+ action Text_Start { /* entering action of a (^space)+ run */
+ crlf_added = FALSE; /* new text seen, so a following line break may emit "\n" again */
+ c = p; /* remember where the text run begins */
+ }
+
+ action Text_End { /* leaving action of a (^space)+ run */
+ if (p > c) {
+ g_byte_array_append (data, (const guint8 *)c, p - c); /* copy the run [c, p) verbatim */
+ last_c = *(p - 1); /* last byte of the run, consulted by Line_CRLF */
+ }
+
+ c = p;
+ }
+
+ action Line_CRLF { /* leaving action of an ANY_CRLF that follows a LINE, or of a lone ANY_CRLF */
+ if (!crlf_added) {
+ if (is_html || g_ascii_ispunct (last_c)) { /* keep the break only for HTML parts or after ASCII punctuation */
+ g_byte_array_append (data, (const guint8 *)"\n", 1);
+ crlf_added = TRUE;
+ }
+ }
+
+ (*newlines_count)++; /* counted even when no "\n" is written to data */
+ g_ptr_array_add (newlines, (gpointer)p); /* p is the scan position in the input buffer, not an offset into data */
+ c = p;
+ }
+
+
+ WSP = " " | "\t" | "\v";
+ CRLF = ("\r" . "\n") | ( "\r" ) | ("\n");
+ DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF;
+ ANY_CRLF = CRLF | DOUBLE_CRLF;
+ LINE_ELT = ((WSP+ %WSP)** :> ((^space)+) >Text_Start %Text_End <: (WSP+ %WSP)**);
+ LINE = LINE_ELT+;
+ TEXT = ANY_CRLF** . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF;
+
+ main := TEXT;
+}%%
+
+#include <glib.h>
+
+%% write data;
+/* Runs the newlines_strip machine over [begin, pe), appending the normalised text to data and recording line breaks. */
+void
+rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, /* begin: first input byte; pe: one past the last input byte */
+ GByteArray *data, gboolean is_html, guint *newlines_count, /* is_html: lets Line_CRLF emit "\n" without requiring preceding punctuation */
+ GPtrArray *newlines) /* receives the scan pointer p for every Line_CRLF */
+{
+ const gchar *c, *p, *eof; /* c: start of bytes not yet copied; p: scan position; eof: end-of-input marker */
+ gint last_c = -1; /* last byte of the most recent text run; stays -1 until Text_End copies something */
+ gint cs = 0; /* Ragel current-state variable */
+ gboolean crlf_added = FALSE; /* TRUE once "\n" was emitted since the last Text_Start */
+
+ c = begin;
+ p = begin;
+ eof = pe; /* the whole input is supplied in one call, so buffer end is also EOF */
+
+ %% write init;
+ %% write exec;
+
+ if (p > c) { /* flush bytes in [c, p) that no action copied; NOTE(review): cs is never checked, so if the machine stops before pe the bytes in [p, pe) are not copied - confirm this is intended */
+ g_byte_array_append (data, (const guint8 *)c, p - c);
+ }
+}