aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-06-21 08:59:05 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-06-21 08:59:05 +0100
commit fa4f75e41ebcbf9a45c0077ca040db3df9dc0936 (patch)
tree 888b345f38d1aeccdc816b432c467c9c0be9fd5a
parent 22794d547d1147d31bb226912ea37bfc8bf8d5f1 (diff)
download rspamd-fa4f75e41ebcbf9a45c0077ca040db3df9dc0936.tar.gz
rspamd-fa4f75e41ebcbf9a45c0077ca040db3df9dc0936.zip
[Feature] Rework newlines strip parser one more time
Issue: #1687
-rw-r--r-- src/CMakeLists.txt 5
-rw-r--r-- src/libmime/message.c 146
-rw-r--r-- src/libmime/smtp_parsers.h 4
-rw-r--r-- src/ragel/newlines_strip.rl 82
4 files changed, 146 insertions, 91 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a318af59e..a637d3bdb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -118,11 +118,6 @@ RAGEL_TARGET(ragel_smtp_received
DEPENDS ${RAGEL_DEPENDS}
COMPILE_FLAGS -T1
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c)
-RAGEL_TARGET(ragel_newlines_strip
- INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/newlines_strip.rl
- DEPENDS ${RAGEL_DEPENDS}
- COMPILE_FLAGS -G2
- OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
RAGEL_TARGET(ragel_content_type
INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl
DEPENDS ${RAGEL_DEPENDS}
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 647cacdf0..503ec5126 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -337,6 +337,152 @@ rspamd_extract_words (struct rspamd_task *task,
}
+/**
+ * Copy the text in [begin, pe) into `data` with CR/LF line breaks removed.
+ *
+ * A single space is written in place of a break when the input is HTML,
+ * when the last byte copied before the break is ASCII punctuation, when
+ * the same break character repeats (\r\r or \n\n), or when the following
+ * line starts with spaces (those spaces are skipped).
+ *
+ * @param begin start of the input buffer
+ * @param pe one past the last byte of the input buffer
+ * @param data byte array that receives the stripped text
+ * @param is_html if TRUE, line breaks are replaced with a space
+ * @param newlines_count counter incremented as line breaks are consumed
+ * @param newlines array that receives output offsets (data->len cast to a
+ * pointer) recorded at line breaks
+ */
static void
+rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
+		GByteArray *data, gboolean is_html, guint *newlines_count,
+		GPtrArray *newlines)
+{
+	/* p is the scan cursor, c is the start of the not yet copied text run */
+	const gchar *p = begin, *c = begin;
+	gchar last_c = '\0';
+	gboolean crlf_added = FALSE;
+	enum {
+		normal_char,
+		seen_cr,
+		seen_lf,
+	} state = normal_char;
+
+	while (p < pe) {
+		/*
+		 * The comparison has to be inside G_UNLIKELY: when the macro maps
+		 * to a branch hint it reduces its argument to 0/1, so comparing
+		 * its result with '\r' can never be true.
+		 */
+		if (G_UNLIKELY (*p == '\r')) {
+			switch (state) {
+			case normal_char:
+				state = seen_cr;
+				if (p > c) {
+					/* Flush the pending text run, remember its last byte */
+					last_c = *(p - 1);
+					g_byte_array_append (data, (const guint8 *)c, p - c);
+				}
+
+				crlf_added = FALSE;
+				c = p + 1;
+				break;
+			case seen_cr:
+				/* Double \r\r */
+				if (!crlf_added) {
+					g_byte_array_append (data, (const guint8 *)" ", 1);
+					crlf_added = TRUE;
+					g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+				}
+
+				(*newlines_count)++;
+				c = p + 1;
+				break;
+			case seen_lf:
+				/* Likely \r\n\r...*/
+				state = seen_cr;
+				c = p + 1;
+				break;
+			}
+
+			p ++;
+		}
+		else if (G_UNLIKELY (*p == '\n')) {
+			switch (state) {
+			case normal_char:
+				state = seen_lf;
+
+				if (p > c) {
+					/* Flush the pending text run, remember its last byte */
+					last_c = *(p - 1);
+					g_byte_array_append (data, (const guint8 *)c, p - c);
+				}
+
+				c = p + 1;
+
+				if (is_html || g_ascii_ispunct (last_c)) {
+					g_byte_array_append (data, (const guint8 *)" ", 1);
+					crlf_added = TRUE;
+				}
+				else {
+					crlf_added = FALSE;
+				}
+
+				break;
+			case seen_cr:
+				/* \r\n */
+				if (!crlf_added) {
+					if (is_html || g_ascii_ispunct (last_c)) {
+						g_byte_array_append (data, (const guint8 *) " ", 1);
+						crlf_added = TRUE;
+					}
+
+					g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+				}
+
+				c = p + 1;
+				state = seen_lf;
+
+				break;
+			case seen_lf:
+				/* Double \n\n */
+				if (!crlf_added) {
+					g_byte_array_append (data, (const guint8 *)" ", 1);
+					crlf_added = TRUE;
+					g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+				}
+
+				(*newlines_count)++;
+
+				c = p + 1;
+				break;
+			}
+
+			p ++;
+		}
+		else {
+			switch (state) {
+			case normal_char:
+				break;
+			case seen_cr:
+			case seen_lf:
+				/* First ordinary byte after a line break */
+				(*newlines_count)++;
+				state = normal_char;
+
+				/* Skip initial spaces */
+				if (G_UNLIKELY (*p == ' ')) {
+					if (!crlf_added) {
+						g_byte_array_append (data, (const guint8 *)" ", 1);
+					}
+
+					while (p < pe && *p == ' ') {
+						p ++;
+						c ++;
+					}
+
+					/*
+					 * p now points at a byte that has not been examined
+					 * yet, or at pe. Restart the loop instead of stepping
+					 * over it: otherwise a line break right after the
+					 * spaces is copied as text, and at the end of input p
+					 * moves past pe so the leftover copy reads out of
+					 * bounds.
+					 */
+					continue;
+				}
+
+				break;
+			}
+
+			p ++;
+		}
+	}
+
+	/* Leftover */
+	if (p > c) {
+		switch (state) {
+		case normal_char:
+			g_byte_array_append (data, (const guint8 *)c, p - c);
+			break;
+		default:
+			/*
+			 * NOTE(review): every line break branch above leaves c == p,
+			 * so this branch does not look reachable and a break that ends
+			 * the input is not counted here - confirm the intent.
+			 */
+
+			if (!crlf_added) {
+				g_byte_array_append (data, (const guint8 *)" ", 1);
+				g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+			}
+
+			(*newlines_count)++;
+			break;
+		}
+	}
+}
+
+static void
rspamd_normalize_text_part (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h
index 3f13abb59..57fb5d552 100644
--- a/src/libmime/smtp_parsers.h
+++ b/src/libmime/smtp_parsers.h
@@ -27,10 +27,6 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task,
int rspamd_smtp_addr_parse (const char *data, size_t len,
struct rspamd_email_address *addr);
-void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
- GByteArray *data, gboolean is_html, guint *newlines_count,
- GPtrArray *newlines);
-
gboolean rspamd_content_type_parser (const char *data, size_t len,
struct rspamd_content_type *ct, rspamd_mempool_t *pool);
gboolean rspamd_content_disposition_parser (const char *data, size_t len,
diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl
deleted file mode 100644
index d5de198a5..000000000
--- a/src/ragel/newlines_strip.rl
+++ /dev/null
@@ -1,82 +0,0 @@
-%%{
- machine newlines_strip;
-
- action Double_CRLF {
- if (!crlf_added && p > c) {
- (*newlines_count)++;
- g_byte_array_append (data, (const guint8 *)" ", 1);
- c = p;
- }
-
- crlf_added = TRUE;
- c = p;
- }
-
- action WSP {
- g_byte_array_append (data, (const guint8 *)" ", 1);
- c = p;
- }
-
- action Text_Start {
- crlf_added = FALSE;
- c = p;
- }
-
- action Text_End {
- if (p > c) {
- g_byte_array_append (data, (const guint8 *)c, p - c);
- last_c = *(p - 1);
- }
-
- c = p;
- }
-
- action Line_CRLF {
- if (!crlf_added) {
- if (is_html || g_ascii_ispunct (last_c)) {
- g_byte_array_append (data, (const guint8 *)" ", 1);
- crlf_added = TRUE;
- }
- }
-
- (*newlines_count)++;
- g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
- c = p;
- }
-
-
- WSP = " " | "\t" | "\v";
- CRLF = ("\r" . "\n") | ( "\r" ) | ("\n");
- DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF;
- ANY_CRLF = CRLF | DOUBLE_CRLF;
- LINE = (([^\r\n]+) >Text_Start %Text_End);
- TEXT = ANY_CRLF* . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF;
-
- main := TEXT;
-}%%
-
-#include <glib.h>
-
-%% write data;
-
-void
-rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
- GByteArray *data, gboolean is_html, guint *newlines_count,
- GPtrArray *newlines)
-{
- const gchar *c, *p, *eof;
- gint last_c = -1;
- gint cs = 0;
- gboolean crlf_added = FALSE;
-
- c = begin;
- p = begin;
- eof = pe;
-
- %% write init;
- %% write exec;
-
- if (p > c) {
- g_byte_array_append (data, (const guint8 *)c, p - c);
- }
-}