[Feature] Rework newlines strip parser one more time

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index a318af59e84c6c6a628819b24fae5036e2f0e223..a637d3bdb36af62c199290557216e8bd6a57786f 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -118,11 +118,6 @@ RAGEL_TARGET(ragel_smtp_received
         DEPENDS ${RAGEL_DEPENDS}
         COMPILE_FLAGS -T1
         OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c)
-RAGEL_TARGET(ragel_newlines_strip
-       INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/newlines_strip.rl
-       DEPENDS ${RAGEL_DEPENDS}
-       COMPILE_FLAGS -G2
-       OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
  RAGEL_TARGET(ragel_content_type
         INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl
         DEPENDS ${RAGEL_DEPENDS}
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 647cacdf02565cc9a10a7ce8b35edce5f5143e31..503ec512634f7460815697eeaebe1a3b563a94c8 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -336,6 +336,152 @@ rspamd_extract_words (struct rspamd_task *task,
  
  }
  
+/**
+ * Copy the text in [begin, pe) into `data` with CR/LF line breaks removed.
+ *
+ * Runs of ordinary characters are appended to `data` unchanged. A line break
+ * is replaced by a single space when `is_html` is set or when the last
+ * character copied before the break is ASCII punctuation. Repeated breaks
+ * ("\r\r", "\n\n") and leading spaces on the following line produce at most
+ * one space for the break they belong to.
+ *
+ * @param begin start of the input text
+ * @param pe one past the last input byte
+ * @param data byte array that receives the stripped text
+ * @param is_html if TRUE, a line break is always replaced by a space
+ * @param newlines_count counter incremented as line breaks are seen
+ * @param newlines array that receives `data->len` (stored as a pointer)
+ *                 at the line breaks handled below
+ */
+static void
+rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
+               GByteArray *data, gboolean is_html, guint *newlines_count,
+               GPtrArray *newlines)
+{
+       /* p is the read cursor, c is the start of the text not yet copied */
+       const gchar *p = begin, *c = begin;
+       /* Last character copied before the most recent line break */
+       gchar last_c = '\0';
+       /* TRUE once a space has been emitted for the current line break */
+       gboolean crlf_added = FALSE;
+       enum {
+               normal_char,
+               seen_cr,
+               seen_lf,
+       } state = normal_char;
+
+       while (p < pe) {
+               /*
+                * The comparison must be inside G_UNLIKELY: the macro may
+                * normalise its argument to 0/1, which never equals '\r'.
+                */
+               if (G_UNLIKELY (*p == '\r')) {
+                       switch (state) {
+                       case normal_char:
+                               state = seen_cr;
+                               if (p > c) {
+                                       last_c = *(p - 1);
+                                       g_byte_array_append (data, (const guint8 *)c, p - c);
+                               }
+
+                               crlf_added = FALSE;
+                               c = p + 1;
+                               break;
+                       case seen_cr:
+                               /* Double \r\r */
+                               if (!crlf_added) {
+                                       g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       crlf_added = TRUE;
+                                       g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                               }
+
+                               (*newlines_count)++;
+                               c = p + 1;
+                               break;
+                       case seen_lf:
+                               /* Likely \r\n\r...*/
+                               state = seen_cr;
+                               c = p + 1;
+                               break;
+                       }
+
+                       p ++;
+               }
+               else if (G_UNLIKELY (*p == '\n')) {
+                       switch (state) {
+                       case normal_char:
+                               state = seen_lf;
+
+                               if (p > c) {
+                                       last_c = *(p - 1);
+                                       g_byte_array_append (data, (const guint8 *)c, p - c);
+                               }
+
+                               c = p + 1;
+
+                               if (is_html || g_ascii_ispunct (last_c)) {
+                                       g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       crlf_added = TRUE;
+                               }
+                               else {
+                                       crlf_added = FALSE;
+                               }
+
+                               break;
+                       case seen_cr:
+                               /* \r\n */
+                               if (!crlf_added) {
+                                       if (is_html || g_ascii_ispunct (last_c)) {
+                                               g_byte_array_append (data, (const guint8 *) " ", 1);
+                                               crlf_added = TRUE;
+                                       }
+
+                                       g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                               }
+
+                               c = p + 1;
+                               state = seen_lf;
+
+                               break;
+                       case seen_lf:
+                               /* Double \n\n */
+                               if (!crlf_added) {
+                                       g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       crlf_added = TRUE;
+                                       g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                               }
+
+                               (*newlines_count)++;
+
+                               c = p + 1;
+                               break;
+                       }
+
+                       p ++;
+               }
+               else {
+                       switch (state) {
+                       case normal_char:
+                               p ++;
+                               break;
+                       case seen_cr:
+                       case seen_lf:
+                               (*newlines_count)++;
+
+                               /* Skip initial spaces */
+                               if (G_UNLIKELY (*p == ' ')) {
+                                       if (!crlf_added) {
+                                               g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       }
+
+                                       while (p < pe && *p == ' ') {
+                                               p ++;
+                                               c ++;
+                                       }
+                               }
+
+                               /*
+                                * Do not advance p here: after skipping spaces it
+                                * points either at pe or at a character that has
+                                * not been classified yet (possibly another line
+                                * break). The next iteration handles it, so p never
+                                * moves past pe and no CR/LF is copied as text.
+                                */
+                               state = normal_char;
+                               break;
+                       }
+               }
+       }
+
+       /* Leftover */
+       if (p > c) {
+               switch (state) {
+               case normal_char:
+                       g_byte_array_append (data, (const guint8 *)c, p - c);
+                       break;
+               default:
+                       /*
+                        * NOTE(review): every line break branch above leaves
+                        * c == p, so this branch is not reached when the input
+                        * ends with a line break; kept as in the original.
+                        */
+                       if (!crlf_added) {
+                               g_byte_array_append (data, (const guint8 *)" ", 1);
+                               g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                       }
+
+                       (*newlines_count)++;
+                       break;
+               }
+       }
+}
+
  static void
  rspamd_normalize_text_part (struct rspamd_task *task,
                 struct rspamd_mime_text_part *part)
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h

index 3f13abb598854545a39bb965de26b3d25946e9af..57fb5d552b59639ac93cf4fdedad3f14cea170bc 100644 (file)
--- a/src/libmime/smtp_parsers.h
+++ b/src/libmime/smtp_parsers.h
@@ -27,10 +27,6 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task,
  int rspamd_smtp_addr_parse (const char *data, size_t len,
                 struct rspamd_email_address *addr);
  
-void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
-               GByteArray *data, gboolean is_html, guint *newlines_count,
-               GPtrArray *newlines);
-
  gboolean rspamd_content_type_parser (const char *data, size_t len,
                 struct rspamd_content_type *ct, rspamd_mempool_t *pool);
  gboolean rspamd_content_disposition_parser (const char *data, size_t len,
diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl

deleted file mode 100644 (file)

index d5de198..0000000
--- a/src/ragel/newlines_strip.rl
+++ /dev/null
@@ -1,82 +0,0 @@
-%%{
-  machine newlines_strip; # actions below use c, p, data, last_c, crlf_added, is_html, newlines_count and newlines from the host function
-
-  action Double_CRLF { /* leaving action of DOUBLE_CRLF */
-    if (!crlf_added && p > c) { /* only when no space was added yet and p is past c */
-      (*newlines_count)++;
-      g_byte_array_append (data, (const guint8 *)" ", 1);
-      c = p;
-    }
-
-    crlf_added = TRUE;
-    c = p;
-  }
-
-  action WSP { /* not referenced by any expression in this machine */
-    g_byte_array_append (data, (const guint8 *)" ", 1);
-    c = p;
-  }
-
-  action Text_Start { /* entering action of LINE: remember where the text run begins */
-    crlf_added = FALSE;
-    c = p;
-  }
-
-  action Text_End { /* leaving action of LINE: copy the text run [c, p) to data */
-    if (p > c) {
-      g_byte_array_append (data, (const guint8 *)c, p - c);
-      last_c = *(p - 1); /* remembered for the punctuation test in Line_CRLF */
-    }
-
-    c = p;
-  }
-
-  action Line_CRLF { /* leaving action of a line break that follows text */
-    if (!crlf_added) {
-      if (is_html || g_ascii_ispunct (last_c)) { /* space only for HTML or after punctuation */
-         g_byte_array_append (data, (const guint8 *)" ", 1);
-         crlf_added = TRUE;
-      }
-    }
-
-    (*newlines_count)++;
-    g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len)))); /* store the current output length as the newline position */
-    c = p;
-  }
-
-
-  WSP   = " " | "\t" | "\v"; # space, tab or vertical tab
-  CRLF  = ("\r" . "\n") | ( "\r" ) | ("\n"); # "\r\n", lone "\r" or lone "\n"
-  DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF; # two or more line breaks, optionally separated by whitespace
-  ANY_CRLF = CRLF | DOUBLE_CRLF;
-  LINE = (([^\r\n]+) >Text_Start %Text_End); # a run of characters other than CR and LF
-  TEXT  = ANY_CRLF* . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF;
-
-  main := TEXT;
-}%%
-
-#include <glib.h>
-
-%% write data;
-
-void
-rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
-    GByteArray *data, gboolean is_html, guint *newlines_count,
-    GPtrArray *newlines) /* runs the newlines_strip machine over [begin, pe), writing the result to data */
-{
-  const gchar *c, *p, *eof; /* c: start of the pending text run, p: read cursor, eof: end-of-input marker */
-  gint last_c = -1; /* set by Text_End to the last character of a copied run */
-  gint cs = 0; /* presumably the machine's current state, set up by "write init" -- confirm against Ragel docs */
-  gboolean crlf_added = FALSE;
-
-  c = begin;
-  p = begin;
-  eof = pe; /* eof equals pe: the whole input is passed in one buffer */
-
-  %% write init;
-  %% write exec;
-
-  if (p > c) { /* copy whatever remains between c and p after the machine has run */
-     g_byte_array_append (data, (const guint8 *)c, p - c);
-  }
-}
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
src/CMakeLists.txt		patch \| blob \| history
src/libmime/message.c		patch \| blob \| history
src/libmime/smtp_parsers.h		patch \| blob \| history
src/ragel/newlines_strip.rl	[deleted file]	patch \| blob \| history