source.dussan.org Git - rspamd.git/commitdiff
[Feature] Rework newlines strip parser one more time
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 21 Jun 2017 07:59:05 +0000 (08:59 +0100)
Issue: #1687

src/CMakeLists.txt
src/libmime/message.c
src/libmime/smtp_parsers.h
src/ragel/newlines_strip.rl [deleted file]

index a318af59e84c6c6a628819b24fae5036e2f0e223..a637d3bdb36af62c199290557216e8bd6a57786f 100644 (file)
@@ -118,11 +118,6 @@ RAGEL_TARGET(ragel_smtp_received
        DEPENDS ${RAGEL_DEPENDS}
        COMPILE_FLAGS -T1
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c)
-RAGEL_TARGET(ragel_newlines_strip
-       INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/newlines_strip.rl
-       DEPENDS ${RAGEL_DEPENDS}
-       COMPILE_FLAGS -G2
-       OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c)
 RAGEL_TARGET(ragel_content_type
        INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl
        DEPENDS ${RAGEL_DEPENDS}
index 647cacdf02565cc9a10a7ce8b35edce5f5143e31..503ec512634f7460815697eeaebe1a3b563a94c8 100644 (file)
@@ -336,6 +336,152 @@ rspamd_extract_words (struct rspamd_task *task,
 
 }
 
+/**
+ * Append the text in [begin, pe) to `data` with CR/LF line breaks removed.
+ *
+ * Runs of ordinary characters are copied through unchanged. A "\n" or
+ * "\r\n" break is replaced by one space when `is_html` is set or when the
+ * character preceding the break is ASCII punctuation; a lone "\r" followed
+ * by text is simply dropped. A repeated break ("\r\r" or "\n\n") yields a
+ * space unless one has already been emitted for that break. Spaces at the
+ * start of the following line are skipped, leaving a single space if none
+ * has been emitted for the break yet.
+ *
+ * @param begin start of the input
+ * @param pe one past the last input byte
+ * @param data output byte array, appended to
+ * @param is_html if TRUE, "\n" and "\r\n" breaks always become a space
+ * @param newlines_count incremented for every repeated break and for the
+ * first non-newline character that follows a break
+ * @param newlines receives `data->len` (cast to a pointer) for "\r\n"
+ * breaks and for repeated breaks that emitted a space
+ */
+static void
+rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
+               GByteArray *data, gboolean is_html, guint *newlines_count,
+               GPtrArray *newlines)
+{
+       /* p is the scan position, c is the start of the not yet copied run */
+       const gchar *p = begin, *c = begin;
+       /* Last character copied before the current break */
+       gchar last_c = '\0';
+       /* TRUE once a separating space has been emitted for the current break */
+       gboolean crlf_added = FALSE;
+       enum {
+               normal_char,
+               seen_cr,
+               seen_lf,
+       } state = normal_char;
+
+       while (p < pe) {
+               /*
+                * The comparison has to be inside G_UNLIKELY: in optimised
+                * GCC builds the macro reduces its argument to 0 or 1, so
+                * `G_UNLIKELY (*p) == '\r'` could never be true.
+                */
+               if (G_UNLIKELY (*p == '\r')) {
+                       switch (state) {
+                       case normal_char:
+                               state = seen_cr;
+                               if (p > c) {
+                                       last_c = *(p - 1);
+                                       g_byte_array_append (data, (const guint8 *)c, p - c);
+                               }
+
+                               crlf_added = FALSE;
+                               c = p + 1;
+                               break;
+                       case seen_cr:
+                               /* Double \r\r */
+                               if (!crlf_added) {
+                                       g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       crlf_added = TRUE;
+                                       g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                               }
+
+                               (*newlines_count)++;
+                               c = p + 1;
+                               break;
+                       case seen_lf:
+                               /* Likely \r\n\r...*/
+                               state = seen_cr;
+                               c = p + 1;
+                               break;
+                       }
+
+                       p ++;
+               }
+               else if (G_UNLIKELY (*p == '\n')) {
+                       switch (state) {
+                       case normal_char:
+                               state = seen_lf;
+
+                               if (p > c) {
+                                       last_c = *(p - 1);
+                                       g_byte_array_append (data, (const guint8 *)c, p - c);
+                               }
+
+                               c = p + 1;
+
+                               if (is_html || g_ascii_ispunct (last_c)) {
+                                       g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       crlf_added = TRUE;
+                               }
+                               else {
+                                       crlf_added = FALSE;
+                               }
+
+                               break;
+                       case seen_cr:
+                               /* \r\n */
+                               if (!crlf_added) {
+                                       if (is_html || g_ascii_ispunct (last_c)) {
+                                               g_byte_array_append (data, (const guint8 *) " ", 1);
+                                               crlf_added = TRUE;
+                                       }
+
+                                       g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                               }
+
+                               c = p + 1;
+                               state = seen_lf;
+
+                               break;
+                       case seen_lf:
+                               /* Double \n\n */
+                               if (!crlf_added) {
+                                       g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       crlf_added = TRUE;
+                                       g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                               }
+
+                               (*newlines_count)++;
+
+                               c = p + 1;
+                               break;
+                       }
+
+                       p ++;
+               }
+               else {
+                       switch (state) {
+                       case normal_char:
+                               break;
+                       case seen_cr:
+                       case seen_lf:
+                               /* First non-newline character after a break */
+                               (*newlines_count)++;
+                               state = normal_char;
+
+                               /* Skip initial spaces */
+                               if (G_UNLIKELY (*p == ' ')) {
+                                       if (!crlf_added) {
+                                               g_byte_array_append (data, (const guint8 *)" ", 1);
+                                       }
+
+                                       while (p < pe && *p == ' ') {
+                                               p ++;
+                                               c ++;
+                                       }
+
+                                       /*
+                                        * p now points at the first non-space
+                                        * character or at pe. Re-dispatch it
+                                        * instead of stepping over it: it may
+                                        * be another line break, and advancing
+                                        * from pe would make the leftover copy
+                                        * read past the end of the input.
+                                        */
+                                       continue;
+                               }
+
+                               break;
+                       }
+
+                       p ++;
+               }
+       }
+
+       /* Leftover */
+       if (p > c) {
+               switch (state) {
+               case normal_char:
+                       g_byte_array_append (data, (const guint8 *)c, p - c);
+                       break;
+               default:
+                       /*
+                        * NOTE(review): every line-break transition above
+                        * leaves c == p, so with the p > c guard this arm
+                        * appears to be unreachable; confirm whether a
+                        * trailing break is meant to be recorded here.
+                        */
+                       if (!crlf_added) {
+                               g_byte_array_append (data, (const guint8 *)" ", 1);
+                               g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
+                       }
+
+                       (*newlines_count)++;
+                       break;
+               }
+       }
+}
+
 static void
 rspamd_normalize_text_part (struct rspamd_task *task,
                struct rspamd_mime_text_part *part)
index 3f13abb598854545a39bb965de26b3d25946e9af..57fb5d552b59639ac93cf4fdedad3f14cea170bc 100644 (file)
@@ -27,10 +27,6 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task,
 int rspamd_smtp_addr_parse (const char *data, size_t len,
                struct rspamd_email_address *addr);
 
-void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
-               GByteArray *data, gboolean is_html, guint *newlines_count,
-               GPtrArray *newlines);
-
 gboolean rspamd_content_type_parser (const char *data, size_t len,
                struct rspamd_content_type *ct, rspamd_mempool_t *pool);
 gboolean rspamd_content_disposition_parser (const char *data, size_t len,
diff --git a/src/ragel/newlines_strip.rl b/src/ragel/newlines_strip.rl
deleted file mode 100644 (file)
index d5de198..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-%%{
-  machine newlines_strip;
-
-  action Double_CRLF {
-    if (!crlf_added && p > c) {
-      (*newlines_count)++;
-      g_byte_array_append (data, (const guint8 *)" ", 1);
-      c = p;
-    }
-
-    crlf_added = TRUE;
-    c = p;
-  }
-
-  action WSP {
-    g_byte_array_append (data, (const guint8 *)" ", 1);
-    c = p;
-  }
-
-  action Text_Start {
-    crlf_added = FALSE;
-    c = p;
-  }
-
-  action Text_End {
-    if (p > c) {
-      g_byte_array_append (data, (const guint8 *)c, p - c);
-      last_c = *(p - 1);
-    }
-
-    c = p;
-  }
-
-  action Line_CRLF {
-    if (!crlf_added) {
-      if (is_html || g_ascii_ispunct (last_c)) {
-         g_byte_array_append (data, (const guint8 *)" ", 1);
-         crlf_added = TRUE;
-      }
-    }
-
-    (*newlines_count)++;
-    g_ptr_array_add (newlines, (((gpointer) (goffset) (data->len))));
-    c = p;
-  }
-
-
-  WSP   = " " | "\t" | "\v";
-  CRLF  = ("\r" . "\n") | ( "\r" ) | ("\n");
-  DOUBLE_CRLF = (CRLF <: (WSP* CRLF)+) %Double_CRLF;
-  ANY_CRLF = CRLF | DOUBLE_CRLF;
-  LINE = (([^\r\n]+) >Text_Start %Text_End);
-  TEXT  = ANY_CRLF* . (LINE <: ANY_CRLF %Line_CRLF)+ | LINE | ANY_CRLF %Line_CRLF;
-
-  main := TEXT;
-}%%
-
-#include <glib.h>
-
-%% write data;
-
-void
-rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
-    GByteArray *data, gboolean is_html, guint *newlines_count,
-    GPtrArray *newlines)
-{
-  const gchar *c, *p, *eof;
-  gint last_c = -1;
-  gint cs = 0;
-  gboolean crlf_added = FALSE;
-
-  c = begin;
-  p = begin;
-  eof = pe;
-
-  %% write init;
-  %% write exec;
-
-  if (p > c) {
-     g_byte_array_append (data, (const guint8 *)c, p - c);
-  }
-}