source.dussan.org Git - rspamd.git/commitdiff
[CritFix] Switch from ragel to C for Content-Type parsing
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 27 Jun 2017 07:37:50 +0000 (08:37 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 27 Jun 2017 07:37:50 +0000 (08:37 +0100)
src/CMakeLists.txt
src/libmime/content_type.c
src/libmime/smtp_parsers.h
src/ragel/content_type.rl
src/ragel/content_type_parser.rl

index a637d3bdb36af62c199290557216e8bd6a57786f..4cca7dc027b70f48d415bf96db4ad1a3dce3082e 100644 (file)
@@ -106,7 +106,6 @@ SET(RAGEL_DEPENDS "${CMAKE_SOURCE_DIR}/src/ragel/smtp_address.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/smtp_ip.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/smtp_whitespace.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl"
-       "${CMAKE_SOURCE_DIR}/src/ragel/content_type.rl"
        "${CMAKE_SOURCE_DIR}/src/ragel/content_disposition.rl")
 RAGEL_TARGET(ragel_smtp_addr
        INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/smtp_addr_parser.rl
@@ -118,11 +117,6 @@ RAGEL_TARGET(ragel_smtp_received
        DEPENDS ${RAGEL_DEPENDS}
        COMPILE_FLAGS -T1
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c)
-RAGEL_TARGET(ragel_content_type
-       INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl
-       DEPENDS ${RAGEL_DEPENDS}
-       COMPILE_FLAGS -G2
-       OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/content_type.rl.c)
 RAGEL_TARGET(ragel_content_disposition
        INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_disposition_parser.rl
        DEPENDS ${RAGEL_DEPENDS}
index 28dadbc135404394741998d530c679b6807d3468..38f979453ba2dce1f50943c942f89e78febb9743 100644 (file)
@@ -71,27 +71,286 @@ rspamd_content_type_add_param (rspamd_mempool_t *pool,
        }
 }
 
-struct rspamd_content_type *
-rspamd_content_type_parse (const gchar *in,
-               gsize len, rspamd_mempool_t *pool)
+static struct rspamd_content_type *
+rspamd_content_type_parser (const gchar *in, gsize len, rspamd_mempool_t *pool)
 {
+       guint obraces = 0, ebraces = 0, qlen = 0;
+       const gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL;
        struct rspamd_content_type *res = NULL, val;
-       rspamd_ftok_t srch;
+       gboolean eqsign_seen = FALSE;
+       enum {
+               parse_type,
+               parse_subtype,
+               parse_after_subtype,
+               parse_param_name,
+               parse_param_after_name,
+               parse_param_value,
+               parse_param_value_after_quote,
+               parse_space,
+               parse_quoted,
+               parse_comment,
+       } state = parse_space, next_state = parse_type;
+
+       p = in;
+       c = p;
+       end = p + len;
+       memset (&val, 0, sizeof (val));
+       val.lc_data = (gchar *)in;
+
+       while (p < end) {
+               switch (state) {
+               case parse_type:
+                       if (g_ascii_isspace (*p) || *p == ';') {
+                               /* We have type without subtype */
+                               val.type.begin = c;
+                               val.type.len = p - c;
+                               state = parse_after_subtype;
+                       } else if (*p == '/') {
+                               val.type.begin = c;
+                               val.type.len = p - c;
+                               state = parse_space;
+                               next_state = parse_subtype;
+                               p++;
+                       } else {
+                               p++;
+                       }
+                       break;
+               case parse_subtype:
+                       if (g_ascii_isspace (*p) || *p == ';') {
+                               val.subtype.begin = c;
+                               val.subtype.len = p - c;
+                               state = parse_after_subtype;
+                       } else {
+                               p++;
+                       }
+                       break;
+               case parse_after_subtype:
+                       if (*p == ';' || g_ascii_isspace (*p)) {
+                               p++;
+                       } else if (*p == '(') {
+                               c = p;
+                               state = parse_comment;
+                               next_state = parse_param_name;
+                               obraces = 1;
+                               ebraces = 0;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                               eqsign_seen = FALSE;
+                               p++;
+                       } else {
+                               c = p;
+                               state = parse_param_name;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                               eqsign_seen = FALSE;
+                       }
+                       break;
+               case parse_param_name:
+                       if (*p == '=') {
+                               pname_start = c;
+                               pname_end = p;
+                               state = parse_param_after_name;
+                               eqsign_seen = TRUE;
+                               p++;
+                       } else if (g_ascii_isspace (*p)) {
+                               pname_start = c;
+                               pname_end = p;
+                               state = parse_param_after_name;
+                       } else {
+                               p++;
+                       }
+                       break;
+               case parse_param_after_name:
+                       if (g_ascii_isspace (*p)) {
+                               p++;
+                       } else if (*p == '=') {
+                               if (eqsign_seen) {
+                                       /* Treat as value start */
+                                       c = p;
+                                       eqsign_seen = FALSE;
+                                       state = parse_space;
+                                       next_state = parse_param_value;
+                                       p++;
+                               } else {
+                                       eqsign_seen = TRUE;
+                                       p++;
+                               }
+                       } else {
+                               if (eqsign_seen) {
+                                       state = parse_param_value;
+                                       c = p;
+                               } else {
+                                       /* Invalid parameter without value */
+                                       c = p;
+                                       state = parse_param_name;
+                                       pname_start = NULL;
+                                       pname_end = NULL;
+                               }
+                       }
+                       break;
+               case parse_param_value:
+                       if (*p == '"') {
+                               p++;
+                               c = p;
+                               state = parse_quoted;
+                               next_state = parse_param_value_after_quote;
+                       } else if (g_ascii_isspace (*p)) {
+                               if (pname_start && pname_end && pname_end > pname_start) {
+                                       rspamd_content_type_add_param (pool, &val, pname_start,
+                                                       pname_end, c, p);
 
-       val.lc_data = rspamd_mempool_alloc (pool, len);
-       memcpy (val.lc_data, in, len);
-       rspamd_str_lc (val.lc_data, len);
+                               }
+
+                               state = parse_space;
+                               next_state = parse_param_name;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                       } else if (*p == '(') {
+                               if (pname_start && pname_end && pname_end > pname_start) {
+                                       rspamd_content_type_add_param (pool, &val, pname_start,
+                                                       pname_end, c, p);
+                               }
+
+                               obraces = 1;
+                               ebraces = 0;
+                               p++;
+                               state = parse_comment;
+                               next_state = parse_param_name;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                       } else {
+                               p++;
+                       }
+                       break;
+               case parse_param_value_after_quote:
+                       if (pname_start && pname_end && pname_end > pname_start) {
+                               rspamd_content_type_add_param (pool, &val, pname_start,
+                                               pname_end, c, c + qlen);
+                       }
 
-       if (rspamd_content_type_parser (val.lc_data, len, &val, pool)) {
+                       if (g_ascii_isspace (*p)) {
+                               state = parse_space;
+                               next_state = parse_param_name;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                       } else if (*p == '(') {
+                               obraces = 1;
+                               ebraces = 0;
+                               p++;
+                               state = parse_comment;
+                               next_state = parse_param_name;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                       } else {
+                               state = parse_param_name;
+                               pname_start = NULL;
+                               pname_end = NULL;
+                               c = p;
+                       }
+                       break;
+               case parse_quoted:
+                       if (*p == '\\') {
+                               /* Quoted pair */
+                               if (p + 1 < end) {
+                                       p += 2;
+                               } else {
+                                       p++;
+                               }
+                       } else if (*p == '"') {
+                               qlen = p - c;
+                               state = next_state;
+                       } else {
+                               p++;
+                       }
+                       break;
+               case parse_comment:
+                       if (*p == '(') {
+                               obraces++;
+                               p++;
+                       } else if (*p == ')') {
+                               ebraces++;
+                               p++;
+
+                               if (ebraces == obraces && p < end) {
+                                       if (g_ascii_isspace (*p)) {
+                                               state = parse_space;
+                                       } else {
+                                               c = p;
+                                               state = next_state;
+                                       }
+                               }
+                       } else {
+                               p++;
+                       }
+                       break;
+               case parse_space:
+                       if (g_ascii_isspace (*p)) {
+                               p++;
+                       } else if (*p == '(') {
+                               obraces = 1;
+                               ebraces = 0;
+                               p++;
+                               state = parse_comment;
+                       } else {
+                               c = p;
+                               state = next_state;
+                       }
+                       break;
+               }
+       }
+
+       /* Process leftover */
+       switch (state) {
+       case parse_type:
+               val.type.begin = c;
+               val.type.len = p - c;
+               break;
+       case parse_subtype:
+               val.subtype.begin = c;
+               val.subtype.len = p - c;
+               break;
+       case parse_param_value:
+               if (pname_start && pname_end && pname_end > pname_start) {
+                       rspamd_content_type_add_param (pool, &val, pname_start,
+                                       pname_end, c, p);
+
+               }
+       case parse_param_value_after_quote:
+               if (pname_start && pname_end && pname_end > pname_start) {
+                       rspamd_content_type_add_param (pool, &val, pname_start,
+                                       pname_end, c, c + qlen);
+               }
+               break;
+       default:
+               break;
+       }
+
+       if (val.type.len > 0) {
                res = rspamd_mempool_alloc (pool, sizeof (val));
                memcpy (res, &val, sizeof (val));
+       }
 
+       return res;
+}
+
+struct rspamd_content_type *
+rspamd_content_type_parse (const gchar *in,
+               gsize len, rspamd_mempool_t *pool)
+{
+       struct rspamd_content_type *res = NULL;
+       rspamd_ftok_t srch;
+       gchar *lc_data;
+
+       lc_data = rspamd_mempool_alloc (pool, len);
+       memcpy (lc_data, in, len);
+       rspamd_str_lc (lc_data, len);
+
+       if ((res = rspamd_content_type_parser (lc_data, len, pool)) != NULL) {
                if (res->attrs) {
                        rspamd_mempool_add_destructor (pool,
                                        (rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
                }
 
-
                /* Now do some hacks to work with broken content types */
                if (res->subtype.len == 0) {
                        res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
@@ -157,7 +416,7 @@ rspamd_content_type_parse (const gchar *in,
                }
        }
        else {
-               msg_warn_pool ("cannot parse content type: %*s", (gint)len, val.lc_data);
+               msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
        }
 
        return res;
index 57fb5d552b59639ac93cf4fdedad3f14cea170bc..439e6c7c8ac110e4a03b30da13f2eb857d1410f5 100644 (file)
@@ -26,9 +26,6 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task,
                const char *data, size_t len, struct received_header *rh);
 int rspamd_smtp_addr_parse (const char *data, size_t len,
                struct rspamd_email_address *addr);
-
-gboolean rspamd_content_type_parser (const char *data, size_t len,
-               struct rspamd_content_type *ct, rspamd_mempool_t *pool);
 gboolean rspamd_content_disposition_parser (const char *data, size_t len,
                struct rspamd_content_disposition *cd, rspamd_mempool_t *pool);
 
index 6f65a7d0f92330572d10918dcfdbcb5eba518fa1..29889208f2dbe3be7b3bd65c05c6bfc12e87019b 100644 (file)
@@ -9,12 +9,11 @@
   comment        =   "(" (FWS? ccontent)* FWS? ")";
   CFWS           =   ((FWS? comment)+ FWS?) | FWS;
   qcontent = qtextSMTP | quoted_pairSMTP;
-  quoted_string = CFWS?
-                  (DQUOTE
+  quoted_string = (DQUOTE
                     (((FWS? qcontent)* FWS?) >Quoted_Str_Start %Quoted_Str_End)
-                  DQUOTE) CFWS?;
+                  DQUOTE);
   token = 0x21..0x27 | 0x2a..0x2b | 0x2c..0x2e | 0x30..0x39 | 0x41..0x5a | 0x5e..0x7e;
-  value = (quoted_string | (token | 0x3d)+) >Param_Value_Start %Param_Value_End;
+  value = (quoted_string | (token)+) >Param_Value_Start %Param_Value_End;
   attribute = (token+) >Param_Name_Start %Param_Name_End;
   parameter = CFWS? attribute FWS? "=" FWS? value CFWS?;
 
index eca3da3f8303cd096578b507ddc0feeecf60b32b..f248e011685c0c6b0584d039ce146e6dd8541de0 100644 (file)
@@ -42,6 +42,7 @@
   }
 
   action Param_Name_Start {
+          printf("name start: %s\n", p);
     qstart = NULL;
     qend = NULL;
     pname_start = p;
 
 
   action Param_Name_End {
+          printf("name end: %s\n", p);
     if (qstart) {
       pname_start = qstart;
     }
-    if (qend && qend >= qstart) {
+    if (qstart && qend && qend >= qstart) {
       pname_end = qend;
     }
     else if (p >= pname_start) {
       pname_end = p;
     }
-    qstart = NULL;
-    qend = NULL;
+
+    if (qstart && qend) {
+      qstart = NULL;
+      qend = NULL;
+    }
   }
 
 
   action Param_Value_Start {
-    qstart = NULL;
-    qend = NULL;
+          printf("value start: %s\n", p);
+    if (qend) {
+      qstart = NULL;
+      qend = NULL;
+    }
 
-    if (pname_end) {
+    if (pname_end && !pvalue_start) {
       pvalue_start = p;
       pvalue_end = NULL;
     }
 
 
   action Param_Value_End {
-    if (pname_end) {
+          printf("value end: %s\n", p);
+    if (pname_end && pname_start) {
       if (qstart) {
         pvalue_start = qstart;
+
+        if (!qend) {
+          pvalue_end = NULL;
+        }
       }
+
       if (qend && qend >= qstart) {
-        pvalue_end = qend;
+        if (qstart) {
+          pvalue_end = qend;
+        }
+        else {
+          pvalue_end = NULL;
+        }
       }
-      else if (p >= pvalue_start) {
+      else if (!qstart && p >= pvalue_start) {
         pvalue_end = p;
       }
-      qstart = NULL;
-      qend = NULL;
 
-      if (pvalue_end && pvalue_end > pvalue_start && pname_end > pname_start) {
-        rspamd_content_type_add_param (pool, ct, pname_start, pname_end, pvalue_start, pvalue_end);
+      if (pname_start && pvalue_start && pvalue_end && pvalue_end > pvalue_start
+              && pname_end > pname_start) {
+        rspamd_content_type_add_param (pool, ct, pname_start, pname_end,
+                pvalue_start, pvalue_end);
+        pname_start = NULL;
+        pname_end = NULL;
+        pvalue_start = NULL;
+        pvalue_end = NULL;
+        qend = NULL;
+        qstart = NULL;
       }
     }
-
-    pname_start = NULL;
-    pname_end = NULL;
-    pvalue_start = NULL;
-    pvalue_end = NULL;
-    qend = NULL;
-    qstart = NULL;
   }
 
   action Quoted_Str_Start {