From: Vsevolod Stakhov Date: Tue, 27 Jun 2017 07:37:50 +0000 (+0100) Subject: [CritFix] Switch from ragel to C for Content-Type parsing X-Git-Tag: 1.6.2~70 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=a310f8defd90479ca09274fab1958addb4fb95ae;p=rspamd.git [CritFix] Switch from ragel to C for Content-Type parsing --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a637d3bdb..4cca7dc02 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -106,7 +106,6 @@ SET(RAGEL_DEPENDS "${CMAKE_SOURCE_DIR}/src/ragel/smtp_address.rl" "${CMAKE_SOURCE_DIR}/src/ragel/smtp_ip.rl" "${CMAKE_SOURCE_DIR}/src/ragel/smtp_whitespace.rl" "${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl" - "${CMAKE_SOURCE_DIR}/src/ragel/content_type.rl" "${CMAKE_SOURCE_DIR}/src/ragel/content_disposition.rl") RAGEL_TARGET(ragel_smtp_addr INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/smtp_addr_parser.rl @@ -118,11 +117,6 @@ RAGEL_TARGET(ragel_smtp_received DEPENDS ${RAGEL_DEPENDS} COMPILE_FLAGS -T1 OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/smtp_received_parser.rl.c) -RAGEL_TARGET(ragel_content_type - INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl - DEPENDS ${RAGEL_DEPENDS} - COMPILE_FLAGS -G2 - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/content_type.rl.c) RAGEL_TARGET(ragel_content_disposition INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_disposition_parser.rl DEPENDS ${RAGEL_DEPENDS} diff --git a/src/libmime/content_type.c b/src/libmime/content_type.c index 28dadbc13..38f979453 100644 --- a/src/libmime/content_type.c +++ b/src/libmime/content_type.c @@ -71,27 +71,286 @@ rspamd_content_type_add_param (rspamd_mempool_t *pool, } } -struct rspamd_content_type * -rspamd_content_type_parse (const gchar *in, - gsize len, rspamd_mempool_t *pool) +static struct rspamd_content_type * +rspamd_content_type_parser (const gchar *in, gsize len, rspamd_mempool_t *pool) { + guint obraces = 0, ebraces = 0, qlen = 0; + const gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL; struct rspamd_content_type *res = NULL, val; - rspamd_ftok_t srch; + gboolean eqsign_seen = FALSE; + enum { + parse_type, + parse_subtype, + parse_after_subtype, + parse_param_name, + parse_param_after_name, + parse_param_value, + parse_param_value_after_quote, + parse_space, + parse_quoted, + parse_comment, + } state = parse_space, next_state = parse_type; + + p = in; + c = p; + end = p + len; + memset (&val, 0, sizeof (val)); + val.lc_data = (gchar *)in; + + while (p < end) { + switch (state) { + case parse_type: + if (g_ascii_isspace (*p) || *p == ';') { + /* We have type without subtype */ + val.type.begin = c; + val.type.len = p - c; + state = parse_after_subtype; + } else if (*p == '/') { + val.type.begin = c; + val.type.len = p - c; + state = parse_space; + next_state = parse_subtype; + p++; + } else { + p++; + } + break; + case parse_subtype: + if (g_ascii_isspace (*p) || *p == ';') { + val.subtype.begin = c; + val.subtype.len = p - c; + state = parse_after_subtype; + } else { + p++; + } + break; + case parse_after_subtype: + if (*p == ';' || g_ascii_isspace (*p)) { + p++; + } else if (*p == '(') { + c = p; + state = parse_comment; + next_state = parse_param_name; + obraces = 1; + ebraces = 0; + pname_start = NULL; + pname_end = NULL; + eqsign_seen = FALSE; + p++; + } else { + c = p; + state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + eqsign_seen = FALSE; + } + break; + case parse_param_name: + if (*p == '=') { + pname_start = c; + pname_end = p; + state = parse_param_after_name; + eqsign_seen = TRUE; + p++; + } else if (g_ascii_isspace (*p)) { + pname_start = c; + pname_end = p; + state = parse_param_after_name; + } else { + p++; + } + break; + case parse_param_after_name: + if (g_ascii_isspace (*p)) { + p++; + } else if (*p == '=') { + if (eqsign_seen) { + /* Treat as value start */ + c = p; + eqsign_seen = FALSE; + state = parse_space; + next_state = parse_param_value; + p++; + } else { + eqsign_seen = TRUE; + p++; + } + } else { + if (eqsign_seen) { + state = parse_param_value; + c = p; + } else { + /* Invalid parameter without value */ + c = p; + state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + } + break; + case parse_param_value: + if (*p == '"') { + p++; + c = p; + state = parse_quoted; + next_state = parse_param_value_after_quote; + } else if (g_ascii_isspace (*p)) { + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param (pool, &val, pname_start, + pname_end, c, p); - val.lc_data = rspamd_mempool_alloc (pool, len); - memcpy (val.lc_data, in, len); - rspamd_str_lc (val.lc_data, len); + } + + state = parse_space; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } else if (*p == '(') { + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param (pool, &val, pname_start, + pname_end, c, p); + } + + obraces = 1; + ebraces = 0; + p++; + state = parse_comment; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } else { + p++; + } + break; + case parse_param_value_after_quote: + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param (pool, &val, pname_start, + pname_end, c, c + qlen); + } - if (rspamd_content_type_parser (val.lc_data, len, &val, pool)) { + if (g_ascii_isspace (*p)) { + state = parse_space; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } else if (*p == '(') { + obraces = 1; + ebraces = 0; + p++; + state = parse_comment; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } else { + state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + c = p; + } + break; + case parse_quoted: + if (*p == '\\') { + /* Quoted pair */ + if (p + 1 < end) { + p += 2; + } else { + p++; + } + } else if (*p == '"') { + qlen = p - c; + state = next_state; + } else { + p++; + } + break; + case parse_comment: + if (*p == '(') { + obraces++; + p++; + } else if (*p == ')') { + ebraces++; + p++; + + if (ebraces == obraces && p < end) { + if (g_ascii_isspace (*p)) { + state = parse_space; + } else { + c = p; + state = next_state; + } + } + } else { + p++; + } + break; + case parse_space: + if (g_ascii_isspace (*p)) { + p++; + } else if (*p == '(') { + obraces = 1; + ebraces = 0; + p++; + state = parse_comment; + } else { + c = p; + state = next_state; + } + break; + } + } + + /* Process leftover */ + switch (state) { + case parse_type: + val.type.begin = c; + val.type.len = p - c; + break; + case parse_subtype: + val.subtype.begin = c; + val.subtype.len = p - c; + break; + case parse_param_value: + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param (pool, &val, pname_start, + pname_end, c, p); + + } + case parse_param_value_after_quote: + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param (pool, &val, pname_start, + pname_end, c, c + qlen); + } + break; + default: + break; + } + + if (val.type.len > 0) { res = rspamd_mempool_alloc (pool, sizeof (val)); memcpy (res, &val, sizeof (val)); + } + return res; +} + +struct rspamd_content_type * +rspamd_content_type_parse (const gchar *in, + gsize len, rspamd_mempool_t *pool) +{ + struct rspamd_content_type *res = NULL; + rspamd_ftok_t srch; + gchar *lc_data; + + lc_data = rspamd_mempool_alloc (pool, len); + memcpy (lc_data, in, len); + rspamd_str_lc (lc_data, len); + + if ((res = rspamd_content_type_parser (lc_data, len, pool)) != NULL) { if (res->attrs) { rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs); } - /* Now do some hacks to work with broken content types */ if (res->subtype.len == 0) { res->flags |= RSPAMD_CONTENT_TYPE_BROKEN; @@ -157,7 +416,7 @@ rspamd_content_type_parse (const gchar *in, } } else { - msg_warn_pool ("cannot parse content type: %*s", (gint)len, val.lc_data); + msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data); } return res; diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h index 57fb5d552..439e6c7c8 100644 --- a/src/libmime/smtp_parsers.h +++ b/src/libmime/smtp_parsers.h @@ -26,9 +26,6 @@ int rspamd_smtp_recieved_parse (struct rspamd_task *task, const char *data, size_t len, struct received_header *rh); int rspamd_smtp_addr_parse (const char *data, size_t len, struct rspamd_email_address *addr); - -gboolean rspamd_content_type_parser (const char *data, size_t len, - struct rspamd_content_type *ct, rspamd_mempool_t *pool); gboolean rspamd_content_disposition_parser (const char *data, size_t len, struct rspamd_content_disposition *cd, rspamd_mempool_t *pool); diff --git a/src/ragel/content_type.rl b/src/ragel/content_type.rl index 6f65a7d0f..29889208f 100644 --- a/src/ragel/content_type.rl +++ b/src/ragel/content_type.rl @@ -9,12 +9,11 @@ comment = "(" (FWS? ccontent)* FWS? ")"; CFWS = ((FWS? comment)+ FWS?) | FWS; qcontent = qtextSMTP | quoted_pairSMTP; - quoted_string = CFWS? - (DQUOTE + quoted_string = (DQUOTE (((FWS? qcontent)* FWS?) >Quoted_Str_Start %Quoted_Str_End) - DQUOTE) CFWS?; + DQUOTE); token = 0x21..0x27 | 0x2a..0x2b | 0x2c..0x2e | 0x30..0x39 | 0x41..0x5a | 0x5e..0x7e; - value = (quoted_string | (token | 0x3d)+) >Param_Value_Start %Param_Value_End; + value = (quoted_string | (token)+) >Param_Value_Start %Param_Value_End; attribute = (token+) >Param_Name_Start %Param_Name_End; parameter = CFWS? attribute FWS? "=" FWS? value CFWS?; diff --git a/src/ragel/content_type_parser.rl b/src/ragel/content_type_parser.rl index eca3da3f8..f248e0116 100644 --- a/src/ragel/content_type_parser.rl +++ b/src/ragel/content_type_parser.rl @@ -42,6 +42,7 @@ } action Param_Name_Start { + printf("name start: %s\n", p); qstart = NULL; qend = NULL; pname_start = p; @@ -50,25 +51,32 @@ action Param_Name_End { + printf("name end: %s\n", p); if (qstart) { pname_start = qstart; } - if (qend && qend >= qstart) { + if (qstart && qend && qend >= qstart) { pname_end = qend; } else if (p >= pname_start) { pname_end = p; } - qstart = NULL; - qend = NULL; + + if (qstart && qend) { + qstart = NULL; + qend = NULL; + } } action Param_Value_Start { - qstart = NULL; - qend = NULL; + printf("value start: %s\n", p); + if (qend) { + qstart = NULL; + qend = NULL; + } - if (pname_end) { + if (pname_end && !pvalue_start) { pvalue_start = p; pvalue_end = NULL; } @@ -76,30 +84,40 @@ action Param_Value_End { - if (pname_end) { + printf("value end: %s\n", p); + if (pname_end && pname_start) { if (qstart) { pvalue_start = qstart; + + if (!qend) { + pvalue_end = NULL; + } } + if (qend && qend >= qstart) { - pvalue_end = qend; + if (qstart) { + pvalue_end = qend; + } + else { + pvalue_end = NULL; + } } - else if (p >= pvalue_start) { + else if (!qstart && p >= pvalue_start) { pvalue_end = p; } - qstart = NULL; - qend = NULL; - if (pvalue_end && pvalue_end > pvalue_start && pname_end > pname_start) { - rspamd_content_type_add_param (pool, ct, pname_start, pname_end, pvalue_start, pvalue_end); + if (pname_start && pvalue_start && pvalue_end && pvalue_end > pvalue_start + && pname_end > pname_start) { + rspamd_content_type_add_param (pool, ct, pname_start, pname_end, + pvalue_start, pvalue_end); + pname_start = NULL; + pname_end = NULL; + pvalue_start = NULL; + pvalue_end = NULL; + qend = NULL; + qstart = NULL; } } - - pname_start = NULL; - pname_end = NULL; - pvalue_start = NULL; - pvalue_end = NULL; - qend = NULL; - qstart = NULL; } action Quoted_Str_Start {