From ba7924d7f331ba0523a3f5a2dd012f236d41b46e Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 13 Dec 2016 17:48:21 +0000 Subject: [PATCH] [Rework] Add preliminary implementation of the mime parser --- src/libmime/CMakeLists.txt | 3 +- src/libmime/email_addr.h | 2 +- src/libmime/images.c | 2 +- src/libmime/message.c | 6 +- src/libmime/mime_expressions.c | 2 +- src/libmime/mime_headers.c | 6 +- src/libmime/mime_headers.h | 2 +- src/libmime/mime_parser.c | 425 +++++++++++++++++++++++++++++++++ src/libmime/mime_parser.h | 25 ++ src/libserver/dkim.c | 2 +- src/libserver/re_cache.c | 2 +- src/libstat/stat_process.c | 2 +- src/lua/lua_task.c | 2 +- src/plugins/dkim_check.c | 2 +- 14 files changed, 467 insertions(+), 16 deletions(-) create mode 100644 src/libmime/mime_parser.c create mode 100644 src/libmime/mime_parser.h diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt index 3ec13937a..0a3b22ecb 100644 --- a/src/libmime/CMakeLists.txt +++ b/src/libmime/CMakeLists.txt @@ -7,6 +7,7 @@ SET(LIBRSPAMDMIMESRC ${CMAKE_CURRENT_SOURCE_DIR}/message.c ${CMAKE_CURRENT_SOURCE_DIR}/archives.c ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c - ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c) + ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c + ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c) SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/libmime/email_addr.h b/src/libmime/email_addr.h index 98be33abe..de91237db 100644 --- a/src/libmime/email_addr.h +++ b/src/libmime/email_addr.h @@ -19,7 +19,7 @@ #include "config.h" #include "ref.h" -struct raw_header; +struct rspamd_mime_header; enum rspamd_email_address_flags { RSPAMD_EMAIL_ADDR_VALID = (1 << 0), diff --git a/src/libmime/images.c b/src/libmime/images.c index 72f585a14..a0b9c30d5 100644 --- a/src/libmime/images.c +++ b/src/libmime/images.c @@ -553,7 +553,7 @@ process_image (struct rspamd_task *task, struct rspamd_mime_part *part) { enum rspamd_image_type type; struct rspamd_image 
*img = NULL; - struct raw_header *rh; + struct rspamd_mime_header *rh; struct rspamd_mime_text_part *tp; struct html_image *himg; const gchar *cid, *html_cid; diff --git a/src/libmime/message.c b/src/libmime/message.c index cb9009d7b..e8b8f0bab 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1138,7 +1138,7 @@ rspamd_message_parse (struct rspamd_task *task) GPtrArray *hdrs; GMimeObject *parent; const GMimeContentType *ct; - struct raw_header *rh; + struct rspamd_mime_header *rh; struct rspamd_mime_text_part *p1, *p2; struct mime_foreach_data md; struct received_header *recv, *trecv; @@ -1527,7 +1527,7 @@ rspamd_message_get_header_from_hash (GHashTable *htb, gboolean strong) { GPtrArray *ret, *ar; - struct raw_header *cur; + struct rspamd_mime_header *cur; guint i; ar = g_hash_table_lookup (htb, field); @@ -1573,7 +1573,7 @@ rspamd_message_get_mime_header_array (struct rspamd_task *task, gboolean strong) { GPtrArray *ret, *ar; - struct raw_header *cur; + struct rspamd_mime_header *cur; guint nelems = 0, i, j; struct rspamd_mime_part *mp; diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index 4fd1a893c..928949e8a 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -1350,7 +1350,7 @@ rspamd_compare_transfer_encoding (struct rspamd_task * task, GPtrArray *headerlist; struct expression_argument *arg; guint i; - struct raw_header *rh; + struct rspamd_mime_header *rh; static const char *hname = "Content-Transfer-Encoding"; if (args == NULL) { diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index 8ba8e6c60..43e0fe7bc 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -19,7 +19,7 @@ static void rspamd_mime_header_add (struct rspamd_task *task, - GHashTable *target, struct raw_header *rh) + GHashTable *target, struct rspamd_mime_header *rh) { GPtrArray *ar; @@ -40,7 +40,7 @@ void rspamd_mime_headers_process (struct rspamd_task *task, GHashTable *target, 
const gchar *in, gsize len, gboolean check_newlines) { - struct raw_header *new = NULL; + struct rspamd_mime_header *new = NULL; const gchar *p, *c, *end; gchar *tmp, *tp; gint state = 0, l, next_state = 100, err_state = 100, t_state; @@ -73,7 +73,7 @@ rspamd_mime_headers_process (struct rspamd_task *task, GHashTable *target, if (*p == ':') { new = rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct raw_header)); + sizeof (struct rspamd_mime_header)); l = p - c; tmp = rspamd_mempool_alloc (task->task_pool, l + 1); rspamd_strlcpy (tmp, c, l + 1); diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h index aa76bed18..98176cab5 100644 --- a/src/libmime/mime_headers.h +++ b/src/libmime/mime_headers.h @@ -20,7 +20,7 @@ struct rspamd_task; -struct raw_header { +struct rspamd_mime_header { gchar *name; gchar *value; const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */ diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c new file mode 100644 index 000000000..eab42b9b6 --- /dev/null +++ b/src/libmime/mime_parser.c @@ -0,0 +1,425 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "config.h"
+#include "task.h"
+#include "mime_parser.h"
+#include "mime_headers.h"
+#include "message.h"
+#include "content_type.h"
+#include "multipattern.h"
+#include "cryptobox.h"
+
+/* Lazily created multipattern matching "\r--" and "\n--" */
+static struct rspamd_multipattern *mp_boundary = NULL;
+
+/* State shared by the recursive part parsers */
+struct rspamd_mime_parser_stack {
+	GPtrArray *stack; /* Stack of parts */
+	struct rspamd_mime_part *cur_part; /* Innermost container part, NULL at top level */
+	const gchar *pos; /* Start of the data used when there is no parent part */
+	const gchar *end; /* End of the message buffer */
+};
+
+#define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
+/*
+ * Error domain used for all GError objects set by this file
+ */
+static GQuark
+rspamd_mime_parser_quark (void)
+{
+	return g_quark_from_static_string ("mime-parser");
+}
+
+/*
+ * Create the global boundary multipattern and register the "\r--" and
+ * "\n--" patterns in it
+ */
+static void
+rspamd_mime_parser_init_mp (void)
+{
+	mp_boundary = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
+	g_assert (mp_boundary != NULL);
+	rspamd_multipattern_add_pattern (mp_boundary, "\r--", 0);
+	rspamd_multipattern_add_pattern (mp_boundary, "\n--", 0);
+}
+
+/*
+ * Map a Content-Transfer-Encoding value to enum rspamd_cte by comparing the
+ * hash of the whole value with precomputed constants.
+ * The only caller in this file lowercases the value before passing it here.
+ * @param in value to classify
+ * @param len length of the value
+ * @return the matched encoding or RSPAMD_CTE_UNKNOWN
+ */
+static enum rspamd_cte
+rspamd_mime_parse_cte (const gchar *in, gsize len)
+{
+	guint64 h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
+			in, len, 0xdeadbabe);
+	enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
+
+	switch (h) {
+	case 0xCEDAA7056B4753F7ULL: /* 7bit */
+		ret = RSPAMD_CTE_7BIT;
+		break;
+	case 0x42E0745448B39FC1ULL: /* 8bit */
+		ret = RSPAMD_CTE_8BIT;
+		break;
+	case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
+		ret = RSPAMD_CTE_QP;
+		break;
+	case 0x96305588A76DC9A9ULL: /* base64 */
+	case 0x171029DE1B0423A9ULL: /* base-64 */
+		ret = RSPAMD_CTE_B64;
+		break;
+	default:
+		/* Anything else stays RSPAMD_CTE_UNKNOWN */
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Guess the transfer encoding of a part from (at most) the first 80 bytes of
+ * its raw data and store the guess in part->cte:
+ *  - only [A-Za-z0-9/+], fewer than two '=' and no spaces -> base64
+ *  - no bytes >= 0x80, more than two '=' and more than two spaces -> QP
+ *  - no bytes >= 0x80 otherwise -> 7bit
+ *  - any byte >= 0x80 -> 8bit
+ * @param part part to examine, part->cte is updated
+ */
+static void
+rspamd_mime_part_get_cte_heuristic (struct rspamd_mime_part *part)
+{
+	const guint check_len = 80;
+	guint real_len, nspaces = 0, neqsign = 0, n8bit = 0;
+	gboolean b64_chars = TRUE;
+	const guchar *p, *end;
+	enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
+
+	real_len = MIN (check_len, part->raw_data.len);
+	p = (const guchar *)part->raw_data.begin;
+	end = p + real_len;
+
+	while (p < end) {
+		if (*p == ' ') {
+			nspaces ++;
+		}
+		else if (*p == '=') {
+			neqsign ++;
+		}
+		else if (*p >= 0x80) {
+			n8bit ++;
+			b64_chars = FALSE;
+		}
+		else if (!(g_ascii_isalnum (*p) || *p == '/' || *p == '+')) {
+			b64_chars = FALSE;
+		}
+
+		p ++;
+	}
+
+	if (b64_chars && neqsign < 2 && nspaces == 0) {
+		ret = RSPAMD_CTE_B64;
+	}
+	else if (n8bit == 0) {
+		if (neqsign > 2 && nspaces > 2) {
+			ret = RSPAMD_CTE_QP;
+		}
+		else {
+			ret = RSPAMD_CTE_7BIT;
+		}
+	}
+	else {
+		ret = RSPAMD_CTE_8BIT;
+	}
+
+	/* Publish the guess, otherwise the whole scan above has no effect */
+	part->cte = ret;
+}
+
+/*
+ * Determine the transfer encoding of a part: the first
+ * Content-Transfer-Encoding header with a recognised value wins, otherwise
+ * the content heuristic is used.
+ * Header values are lowercased in place before being classified.
+ * @param task task owning the part
+ * @param part part to update, part->cte is set
+ */
+static void
+rspamd_mime_part_get_cte (struct rspamd_task *task, struct rspamd_mime_part *part)
+{
+	struct rspamd_mime_header *hdr;
+	guint i;
+	GPtrArray *hdrs;
+	enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
+
+	hdrs = rspamd_message_get_header_from_hash (part->raw_headers,
+			task->task_pool,
+			"Content-Transfer-Encoding", FALSE);
+
+	if (hdrs == NULL) {
+		rspamd_mime_part_get_cte_heuristic (part);
+	}
+	else {
+		for (i = 0; i < hdrs->len; i ++) {
+			gsize hlen;
+
+			hdr = g_ptr_array_index (hdrs, i);
+			hlen = strlen (hdr->value);
+			rspamd_str_lc (hdr->value, hlen);
+			cte = rspamd_mime_parse_cte (hdr->value, hlen);
+
+			if (cte != RSPAMD_CTE_UNKNOWN) {
+				break;
+			}
+		}
+
+		if (cte == RSPAMD_CTE_UNKNOWN) {
+			rspamd_mime_part_get_cte_heuristic (part);
+		}
+		else {
+			part->cte = cte;
+		}
+	}
+}
+
+/*
+ * Decode a leaf (non multipart, non message) part according to its transfer
+ * encoding and fill part->parsed_data.
+ * For 7bit/8bit/unknown encodings parsed_data aliases raw_data; for
+ * quoted-printable and base64 a new buffer is allocated and its release is
+ * registered in the task pool.
+ * @param task task owning the part
+ * @param part part to decode, must not be NULL
+ * @param st parser state (unused here)
+ * @param err error target (unused here)
+ * @return always TRUE
+ */
+static gboolean
+rspamd_mime_parse_normal_part (struct rspamd_task *task,
+		struct rspamd_mime_part *part,
+		struct rspamd_mime_parser_stack *st,
+		GError **err)
+{
+	rspamd_fstring_t *parsed;
+	gssize r;
+
+	g_assert (part != NULL);
+
+	rspamd_mime_part_get_cte (task, part);
+
+	switch (part->cte) {
+	case RSPAMD_CTE_7BIT:
+	case RSPAMD_CTE_8BIT:
+	case RSPAMD_CTE_UNKNOWN:
+		part->parsed_data.begin = part->raw_data.begin;
+		part->parsed_data.len = part->raw_data.len;
+		break;
+	case RSPAMD_CTE_QP:
+		parsed = rspamd_fstring_sized_new (part->raw_data.len);
+		r = rspamd_decode_qp_buf (part->raw_data.begin, part->raw_data.len,
+				parsed->str, parsed->allocated);
+
+		if (r != -1) {
+			parsed->len = r;
+			part->parsed_data.begin = parsed->str;
+			part->parsed_data.len = parsed->len;
+			rspamd_mempool_add_destructor (task->task_pool,
+					(rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
+		}
+		else {
+			/*
+			 * The result depends on message content, so it must never
+			 * abort the process: keep the raw bytes instead
+			 */
+			msg_info_task ("invalid quoted-printable encoded part, assume 8bit");
+			rspamd_fstring_free (parsed);
+			part->cte = RSPAMD_CTE_8BIT;
+			part->parsed_data.begin = part->raw_data.begin;
+			part->parsed_data.len = part->raw_data.len;
+		}
+		break;
+	case RSPAMD_CTE_B64:
+		parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
+		/* NOTE(review): the decoder result is ignored - confirm that
+		 * parsed->len is meaningful when decoding fails */
+		rspamd_cryptobox_base64_decode (part->raw_data.begin, part->raw_data.len,
+				parsed->str, &parsed->len);
+		part->parsed_data.begin = parsed->str;
+		part->parsed_data.len = parsed->len;
+		rspamd_mempool_add_destructor (task->task_pool,
+				(rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
+		break;
+	default:
+		g_assert_not_reached ();
+	}
+
+	return TRUE;
+}
+
+/*
+ * Placeholder for multipart parsing: not implemented yet
+ * @return always FALSE, err is not set
+ */
+static gboolean
+rspamd_mime_parse_multipart_part (struct rspamd_task *task,
+		struct rspamd_mime_part *part,
+		struct rspamd_mime_parser_stack *st,
+		GError **err)
+{
+	return FALSE;
+}
+
+/*
+ * Parse a message: the whole task message when st->cur_part is NULL,
+ * otherwise the message stored in the raw data of `part`.
+ * Headers are located and processed, a content type is selected (a multipart
+ * one is preferred, otherwise the first that can be parsed), a new part is
+ * allocated from the task pool and handed to the parser matching its type.
+ * @param task task being processed
+ * @param part enclosing part, NULL for the top-level message
+ * @param st parser state, cur_part and stack are updated for containers
+ * @param err set when no content type header is found or none can be parsed
+ * @return result of the selected part parser, FALSE on the errors above
+ */
+static gboolean
+rspamd_mime_parse_message (struct rspamd_task *task,
+		struct rspamd_mime_part *part,
+		struct rspamd_mime_parser_stack *st,
+		GError **err)
+{
+	struct rspamd_content_type *ct, *sel = NULL;
+	struct rspamd_mime_header *hdr;
+	GPtrArray *hdrs = NULL;
+	const gchar *pbegin, *p;
+	gsize plen, len;
+	struct rspamd_mime_part *npart;
+	goffset hdr_pos, body_pos;
+	guint i;
+	gboolean ret = FALSE;
+	GString str;
+
+	/* Parse headers */
+	if (st->cur_part == NULL) {
+		p = task->msg.begin;
+		len = task->msg.len;
+		/* Skip any space characters to avoid some bad messages to be unparsed */
+		while (len > 0 && g_ascii_isspace (*p)) {
+			p ++;
+			len --;
+		}
+
+		/*
+		 * Exim somehow uses mailbox format for messages being scanned:
+		 * From xxx@xxx.com Fri May 13 19:08:48 2016
+		 *
+		 * So we check if a task has non-http format then we check for such a line
+		 * at the beginning to avoid errors
+		 */
+		if (!(task->flags & RSPAMD_TASK_FLAG_JSON) || (task->flags &
+				RSPAMD_TASK_FLAG_LOCAL_CLIENT)) {
+			if (len > sizeof ("From ") - 1) {
+				if (memcmp (p, "From ", sizeof ("From ") - 1) == 0) {
+					/* Skip to CRLF */
+					msg_info_task ("mailbox input detected, enable workaround");
+					p += sizeof ("From ") - 1;
+					len -= sizeof ("From ") - 1;
+
+					while (len > 0 && *p != '\n') {
+						p ++;
+						len --;
+					}
+					while (len > 0 && g_ascii_isspace (*p)) {
+						p ++;
+						len --;
+					}
+				}
+			}
+		}
+
+		str.str = (gchar *)p;
+		str.len = len;
+	}
+	else {
+		p = part->raw_data.begin;
+		len = part->raw_data.len;
+
+		/* Skip any space characters to avoid some bad messages to be unparsed */
+		while (len > 0 && g_ascii_isspace (*p)) {
+			p ++;
+			len --;
+		}
+
+		str.str = (gchar *)p;
+		str.len = len;
+	}
+
+	/* hdr_pos is used below as the headers length, body_pos as the body offset */
+	hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
+
+	if (hdr_pos > 0 && hdr_pos < str.len) {
+
+		if (st->cur_part == NULL) {
+			task->raw_headers_content.begin = (gchar *) (str.str);
+			task->raw_headers_content.len = hdr_pos;
+			task->raw_headers_content.body_start = str.str + body_pos;
+
+			if (task->raw_headers_content.len > 0) {
+				rspamd_mime_headers_process (task, task->raw_headers,
+						task->raw_headers_content.begin,
+						task->raw_headers_content.len,
+						TRUE);
+			}
+
+			hdrs = rspamd_message_get_header_from_hash (task->raw_headers,
+					task->task_pool,
+					"Content-Type", FALSE);
+		}
+		else {
+			part->raw_headers_str = str.str;
+			/* Only the header block, as for the top-level message */
+			part->raw_headers_len = hdr_pos;
+			part->raw_data.begin = p + body_pos;
+			/*
+			 * `len' already excludes the leading whitespace skipped above,
+			 * so the body never extends past the end of the original data
+			 */
+			part->raw_data.len = len - body_pos;
+			task->raw_headers_content.body_start = p + body_pos;
+
+			/*
+			 * NOTE(review): this re-processes the task level headers and
+			 * looks the content type up in a table shared with the parent
+			 * (see the g_hash_table_ref calls below); the headers found in
+			 * part->raw_headers_str are not parsed yet - confirm the
+			 * intended ownership of per-part headers before changing it
+			 */
+			if (task->raw_headers_content.len > 0) {
+				rspamd_mime_headers_process (task, task->raw_headers,
+						task->raw_headers_content.begin,
+						task->raw_headers_content.len,
+						TRUE);
+			}
+
+			hdrs = rspamd_message_get_header_from_hash (st->cur_part->raw_headers,
+					task->task_pool,
+					"Content-Type", FALSE);
+		}
+
+	}
+
+
+	if (hdrs == NULL) {
+		g_set_error (err, RSPAMD_MIME_QUARK, EINVAL,
+				"Content type header is absent");
+
+		return FALSE;
+	}
+
+	for (i = 0; i < hdrs->len; i ++) {
+		hdr = g_ptr_array_index (hdrs, i);
+		ct = rspamd_content_type_parse (hdr->value, strlen (hdr->value),
+				task->task_pool);
+
+		/* Here we prefer multipart content-type or any content-type */
+		if (ct) {
+			if (sel == NULL) {
+				sel = ct;
+			}
+			else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+				sel = ct;
+			}
+		}
+	}
+
+	if (sel == NULL) {
+		g_set_error (err, RSPAMD_MIME_QUARK, EINVAL,
+				"Content type header cannot be parsed");
+
+		return FALSE;
+	}
+
+	if (part) {
+		pbegin = part->raw_data.begin;
+		plen = part->raw_data.len;
+	}
+	else {
+		pbegin = st->pos;
+		plen = st->end - pbegin;
+	}
+
+	/* Parts live in the task pool and are never freed individually */
+	npart = rspamd_mempool_alloc0 (task->task_pool,
+			sizeof (struct rspamd_mime_part));
+	npart->raw_data.begin = pbegin;
+	npart->raw_data.len = plen;
+	npart->parent = part;
+	npart->ct = sel;
+
+	if (st->cur_part == NULL) {
+		npart->raw_headers = g_hash_table_ref (task->raw_headers);
+	}
+	else {
+		npart->raw_headers = g_hash_table_ref (st->cur_part->raw_headers);
+	}
+
+	if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+		st->cur_part = npart;
+		g_ptr_array_add (st->stack, npart);
+		ret = rspamd_mime_parse_multipart_part (task, npart, st, err);
+	}
+	else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
+		st->cur_part = npart;
+		g_ptr_array_add (st->stack, npart);
+		ret = rspamd_mime_parse_message (task, npart, st, err);
+	}
+	else {
+		ret = rspamd_mime_parse_normal_part (task, npart, st, err);
+	}
+
+	return ret;
+}
+
+/*
+ * Entry point of the parser: parse the message stored in task->msg.
+ * The boundary multipattern is created on the first call.
+ * @param task task holding the message
+ * @param err error target, set by rspamd_mime_parse_message on failure
+ * @return TRUE if the message has been parsed
+ */
+gboolean
+rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
+{
+	struct rspamd_mime_parser_stack *st;
+	gboolean ret;
+
+	if (mp_boundary == NULL) {
+		rspamd_mime_parser_init_mp ();
+	}
+
+	st = g_slice_alloc0 (sizeof (*st));
+	st->stack = g_ptr_array_sized_new (4);
+	st->pos = task->raw_headers_content.body_start;
+	st->end = task->msg.begin + task->msg.len;
+
+	if (st->pos == NULL) {
+		st->pos = task->msg.begin;
+	}
+
+	ret = rspamd_mime_parse_message (task, NULL, st, err);
+
+	/*
+	 * Parts are allocated from the task pool, so only the pointer array
+	 * and the state itself have to be released here
+	 */
+	g_ptr_array_free (st->stack, TRUE);
+	g_slice_free1 (sizeof (*st), st);
+
+	return ret;
+}
diff --git a/src/libmime/mime_parser.h b/src/libmime/mime_parser.h
new file mode 100644
index 000000000..f1355d72e
--- /dev/null
+++ b/src/libmime/mime_parser.h
@@ -0,0 +1,25 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_MIME_PARSER_H_ +#define SRC_LIBMIME_MIME_PARSER_H_ + +#include "config.h" + +struct rspamd_task; + +gboolean rspamd_mime_parse_task (struct rspamd_task *task, GError **err); + +#endif /* SRC_LIBMIME_MIME_PARSER_H_ */ diff --git a/src/libserver/dkim.c b/src/libserver/dkim.c index fbba759e0..873d59d22 100644 --- a/src/libserver/dkim.c +++ b/src/libserver/dkim.c @@ -1713,7 +1713,7 @@ rspamd_dkim_canonize_header (struct rspamd_dkim_common_ctx *ctx, const gchar *dkim_header, const gchar *dkim_domain) { - struct raw_header *rh; + struct rspamd_mime_header *rh; guint rh_num = 0, i; GPtrArray *ar; diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 961fcf291..1e713dc86 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -753,7 +753,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, guint ret = 0, i, re_id; GPtrArray *headerlist; GHashTableIter it; - struct raw_header *rh; + struct rspamd_mime_header *rh; const gchar *in, *end; const guchar **scvec; guint *lenvec; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 768b9af69..be90dff83 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -35,7 +35,7 @@ static void rspamd_stat_tokenize_header (struct rspamd_task *task, const gchar *name, const gchar *prefix, GArray *ar) { - struct raw_header *cur; + struct rspamd_mime_header *cur; GPtrArray *hdrs; guint i; rspamd_ftok_t str; diff --git a/src/lua/lua_task.c 
b/src/lua/lua_task.c index ea821f488..8d922757e 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -1426,7 +1426,7 @@ rspamd_lua_push_header (lua_State * L, gboolean raw) { - struct raw_header *rh; + struct rspamd_mime_header *rh; guint i; const gchar *val; diff --git a/src/plugins/dkim_check.c b/src/plugins/dkim_check.c index 5d30cdcb3..7c00f4e90 100644 --- a/src/plugins/dkim_check.c +++ b/src/plugins/dkim_check.c @@ -702,7 +702,7 @@ dkim_symbol_callback (struct rspamd_task *task, void *unused) rspamd_dkim_context_t *ctx; rspamd_dkim_key_t *key; GError *err = NULL; - struct raw_header *rh; + struct rspamd_mime_header *rh; struct dkim_check_result *res = NULL, *cur; guint checked = 0, i; -- 2.39.5