source.dussan.org Git - rspamd.git/commitdiff
[Rework] Add preliminary implementation of the mime parser
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 13 Dec 2016 17:48:21 +0000 (17:48 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 13 Dec 2016 17:48:21 +0000 (17:48 +0000)
14 files changed:
src/libmime/CMakeLists.txt
src/libmime/email_addr.h
src/libmime/images.c
src/libmime/message.c
src/libmime/mime_expressions.c
src/libmime/mime_headers.c
src/libmime/mime_headers.h
src/libmime/mime_parser.c [new file with mode: 0644]
src/libmime/mime_parser.h [new file with mode: 0644]
src/libserver/dkim.c
src/libserver/re_cache.c
src/libstat/stat_process.c
src/lua/lua_task.c
src/plugins/dkim_check.c

index 3ec13937aebf192bcb37370e9d495c34f227f020..0a3b22ecbffa359f1f998e40261ed2c70b40a12c 100644 (file)
@@ -7,6 +7,7 @@ SET(LIBRSPAMDMIMESRC
                                ${CMAKE_CURRENT_SOURCE_DIR}/message.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/archives.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c
-                               ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c)
+                               ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c
+                               ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c)
 
 SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file
index 98be33abe640049961c409dd5e942c6ae584d0c3..de91237db8ae6da2614fd7dc7ba1f96ae15ffef7 100644 (file)
@@ -19,7 +19,7 @@
 #include "config.h"
 #include "ref.h"
 
-struct raw_header;
+struct rspamd_mime_header;
 
 enum rspamd_email_address_flags {
        RSPAMD_EMAIL_ADDR_VALID = (1 << 0),
index 72f585a143d50ff2b3e7fd332013c567c0f8d5d3..a0b9c30d537937d5798b7922dbc0af89059418d7 100644 (file)
@@ -553,7 +553,7 @@ process_image (struct rspamd_task *task, struct rspamd_mime_part *part)
 {
        enum rspamd_image_type type;
        struct rspamd_image *img = NULL;
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        struct rspamd_mime_text_part *tp;
        struct html_image *himg;
        const gchar *cid, *html_cid;
index cb9009d7b0e8bfae24d31b744118778e02cd38f2..e8b8f0bab14d7c95194e731d42cb60cf929d1ca5 100644 (file)
@@ -1138,7 +1138,7 @@ rspamd_message_parse (struct rspamd_task *task)
        GPtrArray *hdrs;
        GMimeObject *parent;
        const GMimeContentType *ct;
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        struct rspamd_mime_text_part *p1, *p2;
        struct mime_foreach_data md;
        struct received_header *recv, *trecv;
@@ -1527,7 +1527,7 @@ rspamd_message_get_header_from_hash (GHashTable *htb,
                gboolean strong)
 {
        GPtrArray *ret, *ar;
-       struct raw_header *cur;
+       struct rspamd_mime_header *cur;
        guint i;
 
        ar = g_hash_table_lookup (htb, field);
@@ -1573,7 +1573,7 @@ rspamd_message_get_mime_header_array (struct rspamd_task *task,
                gboolean strong)
 {
        GPtrArray *ret, *ar;
-       struct raw_header *cur;
+       struct rspamd_mime_header *cur;
        guint nelems = 0, i, j;
        struct rspamd_mime_part *mp;
 
index 4fd1a893c591b5e1df87537a2c7d45d7d4d85e19..928949e8a83bcf72b4d8ec00a510b7a1d3e4ef14 100644 (file)
@@ -1350,7 +1350,7 @@ rspamd_compare_transfer_encoding (struct rspamd_task * task,
        GPtrArray *headerlist;
        struct expression_argument *arg;
        guint i;
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        static const char *hname = "Content-Transfer-Encoding";
 
        if (args == NULL) {
index 8ba8e6c60d037f7736945945e15103282d46ce07..43e0fe7bc1ca7eb0d678b6379468284e9a02761c 100644 (file)
@@ -19,7 +19,7 @@
 
 static void
 rspamd_mime_header_add (struct rspamd_task *task,
-               GHashTable *target, struct raw_header *rh)
+               GHashTable *target, struct rspamd_mime_header *rh)
 {
        GPtrArray *ar;
 
@@ -40,7 +40,7 @@ void
 rspamd_mime_headers_process (struct rspamd_task *task, GHashTable *target,
                const gchar *in, gsize len, gboolean check_newlines)
 {
-       struct raw_header *new = NULL;
+       struct rspamd_mime_header *new = NULL;
        const gchar *p, *c, *end;
        gchar *tmp, *tp;
        gint state = 0, l, next_state = 100, err_state = 100, t_state;
@@ -73,7 +73,7 @@ rspamd_mime_headers_process (struct rspamd_task *task, GHashTable *target,
                        if (*p == ':') {
                                new =
                                        rspamd_mempool_alloc0 (task->task_pool,
-                                               sizeof (struct raw_header));
+                                               sizeof (struct rspamd_mime_header));
                                l = p - c;
                                tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
                                rspamd_strlcpy (tmp, c, l + 1);
index aa76bed1806f25093a231e067186d6d369ffe9a2..98176cab50df49d06ec7a5701ee633f3465c430d 100644 (file)
@@ -20,7 +20,7 @@
 
 struct rspamd_task;
 
-struct raw_header {
+struct rspamd_mime_header {
        gchar *name;
        gchar *value;
        const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
new file mode 100644 (file)
index 0000000..eab42b9
--- /dev/null
@@ -0,0 +1,425 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "task.h"
+#include "mime_parser.h"
+#include "mime_headers.h"
+#include "message.h"
+#include "content_type.h"
+#include "multipattern.h"
+#include "cryptobox.h"
+/* Created on first use by rspamd_mime_parser_init_mp; holds the "\r--" and "\n--" patterns */
+static struct rspamd_multipattern *mp_boundary = NULL;
+/* Parser state handed down through the rspamd_mime_parse_* calls */
+struct rspamd_mime_parser_stack {
+       GPtrArray *stack; /* Stack of parts */
+       struct rspamd_mime_part *cur_part; /* last multipart/message part entered; NULL at top level */
+       const gchar *pos; /* start of the data to parse: body start if known, else message start */
+       const gchar *end; /* one past the last byte of the task message */
+};
+/* Error domain for every GError set by this file */
+#define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
+static GQuark
+rspamd_mime_parser_quark (void)
+{
+       return g_quark_from_static_string ("mime-parser");
+}
+/* Build the global mp_boundary matcher; asserts that creation succeeded */
+static void
+rspamd_mime_parser_init_mp (void)
+{
+       mp_boundary = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
+       g_assert (mp_boundary != NULL);
+       rspamd_multipattern_add_pattern (mp_boundary, "\r--", 0); /* "--" right after a CR */
+       rspamd_multipattern_add_pattern (mp_boundary, "\n--", 0); /* NOTE(review): no compile step follows - confirm whether one is needed before lookups */
+}
+/* Map a Content-Transfer-Encoding value (the caller lowercases it first) to enum rspamd_cte */
+static enum rspamd_cte
+rspamd_mime_parse_cte (const gchar *in, gsize len)
+{
+       const guint64 seed = 0xdeadbabe;
+       guint64 digest;
+
+       digest = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
+                       in, len, seed);
+
+       /* Each constant is presumably the hash of the name in its comment */
+       switch (digest) {
+       case 0xCEDAA7056B4753F7ULL: /* 7bit */
+               return RSPAMD_CTE_7BIT;
+       case 0x42E0745448B39FC1ULL: /* 8bit */
+               return RSPAMD_CTE_8BIT;
+       case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
+               return RSPAMD_CTE_QP;
+       case 0x96305588A76DC9A9ULL: /* base64 */
+       case 0x171029DE1B0423A9ULL: /* base-64 */
+               return RSPAMD_CTE_B64;
+       default:
+               /* Any other digest means the value is not a known encoding */
+               return RSPAMD_CTE_UNKNOWN;
+       }
+}
+/* Guess the transfer encoding from a sample of the raw body and store it in part->cte */
+static void
+rspamd_mime_part_get_cte_heuristic (struct rspamd_mime_part *part)
+{
+       const guint check_len = 80; /* at most this many leading bytes are sampled */
+       guint real_len, nspaces = 0, neqsign = 0, n8bit = 0;
+       gboolean b64_chars = TRUE;
+       const guchar *p, *end;
+
+       real_len = MIN (check_len, part->raw_data.len);
+       p = (const guchar *)part->raw_data.begin;
+       end = p + real_len;
+
+       while (p < end) {
+               if (*p == ' ') {
+                       nspaces ++;
+               }
+               else if (*p == '=') {
+                       neqsign ++;
+               }
+               else if (*p >= 0x80) {
+                       n8bit ++;
+                       b64_chars = FALSE;
+               }
+               else if (!(g_ascii_isalnum (*p) || *p == '/' || *p == '+')) {
+                       b64_chars = FALSE; /* NOTE(review): CR and LF land here too, so a sample with a line break is never base64 */
+               }
+
+               p ++;
+       }
+
+       /* Write the verdict to the part: the part decoder switches on part->cte */
+       if (b64_chars && neqsign < 2 && nspaces == 0) {
+               part->cte = RSPAMD_CTE_B64;
+       }
+       else if (n8bit == 0) {
+               if (neqsign > 2 && nspaces > 2) {
+                       part->cte = RSPAMD_CTE_QP;
+               }
+               else {
+                       part->cte = RSPAMD_CTE_7BIT;
+               }
+       }
+       else {
+               part->cte = RSPAMD_CTE_8BIT;
+       }
+}
+/*
+ * Decide the transfer encoding of a part.
+ * Content-Transfer-Encoding headers are tried in order and the first value
+ * that rspamd_mime_parse_cte recognises is stored in part->cte; when there is
+ * no such header or value, rspamd_mime_part_get_cte_heuristic is called.
+ */
+static void
+rspamd_mime_part_get_cte (struct rspamd_task *task, struct rspamd_mime_part *part)
+{
+       enum rspamd_cte found = RSPAMD_CTE_UNKNOWN;
+       struct rspamd_mime_header *cur;
+       GPtrArray *cte_hdrs;
+       gsize vlen;
+       guint idx = 0;
+
+       cte_hdrs = rspamd_message_get_header_from_hash (part->raw_headers,
+                       task->task_pool,
+                       "Content-Transfer-Encoding", FALSE);
+
+       /* A NULL array (no such header) simply skips the loop */
+       while (cte_hdrs != NULL && idx < cte_hdrs->len &&
+                       found == RSPAMD_CTE_UNKNOWN) {
+               cur = g_ptr_array_index (cte_hdrs, idx);
+               vlen = strlen (cur->value);
+               /* The stored value itself is passed, so presumably it is lowercased in place */
+               rspamd_str_lc (cur->value, vlen);
+               found = rspamd_mime_parse_cte (cur->value, vlen);
+               idx ++;
+       }
+
+       if (found == RSPAMD_CTE_UNKNOWN) {
+               /* Nothing usable in the headers: inspect the body instead */
+               rspamd_mime_part_get_cte_heuristic (part);
+               return;
+       }
+
+       part->cte = found;
+}
+/* Fill part->parsed_data for a leaf part by its CTE; returns FALSE with err set if QP decoding fails */
+static gboolean
+rspamd_mime_parse_normal_part (struct rspamd_task *task,
+               struct rspamd_mime_part *part,
+               struct rspamd_mime_parser_stack *st,
+               GError **err)
+{
+       rspamd_fstring_t *parsed = NULL; /* stays NULL when the raw data is used as is */
+       gssize r;
+
+       g_assert (part != NULL);
+       rspamd_mime_part_get_cte (task, part);
+       switch (part->cte) {
+       case RSPAMD_CTE_7BIT:
+       case RSPAMD_CTE_8BIT:
+       case RSPAMD_CTE_UNKNOWN:
+               part->parsed_data.begin = part->raw_data.begin;
+               part->parsed_data.len = part->raw_data.len;
+               break;
+       case RSPAMD_CTE_QP:
+               parsed = rspamd_fstring_sized_new (part->raw_data.len);
+               r = rspamd_decode_qp_buf (part->raw_data.begin, part->raw_data.len,
+                               parsed->str, parsed->allocated);
+               if (r == -1) { /* report the failure through err instead of aborting the process */
+                       rspamd_fstring_free (parsed);
+                       g_set_error (err, RSPAMD_MIME_QUARK, EINVAL,
+                                       "Cannot decode quoted-printable part");
+                       return FALSE;
+               }
+               parsed->len = r;
+               break;
+       case RSPAMD_CTE_B64:
+               parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
+               rspamd_cryptobox_base64_decode (part->raw_data.begin, part->raw_data.len,
+                               parsed->str, &parsed->len); /* NOTE(review): decoder result is not checked - confirm */
+               break;
+       default:
+               g_assert_not_reached ();
+       }
+       if (parsed != NULL) { /* decoded buffer: expose it and register its destructor with the task pool */
+               part->parsed_data.begin = parsed->str;
+               part->parsed_data.len = parsed->len;
+               rspamd_mempool_add_destructor (task->task_pool,
+                               (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
+       }
+       return TRUE;
+}
+/* Placeholder: multipart parsing is not implemented, so every call returns FALSE and leaves err unset */
+static gboolean
+rspamd_mime_parse_multipart_part (struct rspamd_task *task,
+               struct rspamd_mime_part *part,
+               struct rspamd_mime_parser_stack *st,
+               GError **err)
+{
+       return FALSE;
+}
+/* Parse headers of the top-level message (st->cur_part == NULL) or of a nested message part, pick a Content-Type and dispatch on it */
+static gboolean
+rspamd_mime_parse_message (struct rspamd_task *task,
+               struct rspamd_mime_part *part,
+               struct rspamd_mime_parser_stack *st,
+               GError **err)
+{
+       struct rspamd_content_type *ct, *sel = NULL;
+       struct rspamd_mime_header *hdr;
+       GPtrArray *hdrs = NULL;
+       const gchar *pbegin, *p;
+       gsize plen, len;
+       struct rspamd_mime_part *npart;
+       goffset hdr_pos, body_pos;
+       guint i;
+       gboolean ret = FALSE;
+       GString str; /* used only as a (pointer, length) view for rspamd_string_find_eoh */
+
+       /* Choose the input: the whole task message at top level, the part's raw data otherwise */
+       if (st->cur_part == NULL) {
+               p = task->msg.begin;
+               len = task->msg.len;
+               /* Skip leading whitespace so that such messages can still be parsed */
+               while (len > 0 && g_ascii_isspace (*p)) {
+                       p ++;
+                       len --;
+               }
+
+               /*
+                * Exim somehow uses mailbox format for messages being scanned:
+                * From xxx@xxx.com Fri May 13 19:08:48 2016
+                *
+                * So we check if a task has non-http format then we check for such a line
+                * at the beginning to avoid errors
+                */
+               if (!(task->flags & RSPAMD_TASK_FLAG_JSON) || (task->flags &
+                               RSPAMD_TASK_FLAG_LOCAL_CLIENT)) {
+                       if (len > sizeof ("From ") - 1) {
+                               if (memcmp (p, "From ", sizeof ("From ") - 1) == 0) {
+                                       /* Skip the rest of the "From " line, then any whitespace after it */
+                                       msg_info_task ("mailbox input detected, enable workaround");
+                                       p += sizeof ("From ") - 1;
+                                       len -= sizeof ("From ") - 1;
+
+                                       while (len > 0 && *p != '\n') {
+                                               p ++;
+                                               len --;
+                                       }
+                                       while (len > 0 && g_ascii_isspace (*p)) {
+                                               p ++;
+                                               len --;
+                                       }
+                               }
+                       }
+               }
+
+               str.str = (gchar *)p;
+               str.len = len;
+       }
+       else {
+               p = part->raw_data.begin;
+               len = part->raw_data.len;
+
+               /* Skip leading whitespace so that such parts can still be parsed */
+               while (len > 0 && g_ascii_isspace (*p)) {
+                       p ++;
+                       len --;
+               }
+
+               str.str = (gchar *)p;
+               str.len = len;
+       }
+
+       hdr_pos = rspamd_string_find_eoh (&str, &body_pos); /* both results are used below as offsets from str.str */
+
+       if (hdr_pos > 0 && hdr_pos < str.len) {
+               /* When this test fails hdrs stays NULL and the "absent" error below is returned */
+               if (st->cur_part == NULL) {
+                       task->raw_headers_content.begin = (gchar *) (str.str);
+                       task->raw_headers_content.len = hdr_pos;
+                       task->raw_headers_content.body_start = str.str + body_pos;
+
+                       if (task->raw_headers_content.len > 0) {
+                               rspamd_mime_headers_process (task, task->raw_headers,
+                                               task->raw_headers_content.begin,
+                                               task->raw_headers_content.len,
+                                               TRUE);
+                       }
+
+                       hdrs = rspamd_message_get_header_from_hash (task->raw_headers,
+                                       task->task_pool,
+                                       "Content-Type", FALSE);
+               }
+               else {
+                       part->raw_headers_str = str.str;
+                       part->raw_headers_len = str.len; /* NOTE(review): whole remaining length, not hdr_pos - confirm */
+                       part->raw_data.begin = p + body_pos;
+                       part->raw_data.len -= body_pos; /* NOTE(review): whitespace skipped above is not subtracted - confirm */
+                       task->raw_headers_content.body_start = p + body_pos; /* NOTE(review): overwrites the task-level body start - confirm */
+
+                       if (task->raw_headers_content.len > 0) { /* NOTE(review): re-processes the task-level header block, not this part's - confirm intent */
+                               rspamd_mime_headers_process (task, task->raw_headers,
+                                               task->raw_headers_content.begin,
+                                               task->raw_headers_content.len,
+                                               TRUE);
+                       }
+
+                       hdrs = rspamd_message_get_header_from_hash (st->cur_part->raw_headers,
+                                       task->task_pool,
+                                       "Content-Type", FALSE);
+               }
+
+       }
+
+       /* Without any Content-Type header the message or part is rejected */
+       if (hdrs == NULL) {
+               g_set_error (err, RSPAMD_MIME_QUARK, EINVAL,
+                               "Content type header is absent");
+
+               return FALSE;
+       }
+
+       for (i = 0; i < hdrs->len; i ++) {
+               hdr = g_ptr_array_index (hdrs, i);
+               ct = rspamd_content_type_parse (hdr->value, strlen (hdr->value),
+                               task->task_pool);
+
+               /* Keep the first parsable content type, but let a later multipart one replace it */
+               if (ct) {
+                       if (sel == NULL) {
+                               sel = ct;
+                       }
+                       else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+                               sel = ct;
+                       }
+               }
+       }
+
+       if (sel == NULL) {
+               g_set_error (err, RSPAMD_MIME_QUARK, EINVAL,
+                               "Content type header cannot be parsed");
+
+               return FALSE;
+       }
+
+       if (part) {
+               pbegin = part->raw_data.begin;
+               plen = part->raw_data.len;
+       }
+       else {
+               pbegin = st->pos; /* no enclosing part: use the range recorded in the parser state */
+               plen = st->end - pbegin;
+       }
+       /* The new part is allocated zeroed from task->task_pool */
+       npart = rspamd_mempool_alloc0 (task->task_pool,
+                       sizeof (struct rspamd_mime_part));
+       npart->raw_data.begin = pbegin;
+       npart->raw_data.len = plen;
+       npart->parent = part;
+       npart->ct = sel;
+       /* Headers are shared by reference with the task or with the current part */
+       if (st->cur_part == NULL) {
+               npart->raw_headers = g_hash_table_ref (task->raw_headers);
+       }
+       else {
+               npart->raw_headers = g_hash_table_ref (st->cur_part->raw_headers);
+       }
+       /* Multipart and message parts become cur_part and are pushed on the stack; other parts are decoded directly */
+       if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+               st->cur_part = npart;
+               g_ptr_array_add (st->stack, npart);
+               ret = rspamd_mime_parse_multipart_part (task, npart, st, err);
+       }
+       else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
+               st->cur_part = npart;
+               g_ptr_array_add (st->stack, npart);
+               ret = rspamd_mime_parse_message (task, npart, st, err); /* recurse into the embedded message */
+       }
+       else {
+               ret = rspamd_mime_parse_normal_part (task, npart, st, err);
+       }
+
+       return ret;
+}
+/* Entry point: parse task->msg with a temporary parser state; returns the parser's result, errors go to err */
+gboolean
+rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
+{
+       struct rspamd_mime_parser_stack *st;
+       gboolean ret;
+
+       if (mp_boundary == NULL) {
+               rspamd_mime_parser_init_mp ();
+       }
+
+       st = g_slice_alloc0 (sizeof (*st));
+       st->stack = g_ptr_array_sized_new (4);
+       st->pos = task->raw_headers_content.body_start;
+       st->end = task->msg.begin + task->msg.len;
+       if (st->pos == NULL) {
+               st->pos = task->msg.begin; /* body start not known yet: begin at the message start */
+       }
+
+       ret = rspamd_mime_parse_message (task, NULL, st, err);
+       g_ptr_array_free (st->stack, TRUE); /* frees the array only: parts are allocated from task->task_pool */
+       g_slice_free1 (sizeof (*st), st); /* matches the g_slice_alloc0 above */
+
+       return ret;
+}
diff --git a/src/libmime/mime_parser.h b/src/libmime/mime_parser.h
new file mode 100644 (file)
index 0000000..f1355d7
--- /dev/null
@@ -0,0 +1,25 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_MIME_PARSER_H_
+#define SRC_LIBMIME_MIME_PARSER_H_
+
+#include "config.h"
+
+struct rspamd_task;
+/* Parse the message held in task; returns the parser's gboolean result, failures are reported via err */
+gboolean rspamd_mime_parse_task (struct rspamd_task *task, GError **err);
+
+#endif /* SRC_LIBMIME_MIME_PARSER_H_ */
index fbba759e0c4cedca71ac0c1e110ab371c6bfec14..873d59d22b495e5b7249348a00811c6807a0b6bd 100644 (file)
@@ -1713,7 +1713,7 @@ rspamd_dkim_canonize_header (struct rspamd_dkim_common_ctx *ctx,
        const gchar *dkim_header,
        const gchar *dkim_domain)
 {
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        guint rh_num = 0, i;
        GPtrArray *ar;
 
index 961fcf2916566c0428faee0311807662fb4b11ce..1e713dc86b3c256c5849be139d0843626a444391 100644 (file)
@@ -753,7 +753,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
        guint ret = 0, i, re_id;
        GPtrArray *headerlist;
        GHashTableIter it;
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        const gchar *in, *end;
        const guchar **scvec;
        guint *lenvec;
index 768b9af69ca0a36d4b4fbd36303cc19f4bb62016..be90dff83fde833e0cbbf3a6e3077ad144402aa1 100644 (file)
@@ -35,7 +35,7 @@ static void
 rspamd_stat_tokenize_header (struct rspamd_task *task,
                const gchar *name, const gchar *prefix, GArray *ar)
 {
-       struct raw_header *cur;
+       struct rspamd_mime_header *cur;
        GPtrArray *hdrs;
        guint i;
        rspamd_ftok_t str;
index ea821f488f3187fcb6e24683a891f9e7ffc325b3..8d922757edcdde5d49157352aed019c503604885 100644 (file)
@@ -1426,7 +1426,7 @@ rspamd_lua_push_header (lua_State * L,
                gboolean raw)
 {
 
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        guint i;
        const gchar *val;
 
index 5d30cdcb37a46f15eb13d6e346ddd76b681e056c..7c00f4e90850524b2a26d9b81ec960c574b04a29 100644 (file)
@@ -702,7 +702,7 @@ dkim_symbol_callback (struct rspamd_task *task, void *unused)
        rspamd_dkim_context_t *ctx;
        rspamd_dkim_key_t *key;
        GError *err = NULL;
-       struct raw_header *rh;
+       struct rspamd_mime_header *rh;
        struct dkim_check_result *res = NULL, *cur;
        guint checked = 0, i;