From 99a740ed5d4bbdeee3ede37d6ca77d148d300c13 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 16 Dec 2016 13:44:16 +0000 Subject: [PATCH] [Rework] Start massive rework to get rid of gmime --- src/libmime/content_type.h | 4 + src/libmime/message.c | 786 ++++++------------------------------ src/libmime/message.h | 33 +- src/libmime/mime_encoding.c | 6 +- src/libmime/mime_parser.c | 23 ++ src/libserver/task.c | 4 - src/libserver/task.h | 1 - 7 files changed, 163 insertions(+), 694 deletions(-) diff --git a/src/libmime/content_type.h b/src/libmime/content_type.h index dc5355918..69086fa7f 100644 --- a/src/libmime/content_type.h +++ b/src/libmime/content_type.h @@ -28,6 +28,10 @@ enum rspamd_content_type_flags { RSPAMD_CONTENT_TYPE_MESSAGE = 1 << 3, }; +#define IS_CT_MULTIPART(ct) ((ct)->flags & RSPAMD_CONTENT_TYPE_MULTIPART) +#define IS_CT_TEXT(ct) ((ct)->flags & RSPAMD_CONTENT_TYPE_TEXT) +#define IS_CT_MESSAGE(ct) (((ct)->flags & RSPAMD_CONTENT_TYPE_MESSAGE)) + struct rspamd_content_type_param { rspamd_ftok_t name; rspamd_ftok_t value; diff --git a/src/libmime/message.c b/src/libmime/message.c index b259fdb0c..0d5ea98c0 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -27,6 +27,8 @@ #include "tokenizers/tokenizers.h" #include "cryptobox.h" #include "smtp_parsers.h" +#include "mime_parser.h" +#include "mime_encoding.h" #ifdef WITH_SNOWBALL #include "libstemmer.h" @@ -34,8 +36,6 @@ #include -#define RECURSION_LIMIT 5 -#define UTF8_CHARSET "UTF-8" #define GTUBE_SYMBOL "GTUBE" #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF) @@ -43,15 +43,8 @@ static const gchar gtube_pattern[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*" "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; -static rspamd_regexp_t *utf_compatible_re = NULL; static const guint64 words_hash_seed = 0xdeadbabe; -static GQuark -rspamd_message_quark (void) -{ - return g_quark_from_static_string ("mime-error"); -} - static void free_byte_array_callback (void *pointer) @@ -60,246 +53,6 @@ free_byte_array_callback (void *pointer) g_byte_array_free (arr, TRUE); } -static gboolean -charset_validate (rspamd_mempool_t *pool, const gchar *in, gchar **out) -{ - /* - * This is a simple routine to validate input charset - * we just check that charset starts with alphanumeric and ends - * with alphanumeric - */ - const gchar *begin, *end; - gboolean changed = FALSE, to_uppercase = FALSE; - - begin = in; - - while (!g_ascii_isalnum (*begin)) { - begin ++; - changed = TRUE; - } - - if (g_ascii_islower (*begin)) { - changed = TRUE; - to_uppercase = TRUE; - } - - end = begin + strlen (begin) - 1; - while (!g_ascii_isalnum (*end)) { - end --; - changed = TRUE; - } - - if (!changed) { - *out = (gchar *)in; - } - else { - *out = rspamd_mempool_alloc (pool, end - begin + 2); - if (to_uppercase) { - gchar *o = *out; - - while (begin != end + 1) { - if (g_ascii_islower (*begin)) { - *o++ = g_ascii_toupper (*begin ++); - } - else { - *o++ = *begin++; - } - } - *o = '\0'; - } - else { - rspamd_strlcpy (*out, begin, end - begin + 2); - } - } - - return TRUE; -} - -static const gchar * -charset_heuristic_detection (const gchar *in, rspamd_mempool_t *pool) -{ - gchar *ret = NULL, *h, *t; - - if (strchr (in, '-') != NULL) { - /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */ - ret = rspamd_mempool_strdup (pool, in); - - h = ret; - t = ret; - - while (*h != '\0') { - if (*h != '-') { - *t++ = *h; - } - - h ++; - } - - *t = '\0'; - - return ret; - } - - return in; -} - -static GQuark -converter_error_quark (void) -{ - return g_quark_from_static_string ("conversion error"); -} - -static gchar * -rspamd_text_to_utf8 (struct rspamd_task *task, - gchar *input, gsize len, const gchar *in_enc, - gsize *olen, GError **err) -{ - gchar *s, *d; - gsize outlen; - iconv_t ic; - rspamd_fstring_t *dst; - gsize remain, ret, inremain = len; - - ic = iconv_open (UTF8_CHARSET, in_enc); - - if (ic == (iconv_t)-1) { - in_enc = charset_heuristic_detection (in_enc, task->task_pool); - - ic = iconv_open (UTF8_CHARSET, in_enc); - - if (ic == (iconv_t)-1) { - g_set_error (err, converter_error_quark(), EINVAL, - "cannot open iconv for: %s", in_enc); - - return NULL; - } - } - - /* Preallocate for half of characters to be converted */ - outlen = len + len / 2 + 1; - dst = rspamd_fstring_sized_new (outlen); - s = input; - d = dst->str; - remain = outlen - 1; - - while (inremain > 0 && remain > 0) { - ret = iconv (ic, &s, &inremain, &d, &remain); - dst->len = d - dst->str; - - if (ret == (gsize)-1) { - switch (errno) { - case E2BIG: - /* Enlarge string */ - if (inremain > 0) { - dst = rspamd_fstring_grow (dst, inremain * 2); - d = dst->str + dst->len; - remain = dst->allocated - dst->len - 1; - } - break; - case EILSEQ: - case EINVAL: - /* Ignore bad characters */ - if (remain > 0 && inremain > 0) { - *d++ = '?'; - s++; - inremain --; - remain --; - } - break; - } - } - else if (ret == 0) { - break; - } - } - - *d = '\0'; - *olen = dst->len; - iconv_close (ic); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)rspamd_fstring_free, dst); - msg_info_task ("converted from %s to UTF-8 inlen: %z, outlen: %z", - in_enc, len, dst->len); - - return dst->str; -} - - -static GByteArray * -convert_text_to_utf (struct rspamd_task *task, - GByteArray * part_content, - GMimeContentType * type, - struct rspamd_mime_text_part *text_part) -{ - GError *err = NULL; - gsize write_bytes; - const gchar *charset; - gchar *res_str, *ocharset; - GByteArray *result_array; - - if (task->cfg && task->cfg->raw_mode) { - SET_PART_RAW (text_part); - return part_content; - } - - if (utf_compatible_re == NULL) { - utf_compatible_re = rspamd_regexp_new ( - "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:us)|(?:ISO-8859-1)|" - "(?:latin.*)|(?:CSASCII)$", - "i", NULL); - } - - if ((charset = - g_mime_content_type_get_parameter (type, "charset")) == NULL) { - SET_PART_RAW (text_part); - return part_content; - } - if (!charset_validate (task->task_pool, charset, &ocharset)) { - msg_info_task ( - "<%s>: has invalid charset", - task->message_id); - SET_PART_RAW (text_part); - return part_content; - } - - if (rspamd_regexp_match (utf_compatible_re, ocharset, strlen (ocharset), TRUE)) { - if (g_utf8_validate (part_content->data, part_content->len, NULL)) { - SET_PART_UTF (text_part); - return part_content; - } - else { - msg_info_task ( - "<%s>: contains invalid utf8 characters, assume it as raw", - task->message_id); - SET_PART_RAW (text_part); - return part_content; - } - } - else { - res_str = rspamd_text_to_utf8 (task, part_content->data, - part_content->len, - ocharset, - &write_bytes, - &err); - if (res_str == NULL) { - msg_warn_task ("<%s>: cannot convert from %s to utf8: %s", - task->message_id, - ocharset, - err ? err->message : "unknown problem"); - SET_PART_RAW (text_part); - g_error_free (err); - return part_content; - } - } - - result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray)); - result_array->data = res_str; - result_array->len = write_bytes; - SET_PART_UTF (text_part); - - return result_array; -} - struct language_match { const char *code; const char *name; @@ -650,59 +403,46 @@ exceptions_compare_func (gconstpointer a, gconstpointer b) } static void -process_text_part (struct rspamd_task *task, - GByteArray *part_content, - GMimeContentType *type, - struct rspamd_mime_part *mime_part, - GMimeObject *parent, - gboolean is_empty) +rspamd_message_process_text_part (struct rspamd_task *task, + struct rspamd_mime_part *mime_part) { struct rspamd_mime_text_part *text_part; - const gchar *cd; - GMimeStream *st; + rspamd_ftok_t html_tok, xhtml_tok; + GByteArray *part_content; /* Skip attachments */ -#ifndef GMIME24 - cd = g_mime_part_get_content_disposition (GMIME_PART (rspamd_mime_part->mime)); - if (cd && - g_ascii_strcasecmp (cd, - "attachment") == 0 && (task->cfg && !task->cfg->check_text_attachements)) { - debug_task ("skip attachments for checking as text parts"); - return; - } -#else - cd = g_mime_object_get_disposition (GMIME_OBJECT (mime_part->mime)); - if (cd && - g_ascii_strcasecmp (cd, - GMIME_DISPOSITION_ATTACHMENT) == 0 && + if (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT && (task->cfg && !task->cfg->check_text_attachements)) { debug_task ("skip attachments for checking as text parts"); return; } -#endif - if (g_mime_content_type_is_type (type, "text", - "html") || g_mime_content_type_is_type (type, "text", "xhtml")) { + html_tok.begin = "html"; + html_tok.len = 4; + xhtml_tok.begin = "xhtml"; + xhtml_tok.len = 5; - text_part = - rspamd_mempool_alloc0 (task->task_pool, + if (rspamd_ftok_cmp (&mime_part->ct->subtype, &html_tok) == 0 || + rspamd_ftok_cmp (&mime_part->ct->subtype, &xhtml_tok) == 0) { + + text_part = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_mime_text_part)); + text_part->raw.begin = mime_part->raw_data.begin; + text_part->raw.len = mime_part->raw_data.len; + text_part->parsed.begin = mime_part->parsed_data.begin; + text_part->parsed.len = mime_part->parsed_data.len; text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; - if (is_empty) { + + if (mime_part->parsed_data.len == 0) { text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - text_part->orig = NULL; - text_part->content = NULL; g_ptr_array_add (task->text_parts, text_part); return; } - text_part->orig = part_content; - part_content = convert_text_to_utf (task, - text_part->orig, - type, - text_part); + + part_content = rspamd_mime_text_part_maybe_convert (task, text_part); + text_part->html = rspamd_mempool_alloc0 (task->task_pool, sizeof (*text_part->html)); - text_part->parent = parent; text_part->mime_part = mime_part; text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; @@ -723,35 +463,31 @@ process_text_part (struct rspamd_task *task, text_part->content); g_ptr_array_add (task->text_parts, text_part); } - else if (g_mime_content_type_is_type (type, "text", "*")) { + else { text_part = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_mime_text_part)); - text_part->parent = parent; text_part->mime_part = mime_part; + text_part->raw.begin = mime_part->raw_data.begin; + text_part->raw.len = mime_part->raw_data.len; + text_part->parsed.begin = mime_part->parsed_data.begin; + text_part->parsed.len = mime_part->parsed_data.len; - if (is_empty) { + if (mime_part->parsed_data.len == 0) { text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - text_part->orig = NULL; - text_part->content = NULL; g_ptr_array_add (task->text_parts, text_part); return; } - text_part->content = convert_text_to_utf (task, - part_content, - type, + text_part->content = rspamd_mime_text_part_maybe_convert (task, text_part); - text_part->orig = part_content; g_ptr_array_add (task->text_parts, text_part); } - else { - return; - } + mime_part->flags |= RSPAMD_MIME_PART_TEXT; - mime_part->specific_data = text_part; + mime_part->specific.txt = text_part; if (rspamd_check_gtube (task, text_part)) { struct rspamd_metric_result *mres; @@ -790,356 +526,62 @@ process_text_part (struct rspamd_task *task, } rspamd_extract_words (task, text_part); - - if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)) { - text_part->raw = g_byte_array_sized_new (part_content->len * 1.5 + 0.5); - st = g_mime_stream_mem_new_with_byte_array (text_part->raw); - g_mime_object_write_to_stream (GMIME_OBJECT (mime_part->mime), st); - g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (st), FALSE); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) free_byte_array_callback, - text_part->raw); - } - else { - text_part->raw = NULL; - } -} - -struct mime_foreach_data { - struct rspamd_task *task; - guint parser_recursion; - GMimeObject *parent; -}; - -#ifdef GMIME24 -static void -mime_foreach_callback (GMimeObject * parent, - GMimeObject * part, - gpointer user_data) -#else -static void -mime_foreach_callback (GMimeObject * part, gpointer user_data) -#endif -{ - struct mime_foreach_data *md = user_data; - struct rspamd_task *task; - struct rspamd_mime_part *mime_part; - GMimeContentType *type; - GMimeDataWrapper *wrapper; - GMimeStream *part_stream; - GByteArray *part_content; - gchar *hdrs; - /* Blake2b applied to string 'rspamd' */ - static const guchar hash_key[] = { - 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c, - 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74, - 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32, - 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92, - 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0, - 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef, - 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74, - 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe, - }; - - task = md->task; - /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */ - - /* find out what class 'part' is... */ - if (GMIME_IS_MESSAGE_PART (part)) { - /* message/rfc822 or message/news */ - GMimeMessage *message; - - /* g_mime_message_foreach_part() won't descend into - child message parts, so if we want to count any - subparts of this child message, we'll have to call - g_mime_message_foreach_part() again here. */ - - message = g_mime_message_part_get_message ((GMimeMessagePart *) part); - if (md->parser_recursion++ < RECURSION_LIMIT) { -#ifdef GMIME24 - g_mime_message_foreach (message, mime_foreach_callback, md); -#else - g_mime_message_foreach_part (message, mime_foreach_callback, md); -#endif - } - else { - msg_err_task ("too deep mime recursion detected: %d", md->parser_recursion); - return; - } -#ifndef GMIME24 - g_object_unref (message); -#endif - } - else if (GMIME_IS_MESSAGE_PARTIAL (part)) { - /* message/partial */ - - /* this is an incomplete message part, probably a - large message that the sender has broken into - smaller parts and is sending us bit by bit. we - could save some info about it so that we could - piece this back together again once we get all the - parts? */ - } - else if (GMIME_IS_MULTIPART (part)) { - /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */ -#ifndef GMIME24 - debug_task ("detected multipart part"); - /* we'll get to finding out if this is a signed/encrypted multipart later... */ - if (task->parser_recursion++ < RECURSION_LIMIT) { - g_mime_multipart_foreach ((GMimeMultipart *) part, - mime_foreach_callback, - md); - } - else { - msg_err_task ("endless recursion detected: %d", task->parser_recursion); - return; - } -#endif - type = (GMimeContentType *) g_mime_object_get_content_type (GMIME_OBJECT ( - part)); - mime_part = rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct rspamd_mime_part)); - - if (md->parent != NULL) { - hdrs = g_mime_object_get_headers (GMIME_OBJECT (part)); - } - else { - hdrs = NULL; - } - - mime_part->raw_headers = g_hash_table_new_full (rspamd_strcase_hash, - rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard); - - if (hdrs != NULL) { - rspamd_mime_headers_process (task, mime_part->raw_headers, - hdrs, strlen (hdrs), FALSE); - mime_part->raw_headers_str = hdrs; - rspamd_mempool_add_destructor (task->task_pool, g_free, hdrs); - } - - mime_part->type = type; - /* XXX: we don't need it, but it's sometimes dereferenced */ - mime_part->content = g_byte_array_new (); - mime_part->parent = md->parent; - mime_part->filename = NULL; - mime_part->mime = part; - mime_part->boundary = g_mime_multipart_get_boundary (GMIME_MULTIPART (part)); - - if (mime_part->boundary) { - rspamd_cryptobox_hash (mime_part->digest, mime_part->boundary, - strlen (mime_part->boundary), hash_key, sizeof (hash_key)); - } - - debug_task ("found part with content-type: %s/%s", - type->type, - type->subtype); - g_ptr_array_add (task->parts, mime_part); - - md->parent = part; - } - else if (GMIME_IS_PART (part)) { - /* a normal leaf part, could be text/plain or image/jpeg etc */ -#ifdef GMIME24 - type = (GMimeContentType *) g_mime_object_get_content_type (GMIME_OBJECT ( - part)); -#else - type = - (GMimeContentType *) g_mime_part_get_content_type (GMIME_PART (part)); -#endif - - if (type == NULL) { - msg_warn_task ("type of part is unknown, assume text/plain"); - type = g_mime_content_type_new ("text", "plain"); -#ifdef GMIME24 - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_object_unref, type); -#else - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_mime_content_type_destroy, type); -#endif - } - wrapper = g_mime_part_get_content_object (GMIME_PART (part)); -#ifdef GMIME24 - if (wrapper != NULL && GMIME_IS_DATA_WRAPPER (wrapper)) { -#else - if (wrapper != NULL) { -#endif - part_stream = g_mime_stream_mem_new (); - if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) { - - g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (part_stream), - FALSE); - part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM ( - part_stream)); - g_object_unref (part_stream); - mime_part = - rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct rspamd_mime_part)); - - hdrs = g_mime_object_get_headers (GMIME_OBJECT (part)); - mime_part->raw_headers = g_hash_table_new_full (rspamd_strcase_hash, - rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard); - - if (hdrs != NULL) { - rspamd_mime_headers_process (task, mime_part->raw_headers, - hdrs, strlen (hdrs), FALSE); - mime_part->raw_headers_str = hdrs; - rspamd_mempool_add_destructor (task->task_pool, g_free, hdrs); - } - - mime_part->type = type; - mime_part->content = part_content; - mime_part->parent = md->parent; - mime_part->filename = g_mime_part_get_filename (GMIME_PART ( - part)); - mime_part->mime = part; - - if (mime_part->content->len > 0) { - rspamd_cryptobox_hash (mime_part->digest, - mime_part->content->data, mime_part->content->len, - hash_key, sizeof (hash_key)); - } - - debug_task ("found part with content-type: %s/%s", - type->type, - type->subtype); - g_ptr_array_add (task->parts, mime_part); - /* Skip empty parts */ - process_text_part (task, - part_content, - type, - mime_part, - md->parent, - (part_content->len <= 0)); - } - else { - msg_warn_task ("write to stream failed: %d, %s", errno, - strerror (errno)); - } -#ifndef GMIME24 - g_object_unref (wrapper); -#endif - } - else { - msg_warn_task ("cannot get wrapper for mime part, type of part: %s/%s", - type->type, - type->subtype); - } - } - else { - g_assert_not_reached (); - } -} - -static void -destroy_message (void *pointer) -{ - GMimeMessage *msg = pointer; - - g_object_unref (msg); } /* Creates message from various data using libmagic to detect type */ static void -rspamd_message_from_data (struct rspamd_task *task, GByteArray *data, - GMimeStream *stream) +rspamd_message_from_data (struct rspamd_task *task, const guchar *start, + gsize len) { - GMimeMessage *message; - GMimePart *part; - GMimeDataWrapper *wrapper; - GMimeContentType *ct = NULL; + struct rspamd_content_type *ct = NULL; + struct rspamd_mime_part *part; const char *mb = NULL; gchar *mid; rspamd_ftok_t srch, *tok; - struct rspamd_email_address *addr; - - g_assert (data != NULL); - message = g_mime_message_new (TRUE); - task->message = message; - if (task->from_envelope) { - addr = rspamd_task_get_sender (task); - - if (addr->addr_len > 0) { - srch.begin = addr->addr; - srch.len = addr->addr_len; - g_mime_message_set_sender (task->message, - rspamd_mempool_ftokdup (task->task_pool, &srch)); - } - } + g_assert (start != NULL); tok = rspamd_task_get_request_header (task, "Content-Type"); if (tok) { /* We have Content-Type defined */ - gchar *ct_cpy = g_malloc (tok->len + 1); - - rspamd_strlcpy (ct_cpy, tok->begin, tok->len + 1); - ct = g_mime_content_type_new_from_string (ct_cpy); - g_free (ct_cpy); + ct = rspamd_content_type_parse (tok->begin, tok->len, + task->task_pool); } else if (task->cfg && task->cfg->libs_ctx) { /* Try to predict it by content (slow) */ mb = magic_buffer (task->cfg->libs_ctx->libmagic, - data->data, - data->len); + start, + len); if (mb) { - ct = g_mime_content_type_new_from_string (mb); + srch.begin = mb; + srch.len = strlen (mb); + ct = rspamd_content_type_parse (srch.begin, srch.len, + task->task_pool); } } msg_warn_task ("construct fake mime of type: %s", mb); - - part = g_mime_part_new (); - - if (ct != NULL) { - g_mime_object_set_content_type (GMIME_OBJECT (part), ct); - g_object_unref (ct); - } - -#ifdef GMIME24 - wrapper = g_mime_data_wrapper_new_with_stream (stream, - GMIME_CONTENT_ENCODING_8BIT); -#else - wrapper = g_mime_data_wrapper_new_with_stream (stream, - GMIME_PART_ENCODING_8BIT); -#endif - - g_mime_part_set_content_object (part, wrapper); - g_mime_message_set_mime_part (task->message, GMIME_OBJECT (part)); - /* Register destructors */ - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_object_unref, wrapper); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_object_unref, part); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) destroy_message, task->message); + part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part)); + part->ct = ct; + part->raw_data.begin = start; + part->raw_data.len = len; + part->parsed_data.begin = start; + part->parsed_data.len = len; /* Generate message ID */ mid = g_mime_utils_generate_message_id ("localhost.localdomain"); rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) g_free, mid); - g_mime_message_set_message_id (task->message, mid); task->message_id = mid; task->queue_id = mid; - - /* Set headers for message */ - if (task->subject) { - g_mime_message_set_subject (task->message, task->subject); - } } gboolean rspamd_message_parse (struct rspamd_task *task) { - GMimeMessage *message; - GMimeParser *parser; - GMimeStream *stream; - GByteArray *tmp; GPtrArray *hdrs; - GMimeObject *parent; - const GMimeContentType *ct; struct rspamd_mime_header *rh; struct rspamd_mime_text_part *p1, *p2; struct mime_foreach_data md; @@ -1150,6 +592,7 @@ rspamd_message_parse (struct rspamd_task *task) gint i; gdouble diff, *pdiff; guint tw, *ptw, dw; + GError *err = NULL; rspamd_cryptobox_hash_state_t st; guchar digest_out[rspamd_cryptobox_HASHBYTES]; @@ -1159,7 +602,6 @@ rspamd_message_parse (struct rspamd_task *task) return TRUE; } - tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray)); p = task->msg.begin; len = task->msg.len; @@ -1197,30 +639,20 @@ rspamd_message_parse (struct rspamd_task *task) } } - tmp->data = (guint8 *)p; - tmp->len = len; task->msg.begin = p; task->msg.len = len; - - stream = g_mime_stream_mem_new_with_byte_array (tmp); - /* - * This causes g_mime_stream not to free memory by itself as it is memory allocated by - * pool allocator - */ - g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE); rspamd_cryptobox_hash_init (&st, NULL, 0); if (task->flags & RSPAMD_TASK_FLAG_MIME) { debug_task ("construct mime parser from string length %d", (gint) task->msg.len); - /* create a new parser object to parse the stream */ - parser = g_mime_parser_new_with_stream (stream); - - /* parse the message from the stream */ - message = g_mime_parser_construct_message (parser); + if (!rspamd_mime_parse_task (task, &err)) { + msg_err_task ("cannot construct mime from stream: %e", err); - if (message == NULL) { + if (err) { + g_error_free (err); + } if (task->cfg && (!task->cfg->allow_raw_input)) { msg_err_task ("cannot construct mime from stream"); g_set_error (&task->err, @@ -1228,26 +660,22 @@ rspamd_message_parse (struct rspamd_task *task) RSPAMD_FILTER_ERROR, \ "cannot parse MIME in the message"); - g_object_unref (parser); return FALSE; } else { task->flags &= ~RSPAMD_TASK_FLAG_MIME; - rspamd_message_from_data (task, tmp, stream); + rspamd_message_from_data (task, p, len); } } else { GString str; - task->message = message; - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) destroy_message, task->message); - str.str = tmp->data; - str.len = tmp->len; + str.str = p; + str.len = len; hdr_pos = rspamd_string_find_eoh (&str, &body_pos); - if (hdr_pos > 0 && hdr_pos < tmp->len) { + if (hdr_pos > 0 && hdr_pos < len) { task->raw_headers_content.begin = (gchar *) (p); task->raw_headers_content.len = hdr_pos; task->raw_headers_content.body_start = p + body_pos; @@ -1260,43 +688,43 @@ rspamd_message_parse (struct rspamd_task *task) } } } - - /* free the parser (and the stream) */ - g_object_unref (stream); - g_object_unref (parser); } else { task->flags &= ~RSPAMD_TASK_FLAG_MIME; - rspamd_message_from_data (task, tmp, stream); - g_object_unref (stream); + rspamd_message_from_data (task, p, len); } /* Save message id for future use */ - task->message_id = g_mime_message_get_message_id (task->message); + hdrs = rspamd_message_get_header_array (task, "Message-ID", FALSE); + + if (hdrs) { + rh = g_ptr_array_index (hdrs, 0); + task->message_id = rh->decoded; + } + if (task->message_id == NULL) { task->message_id = "undef"; } memset (&md, 0, sizeof (md)); md.task = task; -#ifdef GMIME24 - g_mime_message_foreach (task->message, mime_foreach_callback, &md); -#else - /* - * This is rather strange, but gmime 2.2 do NOT pass top-level part to foreach callback - * so we need to set up parent part by hands - */ - md.parent = g_mime_message_get_mime_part (task->message); - g_object_unref (md.parent); - g_mime_message_foreach_part (task->message, mime_foreach_callback, &md); -#endif debug_task ("found %ud parts in message", task->parts->len); if (task->queue_id == NULL) { task->queue_id = "undef"; } + for (i = 0; i < task->parts->len; i ++) { + struct rspamd_mime_part *part; + + part = g_ptr_array_index (task->parts, i); + + if (IS_CT_TEXT (part->ct)) { + rspamd_message_process_text_part (task, part); + } + } + rspamd_images_process (task); rspamd_archives_process (task); @@ -1396,19 +824,33 @@ rspamd_message_parse (struct rspamd_task *task) } /* Set mime recipients and sender for the task */ - task->rcpt_mime = g_mime_message_get_all_recipients (task->message); - if (task->rcpt_mime) { -#ifdef GMIME24 - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) g_object_unref, - task->rcpt_mime); -#else - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) internet_address_list_destroy, - task->rcpt_mime); -#endif + /* TODO: kill it with fire */ + task->rcpt_mime = internet_address_list_new (); + + static const gchar *to_hdrs[] = {"To", "Cc", "Bcc"}; + + for (int k = 0; k < G_N_ELEMENTS (to_hdrs); k ++) { + hdrs = rspamd_message_get_header_array (task, to_hdrs[k], FALSE); + if (hdrs && hdrs->len > 0) { + + InternetAddress *tia; + + for (i = 0; i < hdrs->len; i ++) { + rh = g_ptr_array_index (hdrs, i); + + tia = internet_address_list_parse_string (rh->decoded); + + if (tia) { + internet_address_list_append (task->rcpt_mime, tia); + g_object_unref (tia); + } + } + } } + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) g_object_unref, + task->rcpt_mime); hdrs = rspamd_message_get_header_array (task, "From", FALSE); @@ -1444,16 +886,14 @@ rspamd_message_parse (struct rspamd_task *task) p2 = g_ptr_array_index (task->text_parts, 1); /* First of all check parent object */ - if (p1->parent && p1->parent == p2->parent) { - parent = p1->parent; - ct = g_mime_object_get_content_type (parent); - if (ct == NULL || - !g_mime_content_type_is_type ((GMimeContentType *)ct, - "multipart", "alternative")) { - debug_task ( - "two parts are not belong to multipart/alternative container, skip check"); - } - else { + if (p1->mime_part->parent_part && + p1->mime_part->parent_part == p2->mime_part->parent_part) { + rspamd_ftok_t srch; + + srch.begin = "alternative"; + srch.len = 11; + + if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) { if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) && p1->normalized_hashes && p2->normalized_hashes) { diff --git a/src/libmime/message.h b/src/libmime/message.h index b6a1f998f..cd5911927 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -33,25 +33,30 @@ enum rspamd_cte { RSPAMD_CTE_B64 = 4, }; +struct rspamd_mime_text_part; + struct rspamd_mime_part { - GMimeContentType *type; struct rspamd_content_type *ct; + struct rspamd_content_disposition *cd; rspamd_ftok_t raw_data; rspamd_ftok_t parsed_data; struct rspamd_mime_part *parent_part; - GPtrArray *children; - enum rspamd_cte cte; - GByteArray *content; - GMimeObject *parent; - GMimeObject *mime; GHashTable *raw_headers; gchar *raw_headers_str; gsize raw_headers_len; - guchar digest[rspamd_cryptobox_HASHBYTES]; - const gchar *filename; - const gchar *boundary; - gpointer specific_data; + enum rspamd_cte cte; + + union { + struct rspamd_mime_multipart *mp; + struct rspamd_mime_text_part *txt; + } specific; + enum rspamd_mime_part_flags flags; + guchar digest[rspamd_cryptobox_HASHBYTES]; +}; + +struct rspamd_mime_multipart { + GPtrArray *children; }; #define RSPAMD_MIME_TEXT_PART_FLAG_UTF (1 << 0) @@ -70,19 +75,17 @@ struct rspamd_mime_text_part { const gchar *lang_code; const gchar *language; const gchar *real_charset; - GByteArray *raw; /**< undecoded mime part */ - GByteArray *orig; + rspamd_ftok_t raw; + rspamd_ftok_t parsed; GByteArray *content; - GByteArray *stripped_content; /**< no newlines or html tags */ + GByteArray *stripped_content; GPtrArray *newlines; /**< positions of newlines in text */ struct html_content *html; GList *exceptions; /**< list of offsets of urls */ - GMimeObject *parent; struct rspamd_mime_part *mime_part; GArray *normalized_words; GArray *normalized_hashes; guint nlines; - guint64 hash; }; enum rspamd_received_type { diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index d2081e6c0..5629e1ee4 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -206,9 +206,13 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, gsize write_bytes; const gchar *charset; gchar *res_str; - GByteArray *result_array, *part_content = text_part->orig; + GByteArray *result_array, *part_content; struct rspamd_mime_part *part = text_part->mime_part; + part_content = rspamd_mempool_alloc0 (sizeof (GByteArray)); + part_content->data = (guint8 *)text_part->parsed.begin; + part_content->len = text_part->parsed.len; + if (task->cfg && task->cfg->raw_mode) { SET_PART_RAW (text_part); return part_content; diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c index 6fa78b3e0..7586a803e 100644 --- a/src/libmime/mime_parser.c +++ b/src/libmime/mime_parser.c @@ -234,6 +234,28 @@ rspamd_mime_part_get_cte (struct rspamd_task *task, struct rspamd_mime_part *par } } +static void +rspamd_mime_parser_calc_digest (struct rspamd_mime_part *part) +{ + /* Blake2b applied to string 'rspamd' */ + static const guchar hash_key[] = { + 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c, + 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74, + 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32, + 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92, + 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0, + 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef, + 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74, + 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe, + }; + + if (part->parsed_data.len > 0) { + rspamd_cryptobox_hash (part->digest, + part->parsed_data.begin, part->parsed_data.len, + hash_key, sizeof (hash_key)); + } +} + static gboolean rspamd_mime_parse_normal_part (struct rspamd_task *task, struct rspamd_mime_part *part, @@ -282,6 +304,7 @@ rspamd_mime_parse_normal_part (struct rspamd_task *task, msg_debug_mime ("parsed data part %T/%T of length %z (%z orig), %s cte", &part->ct->type, &part->ct->subtype, part->parsed_data.len, part->raw_data.len, rspamd_cte_to_string (part->cte)); + rspamd_mime_parser_calc_digest (part); return TRUE; } diff --git a/src/libserver/task.c b/src/libserver/task.c index bcf66aee8..cee604acd 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -200,10 +200,6 @@ rspamd_task_free (struct rspamd_task *task) for (i = 0; i < task->parts->len; i ++) { p = g_ptr_array_index (task->parts, i); - if (p->content) { - g_byte_array_free (p->content, TRUE); - } - if (p->raw_headers) { g_hash_table_unref (p->raw_headers); } diff --git a/src/libserver/task.h b/src/libserver/task.h index 341edcc7d..e7df63719 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -141,7 +141,6 @@ struct rspamd_task { rspamd_ftok_t msg; /**< message buffer */ struct rspamd_http_connection *http_conn; /**< HTTP server connection */ struct rspamd_async_session * s; /**< async session object */ - GMimeMessage *message; /**< message, parsed with GMime */ GPtrArray *parts; /**< list of parsed parts */ GPtrArray *text_parts; /**< list of text parts */ struct { -- 2.39.5