/*
 * Copyright 2024 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "config.h"
#include "task.h"
#include "mime_parser.h"
#include "mime_headers.h"
#include "message.h"
#include "multipattern.h"
#include "contrib/libottery/ottery.h"
#include "contrib/uthash/utlist.h"
/*
 * NOTE(review): the targets of the next two includes are missing in this
 * copy of the file - presumably the OpenSSL CMS/PKCS7 headers needed by the
 * S/MIME code below; restore them from the upstream source.
 */
#include
#include
#include "rspamd_simdutf.h"

/* Process-wide state of the MIME parser, allocated by rspamd_mime_parser_init_lib() */
struct rspamd_mime_parser_lib_ctx {
	/* Multipattern matching "\r--" and "\n--", i.e. candidate boundary lines */
	struct rspamd_multipattern *mp_boundary;
	unsigned char hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
	unsigned int key_usages;
};

struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;

/* Nesting levels above this value make the multipart/message parsers fail */
static const unsigned int max_nested = 64;
static const unsigned int max_key_usages = 10000;

/* Debug logging helper; expects a `task` variable in the calling scope */
#define msg_debug_mime(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
	rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
	RSPAMD_LOG_FUNC, \
	__VA_ARGS__)

INIT_LOG_MODULE(mime)

#define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
#define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)

/* A boundary-like line found by rspamd_mime_preprocess_cb() */
struct rspamd_mime_boundary {
	goffset boundary;     /* Offset (from parser start) of the leading "--" */
	goffset start;        /* Offset (from parser start) of the data following the boundary line */
	uint64_t hash;        /* Siphash of the lowercased boundary without a trailing "--" */
	uint64_t closed_hash; /* Siphash including the trailing "--"; 0 when there is none */
	int flags;            /* RSPAMD_MIME_BOUNDARY_FLAG_* */
};

struct rspamd_mime_parser_ctx {
	GPtrArray *stack;   /* Stack of parts */
	GArray *boundaries; /* Boundaries found in the whole message */
	const char *start;
	const char *pos;
	const char *end;
	struct rspamd_task *task;
	unsigned int nesting; /* Current nesting depth, checked against max_nested */
};

static enum rspamd_mime_parse_error
rspamd_mime_parse_multipart_part(struct rspamd_task *task,
								 struct rspamd_mime_part *part,
								 struct rspamd_mime_parser_ctx *st,
								 GError **err);
static enum rspamd_mime_parse_error
rspamd_mime_parse_message(struct rspamd_task *task,
						  struct rspamd_mime_part *part,
						  struct rspamd_mime_parser_ctx *st,
						  GError **err);
static enum rspamd_mime_parse_error
rspamd_mime_parse_normal_part(struct rspamd_task *task,
							  struct rspamd_mime_part *part,
							  struct rspamd_mime_parser_ctx *st,
							  struct rspamd_content_type *ct,
							  GError **err);
static enum rspamd_mime_parse_error
rspamd_mime_process_multipart_node(struct rspamd_task *task,
								   struct rspamd_mime_parser_ctx *st,
								   struct rspamd_mime_part *multipart,
								   const char *start, const char *end,
								   gboolean is_finished,
								   GError **err);

#define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())

/* Error domain used for GError objects raised by this parser */
static GQuark
rspamd_mime_parser_quark(void)
{
	return g_quark_from_static_string("mime-parser");
}

/**
 * Returns a static, human readable name for a transfer encoding.
 * Values that are not listed explicitly yield "unknown".
 */
const char *
rspamd_cte_to_string(enum rspamd_cte ct)
{
	const char *ret = "unknown";

	switch (ct) {
	case RSPAMD_CTE_7BIT:
		ret = "7bit";
		break;
	case RSPAMD_CTE_8BIT:
		ret = "8bit";
		break;
	case RSPAMD_CTE_QP:
		ret = "quoted-printable";
		break;
	case RSPAMD_CTE_B64:
		ret = "base64";
		break;
	case RSPAMD_CTE_UUE:
		ret = "X-uuencode";
		break;
	default:
		break;
	}

	return ret;
}

/**
 * Maps a transfer encoding name to its enum value.
 * The comparison is exact (case sensitive); `str` must not be NULL.
 * Unrecognised names yield RSPAMD_CTE_UNKNOWN.
 */
enum rspamd_cte
rspamd_cte_from_string(const char *str)
{
	enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;

	g_assert(str != NULL);

	if (strcmp(str, "7bit") == 0) {
		ret = RSPAMD_CTE_7BIT;
	}
	else if (strcmp(str, "8bit") == 0) {
		ret = RSPAMD_CTE_8BIT;
	}
	else if (strcmp(str, "quoted-printable") == 0) {
		ret = RSPAMD_CTE_QP;
	}
	else if (strcmp(str, "base64") == 0) {
		ret = RSPAMD_CTE_B64;
	}
	else if (strcmp(str, "X-uuencode") == 0) {
		ret = RSPAMD_CTE_UUE;
	}
	else if (strcmp(str, "uuencode") == 0) {
		ret = RSPAMD_CTE_UUE;
	}
	else if (strcmp(str, "X-uue") == 0) {
		ret = RSPAMD_CTE_UUE;
	}

	return ret;
}

/**
 * Allocates the global lib_ctx, compiles the boundary multipattern
 * ("\r--" and "\n--") and fills the hashing key with random bytes.
 * Aborts the process if the multipattern cannot be compiled.
 */
static void
rspamd_mime_parser_init_lib(void)
{
	lib_ctx = g_malloc0(sizeof(*lib_ctx));
	lib_ctx->mp_boundary = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
	g_assert(lib_ctx->mp_boundary != NULL);
	rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\r--", 0);
	rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\n--", 0);

	GError *err = NULL;

	if (!rspamd_multipattern_compile(lib_ctx->mp_boundary, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err)) {
		msg_err("fatal error: cannot compile multipattern for mime parser boundaries: %e", err);
		g_error_free(err);
		g_abort();
	}

	ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
}

/**
 * Recognises a Content-Transfer-Encoding header value.
 * The value is stripped of surrounding characters from the set passed to
 * rspamd_string_len_strip() and its hash is compared with precomputed
 * constants (the string each constant stands for is given in the comment
 * next to it). The visible caller lowercases the value beforehand.
 */
static enum rspamd_cte
rspamd_mime_parse_cte(const char *in, gsize len)
{
	uint64_t h;
	enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;

	in = rspamd_string_len_strip(in, &len, " \t;,.+-#!`~'");
	h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
											in, len, 0xdeadbabe);

	switch (h) {
	case 0xCEDAA7056B4753F7ULL: /* 7bit */
		ret = RSPAMD_CTE_7BIT;
		break;
	case 0x42E0745448B39FC1ULL: /* 8bit */
	case 0x6B169E6B155BADC0ULL: /* binary */
		ret = RSPAMD_CTE_8BIT;
		break;
	case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
		ret = RSPAMD_CTE_QP;
		break;
	case 0x96305588A76DC9A9ULL: /* base64 */
	case 0x171029DE1B0423A9ULL: /* base-64 */
		ret = RSPAMD_CTE_B64;
		break;
	case 0x420b54dc00d13cecULL: /* uuencode */
	case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
	case 0x41f725ec544356d3ULL: /* x-uue */
		ret =
RSPAMD_CTE_UUE; break; } return ret; } static enum rspamd_cte rspamd_mime_part_get_cte_heuristic(struct rspamd_task *task, struct rspamd_mime_part *part) { const unsigned int check_len = 128; unsigned int real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0, padeqsign = 0, nupper = 0, nlower = 0; gboolean b64_chars = TRUE; const unsigned char *p, *end; enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN; real_len = MIN(check_len, part->raw_data.len); p = (const unsigned char *) part->raw_data.begin; end = p + part->raw_data.len; while (p < end && g_ascii_isspace(*p)) { p++; } if (end - p > sizeof("begin-base64 ")) { const unsigned char *uue_start; if (memcmp(p, "begin ", sizeof("begin ") - 1) == 0) { uue_start = p + sizeof("begin ") - 1; while (uue_start < end && g_ascii_isspace(*uue_start)) { uue_start++; } if (uue_start < end && g_ascii_isdigit(*uue_start)) { return RSPAMD_CTE_UUE; } } else if (memcmp(p, "begin-base64 ", sizeof("begin-base64 ") - 1) == 0) { uue_start = p + sizeof("begin ") - 1; while (uue_start < end && g_ascii_isspace(*uue_start)) { uue_start++; } if (uue_start < end && g_ascii_isdigit(*uue_start)) { return RSPAMD_CTE_UUE; } } } /* Skip trailing spaces */ while (end > p && g_ascii_isspace(*(end - 1))) { end--; } if (end > p + 2) { if (*(end - 1) == '=') { padeqsign++; end--; } if (*(end - 1) == '=') { padeqsign++; end--; } } /* Adjust end to analyse only first characters */ if (end - p > real_len) { end = p + real_len; } while (p < end) { if (*p == ' ') { nspaces++; } else if (*p == '=') { b64_chars = FALSE; /* Eqsign must not be inside base64 */ neqsign++; p++; if (p + 2 < end && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*(p + 1))) { p++; nqpencoded++; } continue; } else if (*p >= 0x80) { n8bit++; b64_chars = FALSE; } else if (!(g_ascii_isalnum(*p) || *p == '/' || *p == '+')) { b64_chars = FALSE; } else if (g_ascii_isupper(*p)) { nupper++; } else if (g_ascii_islower(*p)) { nlower++; } p++; } if (b64_chars && neqsign <= 2 && nspaces == 0) { /* Need 
more thinking */ if (part->raw_data.len > 80) { if (padeqsign > 0) { ret = RSPAMD_CTE_B64; } else { /* We have a large piece of data with no spaces and base64 * symbols only, no padding is detected as well... * * There is a small chance that our first 128 characters * are either some garbage or it is a base64 with no padding * (e.g. when it is not needed) */ if (nupper > 1 && nlower > 1) { /* * We have both uppercase and lowercase letters, so it can be * base64 */ ret = RSPAMD_CTE_B64; } else { ret = RSPAMD_CTE_7BIT; } } } else { if (((end - (const unsigned char *) part->raw_data.begin) + padeqsign) % 4 == 0) { if (padeqsign == 0) { /* * It can be either base64 or plain text, hard to say * Let's assume that if we have > 1 uppercase it is * likely base64 */ if (nupper > 1 && nlower > 1) { ret = RSPAMD_CTE_B64; } else { ret = RSPAMD_CTE_7BIT; } } else { ret = RSPAMD_CTE_B64; } } else { /* No way */ if (padeqsign == 1 || padeqsign == 2) { ret = RSPAMD_CTE_B64; } else { ret = RSPAMD_CTE_7BIT; } } } } else if (n8bit == 0) { if (neqsign > 2 && nqpencoded > 2) { ret = RSPAMD_CTE_QP; } else { ret = RSPAMD_CTE_7BIT; } } else { ret = RSPAMD_CTE_8BIT; } msg_debug_mime("detected cte: %s", rspamd_cte_to_string(ret)); return ret; } static void rspamd_mime_part_get_cte(struct rspamd_task *task, struct rspamd_mime_headers_table *hdrs, struct rspamd_mime_part *part, gboolean apply_heuristic) { struct rspamd_mime_header *hdr, *cur; enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN; gboolean parent_propagated = FALSE; hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE); if (hdr == NULL) { if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN && !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) { part->cte = part->parent_part->cte; parent_propagated = TRUE; goto check_cte; } if (apply_heuristic) { part->cte = rspamd_mime_part_get_cte_heuristic(task, part); msg_info_task("detected missing CTE for part as: %s", 
rspamd_cte_to_string(part->cte)); } part->flags |= RSPAMD_MIME_PART_MISSING_CTE; } else { DL_FOREACH(hdr, cur) { gsize hlen; char lc_buf[128]; hlen = rspamd_snprintf(lc_buf, sizeof(lc_buf), "%s", cur->value); rspamd_str_lc(lc_buf, hlen); cte = rspamd_mime_parse_cte(lc_buf, hlen); if (cte != RSPAMD_CTE_UNKNOWN) { part->cte = cte; break; } } check_cte: if (apply_heuristic) { if (part->cte == RSPAMD_CTE_UNKNOWN) { part->cte = rspamd_mime_part_get_cte_heuristic(task, part); msg_info_task("corrected bad CTE for part to: %s", rspamd_cte_to_string(part->cte)); } else if (part->cte == RSPAMD_CTE_B64 || part->cte == RSPAMD_CTE_QP) { /* Additionally check sanity */ cte = rspamd_mime_part_get_cte_heuristic(task, part); if (cte == RSPAMD_CTE_8BIT) { msg_info_task( "incorrect cte specified for part: %s, %s detected", rspamd_cte_to_string(part->cte), rspamd_cte_to_string(cte)); part->cte = cte; part->flags |= RSPAMD_MIME_PART_BAD_CTE; } else if (cte != part->cte && parent_propagated) { part->cte = cte; msg_info_task("detected missing CTE for part as: %s", rspamd_cte_to_string(part->cte)); } } else { msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte)); } } else { msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte)); } } } static void rspamd_mime_part_get_cd(struct rspamd_task *task, struct rspamd_mime_part *part) { struct rspamd_mime_header *hdr, *cur; struct rspamd_content_disposition *cd = NULL; rspamd_ftok_t srch; struct rspamd_content_type_param *found; hdr = rspamd_message_get_header_from_hash(part->raw_headers, "Content-Disposition", FALSE); if (hdr == NULL) { cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd)); cd->type = RSPAMD_CT_INLINE; /* We can also have content disposition definitions in Content-Type */ if (part->ct && part->ct->attrs) { RSPAMD_FTOK_ASSIGN(&srch, "name"); found = g_hash_table_lookup(part->ct->attrs, &srch); if (!found) { RSPAMD_FTOK_ASSIGN(&srch, "filename"); found = g_hash_table_lookup(part->ct->attrs, &srch); } if (found) 
{ cd->type = RSPAMD_CT_ATTACHMENT; memcpy(&cd->filename, &found->value, sizeof(cd->filename)); } } } else { DL_FOREACH(hdr, cur) { gsize hlen; cd = NULL; if (cur->value) { hlen = strlen(cur->value); cd = rspamd_content_disposition_parse(cur->value, hlen, task->task_pool); } if (cd) { /* We still need to check filename */ if (cd->filename.len == 0) { if (part->ct && part->ct->attrs) { RSPAMD_FTOK_ASSIGN(&srch, "name"); found = g_hash_table_lookup(part->ct->attrs, &srch); if (!found) { RSPAMD_FTOK_ASSIGN(&srch, "filename"); found = g_hash_table_lookup(part->ct->attrs, &srch); } if (found) { cd->type = RSPAMD_CT_ATTACHMENT; memcpy(&cd->filename, &found->value, sizeof(cd->filename)); } } } msg_debug_mime("processed content disposition: %s, file: \"%T\"", cd->lc_data, &cd->filename); break; } else if (part->ct) { /* * Even in case of malformed Content-Disposition, we can still * fall back to Content-Type */ cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd)); cd->type = RSPAMD_CT_INLINE; /* We can also have content disposition definitions in Content-Type */ if (part->ct->attrs) { RSPAMD_FTOK_ASSIGN(&srch, "name"); found = g_hash_table_lookup(part->ct->attrs, &srch); if (!found) { RSPAMD_FTOK_ASSIGN(&srch, "filename"); found = g_hash_table_lookup(part->ct->attrs, &srch); } if (found) { cd->type = RSPAMD_CT_ATTACHMENT; memcpy(&cd->filename, &found->value, sizeof(cd->filename)); } } } } } part->cd = cd; } void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part) { /* Blake2b applied to string 'rspamd' */ static const unsigned char hash_key[] = { 0xef, 0x43, 0xae, 0x80, 0xcc, 0x8d, 0xc3, 0x4c, 0x6f, 0x1b, 0xd6, 0x18, 0x1b, 0xae, 0x87, 0x74, 0x0c, 0xca, 0xf7, 0x8e, 0x5f, 0x2e, 0x54, 0x32, 0xf6, 0x79, 0xb9, 0x27, 0x26, 0x96, 0x20, 0x92, 0x70, 0x07, 0x85, 0xeb, 0x83, 0xf7, 0x89, 0xe0, 0xd7, 0x32, 0x2a, 0xd2, 0x1a, 0x64, 0x41, 0xef, 0x49, 0xff, 0xc3, 0x8c, 0x54, 0xf9, 0x67, 0x74, 0x30, 0x1e, 0x70, 0x2e, 0xb7, 0x12, 0x09, 0xfe, }; if (part->parsed_data.len > 0) { 
rspamd_cryptobox_hash(part->digest, part->parsed_data.begin, part->parsed_data.len,
							  hash_key, sizeof(hash_key));
	}
}

/**
 * Parses a leaf (non-multipart) part: detects its transfer encoding and
 * content disposition, decodes raw_data into parsed_data, registers the part
 * in the task's part list and computes its digest.
 *
 * Decoded buffers are owned by the task memory pool (freed by a pool
 * destructor). If quoted-printable or uuencode decoding fails, the raw bytes
 * are used instead, the part is treated as 8bit and its content type is
 * flagged as broken.
 *
 * When `ct` is flagged as S/MIME, the decoded data is additionally probed
 * for an embedded signed payload which is then parsed as a nested part.
 *
 * @param task task being processed
 * @param part part to parse (must not be NULL)
 * @param st   parser state, passed on to nested parsing
 * @param ct   content type selected for the part
 * @param err  error sink, passed on to nested parsing
 * @return RSPAMD_MIME_PARSE_OK or the result of parsing the embedded payload
 */
static enum rspamd_mime_parse_error
rspamd_mime_parse_normal_part(struct rspamd_task *task,
							  struct rspamd_mime_part *part,
							  struct rspamd_mime_parser_ctx *st,
							  struct rspamd_content_type *ct,
							  GError **err)
{
	rspamd_fstring_t *parsed;
	gssize r;

	g_assert(part != NULL);

	/* The CTE heuristic is applied to everything but message/ parts */
	rspamd_mime_part_get_cte(task, part->raw_headers, part,
							 part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
	rspamd_mime_part_get_cd(task, part);

	switch (part->cte) {
	case RSPAMD_CTE_7BIT:
	case RSPAMD_CTE_8BIT:
	case RSPAMD_CTE_UNKNOWN:
		if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
			if (part->cte != RSPAMD_CTE_7BIT) {
				/* We have something that has a missing content-type,
				 * but it has non-7bit characters.
				 *
				 * In theory, it is very unsafe to process it as a text part
				 * as we unlikely get some sane result
				 */

				/*
				 * On the other hand, there is an evidence that some
				 * emails actually rely on that.
				 * So we apply an expensive hack here:
				 * if there are no 8bit characters -OR- the content is valid
				 * UTF8, we can still imply Content-Type == text/plain
				 */
				if (rspamd_str_has_8bit(part->raw_data.begin, part->raw_data.len) &&
					!rspamd_fast_utf8_validate(part->raw_data.begin, part->raw_data.len)) {
					part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
					part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
				}
			}
		}

		if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
			/* Need to copy text as we have couple of in-place change functions */
			parsed = rspamd_fstring_sized_new(part->raw_data.len);
			parsed->len = part->raw_data.len;
			memcpy(parsed->str, part->raw_data.begin, parsed->len);
			part->parsed_data.begin = parsed->str;
			part->parsed_data.len = parsed->len;
			rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
			rspamd_mempool_add_destructor(task->task_pool,
										  (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
		}
		else {
			/* Non-text data is never modified, so it can be shared */
			part->parsed_data.begin = part->raw_data.begin;
			part->parsed_data.len = part->raw_data.len;
		}
		break;
	case RSPAMD_CTE_QP:
		parsed = rspamd_fstring_sized_new(part->raw_data.len);
		r = rspamd_decode_qp_buf(part->raw_data.begin, part->raw_data.len,
								 parsed->str, parsed->allocated);
		if (r != -1) {
			parsed->len = r;
			part->parsed_data.begin = parsed->str;
			part->parsed_data.len = parsed->len;
			rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
			rspamd_mempool_add_destructor(task->task_pool,
										  (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
		}
		else {
			msg_err_task("invalid quoted-printable encoded part, assume 8bit");
			if (part->ct) {
				part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
			}
			part->cte = RSPAMD_CTE_8BIT;
			memcpy(parsed->str, part->raw_data.begin, part->raw_data.len);
			parsed->len = part->raw_data.len;
			part->parsed_data.begin = parsed->str;
			part->parsed_data.len = parsed->len;
			rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
			rspamd_mempool_add_destructor(task->task_pool,
										  (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
		}
		break;
	case RSPAMD_CTE_B64:
		parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
		rspamd_cryptobox_base64_decode(part->raw_data.begin, part->raw_data.len,
									   parsed->str, &parsed->len);
		part->parsed_data.begin = parsed->str;
		part->parsed_data.len = parsed->len;
		rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
		rspamd_mempool_add_destructor(task->task_pool,
									  (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
		break;
	case RSPAMD_CTE_UUE:
		parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
		r = rspamd_decode_uue_buf(part->raw_data.begin, part->raw_data.len,
								  parsed->str, parsed->allocated);
		rspamd_mempool_add_destructor(task->task_pool,
									  (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
		if (r != -1) {
			parsed->len = r;
		}
		else {
			msg_err_task("invalid uuencoding in encoded part, assume 8bit");
			if (part->ct) {
				part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
			}
			part->cte = RSPAMD_CTE_8BIT;
			/* The buffer was sized for decoded data, so the raw copy may be cut */
			parsed->len = MIN(part->raw_data.len, parsed->allocated);
			memcpy(parsed->str, part->raw_data.begin, parsed->len);
		}
		part->parsed_data.begin = parsed->str;
		part->parsed_data.len = parsed->len;
		/*
		 * Report the allocation once and only after the final length is
		 * known, as the other decoding branches do.
		 */
		rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
		break;
	default:
		g_assert_not_reached();
	}

	part->part_number = MESSAGE_FIELD(task, parts)->len;
	part->urls = g_ptr_array_new();
	g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
	msg_debug_mime("parsed data part %T/%T of length %z (%z orig), %s cte",
				   &part->ct->type, &part->ct->subtype, part->parsed_data.len,
				   part->raw_data.len, rspamd_cte_to_string(part->cte));
	rspamd_mime_parser_calc_digest(part);

	if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) {
		CMS_ContentInfo *cms;
		const unsigned char *der_beg = part->parsed_data.begin;

		/* Try to interpret the decoded data as a DER encoded CMS structure */
		cms = d2i_CMS_ContentInfo(NULL, &der_beg, part->parsed_data.len);

		if (cms) {
			const ASN1_OBJECT *asn_ct = CMS_get0_eContentType(cms);
			int ct_nid =
OBJ_obj2nid(asn_ct);

			if (ct_nid == NID_pkcs7_data) {
				/* Re-read the same bytes as PKCS#7 to reach the signed payload */
				BIO *bio = BIO_new_mem_buf(part->parsed_data.begin,
										   part->parsed_data.len);
				PKCS7 *p7;

				p7 = d2i_PKCS7_bio(bio, NULL);

				if (p7) {
					ct_nid = OBJ_obj2nid(p7->type);

					if (ct_nid == NID_pkcs7_signed) {
						PKCS7 *p7_signed_content = p7->d.sign->contents;

						ct_nid = OBJ_obj2nid(p7_signed_content->type);

						if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) {
							int ret;

							msg_debug_mime("found an additional part inside of "
										   "smime structure of type %T/%T; length=%d",
										   &ct->type, &ct->subtype, p7_signed_content->d.data->length);
							/*
							 * Since ASN.1 structures are freed, we need to copy
							 * the content
							 */
							char *cpy = rspamd_mempool_alloc(task->task_pool,
															 p7_signed_content->d.data->length);
							memcpy(cpy, p7_signed_content->d.data->data,
								   p7_signed_content->d.data->length);

							/* The payload is parsed as a part without a multipart parent */
							ret = rspamd_mime_process_multipart_node(task,
																	 st, NULL,
																	 cpy, cpy + p7_signed_content->d.data->length,
																	 TRUE, err);

							/* All OpenSSL objects are released before the early return */
							PKCS7_free(p7);
							BIO_free(bio);
							CMS_ContentInfo_free(cms);

							return ret;
						}
					}

					PKCS7_free(p7);
				}

				BIO_free(bio);
			}

			CMS_ContentInfo_free(cms);
		}
	}

	return RSPAMD_MIME_PARSE_OK;
}

/* State shared between the boundaries filter and the per-boundary callback */
struct rspamd_mime_multipart_cbdata {
	struct rspamd_task *task;
	struct rspamd_mime_part *multipart; /* Multipart container being split */
	struct rspamd_mime_parser_ctx *st;
	const char *part_start;      /* Start of the current child; NULL until the first boundary is seen */
	rspamd_ftok_t *cur_boundary; /* Boundary of the container; NULL if it has to be guessed */
	uint64_t bhash;              /* Hash of the boundary, compared with rspamd_mime_boundary hashes */
	GError **err;
};

/**
 * Creates and parses a single child part located in [start, end).
 *
 * The part's headers are split from its body and processed, a Content-Type
 * is selected (a multipart one is preferred when several headers exist,
 * text/plain is assumed when none parses) and parsing is dispatched to the
 * multipart, message or normal part parser.
 *
 * @param task        task being processed
 * @param st          parser state
 * @param multipart   parent container, may be NULL
 * @param start       first byte of the part
 * @param end         one past the last byte of the part
 * @param is_finished when FALSE, a part that starts with a newline and has no
 *                    alphanumeric characters is ignored
 * @param err         error sink
 * @return result of the selected parser, or RSPAMD_MIME_PARSE_NO_PART for an
 *         ignored part
 */
static enum rspamd_mime_parse_error
rspamd_mime_process_multipart_node(struct rspamd_task *task,
								   struct rspamd_mime_parser_ctx *st,
								   struct rspamd_mime_part *multipart,
								   const char *start, const char *end,
								   gboolean is_finished,
								   GError **err)
{
	struct rspamd_content_type *ct, *sel = NULL;
	struct rspamd_mime_header *hdr = NULL, *cur;
	struct rspamd_mime_part *npart;
	GString str;
	goffset hdr_pos, body_pos;
	enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;

	str.str = (char *) start;
	str.len = end - start;

	/* NOTE(review): *start is read without checking start < end - confirm callers never pass an empty range */
	if (*start == '\n' || *start == '\r') {
		/*
		 * We have a part that starts from newline which means that
		 * there are completely no headers in this part,
		 * hence we assume it as a text part
		 */
		hdr_pos = 0;
		body_pos = 0;

		if (!is_finished) {
			/* Ignore garbage */
			const char *p = start;
			gboolean seen_something = FALSE;

			while (p < end) {
				if (g_ascii_isalnum(*p)) {
					seen_something = TRUE;
					break;
				}
				p++;
			}

			if (!seen_something) {
				return RSPAMD_MIME_PARSE_NO_PART;
			}
		}
	}
	else {
		hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
	}

	npart = rspamd_mempool_alloc0(task->task_pool,
								  sizeof(struct rspamd_mime_part));
	npart->parent_part = multipart;
	npart->raw_headers = rspamd_message_headers_new();
	npart->headers_order = NULL;

	if (multipart) {
		/* The children array of the parent is created lazily */
		if (multipart->specific.mp->children == NULL) {
			multipart->specific.mp->children = g_ptr_array_sized_new(2);
		}

		g_ptr_array_add(multipart->specific.mp->children, npart);
	}

	if (hdr_pos > 0 && hdr_pos < str.len) {
		npart->raw_headers_str = str.str;
		npart->raw_headers_len = hdr_pos;
		npart->raw_data.begin = start + body_pos;
		npart->raw_data.len = (end - start) - body_pos;

		if (npart->raw_headers_len > 0) {
			rspamd_mime_headers_process(task, npart->raw_headers,
										&npart->headers_order,
										npart->raw_headers_str,
										npart->raw_headers_len,
										FALSE);

			/* Preserve the natural order */
			if (npart->headers_order) {
				LL_REVERSE2(npart->headers_order, ord_next);
			}
		}

		hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
												  "Content-Type", FALSE);
	}
	else {
		/* No headers: the whole range is the body */
		npart->raw_headers_str = 0;
		npart->raw_headers_len = 0;
		npart->raw_data.begin = start;
		npart->raw_data.len = end - start;
	}

	if (hdr != NULL) {
		DL_FOREACH(hdr, cur)
		{
			ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
										   task->task_pool);

			/* Here we prefer multipart content-type or any content-type */
			if (ct) {
				if (sel == NULL) {
					sel = ct;
				}
				else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
					sel = ct;
				}
			}
		}
	}

	if (sel == NULL) {
		/* No usable Content-Type: fall back to text/plain (no flags are set) */
		sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
		RSPAMD_FTOK_ASSIGN(&sel->type, "text");
		RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
	}

	npart->ct = sel;

	if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
		st->nesting++;
g_ptr_array_add(st->stack, npart);
		npart->part_type = RSPAMD_MIME_PART_MULTIPART;
		npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
												   sizeof(struct rspamd_mime_multipart));
		memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
			   sizeof(rspamd_ftok_t));
		ret = rspamd_mime_parse_multipart_part(task, npart, st, err);
	}
	else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
		st->nesting++;
		g_ptr_array_add(st->stack, npart);
		npart->part_type = RSPAMD_MIME_PART_MESSAGE;

		/* A message part is first decoded as a normal part, then parsed as a message */
		if ((ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err)) == RSPAMD_MIME_PARSE_OK) {
			ret = rspamd_mime_parse_message(task, npart, st, err);
		}
	}
	else {
		ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err);
	}

	return ret;
}

/**
 * Handles one boundary `b` that belongs to the multipart being split.
 *
 * The first boundary only records where the first child starts. Each
 * following boundary terminates the current child, which is then parsed, and
 * (if b->start is positive) records where the next child starts.
 *
 * The `task` argument is overwritten with cb->task; `multipart` is not used
 * directly (cb->multipart is used instead).
 */
static enum rspamd_mime_parse_error
rspamd_mime_parse_multipart_cb(struct rspamd_task *task,
							   struct rspamd_mime_part *multipart,
							   struct rspamd_mime_parser_ctx *st,
							   struct rspamd_mime_multipart_cbdata *cb,
							   struct rspamd_mime_boundary *b)
{
	const char *pos = st->start + b->boundary;
	enum rspamd_mime_parse_error ret;

	task = cb->task;

	/* Now check boundary */
	if (!cb->part_start) {
		cb->part_start = st->start + b->start;
		st->pos = cb->part_start;
	}
	else {
		/*
		 * We have seen the start of the boundary,
		 * but it might be unsuitable (e.g. in broken headers)
		 */
		if (cb->part_start < pos && cb->cur_boundary) {

			if ((ret = rspamd_mime_process_multipart_node(task, cb->st,
														  cb->multipart, cb->part_start, pos, TRUE, cb->err)) != RSPAMD_MIME_PARSE_OK) {
				return ret;
			}

			if (b->start > 0) {
				/* Go towards the next part */
				cb->part_start = st->start + b->start;
				cb->st->pos = cb->part_start;
			}
		}
		else {
			/* We have an empty boundary, do nothing */
		}
	}

	return RSPAMD_MIME_PARSE_OK;
}

/**
 * Walks the pre-collected boundaries (st->boundaries) and feeds those that
 * belong to `multipart` to rspamd_mime_parse_multipart_cb().
 *
 * The first loop finds the first boundary located inside the multipart whose
 * hash matches cb->bhash; if the multipart has no known boundary, the first
 * boundary found inside it is adopted instead. The second loop processes all
 * matching boundaries until a closing one is met or the multipart's data
 * ends. If no closing boundary is found, the last child is closed implicitly
 * at the end of the multipart's data.
 */
static enum rspamd_mime_parse_error
rspamd_multipart_boundaries_filter(struct rspamd_task *task,
								   struct rspamd_mime_part *multipart,
								   struct rspamd_mime_parser_ctx *st,
								   struct rspamd_mime_multipart_cbdata *cb)
{
	struct rspamd_mime_boundary *cur;
	goffset last_offset;
	unsigned int i, sel = 0;
	enum rspamd_mime_parse_error ret;
	bool enforce_closing = false;

	/* Offset (from parser start) one past the end of the multipart's data */
	last_offset = (multipart->raw_data.begin - st->start) +
				  multipart->raw_data.len;

	/* Find the first offset suitable for this part */
	for (i = 0; i < st->boundaries->len; i++) {
		cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);

		if (cur->start >= multipart->raw_data.begin - st->start) {
			if (cb->cur_boundary) {
				/* Check boundary */
				msg_debug_mime("compare %L and %L (and %L)",
							   cb->bhash, cur->hash, cur->closed_hash);

				if (cb->bhash == cur->hash) {
					sel = i;
					break;
				}
				else if (cb->bhash == cur->closed_hash) {
					/* Not a closing element in fact */
					cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
					cur->hash = cur->closed_hash;
					sel = i;
					break;
				}
			}
			else {
				/* Set current boundary */
				cb->cur_boundary = rspamd_mempool_alloc(task->task_pool,
														sizeof(rspamd_ftok_t));
				cb->cur_boundary->begin = st->start + cur->boundary;
				cb->cur_boundary->len = 0;
				cb->bhash = cur->hash;
				sel = i;
				break;
			}
		}
	}

	/* Now we can go forward with boundaries that are same to what we have */
	for (i = sel; i < st->boundaries->len; i++) {
		cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);

		if (cur->boundary > last_offset) {
			/*
			 * We have reached the end of the part, so we have to close it implicitly
			 * like MUA do
			 */
			task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
			enforce_closing = true;
			break;
		}

		if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
			if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
													  cb, cur)) != RSPAMD_MIME_PARSE_OK) {
				return ret;
			}

			if (cur->closed_hash == cb->bhash) {
				/* We have again fake closed hash */
				cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
				cur->hash = cur->closed_hash;
			}

			if (RSPAMD_BOUNDARY_IS_CLOSED(cur)) {
				/* We also might check the next boundary... */
				if (i < st->boundaries->len - 1) {
					cur = &g_array_index(st->boundaries,
										 struct rspamd_mime_boundary, i + 1);

					if (cur->hash == cb->bhash) {
						continue;
					}
					else if (cur->closed_hash == cb->bhash) {
						/* We have again fake closed hash */
						cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
						cur->hash = cur->closed_hash;
						continue;
					}
				}

				break;
			}
		}
	}

	if (enforce_closing || (i == st->boundaries->len && cb->cur_boundary)) {
		/* Process the last part */
		struct rspamd_mime_boundary fb;

		/* Only these two fields are read by the callback; the negative start means "no next part" */
		fb.boundary = last_offset;
		fb.start = -1;

		if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
												  cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
			return ret;
		}
	}

	return RSPAMD_MIME_PARSE_OK;
}

/**
 * Parses a multipart container: registers it in the task's part list and
 * splits its body into children using the boundaries collected earlier.
 * Fails with RSPAMD_MIME_PARSE_NESTING when nesting exceeds max_nested.
 */
static enum rspamd_mime_parse_error
rspamd_mime_parse_multipart_part(struct rspamd_task *task,
								 struct rspamd_mime_part *part,
								 struct rspamd_mime_parser_ctx *st,
								 GError **err)
{
	struct rspamd_mime_multipart_cbdata cbdata;
	enum rspamd_mime_parse_error ret;

	if (st->nesting > max_nested) {
		g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
					st->nesting);
		return RSPAMD_MIME_PARSE_NESTING;
	}

	part->part_number = MESSAGE_FIELD(task, parts)->len;
	part->urls = g_ptr_array_new();
	g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
	st->nesting++;
	/* The CTE heuristic is not applied to containers */
	rspamd_mime_part_get_cte(task, part->raw_headers, part, FALSE);

	st->pos = part->raw_data.begin;
	cbdata.multipart = part;
	cbdata.task = task;
	cbdata.st = st;
	cbdata.part_start = NULL;
	cbdata.err = err;

	if (part->ct->boundary.len > 0) {
		/* We know our boundary */
cbdata.cur_boundary = &part->ct->boundary;
		/* Hash with the same key as used for the boundaries found in the body */
		rspamd_cryptobox_siphash((unsigned char *) &cbdata.bhash,
								 cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
								 lib_ctx->hkey);
		msg_debug_mime("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
	}
	else {
		/* Guess boundary */
		cbdata.cur_boundary = NULL;
		cbdata.bhash = 0;
	}

	ret = rspamd_multipart_boundaries_filter(task, part, st, &cbdata);
	/* Cleanup stack */
	st->nesting--;
	g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);

	return ret;
}

/* Process boundary like structures in a message */
/*
 * Multipattern callback invoked for each "\r--" / "\n--" match.
 * The text from match_pos up to the end of the line is taken as a boundary
 * candidate. Its offsets and the hashes of its lowercased form are appended
 * to st->boundaries (`context` is the parser context). A candidate ending
 * with "--" is recorded as closing and gets two hashes: without and with the
 * trailing dashes. Always returns 0.
 */
static int
rspamd_mime_preprocess_cb(struct rspamd_multipattern *mp,
						  unsigned int strnum,
						  int match_start,
						  int match_pos,
						  const char *text,
						  gsize len,
						  void *context)
{
	const char *end = text + len, *p = text + match_pos, *bend;
	gsize blen;
	gboolean closing = FALSE;
	struct rspamd_mime_boundary b;
	struct rspamd_mime_parser_ctx *st = context;
	struct rspamd_task *task;

	/* Needed by the msg_debug_mime macro */
	task = st->task;

	if (G_LIKELY(p < end)) {

		blen = 0;

		/* Measure the candidate: everything up to the end of the line */
		while (p < end) {
			if (*p == '\r' || *p == '\n') {
				break;
			}

			blen++;
			p++;
		}

		if (blen > 0) {
			/* We have found something like boundary */
			p = text + match_pos;
			bend = p + blen - 1;

			if (*bend == '-') {
				/* We need to verify last -- */
				if (bend > p + 1 && *(bend - 1) == '-') {
					closing = TRUE;
					bend--;
					blen -= 2;
				}
				else {
					/* Not a closing boundary somehow, e.g. if a boundary=='-' */
					bend++;
				}
			}
			else {
				bend++;
			}

			/* Move bend past same-line spaces and a single line ending */
			while (bend < end) {
				if (*bend == '\r') {
					bend++;

					/* \r\n */
					if (bend < end && *bend == '\n') {
						bend++;
					}
				}
				else if (*bend == '\n') {
					/* \n */
					bend++;
				}
				else if (g_ascii_isspace(*bend)) {
					/* Spaces in the same line, skip them */
					bend++;
					continue;
				}

				break;
			}

			/* -2 rewinds over the "--" that precedes the boundary text */
			b.boundary = p - st->start - 2;
			b.start = bend - st->start;

			/* Small optimisation as boundaries are usually short strings */
			char *lc_copy, lc_copy_buf[128];

			if (blen + 2 < sizeof(lc_copy_buf)) {
				lc_copy = lc_copy_buf;
			}
			else {
				lc_copy = g_malloc(blen + 2);
			}

			if (closing) {
				/* Keep the trailing "--" as it is needed for the closed hash */
				memcpy(lc_copy, p, blen + 2);
				rspamd_str_lc(lc_copy, blen + 2);
			}
			else {
				memcpy(lc_copy, p, blen);
				rspamd_str_lc(lc_copy, blen);
			}

			rspamd_cryptobox_siphash((unsigned char *) &b.hash, lc_copy, blen,
									 lib_ctx->hkey);
			msg_debug_mime("normal hash: %*s -> %L, %d boffset, %d data offset",
						   (int) blen, lc_copy, b.hash, (int) b.boundary, (int) b.start);

			if (closing) {
				b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
				rspamd_cryptobox_siphash((unsigned char *) &b.closed_hash, lc_copy,
										 blen + 2,
										 lib_ctx->hkey);
				msg_debug_mime("closing hash: %*s -> %L, %d boffset, %d data offset",
							   (int) blen + 2, lc_copy,
							   b.closed_hash,
							   (int) b.boundary, (int) b.start);
			}
			else {
				b.flags = 0;
				b.closed_hash = 0;
			}

			/* Check if a string has been allocated on the heap */
			if (blen + 2 >= sizeof(lc_copy_buf)) {
				g_free(lc_copy);
			}

			g_array_append_val(st->boundaries, b);
		}
	}

	return 0;
}

/**
 * Checks whether the input looks like a header ("name: value").
 * Only the first 76 bytes at most are examined: a run of printable non-space
 * characters, a colon and a value must be found.
 *
 * @param input      text to check
 * @param body_start if not NULL, set to input->len on success
 * @return input->len if the input looks like a header, -1 otherwise
 */
static goffset
rspamd_mime_parser_headers_heuristic(GString *input, goffset *body_start)
{
	const gsize default_max_len = 76;
	gsize max_len = MIN(input->len, default_max_len);
	const char *p, *end;
	enum {
		st_before_colon = 0,
		st_colon,
		st_spaces_after_colon,
		st_value,
		st_error
	} state = st_before_colon;

	p = input->str;
	end = p + max_len;

	while (p < end) {
		switch (state) {
		case st_before_colon:
			if (G_UNLIKELY(*p == ':')) {
				state = st_colon;
			}
			else if (G_UNLIKELY(!g_ascii_isgraph(*p))) {
				state = st_error;
			}

			p++;
			break;
		case st_colon:
			if (g_ascii_isspace(*p))
{
				state = st_spaces_after_colon;
			}
			else {
				state = st_value;
			}

			p++;
			break;
		case st_spaces_after_colon:
			if (!g_ascii_isspace(*p)) {
				state = st_value;
			}

			p++;
			break;
		case st_value:
			/* We accept any value */
			goto end;
			break;
		case st_error:
			return (-1);
			break;
		}
	}

end:
	/* Success only if a value has been reached within the examined prefix */
	if (state == st_value) {
		if (body_start) {
			*body_start = input->len;
		}

		return input->len;
	}

	return (-1);
}

/**
 * Scans the message for boundary-like lines; each match is recorded in
 * st->boundaries by rspamd_mime_preprocess_cb().
 *
 * If the body of `top` starts at or after st->pos, the body is scanned
 * starting one byte before its beginning, otherwise the scan covers
 * [st->pos, st->end).
 */
static void
rspamd_mime_preprocess_message(struct rspamd_task *task,
							   struct rspamd_mime_part *top,
							   struct rspamd_mime_parser_ctx *st)
{

	if (top->raw_data.begin >= st->pos) {
		/*
		 * NOTE(review): the scan starts at begin - 1, presumably so that the
		 * line break preceding the body can complete a "\n--" match -
		 * confirm that begin - 1 is always inside a valid buffer.
		 */
		rspamd_multipattern_lookup(lib_ctx->mp_boundary,
								   top->raw_data.begin - 1,
								   top->raw_data.len + 1,
								   rspamd_mime_preprocess_cb, st, NULL);
	}
	else {
		rspamd_multipattern_lookup(lib_ctx->mp_boundary,
								   st->pos,
								   st->end - st->pos,
								   rspamd_mime_preprocess_cb, st, NULL);
	}
}

/* Releases a parser context with its stack and boundaries arrays; NULL is accepted */
static void
rspamd_mime_parse_stack_free(struct rspamd_mime_parser_ctx *st)
{
	if (st) {
		g_ptr_array_free(st->stack, TRUE);
		g_array_free(st->boundaries, TRUE);
		g_free(st);
	}
}

static enum rspamd_mime_parse_error
rspamd_mime_parse_message(struct rspamd_task *task,
						  struct rspamd_mime_part *part,
						  struct rspamd_mime_parser_ctx *st,
						  GError **err)
{
	struct rspamd_content_type *ct, *sel = NULL;
	struct rspamd_mime_header *hdr = NULL, *cur;
	const char *pbegin, *p;
	gsize plen, len;
	struct rspamd_mime_part *npart;
	goffset hdr_pos, body_pos;
	unsigned int i;
	enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
	GString str;
	struct rspamd_mime_parser_ctx *nst = st;

	if (st->nesting > max_nested) {
		g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
					st->nesting);
		return RSPAMD_MIME_PARSE_NESTING;
	}

	/* Allocate real part */
	npart = rspamd_mempool_alloc0(task->task_pool,
								  sizeof(struct rspamd_mime_part));

	if (part == NULL) {
		/* Top level message */
		p = task->msg.begin;
		len = task->msg.len;

		str.str = (char *) p;
		str.len = len;

		hdr_pos = rspamd_string_find_eoh(&str, &body_pos);

		if (hdr_pos > 0 && hdr_pos < str.len) {

			MESSAGE_FIELD(task, raw_headers_content).begin =
str.str; MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos; MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos; if (MESSAGE_FIELD(task, raw_headers_content).len > 0) { rspamd_mime_headers_process(task, MESSAGE_FIELD(task, raw_headers), &MESSAGE_FIELD(task, headers_order), MESSAGE_FIELD(task, raw_headers_content).begin, MESSAGE_FIELD(task, raw_headers_content).len, TRUE); npart->raw_headers = rspamd_message_headers_ref( MESSAGE_FIELD(task, raw_headers)); /* Preserve the natural order */ if (MESSAGE_FIELD(task, headers_order)) { LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next); } } hdr = rspamd_message_get_header_from_hash( MESSAGE_FIELD(task, raw_headers), "Content-Type", FALSE); } else { /* First apply heuristic, maybe we have just headers */ hdr_pos = rspamd_mime_parser_headers_heuristic(&str, &body_pos); if (hdr_pos > 0 && hdr_pos <= str.len) { MESSAGE_FIELD(task, raw_headers_content).begin = str.str; MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos; MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos; if (MESSAGE_FIELD(task, raw_headers_content).len > 0) { rspamd_mime_headers_process(task, MESSAGE_FIELD(task, raw_headers), &MESSAGE_FIELD(task, headers_order), MESSAGE_FIELD(task, raw_headers_content).begin, MESSAGE_FIELD(task, raw_headers_content).len, TRUE); npart->raw_headers = rspamd_message_headers_ref( MESSAGE_FIELD(task, raw_headers)); /* Preserve the natural order */ if (MESSAGE_FIELD(task, headers_order)) { LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next); } } hdr = rspamd_message_get_header_from_hash( MESSAGE_FIELD(task, raw_headers), "Content-Type", FALSE); task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; } else { body_pos = 0; } } pbegin = st->start + body_pos; plen = st->end - pbegin; npart->headers_order = NULL; } else { /* * Here are dragons: * We allocate new parser context as we need to shift pointers */ nst = g_malloc0(sizeof(*st)); nst->stack = g_ptr_array_sized_new(4); 
nst->boundaries = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_boundary), 8); nst->start = part->parsed_data.begin; nst->end = nst->start + part->parsed_data.len; nst->pos = nst->start; nst->task = st->task; nst->nesting = st->nesting; st->nesting++; str.str = (char *) part->parsed_data.begin; str.len = part->parsed_data.len; hdr_pos = rspamd_string_find_eoh(&str, &body_pos); npart->raw_headers = rspamd_message_headers_new(); npart->headers_order = NULL; if (hdr_pos > 0 && hdr_pos < str.len) { npart->raw_headers_str = str.str; npart->raw_headers_len = hdr_pos; npart->raw_data.begin = str.str + body_pos; if (npart->raw_headers_len > 0) { rspamd_mime_headers_process(task, npart->raw_headers, &npart->headers_order, npart->raw_headers_str, npart->raw_headers_len, FALSE); /* Preserve the natural order */ if (npart->headers_order) { LL_REVERSE2(npart->headers_order, ord_next); } } hdr = rspamd_message_get_header_from_hash(npart->raw_headers, "Content-Type", FALSE); } else { body_pos = 0; } pbegin = part->parsed_data.begin + body_pos; plen = part->parsed_data.len - body_pos; } npart->raw_data.begin = pbegin; npart->raw_data.len = plen; npart->parent_part = part; if (hdr == NULL) { sel = NULL; } else { DL_FOREACH(hdr, cur) { ct = rspamd_content_type_parse(cur->value, strlen(cur->value), task->task_pool); /* Here we prefer multipart content-type or any content-type */ if (ct) { if (sel == NULL) { sel = ct; } else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) { sel = ct; } } } } if (sel == NULL) { /* For messages we automatically assume plaintext */ msg_info_task("cannot find content-type for a message, assume text/plain"); sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel)); sel->flags = RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_MISSING; RSPAMD_FTOK_ASSIGN(&sel->type, "text"); RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain"); } npart->ct = sel; if ((part == NULL || nst != st) && (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART | 
RSPAMD_CONTENT_TYPE_MESSAGE))) { /* Not a trivial message, need to preprocess */ rspamd_mime_preprocess_message(task, npart, nst); } if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) { g_ptr_array_add(nst->stack, npart); nst->nesting++; npart->part_type = RSPAMD_MIME_PART_MULTIPART; npart->specific.mp = rspamd_mempool_alloc0(task->task_pool, sizeof(struct rspamd_mime_multipart)); memcpy(&npart->specific.mp->boundary, &sel->orig_boundary, sizeof(rspamd_ftok_t)); ret = rspamd_mime_parse_multipart_part(task, npart, nst, err); } else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) { if ((ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err)) == RSPAMD_MIME_PARSE_OK) { npart->part_type = RSPAMD_MIME_PART_MESSAGE; ret = rspamd_mime_parse_message(task, npart, nst, err); } } else { ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err); } if (ret != RSPAMD_MIME_PARSE_OK) { return ret; } if (part && st->stack->len > 0) { /* Remove message part from the parent stack */ g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1); st->nesting--; } /* Process leftovers for boundaries */ if (nst->boundaries) { struct rspamd_mime_boundary *boundary, *start_boundary = NULL, *end_boundary = NULL; goffset cur_offset = nst->pos - nst->start, end_offset = st->end - st->start; unsigned int sel_idx = 0; for (;;) { start_boundary = NULL; for (i = sel_idx; i < nst->boundaries->len; i++) { boundary = &g_array_index(nst->boundaries, struct rspamd_mime_boundary, i); if (boundary->start > cur_offset && boundary->boundary < end_offset && !RSPAMD_BOUNDARY_IS_CLOSED(boundary)) { start_boundary = boundary; sel_idx = i; break; } } if (start_boundary) { const char *start, *end; if (nst->boundaries->len > sel_idx + 1) { end_boundary = &g_array_index(nst->boundaries, struct rspamd_mime_boundary, sel_idx + 1); end = nst->start + end_boundary->boundary; } else { end = nst->end; } sel_idx++; start = nst->start + start_boundary->start; if (end > start && (ret = 
rspamd_mime_process_multipart_node(task, nst, NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) { if (nst != st) { rspamd_mime_parse_stack_free(nst); } if (ret == RSPAMD_MIME_PARSE_NO_PART) { return RSPAMD_MIME_PARSE_OK; } return ret; } } else { break; } } } if (nst != st) { rspamd_mime_parse_stack_free(nst); } return ret; } enum rspamd_mime_parse_error rspamd_mime_parse_task(struct rspamd_task *task, GError **err) { struct rspamd_mime_parser_ctx *st; enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK; if (lib_ctx == NULL) { rspamd_mime_parser_init_lib(); } if (++lib_ctx->key_usages > max_key_usages) { /* Regenerate siphash key */ ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey)); lib_ctx->key_usages = 0; } st = g_malloc0(sizeof(*st)); st->stack = g_ptr_array_sized_new(4); st->pos = MESSAGE_FIELD(task, raw_headers_content).body_start; st->end = task->msg.begin + task->msg.len; st->boundaries = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_boundary), 8); st->task = task; if (st->pos == NULL) { st->pos = task->msg.begin; } st->start = task->msg.begin; ret = rspamd_mime_parse_message(task, NULL, st, err); rspamd_mime_parse_stack_free(st); return ret; }