123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758 |
- /*
- * Copyright 2024 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
- #include "config.h"
- #include "task.h"
- #include "mime_parser.h"
- #include "mime_headers.h"
- #include "message.h"
- #include "multipattern.h"
- #include "contrib/libottery/ottery.h"
- #include "contrib/uthash/utlist.h"
- #include <openssl/cms.h>
- #include <openssl/pkcs7.h>
- #include "contrib/fastutf8/fastutf8.h"
-
- struct rspamd_mime_parser_lib_ctx {
- struct rspamd_multipattern *mp_boundary;
- guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
- guint key_usages;
- };
-
- struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
-
- static const guint max_nested = 64;
- static const guint max_key_usages = 10000;
-
- #define msg_debug_mime(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
- rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
-
- INIT_LOG_MODULE(mime)
-
- #define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
- #define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
-
- struct rspamd_mime_boundary {
- goffset boundary;
- goffset start;
- uint64_t hash;
- uint64_t closed_hash;
- gint flags;
- };
-
- struct rspamd_mime_parser_ctx {
- GPtrArray *stack; /* Stack of parts */
- GArray *boundaries; /* Boundaries found in the whole message */
- const gchar *start;
- const gchar *pos;
- const gchar *end;
- struct rspamd_task *task;
- guint nesting;
- };
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_multipart_part(struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err);
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_message(struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err);
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_normal_part(struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_content_type *ct,
- GError **err);
-
- static enum rspamd_mime_parse_error
- rspamd_mime_process_multipart_node(struct rspamd_task *task,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_part *multipart,
- const gchar *start, const gchar *end,
- gboolean is_finished,
- GError **err);
-
-
- #define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
- static GQuark
- rspamd_mime_parser_quark(void)
- {
- return g_quark_from_static_string("mime-parser");
- }
-
- const gchar *
- rspamd_cte_to_string(enum rspamd_cte ct)
- {
- const gchar *ret = "unknown";
-
- switch (ct) {
- case RSPAMD_CTE_7BIT:
- ret = "7bit";
- break;
- case RSPAMD_CTE_8BIT:
- ret = "8bit";
- break;
- case RSPAMD_CTE_QP:
- ret = "quoted-printable";
- break;
- case RSPAMD_CTE_B64:
- ret = "base64";
- break;
- case RSPAMD_CTE_UUE:
- ret = "X-uuencode";
- break;
- default:
- break;
- }
-
- return ret;
- }
-
- enum rspamd_cte
- rspamd_cte_from_string(const gchar *str)
- {
- enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
-
- g_assert(str != NULL);
-
- if (strcmp(str, "7bit") == 0) {
- ret = RSPAMD_CTE_7BIT;
- }
- else if (strcmp(str, "8bit") == 0) {
- ret = RSPAMD_CTE_8BIT;
- }
- else if (strcmp(str, "quoted-printable") == 0) {
- ret = RSPAMD_CTE_QP;
- }
- else if (strcmp(str, "base64") == 0) {
- ret = RSPAMD_CTE_B64;
- }
- else if (strcmp(str, "X-uuencode") == 0) {
- ret = RSPAMD_CTE_UUE;
- }
- else if (strcmp(str, "uuencode") == 0) {
- ret = RSPAMD_CTE_UUE;
- }
- else if (strcmp(str, "X-uue") == 0) {
- ret = RSPAMD_CTE_UUE;
- }
-
- return ret;
- }
-
- static void
- rspamd_mime_parser_init_lib(void)
- {
- lib_ctx = g_malloc0(sizeof(*lib_ctx));
- lib_ctx->mp_boundary = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
- g_assert(lib_ctx->mp_boundary != NULL);
- rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\r--", 0);
- rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\n--", 0);
-
- GError *err = NULL;
- if (!rspamd_multipattern_compile(lib_ctx->mp_boundary, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err)) {
- msg_err("fatal error: cannot compile multipattern for mime parser boundaries: %e", err);
- g_error_free(err);
- g_abort();
- }
- ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
- }
-
- static enum rspamd_cte
- rspamd_mime_parse_cte(const gchar *in, gsize len)
- {
- uint64_t h;
- enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
-
- in = rspamd_string_len_strip(in, &len, " \t;,.+-#!`~'");
- h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
- in, len, 0xdeadbabe);
-
- switch (h) {
- case 0xCEDAA7056B4753F7ULL: /* 7bit */
- ret = RSPAMD_CTE_7BIT;
- break;
- case 0x42E0745448B39FC1ULL: /* 8bit */
- case 0x6B169E6B155BADC0ULL: /* binary */
- ret = RSPAMD_CTE_8BIT;
- break;
- case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
- ret = RSPAMD_CTE_QP;
- break;
- case 0x96305588A76DC9A9ULL: /* base64 */
- case 0x171029DE1B0423A9ULL: /* base-64 */
- ret = RSPAMD_CTE_B64;
- break;
- case 0x420b54dc00d13cecULL: /* uuencode */
- case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
- case 0x41f725ec544356d3ULL: /* x-uue */
- ret = RSPAMD_CTE_UUE;
- break;
- }
-
- return ret;
- }
-
- static enum rspamd_cte
- rspamd_mime_part_get_cte_heuristic(struct rspamd_task *task,
- struct rspamd_mime_part *part)
- {
- const guint check_len = 128;
- guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0,
- padeqsign = 0, nupper = 0, nlower = 0;
- gboolean b64_chars = TRUE;
- const guchar *p, *end;
- enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
-
- real_len = MIN(check_len, part->raw_data.len);
- p = (const guchar *) part->raw_data.begin;
- end = p + part->raw_data.len;
-
- while (p < end && g_ascii_isspace(*p)) {
- p++;
- }
-
- if (end - p > sizeof("begin-base64 ")) {
- const guchar *uue_start;
-
- if (memcmp(p, "begin ", sizeof("begin ") - 1) == 0) {
- uue_start = p + sizeof("begin ") - 1;
-
- while (uue_start < end && g_ascii_isspace(*uue_start)) {
- uue_start++;
- }
-
- if (uue_start < end && g_ascii_isdigit(*uue_start)) {
- return RSPAMD_CTE_UUE;
- }
- }
- else if (memcmp(p, "begin-base64 ", sizeof("begin-base64 ") - 1) == 0) {
- uue_start = p + sizeof("begin ") - 1;
-
- while (uue_start < end && g_ascii_isspace(*uue_start)) {
- uue_start++;
- }
-
- if (uue_start < end && g_ascii_isdigit(*uue_start)) {
- return RSPAMD_CTE_UUE;
- }
- }
- }
-
- /* Skip trailing spaces */
- while (end > p && g_ascii_isspace(*(end - 1))) {
- end--;
- }
-
- if (end > p + 2) {
- if (*(end - 1) == '=') {
- padeqsign++;
- end--;
- }
-
- if (*(end - 1) == '=') {
- padeqsign++;
- end--;
- }
- }
-
- /* Adjust end to analyse only first characters */
- if (end - p > real_len) {
- end = p + real_len;
- }
-
- while (p < end) {
- if (*p == ' ') {
- nspaces++;
- }
- else if (*p == '=') {
- b64_chars = FALSE; /* Eqsign must not be inside base64 */
- neqsign++;
- p++;
-
- if (p + 2 < end && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*(p + 1))) {
- p++;
- nqpencoded++;
- }
-
- continue;
- }
- else if (*p >= 0x80) {
- n8bit++;
- b64_chars = FALSE;
- }
- else if (!(g_ascii_isalnum(*p) || *p == '/' || *p == '+')) {
- b64_chars = FALSE;
- }
- else if (g_ascii_isupper(*p)) {
- nupper++;
- }
- else if (g_ascii_islower(*p)) {
- nlower++;
- }
-
- p++;
- }
-
- if (b64_chars && neqsign <= 2 && nspaces == 0) {
- /* Need more thinking */
-
- if (part->raw_data.len > 80) {
- if (padeqsign > 0) {
- ret = RSPAMD_CTE_B64;
- }
- else {
- /* We have a large piece of data with no spaces and base64
- * symbols only, no padding is detected as well...
- *
- * There is a small chance that our first 128 characters
- * are either some garbage or it is a base64 with no padding
- * (e.g. when it is not needed)
- */
- if (nupper > 1 && nlower > 1) {
- /*
- * We have both uppercase and lowercase letters, so it can be
- * base64
- */
- ret = RSPAMD_CTE_B64;
- }
- else {
- ret = RSPAMD_CTE_7BIT;
- }
- }
- }
- else {
-
- if (((end - (const guchar *) part->raw_data.begin) + padeqsign) % 4 == 0) {
- if (padeqsign == 0) {
- /*
- * It can be either base64 or plain text, hard to say
- * Let's assume that if we have > 1 uppercase it is
- * likely base64
- */
- if (nupper > 1 && nlower > 1) {
- ret = RSPAMD_CTE_B64;
- }
- else {
- ret = RSPAMD_CTE_7BIT;
- }
- }
- else {
- ret = RSPAMD_CTE_B64;
- }
- }
- else {
- /* No way */
- if (padeqsign == 1 || padeqsign == 2) {
- ret = RSPAMD_CTE_B64;
- }
- else {
- ret = RSPAMD_CTE_7BIT;
- }
- }
- }
- }
- else if (n8bit == 0) {
- if (neqsign > 2 && nqpencoded > 2) {
- ret = RSPAMD_CTE_QP;
- }
- else {
- ret = RSPAMD_CTE_7BIT;
- }
- }
- else {
- ret = RSPAMD_CTE_8BIT;
- }
-
- msg_debug_mime("detected cte: %s", rspamd_cte_to_string(ret));
-
- return ret;
- }
-
- static void
- rspamd_mime_part_get_cte(struct rspamd_task *task,
- struct rspamd_mime_headers_table *hdrs,
- struct rspamd_mime_part *part,
- gboolean apply_heuristic)
- {
- struct rspamd_mime_header *hdr, *cur;
- enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
- gboolean parent_propagated = FALSE;
-
- hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE);
-
- if (hdr == NULL) {
- if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
- !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
- part->cte = part->parent_part->cte;
- parent_propagated = TRUE;
-
- goto check_cte;
- }
-
- if (apply_heuristic) {
- part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
- msg_info_task("detected missing CTE for part as: %s",
- rspamd_cte_to_string(part->cte));
- }
-
- part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
- }
- else {
- DL_FOREACH(hdr, cur)
- {
- gsize hlen;
- gchar lc_buf[128];
-
- hlen = rspamd_snprintf(lc_buf, sizeof(lc_buf), "%s", cur->value);
- rspamd_str_lc(lc_buf, hlen);
- cte = rspamd_mime_parse_cte(lc_buf, hlen);
-
- if (cte != RSPAMD_CTE_UNKNOWN) {
- part->cte = cte;
- break;
- }
- }
-
- check_cte:
- if (apply_heuristic) {
- if (part->cte == RSPAMD_CTE_UNKNOWN) {
- part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
-
- msg_info_task("corrected bad CTE for part to: %s",
- rspamd_cte_to_string(part->cte));
- }
- else if (part->cte == RSPAMD_CTE_B64 ||
- part->cte == RSPAMD_CTE_QP) {
- /* Additionally check sanity */
- cte = rspamd_mime_part_get_cte_heuristic(task, part);
-
- if (cte == RSPAMD_CTE_8BIT) {
- msg_info_task(
- "incorrect cte specified for part: %s, %s detected",
- rspamd_cte_to_string(part->cte),
- rspamd_cte_to_string(cte));
- part->cte = cte;
- part->flags |= RSPAMD_MIME_PART_BAD_CTE;
- }
- else if (cte != part->cte && parent_propagated) {
- part->cte = cte;
- msg_info_task("detected missing CTE for part as: %s",
- rspamd_cte_to_string(part->cte));
- }
- }
- else {
- msg_debug_mime("processed cte: %s",
- rspamd_cte_to_string(cte));
- }
- }
- else {
- msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte));
- }
- }
- }
- static void
- rspamd_mime_part_get_cd(struct rspamd_task *task, struct rspamd_mime_part *part)
- {
- struct rspamd_mime_header *hdr, *cur;
- struct rspamd_content_disposition *cd = NULL;
- rspamd_ftok_t srch;
- struct rspamd_content_type_param *found;
-
- hdr = rspamd_message_get_header_from_hash(part->raw_headers,
- "Content-Disposition", FALSE);
-
-
- if (hdr == NULL) {
- cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
- cd->type = RSPAMD_CT_INLINE;
-
- /* We can also have content disposition definitions in Content-Type */
- if (part->ct && part->ct->attrs) {
- RSPAMD_FTOK_ASSIGN(&srch, "name");
- found = g_hash_table_lookup(part->ct->attrs, &srch);
-
- if (!found) {
- RSPAMD_FTOK_ASSIGN(&srch, "filename");
- found = g_hash_table_lookup(part->ct->attrs, &srch);
- }
-
- if (found) {
- cd->type = RSPAMD_CT_ATTACHMENT;
- memcpy(&cd->filename, &found->value, sizeof(cd->filename));
- }
- }
- }
- else {
- DL_FOREACH(hdr, cur)
- {
- gsize hlen;
- cd = NULL;
-
- if (cur->value) {
- hlen = strlen(cur->value);
- cd = rspamd_content_disposition_parse(cur->value, hlen,
- task->task_pool);
- }
-
- if (cd) {
- /* We still need to check filename */
- if (cd->filename.len == 0) {
- if (part->ct && part->ct->attrs) {
- RSPAMD_FTOK_ASSIGN(&srch, "name");
- found = g_hash_table_lookup(part->ct->attrs, &srch);
-
- if (!found) {
- RSPAMD_FTOK_ASSIGN(&srch, "filename");
- found = g_hash_table_lookup(part->ct->attrs, &srch);
- }
-
- if (found) {
- cd->type = RSPAMD_CT_ATTACHMENT;
- memcpy(&cd->filename, &found->value,
- sizeof(cd->filename));
- }
- }
- }
-
- msg_debug_mime("processed content disposition: %s, file: \"%T\"",
- cd->lc_data, &cd->filename);
- break;
- }
- else if (part->ct) {
- /*
- * Even in case of malformed Content-Disposition, we can still
- * fall back to Content-Type
- */
- cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
- cd->type = RSPAMD_CT_INLINE;
-
- /* We can also have content disposition definitions in Content-Type */
- if (part->ct->attrs) {
- RSPAMD_FTOK_ASSIGN(&srch, "name");
- found = g_hash_table_lookup(part->ct->attrs, &srch);
-
- if (!found) {
- RSPAMD_FTOK_ASSIGN(&srch, "filename");
- found = g_hash_table_lookup(part->ct->attrs, &srch);
- }
-
- if (found) {
- cd->type = RSPAMD_CT_ATTACHMENT;
- memcpy(&cd->filename, &found->value, sizeof(cd->filename));
- }
- }
- }
- }
- }
-
- part->cd = cd;
- }
-
- void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part)
- {
- /* Blake2b applied to string 'rspamd' */
- static const guchar hash_key[] = {
- 0xef,
- 0x43,
- 0xae,
- 0x80,
- 0xcc,
- 0x8d,
- 0xc3,
- 0x4c,
- 0x6f,
- 0x1b,
- 0xd6,
- 0x18,
- 0x1b,
- 0xae,
- 0x87,
- 0x74,
- 0x0c,
- 0xca,
- 0xf7,
- 0x8e,
- 0x5f,
- 0x2e,
- 0x54,
- 0x32,
- 0xf6,
- 0x79,
- 0xb9,
- 0x27,
- 0x26,
- 0x96,
- 0x20,
- 0x92,
- 0x70,
- 0x07,
- 0x85,
- 0xeb,
- 0x83,
- 0xf7,
- 0x89,
- 0xe0,
- 0xd7,
- 0x32,
- 0x2a,
- 0xd2,
- 0x1a,
- 0x64,
- 0x41,
- 0xef,
- 0x49,
- 0xff,
- 0xc3,
- 0x8c,
- 0x54,
- 0xf9,
- 0x67,
- 0x74,
- 0x30,
- 0x1e,
- 0x70,
- 0x2e,
- 0xb7,
- 0x12,
- 0x09,
- 0xfe,
- };
-
- if (part->parsed_data.len > 0) {
- rspamd_cryptobox_hash(part->digest,
- part->parsed_data.begin, part->parsed_data.len,
- hash_key, sizeof(hash_key));
- }
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_normal_part(struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_content_type *ct,
- GError **err)
- {
- rspamd_fstring_t *parsed;
- gssize r;
-
- g_assert(part != NULL);
-
- rspamd_mime_part_get_cte(task, part->raw_headers, part,
- part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
- rspamd_mime_part_get_cd(task, part);
-
- switch (part->cte) {
- case RSPAMD_CTE_7BIT:
- case RSPAMD_CTE_8BIT:
- case RSPAMD_CTE_UNKNOWN:
- if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
- if (part->cte != RSPAMD_CTE_7BIT) {
- /* We have something that has a missing content-type,
- * but it has non-7bit characters.
- *
- * In theory, it is very unsafe to process it as a text part
- * as we unlikely get some sane result
- */
-
- /*
- * On the other hand, there is an evidence that some
- * emails actually rely on that.
- * So we apply an expensive hack here:
- * if there are no 8bit characters -OR- the content is valid
- * UTF8, we can still imply Content-Type == text/plain
- */
-
- if (rspamd_str_has_8bit(part->raw_data.begin, part->raw_data.len) &&
- !rspamd_fast_utf8_validate(part->raw_data.begin, part->raw_data.len)) {
- part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
- part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
- }
- }
- }
-
- if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
- /* Need to copy text as we have couple of in-place change functions */
- parsed = rspamd_fstring_sized_new(part->raw_data.len);
- parsed->len = part->raw_data.len;
- memcpy(parsed->str, part->raw_data.begin, parsed->len);
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
- }
- else {
- part->parsed_data.begin = part->raw_data.begin;
- part->parsed_data.len = part->raw_data.len;
- }
- break;
- case RSPAMD_CTE_QP:
- parsed = rspamd_fstring_sized_new(part->raw_data.len);
- r = rspamd_decode_qp_buf(part->raw_data.begin, part->raw_data.len,
- parsed->str, parsed->allocated);
- if (r != -1) {
- parsed->len = r;
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
- }
- else {
- msg_err_task("invalid quoted-printable encoded part, assume 8bit");
- if (part->ct) {
- part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
- }
- part->cte = RSPAMD_CTE_8BIT;
- memcpy(parsed->str, part->raw_data.begin, part->raw_data.len);
- parsed->len = part->raw_data.len;
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
- }
- break;
- case RSPAMD_CTE_B64:
- parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
- rspamd_cryptobox_base64_decode(part->raw_data.begin,
- part->raw_data.len,
- parsed->str, &parsed->len);
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
- break;
- case RSPAMD_CTE_UUE:
- parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
- r = rspamd_decode_uue_buf(part->raw_data.begin, part->raw_data.len,
- parsed->str, parsed->allocated);
- rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
- if (r != -1) {
- parsed->len = r;
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- }
- else {
- msg_err_task("invalid uuencoding in encoded part, assume 8bit");
- if (part->ct) {
- part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
- }
- part->cte = RSPAMD_CTE_8BIT;
- parsed->len = MIN(part->raw_data.len, parsed->allocated);
- memcpy(parsed->str, part->raw_data.begin, parsed->len);
- rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- }
- break;
- default:
- g_assert_not_reached();
- }
-
- part->part_number = MESSAGE_FIELD(task, parts)->len;
- part->urls = g_ptr_array_new();
- g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
- msg_debug_mime("parsed data part %T/%T of length %z (%z orig), %s cte",
- &part->ct->type, &part->ct->subtype, part->parsed_data.len,
- part->raw_data.len, rspamd_cte_to_string(part->cte));
- rspamd_mime_parser_calc_digest(part);
-
- if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) {
- CMS_ContentInfo *cms;
- const unsigned char *der_beg = part->parsed_data.begin;
- cms = d2i_CMS_ContentInfo(NULL, &der_beg, part->parsed_data.len);
-
- if (cms) {
- const ASN1_OBJECT *asn_ct = CMS_get0_eContentType(cms);
- int ct_nid = OBJ_obj2nid(asn_ct);
-
- if (ct_nid == NID_pkcs7_data) {
- BIO *bio = BIO_new_mem_buf(part->parsed_data.begin,
- part->parsed_data.len);
-
- PKCS7 *p7;
- p7 = d2i_PKCS7_bio(bio, NULL);
-
- if (p7) {
- ct_nid = OBJ_obj2nid(p7->type);
-
- if (ct_nid == NID_pkcs7_signed) {
- PKCS7 *p7_signed_content = p7->d.sign->contents;
-
- ct_nid = OBJ_obj2nid(p7_signed_content->type);
-
- if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) {
- int ret;
-
- msg_debug_mime("found an additional part inside of "
- "smime structure of type %T/%T; length=%d",
- &ct->type, &ct->subtype, p7_signed_content->d.data->length);
- /*
- * Since ASN.1 structures are freed, we need to copy
- * the content
- */
- gchar *cpy = rspamd_mempool_alloc(task->task_pool,
- p7_signed_content->d.data->length);
- memcpy(cpy, p7_signed_content->d.data->data,
- p7_signed_content->d.data->length);
- ret = rspamd_mime_process_multipart_node(task,
- st, NULL,
- cpy, cpy + p7_signed_content->d.data->length,
- TRUE, err);
-
- PKCS7_free(p7);
- BIO_free(bio);
- CMS_ContentInfo_free(cms);
-
- return ret;
- }
- }
-
- PKCS7_free(p7);
- }
-
- BIO_free(bio);
- }
-
- CMS_ContentInfo_free(cms);
- }
- }
-
- return RSPAMD_MIME_PARSE_OK;
- }
-
- struct rspamd_mime_multipart_cbdata {
- struct rspamd_task *task;
- struct rspamd_mime_part *multipart;
- struct rspamd_mime_parser_ctx *st;
- const gchar *part_start;
- rspamd_ftok_t *cur_boundary;
- uint64_t bhash;
- GError **err;
- };
-
- static enum rspamd_mime_parse_error
- rspamd_mime_process_multipart_node(struct rspamd_task *task,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_part *multipart,
- const gchar *start, const gchar *end,
- gboolean is_finished,
- GError **err)
- {
- struct rspamd_content_type *ct, *sel = NULL;
- struct rspamd_mime_header *hdr = NULL, *cur;
- struct rspamd_mime_part *npart;
- GString str;
- goffset hdr_pos, body_pos;
- enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
-
-
- str.str = (gchar *) start;
- str.len = end - start;
-
- if (*start == '\n' || *start == '\r') {
- /*
- * We have a part that starts from newline which means that
- * there are completely no headers in this part,
- * hence we assume it as a text part
- */
- hdr_pos = 0;
- body_pos = 0;
-
- if (!is_finished) {
- /* Ignore garbage */
- const gchar *p = start;
- gboolean seen_something = FALSE;
-
- while (p < end) {
- if (g_ascii_isalnum(*p)) {
- seen_something = TRUE;
- break;
- }
- p++;
- }
-
- if (!seen_something) {
- return RSPAMD_MIME_PARSE_NO_PART;
- }
- }
- }
- else {
- hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
- }
-
- npart = rspamd_mempool_alloc0(task->task_pool,
- sizeof(struct rspamd_mime_part));
- npart->parent_part = multipart;
- npart->raw_headers = rspamd_message_headers_new();
- npart->headers_order = NULL;
-
- if (multipart) {
- if (multipart->specific.mp->children == NULL) {
- multipart->specific.mp->children = g_ptr_array_sized_new(2);
- }
-
- g_ptr_array_add(multipart->specific.mp->children, npart);
- }
-
- if (hdr_pos > 0 && hdr_pos < str.len) {
- npart->raw_headers_str = str.str;
- npart->raw_headers_len = hdr_pos;
- npart->raw_data.begin = start + body_pos;
- npart->raw_data.len = (end - start) - body_pos;
-
- if (npart->raw_headers_len > 0) {
- rspamd_mime_headers_process(task, npart->raw_headers,
- &npart->headers_order,
- npart->raw_headers_str,
- npart->raw_headers_len,
- FALSE);
-
- /* Preserve the natural order */
- if (npart->headers_order) {
- LL_REVERSE2(npart->headers_order, ord_next);
- }
- }
-
- hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
- "Content-Type", FALSE);
- }
- else {
- npart->raw_headers_str = 0;
- npart->raw_headers_len = 0;
- npart->raw_data.begin = start;
- npart->raw_data.len = end - start;
- }
-
-
- if (hdr != NULL) {
-
- DL_FOREACH(hdr, cur)
- {
- ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
- task->task_pool);
-
- /* Here we prefer multipart content-type or any content-type */
- if (ct) {
- if (sel == NULL) {
- sel = ct;
- }
- else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- sel = ct;
- }
- }
- }
- }
-
- if (sel == NULL) {
- sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
- RSPAMD_FTOK_ASSIGN(&sel->type, "text");
- RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
- }
-
- npart->ct = sel;
-
- if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- st->nesting++;
- g_ptr_array_add(st->stack, npart);
- npart->part_type = RSPAMD_MIME_PART_MULTIPART;
- npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
- sizeof(struct rspamd_mime_multipart));
- memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
- sizeof(rspamd_ftok_t));
- ret = rspamd_mime_parse_multipart_part(task, npart, st, err);
- }
- else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
- st->nesting++;
- g_ptr_array_add(st->stack, npart);
- npart->part_type = RSPAMD_MIME_PART_MESSAGE;
-
- if ((ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err)) == RSPAMD_MIME_PARSE_OK) {
- ret = rspamd_mime_parse_message(task, npart, st, err);
- }
- }
- else {
- ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err);
- }
-
- return ret;
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_multipart_cb(struct rspamd_task *task,
- struct rspamd_mime_part *multipart,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_multipart_cbdata *cb,
- struct rspamd_mime_boundary *b)
- {
- const gchar *pos = st->start + b->boundary;
- enum rspamd_mime_parse_error ret;
-
- task = cb->task;
-
- /* Now check boundary */
- if (!cb->part_start) {
- cb->part_start = st->start + b->start;
- st->pos = cb->part_start;
- }
- else {
- /*
- * We have seen the start of the boundary,
- * but it might be unsuitable (e.g. in broken headers)
- */
- if (cb->part_start < pos && cb->cur_boundary) {
-
- if ((ret = rspamd_mime_process_multipart_node(task, cb->st,
- cb->multipart, cb->part_start, pos, TRUE, cb->err)) != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
-
- if (b->start > 0) {
- /* Go towards the next part */
- cb->part_start = st->start + b->start;
- cb->st->pos = cb->part_start;
- }
- }
- else {
- /* We have an empty boundary, do nothing */
- }
- }
-
- return RSPAMD_MIME_PARSE_OK;
- }
-
- static enum rspamd_mime_parse_error
- rspamd_multipart_boundaries_filter(struct rspamd_task *task,
- struct rspamd_mime_part *multipart,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_multipart_cbdata *cb)
- {
- struct rspamd_mime_boundary *cur;
- goffset last_offset;
- guint i, sel = 0;
- enum rspamd_mime_parse_error ret;
-
- last_offset = (multipart->raw_data.begin - st->start) +
- multipart->raw_data.len;
-
- /* Find the first offset suitable for this part */
- for (i = 0; i < st->boundaries->len; i++) {
- cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
-
- if (cur->start >= multipart->raw_data.begin - st->start) {
- if (cb->cur_boundary) {
- /* Check boundary */
- msg_debug_mime("compare %L and %L (and %L)",
- cb->bhash, cur->hash, cur->closed_hash);
-
- if (cb->bhash == cur->hash) {
- sel = i;
- break;
- }
- else if (cb->bhash == cur->closed_hash) {
- /* Not a closing element in fact */
- cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
- cur->hash = cur->closed_hash;
- sel = i;
- break;
- }
- }
- else {
- /* Set current boundary */
- cb->cur_boundary = rspamd_mempool_alloc(task->task_pool,
- sizeof(rspamd_ftok_t));
- cb->cur_boundary->begin = st->start + cur->boundary;
- cb->cur_boundary->len = 0;
- cb->bhash = cur->hash;
- sel = i;
- break;
- }
- }
- }
-
- /* Now we can go forward with boundaries that are same to what we have */
- for (i = sel; i < st->boundaries->len; i++) {
- cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
-
- if (cur->boundary > last_offset) {
- break;
- }
-
- if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
- if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
- cb, cur)) != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
-
- if (cur->closed_hash == cb->bhash) {
- /* We have again fake closed hash */
- cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
- cur->hash = cur->closed_hash;
- }
-
- if (RSPAMD_BOUNDARY_IS_CLOSED(cur)) {
- /* We also might check the next boundary... */
- if (i < st->boundaries->len - 1) {
- cur = &g_array_index(st->boundaries,
- struct rspamd_mime_boundary, i + 1);
-
- if (cur->hash == cb->bhash) {
- continue;
- }
- else if (cur->closed_hash == cb->bhash) {
- /* We have again fake closed hash */
- cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
- cur->hash = cur->closed_hash;
- continue;
- }
- }
-
- break;
- }
- }
- }
-
- if (i == st->boundaries->len && cb->cur_boundary) {
- /* Process the last part */
- struct rspamd_mime_boundary fb;
-
- fb.boundary = last_offset;
- fb.start = -1;
-
- if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
- cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
- }
-
- return RSPAMD_MIME_PARSE_OK;
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_multipart_part(struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err)
- {
- struct rspamd_mime_multipart_cbdata cbdata;
- enum rspamd_mime_parse_error ret;
-
- if (st->nesting > max_nested) {
- g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
- st->nesting);
- return RSPAMD_MIME_PARSE_NESTING;
- }
-
- part->part_number = MESSAGE_FIELD(task, parts)->len;
- part->urls = g_ptr_array_new();
- g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
- st->nesting++;
- rspamd_mime_part_get_cte(task, part->raw_headers, part, FALSE);
-
- st->pos = part->raw_data.begin;
- cbdata.multipart = part;
- cbdata.task = task;
- cbdata.st = st;
- cbdata.part_start = NULL;
- cbdata.err = err;
-
- if (part->ct->boundary.len > 0) {
- /* We know our boundary */
- cbdata.cur_boundary = &part->ct->boundary;
- rspamd_cryptobox_siphash((guchar *) &cbdata.bhash,
- cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
- lib_ctx->hkey);
- msg_debug_mime("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
- }
- else {
- /* Guess boundary */
- cbdata.cur_boundary = NULL;
- cbdata.bhash = 0;
- }
-
- ret = rspamd_multipart_boundaries_filter(task, part, st, &cbdata);
- /* Cleanup stack */
- st->nesting--;
- g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
-
- return ret;
- }
-
- /* Process boundary like structures in a message */
- static gint
- rspamd_mime_preprocess_cb(struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context)
- {
- const gchar *end = text + len, *p = text + match_pos, *bend;
- gsize blen;
- gboolean closing = FALSE;
- struct rspamd_mime_boundary b;
- struct rspamd_mime_parser_ctx *st = context;
- struct rspamd_task *task;
-
- task = st->task;
-
- if (G_LIKELY(p < end)) {
-
- blen = 0;
-
- while (p < end) {
- if (*p == '\r' || *p == '\n') {
- break;
- }
-
- blen++;
- p++;
- }
-
- if (blen > 0) {
- /* We have found something like boundary */
- p = text + match_pos;
- bend = p + blen - 1;
-
- if (*bend == '-') {
- /* We need to verify last -- */
- if (bend > p + 1 && *(bend - 1) == '-') {
- closing = TRUE;
- bend--;
- blen -= 2;
- }
- else {
- /* Not a closing boundary somehow, e.g. if a boundary=='-' */
- bend++;
- }
- }
- else {
- bend++;
- }
-
- while (bend < end) {
- if (*bend == '\r') {
- bend++;
-
- /* \r\n */
- if (bend < end && *bend == '\n') {
- bend++;
- }
- }
- else if (*bend == '\n') {
- /* \n */
- bend++;
- }
- else if (g_ascii_isspace(*bend)) {
- /* Spaces in the same line, skip them */
- bend++;
- continue;
- }
-
- break;
- }
-
- b.boundary = p - st->start - 2;
- b.start = bend - st->start;
-
- /* Small optimisation as boundaries are usually short strings */
- gchar *lc_copy, lc_copy_buf[128];
-
- if (blen + 2 < sizeof(lc_copy_buf)) {
- lc_copy = lc_copy_buf;
- }
- else {
- lc_copy = g_malloc(blen + 2);
- }
-
- if (closing) {
- memcpy(lc_copy, p, blen + 2);
- rspamd_str_lc(lc_copy, blen + 2);
- }
- else {
- memcpy(lc_copy, p, blen);
- rspamd_str_lc(lc_copy, blen);
- }
-
- rspamd_cryptobox_siphash((guchar *) &b.hash, lc_copy, blen,
- lib_ctx->hkey);
- msg_debug_mime("normal hash: %*s -> %L, %d boffset, %d data offset",
- (gint) blen, lc_copy, b.hash, (int) b.boundary, (int) b.start);
-
- if (closing) {
- b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
- rspamd_cryptobox_siphash((guchar *) &b.closed_hash, lc_copy,
- blen + 2,
- lib_ctx->hkey);
- msg_debug_mime("closing hash: %*s -> %L, %d boffset, %d data offset",
- (gint) blen + 2, lc_copy,
- b.closed_hash,
- (int) b.boundary, (int) b.start);
- }
- else {
- b.flags = 0;
- b.closed_hash = 0;
- }
-
- /* Check if a string has been allocated on the heap */
- if (blen + 2 >= sizeof(lc_copy_buf)) {
- g_free(lc_copy);
- }
- g_array_append_val(st->boundaries, b);
- }
- }
-
- return 0;
- }
-
- static goffset
- rspamd_mime_parser_headers_heuristic(GString *input, goffset *body_start)
- {
- const gsize default_max_len = 76;
- gsize max_len = MIN(input->len, default_max_len);
- const gchar *p, *end;
- enum {
- st_before_colon = 0,
- st_colon,
- st_spaces_after_colon,
- st_value,
- st_error
- } state = st_before_colon;
-
- p = input->str;
- end = p + max_len;
-
- while (p < end) {
- switch (state) {
- case st_before_colon:
- if (G_UNLIKELY(*p == ':')) {
- state = st_colon;
- }
- else if (G_UNLIKELY(!g_ascii_isgraph(*p))) {
- state = st_error;
- }
-
- p++;
- break;
- case st_colon:
- if (g_ascii_isspace(*p)) {
- state = st_spaces_after_colon;
- }
- else {
- state = st_value;
- }
- p++;
- break;
- case st_spaces_after_colon:
- if (!g_ascii_isspace(*p)) {
- state = st_value;
- }
- p++;
- break;
- case st_value:
- /* We accept any value */
- goto end;
- break;
- case st_error:
- return (-1);
- break;
- }
- }
-
- end:
- if (state == st_value) {
- if (body_start) {
- *body_start = input->len;
- }
-
- return input->len;
- }
-
- return (-1);
- }
-
- static void
- rspamd_mime_preprocess_message(struct rspamd_task *task,
- struct rspamd_mime_part *top,
- struct rspamd_mime_parser_ctx *st)
- {
-
- if (top->raw_data.begin >= st->pos) {
- rspamd_multipattern_lookup(lib_ctx->mp_boundary,
- top->raw_data.begin - 1,
- top->raw_data.len + 1,
- rspamd_mime_preprocess_cb, st, NULL);
- }
- else {
- rspamd_multipattern_lookup(lib_ctx->mp_boundary,
- st->pos,
- st->end - st->pos,
- rspamd_mime_preprocess_cb, st, NULL);
- }
- }
-
- static void
- rspamd_mime_parse_stack_free(struct rspamd_mime_parser_ctx *st)
- {
- if (st) {
- g_ptr_array_free(st->stack, TRUE);
- g_array_free(st->boundaries, TRUE);
- g_free(st);
- }
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_message(struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err)
- {
- struct rspamd_content_type *ct, *sel = NULL;
- struct rspamd_mime_header *hdr = NULL, *cur;
- const gchar *pbegin, *p;
- gsize plen, len;
- struct rspamd_mime_part *npart;
- goffset hdr_pos, body_pos;
- guint i;
- enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
- GString str;
- struct rspamd_mime_parser_ctx *nst = st;
-
- if (st->nesting > max_nested) {
- g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
- st->nesting);
- return RSPAMD_MIME_PARSE_NESTING;
- }
-
- /* Allocate real part */
- npart = rspamd_mempool_alloc0(task->task_pool,
- sizeof(struct rspamd_mime_part));
-
- if (part == NULL) {
- /* Top level message */
- p = task->msg.begin;
- len = task->msg.len;
-
- str.str = (gchar *) p;
- str.len = len;
-
- hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
-
- if (hdr_pos > 0 && hdr_pos < str.len) {
-
- MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
- MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
- MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos;
-
- if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
- rspamd_mime_headers_process(task,
- MESSAGE_FIELD(task, raw_headers),
- &MESSAGE_FIELD(task, headers_order),
- MESSAGE_FIELD(task, raw_headers_content).begin,
- MESSAGE_FIELD(task, raw_headers_content).len,
- TRUE);
- npart->raw_headers = rspamd_message_headers_ref(
- MESSAGE_FIELD(task, raw_headers));
-
- /* Preserve the natural order */
- if (MESSAGE_FIELD(task, headers_order)) {
- LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
- }
- }
-
- hdr = rspamd_message_get_header_from_hash(
- MESSAGE_FIELD(task, raw_headers),
- "Content-Type", FALSE);
- }
- else {
- /* First apply heuristic, maybe we have just headers */
- hdr_pos = rspamd_mime_parser_headers_heuristic(&str, &body_pos);
-
- if (hdr_pos > 0 && hdr_pos <= str.len) {
- MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
- MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
- MESSAGE_FIELD(task, raw_headers_content).body_start = str.str +
- body_pos;
-
- if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
- rspamd_mime_headers_process(task,
- MESSAGE_FIELD(task, raw_headers),
- &MESSAGE_FIELD(task, headers_order),
- MESSAGE_FIELD(task, raw_headers_content).begin,
- MESSAGE_FIELD(task, raw_headers_content).len,
- TRUE);
- npart->raw_headers = rspamd_message_headers_ref(
- MESSAGE_FIELD(task, raw_headers));
-
- /* Preserve the natural order */
- if (MESSAGE_FIELD(task, headers_order)) {
- LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
- }
- }
-
- hdr = rspamd_message_get_header_from_hash(
- MESSAGE_FIELD(task, raw_headers),
- "Content-Type", FALSE);
- task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
- }
- else {
- body_pos = 0;
- }
- }
-
- pbegin = st->start + body_pos;
- plen = st->end - pbegin;
- npart->headers_order = NULL;
- }
- else {
- /*
- * Here are dragons:
- * We allocate new parser context as we need to shift pointers
- */
- nst = g_malloc0(sizeof(*st));
- nst->stack = g_ptr_array_sized_new(4);
- nst->boundaries = g_array_sized_new(FALSE, FALSE,
- sizeof(struct rspamd_mime_boundary), 8);
- nst->start = part->parsed_data.begin;
- nst->end = nst->start + part->parsed_data.len;
- nst->pos = nst->start;
- nst->task = st->task;
- nst->nesting = st->nesting;
- st->nesting++;
-
- str.str = (gchar *) part->parsed_data.begin;
- str.len = part->parsed_data.len;
-
- hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
- npart->raw_headers = rspamd_message_headers_new();
- npart->headers_order = NULL;
-
- if (hdr_pos > 0 && hdr_pos < str.len) {
- npart->raw_headers_str = str.str;
- npart->raw_headers_len = hdr_pos;
- npart->raw_data.begin = str.str + body_pos;
-
- if (npart->raw_headers_len > 0) {
- rspamd_mime_headers_process(task,
- npart->raw_headers,
- &npart->headers_order,
- npart->raw_headers_str,
- npart->raw_headers_len,
- FALSE);
-
- /* Preserve the natural order */
- if (npart->headers_order) {
- LL_REVERSE2(npart->headers_order, ord_next);
- }
- }
-
- hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
- "Content-Type", FALSE);
- }
- else {
- body_pos = 0;
- }
-
- pbegin = part->parsed_data.begin + body_pos;
- plen = part->parsed_data.len - body_pos;
- }
-
- npart->raw_data.begin = pbegin;
- npart->raw_data.len = plen;
- npart->parent_part = part;
-
- if (hdr == NULL) {
- sel = NULL;
- }
- else {
- DL_FOREACH(hdr, cur)
- {
- ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
- task->task_pool);
-
- /* Here we prefer multipart content-type or any content-type */
- if (ct) {
- if (sel == NULL) {
- sel = ct;
- }
- else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- sel = ct;
- }
- }
- }
- }
-
- if (sel == NULL) {
- /* For messages we automatically assume plaintext */
- msg_info_task("cannot find content-type for a message, assume text/plain");
- sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
- sel->flags = RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_MISSING;
- RSPAMD_FTOK_ASSIGN(&sel->type, "text");
- RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
- }
-
- npart->ct = sel;
-
- if ((part == NULL || nst != st) &&
- (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART | RSPAMD_CONTENT_TYPE_MESSAGE))) {
- /* Not a trivial message, need to preprocess */
- rspamd_mime_preprocess_message(task, npart, nst);
- }
-
- if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- g_ptr_array_add(nst->stack, npart);
- nst->nesting++;
- npart->part_type = RSPAMD_MIME_PART_MULTIPART;
- npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
- sizeof(struct rspamd_mime_multipart));
- memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
- sizeof(rspamd_ftok_t));
- ret = rspamd_mime_parse_multipart_part(task, npart, nst, err);
- }
- else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
- if ((ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err)) == RSPAMD_MIME_PARSE_OK) {
- npart->part_type = RSPAMD_MIME_PART_MESSAGE;
- ret = rspamd_mime_parse_message(task, npart, nst, err);
- }
- }
- else {
- ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err);
- }
-
- if (ret != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
-
- if (part && st->stack->len > 0) {
- /* Remove message part from the parent stack */
- g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
- st->nesting--;
- }
-
- /* Process leftovers for boundaries */
- if (nst->boundaries) {
- struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
- *end_boundary = NULL;
- goffset cur_offset = nst->pos - nst->start,
- end_offset = st->end - st->start;
- guint sel_idx = 0;
-
- for (;;) {
- start_boundary = NULL;
-
- for (i = sel_idx; i < nst->boundaries->len; i++) {
- boundary = &g_array_index(nst->boundaries,
- struct rspamd_mime_boundary, i);
-
- if (boundary->start > cur_offset &&
- boundary->boundary < end_offset &&
- !RSPAMD_BOUNDARY_IS_CLOSED(boundary)) {
- start_boundary = boundary;
- sel_idx = i;
- break;
- }
- }
-
- if (start_boundary) {
- const gchar *start, *end;
-
- if (nst->boundaries->len > sel_idx + 1) {
- end_boundary = &g_array_index(nst->boundaries,
- struct rspamd_mime_boundary, sel_idx + 1);
- end = nst->start + end_boundary->boundary;
- }
- else {
- end = nst->end;
- }
-
- sel_idx++;
-
- start = nst->start + start_boundary->start;
-
- if (end > start &&
- (ret = rspamd_mime_process_multipart_node(task, nst,
- NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) {
-
- if (nst != st) {
- rspamd_mime_parse_stack_free(nst);
- }
-
- if (ret == RSPAMD_MIME_PARSE_NO_PART) {
- return RSPAMD_MIME_PARSE_OK;
- }
-
- return ret;
- }
- }
- else {
- break;
- }
- }
- }
-
- if (nst != st) {
- rspamd_mime_parse_stack_free(nst);
- }
-
- return ret;
- }
-
- enum rspamd_mime_parse_error
- rspamd_mime_parse_task(struct rspamd_task *task, GError **err)
- {
- struct rspamd_mime_parser_ctx *st;
- enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
-
- if (lib_ctx == NULL) {
- rspamd_mime_parser_init_lib();
- }
-
- if (++lib_ctx->key_usages > max_key_usages) {
- /* Regenerate siphash key */
- ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
- lib_ctx->key_usages = 0;
- }
-
- st = g_malloc0(sizeof(*st));
- st->stack = g_ptr_array_sized_new(4);
- st->pos = MESSAGE_FIELD(task, raw_headers_content).body_start;
- st->end = task->msg.begin + task->msg.len;
- st->boundaries = g_array_sized_new(FALSE, FALSE,
- sizeof(struct rspamd_mime_boundary), 8);
- st->task = task;
-
- if (st->pos == NULL) {
- st->pos = task->msg.begin;
- }
-
- st->start = task->msg.begin;
- ret = rspamd_mime_parse_message(task, NULL, st, err);
- rspamd_mime_parse_stack_free(st);
-
- return ret;
- }
|