12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "config.h"
- #include "task.h"
- #include "mime_parser.h"
- #include "mime_headers.h"
- #include "message.h"
- #include "multipattern.h"
- #include "contrib/libottery/ottery.h"
-
- struct rspamd_mime_parser_lib_ctx {
- struct rspamd_multipattern *mp_boundary;
- guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
- guint key_usages;
- };
-
- struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
-
- static const guint max_nested = 64;
- static const guint max_key_usages = 10000;
-
- #define msg_debug_mime(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
- rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
-
- INIT_LOG_MODULE(mime)
-
- #define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
- #define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
-
- struct rspamd_mime_boundary {
- goffset boundary;
- goffset start;
- guint64 hash;
- guint64 closed_hash;
- gint flags;
- };
-
- struct rspamd_mime_parser_ctx {
- GPtrArray *stack; /* Stack of parts */
- GArray *boundaries; /* Boundaries found in the whole message */
- const gchar *start;
- const gchar *pos;
- const gchar *end;
- struct rspamd_task *task;
- guint nesting;
- };
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_multipart_part (struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err);
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_message (struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err);
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_normal_part (struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err);
-
-
- #define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
- static GQuark
- rspamd_mime_parser_quark (void)
- {
- return g_quark_from_static_string ("mime-parser");
- }
-
- const gchar*
- rspamd_cte_to_string (enum rspamd_cte ct)
- {
- const gchar *ret = "unknown";
-
- switch (ct) {
- case RSPAMD_CTE_7BIT:
- ret = "7bit";
- break;
- case RSPAMD_CTE_8BIT:
- ret = "8bit";
- break;
- case RSPAMD_CTE_QP:
- ret = "quoted-printable";
- break;
- case RSPAMD_CTE_B64:
- ret = "base64";
- break;
- default:
- break;
- }
-
- return ret;
- }
-
- enum rspamd_cte
- rspamd_cte_from_string (const gchar *str)
- {
- enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
-
- g_assert (str != NULL);
-
- if (strcmp (str, "7bit") == 0) {
- ret = RSPAMD_CTE_7BIT;
- }
- else if (strcmp (str, "8bit") == 0) {
- ret = RSPAMD_CTE_8BIT;
- }
- else if (strcmp (str, "quoted-printable") == 0) {
- ret = RSPAMD_CTE_QP;
- }
- else if (strcmp (str, "base64") == 0) {
- ret = RSPAMD_CTE_B64;
- }
-
- return ret;
- }
-
- static void
- rspamd_mime_parser_init_lib (void)
- {
- lib_ctx = g_malloc0 (sizeof (*lib_ctx));
- lib_ctx->mp_boundary = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
- g_assert (lib_ctx->mp_boundary != NULL);
- rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\r--", 0);
- rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\n--", 0);
- g_assert (rspamd_multipattern_compile (lib_ctx->mp_boundary, NULL));
- ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
- }
-
- static enum rspamd_cte
- rspamd_mime_parse_cte (const gchar *in, gsize len)
- {
- guint64 h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
- in, len, 0xdeadbabe);
- enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
-
- switch (h) {
- case 0xCEDAA7056B4753F7ULL: /* 7bit */
- ret = RSPAMD_CTE_7BIT;
- break;
- case 0x42E0745448B39FC1ULL: /* 8bit */
- case 0x6B169E6B155BADC0ULL: /* binary */
- ret = RSPAMD_CTE_8BIT;
- break;
- case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
- ret = RSPAMD_CTE_QP;
- break;
- case 0x96305588A76DC9A9ULL: /* base64 */
- case 0x171029DE1B0423A9ULL: /* base-64 */
- ret = RSPAMD_CTE_B64;
- break;
- }
-
- return ret;
- }
-
- static enum rspamd_cte
- rspamd_mime_part_get_cte_heuristic (struct rspamd_task *task,
- struct rspamd_mime_part *part)
- {
- const guint check_len = 128;
- guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0;
- gboolean b64_chars = TRUE;
- const guchar *p, *end;
- enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
-
- real_len = MIN (check_len, part->raw_data.len);
- p = (const guchar *)part->raw_data.begin;
- end = p + part->raw_data.len;
-
- while (p < end && g_ascii_isspace (*p)) {
- p ++;
- }
-
- if (end > p + 2) {
- if (*(end - 1) == '=') {
- neqsign ++;
- end --;
- }
-
- if (*(end - 1) == '=') {
- neqsign ++;
- end --;
- }
- }
-
- if (end - p > real_len) {
- end = p + real_len;
- }
-
- while (p < end) {
- if (*p == ' ') {
- nspaces ++;
- }
- else if (*p == '=') {
- neqsign ++;
- p ++;
-
- if (p + 2 < end && g_ascii_isxdigit (*p) && g_ascii_isxdigit (*(p + 1))) {
- p ++;
- nqpencoded ++;
- }
-
- continue;
- }
- else if (*p >= 0x80) {
- n8bit ++;
- b64_chars = FALSE;
- }
- else if (!(g_ascii_isalnum (*p) || *p == '/' || *p == '+')) {
- b64_chars = FALSE;
- }
-
- p ++;
- }
-
- if (b64_chars && neqsign < 2 && nspaces == 0) {
- ret = RSPAMD_CTE_B64;
- }
- else if (n8bit == 0) {
- if (neqsign > 2 && nqpencoded > 2) {
- ret = RSPAMD_CTE_QP;
- }
- else {
- ret = RSPAMD_CTE_7BIT;
- }
- }
- else {
- ret = RSPAMD_CTE_8BIT;
- }
-
- msg_debug_mime ("detected cte: %s", rspamd_cte_to_string (ret));
- return ret;
- }
-
- static void
- rspamd_mime_part_get_cte (struct rspamd_task *task,
- GHashTable *hdrs,
- struct rspamd_mime_part *part,
- gboolean apply_heuristic)
- {
- struct rspamd_mime_header *hdr;
- guint i;
- GPtrArray *hdrs_cte;
- enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
-
- hdrs_cte = rspamd_message_get_header_from_hash (hdrs,
- task->task_pool,
- "Content-Transfer-Encoding", FALSE);
-
- if (hdrs_cte == NULL) {
-
- if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
- !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
- part->cte = part->parent_part->cte;
-
- goto check_cte;
- }
-
- if (apply_heuristic) {
- part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
- msg_info_task ("detected missing CTE for part as: %s",
- rspamd_cte_to_string (part->cte));
- }
-
- part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
- }
- else {
- for (i = 0; i < hdrs_cte->len; i ++) {
- gsize hlen;
- gchar lc_buf[128];
-
- hdr = g_ptr_array_index (hdrs_cte, i);
- hlen = rspamd_snprintf (lc_buf, sizeof (lc_buf), "%s", hdr->value);
- rspamd_str_lc (lc_buf, hlen);
- cte = rspamd_mime_parse_cte (lc_buf, hlen);
-
- if (cte != RSPAMD_CTE_UNKNOWN) {
- part->cte = cte;
- break;
- }
- }
-
- check_cte:
- if (apply_heuristic) {
- if (part->cte == RSPAMD_CTE_UNKNOWN) {
- part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
-
- msg_info_task ("corrected bad CTE for part to: %s",
- rspamd_cte_to_string (part->cte));
- }
- else if (part->cte == RSPAMD_CTE_B64 ||
- part->cte == RSPAMD_CTE_QP) {
- /* Additionally check sanity */
- cte = rspamd_mime_part_get_cte_heuristic (task, part);
-
- if (cte == RSPAMD_CTE_8BIT) {
- msg_info_task (
- "incorrect cte specified for part: %s, %s detected",
- rspamd_cte_to_string (part->cte),
- rspamd_cte_to_string (cte));
- part->cte = cte;
- part->flags |= RSPAMD_MIME_PART_BAD_CTE;
- }
- }
- else {
- msg_debug_mime ("processed cte: %s",
- rspamd_cte_to_string (cte));
- }
- }
- else {
- msg_debug_mime ("processed cte: %s", rspamd_cte_to_string (cte));
- }
- }
- }
- static void
- rspamd_mime_part_get_cd (struct rspamd_task *task, struct rspamd_mime_part *part)
- {
- struct rspamd_mime_header *hdr;
- guint i;
- GPtrArray *hdrs;
- struct rspamd_content_disposition *cd = NULL;
- rspamd_ftok_t srch;
- struct rspamd_content_type_param *found;
-
- hdrs = rspamd_message_get_header_from_hash (part->raw_headers,
- task->task_pool,
- "Content-Disposition", FALSE);
-
-
- if (hdrs == NULL) {
- cd = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cd));
- cd->type = RSPAMD_CT_INLINE;
-
- /* We can also have content dispositon definitions in Content-Type */
- if (part->ct && part->ct->attrs) {
- RSPAMD_FTOK_ASSIGN (&srch, "name");
- found = g_hash_table_lookup (part->ct->attrs, &srch);
-
- if (!found) {
- RSPAMD_FTOK_ASSIGN (&srch, "filename");
- found = g_hash_table_lookup (part->ct->attrs, &srch);
- }
-
- if (found) {
- cd->type = RSPAMD_CT_ATTACHMENT;
- memcpy (&cd->filename, &found->value, sizeof (cd->filename));
- }
- }
- }
- else {
- for (i = 0; i < hdrs->len; i ++) {
- gsize hlen;
-
- hdr = g_ptr_array_index (hdrs, i);
- hlen = strlen (hdr->value);
- cd = rspamd_content_disposition_parse (hdr->value, hlen,
- task->task_pool);
-
- if (cd) {
- msg_debug_mime ("processed content disposition: %s",
- cd->lc_data);
-
- /* We still need to check filename */
- if (cd->filename.len == 0) {
- if (part->ct && part->ct->attrs) {
- RSPAMD_FTOK_ASSIGN (&srch, "name");
- found = g_hash_table_lookup (part->ct->attrs, &srch);
-
- if (!found) {
- RSPAMD_FTOK_ASSIGN (&srch, "filename");
- found = g_hash_table_lookup (part->ct->attrs, &srch);
- }
-
- if (found) {
- cd->type = RSPAMD_CT_ATTACHMENT;
- memcpy (&cd->filename, &found->value,
- sizeof (cd->filename));
- }
- }
- }
- break;
- }
- }
- }
-
- part->cd = cd;
- }
-
- void
- rspamd_mime_parser_calc_digest (struct rspamd_mime_part *part)
- {
- /* Blake2b applied to string 'rspamd' */
- static const guchar hash_key[] = {
- 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c,
- 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74,
- 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32,
- 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92,
- 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0,
- 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef,
- 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74,
- 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe,
- };
-
- if (part->parsed_data.len > 0) {
- rspamd_cryptobox_hash (part->digest,
- part->parsed_data.begin, part->parsed_data.len,
- hash_key, sizeof (hash_key));
- }
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_normal_part (struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err)
- {
- rspamd_fstring_t *parsed;
- gssize r;
-
- g_assert (part != NULL);
-
- rspamd_mime_part_get_cte (task, part->raw_headers, part, TRUE);
- rspamd_mime_part_get_cd (task, part);
-
- switch (part->cte) {
- case RSPAMD_CTE_7BIT:
- case RSPAMD_CTE_8BIT:
- case RSPAMD_CTE_UNKNOWN:
- if (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING) {
- if (part->cte != RSPAMD_CTE_7BIT) {
- /* We have something that has a missing content-type,
- * but it has non-7bit characters.
- *
- * In theory, it is very unsafe to process it as a text part
- * as we unlikely get some sane result
- */
- part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
- part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
- }
- }
-
- if (IS_CT_TEXT (part->ct)) {
- /* Need to copy text as we have couple of in-place change functions */
- parsed = rspamd_fstring_sized_new (part->raw_data.len);
- parsed->len = part->raw_data.len;
- memcpy (parsed->str, part->raw_data.begin, parsed->len);
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
- }
- else {
- part->parsed_data.begin = part->raw_data.begin;
- part->parsed_data.len = part->raw_data.len;
- }
- break;
- case RSPAMD_CTE_QP:
- parsed = rspamd_fstring_sized_new (part->raw_data.len);
- r = rspamd_decode_qp_buf (part->raw_data.begin, part->raw_data.len,
- parsed->str, parsed->allocated);
- if (r != -1) {
- parsed->len = r;
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
- }
- else {
- msg_err_task ("invalid quoted-printable encoded part, assume 8bit");
- part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
- part->cte = RSPAMD_CTE_8BIT;
- memcpy (parsed->str, part->raw_data.begin, part->raw_data.len);
- parsed->len = part->raw_data.len;
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
- }
- break;
- case RSPAMD_CTE_B64:
- parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
- rspamd_cryptobox_base64_decode (part->raw_data.begin,
- part->raw_data.len,
- parsed->str, &parsed->len);
- part->parsed_data.begin = parsed->str;
- part->parsed_data.len = parsed->len;
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
- break;
- default:
- g_assert_not_reached ();
- }
-
- part->id = task->parts->len;
- g_ptr_array_add (task->parts, part);
- msg_debug_mime ("parsed data part %T/%T of length %z (%z orig), %s cte",
- &part->ct->type, &part->ct->subtype, part->parsed_data.len,
- part->raw_data.len, rspamd_cte_to_string (part->cte));
- rspamd_mime_parser_calc_digest (part);
-
- return RSPAMD_MIME_PARSE_OK;
- }
-
- struct rspamd_mime_multipart_cbdata {
- struct rspamd_task *task;
- struct rspamd_mime_part *multipart;
- struct rspamd_mime_parser_ctx *st;
- const gchar *part_start;
- rspamd_ftok_t *cur_boundary;
- guint64 bhash;
- GError **err;
- };
-
- static enum rspamd_mime_parse_error
- rspamd_mime_process_multipart_node (struct rspamd_task *task,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_part *multipart,
- const gchar *start, const gchar *end,
- GError **err)
- {
- struct rspamd_content_type *ct, *sel = NULL;
- struct rspamd_mime_header *hdr;
- GPtrArray *hdrs = NULL;
- struct rspamd_mime_part *npart;
- GString str;
- goffset hdr_pos, body_pos;
- guint i;
- enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
-
-
- str.str = (gchar *)start;
- str.len = end - start;
-
- if (*start == '\n' || *start == '\r') {
- /*
- * We have a part that starts from newline which means that
- * there are completely no headers in this part,
- * hence we assume it as a text part
- */
- hdr_pos = 0;
- body_pos = 0;
- }
- else {
- hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
- }
-
- npart = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_part));
- npart->parent_part = multipart;
- npart->raw_headers = g_hash_table_new_full (rspamd_strcase_hash,
- rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard);
- npart->headers_order = g_queue_new ();
-
- if (multipart) {
- if (multipart->specific.mp->children == NULL) {
- multipart->specific.mp->children = g_ptr_array_sized_new (2);
- }
-
- g_ptr_array_add (multipart->specific.mp->children, npart);
- }
-
- if (hdr_pos > 0 && hdr_pos < str.len) {
- npart->raw_headers_str = str.str;
- npart->raw_headers_len = hdr_pos;
- npart->raw_data.begin = start + body_pos;
- npart->raw_data.len = (end - start) - body_pos;
-
- if (npart->raw_headers_len > 0) {
- rspamd_mime_headers_process (task, npart->raw_headers,
- npart->headers_order,
- npart->raw_headers_str,
- npart->raw_headers_len,
- FALSE);
- }
-
- hdrs = rspamd_message_get_header_from_hash (npart->raw_headers,
- task->task_pool,
- "Content-Type", FALSE);
-
- }
- else {
- npart->raw_headers_str = 0;
- npart->raw_headers_len = 0;
- npart->raw_data.begin = start;
- npart->raw_data.len = end - start;
- }
-
-
- if (hdrs != NULL) {
-
- for (i = 0; i < hdrs->len; i ++) {
- hdr = g_ptr_array_index (hdrs, i);
- ct = rspamd_content_type_parse (hdr->value, strlen (hdr->value),
- task->task_pool);
-
- /* Here we prefer multipart content-type or any content-type */
- if (ct) {
- if (sel == NULL) {
- sel = ct;
- }
- else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- sel = ct;
- }
- }
- }
- }
-
- if (sel == NULL) {
- sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
- RSPAMD_FTOK_ASSIGN (&sel->type, "text");
- RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
- }
-
- npart->ct = sel;
-
- if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- st->nesting ++;
- g_ptr_array_add (st->stack, npart);
- npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_multipart));
- memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
- sizeof (rspamd_ftok_t));
- ret = rspamd_mime_parse_multipart_part (task, npart, st, err);
- }
- else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
- st->nesting ++;
- g_ptr_array_add (st->stack, npart);
-
- if ((ret = rspamd_mime_parse_normal_part (task, npart, st, err))
- == RSPAMD_MIME_PARSE_OK) {
- ret = rspamd_mime_parse_message (task, npart, st, err);
- }
- }
- else {
- ret = rspamd_mime_parse_normal_part (task, npart, st, err);
- }
-
- return ret;
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_multipart_cb (struct rspamd_task *task,
- struct rspamd_mime_part *multipart,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_multipart_cbdata *cb,
- struct rspamd_mime_boundary *b)
- {
- const gchar *pos = st->start + b->boundary;
- enum rspamd_mime_parse_error ret;
-
- task = cb->task;
-
- /* Now check boundary */
- if (!cb->part_start) {
- cb->part_start = st->start + b->start;
- st->pos = cb->part_start;
- }
- else {
- /* We have seen the start of the boundary */
- if (cb->part_start < pos) {
- /* We should have seen some boundary */
- g_assert (cb->cur_boundary != NULL);
-
-
- if ((ret = rspamd_mime_process_multipart_node (task, cb->st,
- cb->multipart, cb->part_start, pos, cb->err))
- != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
-
- /* Go towards the next part */
- cb->part_start = st->start + b->start;
- cb->st->pos = cb->part_start;
- }
- else {
- /* We have an empty boundary, do nothing */
- }
- }
-
- return RSPAMD_MIME_PARSE_OK;
- }
-
- static enum rspamd_mime_parse_error
- rspamd_multipart_boundaries_filter (struct rspamd_task *task,
- struct rspamd_mime_part *multipart,
- struct rspamd_mime_parser_ctx *st,
- struct rspamd_mime_multipart_cbdata *cb)
- {
- struct rspamd_mime_boundary *cur;
- goffset last_offset;
- guint i, sel = 0;
- enum rspamd_mime_parse_error ret;
-
- last_offset = (multipart->raw_data.begin - st->start) +
- multipart->raw_data.len;
-
- /* Find the first offset suitable for this part */
- for (i = 0; i < st->boundaries->len; i ++) {
- cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
-
- if (cur->start >= multipart->raw_data.begin - st->start) {
- if (cb->cur_boundary) {
- /* Check boundary */
- msg_debug_mime ("compare %L and %L (and %L)",
- cb->bhash, cur->hash, cur->closed_hash);
-
- if (cb->bhash == cur->hash) {
- sel = i;
- break;
- }
- else if (cb->bhash == cur->closed_hash) {
- /* Not a closing element in fact */
- cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
- cur->hash = cur->closed_hash;
- sel = i;
- break;
- }
- }
- else {
- /* Set current boundary */
- cb->cur_boundary = rspamd_mempool_alloc (task->task_pool,
- sizeof (rspamd_ftok_t));
- cb->cur_boundary->begin = st->start + cur->boundary;
- cb->cur_boundary->len = 0;
- cb->bhash = cur->hash;
- sel = i;
- break;
- }
- }
- }
-
- /* Now we can go forward with boundaries that are same to what we have */
- for (i = sel; i < st->boundaries->len; i ++) {
- cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
-
- if (cur->boundary > last_offset) {
- break;
- }
-
- if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
- if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
- cb, cur)) != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
-
- if (cur->closed_hash == cb->bhash) {
- /* We have again fake closed hash */
- cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
- cur->hash = cur->closed_hash;
- }
-
- if (RSPAMD_BOUNDARY_IS_CLOSED (cur)) {
- /* We also might check the next boundary... */
- if (i < st->boundaries->len - 1) {
- cur = &g_array_index (st->boundaries,
- struct rspamd_mime_boundary, i + 1);
-
- if (cur->hash == cb->bhash) {
- continue;
- }
- else if (cur->closed_hash == cb->bhash) {
- /* We have again fake closed hash */
- cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
- cur->hash = cur->closed_hash;
- continue;
- }
- }
-
- break;
- }
- }
- }
-
- if (i == st->boundaries->len && cb->cur_boundary) {
- /* Process the last part */
- struct rspamd_mime_boundary fb;
-
- fb.boundary = last_offset;
-
- if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
- cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
- }
-
- return RSPAMD_MIME_PARSE_OK;
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_multipart_part (struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err)
- {
- struct rspamd_mime_multipart_cbdata cbdata;
- enum rspamd_mime_parse_error ret;
-
- if (st->nesting > max_nested) {
- g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
- st->nesting);
- return RSPAMD_MIME_PARSE_NESTING;
- }
-
- part->id = task->parts->len;
- g_ptr_array_add (task->parts, part);
- st->nesting ++;
- rspamd_mime_part_get_cte (task, part->raw_headers, part, FALSE);
-
- st->pos = part->raw_data.begin;
- cbdata.multipart = part;
- cbdata.task = task;
- cbdata.st = st;
- cbdata.part_start = NULL;
- cbdata.err = err;
-
- if (part->ct->boundary.len > 0) {
- /* We know our boundary */
- cbdata.cur_boundary = &part->ct->boundary;
- rspamd_cryptobox_siphash ((guchar *)&cbdata.bhash,
- cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
- lib_ctx->hkey);
- msg_debug_mime ("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
- }
- else {
- /* Guess boundary */
- cbdata.cur_boundary = NULL;
- cbdata.bhash = 0;
- }
-
- ret = rspamd_multipart_boundaries_filter (task, part, st, &cbdata);
- /* Cleanup stack */
- st->nesting --;
- g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
-
- return ret;
- }
-
- /* Process boundary like structures in a message */
- static gint
- rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context)
- {
- const gchar *end = text + len, *p = text + match_pos, *bend;
- gchar *lc_copy;
- gsize blen;
- gboolean closing = FALSE;
- struct rspamd_mime_boundary b;
- struct rspamd_mime_parser_ctx *st = context;
- struct rspamd_task *task;
-
- task = st->task;
-
- if (G_LIKELY (p < end)) {
- gboolean seen_non_dash = FALSE;
-
- blen = 0;
-
- while (p < end) {
- if (*p == '\r' || *p == '\n') {
- break;
- }
- else if (*p != '-') {
- seen_non_dash = TRUE;
- }
-
- blen ++;
- p ++;
- }
-
- if (blen > 0 && seen_non_dash) {
- /* We have found something like boundary */
- p = text + match_pos;
- bend = p + blen - 1;
-
- if (*bend == '-') {
- /* We need to verify last -- */
- if (bend > p + 1 && *(bend - 1) == '-') {
- closing = TRUE;
- bend --;
- blen -= 2;
- }
- else {
- /* Not a closing boundary somehow */
- bend ++;
- }
- }
- else {
- bend ++;
- }
-
- if (*bend == '\r') {
- bend ++;
-
- /* \r\n */
- if (*bend == '\n') {
- bend ++;
- }
- }
- else {
- /* \n */
- bend ++;
- }
-
- b.boundary = p - st->start - 2;
- b.start = bend - st->start;
-
- if (closing) {
- lc_copy = g_malloc (blen + 2);
- memcpy (lc_copy, p, blen + 2);
- rspamd_str_lc (lc_copy, blen + 2);
- }
- else {
- lc_copy = g_malloc (blen);
- memcpy (lc_copy, p, blen);
- rspamd_str_lc (lc_copy, blen);
- }
-
- rspamd_cryptobox_siphash ((guchar *)&b.hash, lc_copy, blen,
- lib_ctx->hkey);
- msg_debug_mime ("normal hash: %*s -> %L", (gint)blen, lc_copy, b.hash);
-
- if (closing) {
- b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
- rspamd_cryptobox_siphash ((guchar *)&b.closed_hash, lc_copy,
- blen + 2,
- lib_ctx->hkey);
- msg_debug_mime ("closing hash: %*s -> %L", (gint)blen + 2, lc_copy,
- b.closed_hash);
- }
- else {
- b.flags = 0;
- b.closed_hash = 0;
- }
-
- g_free (lc_copy);
- g_array_append_val (st->boundaries, b);
- }
- }
-
- return 0;
- }
-
- static goffset
- rspamd_mime_parser_headers_heuristic (GString *input, goffset *body_start)
- {
- const gsize default_max_len = 76;
- gsize max_len = MIN (input->len, default_max_len);
- const gchar *p, *end;
- enum {
- st_before_colon = 0,
- st_colon,
- st_spaces_after_colon,
- st_value,
- st_error
- } state = st_before_colon;
-
- p = input->str;
- end = p + max_len;
-
- while (p < end) {
- switch (state) {
- case st_before_colon:
- if (G_UNLIKELY (*p == ':')) {
- state = st_colon;
- }
- else if (G_UNLIKELY (!g_ascii_isgraph (*p))) {
- state = st_error;
- }
-
- p ++;
- break;
- case st_colon:
- if (g_ascii_isspace (*p)) {
- state = st_spaces_after_colon;
- }
- else {
- state = st_value;
- }
- p ++;
- break;
- case st_spaces_after_colon:
- if (!g_ascii_isspace (*p)) {
- state = st_value;
- }
- p ++;
- break;
- case st_value:
- /* We accept any value */
- goto end;
- break;
- case st_error:
- return (-1);
- break;
- }
- }
-
- end:
- if (state == st_value) {
- if (body_start) {
- *body_start = input->len;
- }
-
- return input->len;
- }
-
- return (-1);
- }
-
- static void
- rspamd_mime_preprocess_message (struct rspamd_task *task,
- struct rspamd_mime_part *top,
- struct rspamd_mime_parser_ctx *st)
- {
-
- if (top->raw_data.begin >= st->pos) {
- rspamd_multipattern_lookup (lib_ctx->mp_boundary,
- top->raw_data.begin - 1,
- top->raw_data.len + 1,
- rspamd_mime_preprocess_cb, st, NULL);
- }
- else {
- rspamd_multipattern_lookup (lib_ctx->mp_boundary,
- st->pos,
- st->end - st->pos,
- rspamd_mime_preprocess_cb, st, NULL);
- }
- }
-
- static void
- rspamd_mime_parse_stack_free (struct rspamd_mime_parser_ctx *st)
- {
- if (st) {
- g_ptr_array_free (st->stack, TRUE);
- g_array_free (st->boundaries, TRUE);
- g_free (st);
- }
- }
-
- static enum rspamd_mime_parse_error
- rspamd_mime_parse_message (struct rspamd_task *task,
- struct rspamd_mime_part *part,
- struct rspamd_mime_parser_ctx *st,
- GError **err)
- {
- struct rspamd_content_type *ct, *sel = NULL;
- struct rspamd_mime_header *hdr;
- GPtrArray *hdrs = NULL;
- const gchar *pbegin, *p;
- gsize plen, len;
- struct rspamd_mime_part *npart;
- goffset hdr_pos, body_pos;
- guint i;
- enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
- GString str;
- struct rspamd_mime_parser_ctx *nst = st;
-
- if (st->nesting > max_nested) {
- g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
- st->nesting);
- return RSPAMD_MIME_PARSE_NESTING;
- }
-
- /* Allocate real part */
- npart = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_part));
-
- if (part == NULL) {
- /* Top level message */
- p = task->msg.begin;
- len = task->msg.len;
- /* Skip any space characters to avoid some bad messages to be unparsed */
- while (len > 0 && g_ascii_isspace (*p)) {
- p ++;
- len --;
- }
- /*
- * Exim somehow uses mailbox format for messages being scanned:
- * From x@x.com Fri May 13 19:08:48 2016
- *
- * Need to check that for all inputs due to proxy
- */
- if (len > sizeof ("From ") - 1) {
- if (memcmp (p, "From ", sizeof ("From ") - 1) == 0) {
- /* Skip to CRLF */
- msg_info_task ("mailbox input detected, enable workaround");
- p += sizeof ("From ") - 1;
- len -= sizeof ("From ") - 1;
-
- while (len > 0 && *p != '\n') {
- p ++;
- len --;
- }
- while (len > 0 && g_ascii_isspace (*p)) {
- p ++;
- len --;
- }
- }
- }
-
- str.str = (gchar *)p;
- str.len = len;
-
- hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
-
- if (hdr_pos > 0 && hdr_pos < str.len) {
-
- task->raw_headers_content.begin = str.str;
- task->raw_headers_content.len = hdr_pos;
- task->raw_headers_content.body_start = str.str + body_pos;
-
- if (task->raw_headers_content.len > 0) {
- rspamd_mime_headers_process (task, task->raw_headers,
- task->headers_order,
- task->raw_headers_content.begin,
- task->raw_headers_content.len,
- TRUE);
- }
-
- hdrs = rspamd_message_get_header_from_hash (task->raw_headers,
- task->task_pool,
- "Content-Type", FALSE);
- }
- else {
- /* First apply heuristic, maybe we have just headers */
- hdr_pos = rspamd_mime_parser_headers_heuristic (&str, &body_pos);
-
- if (hdr_pos > 0 && hdr_pos <= str.len) {
- task->raw_headers_content.begin = str.str;
- task->raw_headers_content.len = hdr_pos;
- task->raw_headers_content.body_start = str.str + body_pos;
-
- if (task->raw_headers_content.len > 0) {
- rspamd_mime_headers_process (task, task->raw_headers,
- task->headers_order,
- task->raw_headers_content.begin,
- task->raw_headers_content.len,
- TRUE);
- }
-
- hdrs = rspamd_message_get_header_from_hash (task->raw_headers,
- task->task_pool,
- "Content-Type", FALSE);
- task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
- }
- else {
- body_pos = 0;
- }
- }
-
- pbegin = st->start + body_pos;
- plen = st->end - pbegin;
- npart->raw_headers = g_hash_table_ref (task->raw_headers);
- npart->headers_order = NULL;
- }
- else {
- /*
- * Here are dragons:
- * We allocate new parser context as we need to shift pointers
- */
- nst = g_malloc0 (sizeof (*st));
- nst->stack = g_ptr_array_sized_new (4);
- nst->pos = task->raw_headers_content.body_start;
- nst->end = task->msg.begin + task->msg.len;
- nst->boundaries = g_array_sized_new (FALSE, FALSE,
- sizeof (struct rspamd_mime_boundary), 8);
- nst->start = part->parsed_data.begin;
- nst->end = nst->start + part->parsed_data.len;
- nst->pos = nst->start;
- nst->task = st->task;
- nst->nesting = st->nesting;
- st->nesting ++;
-
- str.str = (gchar *)part->parsed_data.begin;
- str.len = part->parsed_data.len;
-
- hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
- npart->raw_headers = g_hash_table_new_full (rspamd_strcase_hash,
- rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard);
- npart->headers_order = g_queue_new ();
-
- if (hdr_pos > 0 && hdr_pos < str.len) {
- npart->raw_headers_str = str.str;
- npart->raw_headers_len = hdr_pos;
- npart->raw_data.begin = str.str + body_pos;
-
- if (npart->raw_headers_len > 0) {
- rspamd_mime_headers_process (task, npart->raw_headers,
- npart->headers_order,
- npart->raw_headers_str,
- npart->raw_headers_len,
- FALSE);
- }
- }
- else {
- body_pos = 0;
-
- hdrs = rspamd_message_get_header_from_hash (npart->raw_headers,
- task->task_pool,
- "Content-Type", FALSE);
- }
-
- pbegin = part->parsed_data.begin + body_pos;
- plen = part->parsed_data.len - body_pos;
- }
-
- npart->raw_data.begin = pbegin;
- npart->raw_data.len = plen;
- npart->parent_part = part;
-
- if (hdrs == NULL) {
- sel = NULL;
- }
- else {
- for (i = 0; i < hdrs->len; i ++) {
- hdr = g_ptr_array_index (hdrs, i);
- ct = rspamd_content_type_parse (hdr->value, strlen (hdr->value),
- task->task_pool);
-
- /* Here we prefer multipart content-type or any content-type */
- if (ct) {
- if (sel == NULL) {
- sel = ct;
- }
- else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- sel = ct;
- }
- }
- }
- }
-
- if (sel == NULL) {
- /* For messages we automatically assume plaintext */
- msg_info_task ("cannot find content-type for a message, assume text/plain");
- sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
- sel->flags = RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_MISSING;
- RSPAMD_FTOK_ASSIGN (&sel->type, "text");
- RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
- }
-
- npart->ct = sel;
-
- if ((part == NULL || nst != st) &&
- (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART|RSPAMD_CONTENT_TYPE_MESSAGE))) {
- /* Not a trivial message, need to preprocess */
- rspamd_mime_preprocess_message (task, npart, nst);
- }
-
- if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
- g_ptr_array_add (nst->stack, npart);
- nst->nesting ++;
- npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct rspamd_mime_multipart));
- memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
- sizeof (rspamd_ftok_t));
- ret = rspamd_mime_parse_multipart_part (task, npart, nst, err);
- }
- else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
- g_ptr_array_add (nst->stack, npart);
- nst->nesting ++;
- ret = rspamd_mime_parse_message (task, npart, nst, err);
- }
- else {
- ret = rspamd_mime_parse_normal_part (task, npart, nst, err);
- }
-
- if (part) {
- /* Remove message part from the parent stack */
- g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
- st->nesting --;
- }
-
- /* Process leftovers for boundaries */
- if (nst->boundaries) {
- struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
- *end_boundary = NULL;
- goffset cur_offset = nst->pos - nst->start,
- end_offset = st->end - st->start;
- guint sel_idx = 0;
-
- for (;;) {
- start_boundary = NULL;
-
- for (i = sel_idx; i < nst->boundaries->len; i++) {
- boundary = &g_array_index (nst->boundaries,
- struct rspamd_mime_boundary, i);
-
- if (boundary->start > cur_offset &&
- boundary->boundary < end_offset &&
- !RSPAMD_BOUNDARY_IS_CLOSED (boundary)) {
- start_boundary = boundary;
- sel_idx = i;
- break;
- }
- }
-
- if (start_boundary) {
- const gchar *start, *end;
-
- if (nst->boundaries->len > sel_idx + 1) {
- end_boundary = &g_array_index (nst->boundaries,
- struct rspamd_mime_boundary, sel_idx + 1);
- end = nst->start + end_boundary->boundary;
- }
- else {
- end = nst->end;
- }
-
- sel_idx ++;
-
- start = nst->start + start_boundary->start;
-
- if (end > start &&
- (ret = rspamd_mime_process_multipart_node (task, st,
- NULL, start, end, err)) != RSPAMD_MIME_PARSE_OK) {
- return ret;
- }
- }
- else {
- break;
- }
- }
- }
-
- if (nst != st) {
- rspamd_mime_parse_stack_free (nst);
- }
-
- return ret;
- }
-
- enum rspamd_mime_parse_error
- rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
- {
- struct rspamd_mime_parser_ctx *st;
- enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
-
- if (lib_ctx == NULL) {
- rspamd_mime_parser_init_lib ();
- }
-
- if (++lib_ctx->key_usages > max_key_usages) {
- /* Regenerate siphash key */
- ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
- lib_ctx->key_usages = 0;
- }
-
- st = g_malloc0 (sizeof (*st));
- st->stack = g_ptr_array_sized_new (4);
- st->pos = task->raw_headers_content.body_start;
- st->end = task->msg.begin + task->msg.len;
- st->boundaries = g_array_sized_new (FALSE, FALSE,
- sizeof (struct rspamd_mime_boundary), 8);
- st->task = task;
-
- if (st->pos == NULL) {
- st->pos = task->msg.begin;
- }
-
- st->start = task->msg.begin;
- ret = rspamd_mime_parse_message (task, NULL, st, err);
- rspamd_mime_parse_stack_free (st);
-
- return ret;
- }
|