You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

mime_parser.c 34KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "task.h"
  18. #include "mime_parser.h"
  19. #include "mime_headers.h"
  20. #include "message.h"
  21. #include "multipattern.h"
  22. #include "contrib/libottery/ottery.h"
  23. struct rspamd_mime_parser_lib_ctx {
  24. struct rspamd_multipattern *mp_boundary;
  25. guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
  26. guint key_usages;
  27. };
  28. struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
  29. static const guint max_nested = 64;
  30. static const guint max_key_usages = 10000;
  31. #define msg_debug_mime(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
  32. rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
  33. G_STRFUNC, \
  34. __VA_ARGS__)
  35. INIT_LOG_MODULE(mime)
  36. #define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
  37. #define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
  38. struct rspamd_mime_boundary {
  39. goffset boundary;
  40. goffset start;
  41. guint64 hash;
  42. guint64 closed_hash;
  43. gint flags;
  44. };
  45. struct rspamd_mime_parser_ctx {
  46. GPtrArray *stack; /* Stack of parts */
  47. GArray *boundaries; /* Boundaries found in the whole message */
  48. const gchar *start;
  49. const gchar *pos;
  50. const gchar *end;
  51. struct rspamd_task *task;
  52. guint nesting;
  53. };
  54. static enum rspamd_mime_parse_error
  55. rspamd_mime_parse_multipart_part (struct rspamd_task *task,
  56. struct rspamd_mime_part *part,
  57. struct rspamd_mime_parser_ctx *st,
  58. GError **err);
  59. static enum rspamd_mime_parse_error
  60. rspamd_mime_parse_message (struct rspamd_task *task,
  61. struct rspamd_mime_part *part,
  62. struct rspamd_mime_parser_ctx *st,
  63. GError **err);
  64. static enum rspamd_mime_parse_error
  65. rspamd_mime_parse_normal_part (struct rspamd_task *task,
  66. struct rspamd_mime_part *part,
  67. struct rspamd_mime_parser_ctx *st,
  68. GError **err);
  69. #define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
  70. static GQuark
  71. rspamd_mime_parser_quark (void)
  72. {
  73. return g_quark_from_static_string ("mime-parser");
  74. }
  75. const gchar*
  76. rspamd_cte_to_string (enum rspamd_cte ct)
  77. {
  78. const gchar *ret = "unknown";
  79. switch (ct) {
  80. case RSPAMD_CTE_7BIT:
  81. ret = "7bit";
  82. break;
  83. case RSPAMD_CTE_8BIT:
  84. ret = "8bit";
  85. break;
  86. case RSPAMD_CTE_QP:
  87. ret = "quoted-printable";
  88. break;
  89. case RSPAMD_CTE_B64:
  90. ret = "base64";
  91. break;
  92. default:
  93. break;
  94. }
  95. return ret;
  96. }
  97. enum rspamd_cte
  98. rspamd_cte_from_string (const gchar *str)
  99. {
  100. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  101. g_assert (str != NULL);
  102. if (strcmp (str, "7bit") == 0) {
  103. ret = RSPAMD_CTE_7BIT;
  104. }
  105. else if (strcmp (str, "8bit") == 0) {
  106. ret = RSPAMD_CTE_8BIT;
  107. }
  108. else if (strcmp (str, "quoted-printable") == 0) {
  109. ret = RSPAMD_CTE_QP;
  110. }
  111. else if (strcmp (str, "base64") == 0) {
  112. ret = RSPAMD_CTE_B64;
  113. }
  114. return ret;
  115. }
  116. static void
  117. rspamd_mime_parser_init_lib (void)
  118. {
  119. lib_ctx = g_malloc0 (sizeof (*lib_ctx));
  120. lib_ctx->mp_boundary = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
  121. g_assert (lib_ctx->mp_boundary != NULL);
  122. rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\r--", 0);
  123. rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\n--", 0);
  124. g_assert (rspamd_multipattern_compile (lib_ctx->mp_boundary, NULL));
  125. ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
  126. }
  127. static enum rspamd_cte
  128. rspamd_mime_parse_cte (const gchar *in, gsize len)
  129. {
  130. guint64 h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
  131. in, len, 0xdeadbabe);
  132. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  133. switch (h) {
  134. case 0xCEDAA7056B4753F7ULL: /* 7bit */
  135. ret = RSPAMD_CTE_7BIT;
  136. break;
  137. case 0x42E0745448B39FC1ULL: /* 8bit */
  138. case 0x6B169E6B155BADC0ULL: /* binary */
  139. ret = RSPAMD_CTE_8BIT;
  140. break;
  141. case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
  142. ret = RSPAMD_CTE_QP;
  143. break;
  144. case 0x96305588A76DC9A9ULL: /* base64 */
  145. case 0x171029DE1B0423A9ULL: /* base-64 */
  146. ret = RSPAMD_CTE_B64;
  147. break;
  148. }
  149. return ret;
  150. }
  151. static enum rspamd_cte
  152. rspamd_mime_part_get_cte_heuristic (struct rspamd_task *task,
  153. struct rspamd_mime_part *part)
  154. {
  155. const guint check_len = 128;
  156. guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0;
  157. gboolean b64_chars = TRUE;
  158. const guchar *p, *end;
  159. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  160. real_len = MIN (check_len, part->raw_data.len);
  161. p = (const guchar *)part->raw_data.begin;
  162. end = p + part->raw_data.len;
  163. while (p < end && g_ascii_isspace (*p)) {
  164. p ++;
  165. }
  166. if (end > p + 2) {
  167. if (*(end - 1) == '=') {
  168. neqsign ++;
  169. end --;
  170. }
  171. if (*(end - 1) == '=') {
  172. neqsign ++;
  173. end --;
  174. }
  175. }
  176. if (end - p > real_len) {
  177. end = p + real_len;
  178. }
  179. while (p < end) {
  180. if (*p == ' ') {
  181. nspaces ++;
  182. }
  183. else if (*p == '=') {
  184. neqsign ++;
  185. p ++;
  186. if (p + 2 < end && g_ascii_isxdigit (*p) && g_ascii_isxdigit (*(p + 1))) {
  187. p ++;
  188. nqpencoded ++;
  189. }
  190. continue;
  191. }
  192. else if (*p >= 0x80) {
  193. n8bit ++;
  194. b64_chars = FALSE;
  195. }
  196. else if (!(g_ascii_isalnum (*p) || *p == '/' || *p == '+')) {
  197. b64_chars = FALSE;
  198. }
  199. p ++;
  200. }
  201. if (b64_chars && neqsign < 2 && nspaces == 0) {
  202. ret = RSPAMD_CTE_B64;
  203. }
  204. else if (n8bit == 0) {
  205. if (neqsign > 2 && nqpencoded > 2) {
  206. ret = RSPAMD_CTE_QP;
  207. }
  208. else {
  209. ret = RSPAMD_CTE_7BIT;
  210. }
  211. }
  212. else {
  213. ret = RSPAMD_CTE_8BIT;
  214. }
  215. msg_debug_mime ("detected cte: %s", rspamd_cte_to_string (ret));
  216. return ret;
  217. }
  218. static void
  219. rspamd_mime_part_get_cte (struct rspamd_task *task,
  220. GHashTable *hdrs,
  221. struct rspamd_mime_part *part,
  222. gboolean apply_heuristic)
  223. {
  224. struct rspamd_mime_header *hdr;
  225. guint i;
  226. GPtrArray *hdrs_cte;
  227. enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
  228. hdrs_cte = rspamd_message_get_header_from_hash (hdrs,
  229. task->task_pool,
  230. "Content-Transfer-Encoding", FALSE);
  231. if (hdrs_cte == NULL) {
  232. if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
  233. !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
  234. part->cte = part->parent_part->cte;
  235. goto check_cte;
  236. }
  237. if (apply_heuristic) {
  238. part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
  239. msg_info_task ("detected missing CTE for part as: %s",
  240. rspamd_cte_to_string (part->cte));
  241. }
  242. part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
  243. }
  244. else {
  245. for (i = 0; i < hdrs_cte->len; i ++) {
  246. gsize hlen;
  247. gchar lc_buf[128];
  248. hdr = g_ptr_array_index (hdrs_cte, i);
  249. hlen = rspamd_snprintf (lc_buf, sizeof (lc_buf), "%s", hdr->value);
  250. rspamd_str_lc (lc_buf, hlen);
  251. cte = rspamd_mime_parse_cte (lc_buf, hlen);
  252. if (cte != RSPAMD_CTE_UNKNOWN) {
  253. part->cte = cte;
  254. break;
  255. }
  256. }
  257. check_cte:
  258. if (apply_heuristic) {
  259. if (part->cte == RSPAMD_CTE_UNKNOWN) {
  260. part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
  261. msg_info_task ("corrected bad CTE for part to: %s",
  262. rspamd_cte_to_string (part->cte));
  263. }
  264. else if (part->cte == RSPAMD_CTE_B64 ||
  265. part->cte == RSPAMD_CTE_QP) {
  266. /* Additionally check sanity */
  267. cte = rspamd_mime_part_get_cte_heuristic (task, part);
  268. if (cte == RSPAMD_CTE_8BIT) {
  269. msg_info_task (
  270. "incorrect cte specified for part: %s, %s detected",
  271. rspamd_cte_to_string (part->cte),
  272. rspamd_cte_to_string (cte));
  273. part->cte = cte;
  274. part->flags |= RSPAMD_MIME_PART_BAD_CTE;
  275. }
  276. }
  277. else {
  278. msg_debug_mime ("processed cte: %s",
  279. rspamd_cte_to_string (cte));
  280. }
  281. }
  282. else {
  283. msg_debug_mime ("processed cte: %s", rspamd_cte_to_string (cte));
  284. }
  285. }
  286. }
  287. static void
  288. rspamd_mime_part_get_cd (struct rspamd_task *task, struct rspamd_mime_part *part)
  289. {
  290. struct rspamd_mime_header *hdr;
  291. guint i;
  292. GPtrArray *hdrs;
  293. struct rspamd_content_disposition *cd = NULL;
  294. rspamd_ftok_t srch;
  295. struct rspamd_content_type_param *found;
  296. hdrs = rspamd_message_get_header_from_hash (part->raw_headers,
  297. task->task_pool,
  298. "Content-Disposition", FALSE);
  299. if (hdrs == NULL) {
  300. cd = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cd));
  301. cd->type = RSPAMD_CT_INLINE;
  302. /* We can also have content dispositon definitions in Content-Type */
  303. if (part->ct && part->ct->attrs) {
  304. RSPAMD_FTOK_ASSIGN (&srch, "name");
  305. found = g_hash_table_lookup (part->ct->attrs, &srch);
  306. if (!found) {
  307. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  308. found = g_hash_table_lookup (part->ct->attrs, &srch);
  309. }
  310. if (found) {
  311. cd->type = RSPAMD_CT_ATTACHMENT;
  312. memcpy (&cd->filename, &found->value, sizeof (cd->filename));
  313. }
  314. }
  315. }
  316. else {
  317. for (i = 0; i < hdrs->len; i ++) {
  318. gsize hlen;
  319. hdr = g_ptr_array_index (hdrs, i);
  320. hlen = strlen (hdr->value);
  321. cd = rspamd_content_disposition_parse (hdr->value, hlen,
  322. task->task_pool);
  323. if (cd) {
  324. msg_debug_mime ("processed content disposition: %s",
  325. cd->lc_data);
  326. /* We still need to check filename */
  327. if (cd->filename.len == 0) {
  328. if (part->ct && part->ct->attrs) {
  329. RSPAMD_FTOK_ASSIGN (&srch, "name");
  330. found = g_hash_table_lookup (part->ct->attrs, &srch);
  331. if (!found) {
  332. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  333. found = g_hash_table_lookup (part->ct->attrs, &srch);
  334. }
  335. if (found) {
  336. cd->type = RSPAMD_CT_ATTACHMENT;
  337. memcpy (&cd->filename, &found->value,
  338. sizeof (cd->filename));
  339. }
  340. }
  341. }
  342. break;
  343. }
  344. }
  345. }
  346. part->cd = cd;
  347. }
  348. void
  349. rspamd_mime_parser_calc_digest (struct rspamd_mime_part *part)
  350. {
  351. /* Blake2b applied to string 'rspamd' */
  352. static const guchar hash_key[] = {
  353. 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c,
  354. 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74,
  355. 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32,
  356. 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92,
  357. 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0,
  358. 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef,
  359. 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74,
  360. 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe,
  361. };
  362. if (part->parsed_data.len > 0) {
  363. rspamd_cryptobox_hash (part->digest,
  364. part->parsed_data.begin, part->parsed_data.len,
  365. hash_key, sizeof (hash_key));
  366. }
  367. }
  368. static enum rspamd_mime_parse_error
  369. rspamd_mime_parse_normal_part (struct rspamd_task *task,
  370. struct rspamd_mime_part *part,
  371. struct rspamd_mime_parser_ctx *st,
  372. GError **err)
  373. {
  374. rspamd_fstring_t *parsed;
  375. gssize r;
  376. g_assert (part != NULL);
  377. rspamd_mime_part_get_cte (task, part->raw_headers, part, TRUE);
  378. rspamd_mime_part_get_cd (task, part);
  379. switch (part->cte) {
  380. case RSPAMD_CTE_7BIT:
  381. case RSPAMD_CTE_8BIT:
  382. case RSPAMD_CTE_UNKNOWN:
  383. if (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING) {
  384. if (part->cte != RSPAMD_CTE_7BIT) {
  385. /* We have something that has a missing content-type,
  386. * but it has non-7bit characters.
  387. *
  388. * In theory, it is very unsafe to process it as a text part
  389. * as we unlikely get some sane result
  390. */
  391. part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
  392. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  393. }
  394. }
  395. if (IS_CT_TEXT (part->ct)) {
  396. /* Need to copy text as we have couple of in-place change functions */
  397. parsed = rspamd_fstring_sized_new (part->raw_data.len);
  398. parsed->len = part->raw_data.len;
  399. memcpy (parsed->str, part->raw_data.begin, parsed->len);
  400. part->parsed_data.begin = parsed->str;
  401. part->parsed_data.len = parsed->len;
  402. rspamd_mempool_add_destructor (task->task_pool,
  403. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  404. }
  405. else {
  406. part->parsed_data.begin = part->raw_data.begin;
  407. part->parsed_data.len = part->raw_data.len;
  408. }
  409. break;
  410. case RSPAMD_CTE_QP:
  411. parsed = rspamd_fstring_sized_new (part->raw_data.len);
  412. r = rspamd_decode_qp_buf (part->raw_data.begin, part->raw_data.len,
  413. parsed->str, parsed->allocated);
  414. if (r != -1) {
  415. parsed->len = r;
  416. part->parsed_data.begin = parsed->str;
  417. part->parsed_data.len = parsed->len;
  418. rspamd_mempool_add_destructor (task->task_pool,
  419. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  420. }
  421. else {
  422. msg_err_task ("invalid quoted-printable encoded part, assume 8bit");
  423. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  424. part->cte = RSPAMD_CTE_8BIT;
  425. memcpy (parsed->str, part->raw_data.begin, part->raw_data.len);
  426. parsed->len = part->raw_data.len;
  427. part->parsed_data.begin = parsed->str;
  428. part->parsed_data.len = parsed->len;
  429. rspamd_mempool_add_destructor (task->task_pool,
  430. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  431. }
  432. break;
  433. case RSPAMD_CTE_B64:
  434. parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
  435. rspamd_cryptobox_base64_decode (part->raw_data.begin,
  436. part->raw_data.len,
  437. parsed->str, &parsed->len);
  438. part->parsed_data.begin = parsed->str;
  439. part->parsed_data.len = parsed->len;
  440. rspamd_mempool_add_destructor (task->task_pool,
  441. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  442. break;
  443. default:
  444. g_assert_not_reached ();
  445. }
  446. part->id = task->parts->len;
  447. g_ptr_array_add (task->parts, part);
  448. msg_debug_mime ("parsed data part %T/%T of length %z (%z orig), %s cte",
  449. &part->ct->type, &part->ct->subtype, part->parsed_data.len,
  450. part->raw_data.len, rspamd_cte_to_string (part->cte));
  451. rspamd_mime_parser_calc_digest (part);
  452. return RSPAMD_MIME_PARSE_OK;
  453. }
  454. struct rspamd_mime_multipart_cbdata {
  455. struct rspamd_task *task;
  456. struct rspamd_mime_part *multipart;
  457. struct rspamd_mime_parser_ctx *st;
  458. const gchar *part_start;
  459. rspamd_ftok_t *cur_boundary;
  460. guint64 bhash;
  461. GError **err;
  462. };
  463. static enum rspamd_mime_parse_error
  464. rspamd_mime_process_multipart_node (struct rspamd_task *task,
  465. struct rspamd_mime_parser_ctx *st,
  466. struct rspamd_mime_part *multipart,
  467. const gchar *start, const gchar *end,
  468. GError **err)
  469. {
  470. struct rspamd_content_type *ct, *sel = NULL;
  471. struct rspamd_mime_header *hdr;
  472. GPtrArray *hdrs = NULL;
  473. struct rspamd_mime_part *npart;
  474. GString str;
  475. goffset hdr_pos, body_pos;
  476. guint i;
  477. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
  478. str.str = (gchar *)start;
  479. str.len = end - start;
  480. if (*start == '\n' || *start == '\r') {
  481. /*
  482. * We have a part that starts from newline which means that
  483. * there are completely no headers in this part,
  484. * hence we assume it as a text part
  485. */
  486. hdr_pos = 0;
  487. body_pos = 0;
  488. }
  489. else {
  490. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  491. }
  492. npart = rspamd_mempool_alloc0 (task->task_pool,
  493. sizeof (struct rspamd_mime_part));
  494. npart->parent_part = multipart;
  495. npart->raw_headers = g_hash_table_new_full (rspamd_strcase_hash,
  496. rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard);
  497. npart->headers_order = g_queue_new ();
  498. if (multipart) {
  499. if (multipart->specific.mp->children == NULL) {
  500. multipart->specific.mp->children = g_ptr_array_sized_new (2);
  501. }
  502. g_ptr_array_add (multipart->specific.mp->children, npart);
  503. }
  504. if (hdr_pos > 0 && hdr_pos < str.len) {
  505. npart->raw_headers_str = str.str;
  506. npart->raw_headers_len = hdr_pos;
  507. npart->raw_data.begin = start + body_pos;
  508. npart->raw_data.len = (end - start) - body_pos;
  509. if (npart->raw_headers_len > 0) {
  510. rspamd_mime_headers_process (task, npart->raw_headers,
  511. npart->headers_order,
  512. npart->raw_headers_str,
  513. npart->raw_headers_len,
  514. FALSE);
  515. }
  516. hdrs = rspamd_message_get_header_from_hash (npart->raw_headers,
  517. task->task_pool,
  518. "Content-Type", FALSE);
  519. }
  520. else {
  521. npart->raw_headers_str = 0;
  522. npart->raw_headers_len = 0;
  523. npart->raw_data.begin = start;
  524. npart->raw_data.len = end - start;
  525. }
  526. if (hdrs != NULL) {
  527. for (i = 0; i < hdrs->len; i ++) {
  528. hdr = g_ptr_array_index (hdrs, i);
  529. ct = rspamd_content_type_parse (hdr->value, strlen (hdr->value),
  530. task->task_pool);
  531. /* Here we prefer multipart content-type or any content-type */
  532. if (ct) {
  533. if (sel == NULL) {
  534. sel = ct;
  535. }
  536. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  537. sel = ct;
  538. }
  539. }
  540. }
  541. }
  542. if (sel == NULL) {
  543. sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
  544. RSPAMD_FTOK_ASSIGN (&sel->type, "text");
  545. RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
  546. }
  547. npart->ct = sel;
  548. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  549. st->nesting ++;
  550. g_ptr_array_add (st->stack, npart);
  551. npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
  552. sizeof (struct rspamd_mime_multipart));
  553. memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
  554. sizeof (rspamd_ftok_t));
  555. ret = rspamd_mime_parse_multipart_part (task, npart, st, err);
  556. }
  557. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  558. st->nesting ++;
  559. g_ptr_array_add (st->stack, npart);
  560. if ((ret = rspamd_mime_parse_normal_part (task, npart, st, err))
  561. == RSPAMD_MIME_PARSE_OK) {
  562. ret = rspamd_mime_parse_message (task, npart, st, err);
  563. }
  564. }
  565. else {
  566. ret = rspamd_mime_parse_normal_part (task, npart, st, err);
  567. }
  568. return ret;
  569. }
  570. static enum rspamd_mime_parse_error
  571. rspamd_mime_parse_multipart_cb (struct rspamd_task *task,
  572. struct rspamd_mime_part *multipart,
  573. struct rspamd_mime_parser_ctx *st,
  574. struct rspamd_mime_multipart_cbdata *cb,
  575. struct rspamd_mime_boundary *b)
  576. {
  577. const gchar *pos = st->start + b->boundary;
  578. enum rspamd_mime_parse_error ret;
  579. task = cb->task;
  580. /* Now check boundary */
  581. if (!cb->part_start) {
  582. cb->part_start = st->start + b->start;
  583. st->pos = cb->part_start;
  584. }
  585. else {
  586. /* We have seen the start of the boundary */
  587. if (cb->part_start < pos) {
  588. /* We should have seen some boundary */
  589. g_assert (cb->cur_boundary != NULL);
  590. if ((ret = rspamd_mime_process_multipart_node (task, cb->st,
  591. cb->multipart, cb->part_start, pos, cb->err))
  592. != RSPAMD_MIME_PARSE_OK) {
  593. return ret;
  594. }
  595. /* Go towards the next part */
  596. cb->part_start = st->start + b->start;
  597. cb->st->pos = cb->part_start;
  598. }
  599. else {
  600. /* We have an empty boundary, do nothing */
  601. }
  602. }
  603. return RSPAMD_MIME_PARSE_OK;
  604. }
  605. static enum rspamd_mime_parse_error
  606. rspamd_multipart_boundaries_filter (struct rspamd_task *task,
  607. struct rspamd_mime_part *multipart,
  608. struct rspamd_mime_parser_ctx *st,
  609. struct rspamd_mime_multipart_cbdata *cb)
  610. {
  611. struct rspamd_mime_boundary *cur;
  612. goffset last_offset;
  613. guint i, sel = 0;
  614. enum rspamd_mime_parse_error ret;
  615. last_offset = (multipart->raw_data.begin - st->start) +
  616. multipart->raw_data.len;
  617. /* Find the first offset suitable for this part */
  618. for (i = 0; i < st->boundaries->len; i ++) {
  619. cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
  620. if (cur->start >= multipart->raw_data.begin - st->start) {
  621. if (cb->cur_boundary) {
  622. /* Check boundary */
  623. msg_debug_mime ("compare %L and %L (and %L)",
  624. cb->bhash, cur->hash, cur->closed_hash);
  625. if (cb->bhash == cur->hash) {
  626. sel = i;
  627. break;
  628. }
  629. else if (cb->bhash == cur->closed_hash) {
  630. /* Not a closing element in fact */
  631. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  632. cur->hash = cur->closed_hash;
  633. sel = i;
  634. break;
  635. }
  636. }
  637. else {
  638. /* Set current boundary */
  639. cb->cur_boundary = rspamd_mempool_alloc (task->task_pool,
  640. sizeof (rspamd_ftok_t));
  641. cb->cur_boundary->begin = st->start + cur->boundary;
  642. cb->cur_boundary->len = 0;
  643. cb->bhash = cur->hash;
  644. sel = i;
  645. break;
  646. }
  647. }
  648. }
  649. /* Now we can go forward with boundaries that are same to what we have */
  650. for (i = sel; i < st->boundaries->len; i ++) {
  651. cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
  652. if (cur->boundary > last_offset) {
  653. break;
  654. }
  655. if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
  656. if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
  657. cb, cur)) != RSPAMD_MIME_PARSE_OK) {
  658. return ret;
  659. }
  660. if (cur->closed_hash == cb->bhash) {
  661. /* We have again fake closed hash */
  662. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  663. cur->hash = cur->closed_hash;
  664. }
  665. if (RSPAMD_BOUNDARY_IS_CLOSED (cur)) {
  666. /* We also might check the next boundary... */
  667. if (i < st->boundaries->len - 1) {
  668. cur = &g_array_index (st->boundaries,
  669. struct rspamd_mime_boundary, i + 1);
  670. if (cur->hash == cb->bhash) {
  671. continue;
  672. }
  673. else if (cur->closed_hash == cb->bhash) {
  674. /* We have again fake closed hash */
  675. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  676. cur->hash = cur->closed_hash;
  677. continue;
  678. }
  679. }
  680. break;
  681. }
  682. }
  683. }
  684. if (i == st->boundaries->len && cb->cur_boundary) {
  685. /* Process the last part */
  686. struct rspamd_mime_boundary fb;
  687. fb.boundary = last_offset;
  688. if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
  689. cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
  690. return ret;
  691. }
  692. }
  693. return RSPAMD_MIME_PARSE_OK;
  694. }
  695. static enum rspamd_mime_parse_error
  696. rspamd_mime_parse_multipart_part (struct rspamd_task *task,
  697. struct rspamd_mime_part *part,
  698. struct rspamd_mime_parser_ctx *st,
  699. GError **err)
  700. {
  701. struct rspamd_mime_multipart_cbdata cbdata;
  702. enum rspamd_mime_parse_error ret;
  703. if (st->nesting > max_nested) {
  704. g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  705. st->nesting);
  706. return RSPAMD_MIME_PARSE_NESTING;
  707. }
  708. part->id = task->parts->len;
  709. g_ptr_array_add (task->parts, part);
  710. st->nesting ++;
  711. rspamd_mime_part_get_cte (task, part->raw_headers, part, FALSE);
  712. st->pos = part->raw_data.begin;
  713. cbdata.multipart = part;
  714. cbdata.task = task;
  715. cbdata.st = st;
  716. cbdata.part_start = NULL;
  717. cbdata.err = err;
  718. if (part->ct->boundary.len > 0) {
  719. /* We know our boundary */
  720. cbdata.cur_boundary = &part->ct->boundary;
  721. rspamd_cryptobox_siphash ((guchar *)&cbdata.bhash,
  722. cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
  723. lib_ctx->hkey);
  724. msg_debug_mime ("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
  725. }
  726. else {
  727. /* Guess boundary */
  728. cbdata.cur_boundary = NULL;
  729. cbdata.bhash = 0;
  730. }
  731. ret = rspamd_multipart_boundaries_filter (task, part, st, &cbdata);
  732. /* Cleanup stack */
  733. st->nesting --;
  734. g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
  735. return ret;
  736. }
  737. /* Process boundary like structures in a message */
  738. static gint
  739. rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
  740. guint strnum,
  741. gint match_start,
  742. gint match_pos,
  743. const gchar *text,
  744. gsize len,
  745. void *context)
  746. {
  747. const gchar *end = text + len, *p = text + match_pos, *bend;
  748. gchar *lc_copy;
  749. gsize blen;
  750. gboolean closing = FALSE;
  751. struct rspamd_mime_boundary b;
  752. struct rspamd_mime_parser_ctx *st = context;
  753. struct rspamd_task *task;
  754. task = st->task;
  755. if (G_LIKELY (p < end)) {
  756. gboolean seen_non_dash = FALSE;
  757. blen = 0;
  758. while (p < end) {
  759. if (*p == '\r' || *p == '\n') {
  760. break;
  761. }
  762. else if (*p != '-') {
  763. seen_non_dash = TRUE;
  764. }
  765. blen ++;
  766. p ++;
  767. }
  768. if (blen > 0 && seen_non_dash) {
  769. /* We have found something like boundary */
  770. p = text + match_pos;
  771. bend = p + blen - 1;
  772. if (*bend == '-') {
  773. /* We need to verify last -- */
  774. if (bend > p + 1 && *(bend - 1) == '-') {
  775. closing = TRUE;
  776. bend --;
  777. blen -= 2;
  778. }
  779. else {
  780. /* Not a closing boundary somehow */
  781. bend ++;
  782. }
  783. }
  784. else {
  785. bend ++;
  786. }
  787. if (*bend == '\r') {
  788. bend ++;
  789. /* \r\n */
  790. if (*bend == '\n') {
  791. bend ++;
  792. }
  793. }
  794. else {
  795. /* \n */
  796. bend ++;
  797. }
  798. b.boundary = p - st->start - 2;
  799. b.start = bend - st->start;
  800. if (closing) {
  801. lc_copy = g_malloc (blen + 2);
  802. memcpy (lc_copy, p, blen + 2);
  803. rspamd_str_lc (lc_copy, blen + 2);
  804. }
  805. else {
  806. lc_copy = g_malloc (blen);
  807. memcpy (lc_copy, p, blen);
  808. rspamd_str_lc (lc_copy, blen);
  809. }
  810. rspamd_cryptobox_siphash ((guchar *)&b.hash, lc_copy, blen,
  811. lib_ctx->hkey);
  812. msg_debug_mime ("normal hash: %*s -> %L", (gint)blen, lc_copy, b.hash);
  813. if (closing) {
  814. b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
  815. rspamd_cryptobox_siphash ((guchar *)&b.closed_hash, lc_copy,
  816. blen + 2,
  817. lib_ctx->hkey);
  818. msg_debug_mime ("closing hash: %*s -> %L", (gint)blen + 2, lc_copy,
  819. b.closed_hash);
  820. }
  821. else {
  822. b.flags = 0;
  823. b.closed_hash = 0;
  824. }
  825. g_free (lc_copy);
  826. g_array_append_val (st->boundaries, b);
  827. }
  828. }
  829. return 0;
  830. }
  831. static goffset
  832. rspamd_mime_parser_headers_heuristic (GString *input, goffset *body_start)
  833. {
  834. const gsize default_max_len = 76;
  835. gsize max_len = MIN (input->len, default_max_len);
  836. const gchar *p, *end;
  837. enum {
  838. st_before_colon = 0,
  839. st_colon,
  840. st_spaces_after_colon,
  841. st_value,
  842. st_error
  843. } state = st_before_colon;
  844. p = input->str;
  845. end = p + max_len;
  846. while (p < end) {
  847. switch (state) {
  848. case st_before_colon:
  849. if (G_UNLIKELY (*p == ':')) {
  850. state = st_colon;
  851. }
  852. else if (G_UNLIKELY (!g_ascii_isgraph (*p))) {
  853. state = st_error;
  854. }
  855. p ++;
  856. break;
  857. case st_colon:
  858. if (g_ascii_isspace (*p)) {
  859. state = st_spaces_after_colon;
  860. }
  861. else {
  862. state = st_value;
  863. }
  864. p ++;
  865. break;
  866. case st_spaces_after_colon:
  867. if (!g_ascii_isspace (*p)) {
  868. state = st_value;
  869. }
  870. p ++;
  871. break;
  872. case st_value:
  873. /* We accept any value */
  874. goto end;
  875. break;
  876. case st_error:
  877. return (-1);
  878. break;
  879. }
  880. }
  881. end:
  882. if (state == st_value) {
  883. if (body_start) {
  884. *body_start = input->len;
  885. }
  886. return input->len;
  887. }
  888. return (-1);
  889. }
  890. static void
  891. rspamd_mime_preprocess_message (struct rspamd_task *task,
  892. struct rspamd_mime_part *top,
  893. struct rspamd_mime_parser_ctx *st)
  894. {
  895. if (top->raw_data.begin >= st->pos) {
  896. rspamd_multipattern_lookup (lib_ctx->mp_boundary,
  897. top->raw_data.begin - 1,
  898. top->raw_data.len + 1,
  899. rspamd_mime_preprocess_cb, st, NULL);
  900. }
  901. else {
  902. rspamd_multipattern_lookup (lib_ctx->mp_boundary,
  903. st->pos,
  904. st->end - st->pos,
  905. rspamd_mime_preprocess_cb, st, NULL);
  906. }
  907. }
  908. static void
  909. rspamd_mime_parse_stack_free (struct rspamd_mime_parser_ctx *st)
  910. {
  911. if (st) {
  912. g_ptr_array_free (st->stack, TRUE);
  913. g_array_free (st->boundaries, TRUE);
  914. g_free (st);
  915. }
  916. }
  917. static enum rspamd_mime_parse_error
  918. rspamd_mime_parse_message (struct rspamd_task *task,
  919. struct rspamd_mime_part *part,
  920. struct rspamd_mime_parser_ctx *st,
  921. GError **err)
  922. {
  923. struct rspamd_content_type *ct, *sel = NULL;
  924. struct rspamd_mime_header *hdr;
  925. GPtrArray *hdrs = NULL;
  926. const gchar *pbegin, *p;
  927. gsize plen, len;
  928. struct rspamd_mime_part *npart;
  929. goffset hdr_pos, body_pos;
  930. guint i;
  931. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  932. GString str;
  933. struct rspamd_mime_parser_ctx *nst = st;
  934. if (st->nesting > max_nested) {
  935. g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  936. st->nesting);
  937. return RSPAMD_MIME_PARSE_NESTING;
  938. }
  939. /* Allocate real part */
  940. npart = rspamd_mempool_alloc0 (task->task_pool,
  941. sizeof (struct rspamd_mime_part));
  942. if (part == NULL) {
  943. /* Top level message */
  944. p = task->msg.begin;
  945. len = task->msg.len;
  946. /* Skip any space characters to avoid some bad messages to be unparsed */
  947. while (len > 0 && g_ascii_isspace (*p)) {
  948. p ++;
  949. len --;
  950. }
  951. /*
  952. * Exim somehow uses mailbox format for messages being scanned:
  953. * From x@x.com Fri May 13 19:08:48 2016
  954. *
  955. * Need to check that for all inputs due to proxy
  956. */
  957. if (len > sizeof ("From ") - 1) {
  958. if (memcmp (p, "From ", sizeof ("From ") - 1) == 0) {
  959. /* Skip to CRLF */
  960. msg_info_task ("mailbox input detected, enable workaround");
  961. p += sizeof ("From ") - 1;
  962. len -= sizeof ("From ") - 1;
  963. while (len > 0 && *p != '\n') {
  964. p ++;
  965. len --;
  966. }
  967. while (len > 0 && g_ascii_isspace (*p)) {
  968. p ++;
  969. len --;
  970. }
  971. }
  972. }
  973. str.str = (gchar *)p;
  974. str.len = len;
  975. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  976. if (hdr_pos > 0 && hdr_pos < str.len) {
  977. task->raw_headers_content.begin = str.str;
  978. task->raw_headers_content.len = hdr_pos;
  979. task->raw_headers_content.body_start = str.str + body_pos;
  980. if (task->raw_headers_content.len > 0) {
  981. rspamd_mime_headers_process (task, task->raw_headers,
  982. task->headers_order,
  983. task->raw_headers_content.begin,
  984. task->raw_headers_content.len,
  985. TRUE);
  986. }
  987. hdrs = rspamd_message_get_header_from_hash (task->raw_headers,
  988. task->task_pool,
  989. "Content-Type", FALSE);
  990. }
  991. else {
  992. /* First apply heuristic, maybe we have just headers */
  993. hdr_pos = rspamd_mime_parser_headers_heuristic (&str, &body_pos);
  994. if (hdr_pos > 0 && hdr_pos <= str.len) {
  995. task->raw_headers_content.begin = str.str;
  996. task->raw_headers_content.len = hdr_pos;
  997. task->raw_headers_content.body_start = str.str + body_pos;
  998. if (task->raw_headers_content.len > 0) {
  999. rspamd_mime_headers_process (task, task->raw_headers,
  1000. task->headers_order,
  1001. task->raw_headers_content.begin,
  1002. task->raw_headers_content.len,
  1003. TRUE);
  1004. }
  1005. hdrs = rspamd_message_get_header_from_hash (task->raw_headers,
  1006. task->task_pool,
  1007. "Content-Type", FALSE);
  1008. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  1009. }
  1010. else {
  1011. body_pos = 0;
  1012. }
  1013. }
  1014. pbegin = st->start + body_pos;
  1015. plen = st->end - pbegin;
  1016. npart->raw_headers = g_hash_table_ref (task->raw_headers);
  1017. npart->headers_order = NULL;
  1018. }
  1019. else {
  1020. /*
  1021. * Here are dragons:
  1022. * We allocate new parser context as we need to shift pointers
  1023. */
  1024. nst = g_malloc0 (sizeof (*st));
  1025. nst->stack = g_ptr_array_sized_new (4);
  1026. nst->pos = task->raw_headers_content.body_start;
  1027. nst->end = task->msg.begin + task->msg.len;
  1028. nst->boundaries = g_array_sized_new (FALSE, FALSE,
  1029. sizeof (struct rspamd_mime_boundary), 8);
  1030. nst->start = part->parsed_data.begin;
  1031. nst->end = nst->start + part->parsed_data.len;
  1032. nst->pos = nst->start;
  1033. nst->task = st->task;
  1034. nst->nesting = st->nesting;
  1035. st->nesting ++;
  1036. str.str = (gchar *)part->parsed_data.begin;
  1037. str.len = part->parsed_data.len;
  1038. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  1039. npart->raw_headers = g_hash_table_new_full (rspamd_strcase_hash,
  1040. rspamd_strcase_equal, NULL, rspamd_ptr_array_free_hard);
  1041. npart->headers_order = g_queue_new ();
  1042. if (hdr_pos > 0 && hdr_pos < str.len) {
  1043. npart->raw_headers_str = str.str;
  1044. npart->raw_headers_len = hdr_pos;
  1045. npart->raw_data.begin = str.str + body_pos;
  1046. if (npart->raw_headers_len > 0) {
  1047. rspamd_mime_headers_process (task, npart->raw_headers,
  1048. npart->headers_order,
  1049. npart->raw_headers_str,
  1050. npart->raw_headers_len,
  1051. FALSE);
  1052. }
  1053. }
  1054. else {
  1055. body_pos = 0;
  1056. hdrs = rspamd_message_get_header_from_hash (npart->raw_headers,
  1057. task->task_pool,
  1058. "Content-Type", FALSE);
  1059. }
  1060. pbegin = part->parsed_data.begin + body_pos;
  1061. plen = part->parsed_data.len - body_pos;
  1062. }
  1063. npart->raw_data.begin = pbegin;
  1064. npart->raw_data.len = plen;
  1065. npart->parent_part = part;
  1066. if (hdrs == NULL) {
  1067. sel = NULL;
  1068. }
  1069. else {
  1070. for (i = 0; i < hdrs->len; i ++) {
  1071. hdr = g_ptr_array_index (hdrs, i);
  1072. ct = rspamd_content_type_parse (hdr->value, strlen (hdr->value),
  1073. task->task_pool);
  1074. /* Here we prefer multipart content-type or any content-type */
  1075. if (ct) {
  1076. if (sel == NULL) {
  1077. sel = ct;
  1078. }
  1079. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1080. sel = ct;
  1081. }
  1082. }
  1083. }
  1084. }
  1085. if (sel == NULL) {
  1086. /* For messages we automatically assume plaintext */
  1087. msg_info_task ("cannot find content-type for a message, assume text/plain");
  1088. sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
  1089. sel->flags = RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_MISSING;
  1090. RSPAMD_FTOK_ASSIGN (&sel->type, "text");
  1091. RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
  1092. }
  1093. npart->ct = sel;
  1094. if ((part == NULL || nst != st) &&
  1095. (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART|RSPAMD_CONTENT_TYPE_MESSAGE))) {
  1096. /* Not a trivial message, need to preprocess */
  1097. rspamd_mime_preprocess_message (task, npart, nst);
  1098. }
  1099. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1100. g_ptr_array_add (nst->stack, npart);
  1101. nst->nesting ++;
  1102. npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
  1103. sizeof (struct rspamd_mime_multipart));
  1104. memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
  1105. sizeof (rspamd_ftok_t));
  1106. ret = rspamd_mime_parse_multipart_part (task, npart, nst, err);
  1107. }
  1108. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  1109. g_ptr_array_add (nst->stack, npart);
  1110. nst->nesting ++;
  1111. ret = rspamd_mime_parse_message (task, npart, nst, err);
  1112. }
  1113. else {
  1114. ret = rspamd_mime_parse_normal_part (task, npart, nst, err);
  1115. }
  1116. if (part) {
  1117. /* Remove message part from the parent stack */
  1118. g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
  1119. st->nesting --;
  1120. }
  1121. /* Process leftovers for boundaries */
  1122. if (nst->boundaries) {
  1123. struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
  1124. *end_boundary = NULL;
  1125. goffset cur_offset = nst->pos - nst->start,
  1126. end_offset = st->end - st->start;
  1127. guint sel_idx = 0;
  1128. for (;;) {
  1129. start_boundary = NULL;
  1130. for (i = sel_idx; i < nst->boundaries->len; i++) {
  1131. boundary = &g_array_index (nst->boundaries,
  1132. struct rspamd_mime_boundary, i);
  1133. if (boundary->start > cur_offset &&
  1134. boundary->boundary < end_offset &&
  1135. !RSPAMD_BOUNDARY_IS_CLOSED (boundary)) {
  1136. start_boundary = boundary;
  1137. sel_idx = i;
  1138. break;
  1139. }
  1140. }
  1141. if (start_boundary) {
  1142. const gchar *start, *end;
  1143. if (nst->boundaries->len > sel_idx + 1) {
  1144. end_boundary = &g_array_index (nst->boundaries,
  1145. struct rspamd_mime_boundary, sel_idx + 1);
  1146. end = nst->start + end_boundary->boundary;
  1147. }
  1148. else {
  1149. end = nst->end;
  1150. }
  1151. sel_idx ++;
  1152. start = nst->start + start_boundary->start;
  1153. if (end > start &&
  1154. (ret = rspamd_mime_process_multipart_node (task, st,
  1155. NULL, start, end, err)) != RSPAMD_MIME_PARSE_OK) {
  1156. return ret;
  1157. }
  1158. }
  1159. else {
  1160. break;
  1161. }
  1162. }
  1163. }
  1164. if (nst != st) {
  1165. rspamd_mime_parse_stack_free (nst);
  1166. }
  1167. return ret;
  1168. }
  1169. enum rspamd_mime_parse_error
  1170. rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
  1171. {
  1172. struct rspamd_mime_parser_ctx *st;
  1173. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1174. if (lib_ctx == NULL) {
  1175. rspamd_mime_parser_init_lib ();
  1176. }
  1177. if (++lib_ctx->key_usages > max_key_usages) {
  1178. /* Regenerate siphash key */
  1179. ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
  1180. lib_ctx->key_usages = 0;
  1181. }
  1182. st = g_malloc0 (sizeof (*st));
  1183. st->stack = g_ptr_array_sized_new (4);
  1184. st->pos = task->raw_headers_content.body_start;
  1185. st->end = task->msg.begin + task->msg.len;
  1186. st->boundaries = g_array_sized_new (FALSE, FALSE,
  1187. sizeof (struct rspamd_mime_boundary), 8);
  1188. st->task = task;
  1189. if (st->pos == NULL) {
  1190. st->pos = task->msg.begin;
  1191. }
  1192. st->start = task->msg.begin;
  1193. ret = rspamd_mime_parse_message (task, NULL, st, err);
  1194. rspamd_mime_parse_stack_free (st);
  1195. return ret;
  1196. }