You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

mime_parser.c 37KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "task.h"
  18. #include "mime_parser.h"
  19. #include "mime_headers.h"
  20. #include "message.h"
  21. #include "multipattern.h"
  22. #include "contrib/libottery/ottery.h"
  23. #include "contrib/uthash/utlist.h"
  24. struct rspamd_mime_parser_lib_ctx {
  25. struct rspamd_multipattern *mp_boundary;
  26. guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
  27. guint key_usages;
  28. };
  29. struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
  30. static const guint max_nested = 64;
  31. static const guint max_key_usages = 10000;
  32. #define msg_debug_mime(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
  33. rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
  34. G_STRFUNC, \
  35. __VA_ARGS__)
  36. INIT_LOG_MODULE(mime)
  37. #define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
  38. #define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
  39. struct rspamd_mime_boundary {
  40. goffset boundary;
  41. goffset start;
  42. guint64 hash;
  43. guint64 closed_hash;
  44. gint flags;
  45. };
  46. struct rspamd_mime_parser_ctx {
  47. GPtrArray *stack; /* Stack of parts */
  48. GArray *boundaries; /* Boundaries found in the whole message */
  49. const gchar *start;
  50. const gchar *pos;
  51. const gchar *end;
  52. struct rspamd_task *task;
  53. guint nesting;
  54. };
  55. static enum rspamd_mime_parse_error
  56. rspamd_mime_parse_multipart_part (struct rspamd_task *task,
  57. struct rspamd_mime_part *part,
  58. struct rspamd_mime_parser_ctx *st,
  59. GError **err);
  60. static enum rspamd_mime_parse_error
  61. rspamd_mime_parse_message (struct rspamd_task *task,
  62. struct rspamd_mime_part *part,
  63. struct rspamd_mime_parser_ctx *st,
  64. GError **err);
  65. static enum rspamd_mime_parse_error
  66. rspamd_mime_parse_normal_part (struct rspamd_task *task,
  67. struct rspamd_mime_part *part,
  68. struct rspamd_mime_parser_ctx *st,
  69. GError **err);
  70. #define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
  71. static GQuark
  72. rspamd_mime_parser_quark (void)
  73. {
  74. return g_quark_from_static_string ("mime-parser");
  75. }
  76. const gchar*
  77. rspamd_cte_to_string (enum rspamd_cte ct)
  78. {
  79. const gchar *ret = "unknown";
  80. switch (ct) {
  81. case RSPAMD_CTE_7BIT:
  82. ret = "7bit";
  83. break;
  84. case RSPAMD_CTE_8BIT:
  85. ret = "8bit";
  86. break;
  87. case RSPAMD_CTE_QP:
  88. ret = "quoted-printable";
  89. break;
  90. case RSPAMD_CTE_B64:
  91. ret = "base64";
  92. break;
  93. case RSPAMD_CTE_UUE:
  94. ret = "X-uuencode";
  95. break;
  96. default:
  97. break;
  98. }
  99. return ret;
  100. }
  101. enum rspamd_cte
  102. rspamd_cte_from_string (const gchar *str)
  103. {
  104. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  105. g_assert (str != NULL);
  106. if (strcmp (str, "7bit") == 0) {
  107. ret = RSPAMD_CTE_7BIT;
  108. }
  109. else if (strcmp (str, "8bit") == 0) {
  110. ret = RSPAMD_CTE_8BIT;
  111. }
  112. else if (strcmp (str, "quoted-printable") == 0) {
  113. ret = RSPAMD_CTE_QP;
  114. }
  115. else if (strcmp (str, "base64") == 0) {
  116. ret = RSPAMD_CTE_B64;
  117. }
  118. else if (strcmp (str, "X-uuencode") == 0) {
  119. ret = RSPAMD_CTE_UUE;
  120. }
  121. else if (strcmp (str, "uuencode") == 0) {
  122. ret = RSPAMD_CTE_UUE;
  123. }
  124. else if (strcmp (str, "X-uue") == 0) {
  125. ret = RSPAMD_CTE_UUE;
  126. }
  127. return ret;
  128. }
  129. static void
  130. rspamd_mime_parser_init_lib (void)
  131. {
  132. lib_ctx = g_malloc0 (sizeof (*lib_ctx));
  133. lib_ctx->mp_boundary = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
  134. g_assert (lib_ctx->mp_boundary != NULL);
  135. rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\r--", 0);
  136. rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\n--", 0);
  137. g_assert (rspamd_multipattern_compile (lib_ctx->mp_boundary, NULL));
  138. ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
  139. }
  140. static enum rspamd_cte
  141. rspamd_mime_parse_cte (const gchar *in, gsize len)
  142. {
  143. guint64 h;
  144. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  145. in = rspamd_string_len_strip (in, &len, " \t;,.+-#!`~'");
  146. h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
  147. in, len, 0xdeadbabe);
  148. switch (h) {
  149. case 0xCEDAA7056B4753F7ULL: /* 7bit */
  150. ret = RSPAMD_CTE_7BIT;
  151. break;
  152. case 0x42E0745448B39FC1ULL: /* 8bit */
  153. case 0x6B169E6B155BADC0ULL: /* binary */
  154. ret = RSPAMD_CTE_8BIT;
  155. break;
  156. case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
  157. ret = RSPAMD_CTE_QP;
  158. break;
  159. case 0x96305588A76DC9A9ULL: /* base64 */
  160. case 0x171029DE1B0423A9ULL: /* base-64 */
  161. ret = RSPAMD_CTE_B64;
  162. break;
  163. case 0x420b54dc00d13cecULL: /* uuencode */
  164. case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
  165. case 0x41f725ec544356d3ULL: /* x-uue */
  166. ret = RSPAMD_CTE_UUE;
  167. break;
  168. }
  169. return ret;
  170. }
  171. static enum rspamd_cte
  172. rspamd_mime_part_get_cte_heuristic (struct rspamd_task *task,
  173. struct rspamd_mime_part *part)
  174. {
  175. const guint check_len = 128;
  176. guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0;
  177. gboolean b64_chars = TRUE;
  178. const guchar *p, *end;
  179. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  180. real_len = MIN (check_len, part->raw_data.len);
  181. p = (const guchar *)part->raw_data.begin;
  182. end = p + part->raw_data.len;
  183. while (p < end && g_ascii_isspace (*p)) {
  184. p ++;
  185. }
  186. if (end - p > sizeof ("begin-base64 ")) {
  187. const guchar *uue_start;
  188. if (memcmp (p, "begin ", sizeof ("begin ") - 1) == 0) {
  189. uue_start = p + sizeof ("begin ") - 1;
  190. while (uue_start < end && g_ascii_isspace (*uue_start)) {
  191. uue_start ++;
  192. }
  193. if (uue_start < end && g_ascii_isdigit (*uue_start)) {
  194. return RSPAMD_CTE_UUE;
  195. }
  196. }
  197. else if (memcmp (p, "begin-base64 ", sizeof ("begin-base64 ") - 1) == 0) {
  198. uue_start = p + sizeof ("begin ") - 1;
  199. while (uue_start < end && g_ascii_isspace (*uue_start)) {
  200. uue_start ++;
  201. }
  202. if (uue_start < end && g_ascii_isdigit (*uue_start)) {
  203. return RSPAMD_CTE_UUE;
  204. }
  205. }
  206. }
  207. if (end > p + 2) {
  208. if (*(end - 1) == '=') {
  209. neqsign ++;
  210. end --;
  211. }
  212. if (*(end - 1) == '=') {
  213. neqsign ++;
  214. end --;
  215. }
  216. }
  217. if (end - p > real_len) {
  218. end = p + real_len;
  219. }
  220. while (p < end) {
  221. if (*p == ' ') {
  222. nspaces ++;
  223. }
  224. else if (*p == '=') {
  225. neqsign ++;
  226. p ++;
  227. if (p + 2 < end && g_ascii_isxdigit (*p) && g_ascii_isxdigit (*(p + 1))) {
  228. p ++;
  229. nqpencoded ++;
  230. }
  231. continue;
  232. }
  233. else if (*p >= 0x80) {
  234. n8bit ++;
  235. b64_chars = FALSE;
  236. }
  237. else if (!(g_ascii_isalnum (*p) || *p == '/' || *p == '+')) {
  238. b64_chars = FALSE;
  239. }
  240. p ++;
  241. }
  242. if (b64_chars && neqsign < 2 && nspaces == 0) {
  243. ret = RSPAMD_CTE_B64;
  244. }
  245. else if (n8bit == 0) {
  246. if (neqsign > 2 && nqpencoded > 2) {
  247. ret = RSPAMD_CTE_QP;
  248. }
  249. else {
  250. ret = RSPAMD_CTE_7BIT;
  251. }
  252. }
  253. else {
  254. ret = RSPAMD_CTE_8BIT;
  255. }
  256. msg_debug_mime ("detected cte: %s", rspamd_cte_to_string (ret));
  257. return ret;
  258. }
  259. static void
  260. rspamd_mime_part_get_cte (struct rspamd_task *task,
  261. struct rspamd_mime_headers_table *hdrs,
  262. struct rspamd_mime_part *part,
  263. gboolean apply_heuristic)
  264. {
  265. struct rspamd_mime_header *hdr, *cur;
  266. enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
  267. gboolean parent_propagated = FALSE;
  268. hdr = rspamd_message_get_header_from_hash (hdrs, "Content-Transfer-Encoding");
  269. if (hdr == NULL) {
  270. if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
  271. !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
  272. part->cte = part->parent_part->cte;
  273. parent_propagated = TRUE;
  274. goto check_cte;
  275. }
  276. if (apply_heuristic) {
  277. part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
  278. msg_info_task ("detected missing CTE for part as: %s",
  279. rspamd_cte_to_string (part->cte));
  280. }
  281. part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
  282. }
  283. else {
  284. DL_FOREACH (hdr, cur) {
  285. gsize hlen;
  286. gchar lc_buf[128];
  287. hlen = rspamd_snprintf (lc_buf, sizeof (lc_buf), "%s", cur->value);
  288. rspamd_str_lc (lc_buf, hlen);
  289. cte = rspamd_mime_parse_cte (lc_buf, hlen);
  290. if (cte != RSPAMD_CTE_UNKNOWN) {
  291. part->cte = cte;
  292. break;
  293. }
  294. }
  295. check_cte:
  296. if (apply_heuristic) {
  297. if (part->cte == RSPAMD_CTE_UNKNOWN) {
  298. part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
  299. msg_info_task ("corrected bad CTE for part to: %s",
  300. rspamd_cte_to_string (part->cte));
  301. }
  302. else if (part->cte == RSPAMD_CTE_B64 ||
  303. part->cte == RSPAMD_CTE_QP) {
  304. /* Additionally check sanity */
  305. cte = rspamd_mime_part_get_cte_heuristic (task, part);
  306. if (cte == RSPAMD_CTE_8BIT) {
  307. msg_info_task (
  308. "incorrect cte specified for part: %s, %s detected",
  309. rspamd_cte_to_string (part->cte),
  310. rspamd_cte_to_string (cte));
  311. part->cte = cte;
  312. part->flags |= RSPAMD_MIME_PART_BAD_CTE;
  313. }
  314. else if (cte != part->cte && parent_propagated) {
  315. part->cte = cte;
  316. msg_info_task ("detected missing CTE for part as: %s",
  317. rspamd_cte_to_string (part->cte));
  318. }
  319. }
  320. else {
  321. msg_debug_mime ("processed cte: %s",
  322. rspamd_cte_to_string (cte));
  323. }
  324. }
  325. else {
  326. msg_debug_mime ("processed cte: %s", rspamd_cte_to_string (cte));
  327. }
  328. }
  329. }
  330. static void
  331. rspamd_mime_part_get_cd (struct rspamd_task *task, struct rspamd_mime_part *part)
  332. {
  333. struct rspamd_mime_header *hdr, *cur;
  334. struct rspamd_content_disposition *cd = NULL;
  335. rspamd_ftok_t srch;
  336. struct rspamd_content_type_param *found;
  337. hdr = rspamd_message_get_header_from_hash (part->raw_headers,
  338. "Content-Disposition");
  339. if (hdr == NULL) {
  340. cd = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cd));
  341. cd->type = RSPAMD_CT_INLINE;
  342. /* We can also have content dispositon definitions in Content-Type */
  343. if (part->ct && part->ct->attrs) {
  344. RSPAMD_FTOK_ASSIGN (&srch, "name");
  345. found = g_hash_table_lookup (part->ct->attrs, &srch);
  346. if (!found) {
  347. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  348. found = g_hash_table_lookup (part->ct->attrs, &srch);
  349. }
  350. if (found) {
  351. cd->type = RSPAMD_CT_ATTACHMENT;
  352. memcpy (&cd->filename, &found->value, sizeof (cd->filename));
  353. }
  354. }
  355. }
  356. else {
  357. DL_FOREACH (hdr, cur) {
  358. gsize hlen;
  359. cd = NULL;
  360. if (cur->decoded) {
  361. hlen = strlen (cur->decoded);
  362. cd = rspamd_content_disposition_parse (cur->decoded, hlen,
  363. task->task_pool);
  364. }
  365. if (cd) {
  366. /* We still need to check filename */
  367. if (cd->filename.len == 0) {
  368. if (part->ct && part->ct->attrs) {
  369. RSPAMD_FTOK_ASSIGN (&srch, "name");
  370. found = g_hash_table_lookup (part->ct->attrs, &srch);
  371. if (!found) {
  372. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  373. found = g_hash_table_lookup (part->ct->attrs, &srch);
  374. }
  375. if (found) {
  376. cd->type = RSPAMD_CT_ATTACHMENT;
  377. memcpy (&cd->filename, &found->value,
  378. sizeof (cd->filename));
  379. }
  380. }
  381. }
  382. msg_debug_mime ("processed content disposition: %s, file: \"%T\"",
  383. cd->lc_data, &cd->filename);
  384. break;
  385. }
  386. }
  387. }
  388. part->cd = cd;
  389. }
  390. void
  391. rspamd_mime_parser_calc_digest (struct rspamd_mime_part *part)
  392. {
  393. /* Blake2b applied to string 'rspamd' */
  394. static const guchar hash_key[] = {
  395. 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c,
  396. 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74,
  397. 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32,
  398. 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92,
  399. 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0,
  400. 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef,
  401. 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74,
  402. 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe,
  403. };
  404. if (part->parsed_data.len > 0) {
  405. rspamd_cryptobox_hash (part->digest,
  406. part->parsed_data.begin, part->parsed_data.len,
  407. hash_key, sizeof (hash_key));
  408. }
  409. }
  410. static enum rspamd_mime_parse_error
  411. rspamd_mime_parse_normal_part (struct rspamd_task *task,
  412. struct rspamd_mime_part *part,
  413. struct rspamd_mime_parser_ctx *st,
  414. GError **err)
  415. {
  416. rspamd_fstring_t *parsed;
  417. gssize r;
  418. g_assert (part != NULL);
  419. rspamd_mime_part_get_cte (task, part->raw_headers, part,
  420. !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
  421. rspamd_mime_part_get_cd (task, part);
  422. switch (part->cte) {
  423. case RSPAMD_CTE_7BIT:
  424. case RSPAMD_CTE_8BIT:
  425. case RSPAMD_CTE_UNKNOWN:
  426. if (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING) {
  427. if (part->cte != RSPAMD_CTE_7BIT) {
  428. /* We have something that has a missing content-type,
  429. * but it has non-7bit characters.
  430. *
  431. * In theory, it is very unsafe to process it as a text part
  432. * as we unlikely get some sane result
  433. */
  434. part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
  435. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  436. }
  437. }
  438. if (IS_CT_TEXT (part->ct)) {
  439. /* Need to copy text as we have couple of in-place change functions */
  440. parsed = rspamd_fstring_sized_new (part->raw_data.len);
  441. parsed->len = part->raw_data.len;
  442. memcpy (parsed->str, part->raw_data.begin, parsed->len);
  443. part->parsed_data.begin = parsed->str;
  444. part->parsed_data.len = parsed->len;
  445. rspamd_mempool_add_destructor (task->task_pool,
  446. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  447. }
  448. else {
  449. part->parsed_data.begin = part->raw_data.begin;
  450. part->parsed_data.len = part->raw_data.len;
  451. }
  452. break;
  453. case RSPAMD_CTE_QP:
  454. parsed = rspamd_fstring_sized_new (part->raw_data.len);
  455. r = rspamd_decode_qp_buf (part->raw_data.begin, part->raw_data.len,
  456. parsed->str, parsed->allocated);
  457. if (r != -1) {
  458. parsed->len = r;
  459. part->parsed_data.begin = parsed->str;
  460. part->parsed_data.len = parsed->len;
  461. rspamd_mempool_add_destructor (task->task_pool,
  462. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  463. }
  464. else {
  465. msg_err_task ("invalid quoted-printable encoded part, assume 8bit");
  466. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  467. part->cte = RSPAMD_CTE_8BIT;
  468. memcpy (parsed->str, part->raw_data.begin, part->raw_data.len);
  469. parsed->len = part->raw_data.len;
  470. part->parsed_data.begin = parsed->str;
  471. part->parsed_data.len = parsed->len;
  472. rspamd_mempool_add_destructor (task->task_pool,
  473. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  474. }
  475. break;
  476. case RSPAMD_CTE_B64:
  477. parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
  478. rspamd_cryptobox_base64_decode (part->raw_data.begin,
  479. part->raw_data.len,
  480. parsed->str, &parsed->len);
  481. part->parsed_data.begin = parsed->str;
  482. part->parsed_data.len = parsed->len;
  483. rspamd_mempool_add_destructor (task->task_pool,
  484. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  485. break;
  486. case RSPAMD_CTE_UUE:
  487. parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
  488. r = rspamd_decode_uue_buf (part->raw_data.begin, part->raw_data.len,
  489. parsed->str, parsed->allocated);
  490. rspamd_mempool_add_destructor (task->task_pool,
  491. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  492. if (r != -1) {
  493. parsed->len = r;
  494. part->parsed_data.begin = parsed->str;
  495. part->parsed_data.len = parsed->len;
  496. }
  497. else {
  498. msg_err_task ("invalid quoted-printable encoded part, assume 8bit");
  499. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  500. part->cte = RSPAMD_CTE_8BIT;
  501. memcpy (parsed->str, part->raw_data.begin, part->raw_data.len);
  502. parsed->len = part->raw_data.len;
  503. part->parsed_data.begin = parsed->str;
  504. part->parsed_data.len = parsed->len;
  505. }
  506. break;
  507. default:
  508. g_assert_not_reached ();
  509. }
  510. part->id = MESSAGE_FIELD (task, parts)->len;
  511. g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
  512. msg_debug_mime ("parsed data part %T/%T of length %z (%z orig), %s cte",
  513. &part->ct->type, &part->ct->subtype, part->parsed_data.len,
  514. part->raw_data.len, rspamd_cte_to_string (part->cte));
  515. rspamd_mime_parser_calc_digest (part);
  516. return RSPAMD_MIME_PARSE_OK;
  517. }
  518. struct rspamd_mime_multipart_cbdata {
  519. struct rspamd_task *task;
  520. struct rspamd_mime_part *multipart;
  521. struct rspamd_mime_parser_ctx *st;
  522. const gchar *part_start;
  523. rspamd_ftok_t *cur_boundary;
  524. guint64 bhash;
  525. GError **err;
  526. };
  527. static enum rspamd_mime_parse_error
  528. rspamd_mime_process_multipart_node (struct rspamd_task *task,
  529. struct rspamd_mime_parser_ctx *st,
  530. struct rspamd_mime_part *multipart,
  531. const gchar *start, const gchar *end,
  532. gboolean is_finished,
  533. GError **err)
  534. {
  535. struct rspamd_content_type *ct, *sel = NULL;
  536. struct rspamd_mime_header *hdr = NULL, *cur;
  537. struct rspamd_mime_part *npart;
  538. GString str;
  539. goffset hdr_pos, body_pos;
  540. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
  541. str.str = (gchar *)start;
  542. str.len = end - start;
  543. if (*start == '\n' || *start == '\r') {
  544. /*
  545. * We have a part that starts from newline which means that
  546. * there are completely no headers in this part,
  547. * hence we assume it as a text part
  548. */
  549. hdr_pos = 0;
  550. body_pos = 0;
  551. if (!is_finished) {
  552. /* Ignore garbage */
  553. const gchar *p = start;
  554. gboolean seen_something = FALSE;
  555. while (p < end) {
  556. if (g_ascii_isalnum (*p)) {
  557. seen_something = TRUE;
  558. break;
  559. }
  560. p ++;
  561. }
  562. if (!seen_something) {
  563. return RSPAMD_MIME_PARSE_NO_PART;
  564. }
  565. }
  566. }
  567. else {
  568. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  569. }
  570. npart = rspamd_mempool_alloc0 (task->task_pool,
  571. sizeof (struct rspamd_mime_part));
  572. npart->parent_part = multipart;
  573. npart->raw_headers = rspamd_message_headers_new ();
  574. npart->headers_order = NULL;
  575. if (multipart) {
  576. if (multipart->specific.mp->children == NULL) {
  577. multipart->specific.mp->children = g_ptr_array_sized_new (2);
  578. }
  579. g_ptr_array_add (multipart->specific.mp->children, npart);
  580. }
  581. if (hdr_pos > 0 && hdr_pos < str.len) {
  582. npart->raw_headers_str = str.str;
  583. npart->raw_headers_len = hdr_pos;
  584. npart->raw_data.begin = start + body_pos;
  585. npart->raw_data.len = (end - start) - body_pos;
  586. if (npart->raw_headers_len > 0) {
  587. rspamd_mime_headers_process (task, npart->raw_headers,
  588. &npart->headers_order,
  589. npart->raw_headers_str,
  590. npart->raw_headers_len,
  591. FALSE);
  592. }
  593. hdr = rspamd_message_get_header_from_hash (npart->raw_headers,
  594. "Content-Type");
  595. }
  596. else {
  597. npart->raw_headers_str = 0;
  598. npart->raw_headers_len = 0;
  599. npart->raw_data.begin = start;
  600. npart->raw_data.len = end - start;
  601. }
  602. if (hdr != NULL) {
  603. DL_FOREACH (hdr, cur) {
  604. ct = rspamd_content_type_parse (cur->decoded, strlen (cur->decoded),
  605. task->task_pool);
  606. /* Here we prefer multipart content-type or any content-type */
  607. if (ct) {
  608. if (sel == NULL) {
  609. sel = ct;
  610. }
  611. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  612. sel = ct;
  613. }
  614. }
  615. }
  616. }
  617. if (sel == NULL) {
  618. sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
  619. RSPAMD_FTOK_ASSIGN (&sel->type, "text");
  620. RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
  621. }
  622. npart->ct = sel;
  623. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  624. st->nesting ++;
  625. g_ptr_array_add (st->stack, npart);
  626. npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
  627. sizeof (struct rspamd_mime_multipart));
  628. memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
  629. sizeof (rspamd_ftok_t));
  630. ret = rspamd_mime_parse_multipart_part (task, npart, st, err);
  631. }
  632. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  633. st->nesting ++;
  634. g_ptr_array_add (st->stack, npart);
  635. if ((ret = rspamd_mime_parse_normal_part (task, npart, st, err))
  636. == RSPAMD_MIME_PARSE_OK) {
  637. ret = rspamd_mime_parse_message (task, npart, st, err);
  638. }
  639. }
  640. else {
  641. ret = rspamd_mime_parse_normal_part (task, npart, st, err);
  642. }
  643. return ret;
  644. }
  645. static enum rspamd_mime_parse_error
  646. rspamd_mime_parse_multipart_cb (struct rspamd_task *task,
  647. struct rspamd_mime_part *multipart,
  648. struct rspamd_mime_parser_ctx *st,
  649. struct rspamd_mime_multipart_cbdata *cb,
  650. struct rspamd_mime_boundary *b)
  651. {
  652. const gchar *pos = st->start + b->boundary;
  653. enum rspamd_mime_parse_error ret;
  654. task = cb->task;
  655. /* Now check boundary */
  656. if (!cb->part_start) {
  657. cb->part_start = st->start + b->start;
  658. st->pos = cb->part_start;
  659. }
  660. else {
  661. /*
  662. * We have seen the start of the boundary,
  663. * but it might be unsuitable (e.g. in broken headers)
  664. */
  665. if (cb->part_start < pos && cb->cur_boundary) {
  666. if ((ret = rspamd_mime_process_multipart_node (task, cb->st,
  667. cb->multipart, cb->part_start, pos, TRUE, cb->err))
  668. != RSPAMD_MIME_PARSE_OK) {
  669. return ret;
  670. }
  671. if (b->start > 0) {
  672. /* Go towards the next part */
  673. cb->part_start = st->start + b->start;
  674. cb->st->pos = cb->part_start;
  675. }
  676. }
  677. else {
  678. /* We have an empty boundary, do nothing */
  679. }
  680. }
  681. return RSPAMD_MIME_PARSE_OK;
  682. }
  683. static enum rspamd_mime_parse_error
  684. rspamd_multipart_boundaries_filter (struct rspamd_task *task,
  685. struct rspamd_mime_part *multipart,
  686. struct rspamd_mime_parser_ctx *st,
  687. struct rspamd_mime_multipart_cbdata *cb)
  688. {
  689. struct rspamd_mime_boundary *cur;
  690. goffset last_offset;
  691. guint i, sel = 0;
  692. enum rspamd_mime_parse_error ret;
  693. last_offset = (multipart->raw_data.begin - st->start) +
  694. multipart->raw_data.len;
  695. /* Find the first offset suitable for this part */
  696. for (i = 0; i < st->boundaries->len; i ++) {
  697. cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
  698. if (cur->start >= multipart->raw_data.begin - st->start) {
  699. if (cb->cur_boundary) {
  700. /* Check boundary */
  701. msg_debug_mime ("compare %L and %L (and %L)",
  702. cb->bhash, cur->hash, cur->closed_hash);
  703. if (cb->bhash == cur->hash) {
  704. sel = i;
  705. break;
  706. }
  707. else if (cb->bhash == cur->closed_hash) {
  708. /* Not a closing element in fact */
  709. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  710. cur->hash = cur->closed_hash;
  711. sel = i;
  712. break;
  713. }
  714. }
  715. else {
  716. /* Set current boundary */
  717. cb->cur_boundary = rspamd_mempool_alloc (task->task_pool,
  718. sizeof (rspamd_ftok_t));
  719. cb->cur_boundary->begin = st->start + cur->boundary;
  720. cb->cur_boundary->len = 0;
  721. cb->bhash = cur->hash;
  722. sel = i;
  723. break;
  724. }
  725. }
  726. }
  727. /* Now we can go forward with boundaries that are same to what we have */
  728. for (i = sel; i < st->boundaries->len; i ++) {
  729. cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
  730. if (cur->boundary > last_offset) {
  731. break;
  732. }
  733. if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
  734. if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
  735. cb, cur)) != RSPAMD_MIME_PARSE_OK) {
  736. return ret;
  737. }
  738. if (cur->closed_hash == cb->bhash) {
  739. /* We have again fake closed hash */
  740. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  741. cur->hash = cur->closed_hash;
  742. }
  743. if (RSPAMD_BOUNDARY_IS_CLOSED (cur)) {
  744. /* We also might check the next boundary... */
  745. if (i < st->boundaries->len - 1) {
  746. cur = &g_array_index (st->boundaries,
  747. struct rspamd_mime_boundary, i + 1);
  748. if (cur->hash == cb->bhash) {
  749. continue;
  750. }
  751. else if (cur->closed_hash == cb->bhash) {
  752. /* We have again fake closed hash */
  753. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  754. cur->hash = cur->closed_hash;
  755. continue;
  756. }
  757. }
  758. break;
  759. }
  760. }
  761. }
  762. if (i == st->boundaries->len && cb->cur_boundary) {
  763. /* Process the last part */
  764. struct rspamd_mime_boundary fb;
  765. fb.boundary = last_offset;
  766. fb.start = -1;
  767. if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
  768. cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
  769. return ret;
  770. }
  771. }
  772. return RSPAMD_MIME_PARSE_OK;
  773. }
  774. static enum rspamd_mime_parse_error
  775. rspamd_mime_parse_multipart_part (struct rspamd_task *task,
  776. struct rspamd_mime_part *part,
  777. struct rspamd_mime_parser_ctx *st,
  778. GError **err)
  779. {
  780. struct rspamd_mime_multipart_cbdata cbdata;
  781. enum rspamd_mime_parse_error ret;
  782. if (st->nesting > max_nested) {
  783. g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  784. st->nesting);
  785. return RSPAMD_MIME_PARSE_NESTING;
  786. }
  787. part->id = MESSAGE_FIELD (task, parts)->len;
  788. g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
  789. st->nesting ++;
  790. rspamd_mime_part_get_cte (task, part->raw_headers, part, FALSE);
  791. st->pos = part->raw_data.begin;
  792. cbdata.multipart = part;
  793. cbdata.task = task;
  794. cbdata.st = st;
  795. cbdata.part_start = NULL;
  796. cbdata.err = err;
  797. if (part->ct->boundary.len > 0) {
  798. /* We know our boundary */
  799. cbdata.cur_boundary = &part->ct->boundary;
  800. rspamd_cryptobox_siphash ((guchar *)&cbdata.bhash,
  801. cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
  802. lib_ctx->hkey);
  803. msg_debug_mime ("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
  804. }
  805. else {
  806. /* Guess boundary */
  807. cbdata.cur_boundary = NULL;
  808. cbdata.bhash = 0;
  809. }
  810. ret = rspamd_multipart_boundaries_filter (task, part, st, &cbdata);
  811. /* Cleanup stack */
  812. st->nesting --;
  813. g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
  814. return ret;
  815. }
  816. /* Process boundary like structures in a message */
  817. static gint
  818. rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
  819. guint strnum,
  820. gint match_start,
  821. gint match_pos,
  822. const gchar *text,
  823. gsize len,
  824. void *context)
  825. {
  826. const gchar *end = text + len, *p = text + match_pos, *bend;
  827. gchar *lc_copy;
  828. gsize blen;
  829. gboolean closing = FALSE;
  830. struct rspamd_mime_boundary b;
  831. struct rspamd_mime_parser_ctx *st = context;
  832. struct rspamd_task *task;
  833. task = st->task;
  834. if (G_LIKELY (p < end)) {
  835. gboolean seen_non_dash = FALSE;
  836. blen = 0;
  837. while (p < end) {
  838. if (*p == '\r' || *p == '\n') {
  839. break;
  840. }
  841. else if (*p != '-') {
  842. seen_non_dash = TRUE;
  843. }
  844. blen ++;
  845. p ++;
  846. }
  847. if (blen > 0 && seen_non_dash) {
  848. /* We have found something like boundary */
  849. p = text + match_pos;
  850. bend = p + blen - 1;
  851. if (*bend == '-') {
  852. /* We need to verify last -- */
  853. if (bend > p + 1 && *(bend - 1) == '-') {
  854. closing = TRUE;
  855. bend --;
  856. blen -= 2;
  857. }
  858. else {
  859. /* Not a closing boundary somehow */
  860. bend ++;
  861. }
  862. }
  863. else {
  864. bend ++;
  865. }
  866. if (bend < end) {
  867. if (*bend == '\r') {
  868. bend++;
  869. /* \r\n */
  870. if (*bend == '\n') {
  871. bend++;
  872. }
  873. }
  874. else {
  875. /* \n */
  876. bend++;
  877. }
  878. }
  879. b.boundary = p - st->start - 2;
  880. b.start = bend - st->start;
  881. if (closing) {
  882. lc_copy = g_malloc (blen + 2);
  883. memcpy (lc_copy, p, blen + 2);
  884. rspamd_str_lc (lc_copy, blen + 2);
  885. }
  886. else {
  887. lc_copy = g_malloc (blen);
  888. memcpy (lc_copy, p, blen);
  889. rspamd_str_lc (lc_copy, blen);
  890. }
  891. rspamd_cryptobox_siphash ((guchar *)&b.hash, lc_copy, blen,
  892. lib_ctx->hkey);
  893. msg_debug_mime ("normal hash: %*s -> %L, %d boffset, %d data offset",
  894. (gint)blen, lc_copy, b.hash, (int)b.boundary, (int)b.start);
  895. if (closing) {
  896. b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
  897. rspamd_cryptobox_siphash ((guchar *)&b.closed_hash, lc_copy,
  898. blen + 2,
  899. lib_ctx->hkey);
  900. msg_debug_mime ("closing hash: %*s -> %L, %d boffset, %d data offset",
  901. (gint)blen + 2, lc_copy,
  902. b.closed_hash,
  903. (int)b.boundary, (int)b.start);
  904. }
  905. else {
  906. b.flags = 0;
  907. b.closed_hash = 0;
  908. }
  909. g_free (lc_copy);
  910. g_array_append_val (st->boundaries, b);
  911. }
  912. }
  913. return 0;
  914. }
  915. static goffset
  916. rspamd_mime_parser_headers_heuristic (GString *input, goffset *body_start)
  917. {
  918. const gsize default_max_len = 76;
  919. gsize max_len = MIN (input->len, default_max_len);
  920. const gchar *p, *end;
  921. enum {
  922. st_before_colon = 0,
  923. st_colon,
  924. st_spaces_after_colon,
  925. st_value,
  926. st_error
  927. } state = st_before_colon;
  928. p = input->str;
  929. end = p + max_len;
  930. while (p < end) {
  931. switch (state) {
  932. case st_before_colon:
  933. if (G_UNLIKELY (*p == ':')) {
  934. state = st_colon;
  935. }
  936. else if (G_UNLIKELY (!g_ascii_isgraph (*p))) {
  937. state = st_error;
  938. }
  939. p ++;
  940. break;
  941. case st_colon:
  942. if (g_ascii_isspace (*p)) {
  943. state = st_spaces_after_colon;
  944. }
  945. else {
  946. state = st_value;
  947. }
  948. p ++;
  949. break;
  950. case st_spaces_after_colon:
  951. if (!g_ascii_isspace (*p)) {
  952. state = st_value;
  953. }
  954. p ++;
  955. break;
  956. case st_value:
  957. /* We accept any value */
  958. goto end;
  959. break;
  960. case st_error:
  961. return (-1);
  962. break;
  963. }
  964. }
  965. end:
  966. if (state == st_value) {
  967. if (body_start) {
  968. *body_start = input->len;
  969. }
  970. return input->len;
  971. }
  972. return (-1);
  973. }
  974. static void
  975. rspamd_mime_preprocess_message (struct rspamd_task *task,
  976. struct rspamd_mime_part *top,
  977. struct rspamd_mime_parser_ctx *st)
  978. {
  979. if (top->raw_data.begin >= st->pos) {
  980. rspamd_multipattern_lookup (lib_ctx->mp_boundary,
  981. top->raw_data.begin - 1,
  982. top->raw_data.len + 1,
  983. rspamd_mime_preprocess_cb, st, NULL);
  984. }
  985. else {
  986. rspamd_multipattern_lookup (lib_ctx->mp_boundary,
  987. st->pos,
  988. st->end - st->pos,
  989. rspamd_mime_preprocess_cb, st, NULL);
  990. }
  991. }
  992. static void
  993. rspamd_mime_parse_stack_free (struct rspamd_mime_parser_ctx *st)
  994. {
  995. if (st) {
  996. g_ptr_array_free (st->stack, TRUE);
  997. g_array_free (st->boundaries, TRUE);
  998. g_free (st);
  999. }
  1000. }
  1001. static enum rspamd_mime_parse_error
  1002. rspamd_mime_parse_message (struct rspamd_task *task,
  1003. struct rspamd_mime_part *part,
  1004. struct rspamd_mime_parser_ctx *st,
  1005. GError **err)
  1006. {
  1007. struct rspamd_content_type *ct, *sel = NULL;
  1008. struct rspamd_mime_header *hdr = NULL, *cur;
  1009. const gchar *pbegin, *p;
  1010. gsize plen, len;
  1011. struct rspamd_mime_part *npart;
  1012. goffset hdr_pos, body_pos;
  1013. guint i;
  1014. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1015. GString str;
  1016. struct rspamd_mime_parser_ctx *nst = st;
  1017. if (st->nesting > max_nested) {
  1018. g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  1019. st->nesting);
  1020. return RSPAMD_MIME_PARSE_NESTING;
  1021. }
  1022. /* Allocate real part */
  1023. npart = rspamd_mempool_alloc0 (task->task_pool,
  1024. sizeof (struct rspamd_mime_part));
  1025. if (part == NULL) {
  1026. /* Top level message */
  1027. p = task->msg.begin;
  1028. len = task->msg.len;
  1029. /* Skip any space characters to avoid some bad messages to be unparsed */
  1030. while (len > 0 && g_ascii_isspace (*p)) {
  1031. p ++;
  1032. len --;
  1033. }
  1034. /*
  1035. * Exim somehow uses mailbox format for messages being scanned:
  1036. * From x@x.com Fri May 13 19:08:48 2016
  1037. *
  1038. * Need to check that for all inputs due to proxy
  1039. */
  1040. if (len > sizeof ("From ") - 1) {
  1041. if (memcmp (p, "From ", sizeof ("From ") - 1) == 0) {
  1042. /* Skip to CRLF */
  1043. msg_info_task ("mailbox input detected, enable workaround");
  1044. p += sizeof ("From ") - 1;
  1045. len -= sizeof ("From ") - 1;
  1046. while (len > 0 && *p != '\n') {
  1047. p ++;
  1048. len --;
  1049. }
  1050. while (len > 0 && g_ascii_isspace (*p)) {
  1051. p ++;
  1052. len --;
  1053. }
  1054. }
  1055. }
  1056. str.str = (gchar *)p;
  1057. str.len = len;
  1058. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  1059. if (hdr_pos > 0 && hdr_pos < str.len) {
  1060. MESSAGE_FIELD (task, raw_headers_content).begin = str.str;
  1061. MESSAGE_FIELD (task, raw_headers_content).len = hdr_pos;
  1062. MESSAGE_FIELD (task, raw_headers_content).body_start = str.str + body_pos;
  1063. if (MESSAGE_FIELD (task, raw_headers_content).len > 0) {
  1064. rspamd_mime_headers_process (task,
  1065. MESSAGE_FIELD (task, raw_headers),
  1066. &MESSAGE_FIELD (task, headers_order),
  1067. MESSAGE_FIELD (task, raw_headers_content).begin,
  1068. MESSAGE_FIELD (task, raw_headers_content).len,
  1069. TRUE);
  1070. npart->raw_headers = rspamd_message_headers_ref (
  1071. MESSAGE_FIELD (task, raw_headers));
  1072. }
  1073. hdr = rspamd_message_get_header_from_hash (
  1074. MESSAGE_FIELD (task, raw_headers),
  1075. "Content-Type");
  1076. }
  1077. else {
  1078. /* First apply heuristic, maybe we have just headers */
  1079. hdr_pos = rspamd_mime_parser_headers_heuristic (&str, &body_pos);
  1080. if (hdr_pos > 0 && hdr_pos <= str.len) {
  1081. MESSAGE_FIELD (task, raw_headers_content).begin = str.str;
  1082. MESSAGE_FIELD (task, raw_headers_content).len = hdr_pos;
  1083. MESSAGE_FIELD (task, raw_headers_content).body_start = str.str +
  1084. body_pos;
  1085. if (MESSAGE_FIELD (task, raw_headers_content).len > 0) {
  1086. rspamd_mime_headers_process (task,
  1087. MESSAGE_FIELD (task, raw_headers),
  1088. &MESSAGE_FIELD (task, headers_order),
  1089. MESSAGE_FIELD (task, raw_headers_content).begin,
  1090. MESSAGE_FIELD (task, raw_headers_content).len,
  1091. TRUE);
  1092. npart->raw_headers = rspamd_message_headers_ref (
  1093. MESSAGE_FIELD (task, raw_headers));
  1094. }
  1095. hdr = rspamd_message_get_header_from_hash (
  1096. MESSAGE_FIELD (task, raw_headers),
  1097. "Content-Type");
  1098. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  1099. }
  1100. else {
  1101. body_pos = 0;
  1102. }
  1103. }
  1104. pbegin = st->start + body_pos;
  1105. plen = st->end - pbegin;
  1106. npart->headers_order = NULL;
  1107. }
  1108. else {
  1109. /*
  1110. * Here are dragons:
  1111. * We allocate new parser context as we need to shift pointers
  1112. */
  1113. nst = g_malloc0 (sizeof (*st));
  1114. nst->stack = g_ptr_array_sized_new (4);
  1115. nst->boundaries = g_array_sized_new (FALSE, FALSE,
  1116. sizeof (struct rspamd_mime_boundary), 8);
  1117. nst->start = part->parsed_data.begin;
  1118. nst->end = nst->start + part->parsed_data.len;
  1119. nst->pos = nst->start;
  1120. nst->task = st->task;
  1121. nst->nesting = st->nesting;
  1122. st->nesting ++;
  1123. str.str = (gchar *)part->parsed_data.begin;
  1124. str.len = part->parsed_data.len;
  1125. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  1126. npart->raw_headers = rspamd_message_headers_new ();
  1127. npart->headers_order = NULL;
  1128. if (hdr_pos > 0 && hdr_pos < str.len) {
  1129. npart->raw_headers_str = str.str;
  1130. npart->raw_headers_len = hdr_pos;
  1131. npart->raw_data.begin = str.str + body_pos;
  1132. if (npart->raw_headers_len > 0) {
  1133. rspamd_mime_headers_process (task,
  1134. npart->raw_headers,
  1135. &npart->headers_order,
  1136. npart->raw_headers_str,
  1137. npart->raw_headers_len,
  1138. FALSE);
  1139. }
  1140. hdr = rspamd_message_get_header_from_hash (npart->raw_headers,
  1141. "Content-Type");
  1142. }
  1143. else {
  1144. body_pos = 0;
  1145. }
  1146. pbegin = part->parsed_data.begin + body_pos;
  1147. plen = part->parsed_data.len - body_pos;
  1148. }
  1149. npart->raw_data.begin = pbegin;
  1150. npart->raw_data.len = plen;
  1151. npart->parent_part = part;
  1152. if (hdr == NULL) {
  1153. sel = NULL;
  1154. }
  1155. else {
  1156. DL_FOREACH (hdr, cur) {
  1157. ct = rspamd_content_type_parse (cur->decoded, strlen (cur->decoded),
  1158. task->task_pool);
  1159. /* Here we prefer multipart content-type or any content-type */
  1160. if (ct) {
  1161. if (sel == NULL) {
  1162. sel = ct;
  1163. }
  1164. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1165. sel = ct;
  1166. }
  1167. }
  1168. }
  1169. }
  1170. if (sel == NULL) {
  1171. /* For messages we automatically assume plaintext */
  1172. msg_info_task ("cannot find content-type for a message, assume text/plain");
  1173. sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
  1174. sel->flags = RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_MISSING;
  1175. RSPAMD_FTOK_ASSIGN (&sel->type, "text");
  1176. RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
  1177. }
  1178. npart->ct = sel;
  1179. if ((part == NULL || nst != st) &&
  1180. (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART|RSPAMD_CONTENT_TYPE_MESSAGE))) {
  1181. /* Not a trivial message, need to preprocess */
  1182. rspamd_mime_preprocess_message (task, npart, nst);
  1183. }
  1184. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1185. g_ptr_array_add (nst->stack, npart);
  1186. nst->nesting ++;
  1187. npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
  1188. sizeof (struct rspamd_mime_multipart));
  1189. memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
  1190. sizeof (rspamd_ftok_t));
  1191. ret = rspamd_mime_parse_multipart_part (task, npart, nst, err);
  1192. }
  1193. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  1194. if ((ret = rspamd_mime_parse_normal_part (task, npart, nst, err))
  1195. == RSPAMD_MIME_PARSE_OK) {
  1196. ret = rspamd_mime_parse_message (task, npart, nst, err);
  1197. }
  1198. }
  1199. else {
  1200. ret = rspamd_mime_parse_normal_part (task, npart, nst, err);
  1201. }
  1202. if (part && st->stack->len > 0) {
  1203. /* Remove message part from the parent stack */
  1204. g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
  1205. st->nesting --;
  1206. }
  1207. /* Process leftovers for boundaries */
  1208. if (nst->boundaries) {
  1209. struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
  1210. *end_boundary = NULL;
  1211. goffset cur_offset = nst->pos - nst->start,
  1212. end_offset = st->end - st->start;
  1213. guint sel_idx = 0;
  1214. for (;;) {
  1215. start_boundary = NULL;
  1216. for (i = sel_idx; i < nst->boundaries->len; i++) {
  1217. boundary = &g_array_index (nst->boundaries,
  1218. struct rspamd_mime_boundary, i);
  1219. if (boundary->start > cur_offset &&
  1220. boundary->boundary < end_offset &&
  1221. !RSPAMD_BOUNDARY_IS_CLOSED (boundary)) {
  1222. start_boundary = boundary;
  1223. sel_idx = i;
  1224. break;
  1225. }
  1226. }
  1227. if (start_boundary) {
  1228. const gchar *start, *end;
  1229. if (nst->boundaries->len > sel_idx + 1) {
  1230. end_boundary = &g_array_index (nst->boundaries,
  1231. struct rspamd_mime_boundary, sel_idx + 1);
  1232. end = nst->start + end_boundary->boundary;
  1233. }
  1234. else {
  1235. end = nst->end;
  1236. }
  1237. sel_idx ++;
  1238. start = nst->start + start_boundary->start;
  1239. if (end > start &&
  1240. (ret = rspamd_mime_process_multipart_node (task, nst,
  1241. NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) {
  1242. if (nst != st) {
  1243. rspamd_mime_parse_stack_free (nst);
  1244. }
  1245. if (ret == RSPAMD_MIME_PARSE_NO_PART) {
  1246. return RSPAMD_MIME_PARSE_OK;
  1247. }
  1248. return ret;
  1249. }
  1250. }
  1251. else {
  1252. break;
  1253. }
  1254. }
  1255. }
  1256. if (nst != st) {
  1257. rspamd_mime_parse_stack_free (nst);
  1258. }
  1259. return ret;
  1260. }
  1261. enum rspamd_mime_parse_error
  1262. rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
  1263. {
  1264. struct rspamd_mime_parser_ctx *st;
  1265. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1266. if (lib_ctx == NULL) {
  1267. rspamd_mime_parser_init_lib ();
  1268. }
  1269. if (++lib_ctx->key_usages > max_key_usages) {
  1270. /* Regenerate siphash key */
  1271. ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
  1272. lib_ctx->key_usages = 0;
  1273. }
  1274. st = g_malloc0 (sizeof (*st));
  1275. st->stack = g_ptr_array_sized_new (4);
  1276. st->pos = MESSAGE_FIELD (task, raw_headers_content).body_start;
  1277. st->end = task->msg.begin + task->msg.len;
  1278. st->boundaries = g_array_sized_new (FALSE, FALSE,
  1279. sizeof (struct rspamd_mime_boundary), 8);
  1280. st->task = task;
  1281. if (st->pos == NULL) {
  1282. st->pos = task->msg.begin;
  1283. }
  1284. st->start = task->msg.begin;
  1285. ret = rspamd_mime_parse_message (task, NULL, st, err);
  1286. rspamd_mime_parse_stack_free (st);
  1287. return ret;
  1288. }