You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

mime_parser.c 43KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "task.h"
  18. #include "mime_parser.h"
  19. #include "mime_headers.h"
  20. #include "message.h"
  21. #include "multipattern.h"
  22. #include "contrib/libottery/ottery.h"
  23. #include "contrib/uthash/utlist.h"
  24. #include <openssl/cms.h>
  25. #include <openssl/pkcs7.h>
  26. #include "contrib/fastutf8/fastutf8.h"
  27. struct rspamd_mime_parser_lib_ctx {
  28. struct rspamd_multipattern *mp_boundary;
  29. guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
  30. guint key_usages;
  31. };
  32. struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
  33. static const guint max_nested = 64;
  34. static const guint max_key_usages = 10000;
  35. #define msg_debug_mime(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
  36. rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
  37. RSPAMD_LOG_FUNC, \
  38. __VA_ARGS__)
  39. INIT_LOG_MODULE(mime)
  /* Bit stored in the `flags` member of struct rspamd_mime_boundary */
  #define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)

  /* Non-zero when the boundary pointed to by `b` has the CLOSED bit set */
  #define RSPAMD_BOUNDARY_IS_CLOSED(b) \
      ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
  42. struct rspamd_mime_boundary {
  43. goffset boundary;
  44. goffset start;
  45. guint64 hash;
  46. guint64 closed_hash;
  47. gint flags;
  48. };
  49. struct rspamd_mime_parser_ctx {
  50. GPtrArray *stack; /* Stack of parts */
  51. GArray *boundaries; /* Boundaries found in the whole message */
  52. const gchar *start;
  53. const gchar *pos;
  54. const gchar *end;
  55. struct rspamd_task *task;
  56. guint nesting;
  57. };
  58. static enum rspamd_mime_parse_error
  59. rspamd_mime_parse_multipart_part (struct rspamd_task *task,
  60. struct rspamd_mime_part *part,
  61. struct rspamd_mime_parser_ctx *st,
  62. GError **err);
  63. static enum rspamd_mime_parse_error
  64. rspamd_mime_parse_message (struct rspamd_task *task,
  65. struct rspamd_mime_part *part,
  66. struct rspamd_mime_parser_ctx *st,
  67. GError **err);
  68. static enum rspamd_mime_parse_error
  69. rspamd_mime_parse_normal_part (struct rspamd_task *task,
  70. struct rspamd_mime_part *part,
  71. struct rspamd_mime_parser_ctx *st,
  72. struct rspamd_content_type *ct,
  73. GError **err);
  74. static enum rspamd_mime_parse_error
  75. rspamd_mime_process_multipart_node (struct rspamd_task *task,
  76. struct rspamd_mime_parser_ctx *st,
  77. struct rspamd_mime_part *multipart,
  78. const gchar *start, const gchar *end,
  79. gboolean is_finished,
  80. GError **err);
  81. #define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
  82. static GQuark
  83. rspamd_mime_parser_quark (void)
  84. {
  85. return g_quark_from_static_string ("mime-parser");
  86. }
  87. const gchar*
  88. rspamd_cte_to_string (enum rspamd_cte ct)
  89. {
  90. const gchar *ret = "unknown";
  91. switch (ct) {
  92. case RSPAMD_CTE_7BIT:
  93. ret = "7bit";
  94. break;
  95. case RSPAMD_CTE_8BIT:
  96. ret = "8bit";
  97. break;
  98. case RSPAMD_CTE_QP:
  99. ret = "quoted-printable";
  100. break;
  101. case RSPAMD_CTE_B64:
  102. ret = "base64";
  103. break;
  104. case RSPAMD_CTE_UUE:
  105. ret = "X-uuencode";
  106. break;
  107. default:
  108. break;
  109. }
  110. return ret;
  111. }
  112. enum rspamd_cte
  113. rspamd_cte_from_string (const gchar *str)
  114. {
  115. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  116. g_assert (str != NULL);
  117. if (strcmp (str, "7bit") == 0) {
  118. ret = RSPAMD_CTE_7BIT;
  119. }
  120. else if (strcmp (str, "8bit") == 0) {
  121. ret = RSPAMD_CTE_8BIT;
  122. }
  123. else if (strcmp (str, "quoted-printable") == 0) {
  124. ret = RSPAMD_CTE_QP;
  125. }
  126. else if (strcmp (str, "base64") == 0) {
  127. ret = RSPAMD_CTE_B64;
  128. }
  129. else if (strcmp (str, "X-uuencode") == 0) {
  130. ret = RSPAMD_CTE_UUE;
  131. }
  132. else if (strcmp (str, "uuencode") == 0) {
  133. ret = RSPAMD_CTE_UUE;
  134. }
  135. else if (strcmp (str, "X-uue") == 0) {
  136. ret = RSPAMD_CTE_UUE;
  137. }
  138. return ret;
  139. }
  140. static void
  141. rspamd_mime_parser_init_lib (void)
  142. {
  143. lib_ctx = g_malloc0 (sizeof (*lib_ctx));
  144. lib_ctx->mp_boundary = rspamd_multipattern_create (RSPAMD_MULTIPATTERN_DEFAULT);
  145. g_assert (lib_ctx->mp_boundary != NULL);
  146. rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\r--", 0);
  147. rspamd_multipattern_add_pattern (lib_ctx->mp_boundary, "\n--", 0);
  148. g_assert (rspamd_multipattern_compile (lib_ctx->mp_boundary, NULL));
  149. ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
  150. }
  151. static enum rspamd_cte
  152. rspamd_mime_parse_cte (const gchar *in, gsize len)
  153. {
  154. guint64 h;
  155. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  156. in = rspamd_string_len_strip (in, &len, " \t;,.+-#!`~'");
  157. h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
  158. in, len, 0xdeadbabe);
  159. switch (h) {
  160. case 0xCEDAA7056B4753F7ULL: /* 7bit */
  161. ret = RSPAMD_CTE_7BIT;
  162. break;
  163. case 0x42E0745448B39FC1ULL: /* 8bit */
  164. case 0x6B169E6B155BADC0ULL: /* binary */
  165. ret = RSPAMD_CTE_8BIT;
  166. break;
  167. case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
  168. ret = RSPAMD_CTE_QP;
  169. break;
  170. case 0x96305588A76DC9A9ULL: /* base64 */
  171. case 0x171029DE1B0423A9ULL: /* base-64 */
  172. ret = RSPAMD_CTE_B64;
  173. break;
  174. case 0x420b54dc00d13cecULL: /* uuencode */
  175. case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
  176. case 0x41f725ec544356d3ULL: /* x-uue */
  177. ret = RSPAMD_CTE_UUE;
  178. break;
  179. }
  180. return ret;
  181. }
  182. static enum rspamd_cte
  183. rspamd_mime_part_get_cte_heuristic (struct rspamd_task *task,
  184. struct rspamd_mime_part *part)
  185. {
  186. const guint check_len = 128;
  187. guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0,
  188. padeqsign = 0, nupper = 0, nlower = 0;
  189. gboolean b64_chars = TRUE;
  190. const guchar *p, *end;
  191. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  192. real_len = MIN (check_len, part->raw_data.len);
  193. p = (const guchar *)part->raw_data.begin;
  194. end = p + part->raw_data.len;
  195. while (p < end && g_ascii_isspace (*p)) {
  196. p ++;
  197. }
  198. if (end - p > sizeof ("begin-base64 ")) {
  199. const guchar *uue_start;
  200. if (memcmp (p, "begin ", sizeof ("begin ") - 1) == 0) {
  201. uue_start = p + sizeof ("begin ") - 1;
  202. while (uue_start < end && g_ascii_isspace (*uue_start)) {
  203. uue_start ++;
  204. }
  205. if (uue_start < end && g_ascii_isdigit (*uue_start)) {
  206. return RSPAMD_CTE_UUE;
  207. }
  208. }
  209. else if (memcmp (p, "begin-base64 ", sizeof ("begin-base64 ") - 1) == 0) {
  210. uue_start = p + sizeof ("begin ") - 1;
  211. while (uue_start < end && g_ascii_isspace (*uue_start)) {
  212. uue_start ++;
  213. }
  214. if (uue_start < end && g_ascii_isdigit (*uue_start)) {
  215. return RSPAMD_CTE_UUE;
  216. }
  217. }
  218. }
  219. /* Skip trailing spaces */
  220. while (end > p && g_ascii_isspace (*(end - 1))) {
  221. end --;
  222. }
  223. if (end > p + 2) {
  224. if (*(end - 1) == '=') {
  225. padeqsign ++;
  226. end --;
  227. }
  228. if (*(end - 1) == '=') {
  229. padeqsign ++;
  230. end --;
  231. }
  232. }
  233. /* Adjust end to analyse only first characters */
  234. if (end - p > real_len) {
  235. end = p + real_len;
  236. }
  237. while (p < end) {
  238. if (*p == ' ') {
  239. nspaces ++;
  240. }
  241. else if (*p == '=') {
  242. b64_chars = FALSE; /* Eqsign must not be inside base64 */
  243. neqsign ++;
  244. p ++;
  245. if (p + 2 < end && g_ascii_isxdigit (*p) && g_ascii_isxdigit (*(p + 1))) {
  246. p ++;
  247. nqpencoded ++;
  248. }
  249. continue;
  250. }
  251. else if (*p >= 0x80) {
  252. n8bit ++;
  253. b64_chars = FALSE;
  254. }
  255. else if (!(g_ascii_isalnum (*p) || *p == '/' || *p == '+')) {
  256. b64_chars = FALSE;
  257. }
  258. else if (g_ascii_isupper (*p)) {
  259. nupper ++;
  260. }
  261. else if (g_ascii_islower (*p)) {
  262. nlower ++;
  263. }
  264. p ++;
  265. }
  266. if (b64_chars && neqsign <= 2 && nspaces == 0) {
  267. /* Need more thinking */
  268. if (part->raw_data.len > 80) {
  269. if (padeqsign > 0) {
  270. ret = RSPAMD_CTE_B64;
  271. }
  272. else {
  273. /* We have a large piece of data with no spaces and base64
  274. * symbols only, no padding is detected as well...
  275. *
  276. * There is a small chance that our first 128 characters
  277. * are either some garbage or it is a base64 with no padding
  278. * (e.g. when it is not needed)
  279. */
  280. if (nupper > 1 && nlower > 1) {
  281. /*
  282. * We have both uppercase and lowercase letters, so it can be
  283. * base64
  284. */
  285. ret = RSPAMD_CTE_B64;
  286. }
  287. else {
  288. ret = RSPAMD_CTE_7BIT;
  289. }
  290. }
  291. }
  292. else {
  293. if (((end - (const guchar *)part->raw_data.begin) + padeqsign) % 4 == 0) {
  294. if (padeqsign == 0) {
  295. /*
  296. * It can be either base64 or plain text, hard to say
  297. * Let's assume that if we have > 1 uppercase it is
  298. * likely base64
  299. */
  300. if (nupper > 1 && nlower > 1) {
  301. ret = RSPAMD_CTE_B64;
  302. }
  303. else {
  304. ret = RSPAMD_CTE_7BIT;
  305. }
  306. }
  307. else {
  308. ret = RSPAMD_CTE_B64;
  309. }
  310. }
  311. else {
  312. /* No way */
  313. if (padeqsign == 1 || padeqsign == 2) {
  314. ret = RSPAMD_CTE_B64;
  315. }
  316. else {
  317. ret = RSPAMD_CTE_7BIT;
  318. }
  319. }
  320. }
  321. }
  322. else if (n8bit == 0) {
  323. if (neqsign > 2 && nqpencoded > 2) {
  324. ret = RSPAMD_CTE_QP;
  325. }
  326. else {
  327. ret = RSPAMD_CTE_7BIT;
  328. }
  329. }
  330. else {
  331. ret = RSPAMD_CTE_8BIT;
  332. }
  333. msg_debug_mime ("detected cte: %s", rspamd_cte_to_string (ret));
  334. return ret;
  335. }
  336. static void
  337. rspamd_mime_part_get_cte (struct rspamd_task *task,
  338. struct rspamd_mime_headers_table *hdrs,
  339. struct rspamd_mime_part *part,
  340. gboolean apply_heuristic)
  341. {
  342. struct rspamd_mime_header *hdr, *cur;
  343. enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
  344. gboolean parent_propagated = FALSE;
  345. hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE);
  346. if (hdr == NULL) {
  347. if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
  348. !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
  349. part->cte = part->parent_part->cte;
  350. parent_propagated = TRUE;
  351. goto check_cte;
  352. }
  353. if (apply_heuristic) {
  354. part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
  355. msg_info_task ("detected missing CTE for part as: %s",
  356. rspamd_cte_to_string (part->cte));
  357. }
  358. part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
  359. }
  360. else {
  361. DL_FOREACH (hdr, cur) {
  362. gsize hlen;
  363. gchar lc_buf[128];
  364. hlen = rspamd_snprintf (lc_buf, sizeof (lc_buf), "%s", cur->value);
  365. rspamd_str_lc (lc_buf, hlen);
  366. cte = rspamd_mime_parse_cte (lc_buf, hlen);
  367. if (cte != RSPAMD_CTE_UNKNOWN) {
  368. part->cte = cte;
  369. break;
  370. }
  371. }
  372. check_cte:
  373. if (apply_heuristic) {
  374. if (part->cte == RSPAMD_CTE_UNKNOWN) {
  375. part->cte = rspamd_mime_part_get_cte_heuristic (task, part);
  376. msg_info_task ("corrected bad CTE for part to: %s",
  377. rspamd_cte_to_string (part->cte));
  378. }
  379. else if (part->cte == RSPAMD_CTE_B64 ||
  380. part->cte == RSPAMD_CTE_QP) {
  381. /* Additionally check sanity */
  382. cte = rspamd_mime_part_get_cte_heuristic (task, part);
  383. if (cte == RSPAMD_CTE_8BIT) {
  384. msg_info_task (
  385. "incorrect cte specified for part: %s, %s detected",
  386. rspamd_cte_to_string (part->cte),
  387. rspamd_cte_to_string (cte));
  388. part->cte = cte;
  389. part->flags |= RSPAMD_MIME_PART_BAD_CTE;
  390. }
  391. else if (cte != part->cte && parent_propagated) {
  392. part->cte = cte;
  393. msg_info_task ("detected missing CTE for part as: %s",
  394. rspamd_cte_to_string (part->cte));
  395. }
  396. }
  397. else {
  398. msg_debug_mime ("processed cte: %s",
  399. rspamd_cte_to_string (cte));
  400. }
  401. }
  402. else {
  403. msg_debug_mime ("processed cte: %s", rspamd_cte_to_string (cte));
  404. }
  405. }
  406. }
  407. static void
  408. rspamd_mime_part_get_cd (struct rspamd_task *task, struct rspamd_mime_part *part)
  409. {
  410. struct rspamd_mime_header *hdr, *cur;
  411. struct rspamd_content_disposition *cd = NULL;
  412. rspamd_ftok_t srch;
  413. struct rspamd_content_type_param *found;
  414. hdr = rspamd_message_get_header_from_hash(part->raw_headers,
  415. "Content-Disposition", FALSE);
  416. if (hdr == NULL) {
  417. cd = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cd));
  418. cd->type = RSPAMD_CT_INLINE;
  419. /* We can also have content disposition definitions in Content-Type */
  420. if (part->ct && part->ct->attrs) {
  421. RSPAMD_FTOK_ASSIGN (&srch, "name");
  422. found = g_hash_table_lookup (part->ct->attrs, &srch);
  423. if (!found) {
  424. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  425. found = g_hash_table_lookup (part->ct->attrs, &srch);
  426. }
  427. if (found) {
  428. cd->type = RSPAMD_CT_ATTACHMENT;
  429. memcpy (&cd->filename, &found->value, sizeof (cd->filename));
  430. }
  431. }
  432. }
  433. else {
  434. DL_FOREACH (hdr, cur) {
  435. gsize hlen;
  436. cd = NULL;
  437. if (cur->value) {
  438. hlen = strlen (cur->value);
  439. cd = rspamd_content_disposition_parse (cur->value, hlen,
  440. task->task_pool);
  441. }
  442. if (cd) {
  443. /* We still need to check filename */
  444. if (cd->filename.len == 0) {
  445. if (part->ct && part->ct->attrs) {
  446. RSPAMD_FTOK_ASSIGN (&srch, "name");
  447. found = g_hash_table_lookup (part->ct->attrs, &srch);
  448. if (!found) {
  449. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  450. found = g_hash_table_lookup (part->ct->attrs, &srch);
  451. }
  452. if (found) {
  453. cd->type = RSPAMD_CT_ATTACHMENT;
  454. memcpy (&cd->filename, &found->value,
  455. sizeof (cd->filename));
  456. }
  457. }
  458. }
  459. msg_debug_mime ("processed content disposition: %s, file: \"%T\"",
  460. cd->lc_data, &cd->filename);
  461. break;
  462. }
  463. else if (part->ct) {
  464. /*
  465. * Even in case of malformed Content-Disposition, we can still
  466. * fall back to Content-Type
  467. */
  468. cd = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cd));
  469. cd->type = RSPAMD_CT_INLINE;
  470. /* We can also have content disposition definitions in Content-Type */
  471. if (part->ct->attrs) {
  472. RSPAMD_FTOK_ASSIGN (&srch, "name");
  473. found = g_hash_table_lookup (part->ct->attrs, &srch);
  474. if (!found) {
  475. RSPAMD_FTOK_ASSIGN (&srch, "filename");
  476. found = g_hash_table_lookup (part->ct->attrs, &srch);
  477. }
  478. if (found) {
  479. cd->type = RSPAMD_CT_ATTACHMENT;
  480. memcpy (&cd->filename, &found->value, sizeof (cd->filename));
  481. }
  482. }
  483. }
  484. }
  485. }
  486. part->cd = cd;
  487. }
  488. void
  489. rspamd_mime_parser_calc_digest (struct rspamd_mime_part *part)
  490. {
  491. /* Blake2b applied to string 'rspamd' */
  492. static const guchar hash_key[] = {
  493. 0xef,0x43,0xae,0x80,0xcc,0x8d,0xc3,0x4c,
  494. 0x6f,0x1b,0xd6,0x18,0x1b,0xae,0x87,0x74,
  495. 0x0c,0xca,0xf7,0x8e,0x5f,0x2e,0x54,0x32,
  496. 0xf6,0x79,0xb9,0x27,0x26,0x96,0x20,0x92,
  497. 0x70,0x07,0x85,0xeb,0x83,0xf7,0x89,0xe0,
  498. 0xd7,0x32,0x2a,0xd2,0x1a,0x64,0x41,0xef,
  499. 0x49,0xff,0xc3,0x8c,0x54,0xf9,0x67,0x74,
  500. 0x30,0x1e,0x70,0x2e,0xb7,0x12,0x09,0xfe,
  501. };
  502. if (part->parsed_data.len > 0) {
  503. rspamd_cryptobox_hash (part->digest,
  504. part->parsed_data.begin, part->parsed_data.len,
  505. hash_key, sizeof (hash_key));
  506. }
  507. }
  508. static enum rspamd_mime_parse_error
  509. rspamd_mime_parse_normal_part (struct rspamd_task *task,
  510. struct rspamd_mime_part *part,
  511. struct rspamd_mime_parser_ctx *st,
  512. struct rspamd_content_type *ct,
  513. GError **err)
  514. {
  515. rspamd_fstring_t *parsed;
  516. gssize r;
  517. g_assert (part != NULL);
  518. rspamd_mime_part_get_cte (task, part->raw_headers, part,
  519. part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
  520. rspamd_mime_part_get_cd (task, part);
  521. switch (part->cte) {
  522. case RSPAMD_CTE_7BIT:
  523. case RSPAMD_CTE_8BIT:
  524. case RSPAMD_CTE_UNKNOWN:
  525. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
  526. if (part->cte != RSPAMD_CTE_7BIT) {
  527. /* We have something that has a missing content-type,
  528. * but it has non-7bit characters.
  529. *
  530. * In theory, it is very unsafe to process it as a text part
  531. * as we unlikely get some sane result
  532. */
  533. /*
  534. * On the other hand, there is an evidence that some
  535. * emails actually rely on that.
  536. * So we apply an expensive hack here:
  537. * if there are no 8bit characters -OR- the content is valid
  538. * UTF8, we can still imply Content-Type == text/plain
  539. */
  540. if (rspamd_str_has_8bit (part->raw_data.begin, part->raw_data.len) &&
  541. !rspamd_fast_utf8_validate (part->raw_data.begin, part->raw_data.len)) {
  542. part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
  543. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  544. }
  545. }
  546. }
  547. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
  548. /* Need to copy text as we have couple of in-place change functions */
  549. parsed = rspamd_fstring_sized_new (part->raw_data.len);
  550. parsed->len = part->raw_data.len;
  551. memcpy (parsed->str, part->raw_data.begin, parsed->len);
  552. part->parsed_data.begin = parsed->str;
  553. part->parsed_data.len = parsed->len;
  554. rspamd_mempool_notify_alloc (task->task_pool, parsed->len);
  555. rspamd_mempool_add_destructor (task->task_pool,
  556. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  557. }
  558. else {
  559. part->parsed_data.begin = part->raw_data.begin;
  560. part->parsed_data.len = part->raw_data.len;
  561. }
  562. break;
  563. case RSPAMD_CTE_QP:
  564. parsed = rspamd_fstring_sized_new (part->raw_data.len);
  565. r = rspamd_decode_qp_buf (part->raw_data.begin, part->raw_data.len,
  566. parsed->str, parsed->allocated);
  567. if (r != -1) {
  568. parsed->len = r;
  569. part->parsed_data.begin = parsed->str;
  570. part->parsed_data.len = parsed->len;
  571. rspamd_mempool_notify_alloc (task->task_pool, parsed->len);
  572. rspamd_mempool_add_destructor (task->task_pool,
  573. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  574. }
  575. else {
  576. msg_err_task ("invalid quoted-printable encoded part, assume 8bit");
  577. if (part->ct) {
  578. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  579. }
  580. part->cte = RSPAMD_CTE_8BIT;
  581. memcpy (parsed->str, part->raw_data.begin, part->raw_data.len);
  582. parsed->len = part->raw_data.len;
  583. part->parsed_data.begin = parsed->str;
  584. part->parsed_data.len = parsed->len;
  585. rspamd_mempool_notify_alloc (task->task_pool, parsed->len);
  586. rspamd_mempool_add_destructor (task->task_pool,
  587. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  588. }
  589. break;
  590. case RSPAMD_CTE_B64:
  591. parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
  592. rspamd_cryptobox_base64_decode (part->raw_data.begin,
  593. part->raw_data.len,
  594. parsed->str, &parsed->len);
  595. part->parsed_data.begin = parsed->str;
  596. part->parsed_data.len = parsed->len;
  597. rspamd_mempool_notify_alloc (task->task_pool, parsed->len);
  598. rspamd_mempool_add_destructor (task->task_pool,
  599. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  600. break;
  601. case RSPAMD_CTE_UUE:
  602. parsed = rspamd_fstring_sized_new (part->raw_data.len / 4 * 3 + 12);
  603. r = rspamd_decode_uue_buf (part->raw_data.begin, part->raw_data.len,
  604. parsed->str, parsed->allocated);
  605. rspamd_mempool_notify_alloc (task->task_pool, parsed->len);
  606. rspamd_mempool_add_destructor (task->task_pool,
  607. (rspamd_mempool_destruct_t)rspamd_fstring_free, parsed);
  608. if (r != -1) {
  609. parsed->len = r;
  610. part->parsed_data.begin = parsed->str;
  611. part->parsed_data.len = parsed->len;
  612. }
  613. else {
  614. msg_err_task ("invalid uuencoding in encoded part, assume 8bit");
  615. if (part->ct) {
  616. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  617. }
  618. part->cte = RSPAMD_CTE_8BIT;
  619. parsed->len = MIN (part->raw_data.len, parsed->allocated);
  620. memcpy (parsed->str, part->raw_data.begin, parsed->len);
  621. rspamd_mempool_notify_alloc (task->task_pool, parsed->len);
  622. part->parsed_data.begin = parsed->str;
  623. part->parsed_data.len = parsed->len;
  624. }
  625. break;
  626. default:
  627. g_assert_not_reached ();
  628. }
  629. part->part_number = MESSAGE_FIELD (task, parts)->len;
  630. part->urls = g_ptr_array_new ();
  631. g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
  632. msg_debug_mime ("parsed data part %T/%T of length %z (%z orig), %s cte",
  633. &part->ct->type, &part->ct->subtype, part->parsed_data.len,
  634. part->raw_data.len, rspamd_cte_to_string (part->cte));
  635. rspamd_mime_parser_calc_digest (part);
  636. if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) {
  637. CMS_ContentInfo *cms;
  638. const unsigned char *der_beg = part->parsed_data.begin;
  639. cms = d2i_CMS_ContentInfo (NULL, &der_beg, part->parsed_data.len);
  640. if (cms) {
  641. const ASN1_OBJECT *asn_ct = CMS_get0_eContentType (cms);
  642. int ct_nid = OBJ_obj2nid (asn_ct);
  643. if (ct_nid == NID_pkcs7_data) {
  644. BIO *bio = BIO_new_mem_buf (part->parsed_data.begin,
  645. part->parsed_data.len);
  646. PKCS7 *p7;
  647. p7 = d2i_PKCS7_bio (bio, NULL);
  648. if (p7) {
  649. ct_nid = OBJ_obj2nid (p7->type);
  650. if (ct_nid == NID_pkcs7_signed) {
  651. PKCS7 *p7_signed_content = p7->d.sign->contents;
  652. ct_nid = OBJ_obj2nid (p7_signed_content->type);
  653. if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) {
  654. int ret;
  655. msg_debug_mime ("found an additional part inside of "
  656. "smime structure of type %T/%T; length=%d",
  657. &ct->type, &ct->subtype, p7_signed_content->d.data->length);
  658. /*
  659. * Since ASN.1 structures are freed, we need to copy
  660. * the content
  661. */
  662. gchar *cpy = rspamd_mempool_alloc (task->task_pool,
  663. p7_signed_content->d.data->length);
  664. memcpy (cpy, p7_signed_content->d.data->data,
  665. p7_signed_content->d.data->length);
  666. ret = rspamd_mime_process_multipart_node (task,
  667. st, NULL,
  668. cpy,cpy + p7_signed_content->d.data->length,
  669. TRUE, err);
  670. PKCS7_free (p7);
  671. BIO_free (bio);
  672. CMS_ContentInfo_free (cms);
  673. return ret;
  674. }
  675. }
  676. PKCS7_free (p7);
  677. }
  678. BIO_free (bio);
  679. }
  680. CMS_ContentInfo_free (cms);
  681. }
  682. }
  683. return RSPAMD_MIME_PARSE_OK;
  684. }
  685. struct rspamd_mime_multipart_cbdata {
  686. struct rspamd_task *task;
  687. struct rspamd_mime_part *multipart;
  688. struct rspamd_mime_parser_ctx *st;
  689. const gchar *part_start;
  690. rspamd_ftok_t *cur_boundary;
  691. guint64 bhash;
  692. GError **err;
  693. };
  694. static enum rspamd_mime_parse_error
  695. rspamd_mime_process_multipart_node (struct rspamd_task *task,
  696. struct rspamd_mime_parser_ctx *st,
  697. struct rspamd_mime_part *multipart,
  698. const gchar *start, const gchar *end,
  699. gboolean is_finished,
  700. GError **err)
  701. {
  702. struct rspamd_content_type *ct, *sel = NULL;
  703. struct rspamd_mime_header *hdr = NULL, *cur;
  704. struct rspamd_mime_part *npart;
  705. GString str;
  706. goffset hdr_pos, body_pos;
  707. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
  708. str.str = (gchar *)start;
  709. str.len = end - start;
  710. if (*start == '\n' || *start == '\r') {
  711. /*
  712. * We have a part that starts from newline which means that
  713. * there are completely no headers in this part,
  714. * hence we assume it as a text part
  715. */
  716. hdr_pos = 0;
  717. body_pos = 0;
  718. if (!is_finished) {
  719. /* Ignore garbage */
  720. const gchar *p = start;
  721. gboolean seen_something = FALSE;
  722. while (p < end) {
  723. if (g_ascii_isalnum (*p)) {
  724. seen_something = TRUE;
  725. break;
  726. }
  727. p ++;
  728. }
  729. if (!seen_something) {
  730. return RSPAMD_MIME_PARSE_NO_PART;
  731. }
  732. }
  733. }
  734. else {
  735. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  736. }
  737. npart = rspamd_mempool_alloc0 (task->task_pool,
  738. sizeof (struct rspamd_mime_part));
  739. npart->parent_part = multipart;
  740. npart->raw_headers = rspamd_message_headers_new ();
  741. npart->headers_order = NULL;
  742. if (multipart) {
  743. if (multipart->specific.mp->children == NULL) {
  744. multipart->specific.mp->children = g_ptr_array_sized_new (2);
  745. }
  746. g_ptr_array_add (multipart->specific.mp->children, npart);
  747. }
  748. if (hdr_pos > 0 && hdr_pos < str.len) {
  749. npart->raw_headers_str = str.str;
  750. npart->raw_headers_len = hdr_pos;
  751. npart->raw_data.begin = start + body_pos;
  752. npart->raw_data.len = (end - start) - body_pos;
  753. if (npart->raw_headers_len > 0) {
  754. rspamd_mime_headers_process (task, npart->raw_headers,
  755. &npart->headers_order,
  756. npart->raw_headers_str,
  757. npart->raw_headers_len,
  758. FALSE);
  759. /* Preserve the natural order */
  760. if (npart->headers_order) {
  761. LL_REVERSE2 (npart->headers_order, ord_next);
  762. }
  763. }
  764. hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
  765. "Content-Type", FALSE);
  766. }
  767. else {
  768. npart->raw_headers_str = 0;
  769. npart->raw_headers_len = 0;
  770. npart->raw_data.begin = start;
  771. npart->raw_data.len = end - start;
  772. }
  773. if (hdr != NULL) {
  774. DL_FOREACH (hdr, cur) {
  775. ct = rspamd_content_type_parse (cur->value, strlen (cur->value),
  776. task->task_pool);
  777. /* Here we prefer multipart content-type or any content-type */
  778. if (ct) {
  779. if (sel == NULL) {
  780. sel = ct;
  781. }
  782. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  783. sel = ct;
  784. }
  785. }
  786. }
  787. }
  788. if (sel == NULL) {
  789. sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
  790. RSPAMD_FTOK_ASSIGN (&sel->type, "text");
  791. RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
  792. }
  793. npart->ct = sel;
  794. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  795. st->nesting ++;
  796. g_ptr_array_add (st->stack, npart);
  797. npart->part_type = RSPAMD_MIME_PART_MULTIPART;
  798. npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
  799. sizeof (struct rspamd_mime_multipart));
  800. memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
  801. sizeof (rspamd_ftok_t));
  802. ret = rspamd_mime_parse_multipart_part (task, npart, st, err);
  803. }
  804. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  805. st->nesting ++;
  806. g_ptr_array_add (st->stack, npart);
  807. npart->part_type = RSPAMD_MIME_PART_MESSAGE;
  808. if ((ret = rspamd_mime_parse_normal_part (task, npart, st, sel, err))
  809. == RSPAMD_MIME_PARSE_OK) {
  810. ret = rspamd_mime_parse_message (task, npart, st, err);
  811. }
  812. }
  813. else {
  814. ret = rspamd_mime_parse_normal_part (task, npart, st, sel, err);
  815. }
  816. return ret;
  817. }
  818. static enum rspamd_mime_parse_error
  819. rspamd_mime_parse_multipart_cb (struct rspamd_task *task,
  820. struct rspamd_mime_part *multipart,
  821. struct rspamd_mime_parser_ctx *st,
  822. struct rspamd_mime_multipart_cbdata *cb,
  823. struct rspamd_mime_boundary *b)
  824. {
  825. const gchar *pos = st->start + b->boundary;
  826. enum rspamd_mime_parse_error ret;
  827. task = cb->task;
  828. /* Now check boundary */
  829. if (!cb->part_start) {
  830. cb->part_start = st->start + b->start;
  831. st->pos = cb->part_start;
  832. }
  833. else {
  834. /*
  835. * We have seen the start of the boundary,
  836. * but it might be unsuitable (e.g. in broken headers)
  837. */
  838. if (cb->part_start < pos && cb->cur_boundary) {
  839. if ((ret = rspamd_mime_process_multipart_node (task, cb->st,
  840. cb->multipart, cb->part_start, pos, TRUE, cb->err))
  841. != RSPAMD_MIME_PARSE_OK) {
  842. return ret;
  843. }
  844. if (b->start > 0) {
  845. /* Go towards the next part */
  846. cb->part_start = st->start + b->start;
  847. cb->st->pos = cb->part_start;
  848. }
  849. }
  850. else {
  851. /* We have an empty boundary, do nothing */
  852. }
  853. }
  854. return RSPAMD_MIME_PARSE_OK;
  855. }
  856. static enum rspamd_mime_parse_error
  857. rspamd_multipart_boundaries_filter (struct rspamd_task *task,
  858. struct rspamd_mime_part *multipart,
  859. struct rspamd_mime_parser_ctx *st,
  860. struct rspamd_mime_multipart_cbdata *cb)
  861. {
  862. struct rspamd_mime_boundary *cur;
  863. goffset last_offset;
  864. guint i, sel = 0;
  865. enum rspamd_mime_parse_error ret;
  866. last_offset = (multipart->raw_data.begin - st->start) +
  867. multipart->raw_data.len;
  868. /* Find the first offset suitable for this part */
  869. for (i = 0; i < st->boundaries->len; i ++) {
  870. cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
  871. if (cur->start >= multipart->raw_data.begin - st->start) {
  872. if (cb->cur_boundary) {
  873. /* Check boundary */
  874. msg_debug_mime ("compare %L and %L (and %L)",
  875. cb->bhash, cur->hash, cur->closed_hash);
  876. if (cb->bhash == cur->hash) {
  877. sel = i;
  878. break;
  879. }
  880. else if (cb->bhash == cur->closed_hash) {
  881. /* Not a closing element in fact */
  882. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  883. cur->hash = cur->closed_hash;
  884. sel = i;
  885. break;
  886. }
  887. }
  888. else {
  889. /* Set current boundary */
  890. cb->cur_boundary = rspamd_mempool_alloc (task->task_pool,
  891. sizeof (rspamd_ftok_t));
  892. cb->cur_boundary->begin = st->start + cur->boundary;
  893. cb->cur_boundary->len = 0;
  894. cb->bhash = cur->hash;
  895. sel = i;
  896. break;
  897. }
  898. }
  899. }
  900. /* Now we can go forward with boundaries that are same to what we have */
  901. for (i = sel; i < st->boundaries->len; i ++) {
  902. cur = &g_array_index (st->boundaries, struct rspamd_mime_boundary, i);
  903. if (cur->boundary > last_offset) {
  904. break;
  905. }
  906. if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
  907. if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
  908. cb, cur)) != RSPAMD_MIME_PARSE_OK) {
  909. return ret;
  910. }
  911. if (cur->closed_hash == cb->bhash) {
  912. /* We have again fake closed hash */
  913. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  914. cur->hash = cur->closed_hash;
  915. }
  916. if (RSPAMD_BOUNDARY_IS_CLOSED (cur)) {
  917. /* We also might check the next boundary... */
  918. if (i < st->boundaries->len - 1) {
  919. cur = &g_array_index (st->boundaries,
  920. struct rspamd_mime_boundary, i + 1);
  921. if (cur->hash == cb->bhash) {
  922. continue;
  923. }
  924. else if (cur->closed_hash == cb->bhash) {
  925. /* We have again fake closed hash */
  926. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  927. cur->hash = cur->closed_hash;
  928. continue;
  929. }
  930. }
  931. break;
  932. }
  933. }
  934. }
  935. if (i == st->boundaries->len && cb->cur_boundary) {
  936. /* Process the last part */
  937. struct rspamd_mime_boundary fb;
  938. fb.boundary = last_offset;
  939. fb.start = -1;
  940. if ((ret = rspamd_mime_parse_multipart_cb (task, multipart, st,
  941. cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
  942. return ret;
  943. }
  944. }
  945. return RSPAMD_MIME_PARSE_OK;
  946. }
  947. static enum rspamd_mime_parse_error
  948. rspamd_mime_parse_multipart_part (struct rspamd_task *task,
  949. struct rspamd_mime_part *part,
  950. struct rspamd_mime_parser_ctx *st,
  951. GError **err)
  952. {
  953. struct rspamd_mime_multipart_cbdata cbdata;
  954. enum rspamd_mime_parse_error ret;
  955. if (st->nesting > max_nested) {
  956. g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  957. st->nesting);
  958. return RSPAMD_MIME_PARSE_NESTING;
  959. }
  960. part->part_number = MESSAGE_FIELD (task, parts)->len;
  961. part->urls = g_ptr_array_new ();
  962. g_ptr_array_add (MESSAGE_FIELD (task, parts), part);
  963. st->nesting ++;
  964. rspamd_mime_part_get_cte (task, part->raw_headers, part, FALSE);
  965. st->pos = part->raw_data.begin;
  966. cbdata.multipart = part;
  967. cbdata.task = task;
  968. cbdata.st = st;
  969. cbdata.part_start = NULL;
  970. cbdata.err = err;
  971. if (part->ct->boundary.len > 0) {
  972. /* We know our boundary */
  973. cbdata.cur_boundary = &part->ct->boundary;
  974. rspamd_cryptobox_siphash ((guchar *)&cbdata.bhash,
  975. cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
  976. lib_ctx->hkey);
  977. msg_debug_mime ("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
  978. }
  979. else {
  980. /* Guess boundary */
  981. cbdata.cur_boundary = NULL;
  982. cbdata.bhash = 0;
  983. }
  984. ret = rspamd_multipart_boundaries_filter (task, part, st, &cbdata);
  985. /* Cleanup stack */
  986. st->nesting --;
  987. g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
  988. return ret;
  989. }
  990. /* Process boundary like structures in a message */
  991. static gint
  992. rspamd_mime_preprocess_cb (struct rspamd_multipattern *mp,
  993. guint strnum,
  994. gint match_start,
  995. gint match_pos,
  996. const gchar *text,
  997. gsize len,
  998. void *context)
  999. {
  1000. const gchar *end = text + len, *p = text + match_pos, *bend;
  1001. gsize blen;
  1002. gboolean closing = FALSE;
  1003. struct rspamd_mime_boundary b;
  1004. struct rspamd_mime_parser_ctx *st = context;
  1005. struct rspamd_task *task;
  1006. task = st->task;
  1007. if (G_LIKELY (p < end)) {
  1008. gboolean seen_non_dash = FALSE;
  1009. blen = 0;
  1010. while (p < end) {
  1011. if (*p == '\r' || *p == '\n') {
  1012. break;
  1013. }
  1014. else if (*p != '-') {
  1015. seen_non_dash = TRUE;
  1016. }
  1017. blen ++;
  1018. p ++;
  1019. }
  1020. if (blen > 0 && seen_non_dash) {
  1021. /* We have found something like boundary */
  1022. p = text + match_pos;
  1023. bend = p + blen - 1;
  1024. if (*bend == '-') {
  1025. /* We need to verify last -- */
  1026. if (bend > p + 1 && *(bend - 1) == '-') {
  1027. closing = TRUE;
  1028. bend --;
  1029. blen -= 2;
  1030. }
  1031. else {
  1032. /* Not a closing boundary somehow */
  1033. bend ++;
  1034. }
  1035. }
  1036. else {
  1037. bend ++;
  1038. }
  1039. while (bend < end) {
  1040. if (*bend == '\r') {
  1041. bend ++;
  1042. /* \r\n */
  1043. if (bend < end && *bend == '\n') {
  1044. bend ++;
  1045. }
  1046. }
  1047. else if (*bend == '\n') {
  1048. /* \n */
  1049. bend ++;
  1050. }
  1051. else if (g_ascii_isspace (*bend)){
  1052. /* Spaces in the same line, skip them */
  1053. bend ++;
  1054. continue;
  1055. }
  1056. break;
  1057. }
  1058. b.boundary = p - st->start - 2;
  1059. b.start = bend - st->start;
  1060. /* Small optimisation as boundaries are usually short strings */
  1061. gchar *lc_copy, lc_copy_buf[128];
  1062. if (blen + 2 < sizeof(lc_copy_buf)) {
  1063. lc_copy = lc_copy_buf;
  1064. }
  1065. else {
  1066. lc_copy = g_malloc (blen + 2);
  1067. }
  1068. if (closing) {
  1069. memcpy (lc_copy, p, blen + 2);
  1070. rspamd_str_lc (lc_copy, blen + 2);
  1071. }
  1072. else {
  1073. memcpy (lc_copy, p, blen);
  1074. rspamd_str_lc (lc_copy, blen);
  1075. }
  1076. rspamd_cryptobox_siphash ((guchar *)&b.hash, lc_copy, blen,
  1077. lib_ctx->hkey);
  1078. msg_debug_mime ("normal hash: %*s -> %L, %d boffset, %d data offset",
  1079. (gint)blen, lc_copy, b.hash, (int)b.boundary, (int)b.start);
  1080. if (closing) {
  1081. b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
  1082. rspamd_cryptobox_siphash ((guchar *)&b.closed_hash, lc_copy,
  1083. blen + 2,
  1084. lib_ctx->hkey);
  1085. msg_debug_mime ("closing hash: %*s -> %L, %d boffset, %d data offset",
  1086. (gint)blen + 2, lc_copy,
  1087. b.closed_hash,
  1088. (int)b.boundary, (int)b.start);
  1089. }
  1090. else {
  1091. b.flags = 0;
  1092. b.closed_hash = 0;
  1093. }
  1094. /* Check if a string has been allocated on the heap */
  1095. if (blen + 2 >= sizeof(lc_copy_buf)) {
  1096. g_free(lc_copy);
  1097. }
  1098. g_array_append_val (st->boundaries, b);
  1099. }
  1100. }
  1101. return 0;
  1102. }
  1103. static goffset
  1104. rspamd_mime_parser_headers_heuristic (GString *input, goffset *body_start)
  1105. {
  1106. const gsize default_max_len = 76;
  1107. gsize max_len = MIN (input->len, default_max_len);
  1108. const gchar *p, *end;
  1109. enum {
  1110. st_before_colon = 0,
  1111. st_colon,
  1112. st_spaces_after_colon,
  1113. st_value,
  1114. st_error
  1115. } state = st_before_colon;
  1116. p = input->str;
  1117. end = p + max_len;
  1118. while (p < end) {
  1119. switch (state) {
  1120. case st_before_colon:
  1121. if (G_UNLIKELY (*p == ':')) {
  1122. state = st_colon;
  1123. }
  1124. else if (G_UNLIKELY (!g_ascii_isgraph (*p))) {
  1125. state = st_error;
  1126. }
  1127. p ++;
  1128. break;
  1129. case st_colon:
  1130. if (g_ascii_isspace (*p)) {
  1131. state = st_spaces_after_colon;
  1132. }
  1133. else {
  1134. state = st_value;
  1135. }
  1136. p ++;
  1137. break;
  1138. case st_spaces_after_colon:
  1139. if (!g_ascii_isspace (*p)) {
  1140. state = st_value;
  1141. }
  1142. p ++;
  1143. break;
  1144. case st_value:
  1145. /* We accept any value */
  1146. goto end;
  1147. break;
  1148. case st_error:
  1149. return (-1);
  1150. break;
  1151. }
  1152. }
  1153. end:
  1154. if (state == st_value) {
  1155. if (body_start) {
  1156. *body_start = input->len;
  1157. }
  1158. return input->len;
  1159. }
  1160. return (-1);
  1161. }
  1162. static void
  1163. rspamd_mime_preprocess_message (struct rspamd_task *task,
  1164. struct rspamd_mime_part *top,
  1165. struct rspamd_mime_parser_ctx *st)
  1166. {
  1167. if (top->raw_data.begin >= st->pos) {
  1168. rspamd_multipattern_lookup (lib_ctx->mp_boundary,
  1169. top->raw_data.begin - 1,
  1170. top->raw_data.len + 1,
  1171. rspamd_mime_preprocess_cb, st, NULL);
  1172. }
  1173. else {
  1174. rspamd_multipattern_lookup (lib_ctx->mp_boundary,
  1175. st->pos,
  1176. st->end - st->pos,
  1177. rspamd_mime_preprocess_cb, st, NULL);
  1178. }
  1179. }
  1180. static void
  1181. rspamd_mime_parse_stack_free (struct rspamd_mime_parser_ctx *st)
  1182. {
  1183. if (st) {
  1184. g_ptr_array_free (st->stack, TRUE);
  1185. g_array_free (st->boundaries, TRUE);
  1186. g_free (st);
  1187. }
  1188. }
  1189. static enum rspamd_mime_parse_error
  1190. rspamd_mime_parse_message (struct rspamd_task *task,
  1191. struct rspamd_mime_part *part,
  1192. struct rspamd_mime_parser_ctx *st,
  1193. GError **err)
  1194. {
  1195. struct rspamd_content_type *ct, *sel = NULL;
  1196. struct rspamd_mime_header *hdr = NULL, *cur;
  1197. const gchar *pbegin, *p;
  1198. gsize plen, len;
  1199. struct rspamd_mime_part *npart;
  1200. goffset hdr_pos, body_pos;
  1201. guint i;
  1202. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1203. GString str;
  1204. struct rspamd_mime_parser_ctx *nst = st;
  1205. if (st->nesting > max_nested) {
  1206. g_set_error (err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  1207. st->nesting);
  1208. return RSPAMD_MIME_PARSE_NESTING;
  1209. }
  1210. /* Allocate real part */
  1211. npart = rspamd_mempool_alloc0 (task->task_pool,
  1212. sizeof (struct rspamd_mime_part));
  1213. if (part == NULL) {
  1214. /* Top level message */
  1215. p = task->msg.begin;
  1216. len = task->msg.len;
  1217. str.str = (gchar *)p;
  1218. str.len = len;
  1219. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  1220. if (hdr_pos > 0 && hdr_pos < str.len) {
  1221. MESSAGE_FIELD (task, raw_headers_content).begin = str.str;
  1222. MESSAGE_FIELD (task, raw_headers_content).len = hdr_pos;
  1223. MESSAGE_FIELD (task, raw_headers_content).body_start = str.str + body_pos;
  1224. if (MESSAGE_FIELD (task, raw_headers_content).len > 0) {
  1225. rspamd_mime_headers_process (task,
  1226. MESSAGE_FIELD (task, raw_headers),
  1227. &MESSAGE_FIELD (task, headers_order),
  1228. MESSAGE_FIELD (task, raw_headers_content).begin,
  1229. MESSAGE_FIELD (task, raw_headers_content).len,
  1230. TRUE);
  1231. npart->raw_headers = rspamd_message_headers_ref (
  1232. MESSAGE_FIELD (task, raw_headers));
  1233. /* Preserve the natural order */
  1234. if (MESSAGE_FIELD (task, headers_order)) {
  1235. LL_REVERSE2 (MESSAGE_FIELD (task, headers_order), ord_next);
  1236. }
  1237. }
  1238. hdr = rspamd_message_get_header_from_hash(
  1239. MESSAGE_FIELD (task, raw_headers),
  1240. "Content-Type", FALSE);
  1241. }
  1242. else {
  1243. /* First apply heuristic, maybe we have just headers */
  1244. hdr_pos = rspamd_mime_parser_headers_heuristic (&str, &body_pos);
  1245. if (hdr_pos > 0 && hdr_pos <= str.len) {
  1246. MESSAGE_FIELD (task, raw_headers_content).begin = str.str;
  1247. MESSAGE_FIELD (task, raw_headers_content).len = hdr_pos;
  1248. MESSAGE_FIELD (task, raw_headers_content).body_start = str.str +
  1249. body_pos;
  1250. if (MESSAGE_FIELD (task, raw_headers_content).len > 0) {
  1251. rspamd_mime_headers_process (task,
  1252. MESSAGE_FIELD (task, raw_headers),
  1253. &MESSAGE_FIELD (task, headers_order),
  1254. MESSAGE_FIELD (task, raw_headers_content).begin,
  1255. MESSAGE_FIELD (task, raw_headers_content).len,
  1256. TRUE);
  1257. npart->raw_headers = rspamd_message_headers_ref (
  1258. MESSAGE_FIELD (task, raw_headers));
  1259. /* Preserve the natural order */
  1260. if (MESSAGE_FIELD (task, headers_order)) {
  1261. LL_REVERSE2 (MESSAGE_FIELD (task, headers_order), ord_next);
  1262. }
  1263. }
  1264. hdr = rspamd_message_get_header_from_hash(
  1265. MESSAGE_FIELD (task, raw_headers),
  1266. "Content-Type", FALSE);
  1267. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  1268. }
  1269. else {
  1270. body_pos = 0;
  1271. }
  1272. }
  1273. pbegin = st->start + body_pos;
  1274. plen = st->end - pbegin;
  1275. npart->headers_order = NULL;
  1276. }
  1277. else {
  1278. /*
  1279. * Here are dragons:
  1280. * We allocate new parser context as we need to shift pointers
  1281. */
  1282. nst = g_malloc0 (sizeof (*st));
  1283. nst->stack = g_ptr_array_sized_new (4);
  1284. nst->boundaries = g_array_sized_new (FALSE, FALSE,
  1285. sizeof (struct rspamd_mime_boundary), 8);
  1286. nst->start = part->parsed_data.begin;
  1287. nst->end = nst->start + part->parsed_data.len;
  1288. nst->pos = nst->start;
  1289. nst->task = st->task;
  1290. nst->nesting = st->nesting;
  1291. st->nesting ++;
  1292. str.str = (gchar *)part->parsed_data.begin;
  1293. str.len = part->parsed_data.len;
  1294. hdr_pos = rspamd_string_find_eoh (&str, &body_pos);
  1295. npart->raw_headers = rspamd_message_headers_new ();
  1296. npart->headers_order = NULL;
  1297. if (hdr_pos > 0 && hdr_pos < str.len) {
  1298. npart->raw_headers_str = str.str;
  1299. npart->raw_headers_len = hdr_pos;
  1300. npart->raw_data.begin = str.str + body_pos;
  1301. if (npart->raw_headers_len > 0) {
  1302. rspamd_mime_headers_process (task,
  1303. npart->raw_headers,
  1304. &npart->headers_order,
  1305. npart->raw_headers_str,
  1306. npart->raw_headers_len,
  1307. FALSE);
  1308. /* Preserve the natural order */
  1309. if (npart->headers_order) {
  1310. LL_REVERSE2 (npart->headers_order, ord_next);
  1311. }
  1312. }
  1313. hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
  1314. "Content-Type", FALSE);
  1315. }
  1316. else {
  1317. body_pos = 0;
  1318. }
  1319. pbegin = part->parsed_data.begin + body_pos;
  1320. plen = part->parsed_data.len - body_pos;
  1321. }
  1322. npart->raw_data.begin = pbegin;
  1323. npart->raw_data.len = plen;
  1324. npart->parent_part = part;
  1325. if (hdr == NULL) {
  1326. sel = NULL;
  1327. }
  1328. else {
  1329. DL_FOREACH (hdr, cur) {
  1330. ct = rspamd_content_type_parse (cur->value, strlen (cur->value),
  1331. task->task_pool);
  1332. /* Here we prefer multipart content-type or any content-type */
  1333. if (ct) {
  1334. if (sel == NULL) {
  1335. sel = ct;
  1336. }
  1337. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1338. sel = ct;
  1339. }
  1340. }
  1341. }
  1342. }
  1343. if (sel == NULL) {
  1344. /* For messages we automatically assume plaintext */
  1345. msg_info_task ("cannot find content-type for a message, assume text/plain");
  1346. sel = rspamd_mempool_alloc0 (task->task_pool, sizeof (*sel));
  1347. sel->flags = RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_MISSING;
  1348. RSPAMD_FTOK_ASSIGN (&sel->type, "text");
  1349. RSPAMD_FTOK_ASSIGN (&sel->subtype, "plain");
  1350. }
  1351. npart->ct = sel;
  1352. if ((part == NULL || nst != st) &&
  1353. (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART|RSPAMD_CONTENT_TYPE_MESSAGE))) {
  1354. /* Not a trivial message, need to preprocess */
  1355. rspamd_mime_preprocess_message (task, npart, nst);
  1356. }
  1357. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1358. g_ptr_array_add (nst->stack, npart);
  1359. nst->nesting ++;
  1360. npart->part_type = RSPAMD_MIME_PART_MULTIPART;
  1361. npart->specific.mp = rspamd_mempool_alloc0 (task->task_pool,
  1362. sizeof (struct rspamd_mime_multipart));
  1363. memcpy (&npart->specific.mp->boundary, &sel->orig_boundary,
  1364. sizeof (rspamd_ftok_t));
  1365. ret = rspamd_mime_parse_multipart_part (task, npart, nst, err);
  1366. }
  1367. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  1368. if ((ret = rspamd_mime_parse_normal_part (task, npart, nst, sel, err))
  1369. == RSPAMD_MIME_PARSE_OK) {
  1370. npart->part_type = RSPAMD_MIME_PART_MESSAGE;
  1371. ret = rspamd_mime_parse_message (task, npart, nst, err);
  1372. }
  1373. }
  1374. else {
  1375. ret = rspamd_mime_parse_normal_part (task, npart, nst, sel, err);
  1376. }
  1377. if (ret != RSPAMD_MIME_PARSE_OK) {
  1378. return ret;
  1379. }
  1380. if (part && st->stack->len > 0) {
  1381. /* Remove message part from the parent stack */
  1382. g_ptr_array_remove_index_fast (st->stack, st->stack->len - 1);
  1383. st->nesting --;
  1384. }
  1385. /* Process leftovers for boundaries */
  1386. if (nst->boundaries) {
  1387. struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
  1388. *end_boundary = NULL;
  1389. goffset cur_offset = nst->pos - nst->start,
  1390. end_offset = st->end - st->start;
  1391. guint sel_idx = 0;
  1392. for (;;) {
  1393. start_boundary = NULL;
  1394. for (i = sel_idx; i < nst->boundaries->len; i++) {
  1395. boundary = &g_array_index (nst->boundaries,
  1396. struct rspamd_mime_boundary, i);
  1397. if (boundary->start > cur_offset &&
  1398. boundary->boundary < end_offset &&
  1399. !RSPAMD_BOUNDARY_IS_CLOSED (boundary)) {
  1400. start_boundary = boundary;
  1401. sel_idx = i;
  1402. break;
  1403. }
  1404. }
  1405. if (start_boundary) {
  1406. const gchar *start, *end;
  1407. if (nst->boundaries->len > sel_idx + 1) {
  1408. end_boundary = &g_array_index (nst->boundaries,
  1409. struct rspamd_mime_boundary, sel_idx + 1);
  1410. end = nst->start + end_boundary->boundary;
  1411. }
  1412. else {
  1413. end = nst->end;
  1414. }
  1415. sel_idx ++;
  1416. start = nst->start + start_boundary->start;
  1417. if (end > start &&
  1418. (ret = rspamd_mime_process_multipart_node (task, nst,
  1419. NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) {
  1420. if (nst != st) {
  1421. rspamd_mime_parse_stack_free (nst);
  1422. }
  1423. if (ret == RSPAMD_MIME_PARSE_NO_PART) {
  1424. return RSPAMD_MIME_PARSE_OK;
  1425. }
  1426. return ret;
  1427. }
  1428. }
  1429. else {
  1430. break;
  1431. }
  1432. }
  1433. }
  1434. if (nst != st) {
  1435. rspamd_mime_parse_stack_free (nst);
  1436. }
  1437. return ret;
  1438. }
  1439. enum rspamd_mime_parse_error
  1440. rspamd_mime_parse_task (struct rspamd_task *task, GError **err)
  1441. {
  1442. struct rspamd_mime_parser_ctx *st;
  1443. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1444. if (lib_ctx == NULL) {
  1445. rspamd_mime_parser_init_lib ();
  1446. }
  1447. if (++lib_ctx->key_usages > max_key_usages) {
  1448. /* Regenerate siphash key */
  1449. ottery_rand_bytes (lib_ctx->hkey, sizeof (lib_ctx->hkey));
  1450. lib_ctx->key_usages = 0;
  1451. }
  1452. st = g_malloc0 (sizeof (*st));
  1453. st->stack = g_ptr_array_sized_new (4);
  1454. st->pos = MESSAGE_FIELD (task, raw_headers_content).body_start;
  1455. st->end = task->msg.begin + task->msg.len;
  1456. st->boundaries = g_array_sized_new (FALSE, FALSE,
  1457. sizeof (struct rspamd_mime_boundary), 8);
  1458. st->task = task;
  1459. if (st->pos == NULL) {
  1460. st->pos = task->msg.begin;
  1461. }
  1462. st->start = task->msg.begin;
  1463. ret = rspamd_mime_parse_message (task, NULL, st, err);
  1464. rspamd_mime_parse_stack_free (st);
  1465. return ret;
  1466. }