You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

mime_parser.c 44KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "task.h"
  18. #include "mime_parser.h"
  19. #include "mime_headers.h"
  20. #include "message.h"
  21. #include "multipattern.h"
  22. #include "contrib/libottery/ottery.h"
  23. #include "contrib/uthash/utlist.h"
  24. #include <openssl/cms.h>
  25. #include <openssl/pkcs7.h>
  26. #include "contrib/fastutf8/fastutf8.h"
  /*
   * State shared by the MIME parser, allocated and filled by
   * rspamd_mime_parser_init_lib().
   */
  struct rspamd_mime_parser_lib_ctx {
      /* Multipattern compiled from the "\r--" and "\n--" patterns */
      struct rspamd_multipattern *mp_boundary;
      guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
      /* NOTE(review): presumably counts how many times hkey has been used - confirm against the users of this field */
      guint key_usages;
  };
  /* Parser library context; stays NULL until rspamd_mime_parser_init_lib() allocates it */
  struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
  /* NOTE(review): presumably the maximum depth of nested parts - confirm against the code using it */
  static const guint max_nested = 64;
  /* NOTE(review): presumably the number of hkey usages allowed before the key is renewed - confirm */
  static const guint max_key_usages = 10000;
  /*
   * Debug logging for the "mime" log module. The expansion refers to a
   * variable named `task`, so it can be used only where one is in scope.
   */
  #define msg_debug_mime(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
      rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
      RSPAMD_LOG_FUNC, \
      __VA_ARGS__)
  INIT_LOG_MODULE(mime)
  /* Bit stored in rspamd_mime_boundary.flags */
  #define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
  /* Non-zero when boundary `b` has RSPAMD_MIME_BOUNDARY_FLAG_CLOSED set */
  #define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
  /*
   * Description of a single boundary occurrence.
   * NOTE(review): the code filling this structure is outside of this chunk,
   * field meanings below are assumptions to be confirmed.
   */
  struct rspamd_mime_boundary {
      goffset boundary; /* presumably an offset of the boundary line - TODO confirm */
      goffset start;    /* presumably an offset of the data following the boundary - TODO confirm */
      uint64_t hash;        /* presumably a hash of the boundary string - TODO confirm */
      uint64_t closed_hash; /* presumably a hash of the closing form of the boundary - TODO confirm */
      gint flags; /* Bit set, see RSPAMD_MIME_BOUNDARY_FLAG_CLOSED */
  };
  /*
   * Parser state that is passed to all parsing routines of this file.
   */
  struct rspamd_mime_parser_ctx {
      GPtrArray *stack;   /* Stack of parts */
      GArray *boundaries; /* Boundaries found in the whole message */
      /* NOTE(review): start/pos/end look like the bounds of the parsed buffer and a cursor in it - confirm */
      const gchar *start;
      const gchar *pos;
      const gchar *end;
      struct rspamd_task *task;
      /* NOTE(review): presumably the current nesting depth - confirm */
      guint nesting;
  };
  /*
   * Forward declarations of the static parsing routines, so they can be
   * called before their definitions (e.g. rspamd_mime_parse_normal_part()
   * calls rspamd_mime_process_multipart_node() that is defined after it).
   * All of them return enum rspamd_mime_parse_error and accept GError **err.
   */
  static enum rspamd_mime_parse_error
  rspamd_mime_parse_multipart_part(struct rspamd_task *task,
                                   struct rspamd_mime_part *part,
                                   struct rspamd_mime_parser_ctx *st,
                                   GError **err);
  static enum rspamd_mime_parse_error
  rspamd_mime_parse_message(struct rspamd_task *task,
                            struct rspamd_mime_part *part,
                            struct rspamd_mime_parser_ctx *st,
                            GError **err);
  static enum rspamd_mime_parse_error
  rspamd_mime_parse_normal_part(struct rspamd_task *task,
                                struct rspamd_mime_part *part,
                                struct rspamd_mime_parser_ctx *st,
                                struct rspamd_content_type *ct,
                                GError **err);
  /* Processes the data in [start, end) as a child of `multipart` (which may be NULL) */
  static enum rspamd_mime_parse_error
  rspamd_mime_process_multipart_node(struct rspamd_task *task,
                                     struct rspamd_mime_parser_ctx *st,
                                     struct rspamd_mime_part *multipart,
                                     const gchar *start, const gchar *end,
                                     gboolean is_finished,
                                     GError **err);
  81. #define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
  82. static GQuark
  83. rspamd_mime_parser_quark(void)
  84. {
  85. return g_quark_from_static_string("mime-parser");
  86. }
  87. const gchar *
  88. rspamd_cte_to_string(enum rspamd_cte ct)
  89. {
  90. const gchar *ret = "unknown";
  91. switch (ct) {
  92. case RSPAMD_CTE_7BIT:
  93. ret = "7bit";
  94. break;
  95. case RSPAMD_CTE_8BIT:
  96. ret = "8bit";
  97. break;
  98. case RSPAMD_CTE_QP:
  99. ret = "quoted-printable";
  100. break;
  101. case RSPAMD_CTE_B64:
  102. ret = "base64";
  103. break;
  104. case RSPAMD_CTE_UUE:
  105. ret = "X-uuencode";
  106. break;
  107. default:
  108. break;
  109. }
  110. return ret;
  111. }
  112. enum rspamd_cte
  113. rspamd_cte_from_string(const gchar *str)
  114. {
  115. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  116. g_assert(str != NULL);
  117. if (strcmp(str, "7bit") == 0) {
  118. ret = RSPAMD_CTE_7BIT;
  119. }
  120. else if (strcmp(str, "8bit") == 0) {
  121. ret = RSPAMD_CTE_8BIT;
  122. }
  123. else if (strcmp(str, "quoted-printable") == 0) {
  124. ret = RSPAMD_CTE_QP;
  125. }
  126. else if (strcmp(str, "base64") == 0) {
  127. ret = RSPAMD_CTE_B64;
  128. }
  129. else if (strcmp(str, "X-uuencode") == 0) {
  130. ret = RSPAMD_CTE_UUE;
  131. }
  132. else if (strcmp(str, "uuencode") == 0) {
  133. ret = RSPAMD_CTE_UUE;
  134. }
  135. else if (strcmp(str, "X-uue") == 0) {
  136. ret = RSPAMD_CTE_UUE;
  137. }
  138. return ret;
  139. }
  140. static void
  141. rspamd_mime_parser_init_lib(void)
  142. {
  143. lib_ctx = g_malloc0(sizeof(*lib_ctx));
  144. lib_ctx->mp_boundary = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
  145. g_assert(lib_ctx->mp_boundary != NULL);
  146. rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\r--", 0);
  147. rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\n--", 0);
  148. GError *err = NULL;
  149. if (!rspamd_multipattern_compile(lib_ctx->mp_boundary, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err)) {
  150. msg_err("fatal error: cannot compile multipattern for mime parser boundaries: %e", err);
  151. g_error_free(err);
  152. g_abort();
  153. }
  154. ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
  155. }
  156. static enum rspamd_cte
  157. rspamd_mime_parse_cte(const gchar *in, gsize len)
  158. {
  159. uint64_t h;
  160. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  161. in = rspamd_string_len_strip(in, &len, " \t;,.+-#!`~'");
  162. h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
  163. in, len, 0xdeadbabe);
  164. switch (h) {
  165. case 0xCEDAA7056B4753F7ULL: /* 7bit */
  166. ret = RSPAMD_CTE_7BIT;
  167. break;
  168. case 0x42E0745448B39FC1ULL: /* 8bit */
  169. case 0x6B169E6B155BADC0ULL: /* binary */
  170. ret = RSPAMD_CTE_8BIT;
  171. break;
  172. case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
  173. ret = RSPAMD_CTE_QP;
  174. break;
  175. case 0x96305588A76DC9A9ULL: /* base64 */
  176. case 0x171029DE1B0423A9ULL: /* base-64 */
  177. ret = RSPAMD_CTE_B64;
  178. break;
  179. case 0x420b54dc00d13cecULL: /* uuencode */
  180. case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
  181. case 0x41f725ec544356d3ULL: /* x-uue */
  182. ret = RSPAMD_CTE_UUE;
  183. break;
  184. }
  185. return ret;
  186. }
  187. static enum rspamd_cte
  188. rspamd_mime_part_get_cte_heuristic(struct rspamd_task *task,
  189. struct rspamd_mime_part *part)
  190. {
  191. const guint check_len = 128;
  192. guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0,
  193. padeqsign = 0, nupper = 0, nlower = 0;
  194. gboolean b64_chars = TRUE;
  195. const guchar *p, *end;
  196. enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
  197. real_len = MIN(check_len, part->raw_data.len);
  198. p = (const guchar *) part->raw_data.begin;
  199. end = p + part->raw_data.len;
  200. while (p < end && g_ascii_isspace(*p)) {
  201. p++;
  202. }
  203. if (end - p > sizeof("begin-base64 ")) {
  204. const guchar *uue_start;
  205. if (memcmp(p, "begin ", sizeof("begin ") - 1) == 0) {
  206. uue_start = p + sizeof("begin ") - 1;
  207. while (uue_start < end && g_ascii_isspace(*uue_start)) {
  208. uue_start++;
  209. }
  210. if (uue_start < end && g_ascii_isdigit(*uue_start)) {
  211. return RSPAMD_CTE_UUE;
  212. }
  213. }
  214. else if (memcmp(p, "begin-base64 ", sizeof("begin-base64 ") - 1) == 0) {
  215. uue_start = p + sizeof("begin ") - 1;
  216. while (uue_start < end && g_ascii_isspace(*uue_start)) {
  217. uue_start++;
  218. }
  219. if (uue_start < end && g_ascii_isdigit(*uue_start)) {
  220. return RSPAMD_CTE_UUE;
  221. }
  222. }
  223. }
  224. /* Skip trailing spaces */
  225. while (end > p && g_ascii_isspace(*(end - 1))) {
  226. end--;
  227. }
  228. if (end > p + 2) {
  229. if (*(end - 1) == '=') {
  230. padeqsign++;
  231. end--;
  232. }
  233. if (*(end - 1) == '=') {
  234. padeqsign++;
  235. end--;
  236. }
  237. }
  238. /* Adjust end to analyse only first characters */
  239. if (end - p > real_len) {
  240. end = p + real_len;
  241. }
  242. while (p < end) {
  243. if (*p == ' ') {
  244. nspaces++;
  245. }
  246. else if (*p == '=') {
  247. b64_chars = FALSE; /* Eqsign must not be inside base64 */
  248. neqsign++;
  249. p++;
  250. if (p + 2 < end && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*(p + 1))) {
  251. p++;
  252. nqpencoded++;
  253. }
  254. continue;
  255. }
  256. else if (*p >= 0x80) {
  257. n8bit++;
  258. b64_chars = FALSE;
  259. }
  260. else if (!(g_ascii_isalnum(*p) || *p == '/' || *p == '+')) {
  261. b64_chars = FALSE;
  262. }
  263. else if (g_ascii_isupper(*p)) {
  264. nupper++;
  265. }
  266. else if (g_ascii_islower(*p)) {
  267. nlower++;
  268. }
  269. p++;
  270. }
  271. if (b64_chars && neqsign <= 2 && nspaces == 0) {
  272. /* Need more thinking */
  273. if (part->raw_data.len > 80) {
  274. if (padeqsign > 0) {
  275. ret = RSPAMD_CTE_B64;
  276. }
  277. else {
  278. /* We have a large piece of data with no spaces and base64
  279. * symbols only, no padding is detected as well...
  280. *
  281. * There is a small chance that our first 128 characters
  282. * are either some garbage or it is a base64 with no padding
  283. * (e.g. when it is not needed)
  284. */
  285. if (nupper > 1 && nlower > 1) {
  286. /*
  287. * We have both uppercase and lowercase letters, so it can be
  288. * base64
  289. */
  290. ret = RSPAMD_CTE_B64;
  291. }
  292. else {
  293. ret = RSPAMD_CTE_7BIT;
  294. }
  295. }
  296. }
  297. else {
  298. if (((end - (const guchar *) part->raw_data.begin) + padeqsign) % 4 == 0) {
  299. if (padeqsign == 0) {
  300. /*
  301. * It can be either base64 or plain text, hard to say
  302. * Let's assume that if we have > 1 uppercase it is
  303. * likely base64
  304. */
  305. if (nupper > 1 && nlower > 1) {
  306. ret = RSPAMD_CTE_B64;
  307. }
  308. else {
  309. ret = RSPAMD_CTE_7BIT;
  310. }
  311. }
  312. else {
  313. ret = RSPAMD_CTE_B64;
  314. }
  315. }
  316. else {
  317. /* No way */
  318. if (padeqsign == 1 || padeqsign == 2) {
  319. ret = RSPAMD_CTE_B64;
  320. }
  321. else {
  322. ret = RSPAMD_CTE_7BIT;
  323. }
  324. }
  325. }
  326. }
  327. else if (n8bit == 0) {
  328. if (neqsign > 2 && nqpencoded > 2) {
  329. ret = RSPAMD_CTE_QP;
  330. }
  331. else {
  332. ret = RSPAMD_CTE_7BIT;
  333. }
  334. }
  335. else {
  336. ret = RSPAMD_CTE_8BIT;
  337. }
  338. msg_debug_mime("detected cte: %s", rspamd_cte_to_string(ret));
  339. return ret;
  340. }
  341. static void
  342. rspamd_mime_part_get_cte(struct rspamd_task *task,
  343. struct rspamd_mime_headers_table *hdrs,
  344. struct rspamd_mime_part *part,
  345. gboolean apply_heuristic)
  346. {
  347. struct rspamd_mime_header *hdr, *cur;
  348. enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
  349. gboolean parent_propagated = FALSE;
  350. hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE);
  351. if (hdr == NULL) {
  352. if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
  353. !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
  354. part->cte = part->parent_part->cte;
  355. parent_propagated = TRUE;
  356. goto check_cte;
  357. }
  358. if (apply_heuristic) {
  359. part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
  360. msg_info_task("detected missing CTE for part as: %s",
  361. rspamd_cte_to_string(part->cte));
  362. }
  363. part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
  364. }
  365. else {
  366. DL_FOREACH(hdr, cur)
  367. {
  368. gsize hlen;
  369. gchar lc_buf[128];
  370. hlen = rspamd_snprintf(lc_buf, sizeof(lc_buf), "%s", cur->value);
  371. rspamd_str_lc(lc_buf, hlen);
  372. cte = rspamd_mime_parse_cte(lc_buf, hlen);
  373. if (cte != RSPAMD_CTE_UNKNOWN) {
  374. part->cte = cte;
  375. break;
  376. }
  377. }
  378. check_cte:
  379. if (apply_heuristic) {
  380. if (part->cte == RSPAMD_CTE_UNKNOWN) {
  381. part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
  382. msg_info_task("corrected bad CTE for part to: %s",
  383. rspamd_cte_to_string(part->cte));
  384. }
  385. else if (part->cte == RSPAMD_CTE_B64 ||
  386. part->cte == RSPAMD_CTE_QP) {
  387. /* Additionally check sanity */
  388. cte = rspamd_mime_part_get_cte_heuristic(task, part);
  389. if (cte == RSPAMD_CTE_8BIT) {
  390. msg_info_task(
  391. "incorrect cte specified for part: %s, %s detected",
  392. rspamd_cte_to_string(part->cte),
  393. rspamd_cte_to_string(cte));
  394. part->cte = cte;
  395. part->flags |= RSPAMD_MIME_PART_BAD_CTE;
  396. }
  397. else if (cte != part->cte && parent_propagated) {
  398. part->cte = cte;
  399. msg_info_task("detected missing CTE for part as: %s",
  400. rspamd_cte_to_string(part->cte));
  401. }
  402. }
  403. else {
  404. msg_debug_mime("processed cte: %s",
  405. rspamd_cte_to_string(cte));
  406. }
  407. }
  408. else {
  409. msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte));
  410. }
  411. }
  412. }
  413. static void
  414. rspamd_mime_part_get_cd(struct rspamd_task *task, struct rspamd_mime_part *part)
  415. {
  416. struct rspamd_mime_header *hdr, *cur;
  417. struct rspamd_content_disposition *cd = NULL;
  418. rspamd_ftok_t srch;
  419. struct rspamd_content_type_param *found;
  420. hdr = rspamd_message_get_header_from_hash(part->raw_headers,
  421. "Content-Disposition", FALSE);
  422. if (hdr == NULL) {
  423. cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
  424. cd->type = RSPAMD_CT_INLINE;
  425. /* We can also have content disposition definitions in Content-Type */
  426. if (part->ct && part->ct->attrs) {
  427. RSPAMD_FTOK_ASSIGN(&srch, "name");
  428. found = g_hash_table_lookup(part->ct->attrs, &srch);
  429. if (!found) {
  430. RSPAMD_FTOK_ASSIGN(&srch, "filename");
  431. found = g_hash_table_lookup(part->ct->attrs, &srch);
  432. }
  433. if (found) {
  434. cd->type = RSPAMD_CT_ATTACHMENT;
  435. memcpy(&cd->filename, &found->value, sizeof(cd->filename));
  436. }
  437. }
  438. }
  439. else {
  440. DL_FOREACH(hdr, cur)
  441. {
  442. gsize hlen;
  443. cd = NULL;
  444. if (cur->value) {
  445. hlen = strlen(cur->value);
  446. cd = rspamd_content_disposition_parse(cur->value, hlen,
  447. task->task_pool);
  448. }
  449. if (cd) {
  450. /* We still need to check filename */
  451. if (cd->filename.len == 0) {
  452. if (part->ct && part->ct->attrs) {
  453. RSPAMD_FTOK_ASSIGN(&srch, "name");
  454. found = g_hash_table_lookup(part->ct->attrs, &srch);
  455. if (!found) {
  456. RSPAMD_FTOK_ASSIGN(&srch, "filename");
  457. found = g_hash_table_lookup(part->ct->attrs, &srch);
  458. }
  459. if (found) {
  460. cd->type = RSPAMD_CT_ATTACHMENT;
  461. memcpy(&cd->filename, &found->value,
  462. sizeof(cd->filename));
  463. }
  464. }
  465. }
  466. msg_debug_mime("processed content disposition: %s, file: \"%T\"",
  467. cd->lc_data, &cd->filename);
  468. break;
  469. }
  470. else if (part->ct) {
  471. /*
  472. * Even in case of malformed Content-Disposition, we can still
  473. * fall back to Content-Type
  474. */
  475. cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
  476. cd->type = RSPAMD_CT_INLINE;
  477. /* We can also have content disposition definitions in Content-Type */
  478. if (part->ct->attrs) {
  479. RSPAMD_FTOK_ASSIGN(&srch, "name");
  480. found = g_hash_table_lookup(part->ct->attrs, &srch);
  481. if (!found) {
  482. RSPAMD_FTOK_ASSIGN(&srch, "filename");
  483. found = g_hash_table_lookup(part->ct->attrs, &srch);
  484. }
  485. if (found) {
  486. cd->type = RSPAMD_CT_ATTACHMENT;
  487. memcpy(&cd->filename, &found->value, sizeof(cd->filename));
  488. }
  489. }
  490. }
  491. }
  492. }
  493. part->cd = cd;
  494. }
  495. void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part)
  496. {
  497. /* Blake2b applied to string 'rspamd' */
  498. static const guchar hash_key[] = {
  499. 0xef,
  500. 0x43,
  501. 0xae,
  502. 0x80,
  503. 0xcc,
  504. 0x8d,
  505. 0xc3,
  506. 0x4c,
  507. 0x6f,
  508. 0x1b,
  509. 0xd6,
  510. 0x18,
  511. 0x1b,
  512. 0xae,
  513. 0x87,
  514. 0x74,
  515. 0x0c,
  516. 0xca,
  517. 0xf7,
  518. 0x8e,
  519. 0x5f,
  520. 0x2e,
  521. 0x54,
  522. 0x32,
  523. 0xf6,
  524. 0x79,
  525. 0xb9,
  526. 0x27,
  527. 0x26,
  528. 0x96,
  529. 0x20,
  530. 0x92,
  531. 0x70,
  532. 0x07,
  533. 0x85,
  534. 0xeb,
  535. 0x83,
  536. 0xf7,
  537. 0x89,
  538. 0xe0,
  539. 0xd7,
  540. 0x32,
  541. 0x2a,
  542. 0xd2,
  543. 0x1a,
  544. 0x64,
  545. 0x41,
  546. 0xef,
  547. 0x49,
  548. 0xff,
  549. 0xc3,
  550. 0x8c,
  551. 0x54,
  552. 0xf9,
  553. 0x67,
  554. 0x74,
  555. 0x30,
  556. 0x1e,
  557. 0x70,
  558. 0x2e,
  559. 0xb7,
  560. 0x12,
  561. 0x09,
  562. 0xfe,
  563. };
  564. if (part->parsed_data.len > 0) {
  565. rspamd_cryptobox_hash(part->digest,
  566. part->parsed_data.begin, part->parsed_data.len,
  567. hash_key, sizeof(hash_key));
  568. }
  569. }
  570. static enum rspamd_mime_parse_error
  571. rspamd_mime_parse_normal_part(struct rspamd_task *task,
  572. struct rspamd_mime_part *part,
  573. struct rspamd_mime_parser_ctx *st,
  574. struct rspamd_content_type *ct,
  575. GError **err)
  576. {
  577. rspamd_fstring_t *parsed;
  578. gssize r;
  579. g_assert(part != NULL);
  580. rspamd_mime_part_get_cte(task, part->raw_headers, part,
  581. part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
  582. rspamd_mime_part_get_cd(task, part);
  583. switch (part->cte) {
  584. case RSPAMD_CTE_7BIT:
  585. case RSPAMD_CTE_8BIT:
  586. case RSPAMD_CTE_UNKNOWN:
  587. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
  588. if (part->cte != RSPAMD_CTE_7BIT) {
  589. /* We have something that has a missing content-type,
  590. * but it has non-7bit characters.
  591. *
  592. * In theory, it is very unsafe to process it as a text part
  593. * as we unlikely get some sane result
  594. */
  595. /*
  596. * On the other hand, there is an evidence that some
  597. * emails actually rely on that.
  598. * So we apply an expensive hack here:
  599. * if there are no 8bit characters -OR- the content is valid
  600. * UTF8, we can still imply Content-Type == text/plain
  601. */
  602. if (rspamd_str_has_8bit(part->raw_data.begin, part->raw_data.len) &&
  603. !rspamd_fast_utf8_validate(part->raw_data.begin, part->raw_data.len)) {
  604. part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
  605. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  606. }
  607. }
  608. }
  609. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
  610. /* Need to copy text as we have couple of in-place change functions */
  611. parsed = rspamd_fstring_sized_new(part->raw_data.len);
  612. parsed->len = part->raw_data.len;
  613. memcpy(parsed->str, part->raw_data.begin, parsed->len);
  614. part->parsed_data.begin = parsed->str;
  615. part->parsed_data.len = parsed->len;
  616. rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
  617. rspamd_mempool_add_destructor(task->task_pool,
  618. (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
  619. }
  620. else {
  621. part->parsed_data.begin = part->raw_data.begin;
  622. part->parsed_data.len = part->raw_data.len;
  623. }
  624. break;
  625. case RSPAMD_CTE_QP:
  626. parsed = rspamd_fstring_sized_new(part->raw_data.len);
  627. r = rspamd_decode_qp_buf(part->raw_data.begin, part->raw_data.len,
  628. parsed->str, parsed->allocated);
  629. if (r != -1) {
  630. parsed->len = r;
  631. part->parsed_data.begin = parsed->str;
  632. part->parsed_data.len = parsed->len;
  633. rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
  634. rspamd_mempool_add_destructor(task->task_pool,
  635. (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
  636. }
  637. else {
  638. msg_err_task("invalid quoted-printable encoded part, assume 8bit");
  639. if (part->ct) {
  640. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  641. }
  642. part->cte = RSPAMD_CTE_8BIT;
  643. memcpy(parsed->str, part->raw_data.begin, part->raw_data.len);
  644. parsed->len = part->raw_data.len;
  645. part->parsed_data.begin = parsed->str;
  646. part->parsed_data.len = parsed->len;
  647. rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
  648. rspamd_mempool_add_destructor(task->task_pool,
  649. (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
  650. }
  651. break;
  652. case RSPAMD_CTE_B64:
  653. parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
  654. rspamd_cryptobox_base64_decode(part->raw_data.begin,
  655. part->raw_data.len,
  656. parsed->str, &parsed->len);
  657. part->parsed_data.begin = parsed->str;
  658. part->parsed_data.len = parsed->len;
  659. rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
  660. rspamd_mempool_add_destructor(task->task_pool,
  661. (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
  662. break;
  663. case RSPAMD_CTE_UUE:
  664. parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
  665. r = rspamd_decode_uue_buf(part->raw_data.begin, part->raw_data.len,
  666. parsed->str, parsed->allocated);
  667. rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
  668. rspamd_mempool_add_destructor(task->task_pool,
  669. (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
  670. if (r != -1) {
  671. parsed->len = r;
  672. part->parsed_data.begin = parsed->str;
  673. part->parsed_data.len = parsed->len;
  674. }
  675. else {
  676. msg_err_task("invalid uuencoding in encoded part, assume 8bit");
  677. if (part->ct) {
  678. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  679. }
  680. part->cte = RSPAMD_CTE_8BIT;
  681. parsed->len = MIN(part->raw_data.len, parsed->allocated);
  682. memcpy(parsed->str, part->raw_data.begin, parsed->len);
  683. rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
  684. part->parsed_data.begin = parsed->str;
  685. part->parsed_data.len = parsed->len;
  686. }
  687. break;
  688. default:
  689. g_assert_not_reached();
  690. }
  691. part->part_number = MESSAGE_FIELD(task, parts)->len;
  692. part->urls = g_ptr_array_new();
  693. g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
  694. msg_debug_mime("parsed data part %T/%T of length %z (%z orig), %s cte",
  695. &part->ct->type, &part->ct->subtype, part->parsed_data.len,
  696. part->raw_data.len, rspamd_cte_to_string(part->cte));
  697. rspamd_mime_parser_calc_digest(part);
  698. if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) {
  699. CMS_ContentInfo *cms;
  700. const unsigned char *der_beg = part->parsed_data.begin;
  701. cms = d2i_CMS_ContentInfo(NULL, &der_beg, part->parsed_data.len);
  702. if (cms) {
  703. const ASN1_OBJECT *asn_ct = CMS_get0_eContentType(cms);
  704. int ct_nid = OBJ_obj2nid(asn_ct);
  705. if (ct_nid == NID_pkcs7_data) {
  706. BIO *bio = BIO_new_mem_buf(part->parsed_data.begin,
  707. part->parsed_data.len);
  708. PKCS7 *p7;
  709. p7 = d2i_PKCS7_bio(bio, NULL);
  710. if (p7) {
  711. ct_nid = OBJ_obj2nid(p7->type);
  712. if (ct_nid == NID_pkcs7_signed) {
  713. PKCS7 *p7_signed_content = p7->d.sign->contents;
  714. ct_nid = OBJ_obj2nid(p7_signed_content->type);
  715. if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) {
  716. int ret;
  717. msg_debug_mime("found an additional part inside of "
  718. "smime structure of type %T/%T; length=%d",
  719. &ct->type, &ct->subtype, p7_signed_content->d.data->length);
  720. /*
  721. * Since ASN.1 structures are freed, we need to copy
  722. * the content
  723. */
  724. gchar *cpy = rspamd_mempool_alloc(task->task_pool,
  725. p7_signed_content->d.data->length);
  726. memcpy(cpy, p7_signed_content->d.data->data,
  727. p7_signed_content->d.data->length);
  728. ret = rspamd_mime_process_multipart_node(task,
  729. st, NULL,
  730. cpy, cpy + p7_signed_content->d.data->length,
  731. TRUE, err);
  732. PKCS7_free(p7);
  733. BIO_free(bio);
  734. CMS_ContentInfo_free(cms);
  735. return ret;
  736. }
  737. }
  738. PKCS7_free(p7);
  739. }
  740. BIO_free(bio);
  741. }
  742. CMS_ContentInfo_free(cms);
  743. }
  744. }
  745. return RSPAMD_MIME_PARSE_OK;
  746. }
  /*
   * Data bundle for multipart processing.
   * NOTE(review): the code using this structure is outside of this chunk,
   * so the field descriptions are assumptions to be confirmed.
   */
  struct rspamd_mime_multipart_cbdata {
      struct rspamd_task *task;
      struct rspamd_mime_part *multipart; /* presumably the multipart being split into children - TODO confirm */
      struct rspamd_mime_parser_ctx *st;
      const gchar *part_start;     /* presumably the start of the child part being collected - TODO confirm */
      rspamd_ftok_t *cur_boundary; /* presumably the boundary string of `multipart` - TODO confirm */
      uint64_t bhash;              /* presumably a hash of cur_boundary - TODO confirm */
      GError **err;
  };
  756. static enum rspamd_mime_parse_error
  757. rspamd_mime_process_multipart_node(struct rspamd_task *task,
  758. struct rspamd_mime_parser_ctx *st,
  759. struct rspamd_mime_part *multipart,
  760. const gchar *start, const gchar *end,
  761. gboolean is_finished,
  762. GError **err)
  763. {
  764. struct rspamd_content_type *ct, *sel = NULL;
  765. struct rspamd_mime_header *hdr = NULL, *cur;
  766. struct rspamd_mime_part *npart;
  767. GString str;
  768. goffset hdr_pos, body_pos;
  769. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
  770. str.str = (gchar *) start;
  771. str.len = end - start;
  772. if (*start == '\n' || *start == '\r') {
  773. /*
  774. * We have a part that starts from newline which means that
  775. * there are completely no headers in this part,
  776. * hence we assume it as a text part
  777. */
  778. hdr_pos = 0;
  779. body_pos = 0;
  780. if (!is_finished) {
  781. /* Ignore garbage */
  782. const gchar *p = start;
  783. gboolean seen_something = FALSE;
  784. while (p < end) {
  785. if (g_ascii_isalnum(*p)) {
  786. seen_something = TRUE;
  787. break;
  788. }
  789. p++;
  790. }
  791. if (!seen_something) {
  792. return RSPAMD_MIME_PARSE_NO_PART;
  793. }
  794. }
  795. }
  796. else {
  797. hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
  798. }
  799. npart = rspamd_mempool_alloc0(task->task_pool,
  800. sizeof(struct rspamd_mime_part));
  801. npart->parent_part = multipart;
  802. npart->raw_headers = rspamd_message_headers_new();
  803. npart->headers_order = NULL;
  804. if (multipart) {
  805. if (multipart->specific.mp->children == NULL) {
  806. multipart->specific.mp->children = g_ptr_array_sized_new(2);
  807. }
  808. g_ptr_array_add(multipart->specific.mp->children, npart);
  809. }
  810. if (hdr_pos > 0 && hdr_pos < str.len) {
  811. npart->raw_headers_str = str.str;
  812. npart->raw_headers_len = hdr_pos;
  813. npart->raw_data.begin = start + body_pos;
  814. npart->raw_data.len = (end - start) - body_pos;
  815. if (npart->raw_headers_len > 0) {
  816. rspamd_mime_headers_process(task, npart->raw_headers,
  817. &npart->headers_order,
  818. npart->raw_headers_str,
  819. npart->raw_headers_len,
  820. FALSE);
  821. /* Preserve the natural order */
  822. if (npart->headers_order) {
  823. LL_REVERSE2(npart->headers_order, ord_next);
  824. }
  825. }
  826. hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
  827. "Content-Type", FALSE);
  828. }
  829. else {
  830. npart->raw_headers_str = 0;
  831. npart->raw_headers_len = 0;
  832. npart->raw_data.begin = start;
  833. npart->raw_data.len = end - start;
  834. }
  835. if (hdr != NULL) {
  836. DL_FOREACH(hdr, cur)
  837. {
  838. ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
  839. task->task_pool);
  840. /* Here we prefer multipart content-type or any content-type */
  841. if (ct) {
  842. if (sel == NULL) {
  843. sel = ct;
  844. }
  845. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  846. sel = ct;
  847. }
  848. }
  849. }
  850. }
  851. if (sel == NULL) {
  852. sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
  853. RSPAMD_FTOK_ASSIGN(&sel->type, "text");
  854. RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
  855. }
  856. npart->ct = sel;
  857. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  858. st->nesting++;
  859. g_ptr_array_add(st->stack, npart);
  860. npart->part_type = RSPAMD_MIME_PART_MULTIPART;
  861. npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
  862. sizeof(struct rspamd_mime_multipart));
  863. memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
  864. sizeof(rspamd_ftok_t));
  865. ret = rspamd_mime_parse_multipart_part(task, npart, st, err);
  866. }
  867. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  868. st->nesting++;
  869. g_ptr_array_add(st->stack, npart);
  870. npart->part_type = RSPAMD_MIME_PART_MESSAGE;
  871. if ((ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err)) == RSPAMD_MIME_PARSE_OK) {
  872. ret = rspamd_mime_parse_message(task, npart, st, err);
  873. }
  874. }
  875. else {
  876. ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err);
  877. }
  878. return ret;
  879. }
  880. static enum rspamd_mime_parse_error
  881. rspamd_mime_parse_multipart_cb(struct rspamd_task *task,
  882. struct rspamd_mime_part *multipart,
  883. struct rspamd_mime_parser_ctx *st,
  884. struct rspamd_mime_multipart_cbdata *cb,
  885. struct rspamd_mime_boundary *b)
  886. {
  887. const gchar *pos = st->start + b->boundary;
  888. enum rspamd_mime_parse_error ret;
  889. task = cb->task;
  890. /* Now check boundary */
  891. if (!cb->part_start) {
  892. cb->part_start = st->start + b->start;
  893. st->pos = cb->part_start;
  894. }
  895. else {
  896. /*
  897. * We have seen the start of the boundary,
  898. * but it might be unsuitable (e.g. in broken headers)
  899. */
  900. if (cb->part_start < pos && cb->cur_boundary) {
  901. if ((ret = rspamd_mime_process_multipart_node(task, cb->st,
  902. cb->multipart, cb->part_start, pos, TRUE, cb->err)) != RSPAMD_MIME_PARSE_OK) {
  903. return ret;
  904. }
  905. if (b->start > 0) {
  906. /* Go towards the next part */
  907. cb->part_start = st->start + b->start;
  908. cb->st->pos = cb->part_start;
  909. }
  910. }
  911. else {
  912. /* We have an empty boundary, do nothing */
  913. }
  914. }
  915. return RSPAMD_MIME_PARSE_OK;
  916. }
  917. static enum rspamd_mime_parse_error
  918. rspamd_multipart_boundaries_filter(struct rspamd_task *task,
  919. struct rspamd_mime_part *multipart,
  920. struct rspamd_mime_parser_ctx *st,
  921. struct rspamd_mime_multipart_cbdata *cb)
  922. {
  923. struct rspamd_mime_boundary *cur;
  924. goffset last_offset;
  925. guint i, sel = 0;
  926. enum rspamd_mime_parse_error ret;
  927. last_offset = (multipart->raw_data.begin - st->start) +
  928. multipart->raw_data.len;
  929. /* Find the first offset suitable for this part */
  930. for (i = 0; i < st->boundaries->len; i++) {
  931. cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
  932. if (cur->start >= multipart->raw_data.begin - st->start) {
  933. if (cb->cur_boundary) {
  934. /* Check boundary */
  935. msg_debug_mime("compare %L and %L (and %L)",
  936. cb->bhash, cur->hash, cur->closed_hash);
  937. if (cb->bhash == cur->hash) {
  938. sel = i;
  939. break;
  940. }
  941. else if (cb->bhash == cur->closed_hash) {
  942. /* Not a closing element in fact */
  943. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  944. cur->hash = cur->closed_hash;
  945. sel = i;
  946. break;
  947. }
  948. }
  949. else {
  950. /* Set current boundary */
  951. cb->cur_boundary = rspamd_mempool_alloc(task->task_pool,
  952. sizeof(rspamd_ftok_t));
  953. cb->cur_boundary->begin = st->start + cur->boundary;
  954. cb->cur_boundary->len = 0;
  955. cb->bhash = cur->hash;
  956. sel = i;
  957. break;
  958. }
  959. }
  960. }
  961. /* Now we can go forward with boundaries that are same to what we have */
  962. for (i = sel; i < st->boundaries->len; i++) {
  963. cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
  964. if (cur->boundary > last_offset) {
  965. break;
  966. }
  967. if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
  968. if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
  969. cb, cur)) != RSPAMD_MIME_PARSE_OK) {
  970. return ret;
  971. }
  972. if (cur->closed_hash == cb->bhash) {
  973. /* We have again fake closed hash */
  974. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  975. cur->hash = cur->closed_hash;
  976. }
  977. if (RSPAMD_BOUNDARY_IS_CLOSED(cur)) {
  978. /* We also might check the next boundary... */
  979. if (i < st->boundaries->len - 1) {
  980. cur = &g_array_index(st->boundaries,
  981. struct rspamd_mime_boundary, i + 1);
  982. if (cur->hash == cb->bhash) {
  983. continue;
  984. }
  985. else if (cur->closed_hash == cb->bhash) {
  986. /* We have again fake closed hash */
  987. cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
  988. cur->hash = cur->closed_hash;
  989. continue;
  990. }
  991. }
  992. break;
  993. }
  994. }
  995. }
  996. if (i == st->boundaries->len && cb->cur_boundary) {
  997. /* Process the last part */
  998. struct rspamd_mime_boundary fb;
  999. fb.boundary = last_offset;
  1000. fb.start = -1;
  1001. if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
  1002. cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
  1003. return ret;
  1004. }
  1005. }
  1006. return RSPAMD_MIME_PARSE_OK;
  1007. }
  1008. static enum rspamd_mime_parse_error
  1009. rspamd_mime_parse_multipart_part(struct rspamd_task *task,
  1010. struct rspamd_mime_part *part,
  1011. struct rspamd_mime_parser_ctx *st,
  1012. GError **err)
  1013. {
  1014. struct rspamd_mime_multipart_cbdata cbdata;
  1015. enum rspamd_mime_parse_error ret;
  1016. if (st->nesting > max_nested) {
  1017. g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  1018. st->nesting);
  1019. return RSPAMD_MIME_PARSE_NESTING;
  1020. }
  1021. part->part_number = MESSAGE_FIELD(task, parts)->len;
  1022. part->urls = g_ptr_array_new();
  1023. g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
  1024. st->nesting++;
  1025. rspamd_mime_part_get_cte(task, part->raw_headers, part, FALSE);
  1026. st->pos = part->raw_data.begin;
  1027. cbdata.multipart = part;
  1028. cbdata.task = task;
  1029. cbdata.st = st;
  1030. cbdata.part_start = NULL;
  1031. cbdata.err = err;
  1032. if (part->ct->boundary.len > 0) {
  1033. /* We know our boundary */
  1034. cbdata.cur_boundary = &part->ct->boundary;
  1035. rspamd_cryptobox_siphash((guchar *) &cbdata.bhash,
  1036. cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
  1037. lib_ctx->hkey);
  1038. msg_debug_mime("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
  1039. }
  1040. else {
  1041. /* Guess boundary */
  1042. cbdata.cur_boundary = NULL;
  1043. cbdata.bhash = 0;
  1044. }
  1045. ret = rspamd_multipart_boundaries_filter(task, part, st, &cbdata);
  1046. /* Cleanup stack */
  1047. st->nesting--;
  1048. g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
  1049. return ret;
  1050. }
  1051. /* Process boundary like structures in a message */
  1052. static gint
  1053. rspamd_mime_preprocess_cb(struct rspamd_multipattern *mp,
  1054. guint strnum,
  1055. gint match_start,
  1056. gint match_pos,
  1057. const gchar *text,
  1058. gsize len,
  1059. void *context)
  1060. {
  1061. const gchar *end = text + len, *p = text + match_pos, *bend;
  1062. gsize blen;
  1063. gboolean closing = FALSE;
  1064. struct rspamd_mime_boundary b;
  1065. struct rspamd_mime_parser_ctx *st = context;
  1066. struct rspamd_task *task;
  1067. task = st->task;
  1068. if (G_LIKELY(p < end)) {
  1069. blen = 0;
  1070. while (p < end) {
  1071. if (*p == '\r' || *p == '\n') {
  1072. break;
  1073. }
  1074. blen++;
  1075. p++;
  1076. }
  1077. if (blen > 0) {
  1078. /* We have found something like boundary */
  1079. p = text + match_pos;
  1080. bend = p + blen - 1;
  1081. if (*bend == '-') {
  1082. /* We need to verify last -- */
  1083. if (bend > p + 1 && *(bend - 1) == '-') {
  1084. closing = TRUE;
  1085. bend--;
  1086. blen -= 2;
  1087. }
  1088. else {
  1089. /* Not a closing boundary somehow, e.g. if a boundary=='-' */
  1090. bend++;
  1091. }
  1092. }
  1093. else {
  1094. bend++;
  1095. }
  1096. while (bend < end) {
  1097. if (*bend == '\r') {
  1098. bend++;
  1099. /* \r\n */
  1100. if (bend < end && *bend == '\n') {
  1101. bend++;
  1102. }
  1103. }
  1104. else if (*bend == '\n') {
  1105. /* \n */
  1106. bend++;
  1107. }
  1108. else if (g_ascii_isspace(*bend)) {
  1109. /* Spaces in the same line, skip them */
  1110. bend++;
  1111. continue;
  1112. }
  1113. break;
  1114. }
  1115. b.boundary = p - st->start - 2;
  1116. b.start = bend - st->start;
  1117. /* Small optimisation as boundaries are usually short strings */
  1118. gchar *lc_copy, lc_copy_buf[128];
  1119. if (blen + 2 < sizeof(lc_copy_buf)) {
  1120. lc_copy = lc_copy_buf;
  1121. }
  1122. else {
  1123. lc_copy = g_malloc(blen + 2);
  1124. }
  1125. if (closing) {
  1126. memcpy(lc_copy, p, blen + 2);
  1127. rspamd_str_lc(lc_copy, blen + 2);
  1128. }
  1129. else {
  1130. memcpy(lc_copy, p, blen);
  1131. rspamd_str_lc(lc_copy, blen);
  1132. }
  1133. rspamd_cryptobox_siphash((guchar *) &b.hash, lc_copy, blen,
  1134. lib_ctx->hkey);
  1135. msg_debug_mime("normal hash: %*s -> %L, %d boffset, %d data offset",
  1136. (gint) blen, lc_copy, b.hash, (int) b.boundary, (int) b.start);
  1137. if (closing) {
  1138. b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
  1139. rspamd_cryptobox_siphash((guchar *) &b.closed_hash, lc_copy,
  1140. blen + 2,
  1141. lib_ctx->hkey);
  1142. msg_debug_mime("closing hash: %*s -> %L, %d boffset, %d data offset",
  1143. (gint) blen + 2, lc_copy,
  1144. b.closed_hash,
  1145. (int) b.boundary, (int) b.start);
  1146. }
  1147. else {
  1148. b.flags = 0;
  1149. b.closed_hash = 0;
  1150. }
  1151. /* Check if a string has been allocated on the heap */
  1152. if (blen + 2 >= sizeof(lc_copy_buf)) {
  1153. g_free(lc_copy);
  1154. }
  1155. g_array_append_val(st->boundaries, b);
  1156. }
  1157. }
  1158. return 0;
  1159. }
  1160. static goffset
  1161. rspamd_mime_parser_headers_heuristic(GString *input, goffset *body_start)
  1162. {
  1163. const gsize default_max_len = 76;
  1164. gsize max_len = MIN(input->len, default_max_len);
  1165. const gchar *p, *end;
  1166. enum {
  1167. st_before_colon = 0,
  1168. st_colon,
  1169. st_spaces_after_colon,
  1170. st_value,
  1171. st_error
  1172. } state = st_before_colon;
  1173. p = input->str;
  1174. end = p + max_len;
  1175. while (p < end) {
  1176. switch (state) {
  1177. case st_before_colon:
  1178. if (G_UNLIKELY(*p == ':')) {
  1179. state = st_colon;
  1180. }
  1181. else if (G_UNLIKELY(!g_ascii_isgraph(*p))) {
  1182. state = st_error;
  1183. }
  1184. p++;
  1185. break;
  1186. case st_colon:
  1187. if (g_ascii_isspace(*p)) {
  1188. state = st_spaces_after_colon;
  1189. }
  1190. else {
  1191. state = st_value;
  1192. }
  1193. p++;
  1194. break;
  1195. case st_spaces_after_colon:
  1196. if (!g_ascii_isspace(*p)) {
  1197. state = st_value;
  1198. }
  1199. p++;
  1200. break;
  1201. case st_value:
  1202. /* We accept any value */
  1203. goto end;
  1204. break;
  1205. case st_error:
  1206. return (-1);
  1207. break;
  1208. }
  1209. }
  1210. end:
  1211. if (state == st_value) {
  1212. if (body_start) {
  1213. *body_start = input->len;
  1214. }
  1215. return input->len;
  1216. }
  1217. return (-1);
  1218. }
  1219. static void
  1220. rspamd_mime_preprocess_message(struct rspamd_task *task,
  1221. struct rspamd_mime_part *top,
  1222. struct rspamd_mime_parser_ctx *st)
  1223. {
  1224. if (top->raw_data.begin >= st->pos) {
  1225. rspamd_multipattern_lookup(lib_ctx->mp_boundary,
  1226. top->raw_data.begin - 1,
  1227. top->raw_data.len + 1,
  1228. rspamd_mime_preprocess_cb, st, NULL);
  1229. }
  1230. else {
  1231. rspamd_multipattern_lookup(lib_ctx->mp_boundary,
  1232. st->pos,
  1233. st->end - st->pos,
  1234. rspamd_mime_preprocess_cb, st, NULL);
  1235. }
  1236. }
  1237. static void
  1238. rspamd_mime_parse_stack_free(struct rspamd_mime_parser_ctx *st)
  1239. {
  1240. if (st) {
  1241. g_ptr_array_free(st->stack, TRUE);
  1242. g_array_free(st->boundaries, TRUE);
  1243. g_free(st);
  1244. }
  1245. }
  1246. static enum rspamd_mime_parse_error
  1247. rspamd_mime_parse_message(struct rspamd_task *task,
  1248. struct rspamd_mime_part *part,
  1249. struct rspamd_mime_parser_ctx *st,
  1250. GError **err)
  1251. {
  1252. struct rspamd_content_type *ct, *sel = NULL;
  1253. struct rspamd_mime_header *hdr = NULL, *cur;
  1254. const gchar *pbegin, *p;
  1255. gsize plen, len;
  1256. struct rspamd_mime_part *npart;
  1257. goffset hdr_pos, body_pos;
  1258. guint i;
  1259. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1260. GString str;
  1261. struct rspamd_mime_parser_ctx *nst = st;
  1262. if (st->nesting > max_nested) {
  1263. g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
  1264. st->nesting);
  1265. return RSPAMD_MIME_PARSE_NESTING;
  1266. }
  1267. /* Allocate real part */
  1268. npart = rspamd_mempool_alloc0(task->task_pool,
  1269. sizeof(struct rspamd_mime_part));
  1270. if (part == NULL) {
  1271. /* Top level message */
  1272. p = task->msg.begin;
  1273. len = task->msg.len;
  1274. str.str = (gchar *) p;
  1275. str.len = len;
  1276. hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
  1277. if (hdr_pos > 0 && hdr_pos < str.len) {
  1278. MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
  1279. MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
  1280. MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos;
  1281. if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
  1282. rspamd_mime_headers_process(task,
  1283. MESSAGE_FIELD(task, raw_headers),
  1284. &MESSAGE_FIELD(task, headers_order),
  1285. MESSAGE_FIELD(task, raw_headers_content).begin,
  1286. MESSAGE_FIELD(task, raw_headers_content).len,
  1287. TRUE);
  1288. npart->raw_headers = rspamd_message_headers_ref(
  1289. MESSAGE_FIELD(task, raw_headers));
  1290. /* Preserve the natural order */
  1291. if (MESSAGE_FIELD(task, headers_order)) {
  1292. LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
  1293. }
  1294. }
  1295. hdr = rspamd_message_get_header_from_hash(
  1296. MESSAGE_FIELD(task, raw_headers),
  1297. "Content-Type", FALSE);
  1298. }
  1299. else {
  1300. /* First apply heuristic, maybe we have just headers */
  1301. hdr_pos = rspamd_mime_parser_headers_heuristic(&str, &body_pos);
  1302. if (hdr_pos > 0 && hdr_pos <= str.len) {
  1303. MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
  1304. MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
  1305. MESSAGE_FIELD(task, raw_headers_content).body_start = str.str +
  1306. body_pos;
  1307. if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
  1308. rspamd_mime_headers_process(task,
  1309. MESSAGE_FIELD(task, raw_headers),
  1310. &MESSAGE_FIELD(task, headers_order),
  1311. MESSAGE_FIELD(task, raw_headers_content).begin,
  1312. MESSAGE_FIELD(task, raw_headers_content).len,
  1313. TRUE);
  1314. npart->raw_headers = rspamd_message_headers_ref(
  1315. MESSAGE_FIELD(task, raw_headers));
  1316. /* Preserve the natural order */
  1317. if (MESSAGE_FIELD(task, headers_order)) {
  1318. LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
  1319. }
  1320. }
  1321. hdr = rspamd_message_get_header_from_hash(
  1322. MESSAGE_FIELD(task, raw_headers),
  1323. "Content-Type", FALSE);
  1324. task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
  1325. }
  1326. else {
  1327. body_pos = 0;
  1328. }
  1329. }
  1330. pbegin = st->start + body_pos;
  1331. plen = st->end - pbegin;
  1332. npart->headers_order = NULL;
  1333. }
  1334. else {
  1335. /*
  1336. * Here are dragons:
  1337. * We allocate new parser context as we need to shift pointers
  1338. */
  1339. nst = g_malloc0(sizeof(*st));
  1340. nst->stack = g_ptr_array_sized_new(4);
  1341. nst->boundaries = g_array_sized_new(FALSE, FALSE,
  1342. sizeof(struct rspamd_mime_boundary), 8);
  1343. nst->start = part->parsed_data.begin;
  1344. nst->end = nst->start + part->parsed_data.len;
  1345. nst->pos = nst->start;
  1346. nst->task = st->task;
  1347. nst->nesting = st->nesting;
  1348. st->nesting++;
  1349. str.str = (gchar *) part->parsed_data.begin;
  1350. str.len = part->parsed_data.len;
  1351. hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
  1352. npart->raw_headers = rspamd_message_headers_new();
  1353. npart->headers_order = NULL;
  1354. if (hdr_pos > 0 && hdr_pos < str.len) {
  1355. npart->raw_headers_str = str.str;
  1356. npart->raw_headers_len = hdr_pos;
  1357. npart->raw_data.begin = str.str + body_pos;
  1358. if (npart->raw_headers_len > 0) {
  1359. rspamd_mime_headers_process(task,
  1360. npart->raw_headers,
  1361. &npart->headers_order,
  1362. npart->raw_headers_str,
  1363. npart->raw_headers_len,
  1364. FALSE);
  1365. /* Preserve the natural order */
  1366. if (npart->headers_order) {
  1367. LL_REVERSE2(npart->headers_order, ord_next);
  1368. }
  1369. }
  1370. hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
  1371. "Content-Type", FALSE);
  1372. }
  1373. else {
  1374. body_pos = 0;
  1375. }
  1376. pbegin = part->parsed_data.begin + body_pos;
  1377. plen = part->parsed_data.len - body_pos;
  1378. }
  1379. npart->raw_data.begin = pbegin;
  1380. npart->raw_data.len = plen;
  1381. npart->parent_part = part;
  1382. if (hdr == NULL) {
  1383. sel = NULL;
  1384. }
  1385. else {
  1386. DL_FOREACH(hdr, cur)
  1387. {
  1388. ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
  1389. task->task_pool);
  1390. /* Here we prefer multipart content-type or any content-type */
  1391. if (ct) {
  1392. if (sel == NULL) {
  1393. sel = ct;
  1394. }
  1395. else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1396. sel = ct;
  1397. }
  1398. }
  1399. }
  1400. }
  1401. if (sel == NULL) {
  1402. /* For messages we automatically assume plaintext */
  1403. msg_info_task("cannot find content-type for a message, assume text/plain");
  1404. sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
  1405. sel->flags = RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_MISSING;
  1406. RSPAMD_FTOK_ASSIGN(&sel->type, "text");
  1407. RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
  1408. }
  1409. npart->ct = sel;
  1410. if ((part == NULL || nst != st) &&
  1411. (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART | RSPAMD_CONTENT_TYPE_MESSAGE))) {
  1412. /* Not a trivial message, need to preprocess */
  1413. rspamd_mime_preprocess_message(task, npart, nst);
  1414. }
  1415. if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
  1416. g_ptr_array_add(nst->stack, npart);
  1417. nst->nesting++;
  1418. npart->part_type = RSPAMD_MIME_PART_MULTIPART;
  1419. npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
  1420. sizeof(struct rspamd_mime_multipart));
  1421. memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
  1422. sizeof(rspamd_ftok_t));
  1423. ret = rspamd_mime_parse_multipart_part(task, npart, nst, err);
  1424. }
  1425. else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
  1426. if ((ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err)) == RSPAMD_MIME_PARSE_OK) {
  1427. npart->part_type = RSPAMD_MIME_PART_MESSAGE;
  1428. ret = rspamd_mime_parse_message(task, npart, nst, err);
  1429. }
  1430. }
  1431. else {
  1432. ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err);
  1433. }
  1434. if (ret != RSPAMD_MIME_PARSE_OK) {
  1435. return ret;
  1436. }
  1437. if (part && st->stack->len > 0) {
  1438. /* Remove message part from the parent stack */
  1439. g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
  1440. st->nesting--;
  1441. }
  1442. /* Process leftovers for boundaries */
  1443. if (nst->boundaries) {
  1444. struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
  1445. *end_boundary = NULL;
  1446. goffset cur_offset = nst->pos - nst->start,
  1447. end_offset = st->end - st->start;
  1448. guint sel_idx = 0;
  1449. for (;;) {
  1450. start_boundary = NULL;
  1451. for (i = sel_idx; i < nst->boundaries->len; i++) {
  1452. boundary = &g_array_index(nst->boundaries,
  1453. struct rspamd_mime_boundary, i);
  1454. if (boundary->start > cur_offset &&
  1455. boundary->boundary < end_offset &&
  1456. !RSPAMD_BOUNDARY_IS_CLOSED(boundary)) {
  1457. start_boundary = boundary;
  1458. sel_idx = i;
  1459. break;
  1460. }
  1461. }
  1462. if (start_boundary) {
  1463. const gchar *start, *end;
  1464. if (nst->boundaries->len > sel_idx + 1) {
  1465. end_boundary = &g_array_index(nst->boundaries,
  1466. struct rspamd_mime_boundary, sel_idx + 1);
  1467. end = nst->start + end_boundary->boundary;
  1468. }
  1469. else {
  1470. end = nst->end;
  1471. }
  1472. sel_idx++;
  1473. start = nst->start + start_boundary->start;
  1474. if (end > start &&
  1475. (ret = rspamd_mime_process_multipart_node(task, nst,
  1476. NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) {
  1477. if (nst != st) {
  1478. rspamd_mime_parse_stack_free(nst);
  1479. }
  1480. if (ret == RSPAMD_MIME_PARSE_NO_PART) {
  1481. return RSPAMD_MIME_PARSE_OK;
  1482. }
  1483. return ret;
  1484. }
  1485. }
  1486. else {
  1487. break;
  1488. }
  1489. }
  1490. }
  1491. if (nst != st) {
  1492. rspamd_mime_parse_stack_free(nst);
  1493. }
  1494. return ret;
  1495. }
  1496. enum rspamd_mime_parse_error
  1497. rspamd_mime_parse_task(struct rspamd_task *task, GError **err)
  1498. {
  1499. struct rspamd_mime_parser_ctx *st;
  1500. enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
  1501. if (lib_ctx == NULL) {
  1502. rspamd_mime_parser_init_lib();
  1503. }
  1504. if (++lib_ctx->key_usages > max_key_usages) {
  1505. /* Regenerate siphash key */
  1506. ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
  1507. lib_ctx->key_usages = 0;
  1508. }
  1509. st = g_malloc0(sizeof(*st));
  1510. st->stack = g_ptr_array_sized_new(4);
  1511. st->pos = MESSAGE_FIELD(task, raw_headers_content).body_start;
  1512. st->end = task->msg.begin + task->msg.len;
  1513. st->boundaries = g_array_sized_new(FALSE, FALSE,
  1514. sizeof(struct rspamd_mime_boundary), 8);
  1515. st->task = task;
  1516. if (st->pos == NULL) {
  1517. st->pos = task->msg.begin;
  1518. }
  1519. st->start = task->msg.begin;
  1520. ret = rspamd_mime_parse_message(task, NULL, st, err);
  1521. rspamd_mime_parse_stack_free(st);
  1522. return ret;
  1523. }