You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

archives.c 43KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "message.h"
  18. #include "task.h"
  19. #include "archives.h"
  20. #include "libmime/mime_encoding.h"
  21. #include <unicode/uchar.h>
  22. #include <unicode/utf8.h>
  23. #include <unicode/utf16.h>
  24. #include <unicode/ucnv.h>
  25. #define msg_debug_archive(...) rspamd_conditional_debug_fast (NULL, NULL, \
  26. rspamd_archive_log_id, "archive", task->task_pool->tag.uid, \
  27. G_STRFUNC, \
  28. __VA_ARGS__)
  29. INIT_LOG_MODULE(archive)
  30. static void
  31. rspamd_archive_dtor (gpointer p)
  32. {
  33. struct rspamd_archive *arch = p;
  34. struct rspamd_archive_file *f;
  35. guint i;
  36. for (i = 0; i < arch->files->len; i ++) {
  37. f = g_ptr_array_index (arch->files, i);
  38. if (f->fname) {
  39. g_string_free (f->fname, TRUE);
  40. }
  41. g_free (f);
  42. }
  43. g_ptr_array_free (arch->files, TRUE);
  44. }
  45. static GString *
  46. rspamd_archive_file_try_utf (struct rspamd_task *task,
  47. const gchar *in, gsize inlen)
  48. {
  49. const gchar *charset = NULL, *p, *end;
  50. GString *res;
  51. charset = rspamd_mime_charset_find_by_content (in, inlen, TRUE);
  52. if (charset) {
  53. UChar *tmp;
  54. UErrorCode uc_err = U_ZERO_ERROR;
  55. gint32 r, clen, dlen;
  56. struct rspamd_charset_converter *conv;
  57. UConverter *utf8_converter;
  58. conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
  59. TRUE, &uc_err);
  60. utf8_converter = rspamd_get_utf8_converter ();
  61. if (conv == NULL) {
  62. msg_info_task ("cannot open converter for %s: %s",
  63. charset, u_errorName (uc_err));
  64. return NULL;
  65. }
  66. tmp = g_malloc (sizeof (*tmp) * (inlen + 1));
  67. r = rspamd_converter_to_uchars (conv, tmp, inlen + 1,
  68. in, inlen, &uc_err);
  69. if (!U_SUCCESS (uc_err)) {
  70. msg_info_task ("cannot convert data to unicode from %s: %s",
  71. charset, u_errorName (uc_err));
  72. g_free (tmp);
  73. return NULL;
  74. }
  75. clen = ucnv_getMaxCharSize (utf8_converter);
  76. dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
  77. res = g_string_sized_new (dlen);
  78. r = ucnv_fromUChars (utf8_converter, res->str, dlen, tmp, r, &uc_err);
  79. if (!U_SUCCESS (uc_err)) {
  80. msg_info_task ("cannot convert data from unicode from %s: %s",
  81. charset, u_errorName (uc_err));
  82. g_free (tmp);
  83. g_string_free (res, TRUE);
  84. return NULL;
  85. }
  86. g_free (tmp);
  87. res->len = r;
  88. msg_debug_archive ("converted from %s to UTF-8 inlen: %z, outlen: %d",
  89. charset, inlen, r);
  90. }
  91. else {
  92. /* Convert unsafe characters to '?' */
  93. res = g_string_sized_new (inlen);
  94. p = in;
  95. end = in + inlen;
  96. while (p < end) {
  97. if (g_ascii_isgraph (*p)) {
  98. g_string_append_c (res, *p);
  99. }
  100. else {
  101. g_string_append_c (res, '?');
  102. }
  103. p ++;
  104. }
  105. }
  106. return res;
  107. }
  108. static void
  109. rspamd_archive_process_zip (struct rspamd_task *task,
  110. struct rspamd_mime_part *part)
  111. {
  112. const guchar *p, *start, *end, *eocd = NULL, *cd;
  113. const guint32 eocd_magic = 0x06054b50, cd_basic_len = 46;
  114. const guchar cd_magic[] = {0x50, 0x4b, 0x01, 0x02};
  115. const guint max_processed = 1024;
  116. guint32 cd_offset, cd_size, comp_size, uncomp_size, processed = 0;
  117. guint16 extra_len, fname_len, comment_len;
  118. struct rspamd_archive *arch;
  119. struct rspamd_archive_file *f = NULL;
  120. /* Zip files have interesting data at the end of archive */
  121. p = part->parsed_data.begin + part->parsed_data.len - 1;
  122. start = part->parsed_data.begin;
  123. end = p;
  124. /* Search for EOCD:
  125. * 22 bytes is a typical size of eocd without a comment and
  126. * end points one byte after the last character
  127. */
  128. p -= 21;
  129. while (p > start + sizeof (guint32)) {
  130. guint32 t;
  131. if (processed > max_processed) {
  132. break;
  133. }
  134. /* XXX: not an efficient approach */
  135. memcpy (&t, p, sizeof (t));
  136. if (GUINT32_FROM_LE (t) == eocd_magic) {
  137. eocd = p;
  138. break;
  139. }
  140. p --;
  141. processed ++;
  142. }
  143. if (eocd == NULL) {
  144. /* Not a zip file */
  145. msg_info_task ("zip archive is invalid (no EOCD)");
  146. return;
  147. }
  148. if (end - eocd < 21) {
  149. msg_info_task ("zip archive is invalid (short EOCD)");
  150. return;
  151. }
  152. memcpy (&cd_size, eocd + 12, sizeof (cd_size));
  153. cd_size = GUINT32_FROM_LE (cd_size);
  154. memcpy (&cd_offset, eocd + 16, sizeof (cd_offset));
  155. cd_offset = GUINT32_FROM_LE (cd_offset);
  156. /* We need to check sanity as well */
  157. if (cd_offset + cd_size > (guint)(eocd - start)) {
  158. msg_info_task ("zip archive is invalid (bad size/offset for CD)");
  159. return;
  160. }
  161. cd = start + cd_offset;
  162. arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
  163. arch->files = g_ptr_array_new ();
  164. arch->type = RSPAMD_ARCHIVE_ZIP;
  165. rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
  166. arch);
  167. while (cd < start + cd_offset + cd_size) {
  168. guint16 flags;
  169. /* Read central directory record */
  170. if (eocd - cd < cd_basic_len ||
  171. memcmp (cd, cd_magic, sizeof (cd_magic)) != 0) {
  172. msg_info_task ("zip archive is invalid (bad cd record)");
  173. return;
  174. }
  175. memcpy (&flags, cd + 8, sizeof (guint16));
  176. flags = GUINT16_FROM_LE (flags);
  177. memcpy (&comp_size, cd + 20, sizeof (guint32));
  178. comp_size = GUINT32_FROM_LE (comp_size);
  179. memcpy (&uncomp_size, cd + 24, sizeof (guint32));
  180. uncomp_size = GUINT32_FROM_LE (uncomp_size);
  181. memcpy (&fname_len, cd + 28, sizeof (fname_len));
  182. fname_len = GUINT16_FROM_LE (fname_len);
  183. memcpy (&extra_len, cd + 30, sizeof (extra_len));
  184. extra_len = GUINT16_FROM_LE (extra_len);
  185. memcpy (&comment_len, cd + 32, sizeof (comment_len));
  186. comment_len = GUINT16_FROM_LE (comment_len);
  187. if (cd + fname_len + comment_len + extra_len + cd_basic_len > eocd) {
  188. msg_info_task ("zip archive is invalid (too large cd record)");
  189. return;
  190. }
  191. f = g_malloc0 (sizeof (*f));
  192. f->fname = rspamd_archive_file_try_utf (task,
  193. cd + cd_basic_len, fname_len);
  194. f->compressed_size = comp_size;
  195. f->uncompressed_size = uncomp_size;
  196. if (flags & 0x41u) {
  197. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  198. }
  199. if (f->fname) {
  200. g_ptr_array_add (arch->files, f);
  201. msg_debug_archive ("found file in zip archive: %v", f->fname);
  202. }
  203. else {
  204. g_free (f);
  205. return;
  206. }
  207. /* Process extra fields */
  208. const guchar *extra = cd + fname_len + cd_basic_len;
  209. p = extra;
  210. while (p + sizeof (guint16) * 2 < extra + extra_len) {
  211. guint16 hid, hlen;
  212. memcpy (&hid, p, sizeof (guint16));
  213. hid = GUINT16_FROM_LE (hid);
  214. memcpy (&hlen, p + sizeof (guint16), sizeof (guint16));
  215. hlen = GUINT16_FROM_LE (hlen);
  216. if (hid == 0x0017) {
  217. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  218. }
  219. p += hlen + sizeof (guint16) * 2;
  220. }
  221. cd += fname_len + comment_len + extra_len + cd_basic_len;
  222. }
  223. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  224. part->specific.arch = arch;
  225. if (part->cd) {
  226. arch->archive_name = &part->cd->filename;
  227. }
  228. arch->size = part->parsed_data.len;
  229. }
  230. static inline gint
  231. rspamd_archive_rar_read_vint (const guchar *start, gsize remain, guint64 *res)
  232. {
  233. /*
  234. * From http://www.rarlab.com/technote.htm:
  235. * Variable length integer. Can include one or more bytes, where
  236. * lower 7 bits of every byte contain integer data and highest bit
  237. * in every byte is the continuation flag.
  238. * If highest bit is 0, this is the last byte in sequence.
  239. * So first byte contains 7 least significant bits of integer and
  240. * continuation flag. Second byte, if present, contains next 7 bits and so on.
  241. */
  242. guint64 t = 0;
  243. guint shift = 0;
  244. const guchar *p = start;
  245. while (remain > 0 && shift <= 57) {
  246. if (*p & 0x80) {
  247. t |= ((guint64)(*p & 0x7f)) << shift;
  248. }
  249. else {
  250. t |= ((guint64)(*p & 0x7f)) << shift;
  251. p ++;
  252. break;
  253. }
  254. shift += 7;
  255. p++;
  256. remain --;
  257. }
  258. if (remain == 0 || shift > 64) {
  259. return -1;
  260. }
  261. *res = GUINT64_FROM_LE (t);
  262. return p - start;
  263. }
  /*
   * Helper macros for the RAR parsers.  They all operate on variables named
   * `p` (read cursor) and `end` (one past the last byte) of the enclosing
   * function, log through msg_debug_archive and execute a plain `return;`
   * on failure, so they are only usable inside functions returning void.
   */

  /* Advance the cursor by n bytes; n must be positive and available */
  #define RAR_SKIP_BYTES(n) do { \
      if ((n) <= 0) { \
          msg_debug_archive ("rar archive is invalid (bad skip value)"); \
          return; \
      } \
      if ((gsize)(end - p) < (n)) { \
          msg_debug_archive ("rar archive is invalid (truncated)"); \
          return; \
      } \
      p += (n); \
  } while (0)

  /* Decode a vint at the cursor into `vint`/`r` without moving the cursor */
  #define RAR_READ_VINT() do { \
      r = rspamd_archive_rar_read_vint (p, end - p, &vint); \
      if (r == -1) { \
          msg_debug_archive ("rar archive is invalid (bad vint)"); \
          return; \
      } \
      if (r == 0) { \
          msg_debug_archive ("rar archive is invalid (BAD vint offset)"); \
          return; \
      } \
  } while (0)

  /* Decode a vint at the cursor into `vint` and step over it */
  #define RAR_READ_VINT_SKIP() do { \
      r = rspamd_archive_rar_read_vint (p, end - p, &vint); \
      if (r == -1) { \
          msg_debug_archive ("rar archive is invalid (bad vint)"); \
          return; \
      } \
      p += r; \
  } while (0)

  /* Read a little-endian 16 bit value into n and step over it */
  #define RAR_READ_UINT16(n) do { \
      if (end - p < (glong)sizeof (guint16)) { \
          msg_debug_archive ("rar archive is invalid (bad int16)"); \
          return; \
      } \
      n = p[0] | (p[1] << 8); \
      p += sizeof (guint16); \
  } while (0)

  /* Read a little-endian 32 bit value into n and step over it */
  #define RAR_READ_UINT32(n) do { \
      if (end - p < (glong)sizeof (guint32)) { \
          msg_debug_archive ("rar archive is invalid (bad int32)"); \
          return; \
      } \
      n = (guint)p[0] | ((guint)p[1] << 8) | \
          ((guint)p[2] << 16) | ((guint)p[3] << 24); \
      p += sizeof (guint32); \
  } while (0)
  310. static void
  311. rspamd_archive_process_rar_v4 (struct rspamd_task *task, const guchar *start,
  312. const guchar *end, struct rspamd_mime_part *part)
  313. {
  314. const guchar *p = start, *start_section;
  315. guint8 type;
  316. guint flags;
  317. guint64 sz, comp_sz = 0, uncomp_sz = 0;
  318. struct rspamd_archive *arch;
  319. struct rspamd_archive_file *f;
  320. arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
  321. arch->files = g_ptr_array_new ();
  322. arch->type = RSPAMD_ARCHIVE_RAR;
  323. rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
  324. arch);
  325. while (p < end) {
  326. /* Crc16 */
  327. start_section = p;
  328. RAR_SKIP_BYTES (sizeof (guint16));
  329. type = *p;
  330. p ++;
  331. RAR_READ_UINT16 (flags);
  332. if (type == 0x73) {
  333. /* Main header, check for encryption */
  334. if (flags & 0x80) {
  335. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  336. goto end;
  337. }
  338. }
  339. RAR_READ_UINT16 (sz);
  340. if (flags & 0x8000) {
  341. /* We also need to read ADD_SIZE element */
  342. guint32 tmp;
  343. RAR_READ_UINT32 (tmp);
  344. sz += tmp;
  345. /* This is also used as PACK_SIZE */
  346. comp_sz = tmp;
  347. }
  348. if (sz == 0) {
  349. /* Zero sized block - error */
  350. msg_debug_archive ("rar archive is invalid (zero size block)");
  351. return;
  352. }
  353. if (type == 0x74) {
  354. guint fname_len;
  355. /* File header */
  356. /* Uncompressed size */
  357. RAR_READ_UINT32 (uncomp_sz);
  358. /* Skip to NAME_SIZE element */
  359. RAR_SKIP_BYTES (11);
  360. RAR_READ_UINT16 (fname_len);
  361. if (fname_len == 0 || fname_len > (gsize)(end - p)) {
  362. msg_debug_archive ("rar archive is invalid (bad filename size: %d)",
  363. fname_len);
  364. return;
  365. }
  366. /* Attrs */
  367. RAR_SKIP_BYTES (4);
  368. if (flags & 0x100) {
  369. /* We also need to read HIGH_PACK_SIZE */
  370. guint32 tmp;
  371. RAR_READ_UINT32 (tmp);
  372. sz += tmp;
  373. comp_sz += tmp;
  374. /* HIGH_UNP_SIZE */
  375. RAR_READ_UINT32 (tmp);
  376. uncomp_sz += tmp;
  377. }
  378. f = g_malloc0 (sizeof (*f));
  379. if (flags & 0x200) {
  380. /* We have unicode + normal version */
  381. guchar *tmp;
  382. tmp = memchr (p, '\0', fname_len);
  383. if (tmp != NULL) {
  384. /* Just use ASCII version */
  385. f->fname = rspamd_archive_file_try_utf (task, p, tmp - p);
  386. msg_debug_archive ("found ascii filename in rarv4 archive: %v",
  387. f->fname);
  388. }
  389. else {
  390. /* We have UTF8 filename, use it as is */
  391. f->fname = rspamd_archive_file_try_utf (task, p, fname_len);
  392. msg_debug_archive ("found utf filename in rarv4 archive: %v",
  393. f->fname);
  394. }
  395. }
  396. else {
  397. f->fname = rspamd_archive_file_try_utf (task, p, fname_len);
  398. msg_debug_archive ("found ascii (old) filename in rarv4 archive: %v",
  399. f->fname);
  400. }
  401. f->compressed_size = comp_sz;
  402. f->uncompressed_size = uncomp_sz;
  403. if (flags & 0x4) {
  404. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  405. }
  406. if (f->fname) {
  407. g_ptr_array_add (arch->files, f);
  408. }
  409. else {
  410. g_free (f);
  411. }
  412. }
  413. p = start_section;
  414. RAR_SKIP_BYTES (sz);
  415. }
  416. end:
  417. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  418. part->specific.arch = arch;
  419. arch->archive_name = &part->cd->filename;
  420. arch->size = part->parsed_data.len;
  421. }
  422. static void
  423. rspamd_archive_process_rar (struct rspamd_task *task,
  424. struct rspamd_mime_part *part)
  425. {
  426. const guchar *p, *end, *section_start;
  427. const guchar rar_v5_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00},
  428. rar_v4_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00};
  429. const guint rar_encrypted_header = 4, rar_main_header = 1,
  430. rar_file_header = 2;
  431. guint64 vint, sz, comp_sz = 0, uncomp_sz = 0, flags = 0, type = 0,
  432. extra_sz = 0;
  433. struct rspamd_archive *arch;
  434. struct rspamd_archive_file *f;
  435. gint r;
  436. p = part->parsed_data.begin;
  437. end = p + part->parsed_data.len;
  438. if ((gsize)(end - p) <= sizeof (rar_v5_magic)) {
  439. msg_debug_archive ("rar archive is invalid (too small)");
  440. return;
  441. }
  442. if (memcmp (p, rar_v5_magic, sizeof (rar_v5_magic)) == 0) {
  443. p += sizeof (rar_v5_magic);
  444. }
  445. else if (memcmp (p, rar_v4_magic, sizeof (rar_v4_magic)) == 0) {
  446. p += sizeof (rar_v4_magic);
  447. rspamd_archive_process_rar_v4 (task, p, end, part);
  448. return;
  449. }
  450. else {
  451. msg_debug_archive ("rar archive is invalid (no rar magic)");
  452. return;
  453. }
  454. /* Rar v5 format */
  455. arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
  456. arch->files = g_ptr_array_new ();
  457. arch->type = RSPAMD_ARCHIVE_RAR;
  458. rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
  459. arch);
  460. /* Now we can have either encryption header or archive header */
  461. /* Crc 32 */
  462. RAR_SKIP_BYTES (sizeof (guint32));
  463. /* Size */
  464. RAR_READ_VINT_SKIP ();
  465. sz = vint;
  466. /* Type */
  467. section_start = p;
  468. RAR_READ_VINT_SKIP ();
  469. type = vint;
  470. /* Header flags */
  471. RAR_READ_VINT_SKIP ();
  472. flags = vint;
  473. if (flags & 0x1) {
  474. /* Have extra zone */
  475. RAR_READ_VINT_SKIP ();
  476. }
  477. if (flags & 0x2) {
  478. /* Data zone is presented */
  479. RAR_READ_VINT_SKIP ();
  480. sz += vint;
  481. }
  482. if (type == rar_encrypted_header) {
  483. /* We can't read any further information as archive is encrypted */
  484. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  485. goto end;
  486. }
  487. else if (type != rar_main_header) {
  488. msg_debug_archive ("rar archive is invalid (bad main header)");
  489. return;
  490. }
  491. /* Nothing useful in main header */
  492. p = section_start;
  493. RAR_SKIP_BYTES (sz);
  494. while (p < end) {
  495. gboolean has_extra = FALSE;
  496. /* Read the next header */
  497. /* Crc 32 */
  498. RAR_SKIP_BYTES (sizeof (guint32));
  499. /* Size */
  500. RAR_READ_VINT_SKIP ();
  501. sz = vint;
  502. if (sz == 0) {
  503. /* Zero sized block - error */
  504. msg_debug_archive ("rar archive is invalid (zero size block)");
  505. return;
  506. }
  507. section_start = p;
  508. /* Type */
  509. RAR_READ_VINT_SKIP ();
  510. type = vint;
  511. /* Header flags */
  512. RAR_READ_VINT_SKIP ();
  513. flags = vint;
  514. if (flags & 0x1) {
  515. /* Have extra zone */
  516. RAR_READ_VINT_SKIP ();
  517. extra_sz = vint;
  518. has_extra = TRUE;
  519. }
  520. if (flags & 0x2) {
  521. /* Data zone is presented */
  522. RAR_READ_VINT_SKIP ();
  523. sz += vint;
  524. comp_sz = vint;
  525. }
  526. if (type != rar_file_header) {
  527. p = section_start;
  528. RAR_SKIP_BYTES (sz);
  529. }
  530. else {
  531. /* We have a file header, go forward */
  532. guint64 fname_len;
  533. /* File header specific flags */
  534. RAR_READ_VINT_SKIP ();
  535. flags = vint;
  536. /* Unpacked size */
  537. RAR_READ_VINT_SKIP ();
  538. uncomp_sz = vint;
  539. /* Attributes */
  540. RAR_READ_VINT_SKIP ();
  541. if (flags & 0x2) {
  542. /* Unix mtime */
  543. RAR_SKIP_BYTES (sizeof (guint32));
  544. }
  545. if (flags & 0x4) {
  546. /* Crc32 */
  547. RAR_SKIP_BYTES (sizeof (guint32));
  548. }
  549. /* Compression */
  550. RAR_READ_VINT_SKIP ();
  551. /* Host OS */
  552. RAR_READ_VINT_SKIP ();
  553. /* Filename length (finally!) */
  554. RAR_READ_VINT_SKIP ();
  555. fname_len = vint;
  556. if (fname_len == 0 || fname_len > (gsize)(end - p)) {
  557. msg_debug_archive ("rar archive is invalid (bad filename size)");
  558. return;
  559. }
  560. f = g_malloc0 (sizeof (*f));
  561. f->uncompressed_size = uncomp_sz;
  562. f->compressed_size = comp_sz;
  563. f->fname = rspamd_archive_file_try_utf (task, p, fname_len);
  564. if (f->fname) {
  565. msg_debug_archive ("added rarv5 file: %v", f->fname);
  566. g_ptr_array_add (arch->files, f);
  567. }
  568. else {
  569. g_free (f);
  570. f = NULL;
  571. }
  572. if (f && has_extra && extra_sz > 0 &&
  573. p + fname_len + extra_sz < end) {
  574. /* Try to find encryption record in extra field */
  575. const guchar *ex = p + fname_len;
  576. while (ex < p + extra_sz) {
  577. const guchar *t;
  578. gint64 cur_sz = 0, sec_type = 0;
  579. r = rspamd_archive_rar_read_vint (ex, extra_sz, &cur_sz);
  580. if (r == -1) {
  581. msg_debug_archive ("rar archive is invalid (bad vint)");
  582. return;
  583. }
  584. t = ex + r;
  585. r = rspamd_archive_rar_read_vint (t, extra_sz - r, &sec_type);
  586. if (r == -1) {
  587. msg_debug_archive ("rar archive is invalid (bad vint)");
  588. return;
  589. }
  590. if (sec_type == 0x01) {
  591. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  592. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  593. break;
  594. }
  595. ex += cur_sz;
  596. }
  597. }
  598. /* Restore p to the beginning of the header */
  599. p = section_start;
  600. RAR_SKIP_BYTES (sz);
  601. }
  602. }
  603. end:
  604. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  605. part->specific.arch = arch;
  606. if (part->cd != NULL) {
  607. arch->archive_name = &part->cd->filename;
  608. }
  609. arch->size = part->parsed_data.len;
  610. }
  611. static inline gint
  612. rspamd_archive_7zip_read_vint (const guchar *start, gsize remain, guint64 *res)
  613. {
  614. /*
  615. * REAL_UINT64 means real UINT64.
  616. * UINT64 means real UINT64 encoded with the following scheme:
  617. *
  618. * Size of encoding sequence depends from first byte:
  619. * First_Byte Extra_Bytes Value
  620. * (binary)
  621. * 0xxxxxxx : ( xxxxxxx )
  622. * 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y
  623. * 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y
  624. * ...
  625. * 1111110x BYTE y[6] : ( x << (8 * 6)) + y
  626. * 11111110 BYTE y[7] : y
  627. * 11111111 BYTE y[8] : y
  628. */
  629. guchar t;
  630. if (remain == 0) {
  631. return -1;
  632. }
  633. t = *start;
  634. if (!isset (&t, 7)) {
  635. /* Trivial case */
  636. *res = t;
  637. return 1;
  638. }
  639. else if (t == 0xFF) {
  640. if (remain >= sizeof (guint64) + 1) {
  641. memcpy (res, start + 1, sizeof (guint64));
  642. *res = GUINT64_FROM_LE (*res);
  643. return sizeof (guint64) + 1;
  644. }
  645. }
  646. else {
  647. gint cur_bit = 6, intlen = 1;
  648. const guchar bmask = 0xFF;
  649. guint64 tgt;
  650. while (cur_bit > 0) {
  651. if (!isset (&t, cur_bit)) {
  652. if (remain >= intlen + 1) {
  653. memcpy (&tgt, start + 1, intlen);
  654. tgt = GUINT64_FROM_LE (tgt);
  655. /* Shift back */
  656. tgt >>= sizeof (tgt) - NBBY * intlen;
  657. /* Add masked value */
  658. tgt += (guint64)(t & (bmask >> (NBBY - cur_bit)))
  659. << (NBBY * intlen);
  660. *res = tgt;
  661. return intlen + 1;
  662. }
  663. }
  664. cur_bit --;
  665. intlen ++;
  666. }
  667. }
  668. return -1;
  669. }
  /*
   * Helper macros for the 7zip parsers.  They operate on variables named `p`
   * (read cursor) and `end` (one past the last byte) of the enclosing
   * function and log through msg_debug_archive.  Note the different failure
   * exits: SZ_READ_VINT_SKIP and SZ_READ_UINT64 execute `return;`, while
   * SZ_READ_VINT and SZ_SKIP_BYTES execute `return NULL;`.
   */

  /* Decode a 7zip vint into the enclosing `vint`/`r` and step over it */
  #define SZ_READ_VINT_SKIP() do { \
      r = rspamd_archive_7zip_read_vint (p, end - p, &vint); \
      if (r == -1) { \
          msg_debug_archive ("7z archive is invalid (bad vint)"); \
          return; \
      } \
      p += r; \
  } while (0)

  /* Decode a 7zip vint into `var` and step over it */
  #define SZ_READ_VINT(var) do { \
      int r; \
      r = rspamd_archive_7zip_read_vint (p, end - p, &(var)); \
      if (r == -1) { \
          msg_debug_archive ("7z archive is invalid (bad vint): %s", G_STRLOC); \
          return NULL; \
      } \
      p += r; \
  } while (0)

  /* Read a fixed-size little-endian 64 bit value into n and step over it */
  #define SZ_READ_UINT64(n) do { \
      if (end - p < (goffset)sizeof (guint64)) { \
          msg_debug_archive ("7zip archive is invalid (bad uint64): %s", G_STRLOC); \
          return; \
      } \
      memcpy (&(n), p, sizeof (guint64)); \
      n = GUINT64_FROM_LE(n); \
      p += sizeof (guint64); \
  } while (0)

  /* Advance the cursor by n bytes if that many are available */
  #define SZ_SKIP_BYTES(n) do { \
      if (end - p < (n)) { \
          msg_debug_archive ("7zip archive is invalid (truncated); wanted to read %d bytes, %d avail: %s", (gint)(n), (gint)(end - p), G_STRLOC); \
          return NULL; \
      } \
      p += (n); \
  } while (0)
  /*
   * One-byte markers that introduce the individual sections and properties
   * of a 7zip header.  The numeric values are part of the on-disk format and
   * are therefore spelled out explicitly.
   */
  enum rspamd_7zip_header_mark {
      /* Terminator and top level containers */
      kEnd                   = 0x00,
      kHeader                = 0x01,
      kArchiveProperties     = 0x02,
      kAdditionalStreamsInfo = 0x03,
      kMainStreamsInfo       = 0x04,
      kFilesInfo             = 0x05,

      /* Stream description */
      kPackInfo              = 0x06,
      kUnPackInfo            = 0x07,
      kSubStreamsInfo        = 0x08,
      kSize                  = 0x09,
      kCRC                   = 0x0A,
      kFolder                = 0x0B,
      kCodersUnPackSize      = 0x0C,
      kNumUnPackStream       = 0x0D,

      /* Per-file properties */
      kEmptyStream           = 0x0E,
      kEmptyFile             = 0x0F,
      kAnti                  = 0x10,
      kName                  = 0x11,
      kCTime                 = 0x12,
      kATime                 = 0x13,
      kMTime                 = 0x14,
      kWinAttributes         = 0x15,
      kComment               = 0x16,

      /* Miscellaneous */
      kEncodedHeader         = 0x17,
      kStartPos              = 0x18,
      kDummy                 = 0x19,
  };
  /* Codec identifiers that denote an encryption method */
  #define _7Z_CRYPTO_MAIN_ZIP         0x06F10101 /* Main Zip crypto algo */
  #define _7Z_CRYPTO_RAR_29           0x06F10303 /* Rar29 AES-128 + (modified SHA-1) */
  #define _7Z_CRYPTO_AES_256_SHA_256  0x06F10701 /* AES-256 + SHA-256 */

  /* Evaluates to true when codec_id is one of the crypto codecs above */
  #define IS_SZ_ENCRYPTED(codec_id) \
      (((codec_id) == _7Z_CRYPTO_MAIN_ZIP) || \
       ((codec_id) == _7Z_CRYPTO_RAR_29) || \
       ((codec_id) == _7Z_CRYPTO_AES_256_SHA_256))
  739. static const guchar *
  740. rspamd_7zip_read_bits (struct rspamd_task *task,
  741. const guchar *p, const guchar *end,
  742. struct rspamd_archive *arch, guint nbits,
  743. guint *pbits_set)
  744. {
  745. unsigned mask = 0, avail = 0, i;
  746. gboolean bit_set = 0;
  747. for (i = 0; i < nbits; i++) {
  748. if (mask == 0) {
  749. avail = *p;
  750. SZ_SKIP_BYTES(1);
  751. mask = 0x80;
  752. }
  753. bit_set = (avail & mask) ? 1 : 0;
  754. if (bit_set && pbits_set) {
  755. (*pbits_set) ++;
  756. }
  757. mask >>= 1;
  758. }
  759. return p;
  760. }
  761. static const guchar *
  762. rspamd_7zip_read_digest (struct rspamd_task *task,
  763. const guchar *p, const guchar *end,
  764. struct rspamd_archive *arch,
  765. guint64 num_streams,
  766. guint *pdigest_read)
  767. {
  768. guchar all_defined = *p;
  769. guint64 i;
  770. guint num_defined = 0;
  771. /*
  772. * BYTE AllAreDefined
  773. * if (AllAreDefined == 0)
  774. * {
  775. * for(NumStreams)
  776. * BIT Defined
  777. * }
  778. * UINT32 CRCs[NumDefined]
  779. */
  780. SZ_SKIP_BYTES(1);
  781. if (all_defined) {
  782. num_defined = num_streams;
  783. }
  784. else {
  785. if (num_streams > 8192) {
  786. /* Gah */
  787. return NULL;
  788. }
  789. p = rspamd_7zip_read_bits (task, p, end, arch, num_streams, &num_defined);
  790. if (p == NULL) {
  791. return NULL;
  792. }
  793. }
  794. for (i = 0; i < num_defined; i ++) {
  795. SZ_SKIP_BYTES(sizeof(guint32));
  796. }
  797. if (pdigest_read) {
  798. *pdigest_read = num_defined;
  799. }
  800. return p;
  801. }
  802. static const guchar *
  803. rspamd_7zip_read_pack_info (struct rspamd_task *task,
  804. const guchar *p, const guchar *end,
  805. struct rspamd_archive *arch)
  806. {
  807. guint64 pack_pos = 0, pack_streams = 0, i, cur_sz;
  808. guint num_digests = 0;
  809. guchar t;
  810. /*
  811. * UINT64 PackPos
  812. * UINT64 NumPackStreams
  813. *
  814. * []
  815. * BYTE NID::kSize (0x09)
  816. * UINT64 PackSizes[NumPackStreams]
  817. * []
  818. *
  819. * []
  820. * BYTE NID::kCRC (0x0A)
  821. * PackStreamDigests[NumPackStreams]
  822. * []
  823. * BYTE NID::kEnd
  824. */
  825. SZ_READ_VINT(pack_pos);
  826. SZ_READ_VINT(pack_streams);
  827. while (p != NULL && p < end) {
  828. t = *p;
  829. SZ_SKIP_BYTES(1);
  830. msg_debug_archive ("7zip: read pack info %xc", t);
  831. switch (t) {
  832. case kSize:
  833. /* We need to skip pack_streams VINTS */
  834. for (i = 0; i < pack_streams; i++) {
  835. SZ_READ_VINT(cur_sz);
  836. }
  837. break;
  838. case kCRC:
  839. /* CRCs are more complicated */
  840. p = rspamd_7zip_read_digest (task, p, end, arch, pack_streams,
  841. &num_digests);
  842. break;
  843. case kEnd:
  844. goto end;
  845. break;
  846. default:
  847. p = NULL;
  848. msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
  849. goto end;
  850. break;
  851. }
  852. }
  853. end:
  854. return p;
  855. }
  856. static const guchar *
  857. rspamd_7zip_read_folder (struct rspamd_task *task,
  858. const guchar *p, const guchar *end,
  859. struct rspamd_archive *arch, guint *pnstreams, guint *ndigests)
  860. {
  861. guint64 ncoders = 0, i, j, noutstreams = 0, ninstreams = 0;
  862. SZ_READ_VINT (ncoders);
  863. for (i = 0; i < ncoders && p != NULL && p < end; i ++) {
  864. guint64 sz, tmp;
  865. guchar t;
  866. /*
  867. * BYTE
  868. * {
  869. * 0:3 CodecIdSize
  870. * 4: Is Complex Coder
  871. * 5: There Are Attributes
  872. * 6: Reserved
  873. * 7: There are more alternative methods. (Not used anymore, must be 0).
  874. * }
  875. * BYTE CodecId[CodecIdSize]
  876. * if (Is Complex Coder)
  877. * {
  878. * UINT64 NumInStreams;
  879. * UINT64 NumOutStreams;
  880. * }
  881. * if (There Are Attributes)
  882. * {
  883. * UINT64 PropertiesSize
  884. * BYTE Properties[PropertiesSize]
  885. * }
  886. */
  887. t = *p;
  888. SZ_SKIP_BYTES (1);
  889. sz = t & 0xF;
  890. /* Codec ID */
  891. tmp = 0;
  892. for (j = 0; j < sz; j++) {
  893. tmp <<= 8;
  894. tmp += p[j];
  895. }
  896. msg_debug_archive ("7zip: read codec id: %L", tmp);
  897. if (IS_SZ_ENCRYPTED (tmp)) {
  898. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  899. }
  900. SZ_SKIP_BYTES (sz);
  901. if (t & (1u << 4)) {
  902. /* Complex */
  903. SZ_READ_VINT (tmp); /* InStreams */
  904. ninstreams += tmp;
  905. SZ_READ_VINT (tmp); /* OutStreams */
  906. noutstreams += tmp;
  907. }
  908. else {
  909. /* XXX: is it correct ? */
  910. noutstreams ++;
  911. ninstreams ++;
  912. }
  913. if (t & (1u << 5)) {
  914. /* Attributes ... */
  915. SZ_READ_VINT (tmp); /* Size of attrs */
  916. SZ_SKIP_BYTES (tmp);
  917. }
  918. }
  919. if (noutstreams > 1) {
  920. /* BindPairs, WTF, huh */
  921. for (i = 0; i < noutstreams - 1; i ++) {
  922. guint64 tmp;
  923. SZ_READ_VINT (tmp);
  924. SZ_READ_VINT (tmp);
  925. }
  926. }
  927. gint64 npacked = (gint64)ninstreams - (gint64)noutstreams + 1;
  928. msg_debug_archive ("7zip: instreams=%L, outstreams=%L, packed=%L",
  929. ninstreams, noutstreams, npacked);
  930. if (npacked > 1) {
  931. /* Gah... */
  932. for (i = 0; i < npacked; i ++) {
  933. guint64 tmp;
  934. SZ_READ_VINT (tmp);
  935. }
  936. }
  937. *pnstreams = noutstreams;
  938. (*ndigests) += npacked;
  939. return p;
  940. }
  941. static const guchar *
  942. rspamd_7zip_read_coders_info (struct rspamd_task *task,
  943. const guchar *p, const guchar *end,
  944. struct rspamd_archive *arch,
  945. guint *pnum_folders, guint *pnum_nodigest)
  946. {
  947. guint64 num_folders = 0, i, tmp;
  948. guchar t;
  949. guint *folder_nstreams = NULL, num_digests = 0, digests_read = 0;
  950. while (p != NULL && p < end) {
  951. /*
  952. * BYTE NID::kFolder (0x0B)
  953. * UINT64 NumFolders
  954. * BYTE External
  955. * switch(External)
  956. * {
  957. * case 0:
  958. * Folders[NumFolders]
  959. * case 1:
  960. * UINT64 DataStreamIndex
  961. * }
  962. * BYTE ID::kCodersUnPackSize (0x0C)
  963. * for(Folders)
  964. * for(Folder.NumOutStreams)
  965. * UINT64 UnPackSize;
  966. * []
  967. * BYTE NID::kCRC (0x0A)
  968. * UnPackDigests[NumFolders]
  969. * []
  970. * BYTE NID::kEnd
  971. */
  972. t = *p;
  973. SZ_SKIP_BYTES(1);
  974. msg_debug_archive ("7zip: read coders info %xc", t);
  975. switch (t) {
  976. case kFolder:
  977. SZ_READ_VINT (num_folders);
  978. msg_debug_archive ("7zip: nfolders=%L", num_folders);
  979. if (*p != 0) {
  980. /* External folders */
  981. SZ_SKIP_BYTES(1);
  982. SZ_READ_VINT (tmp);
  983. }
  984. else {
  985. SZ_SKIP_BYTES(1);
  986. if (num_folders > 8192) {
  987. /* Gah */
  988. return NULL;
  989. }
  990. if (folder_nstreams) {
  991. g_free (folder_nstreams);
  992. }
  993. folder_nstreams = g_malloc (sizeof (int) * num_folders);
  994. for (i = 0; i < num_folders && p != NULL && p < end; i++) {
  995. p = rspamd_7zip_read_folder (task, p, end, arch,
  996. &folder_nstreams[i], &num_digests);
  997. }
  998. }
  999. break;
  1000. case kCodersUnPackSize:
  1001. for (i = 0; i < num_folders && p != NULL && p < end; i++) {
  1002. if (folder_nstreams) {
  1003. for (guint j = 0; j < folder_nstreams[i]; j++) {
  1004. SZ_READ_VINT (tmp); /* Unpacked size */
  1005. msg_debug_archive ("7zip: unpacked size "
  1006. "(folder=%d, stream=%d) = %L",
  1007. (gint)i, j, tmp);
  1008. }
  1009. }
  1010. else {
  1011. msg_err_task ("internal 7zip error");
  1012. }
  1013. }
  1014. break;
  1015. case kCRC:
  1016. /*
  1017. * Here are dragons. Spec tells that here there could be up
  1018. * to nfolders digests. However, according to the actual source
  1019. * code, in case of multiple out streams there should be digests
  1020. * for all out streams.
  1021. *
  1022. * In the real life (tm) it is even more idiotic: all these digests
  1023. * are in another section! But that section needs number of digests
  1024. * that are absent here. It is the most stupid thing I've ever seen
  1025. * in any file format.
  1026. *
  1027. * I hope there *WAS* some reason to do such shit...
  1028. */
  1029. p = rspamd_7zip_read_digest (task, p, end, arch, num_digests,
  1030. &digests_read);
  1031. break;
  1032. case kEnd:
  1033. goto end;
  1034. break;
  1035. default:
  1036. p = NULL;
  1037. msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
  1038. goto end;
  1039. break;
  1040. }
  1041. }
  1042. end:
  1043. if (pnum_nodigest) {
  1044. *pnum_nodigest = num_digests - digests_read;
  1045. }
  1046. if (pnum_folders) {
  1047. *pnum_folders = num_folders;
  1048. }
  1049. if (folder_nstreams) {
  1050. g_free (folder_nstreams);
  1051. }
  1052. return p;
  1053. }
  1054. static const guchar *
  1055. rspamd_7zip_read_substreams_info (struct rspamd_task *task,
  1056. const guchar *p, const guchar *end,
  1057. struct rspamd_archive *arch,
  1058. guint num_folders, guint num_nodigest)
  1059. {
  1060. guchar t;
  1061. guint i;
  1062. guint64 *folder_nstreams;
  1063. if (num_folders > 8192) {
  1064. /* Gah */
  1065. return NULL;
  1066. }
  1067. folder_nstreams = g_alloca (sizeof (guint64) * num_folders);
  1068. memset (folder_nstreams, 0, sizeof (guint64) * num_folders);
  1069. while (p != NULL && p < end) {
  1070. /*
  1071. * []
  1072. * BYTE NID::kNumUnPackStream; (0x0D)
  1073. * UINT64 NumUnPackStreamsInFolders[NumFolders];
  1074. * []
  1075. *
  1076. * []
  1077. * BYTE NID::kSize (0x09)
  1078. * UINT64 UnPackSizes[??]
  1079. * []
  1080. *
  1081. *
  1082. * []
  1083. * BYTE NID::kCRC (0x0A)
  1084. * Digests[Number of streams with unknown CRC]
  1085. * []
  1086. */
  1087. t = *p;
  1088. SZ_SKIP_BYTES(1);
  1089. msg_debug_archive ("7zip: read substream info %xc", t);
  1090. switch (t) {
  1091. case kNumUnPackStream:
  1092. for (i = 0; i < num_folders; i ++) {
  1093. guint64 tmp;
  1094. SZ_READ_VINT (tmp);
  1095. folder_nstreams[i] = tmp;
  1096. }
  1097. break;
  1098. case kCRC:
  1099. /*
  1100. * Read the comment in the rspamd_7zip_read_coders_info
  1101. */
  1102. p = rspamd_7zip_read_digest (task, p, end, arch, num_nodigest,
  1103. NULL);
  1104. break;
  1105. case kSize:
  1106. /*
  1107. * Another brain damaged logic, but we have to support it
  1108. * as there are no ways to proceed without it.
  1109. * In fact, it is just absent in the real life...
  1110. */
  1111. for (i = 0; i < num_folders; i ++) {
  1112. for (guint j = 0; j < folder_nstreams[i]; j++) {
  1113. guint64 tmp;
  1114. SZ_READ_VINT (tmp); /* Who cares indeed */
  1115. }
  1116. }
  1117. break;
  1118. case kEnd:
  1119. goto end;
  1120. break;
  1121. default:
  1122. p = NULL;
  1123. msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
  1124. goto end;
  1125. break;
  1126. }
  1127. }
  1128. end:
  1129. return p;
  1130. }
  1131. static const guchar *
  1132. rspamd_7zip_read_main_streams_info (struct rspamd_task *task,
  1133. const guchar *p, const guchar *end,
  1134. struct rspamd_archive *arch)
  1135. {
  1136. guchar t;
  1137. guint num_folders = 0, unknown_digests = 0;
  1138. while (p != NULL && p < end) {
  1139. t = *p;
  1140. SZ_SKIP_BYTES(1);
  1141. msg_debug_archive ("7zip: read main streams info %xc", t);
  1142. /*
  1143. *
  1144. * []
  1145. * PackInfo
  1146. * []
  1147. * []
  1148. * CodersInfo
  1149. * []
  1150. *
  1151. * []
  1152. * SubStreamsInfo
  1153. * []
  1154. *
  1155. * BYTE NID::kEnd
  1156. */
  1157. switch (t) {
  1158. case kPackInfo:
  1159. p = rspamd_7zip_read_pack_info (task, p, end, arch);
  1160. break;
  1161. case kUnPackInfo:
  1162. p = rspamd_7zip_read_coders_info (task, p, end, arch, &num_folders,
  1163. &unknown_digests);
  1164. break;
  1165. case kSubStreamsInfo:
  1166. p = rspamd_7zip_read_substreams_info (task, p, end, arch, num_folders,
  1167. unknown_digests);
  1168. break;
  1169. break;
  1170. case kEnd:
  1171. goto end;
  1172. break;
  1173. default:
  1174. p = NULL;
  1175. msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
  1176. goto end;
  1177. break;
  1178. }
  1179. }
  1180. end:
  1181. return p;
  1182. }
  1183. static const guchar *
  1184. rspamd_7zip_read_archive_props (struct rspamd_task *task,
  1185. const guchar *p, const guchar *end,
  1186. struct rspamd_archive *arch)
  1187. {
  1188. guchar proptype;
  1189. guint64 proplen;
  1190. /*
  1191. * for (;;)
  1192. * {
  1193. * BYTE PropertyType;
  1194. * if (aType == 0)
  1195. * break;
  1196. * UINT64 PropertySize;
  1197. * BYTE PropertyData[PropertySize];
  1198. * }
  1199. */
  1200. if (p != NULL) {
  1201. proptype = *p;
  1202. SZ_SKIP_BYTES(1);
  1203. while (proptype != 0) {
  1204. SZ_READ_VINT(proplen);
  1205. if (p + proplen < end) {
  1206. p += proplen;
  1207. }
  1208. else {
  1209. return NULL;
  1210. }
  1211. proptype = *p;
  1212. SZ_SKIP_BYTES(1);
  1213. }
  1214. }
  1215. return p;
  1216. }
  1217. static GString *
  1218. rspamd_7zip_ucs2_to_utf8 (struct rspamd_task *task, const guchar *p,
  1219. const guchar *end)
  1220. {
  1221. GString *res;
  1222. goffset dest_pos = 0, src_pos = 0;
  1223. const gsize len = (end - p) / sizeof (guint16);
  1224. guint16 *up;
  1225. UChar32 wc;
  1226. UBool is_error = 0;
  1227. res = g_string_sized_new ((end - p) * 3 / 2 + sizeof (wc) + 1);
  1228. up = (guint16 *)p;
  1229. while (src_pos < len) {
  1230. U16_NEXT (up, src_pos, len, wc);
  1231. if (wc > 0) {
  1232. U8_APPEND (res->str, dest_pos,
  1233. res->allocated_len - 1,
  1234. wc, is_error);
  1235. }
  1236. if (is_error) {
  1237. g_string_free (res, TRUE);
  1238. return NULL;
  1239. }
  1240. }
  1241. g_assert (dest_pos < res->allocated_len);
  1242. res->len = dest_pos;
  1243. res->str[dest_pos] = '\0';
  1244. return res;
  1245. }
  1246. static const guchar *
  1247. rspamd_7zip_read_files_info (struct rspamd_task *task,
  1248. const guchar *p, const guchar *end,
  1249. struct rspamd_archive *arch)
  1250. {
  1251. guint64 nfiles = 0, sz, i;
  1252. guchar t, b;
  1253. struct rspamd_archive_file *fentry;
  1254. SZ_READ_VINT (nfiles);
  1255. for (;p != NULL && p < end;) {
  1256. t = *p;
  1257. SZ_SKIP_BYTES (1);
  1258. msg_debug_archive ("7zip: read file data type %xc", t);
  1259. if (t == kEnd) {
  1260. goto end;
  1261. }
  1262. /* This is SO SPECIAL, gah */
  1263. SZ_READ_VINT (sz);
  1264. switch (t) {
  1265. case kEmptyStream:
  1266. case kEmptyFile:
  1267. case kAnti: /* AntiFile, OMFG */
  1268. /* We don't care about these bits */
  1269. case kCTime:
  1270. case kATime:
  1271. case kMTime:
  1272. /* We don't care of these guys, but we still have to parse them, gah */
  1273. if (sz > 0) {
  1274. SZ_SKIP_BYTES (sz);
  1275. }
  1276. break;
  1277. case kName:
  1278. /* The most useful part in this whole bloody format */
  1279. b = *p; /* External flag */
  1280. SZ_SKIP_BYTES (1);
  1281. if (b) {
  1282. /* TODO: for the god sake, do something about external
  1283. * filenames...
  1284. */
  1285. guint64 tmp;
  1286. SZ_READ_VINT (tmp);
  1287. }
  1288. else {
  1289. for (i = 0; i < nfiles; i ++) {
  1290. /* Zero terminated wchar_t: happy converting... */
  1291. /* First, find terminator */
  1292. const guchar *fend = NULL, *tp = p;
  1293. GString *res;
  1294. while (tp < end - 1) {
  1295. if (*tp == 0 && *(tp + 1) == 0) {
  1296. fend = tp;
  1297. break;
  1298. }
  1299. tp += 2;
  1300. }
  1301. if (fend == NULL || fend - p == 0) {
  1302. /* Crap instead of fname */
  1303. msg_debug_archive ("bad 7zip name; %s", G_STRLOC);
  1304. goto end;
  1305. }
  1306. res = rspamd_7zip_ucs2_to_utf8 (task, p, fend);
  1307. if (res != NULL) {
  1308. fentry = g_malloc0 (sizeof (*fentry));
  1309. fentry->fname = res;
  1310. g_ptr_array_add (arch->files, fentry);
  1311. msg_debug_archive ("7zip: found file %v", res);
  1312. }
  1313. else {
  1314. msg_debug_archive ("bad 7zip name; %s", G_STRLOC);
  1315. }
  1316. /* Skip zero terminating character */
  1317. p = fend + 2;
  1318. }
  1319. }
  1320. break;
  1321. case kDummy:
  1322. case kWinAttributes:
  1323. if (sz > 0) {
  1324. SZ_SKIP_BYTES (sz);
  1325. }
  1326. break;
  1327. default:
  1328. p = NULL;
  1329. msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
  1330. goto end;
  1331. break;
  1332. }
  1333. }
  1334. end:
  1335. return p;
  1336. }
  1337. static const guchar *
  1338. rspamd_7zip_read_next_section (struct rspamd_task *task,
  1339. const guchar *p, const guchar *end,
  1340. struct rspamd_archive *arch)
  1341. {
  1342. guchar t = *p;
  1343. SZ_SKIP_BYTES(1);
  1344. msg_debug_archive ("7zip: read section %xc", t);
  1345. switch (t) {
  1346. case kHeader:
  1347. /* We just skip byte and go further */
  1348. break;
  1349. case kEncodedHeader:
  1350. /*
  1351. * In fact, headers are just packed, but we assume it as
  1352. * encrypted to distinguish from the normal archives
  1353. */
  1354. msg_debug_archive ("7zip: encoded header, needs to be uncompressed");
  1355. arch->flags |= RSPAMD_ARCHIVE_CANNOT_READ;
  1356. p = NULL; /* Cannot get anything useful */
  1357. break;
  1358. case kArchiveProperties:
  1359. p = rspamd_7zip_read_archive_props (task, p, end, arch);
  1360. break;
  1361. case kMainStreamsInfo:
  1362. p = rspamd_7zip_read_main_streams_info (task, p, end, arch);
  1363. break;
  1364. case kAdditionalStreamsInfo:
  1365. p = rspamd_7zip_read_main_streams_info (task, p, end, arch);
  1366. break;
  1367. case kFilesInfo:
  1368. p = rspamd_7zip_read_files_info (task, p, end, arch);
  1369. break;
  1370. case kEnd:
  1371. p = NULL;
  1372. msg_debug_archive ("7zip: read final section");
  1373. break;
  1374. default:
  1375. p = NULL;
  1376. msg_debug_archive ("bad 7zip type: %xc; %s", t, G_STRLOC);
  1377. break;
  1378. }
  1379. return p;
  1380. }
  1381. static void
  1382. rspamd_archive_process_7zip (struct rspamd_task *task,
  1383. struct rspamd_mime_part *part)
  1384. {
  1385. struct rspamd_archive *arch;
  1386. const guchar *start, *p, *end;
  1387. const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
  1388. guint64 section_offset = 0, section_length = 0;
  1389. start = part->parsed_data.begin;
  1390. p = start;
  1391. end = p + part->parsed_data.len;
  1392. if (end - p <= sizeof (guint64) + sizeof (guint32) ||
  1393. memcmp (p, sz_magic, sizeof (sz_magic)) != 0) {
  1394. msg_debug_archive ("7z archive is invalid (no 7z magic)");
  1395. return;
  1396. }
  1397. arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
  1398. arch->files = g_ptr_array_new ();
  1399. arch->type = RSPAMD_ARCHIVE_7ZIP;
  1400. rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
  1401. arch);
  1402. /* Magic (6 bytes) + version (2 bytes) + crc32 (4 bytes) */
  1403. p += sizeof (guint64) + sizeof (guint32);
  1404. SZ_READ_UINT64(section_offset);
  1405. SZ_READ_UINT64(section_length);
  1406. if (end - p > sizeof (guint32)) {
  1407. p += sizeof (guint32);
  1408. }
  1409. else {
  1410. msg_debug_archive ("7z archive is invalid (truncated crc)");
  1411. return;
  1412. }
  1413. if (end - p > section_offset) {
  1414. p += section_offset;
  1415. }
  1416. else {
  1417. msg_debug_archive ("7z archive is invalid (incorrect section offset)");
  1418. return;
  1419. }
  1420. while ((p = rspamd_7zip_read_next_section (task, p, end, arch)) != NULL);
  1421. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  1422. part->specific.arch = arch;
  1423. if (part->cd != NULL) {
  1424. arch->archive_name = &part->cd->filename;
  1425. }
  1426. arch->size = part->parsed_data.len;
  1427. }
  1428. static void
  1429. rspamd_archive_process_gzip (struct rspamd_task *task,
  1430. struct rspamd_mime_part *part) {
  1431. struct rspamd_archive *arch;
  1432. const guchar *start, *p, *end;
  1433. const guchar gz_magic[] = {0x1F, 0x8B};
  1434. guchar flags;
  1435. start = part->parsed_data.begin;
  1436. p = start;
  1437. end = p + part->parsed_data.len;
  1438. if (end - p <= 10 || memcmp (p, gz_magic, sizeof (gz_magic)) != 0) {
  1439. msg_debug_archive ("gzip archive is invalid (no gzip magic)");
  1440. return;
  1441. }
  1442. arch = rspamd_mempool_alloc0 (task->task_pool, sizeof (*arch));
  1443. arch->files = g_ptr_array_sized_new (1);
  1444. arch->type = RSPAMD_ARCHIVE_GZIP;
  1445. rspamd_mempool_add_destructor (task->task_pool, rspamd_archive_dtor,
  1446. arch);
  1447. flags = p[3];
  1448. if (flags & (1u << 5)) {
  1449. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  1450. }
  1451. if (flags & (1u << 3)) {
  1452. /* We have file name presented in archive, try to use it */
  1453. if (flags & (1u << 1)) {
  1454. /* Multipart */
  1455. p += 12;
  1456. }
  1457. else {
  1458. p += 10;
  1459. }
  1460. if (flags & (1u << 2)) {
  1461. /* Optional section */
  1462. guint16 optlen = 0;
  1463. RAR_READ_UINT16 (optlen);
  1464. if (end <= p + optlen) {
  1465. msg_debug_archive ("gzip archive is invalid, bad extra length: %d",
  1466. (int)optlen);
  1467. return;
  1468. }
  1469. p += optlen;
  1470. }
  1471. /* Read file name */
  1472. const guchar *fname_start = p;
  1473. while (p < end) {
  1474. if (*p == '\0') {
  1475. if (p > fname_start) {
  1476. struct rspamd_archive_file *f;
  1477. f = g_malloc0 (sizeof (*f));
  1478. f->fname = rspamd_archive_file_try_utf (task, fname_start,
  1479. p - fname_start);
  1480. if (f->fname) {
  1481. g_ptr_array_add (arch->files, f);
  1482. }
  1483. else {
  1484. /* Invalid filename, skip */
  1485. g_free (f);
  1486. }
  1487. goto set;
  1488. }
  1489. }
  1490. p ++;
  1491. }
  1492. /* Wrong filename, not zero terminated */
  1493. msg_debug_archive ("gzip archive is invalid, bad filename at pos %d",
  1494. (int)(p - start));
  1495. return;
  1496. }
  1497. /* Fallback, we need to extract file name from archive name if possible */
  1498. if (part->cd && part->cd->filename.len > 0) {
  1499. const gchar *dot_pos, *slash_pos;
  1500. dot_pos = rspamd_memrchr (part->cd->filename.begin, '.',
  1501. part->cd->filename.len);
  1502. if (dot_pos) {
  1503. struct rspamd_archive_file *f;
  1504. slash_pos = rspamd_memrchr (part->cd->filename.begin, '/',
  1505. part->cd->filename.len);
  1506. if (slash_pos && slash_pos < dot_pos) {
  1507. f = g_malloc0 (sizeof (*f));
  1508. f->fname = g_string_sized_new (dot_pos - slash_pos);
  1509. g_string_append_len (f->fname, slash_pos + 1,
  1510. dot_pos - slash_pos - 1);
  1511. msg_debug_archive ("fallback to gzip filename based on cd: %v",
  1512. f->fname);
  1513. g_ptr_array_add (arch->files, f);
  1514. goto set;
  1515. }
  1516. else {
  1517. const gchar *fname_start = part->cd->filename.begin;
  1518. f = g_malloc0 (sizeof (*f));
  1519. if (memchr (fname_start, '.', part->cd->filename.len) != dot_pos) {
  1520. /* Double dots, something like foo.exe.gz */
  1521. f->fname = g_string_sized_new (dot_pos - fname_start);
  1522. g_string_append_len (f->fname, fname_start,
  1523. dot_pos - fname_start);
  1524. }
  1525. else {
  1526. /* Single dot, something like foo.gzz */
  1527. f->fname = g_string_sized_new (part->cd->filename.len);
  1528. g_string_append_len (f->fname, fname_start,
  1529. part->cd->filename.len);
  1530. }
  1531. msg_debug_archive ("fallback to gzip filename based on cd: %v",
  1532. f->fname);
  1533. g_ptr_array_add (arch->files, f);
  1534. goto set;
  1535. }
  1536. }
  1537. }
  1538. return;
  1539. set:
  1540. /* Set archive data */
  1541. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  1542. part->specific.arch = arch;
  1543. if (part->cd) {
  1544. arch->archive_name = &part->cd->filename;
  1545. }
  1546. arch->size = part->parsed_data.len;
  1547. }
  1548. static gboolean
  1549. rspamd_archive_cheat_detect (struct rspamd_mime_part *part, const gchar *str,
  1550. const guchar *magic_start, gsize magic_len)
  1551. {
  1552. struct rspamd_content_type *ct;
  1553. const gchar *p;
  1554. rspamd_ftok_t srch, *fname;
  1555. ct = part->ct;
  1556. RSPAMD_FTOK_ASSIGN (&srch, "application");
  1557. if (ct && ct->type.len && ct->subtype.len > 0 && rspamd_ftok_cmp (&ct->type,
  1558. &srch) == 0) {
  1559. if (rspamd_substring_search_caseless (ct->subtype.begin, ct->subtype.len,
  1560. str, strlen (str)) != -1) {
  1561. /* We still need to check magic, see #1848 */
  1562. if (magic_start != NULL) {
  1563. if (part->parsed_data.len > magic_len &&
  1564. memcmp (part->parsed_data.begin,
  1565. magic_start, magic_len) == 0) {
  1566. return TRUE;
  1567. }
  1568. /* No magic, refuse this type of archive */
  1569. return FALSE;
  1570. }
  1571. else {
  1572. return TRUE;
  1573. }
  1574. }
  1575. }
  1576. if (part->cd) {
  1577. fname = &part->cd->filename;
  1578. if (fname && fname->len > strlen (str)) {
  1579. p = fname->begin + fname->len - strlen (str);
  1580. if (rspamd_lc_cmp (p, str, strlen (str)) == 0) {
  1581. if (*(p - 1) == '.') {
  1582. if (magic_start != NULL) {
  1583. if (part->parsed_data.len > magic_len &&
  1584. memcmp (part->parsed_data.begin,
  1585. magic_start, magic_len) == 0) {
  1586. return TRUE;
  1587. }
  1588. /* No magic, refuse this type of archive */
  1589. return FALSE;
  1590. }
  1591. return TRUE;
  1592. }
  1593. }
  1594. }
  1595. if (magic_start != NULL) {
  1596. if (part->parsed_data.len > magic_len &&
  1597. memcmp (part->parsed_data.begin, magic_start, magic_len) == 0) {
  1598. return TRUE;
  1599. }
  1600. }
  1601. }
  1602. else {
  1603. if (magic_start != NULL) {
  1604. if (part->parsed_data.len > magic_len &&
  1605. memcmp (part->parsed_data.begin, magic_start, magic_len) == 0) {
  1606. return TRUE;
  1607. }
  1608. }
  1609. }
  1610. return FALSE;
  1611. }
  1612. void
  1613. rspamd_archives_process (struct rspamd_task *task)
  1614. {
  1615. guint i;
  1616. struct rspamd_mime_part *part;
  1617. const guchar rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07};
  1618. const guchar zip_magic[] = {0x50, 0x4b, 0x03, 0x04};
  1619. const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
  1620. const guchar gz_magic[] = {0x1F, 0x8B, 0x08};
  1621. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
  1622. if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
  1623. if (part->parsed_data.len > 0) {
  1624. if (rspamd_archive_cheat_detect (part, "zip",
  1625. zip_magic, sizeof (zip_magic))) {
  1626. rspamd_archive_process_zip (task, part);
  1627. }
  1628. else if (rspamd_archive_cheat_detect (part, "rar",
  1629. rar_magic, sizeof (rar_magic))) {
  1630. rspamd_archive_process_rar (task, part);
  1631. }
  1632. else if (rspamd_archive_cheat_detect (part, "7z",
  1633. sz_magic, sizeof (sz_magic))) {
  1634. rspamd_archive_process_7zip (task, part);
  1635. }
  1636. else if (rspamd_archive_cheat_detect (part, "gz",
  1637. gz_magic, sizeof (gz_magic))) {
  1638. rspamd_archive_process_gzip (task, part);
  1639. }
  1640. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) &&
  1641. part->part_type == RSPAMD_MIME_PART_ARCHIVE &&
  1642. part->specific.arch) {
  1643. struct rspamd_archive *arch = part->specific.arch;
  1644. msg_info_task ("found %s archive with incorrect content-type: %T/%T",
  1645. rspamd_archive_type_str (arch->type),
  1646. &part->ct->type, &part->ct->subtype);
  1647. if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
  1648. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  1649. }
  1650. }
  1651. }
  1652. }
  1653. }
  1654. }
  1655. const gchar *
  1656. rspamd_archive_type_str (enum rspamd_archive_type type)
  1657. {
  1658. const gchar *ret = "unknown";
  1659. switch (type) {
  1660. case RSPAMD_ARCHIVE_ZIP:
  1661. ret = "zip";
  1662. break;
  1663. case RSPAMD_ARCHIVE_RAR:
  1664. ret = "rar";
  1665. break;
  1666. case RSPAMD_ARCHIVE_7ZIP:
  1667. ret = "7z";
  1668. break;
  1669. case RSPAMD_ARCHIVE_GZIP:
  1670. ret = "gz";
  1671. break;
  1672. }
  1673. return ret;
  1674. }