You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

archives.c 50KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "message.h"
  18. #include "task.h"
  19. #include "archives.h"
  20. #include "libmime/mime_encoding.h"
  21. #include <unicode/uchar.h>
  22. #include <unicode/utf8.h>
  23. #include <unicode/utf16.h>
  24. #include <unicode/ucnv.h>
  25. #define msg_debug_archive(...) rspamd_conditional_debug_fast(NULL, NULL, \
  26. rspamd_archive_log_id, "archive", task->task_pool->tag.uid, \
  27. G_STRFUNC, \
  28. __VA_ARGS__)
  29. INIT_LOG_MODULE(archive)
  30. static void
  31. rspamd_archive_dtor(gpointer p)
  32. {
  33. struct rspamd_archive *arch = p;
  34. struct rspamd_archive_file *f;
  35. guint i;
  36. for (i = 0; i < arch->files->len; i++) {
  37. f = g_ptr_array_index(arch->files, i);
  38. if (f->fname) {
  39. g_string_free(f->fname, TRUE);
  40. }
  41. g_free(f);
  42. }
  43. g_ptr_array_free(arch->files, TRUE);
  44. }
  45. static bool
  46. rspamd_archive_file_try_utf(struct rspamd_task *task,
  47. struct rspamd_archive *arch,
  48. struct rspamd_archive_file *fentry,
  49. const gchar *in, gsize inlen)
  50. {
  51. const gchar *charset = NULL, *p, *end;
  52. GString *res;
  53. charset = rspamd_mime_charset_find_by_content(in, inlen, TRUE);
  54. if (charset) {
  55. UChar *tmp;
  56. UErrorCode uc_err = U_ZERO_ERROR;
  57. gint32 r, clen, dlen;
  58. struct rspamd_charset_converter *conv;
  59. UConverter *utf8_converter;
  60. conv = rspamd_mime_get_converter_cached(charset, task->task_pool,
  61. TRUE, &uc_err);
  62. utf8_converter = rspamd_get_utf8_converter();
  63. if (conv == NULL) {
  64. msg_info_task("cannot open converter for %s: %s",
  65. charset, u_errorName(uc_err));
  66. fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
  67. fentry->fname = g_string_new_len(in, inlen);
  68. return false;
  69. }
  70. tmp = g_malloc(sizeof(*tmp) * (inlen + 1));
  71. r = rspamd_converter_to_uchars(conv, tmp, inlen + 1,
  72. in, inlen, &uc_err);
  73. if (!U_SUCCESS(uc_err)) {
  74. msg_info_task("cannot convert data to unicode from %s: %s",
  75. charset, u_errorName(uc_err));
  76. g_free(tmp);
  77. fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
  78. fentry->fname = g_string_new_len(in, inlen);
  79. return NULL;
  80. }
  81. int i = 0;
  82. while (i < r) {
  83. UChar32 uc;
  84. U16_NEXT(tmp, i, r, uc);
  85. if (IS_ZERO_WIDTH_SPACE(uc) || u_iscntrl(uc)) {
  86. msg_info_task("control character in archive file name found: 0x%02xd "
  87. "(filename=%T)",
  88. uc, arch->archive_name);
  89. fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
  90. break;
  91. }
  92. }
  93. clen = ucnv_getMaxCharSize(utf8_converter);
  94. dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen);
  95. res = g_string_sized_new(dlen);
  96. r = ucnv_fromUChars(utf8_converter, res->str, dlen, tmp, r, &uc_err);
  97. if (!U_SUCCESS(uc_err)) {
  98. msg_info_task("cannot convert data from unicode from %s: %s",
  99. charset, u_errorName(uc_err));
  100. g_free(tmp);
  101. g_string_free(res, TRUE);
  102. fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
  103. fentry->fname = g_string_new_len(in, inlen);
  104. return NULL;
  105. }
  106. g_free(tmp);
  107. res->len = r;
  108. msg_debug_archive("converted from %s to UTF-8 inlen: %z, outlen: %d",
  109. charset, inlen, r);
  110. fentry->fname = res;
  111. }
  112. else {
  113. /* Convert unsafe characters to '?' */
  114. res = g_string_sized_new(inlen);
  115. p = in;
  116. end = in + inlen;
  117. while (p < end) {
  118. if (g_ascii_isgraph(*p)) {
  119. g_string_append_c(res, *p);
  120. }
  121. else {
  122. g_string_append_c(res, '?');
  123. if (*p < 0x7f && (g_ascii_iscntrl(*p) || *p == '\0')) {
  124. if (!(fentry->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED)) {
  125. msg_info_task("suspicious character in archive file name found: 0x%02xd "
  126. "(filename=%T)",
  127. (int) *p, arch->archive_name);
  128. fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
  129. }
  130. }
  131. }
  132. p++;
  133. }
  134. fentry->fname = res;
  135. }
  136. return true;
  137. }
  138. static void
  139. rspamd_archive_process_zip(struct rspamd_task *task,
  140. struct rspamd_mime_part *part)
  141. {
  142. const guchar *p, *start, *end, *eocd = NULL, *cd;
  143. const guint32 eocd_magic = 0x06054b50, cd_basic_len = 46;
  144. const guchar cd_magic[] = {0x50, 0x4b, 0x01, 0x02};
  145. const guint max_processed = 1024;
  146. guint32 cd_offset, cd_size, comp_size, uncomp_size, processed = 0;
  147. guint16 extra_len, fname_len, comment_len;
  148. struct rspamd_archive *arch;
  149. struct rspamd_archive_file *f = NULL;
  150. /* Zip files have interesting data at the end of archive */
  151. p = part->parsed_data.begin + part->parsed_data.len - 1;
  152. start = part->parsed_data.begin;
  153. end = p;
  154. /* Search for EOCD:
  155. * 22 bytes is a typical size of eocd without a comment and
  156. * end points one byte after the last character
  157. */
  158. p -= 21;
  159. while (p > start + sizeof(guint32)) {
  160. guint32 t;
  161. if (processed > max_processed) {
  162. break;
  163. }
  164. /* XXX: not an efficient approach */
  165. memcpy(&t, p, sizeof(t));
  166. if (GUINT32_FROM_LE(t) == eocd_magic) {
  167. eocd = p;
  168. break;
  169. }
  170. p--;
  171. processed++;
  172. }
  173. if (eocd == NULL) {
  174. /* Not a zip file */
  175. msg_info_task("zip archive is invalid (no EOCD)");
  176. return;
  177. }
  178. if (end - eocd < 21) {
  179. msg_info_task("zip archive is invalid (short EOCD)");
  180. return;
  181. }
  182. memcpy(&cd_size, eocd + 12, sizeof(cd_size));
  183. cd_size = GUINT32_FROM_LE(cd_size);
  184. memcpy(&cd_offset, eocd + 16, sizeof(cd_offset));
  185. cd_offset = GUINT32_FROM_LE(cd_offset);
  186. /* We need to check sanity as well */
  187. if (cd_offset + cd_size > (guint) (eocd - start)) {
  188. msg_info_task("zip archive is invalid (bad size/offset for CD)");
  189. return;
  190. }
  191. cd = start + cd_offset;
  192. arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
  193. arch->files = g_ptr_array_new();
  194. arch->type = RSPAMD_ARCHIVE_ZIP;
  195. if (part->cd) {
  196. arch->archive_name = &part->cd->filename;
  197. }
  198. rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
  199. arch);
  200. while (cd < start + cd_offset + cd_size) {
  201. guint16 flags;
  202. /* Read central directory record */
  203. if (eocd - cd < cd_basic_len ||
  204. memcmp(cd, cd_magic, sizeof(cd_magic)) != 0) {
  205. msg_info_task("zip archive is invalid (bad cd record)");
  206. return;
  207. }
  208. memcpy(&flags, cd + 8, sizeof(guint16));
  209. flags = GUINT16_FROM_LE(flags);
  210. memcpy(&comp_size, cd + 20, sizeof(guint32));
  211. comp_size = GUINT32_FROM_LE(comp_size);
  212. memcpy(&uncomp_size, cd + 24, sizeof(guint32));
  213. uncomp_size = GUINT32_FROM_LE(uncomp_size);
  214. memcpy(&fname_len, cd + 28, sizeof(fname_len));
  215. fname_len = GUINT16_FROM_LE(fname_len);
  216. memcpy(&extra_len, cd + 30, sizeof(extra_len));
  217. extra_len = GUINT16_FROM_LE(extra_len);
  218. memcpy(&comment_len, cd + 32, sizeof(comment_len));
  219. comment_len = GUINT16_FROM_LE(comment_len);
  220. if (cd + fname_len + comment_len + extra_len + cd_basic_len > eocd) {
  221. msg_info_task("zip archive is invalid (too large cd record)");
  222. return;
  223. }
  224. f = g_malloc0(sizeof(*f));
  225. rspamd_archive_file_try_utf(task, arch, f, cd + cd_basic_len, fname_len);
  226. f->compressed_size = comp_size;
  227. f->uncompressed_size = uncomp_size;
  228. if (flags & 0x41u) {
  229. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  230. }
  231. if (f->fname) {
  232. if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
  233. arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
  234. }
  235. g_ptr_array_add(arch->files, f);
  236. msg_debug_archive("found file in zip archive: %v", f->fname);
  237. }
  238. else {
  239. g_free(f);
  240. return;
  241. }
  242. /* Process extra fields */
  243. const guchar *extra = cd + fname_len + cd_basic_len;
  244. p = extra;
  245. while (p + sizeof(guint16) * 2 < extra + extra_len) {
  246. guint16 hid, hlen;
  247. memcpy(&hid, p, sizeof(guint16));
  248. hid = GUINT16_FROM_LE(hid);
  249. memcpy(&hlen, p + sizeof(guint16), sizeof(guint16));
  250. hlen = GUINT16_FROM_LE(hlen);
  251. if (hid == 0x0017) {
  252. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  253. }
  254. p += hlen + sizeof(guint16) * 2;
  255. }
  256. cd += fname_len + comment_len + extra_len + cd_basic_len;
  257. }
  258. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  259. part->specific.arch = arch;
  260. arch->size = part->parsed_data.len;
  261. }
  262. static inline gint
  263. rspamd_archive_rar_read_vint(const guchar *start, gsize remain, guint64 *res)
  264. {
  265. /*
  266. * From http://www.rarlab.com/technote.htm:
  267. * Variable length integer. Can include one or more bytes, where
  268. * lower 7 bits of every byte contain integer data and highest bit
  269. * in every byte is the continuation flag.
  270. * If highest bit is 0, this is the last byte in sequence.
  271. * So first byte contains 7 least significant bits of integer and
  272. * continuation flag. Second byte, if present, contains next 7 bits and so on.
  273. */
  274. guint64 t = 0;
  275. guint shift = 0;
  276. const guchar *p = start;
  277. while (remain > 0 && shift <= 57) {
  278. if (*p & 0x80) {
  279. t |= ((guint64) (*p & 0x7f)) << shift;
  280. }
  281. else {
  282. t |= ((guint64) (*p & 0x7f)) << shift;
  283. p++;
  284. break;
  285. }
  286. shift += 7;
  287. p++;
  288. remain--;
  289. }
  290. if (remain == 0 || shift > 64) {
  291. return -1;
  292. }
  293. *res = GUINT64_FROM_LE(t);
  294. return p - start;
  295. }
  296. #define RAR_SKIP_BYTES(n) \
  297. do { \
  298. if ((n) <= 0) { \
  299. msg_debug_archive("rar archive is invalid (bad skip value)"); \
  300. return; \
  301. } \
  302. if ((gsize) (end - p) < (n)) { \
  303. msg_debug_archive("rar archive is invalid (truncated)"); \
  304. return; \
  305. } \
  306. p += (n); \
  307. } while (0)
  308. #define RAR_READ_VINT() \
  309. do { \
  310. r = rspamd_archive_rar_read_vint(p, end - p, &vint); \
  311. if (r == -1) { \
  312. msg_debug_archive("rar archive is invalid (bad vint)"); \
  313. return; \
  314. } \
  315. else if (r == 0) { \
  316. msg_debug_archive("rar archive is invalid (BAD vint offset)"); \
  317. return; \
  318. } \
  319. } while (0)
  320. #define RAR_READ_VINT_SKIP() \
  321. do { \
  322. r = rspamd_archive_rar_read_vint(p, end - p, &vint); \
  323. if (r == -1) { \
  324. msg_debug_archive("rar archive is invalid (bad vint)"); \
  325. return; \
  326. } \
  327. p += r; \
  328. } while (0)
  329. #define RAR_READ_UINT16(n) \
  330. do { \
  331. if (end - p < (glong) sizeof(guint16)) { \
  332. msg_debug_archive("rar archive is invalid (bad int16)"); \
  333. return; \
  334. } \
  335. n = p[0] + (p[1] << 8); \
  336. p += sizeof(guint16); \
  337. } while (0)
  338. #define RAR_READ_UINT32(n) \
  339. do { \
  340. if (end - p < (glong) sizeof(guint32)) { \
  341. msg_debug_archive("rar archive is invalid (bad int32)"); \
  342. return; \
  343. } \
  344. n = (guint) p[0] + ((guint) p[1] << 8) + ((guint) p[2] << 16) + ((guint) p[3] << 24); \
  345. p += sizeof(guint32); \
  346. } while (0)
  347. static void
  348. rspamd_archive_process_rar_v4(struct rspamd_task *task, const guchar *start,
  349. const guchar *end, struct rspamd_mime_part *part)
  350. {
  351. const guchar *p = start, *start_section;
  352. guint8 type;
  353. guint flags;
  354. guint64 sz, comp_sz = 0, uncomp_sz = 0;
  355. struct rspamd_archive *arch;
  356. struct rspamd_archive_file *f;
  357. arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
  358. arch->files = g_ptr_array_new();
  359. arch->type = RSPAMD_ARCHIVE_RAR;
  360. if (part->cd) {
  361. arch->archive_name = &part->cd->filename;
  362. }
  363. rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
  364. arch);
  365. while (p < end) {
  366. /* Crc16 */
  367. start_section = p;
  368. RAR_SKIP_BYTES(sizeof(guint16));
  369. type = *p;
  370. p++;
  371. RAR_READ_UINT16(flags);
  372. if (type == 0x73) {
  373. /* Main header, check for encryption */
  374. if (flags & 0x80) {
  375. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  376. goto end;
  377. }
  378. }
  379. RAR_READ_UINT16(sz);
  380. if (flags & 0x8000) {
  381. /* We also need to read ADD_SIZE element */
  382. guint32 tmp;
  383. RAR_READ_UINT32(tmp);
  384. sz += tmp;
  385. /* This is also used as PACK_SIZE */
  386. comp_sz = tmp;
  387. }
  388. if (sz == 0) {
  389. /* Zero sized block - error */
  390. msg_debug_archive("rar archive is invalid (zero size block)");
  391. return;
  392. }
  393. if (type == 0x74) {
  394. guint fname_len;
  395. /* File header */
  396. /* Uncompressed size */
  397. RAR_READ_UINT32(uncomp_sz);
  398. /* Skip to NAME_SIZE element */
  399. RAR_SKIP_BYTES(11);
  400. RAR_READ_UINT16(fname_len);
  401. if (fname_len == 0 || fname_len > (gsize) (end - p)) {
  402. msg_debug_archive("rar archive is invalid (bad filename size: %d)",
  403. fname_len);
  404. return;
  405. }
  406. /* Attrs */
  407. RAR_SKIP_BYTES(4);
  408. if (flags & 0x100) {
  409. /* We also need to read HIGH_PACK_SIZE */
  410. guint32 tmp;
  411. RAR_READ_UINT32(tmp);
  412. sz += tmp;
  413. comp_sz += tmp;
  414. /* HIGH_UNP_SIZE */
  415. RAR_READ_UINT32(tmp);
  416. uncomp_sz += tmp;
  417. }
  418. f = g_malloc0(sizeof(*f));
  419. if (flags & 0x200) {
  420. /* We have unicode + normal version */
  421. guchar *tmp;
  422. tmp = memchr(p, '\0', fname_len);
  423. if (tmp != NULL) {
  424. /* Just use ASCII version */
  425. rspamd_archive_file_try_utf(task, arch, f, p, tmp - p);
  426. msg_debug_archive("found ascii filename in rarv4 archive: %v",
  427. f->fname);
  428. }
  429. else {
  430. /* We have UTF8 filename, use it as is */
  431. rspamd_archive_file_try_utf(task, arch, f, p, fname_len);
  432. msg_debug_archive("found utf filename in rarv4 archive: %v",
  433. f->fname);
  434. }
  435. }
  436. else {
  437. rspamd_archive_file_try_utf(task, arch, f, p, fname_len);
  438. msg_debug_archive("found ascii (old) filename in rarv4 archive: %v",
  439. f->fname);
  440. }
  441. f->compressed_size = comp_sz;
  442. f->uncompressed_size = uncomp_sz;
  443. if (flags & 0x4) {
  444. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  445. }
  446. if (f->fname) {
  447. if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
  448. arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
  449. }
  450. g_ptr_array_add(arch->files, f);
  451. }
  452. else {
  453. g_free(f);
  454. }
  455. }
  456. p = start_section;
  457. RAR_SKIP_BYTES(sz);
  458. }
  459. end:
  460. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  461. part->specific.arch = arch;
  462. arch->size = part->parsed_data.len;
  463. }
  464. static void
  465. rspamd_archive_process_rar(struct rspamd_task *task,
  466. struct rspamd_mime_part *part)
  467. {
  468. const guchar *p, *end, *section_start;
  469. const guchar rar_v5_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00},
  470. rar_v4_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00};
  471. const guint rar_encrypted_header = 4, rar_main_header = 1,
  472. rar_file_header = 2;
  473. guint64 vint, sz, comp_sz = 0, uncomp_sz = 0, flags = 0, type = 0,
  474. extra_sz = 0;
  475. struct rspamd_archive *arch;
  476. struct rspamd_archive_file *f;
  477. gint r;
  478. p = part->parsed_data.begin;
  479. end = p + part->parsed_data.len;
  480. if ((gsize) (end - p) <= sizeof(rar_v5_magic)) {
  481. msg_debug_archive("rar archive is invalid (too small)");
  482. return;
  483. }
  484. if (memcmp(p, rar_v5_magic, sizeof(rar_v5_magic)) == 0) {
  485. p += sizeof(rar_v5_magic);
  486. }
  487. else if (memcmp(p, rar_v4_magic, sizeof(rar_v4_magic)) == 0) {
  488. p += sizeof(rar_v4_magic);
  489. rspamd_archive_process_rar_v4(task, p, end, part);
  490. return;
  491. }
  492. else {
  493. msg_debug_archive("rar archive is invalid (no rar magic)");
  494. return;
  495. }
  496. /* Rar v5 format */
  497. arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
  498. arch->files = g_ptr_array_new();
  499. arch->type = RSPAMD_ARCHIVE_RAR;
  500. if (part->cd) {
  501. arch->archive_name = &part->cd->filename;
  502. }
  503. rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
  504. arch);
  505. /* Now we can have either encryption header or archive header */
  506. /* Crc 32 */
  507. RAR_SKIP_BYTES(sizeof(guint32));
  508. /* Size */
  509. RAR_READ_VINT_SKIP();
  510. sz = vint;
  511. /* Type */
  512. section_start = p;
  513. RAR_READ_VINT_SKIP();
  514. type = vint;
  515. /* Header flags */
  516. RAR_READ_VINT_SKIP();
  517. flags = vint;
  518. if (flags & 0x1) {
  519. /* Have extra zone */
  520. RAR_READ_VINT_SKIP();
  521. }
  522. if (flags & 0x2) {
  523. /* Data zone is presented */
  524. RAR_READ_VINT_SKIP();
  525. sz += vint;
  526. }
  527. if (type == rar_encrypted_header) {
  528. /* We can't read any further information as archive is encrypted */
  529. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  530. goto end;
  531. }
  532. else if (type != rar_main_header) {
  533. msg_debug_archive("rar archive is invalid (bad main header)");
  534. return;
  535. }
  536. /* Nothing useful in main header */
  537. p = section_start;
  538. RAR_SKIP_BYTES(sz);
  539. while (p < end) {
  540. gboolean has_extra = FALSE;
  541. /* Read the next header */
  542. /* Crc 32 */
  543. RAR_SKIP_BYTES(sizeof(guint32));
  544. /* Size */
  545. RAR_READ_VINT_SKIP();
  546. sz = vint;
  547. if (sz == 0) {
  548. /* Zero sized block - error */
  549. msg_debug_archive("rar archive is invalid (zero size block)");
  550. return;
  551. }
  552. section_start = p;
  553. /* Type */
  554. RAR_READ_VINT_SKIP();
  555. type = vint;
  556. /* Header flags */
  557. RAR_READ_VINT_SKIP();
  558. flags = vint;
  559. if (flags & 0x1) {
  560. /* Have extra zone */
  561. RAR_READ_VINT_SKIP();
  562. extra_sz = vint;
  563. has_extra = TRUE;
  564. }
  565. if (flags & 0x2) {
  566. /* Data zone is presented */
  567. RAR_READ_VINT_SKIP();
  568. sz += vint;
  569. comp_sz = vint;
  570. }
  571. if (type != rar_file_header) {
  572. p = section_start;
  573. RAR_SKIP_BYTES(sz);
  574. }
  575. else {
  576. /* We have a file header, go forward */
  577. guint64 fname_len;
  578. bool is_directory = false;
  579. /* File header specific flags */
  580. RAR_READ_VINT_SKIP();
  581. flags = vint;
  582. /* Unpacked size */
  583. RAR_READ_VINT_SKIP();
  584. uncomp_sz = vint;
  585. /* Attributes */
  586. RAR_READ_VINT_SKIP();
  587. if (flags & 0x2) {
  588. /* Unix mtime */
  589. RAR_SKIP_BYTES(sizeof(guint32));
  590. }
  591. if (flags & 0x4) {
  592. /* Crc32 */
  593. RAR_SKIP_BYTES(sizeof(guint32));
  594. }
  595. if (flags & 0x1) {
  596. /* Ignore directories for sanity purposes */
  597. is_directory = true;
  598. msg_debug_archive("skip directory record in a rar archive");
  599. }
  600. if (!is_directory) {
  601. /* Compression */
  602. RAR_READ_VINT_SKIP();
  603. /* Host OS */
  604. RAR_READ_VINT_SKIP();
  605. /* Filename length (finally!) */
  606. RAR_READ_VINT_SKIP();
  607. fname_len = vint;
  608. if (fname_len == 0 || fname_len > (gsize) (end - p)) {
  609. msg_debug_archive("rar archive is invalid (bad filename size)");
  610. return;
  611. }
  612. f = g_malloc0(sizeof(*f));
  613. f->uncompressed_size = uncomp_sz;
  614. f->compressed_size = comp_sz;
  615. rspamd_archive_file_try_utf(task, arch, f, p, fname_len);
  616. if (f->fname) {
  617. msg_debug_archive("added rarv5 file: %v", f->fname);
  618. g_ptr_array_add(arch->files, f);
  619. if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
  620. arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
  621. }
  622. }
  623. else {
  624. g_free(f);
  625. f = NULL;
  626. }
  627. if (f && has_extra && extra_sz > 0 &&
  628. p + fname_len + extra_sz < end) {
  629. /* Try to find encryption record in extra field */
  630. const guchar *ex = p + fname_len;
  631. while (ex < p + extra_sz) {
  632. const guchar *t;
  633. gint64 cur_sz = 0, sec_type = 0;
  634. r = rspamd_archive_rar_read_vint(ex, extra_sz, &cur_sz);
  635. if (r == -1) {
  636. msg_debug_archive("rar archive is invalid (bad vint)");
  637. return;
  638. }
  639. t = ex + r;
  640. r = rspamd_archive_rar_read_vint(t, extra_sz - r, &sec_type);
  641. if (r == -1) {
  642. msg_debug_archive("rar archive is invalid (bad vint)");
  643. return;
  644. }
  645. if (sec_type == 0x01) {
  646. f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
  647. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  648. break;
  649. }
  650. ex += cur_sz;
  651. }
  652. }
  653. }
  654. /* Restore p to the beginning of the header */
  655. p = section_start;
  656. RAR_SKIP_BYTES(sz);
  657. }
  658. }
  659. end:
  660. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  661. part->specific.arch = arch;
  662. arch->size = part->parsed_data.len;
  663. }
  664. static inline gint
  665. rspamd_archive_7zip_read_vint(const guchar *start, gsize remain, guint64 *res)
  666. {
  667. /*
  668. * REAL_UINT64 means real UINT64.
  669. * UINT64 means real UINT64 encoded with the following scheme:
  670. *
  671. * Size of encoding sequence depends from first byte:
  672. * First_Byte Extra_Bytes Value
  673. * (binary)
  674. * 0xxxxxxx : ( xxxxxxx )
  675. * 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y
  676. * 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y
  677. * ...
  678. * 1111110x BYTE y[6] : ( x << (8 * 6)) + y
  679. * 11111110 BYTE y[7] : y
  680. * 11111111 BYTE y[8] : y
  681. */
  682. guchar t;
  683. if (remain == 0) {
  684. return -1;
  685. }
  686. t = *start;
  687. if (!isset(&t, 7)) {
  688. /* Trivial case */
  689. *res = t;
  690. return 1;
  691. }
  692. else if (t == 0xFF) {
  693. if (remain >= sizeof(guint64) + 1) {
  694. memcpy(res, start + 1, sizeof(guint64));
  695. *res = GUINT64_FROM_LE(*res);
  696. return sizeof(guint64) + 1;
  697. }
  698. }
  699. else {
  700. gint cur_bit = 6, intlen = 1;
  701. const guchar bmask = 0xFF;
  702. guint64 tgt;
  703. while (cur_bit > 0) {
  704. if (!isset(&t, cur_bit)) {
  705. if (remain >= intlen + 1) {
  706. memcpy(&tgt, start + 1, intlen);
  707. tgt = GUINT64_FROM_LE(tgt);
  708. /* Shift back */
  709. tgt >>= sizeof(tgt) - NBBY * intlen;
  710. /* Add masked value */
  711. tgt += (guint64) (t & (bmask >> (NBBY - cur_bit)))
  712. << (NBBY * intlen);
  713. *res = tgt;
  714. return intlen + 1;
  715. }
  716. }
  717. cur_bit--;
  718. intlen++;
  719. }
  720. }
  721. return -1;
  722. }
  723. #define SZ_READ_VINT_SKIP() \
  724. do { \
  725. r = rspamd_archive_7zip_read_vint(p, end - p, &vint); \
  726. if (r == -1) { \
  727. msg_debug_archive("7z archive is invalid (bad vint)"); \
  728. return; \
  729. } \
  730. p += r; \
  731. } while (0)
  732. #define SZ_READ_VINT(var) \
  733. do { \
  734. int r; \
  735. r = rspamd_archive_7zip_read_vint(p, end - p, &(var)); \
  736. if (r == -1) { \
  737. msg_debug_archive("7z archive is invalid (bad vint): %s", G_STRLOC); \
  738. return NULL; \
  739. } \
  740. p += r; \
  741. } while (0)
  742. #define SZ_READ_UINT64(n) \
  743. do { \
  744. if (end - p < (goffset) sizeof(guint64)) { \
  745. msg_debug_archive("7zip archive is invalid (bad uint64): %s", G_STRLOC); \
  746. return; \
  747. } \
  748. memcpy(&(n), p, sizeof(guint64)); \
  749. n = GUINT64_FROM_LE(n); \
  750. p += sizeof(guint64); \
  751. } while (0)
/*
 * Advance `p` by `n` bytes when at least `n` bytes remain before `end`.
 *
 * Relies on `p` and `end` from the expansion site. When the data is too short
 * it logs a debug message (both counts are cast to gint for printing) and
 * executes `return NULL;` in the ENCLOSING function.
 * `n` is evaluated more than once, so it must not have side effects.
 * The bounds check happens here, not before: a caller that dereferences `p`
 * ahead of this macro has not been bounds-checked yet.
 */
#define SZ_SKIP_BYTES(n) \
	do { \
		if (end - p >= (n)) { \
			p += (n); \
		} \
		else { \
			msg_debug_archive("7zip archive is invalid (truncated); wanted to read %d bytes, %d avail: %s", (gint) (n), (gint) (end - p), G_STRLOC); \
			return NULL; \
		} \
	} while (0)
/*
 * One-byte markers found in a 7zip header. The rspamd_7zip_read_* parsers in
 * this file read a single byte from the input and switch on these values to
 * decide which record follows.
 */
enum rspamd_7zip_header_mark {
	kEnd = 0x00, /* terminates the current record list */
	kHeader = 0x01,
	kArchiveProperties = 0x02,
	kAdditionalStreamsInfo = 0x03,
	kMainStreamsInfo = 0x04,
	kFilesInfo = 0x05,
	kPackInfo = 0x06,
	kUnPackInfo = 0x07,
	kSubStreamsInfo = 0x08,
	kSize = 0x09,
	kCRC = 0x0A,
	kFolder = 0x0B,
	kCodersUnPackSize = 0x0C,
	kNumUnPackStream = 0x0D,
	kEmptyStream = 0x0E,
	kEmptyFile = 0x0F,
	kAnti = 0x10,
	kName = 0x11,
	kCTime = 0x12,
	kATime = 0x13,
	kMTime = 0x14,
	kWinAttributes = 0x15,
	kComment = 0x16,
	kEncodedHeader = 0x17, /* header itself is packed; treated as unreadable */
	kStartPos = 0x18,
	kDummy = 0x19,
};
/*
 * Codec identifiers that mark a coder as an encryption method. They are
 * compared against the codec id that rspamd_7zip_read_folder assembles from
 * the CodecId bytes, most significant byte first.
 */
#define _7Z_CRYPTO_MAIN_ZIP 0x06F10101        /* Main Zip crypto algo */
#define _7Z_CRYPTO_RAR_29 0x06F10303          /* Rar29 AES-128 + (modified SHA-1) */
#define _7Z_CRYPTO_AES_256_SHA_256 0x06F10701 /* AES-256 + SHA-256 */

/* True when codec_id is one of the ids above; codec_id is evaluated three times */
#define IS_SZ_ENCRYPTED(codec_id) (((codec_id) == _7Z_CRYPTO_MAIN_ZIP) || \
								   ((codec_id) == _7Z_CRYPTO_RAR_29) ||   \
								   ((codec_id) == _7Z_CRYPTO_AES_256_SHA_256))
  796. static const guchar *
  797. rspamd_7zip_read_bits(struct rspamd_task *task,
  798. const guchar *p, const guchar *end,
  799. struct rspamd_archive *arch, guint nbits,
  800. guint *pbits_set)
  801. {
  802. unsigned mask = 0, avail = 0, i;
  803. gboolean bit_set = 0;
  804. for (i = 0; i < nbits; i++) {
  805. if (mask == 0) {
  806. avail = *p;
  807. SZ_SKIP_BYTES(1);
  808. mask = 0x80;
  809. }
  810. bit_set = (avail & mask) ? 1 : 0;
  811. if (bit_set && pbits_set) {
  812. (*pbits_set)++;
  813. }
  814. mask >>= 1;
  815. }
  816. return p;
  817. }
  818. static const guchar *
  819. rspamd_7zip_read_digest(struct rspamd_task *task,
  820. const guchar *p, const guchar *end,
  821. struct rspamd_archive *arch,
  822. guint64 num_streams,
  823. guint *pdigest_read)
  824. {
  825. guchar all_defined = *p;
  826. guint64 i;
  827. guint num_defined = 0;
  828. /*
  829. * BYTE AllAreDefined
  830. * if (AllAreDefined == 0)
  831. * {
  832. * for(NumStreams)
  833. * BIT Defined
  834. * }
  835. * UINT32 CRCs[NumDefined]
  836. */
  837. SZ_SKIP_BYTES(1);
  838. if (all_defined) {
  839. num_defined = num_streams;
  840. }
  841. else {
  842. if (num_streams > 8192) {
  843. /* Gah */
  844. return NULL;
  845. }
  846. p = rspamd_7zip_read_bits(task, p, end, arch, num_streams, &num_defined);
  847. if (p == NULL) {
  848. return NULL;
  849. }
  850. }
  851. for (i = 0; i < num_defined; i++) {
  852. SZ_SKIP_BYTES(sizeof(guint32));
  853. }
  854. if (pdigest_read) {
  855. *pdigest_read = num_defined;
  856. }
  857. return p;
  858. }
  859. static const guchar *
  860. rspamd_7zip_read_pack_info(struct rspamd_task *task,
  861. const guchar *p, const guchar *end,
  862. struct rspamd_archive *arch)
  863. {
  864. guint64 pack_pos = 0, pack_streams = 0, i, cur_sz;
  865. guint num_digests = 0;
  866. guchar t;
  867. /*
  868. * UINT64 PackPos
  869. * UINT64 NumPackStreams
  870. *
  871. * []
  872. * BYTE NID::kSize (0x09)
  873. * UINT64 PackSizes[NumPackStreams]
  874. * []
  875. *
  876. * []
  877. * BYTE NID::kCRC (0x0A)
  878. * PackStreamDigests[NumPackStreams]
  879. * []
  880. * BYTE NID::kEnd
  881. */
  882. SZ_READ_VINT(pack_pos);
  883. SZ_READ_VINT(pack_streams);
  884. while (p != NULL && p < end) {
  885. t = *p;
  886. SZ_SKIP_BYTES(1);
  887. msg_debug_archive("7zip: read pack info %xc", t);
  888. switch (t) {
  889. case kSize:
  890. /* We need to skip pack_streams VINTS */
  891. for (i = 0; i < pack_streams; i++) {
  892. SZ_READ_VINT(cur_sz);
  893. }
  894. break;
  895. case kCRC:
  896. /* CRCs are more complicated */
  897. p = rspamd_7zip_read_digest(task, p, end, arch, pack_streams,
  898. &num_digests);
  899. break;
  900. case kEnd:
  901. goto end;
  902. break;
  903. default:
  904. p = NULL;
  905. msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
  906. goto end;
  907. break;
  908. }
  909. }
  910. end:
  911. return p;
  912. }
  913. static const guchar *
  914. rspamd_7zip_read_folder(struct rspamd_task *task,
  915. const guchar *p, const guchar *end,
  916. struct rspamd_archive *arch, guint *pnstreams, guint *ndigests)
  917. {
  918. guint64 ncoders = 0, i, j, noutstreams = 0, ninstreams = 0;
  919. SZ_READ_VINT(ncoders);
  920. for (i = 0; i < ncoders && p != NULL && p < end; i++) {
  921. guint64 sz, tmp;
  922. guchar t;
  923. /*
  924. * BYTE
  925. * {
  926. * 0:3 CodecIdSize
  927. * 4: Is Complex Coder
  928. * 5: There Are Attributes
  929. * 6: Reserved
  930. * 7: There are more alternative methods. (Not used anymore, must be 0).
  931. * }
  932. * BYTE CodecId[CodecIdSize]
  933. * if (Is Complex Coder)
  934. * {
  935. * UINT64 NumInStreams;
  936. * UINT64 NumOutStreams;
  937. * }
  938. * if (There Are Attributes)
  939. * {
  940. * UINT64 PropertiesSize
  941. * BYTE Properties[PropertiesSize]
  942. * }
  943. */
  944. t = *p;
  945. SZ_SKIP_BYTES(1);
  946. sz = t & 0xF;
  947. /* Codec ID */
  948. tmp = 0;
  949. for (j = 0; j < sz; j++) {
  950. tmp <<= 8;
  951. tmp += p[j];
  952. }
  953. msg_debug_archive("7zip: read codec id: %L", tmp);
  954. if (IS_SZ_ENCRYPTED(tmp)) {
  955. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  956. }
  957. SZ_SKIP_BYTES(sz);
  958. if (t & (1u << 4)) {
  959. /* Complex */
  960. SZ_READ_VINT(tmp); /* InStreams */
  961. ninstreams += tmp;
  962. SZ_READ_VINT(tmp); /* OutStreams */
  963. noutstreams += tmp;
  964. }
  965. else {
  966. /* XXX: is it correct ? */
  967. noutstreams++;
  968. ninstreams++;
  969. }
  970. if (t & (1u << 5)) {
  971. /* Attributes ... */
  972. SZ_READ_VINT(tmp); /* Size of attrs */
  973. SZ_SKIP_BYTES(tmp);
  974. }
  975. }
  976. if (noutstreams > 1) {
  977. /* BindPairs, WTF, huh */
  978. for (i = 0; i < noutstreams - 1; i++) {
  979. guint64 tmp;
  980. SZ_READ_VINT(tmp);
  981. SZ_READ_VINT(tmp);
  982. }
  983. }
  984. gint64 npacked = (gint64) ninstreams - (gint64) noutstreams + 1;
  985. msg_debug_archive("7zip: instreams=%L, outstreams=%L, packed=%L",
  986. ninstreams, noutstreams, npacked);
  987. if (npacked > 1) {
  988. /* Gah... */
  989. for (i = 0; i < npacked; i++) {
  990. guint64 tmp;
  991. SZ_READ_VINT(tmp);
  992. }
  993. }
  994. *pnstreams = noutstreams;
  995. (*ndigests) += npacked;
  996. return p;
  997. }
  998. static const guchar *
  999. rspamd_7zip_read_coders_info(struct rspamd_task *task,
  1000. const guchar *p, const guchar *end,
  1001. struct rspamd_archive *arch,
  1002. guint *pnum_folders, guint *pnum_nodigest)
  1003. {
  1004. guint64 num_folders = 0, i, tmp;
  1005. guchar t;
  1006. guint *folder_nstreams = NULL, num_digests = 0, digests_read = 0;
  1007. while (p != NULL && p < end) {
  1008. /*
  1009. * BYTE NID::kFolder (0x0B)
  1010. * UINT64 NumFolders
  1011. * BYTE External
  1012. * switch(External)
  1013. * {
  1014. * case 0:
  1015. * Folders[NumFolders]
  1016. * case 1:
  1017. * UINT64 DataStreamIndex
  1018. * }
  1019. * BYTE ID::kCodersUnPackSize (0x0C)
  1020. * for(Folders)
  1021. * for(Folder.NumOutStreams)
  1022. * UINT64 UnPackSize;
  1023. * []
  1024. * BYTE NID::kCRC (0x0A)
  1025. * UnPackDigests[NumFolders]
  1026. * []
  1027. * BYTE NID::kEnd
  1028. */
  1029. t = *p;
  1030. SZ_SKIP_BYTES(1);
  1031. msg_debug_archive("7zip: read coders info %xc", t);
  1032. switch (t) {
  1033. case kFolder:
  1034. SZ_READ_VINT(num_folders);
  1035. msg_debug_archive("7zip: nfolders=%L", num_folders);
  1036. if (*p != 0) {
  1037. /* External folders */
  1038. SZ_SKIP_BYTES(1);
  1039. SZ_READ_VINT(tmp);
  1040. }
  1041. else {
  1042. SZ_SKIP_BYTES(1);
  1043. if (num_folders > 8192) {
  1044. /* Gah */
  1045. return NULL;
  1046. }
  1047. if (folder_nstreams) {
  1048. g_free(folder_nstreams);
  1049. }
  1050. folder_nstreams = g_malloc(sizeof(int) * num_folders);
  1051. for (i = 0; i < num_folders && p != NULL && p < end; i++) {
  1052. p = rspamd_7zip_read_folder(task, p, end, arch,
  1053. &folder_nstreams[i], &num_digests);
  1054. }
  1055. }
  1056. break;
  1057. case kCodersUnPackSize:
  1058. for (i = 0; i < num_folders && p != NULL && p < end; i++) {
  1059. if (folder_nstreams) {
  1060. for (guint j = 0; j < folder_nstreams[i]; j++) {
  1061. SZ_READ_VINT(tmp); /* Unpacked size */
  1062. msg_debug_archive("7zip: unpacked size "
  1063. "(folder=%d, stream=%d) = %L",
  1064. (gint) i, j, tmp);
  1065. }
  1066. }
  1067. else {
  1068. msg_err_task("internal 7zip error");
  1069. }
  1070. }
  1071. break;
  1072. case kCRC:
  1073. /*
  1074. * Here are dragons. Spec tells that here there could be up
  1075. * to nfolders digests. However, according to the actual source
  1076. * code, in case of multiple out streams there should be digests
  1077. * for all out streams.
  1078. *
  1079. * In the real life (tm) it is even more idiotic: all these digests
  1080. * are in another section! But that section needs number of digests
  1081. * that are absent here. It is the most stupid thing I've ever seen
  1082. * in any file format.
  1083. *
  1084. * I hope there *WAS* some reason to do such shit...
  1085. */
  1086. p = rspamd_7zip_read_digest(task, p, end, arch, num_digests,
  1087. &digests_read);
  1088. break;
  1089. case kEnd:
  1090. goto end;
  1091. break;
  1092. default:
  1093. p = NULL;
  1094. msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
  1095. goto end;
  1096. break;
  1097. }
  1098. }
  1099. end:
  1100. if (pnum_nodigest) {
  1101. *pnum_nodigest = num_digests - digests_read;
  1102. }
  1103. if (pnum_folders) {
  1104. *pnum_folders = num_folders;
  1105. }
  1106. if (folder_nstreams) {
  1107. g_free(folder_nstreams);
  1108. }
  1109. return p;
  1110. }
  1111. static const guchar *
  1112. rspamd_7zip_read_substreams_info(struct rspamd_task *task,
  1113. const guchar *p, const guchar *end,
  1114. struct rspamd_archive *arch,
  1115. guint num_folders, guint num_nodigest)
  1116. {
  1117. guchar t;
  1118. guint i;
  1119. guint64 *folder_nstreams;
  1120. if (num_folders > 8192) {
  1121. /* Gah */
  1122. return NULL;
  1123. }
  1124. folder_nstreams = g_alloca(sizeof(guint64) * num_folders);
  1125. memset(folder_nstreams, 0, sizeof(guint64) * num_folders);
  1126. while (p != NULL && p < end) {
  1127. /*
  1128. * []
  1129. * BYTE NID::kNumUnPackStream; (0x0D)
  1130. * UINT64 NumUnPackStreamsInFolders[NumFolders];
  1131. * []
  1132. *
  1133. * []
  1134. * BYTE NID::kSize (0x09)
  1135. * UINT64 UnPackSizes[??]
  1136. * []
  1137. *
  1138. *
  1139. * []
  1140. * BYTE NID::kCRC (0x0A)
  1141. * Digests[Number of streams with unknown CRC]
  1142. * []
  1143. */
  1144. t = *p;
  1145. SZ_SKIP_BYTES(1);
  1146. msg_debug_archive("7zip: read substream info %xc", t);
  1147. switch (t) {
  1148. case kNumUnPackStream:
  1149. for (i = 0; i < num_folders; i++) {
  1150. guint64 tmp;
  1151. SZ_READ_VINT(tmp);
  1152. folder_nstreams[i] = tmp;
  1153. }
  1154. break;
  1155. case kCRC:
  1156. /*
  1157. * Read the comment in the rspamd_7zip_read_coders_info
  1158. */
  1159. p = rspamd_7zip_read_digest(task, p, end, arch, num_nodigest,
  1160. NULL);
  1161. break;
  1162. case kSize:
  1163. /*
  1164. * Another brain damaged logic, but we have to support it
  1165. * as there are no ways to proceed without it.
  1166. * In fact, it is just absent in the real life...
  1167. */
  1168. for (i = 0; i < num_folders; i++) {
  1169. for (guint j = 0; j < folder_nstreams[i]; j++) {
  1170. guint64 tmp;
  1171. SZ_READ_VINT(tmp); /* Who cares indeed */
  1172. }
  1173. }
  1174. break;
  1175. case kEnd:
  1176. goto end;
  1177. break;
  1178. default:
  1179. p = NULL;
  1180. msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
  1181. goto end;
  1182. break;
  1183. }
  1184. }
  1185. end:
  1186. return p;
  1187. }
  1188. static const guchar *
  1189. rspamd_7zip_read_main_streams_info(struct rspamd_task *task,
  1190. const guchar *p, const guchar *end,
  1191. struct rspamd_archive *arch)
  1192. {
  1193. guchar t;
  1194. guint num_folders = 0, unknown_digests = 0;
  1195. while (p != NULL && p < end) {
  1196. t = *p;
  1197. SZ_SKIP_BYTES(1);
  1198. msg_debug_archive("7zip: read main streams info %xc", t);
  1199. /*
  1200. *
  1201. * []
  1202. * PackInfo
  1203. * []
  1204. * []
  1205. * CodersInfo
  1206. * []
  1207. *
  1208. * []
  1209. * SubStreamsInfo
  1210. * []
  1211. *
  1212. * BYTE NID::kEnd
  1213. */
  1214. switch (t) {
  1215. case kPackInfo:
  1216. p = rspamd_7zip_read_pack_info(task, p, end, arch);
  1217. break;
  1218. case kUnPackInfo:
  1219. p = rspamd_7zip_read_coders_info(task, p, end, arch, &num_folders,
  1220. &unknown_digests);
  1221. break;
  1222. case kSubStreamsInfo:
  1223. p = rspamd_7zip_read_substreams_info(task, p, end, arch, num_folders,
  1224. unknown_digests);
  1225. break;
  1226. break;
  1227. case kEnd:
  1228. goto end;
  1229. break;
  1230. default:
  1231. p = NULL;
  1232. msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
  1233. goto end;
  1234. break;
  1235. }
  1236. }
  1237. end:
  1238. return p;
  1239. }
  1240. static const guchar *
  1241. rspamd_7zip_read_archive_props(struct rspamd_task *task,
  1242. const guchar *p, const guchar *end,
  1243. struct rspamd_archive *arch)
  1244. {
  1245. guchar proptype;
  1246. guint64 proplen;
  1247. /*
  1248. * for (;;)
  1249. * {
  1250. * BYTE PropertyType;
  1251. * if (aType == 0)
  1252. * break;
  1253. * UINT64 PropertySize;
  1254. * BYTE PropertyData[PropertySize];
  1255. * }
  1256. */
  1257. if (p != NULL) {
  1258. proptype = *p;
  1259. SZ_SKIP_BYTES(1);
  1260. while (proptype != 0) {
  1261. SZ_READ_VINT(proplen);
  1262. if (p + proplen < end) {
  1263. p += proplen;
  1264. }
  1265. else {
  1266. return NULL;
  1267. }
  1268. proptype = *p;
  1269. SZ_SKIP_BYTES(1);
  1270. }
  1271. }
  1272. return p;
  1273. }
  1274. static GString *
  1275. rspamd_7zip_ucs2_to_utf8(struct rspamd_task *task, const guchar *p,
  1276. const guchar *end)
  1277. {
  1278. GString *res;
  1279. goffset dest_pos = 0, src_pos = 0;
  1280. const gsize len = (end - p) / sizeof(guint16);
  1281. guint16 *up;
  1282. UChar32 wc;
  1283. UBool is_error = 0;
  1284. res = g_string_sized_new((end - p) * 3 / 2 + sizeof(wc) + 1);
  1285. up = (guint16 *) p;
  1286. while (src_pos < len) {
  1287. U16_NEXT(up, src_pos, len, wc);
  1288. if (wc > 0) {
  1289. U8_APPEND(res->str, dest_pos,
  1290. res->allocated_len - 1,
  1291. wc, is_error);
  1292. }
  1293. if (is_error) {
  1294. g_string_free(res, TRUE);
  1295. return NULL;
  1296. }
  1297. }
  1298. g_assert(dest_pos < res->allocated_len);
  1299. res->len = dest_pos;
  1300. res->str[dest_pos] = '\0';
  1301. return res;
  1302. }
  1303. static const guchar *
  1304. rspamd_7zip_read_files_info(struct rspamd_task *task,
  1305. const guchar *p, const guchar *end,
  1306. struct rspamd_archive *arch)
  1307. {
  1308. guint64 nfiles = 0, sz, i;
  1309. guchar t, b;
  1310. struct rspamd_archive_file *fentry;
  1311. SZ_READ_VINT(nfiles);
  1312. for (; p != NULL && p < end;) {
  1313. t = *p;
  1314. SZ_SKIP_BYTES(1);
  1315. msg_debug_archive("7zip: read file data type %xc", t);
  1316. if (t == kEnd) {
  1317. goto end;
  1318. }
  1319. /* This is SO SPECIAL, gah */
  1320. SZ_READ_VINT(sz);
  1321. switch (t) {
  1322. case kEmptyStream:
  1323. case kEmptyFile:
  1324. case kAnti: /* AntiFile, OMFG */
  1325. /* We don't care about these bits */
  1326. case kCTime:
  1327. case kATime:
  1328. case kMTime:
  1329. /* We don't care of these guys, but we still have to parse them, gah */
  1330. if (sz > 0) {
  1331. SZ_SKIP_BYTES(sz);
  1332. }
  1333. break;
  1334. case kName:
  1335. /* The most useful part in this whole bloody format */
  1336. b = *p; /* External flag */
  1337. SZ_SKIP_BYTES(1);
  1338. if (b) {
  1339. /* TODO: for the god sake, do something about external
  1340. * filenames...
  1341. */
  1342. guint64 tmp;
  1343. SZ_READ_VINT(tmp);
  1344. }
  1345. else {
  1346. for (i = 0; i < nfiles; i++) {
  1347. /* Zero terminated wchar_t: happy converting... */
  1348. /* First, find terminator */
  1349. const guchar *fend = NULL, *tp = p;
  1350. GString *res;
  1351. while (tp < end - 1) {
  1352. if (*tp == 0 && *(tp + 1) == 0) {
  1353. fend = tp;
  1354. break;
  1355. }
  1356. tp += 2;
  1357. }
  1358. if (fend == NULL || fend - p == 0) {
  1359. /* Crap instead of fname */
  1360. msg_debug_archive("bad 7zip name; %s", G_STRLOC);
  1361. goto end;
  1362. }
  1363. res = rspamd_7zip_ucs2_to_utf8(task, p, fend);
  1364. if (res != NULL) {
  1365. fentry = g_malloc0(sizeof(*fentry));
  1366. fentry->fname = res;
  1367. g_ptr_array_add(arch->files, fentry);
  1368. msg_debug_archive("7zip: found file %v", res);
  1369. }
  1370. else {
  1371. msg_debug_archive("bad 7zip name; %s", G_STRLOC);
  1372. }
  1373. /* Skip zero terminating character */
  1374. p = fend + 2;
  1375. }
  1376. }
  1377. break;
  1378. case kDummy:
  1379. case kWinAttributes:
  1380. if (sz > 0) {
  1381. SZ_SKIP_BYTES(sz);
  1382. }
  1383. break;
  1384. default:
  1385. p = NULL;
  1386. msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
  1387. goto end;
  1388. break;
  1389. }
  1390. }
  1391. end:
  1392. return p;
  1393. }
  1394. static const guchar *
  1395. rspamd_7zip_read_next_section(struct rspamd_task *task,
  1396. const guchar *p, const guchar *end,
  1397. struct rspamd_archive *arch)
  1398. {
  1399. guchar t = *p;
  1400. SZ_SKIP_BYTES(1);
  1401. msg_debug_archive("7zip: read section %xc", t);
  1402. switch (t) {
  1403. case kHeader:
  1404. /* We just skip byte and go further */
  1405. break;
  1406. case kEncodedHeader:
  1407. /*
  1408. * In fact, headers are just packed, but we assume it as
  1409. * encrypted to distinguish from the normal archives
  1410. */
  1411. msg_debug_archive("7zip: encoded header, needs to be uncompressed");
  1412. arch->flags |= RSPAMD_ARCHIVE_CANNOT_READ;
  1413. p = NULL; /* Cannot get anything useful */
  1414. break;
  1415. case kArchiveProperties:
  1416. p = rspamd_7zip_read_archive_props(task, p, end, arch);
  1417. break;
  1418. case kMainStreamsInfo:
  1419. p = rspamd_7zip_read_main_streams_info(task, p, end, arch);
  1420. break;
  1421. case kAdditionalStreamsInfo:
  1422. p = rspamd_7zip_read_main_streams_info(task, p, end, arch);
  1423. break;
  1424. case kFilesInfo:
  1425. p = rspamd_7zip_read_files_info(task, p, end, arch);
  1426. break;
  1427. case kEnd:
  1428. p = NULL;
  1429. msg_debug_archive("7zip: read final section");
  1430. break;
  1431. default:
  1432. p = NULL;
  1433. msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
  1434. break;
  1435. }
  1436. return p;
  1437. }
  1438. static void
  1439. rspamd_archive_process_7zip(struct rspamd_task *task,
  1440. struct rspamd_mime_part *part)
  1441. {
  1442. struct rspamd_archive *arch;
  1443. const guchar *start, *p, *end;
  1444. const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
  1445. guint64 section_offset = 0, section_length = 0;
  1446. start = part->parsed_data.begin;
  1447. p = start;
  1448. end = p + part->parsed_data.len;
  1449. if (end - p <= sizeof(guint64) + sizeof(guint32) ||
  1450. memcmp(p, sz_magic, sizeof(sz_magic)) != 0) {
  1451. msg_debug_archive("7z archive is invalid (no 7z magic)");
  1452. return;
  1453. }
  1454. arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
  1455. arch->files = g_ptr_array_new();
  1456. arch->type = RSPAMD_ARCHIVE_7ZIP;
  1457. rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
  1458. arch);
  1459. /* Magic (6 bytes) + version (2 bytes) + crc32 (4 bytes) */
  1460. p += sizeof(guint64) + sizeof(guint32);
  1461. SZ_READ_UINT64(section_offset);
  1462. SZ_READ_UINT64(section_length);
  1463. if (end - p > sizeof(guint32)) {
  1464. p += sizeof(guint32);
  1465. }
  1466. else {
  1467. msg_debug_archive("7z archive is invalid (truncated crc)");
  1468. return;
  1469. }
  1470. if (end - p > section_offset) {
  1471. p += section_offset;
  1472. }
  1473. else {
  1474. msg_debug_archive("7z archive is invalid (incorrect section offset)");
  1475. return;
  1476. }
  1477. while ((p = rspamd_7zip_read_next_section(task, p, end, arch)) != NULL)
  1478. ;
  1479. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  1480. part->specific.arch = arch;
  1481. if (part->cd != NULL) {
  1482. arch->archive_name = &part->cd->filename;
  1483. }
  1484. arch->size = part->parsed_data.len;
  1485. }
  1486. static void
  1487. rspamd_archive_process_gzip(struct rspamd_task *task,
  1488. struct rspamd_mime_part *part)
  1489. {
  1490. struct rspamd_archive *arch;
  1491. const guchar *start, *p, *end;
  1492. const guchar gz_magic[] = {0x1F, 0x8B};
  1493. guchar flags;
  1494. start = part->parsed_data.begin;
  1495. p = start;
  1496. end = p + part->parsed_data.len;
  1497. if (end - p <= 10 || memcmp(p, gz_magic, sizeof(gz_magic)) != 0) {
  1498. msg_debug_archive("gzip archive is invalid (no gzip magic)");
  1499. return;
  1500. }
  1501. arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
  1502. arch->files = g_ptr_array_sized_new(1);
  1503. arch->type = RSPAMD_ARCHIVE_GZIP;
  1504. if (part->cd) {
  1505. arch->archive_name = &part->cd->filename;
  1506. }
  1507. rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
  1508. arch);
  1509. flags = p[3];
  1510. if (flags & (1u << 5)) {
  1511. arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
  1512. }
  1513. if (flags & (1u << 3)) {
  1514. /* We have file name presented in archive, try to use it */
  1515. if (flags & (1u << 1)) {
  1516. /* Multipart */
  1517. p += 12;
  1518. }
  1519. else {
  1520. p += 10;
  1521. }
  1522. if (flags & (1u << 2)) {
  1523. /* Optional section */
  1524. guint16 optlen = 0;
  1525. RAR_READ_UINT16(optlen);
  1526. if (end <= p + optlen) {
  1527. msg_debug_archive("gzip archive is invalid, bad extra length: %d",
  1528. (int) optlen);
  1529. return;
  1530. }
  1531. p += optlen;
  1532. }
  1533. /* Read file name */
  1534. const guchar *fname_start = p;
  1535. while (p < end) {
  1536. if (*p == '\0') {
  1537. if (p > fname_start) {
  1538. struct rspamd_archive_file *f;
  1539. f = g_malloc0(sizeof(*f));
  1540. rspamd_archive_file_try_utf(task, arch, f,
  1541. fname_start, p - fname_start);
  1542. if (f->fname) {
  1543. g_ptr_array_add(arch->files, f);
  1544. if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
  1545. arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
  1546. }
  1547. }
  1548. else {
  1549. /* Invalid filename, skip */
  1550. g_free(f);
  1551. }
  1552. goto set;
  1553. }
  1554. }
  1555. p++;
  1556. }
  1557. /* Wrong filename, not zero terminated */
  1558. msg_debug_archive("gzip archive is invalid, bad filename at pos %d",
  1559. (int) (p - start));
  1560. return;
  1561. }
  1562. /* Fallback, we need to extract file name from archive name if possible */
  1563. if (part->cd && part->cd->filename.len > 0) {
  1564. const gchar *dot_pos, *slash_pos;
  1565. dot_pos = rspamd_memrchr(part->cd->filename.begin, '.',
  1566. part->cd->filename.len);
  1567. if (dot_pos) {
  1568. struct rspamd_archive_file *f;
  1569. slash_pos = rspamd_memrchr(part->cd->filename.begin, '/',
  1570. part->cd->filename.len);
  1571. if (slash_pos && slash_pos < dot_pos) {
  1572. f = g_malloc0(sizeof(*f));
  1573. f->fname = g_string_sized_new(dot_pos - slash_pos);
  1574. g_string_append_len(f->fname, slash_pos + 1,
  1575. dot_pos - slash_pos - 1);
  1576. msg_debug_archive("fallback to gzip filename based on cd: %v",
  1577. f->fname);
  1578. g_ptr_array_add(arch->files, f);
  1579. goto set;
  1580. }
  1581. else {
  1582. const gchar *fname_start = part->cd->filename.begin;
  1583. f = g_malloc0(sizeof(*f));
  1584. if (memchr(fname_start, '.', part->cd->filename.len) != dot_pos) {
  1585. /* Double dots, something like foo.exe.gz */
  1586. f->fname = g_string_sized_new(dot_pos - fname_start);
  1587. g_string_append_len(f->fname, fname_start,
  1588. dot_pos - fname_start);
  1589. }
  1590. else {
  1591. /* Single dot, something like foo.gzz */
  1592. f->fname = g_string_sized_new(part->cd->filename.len);
  1593. g_string_append_len(f->fname, fname_start,
  1594. part->cd->filename.len);
  1595. }
  1596. msg_debug_archive("fallback to gzip filename based on cd: %v",
  1597. f->fname);
  1598. g_ptr_array_add(arch->files, f);
  1599. goto set;
  1600. }
  1601. }
  1602. }
  1603. return;
  1604. set:
  1605. /* Set archive data */
  1606. part->part_type = RSPAMD_MIME_PART_ARCHIVE;
  1607. part->specific.arch = arch;
  1608. arch->size = part->parsed_data.len;
  1609. }
  1610. static gboolean
  1611. rspamd_archive_cheat_detect(struct rspamd_mime_part *part, const gchar *str,
  1612. const guchar *magic_start, gsize magic_len)
  1613. {
  1614. struct rspamd_content_type *ct;
  1615. const gchar *p;
  1616. rspamd_ftok_t srch, *fname;
  1617. ct = part->ct;
  1618. RSPAMD_FTOK_ASSIGN(&srch, "application");
  1619. if (ct && ct->type.len && ct->subtype.len > 0 && rspamd_ftok_cmp(&ct->type, &srch) == 0) {
  1620. if (rspamd_substring_search_caseless(ct->subtype.begin, ct->subtype.len,
  1621. str, strlen(str)) != -1) {
  1622. /* We still need to check magic, see #1848 */
  1623. if (magic_start != NULL) {
  1624. if (part->parsed_data.len > magic_len &&
  1625. memcmp(part->parsed_data.begin,
  1626. magic_start, magic_len) == 0) {
  1627. return TRUE;
  1628. }
  1629. /* No magic, refuse this type of archive */
  1630. return FALSE;
  1631. }
  1632. else {
  1633. return TRUE;
  1634. }
  1635. }
  1636. }
  1637. if (part->cd) {
  1638. fname = &part->cd->filename;
  1639. if (fname && fname->len > strlen(str)) {
  1640. p = fname->begin + fname->len - strlen(str);
  1641. if (rspamd_lc_cmp(p, str, strlen(str)) == 0) {
  1642. if (*(p - 1) == '.') {
  1643. if (magic_start != NULL) {
  1644. if (part->parsed_data.len > magic_len &&
  1645. memcmp(part->parsed_data.begin,
  1646. magic_start, magic_len) == 0) {
  1647. return TRUE;
  1648. }
  1649. /* No magic, refuse this type of archive */
  1650. return FALSE;
  1651. }
  1652. return TRUE;
  1653. }
  1654. }
  1655. }
  1656. if (magic_start != NULL) {
  1657. if (part->parsed_data.len > magic_len &&
  1658. memcmp(part->parsed_data.begin, magic_start, magic_len) == 0) {
  1659. return TRUE;
  1660. }
  1661. }
  1662. }
  1663. else {
  1664. if (magic_start != NULL) {
  1665. if (part->parsed_data.len > magic_len &&
  1666. memcmp(part->parsed_data.begin, magic_start, magic_len) == 0) {
  1667. return TRUE;
  1668. }
  1669. }
  1670. }
  1671. return FALSE;
  1672. }
  1673. void rspamd_archives_process(struct rspamd_task *task)
  1674. {
  1675. guint i;
  1676. struct rspamd_mime_part *part;
  1677. const guchar rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07};
  1678. const guchar zip_magic[] = {0x50, 0x4b, 0x03, 0x04};
  1679. const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
  1680. const guchar gz_magic[] = {0x1F, 0x8B, 0x08};
  1681. PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
  1682. {
  1683. if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
  1684. if (part->parsed_data.len > 0) {
  1685. if (rspamd_archive_cheat_detect(part, "zip",
  1686. zip_magic, sizeof(zip_magic))) {
  1687. rspamd_archive_process_zip(task, part);
  1688. }
  1689. else if (rspamd_archive_cheat_detect(part, "rar",
  1690. rar_magic, sizeof(rar_magic))) {
  1691. rspamd_archive_process_rar(task, part);
  1692. }
  1693. else if (rspamd_archive_cheat_detect(part, "7z",
  1694. sz_magic, sizeof(sz_magic))) {
  1695. rspamd_archive_process_7zip(task, part);
  1696. }
  1697. else if (rspamd_archive_cheat_detect(part, "gz",
  1698. gz_magic, sizeof(gz_magic))) {
  1699. rspamd_archive_process_gzip(task, part);
  1700. }
  1701. if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) &&
  1702. part->part_type == RSPAMD_MIME_PART_ARCHIVE &&
  1703. part->specific.arch) {
  1704. struct rspamd_archive *arch = part->specific.arch;
  1705. msg_info_task("found %s archive with incorrect content-type: %T/%T",
  1706. rspamd_archive_type_str(arch->type),
  1707. &part->ct->type, &part->ct->subtype);
  1708. if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
  1709. part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  1710. }
  1711. }
  1712. }
  1713. }
  1714. }
  1715. }
  1716. const gchar *
  1717. rspamd_archive_type_str(enum rspamd_archive_type type)
  1718. {
  1719. const gchar *ret = "unknown";
  1720. switch (type) {
  1721. case RSPAMD_ARCHIVE_ZIP:
  1722. ret = "zip";
  1723. break;
  1724. case RSPAMD_ARCHIVE_RAR:
  1725. ret = "rar";
  1726. break;
  1727. case RSPAMD_ARCHIVE_7ZIP:
  1728. ret = "7z";
  1729. break;
  1730. case RSPAMD_ARCHIVE_GZIP:
  1731. ret = "gz";
  1732. break;
  1733. }
  1734. return ret;
  1735. }