You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_magic/heuristics
  15. -- This module contains heuristics for some specific cases
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local lua_util = require "lua_util"
  20. local bit = require "bit"
  21. local fun = require "fun"
  22. local N = "lua_magic"
  23. local msoffice_trie
  24. local msoffice_patterns = {
  25. doc = { [[WordDocument]] },
  26. xls = { [[Workbook]], [[Book]] },
  27. ppt = { [[PowerPoint Document]], [[Current User]] },
  28. vsd = { [[VisioDocument]] },
  29. }
  30. local msoffice_trie_clsid
  31. local msoffice_clsids = {
  32. doc = { [[0609020000000000c000000000000046]] },
  33. xls = { [[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]] },
  34. ppt = { [[108d81649b4fcf1186ea00aa00b929e8]] },
  35. msg = { [[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]] },
  36. msi = { [[84100c0000000000c000000000000046]] },
  37. }
  38. local zip_trie
  39. local zip_patterns = {
  40. -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
  41. odt = {
  42. [[mimetypeapplication/vnd\.oasis\.opendocument\.text]],
  43. [[mimetypeapplication/vnd\.oasis\.opendocument\.image]],
  44. [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]
  45. },
  46. ods = {
  47. [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]],
  48. [[mimetypeapplication/vnd\.oasis\.opendocument\.formula]],
  49. [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]
  50. },
  51. odp = { [[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]] },
  52. epub = { [[epub\+zip]] },
  53. asice = { [[mimetypeapplication/vnd\.etsi\.asic-e\+zipPK]] },
  54. asics = { [[mimetypeapplication/vnd\.etsi\.asic-s\+zipPK]] },
  55. }
  56. local txt_trie
  57. local txt_patterns = {
  58. html = {
  59. { [=[(?i)<html[\s>]]=], 32 },
  60. { [[(?i)<script\b]], 20 }, -- Commonly used by spammers
  61. { [[<script\s+type="text\/javascript">]], 31 }, -- Another spammy pattern
  62. { [[(?i)<\!DOCTYPE HTML\b]], 33 },
  63. { [[(?i)<body\b]], 20 },
  64. { [[(?i)<table\b]], 20 },
  65. { [[(?i)<a\s]], 10 },
  66. { [[(?i)<p\b]], 10 },
  67. { [[(?i)<div\b]], 10 },
  68. { [[(?i)<span\b]], 10 },
  69. },
  70. csv = {
  71. { [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20 }
  72. },
  73. ics = {
  74. { [[^BEGIN:VCALENDAR\r?\n]], 40 },
  75. },
  76. vcf = {
  77. { [[^BEGIN:VCARD\r?\n]], 40 },
  78. },
  79. xml = {
  80. { [[<\?xml\b.+\?>]], 31 },
  81. }
  82. }
  83. -- Used to match pattern index and extension
  84. local msoffice_clsid_indexes = {}
  85. local msoffice_patterns_indexes = {}
  86. local zip_patterns_indexes = {}
  87. local txt_patterns_indexes = {}
  88. local exports = {}
  89. local function compile_tries()
  90. local default_compile_flags = bit.bor(rspamd_trie.flags.re,
  91. rspamd_trie.flags.dot_all,
  92. rspamd_trie.flags.single_match,
  93. rspamd_trie.flags.no_start)
  94. local function compile_pats(patterns, indexes, transform_func, compile_flags)
  95. local strs = {}
  96. for ext, pats in pairs(patterns) do
  97. for _, pat in ipairs(pats) do
  98. -- These are utf16 strings in fact...
  99. strs[#strs + 1] = transform_func(pat)
  100. indexes[#indexes + 1] = { ext, pat }
  101. end
  102. end
  103. return rspamd_trie.create(strs, compile_flags or default_compile_flags)
  104. end
  105. if not msoffice_trie then
  106. -- Directory names
  107. local function msoffice_pattern_transform(pat)
  108. return '^' ..
  109. table.concat(
  110. fun.totable(
  111. fun.map(function(c)
  112. return c .. [[\x{00}]]
  113. end,
  114. fun.iter(pat))))
  115. end
  116. local function msoffice_clsid_transform(pat)
  117. local hex_table = {}
  118. for i = 1, #pat, 2 do
  119. local subc = pat:sub(i, i + 1)
  120. hex_table[#hex_table + 1] = string.format('\\x{%s}', subc)
  121. end
  122. return '^' .. table.concat(hex_table) .. '$'
  123. end
  124. -- Directory entries
  125. msoffice_trie = compile_pats(msoffice_patterns, msoffice_patterns_indexes,
  126. msoffice_pattern_transform)
  127. -- Clsids
  128. msoffice_trie_clsid = compile_pats(msoffice_clsids, msoffice_clsid_indexes,
  129. msoffice_clsid_transform)
  130. -- Misc zip patterns at the initial fragment
  131. zip_trie = compile_pats(zip_patterns, zip_patterns_indexes,
  132. function(pat)
  133. return pat
  134. end)
  135. -- Text patterns at the initial fragment
  136. txt_trie = compile_pats(txt_patterns, txt_patterns_indexes,
  137. function(pat_tbl)
  138. return pat_tbl[1]
  139. end,
  140. bit.bor(rspamd_trie.flags.re,
  141. rspamd_trie.flags.dot_all,
  142. rspamd_trie.flags.no_start))
  143. end
  144. end
  145. -- Call immediately on require
  146. compile_tries()
  147. local function detect_ole_format(input, log_obj, _, part)
  148. local inplen = #input
  149. if inplen < 0x31 + 4 then
  150. lua_util.debugm(N, log_obj, "short length: %s", inplen)
  151. return nil
  152. end
  153. local bom, sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
  154. if bom == 0xFFFE then
  155. bom = '<'
  156. else
  157. lua_util.debugm(N, log_obj, "bom file!: %s", bom)
  158. bom = '>';
  159. sec_size = bit.bswap(sec_size)
  160. end
  161. if sec_size < 7 or sec_size > 31 then
  162. lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size)
  163. return nil
  164. end
  165. sec_size = 2 ^ sec_size
  166. -- SecID of first sector of the directory stream
  167. local directory_offset = (rspamd_util.unpack(bom .. 'I4', input:span(0x31, 4)))
  168. * sec_size + 512 + 1
  169. lua_util.debugm(N, log_obj, "directory: %s", directory_offset)
  170. if inplen < directory_offset then
  171. lua_util.debugm(N, log_obj, "short length: %s", inplen)
  172. return nil
  173. end
  174. local function process_dir_entry(offset)
  175. local dtype = input:byte(offset + 66)
  176. lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset)
  177. if dtype then
  178. if dtype == 5 then
  179. -- Extract clsid
  180. local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
  181. if matches then
  182. for n, _ in pairs(matches) do
  183. if msoffice_clsid_indexes[n] then
  184. lua_util.debugm(N, log_obj, "found valid clsid for %s",
  185. msoffice_clsid_indexes[n][1])
  186. return true, msoffice_clsid_indexes[n][1]
  187. end
  188. end
  189. end
  190. return true, nil
  191. elseif dtype == 2 then
  192. local matches = msoffice_trie:match(input:span(offset, 64))
  193. if matches then
  194. for n, _ in pairs(matches) do
  195. if msoffice_patterns_indexes[n] then
  196. return true, msoffice_patterns_indexes[n][1]
  197. end
  198. end
  199. end
  200. return true, nil
  201. elseif dtype >= 0 and dtype < 5 then
  202. -- Bad type
  203. return true, nil
  204. end
  205. end
  206. return false, nil
  207. end
  208. repeat
  209. local res, ext = process_dir_entry(directory_offset)
  210. if res and ext then
  211. return ext, 60
  212. end
  213. if not res then
  214. break
  215. end
  216. directory_offset = directory_offset + 128
  217. until directory_offset >= inplen
  218. end
  219. exports.ole_format_heuristic = detect_ole_format
  220. local function process_top_detected(res)
  221. local extensions = lua_util.keys(res)
  222. if #extensions > 0 then
  223. table.sort(extensions, function(ex1, ex2)
  224. return res[ex1] > res[ex2]
  225. end)
  226. return extensions[1], res[extensions[1]]
  227. end
  228. return nil
  229. end
  230. local function detect_archive_flaw(part, arch, log_obj, _)
  231. local arch_type = arch:get_type()
  232. local res = {
  233. docx = 0,
  234. xlsx = 0,
  235. pptx = 0,
  236. jar = 0,
  237. odt = 0,
  238. odp = 0,
  239. ods = 0,
  240. apk = 0,
  241. } -- ext + confidence pairs
  242. -- General msoffice patterns
  243. local function add_msoffice_confidence(incr)
  244. res.docx = res.docx + incr
  245. res.xlsx = res.xlsx + incr
  246. res.pptx = res.pptx + incr
  247. end
  248. if arch_type == 'zip' then
  249. -- Find specific files/folders in zip file
  250. local files = arch:get_files(100) or {}
  251. for _, file in ipairs(files) do
  252. if file == '[Content_Types].xml' then
  253. add_msoffice_confidence(10)
  254. elseif file:sub(1, 3) == 'xl/' then
  255. res.xlsx = res.xlsx + 30
  256. elseif file:sub(1, 5) == 'word/' then
  257. res.docx = res.docx + 30
  258. elseif file:sub(1, 4) == 'ppt/' then
  259. res.pptx = res.pptx + 30
  260. elseif file == 'META-INF/MANIFEST.MF' then
  261. res.jar = res.jar + 40
  262. elseif file == 'AndroidManifest.xml' then
  263. res.apk = res.apk + 60
  264. end
  265. end
  266. local ext, weight = process_top_detected(res)
  267. if weight >= 40 then
  268. return ext, weight
  269. end
  270. -- Apply misc Zip detection logic
  271. local content = part:get_content()
  272. if #content > 128 then
  273. local start_span = content:span(1, 128)
  274. local matches = zip_trie:match(start_span)
  275. if matches then
  276. for n, _ in pairs(matches) do
  277. if zip_patterns_indexes[n] then
  278. lua_util.debugm(N, log_obj, "found zip pattern for %s",
  279. zip_patterns_indexes[n][1])
  280. return zip_patterns_indexes[n][1], 40
  281. end
  282. end
  283. end
  284. end
  285. end
  286. return arch_type:lower(), 40
  287. end
  288. local csv_grammar
  289. -- Returns a grammar that will count commas
  290. local function get_csv_grammar()
  291. if not csv_grammar then
  292. local lpeg = require 'lpeg'
  293. local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P '""' / '"') ^ 0) * '"' +
  294. lpeg.C((1 - lpeg.S ',\n"') ^ 0)
  295. csv_grammar = lpeg.Cf(lpeg.Cc(0) * field * lpeg.P((lpeg.P(',') +
  296. lpeg.P('\t')) * field) ^ 1 * (lpeg.S '\r\n' + -1),
  297. function(acc)
  298. return acc + 1
  299. end)
  300. end
  301. return csv_grammar
  302. end
  303. local function validate_csv(part, content, log_obj)
  304. local max_chunk = 32768
  305. local chunk = content:sub(1, max_chunk)
  306. local expected_commas
  307. local matched_lines = 0
  308. local max_matched_lines = 10
  309. lua_util.debugm(N, log_obj, "check for csv pattern")
  310. for s in chunk:lines() do
  311. local ncommas = get_csv_grammar():match(s)
  312. if not ncommas then
  313. lua_util.debugm(N, log_obj, "not a csv line at line number %s",
  314. matched_lines)
  315. return false
  316. end
  317. if expected_commas and ncommas ~= expected_commas then
  318. -- Mismatched commas
  319. lua_util.debugm(N, log_obj, "missmatched commas on line %s: %s != %s",
  320. matched_lines, ncommas, expected_commas)
  321. return false
  322. elseif not expected_commas then
  323. if ncommas == 0 then
  324. lua_util.debugm(N, log_obj, "no commas in the first line")
  325. return false
  326. end
  327. expected_commas = ncommas
  328. end
  329. matched_lines = matched_lines + 1
  330. if matched_lines > max_matched_lines then
  331. break
  332. end
  333. end
  334. lua_util.debugm(N, log_obj, "csv content is sane: %s fields; %s lines checked",
  335. expected_commas, matched_lines)
  336. return true
  337. end
  338. exports.mime_part_heuristic = function(part, log_obj, _)
  339. if part:is_archive() then
  340. local arch = part:get_archive()
  341. return detect_archive_flaw(part, arch, log_obj)
  342. end
  343. return nil
  344. end
  345. exports.text_part_heuristic = function(part, log_obj, _)
  346. -- We get some span of data and check it
  347. local function is_span_text(span)
  348. -- We examine 8 bit content, and we assume it might be localized text
  349. -- if it has more than 3 subsequent 8 bit characters
  350. local function rough_8bit_check(bytes, idx, remain, len)
  351. local b = bytes[idx]
  352. local n8bit = 0
  353. while b >= 127 and idx < len do
  354. -- utf8 part
  355. if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
  356. bit.band(bytes[idx + 1], 0xc0) == 0x80 then
  357. return true, 1
  358. elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
  359. bit.band(bytes[idx + 1], 0xc0) == 0x80 and
  360. bit.band(bytes[idx + 2], 0xc0) == 0x80 then
  361. return true, 2
  362. elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
  363. bit.band(bytes[idx + 1], 0xc0) == 0x80 and
  364. bit.band(bytes[idx + 2], 0xc0) == 0x80 and
  365. bit.band(bytes[idx + 3], 0xc0) == 0x80 then
  366. return true, 3
  367. end
  368. n8bit = n8bit + 1
  369. idx = idx + 1
  370. b = bytes[idx]
  371. remain = remain - 1
  372. end
  373. if n8bit >= 3 then
  374. return true, n8bit
  375. end
  376. return false, 0
  377. end
  378. -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
  379. local tlen = #span
  380. local non_printable = 0
  381. local bytes = span:bytes()
  382. local i = 1
  383. repeat
  384. local b = bytes[i]
  385. if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
  386. non_printable = non_printable + 1
  387. elseif b >= 127 then
  388. local c, nskip = rough_8bit_check(bytes, i, tlen - i, tlen)
  389. if not c then
  390. non_printable = non_printable + 1
  391. else
  392. i = i + nskip
  393. end
  394. end
  395. i = i + 1
  396. until i > tlen
  397. lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
  398. tlen - non_printable, non_printable, tlen)
  399. if non_printable / tlen > 0.0078125 then
  400. return false
  401. end
  402. return true
  403. end
  404. local parent = part:get_parent()
  405. if parent then
  406. local parent_type, parent_subtype = parent:get_type()
  407. if parent_type == 'multipart' and parent_subtype == 'encrypted' then
  408. -- Skip text heuristics for encrypted parts
  409. lua_util.debugm(N, log_obj, "text part check: parent is encrypted, not a text part")
  410. return false
  411. end
  412. end
  413. local content = part:get_content()
  414. local mtype, msubtype = part:get_type()
  415. local clen = #content
  416. local is_text
  417. if clen > 0 then
  418. if clen > 80 * 3 then
  419. -- Use chunks
  420. is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80))
  421. else
  422. is_text = is_span_text(content)
  423. end
  424. if is_text and mtype ~= 'message' then
  425. -- Try patterns
  426. local span_len = math.min(4096, clen)
  427. local start_span = content:span(1, span_len)
  428. local matches = txt_trie:match(start_span)
  429. local res = {}
  430. local fname = part:get_filename()
  431. if matches then
  432. -- Require at least 2 occurrences of those patterns
  433. for n, positions in pairs(matches) do
  434. local ext, weight = txt_patterns_indexes[n][1], txt_patterns_indexes[n][2][2]
  435. if ext then
  436. res[ext] = (res[ext] or 0) + weight * #positions
  437. lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced",
  438. ext, weight * #positions, res[ext], mtype, msubtype)
  439. end
  440. end
  441. if res.html and res.html >= 40 then
  442. -- HTML has priority over something like js...
  443. return 'html', res.html
  444. end
  445. local ext, weight = process_top_detected(res)
  446. if weight then
  447. if weight >= 40 then
  448. -- Extra validation for csv extension
  449. if ext ~= 'csv' or validate_csv(part, content, log_obj) then
  450. return ext, weight
  451. end
  452. elseif fname and weight >= 20 then
  453. return ext, weight
  454. end
  455. end
  456. end
  457. -- Content type stuff
  458. if (mtype == 'text' or mtype == 'application') and
  459. (msubtype == 'html' or msubtype == 'xhtml+xml') then
  460. return 'html', 21
  461. end
  462. if msubtype:lower() == 'csv' then
  463. if validate_csv(part, content, log_obj) then
  464. return 'csv', 40
  465. end
  466. end
  467. -- Extension stuff
  468. local function has_extension(file, ext)
  469. local ext_len = ext:len()
  470. return file:len() > ext_len + 1
  471. and file:sub(-ext_len):lower() == ext
  472. and file:sub(-ext_len - 1, -ext_len - 1) == '.'
  473. end
  474. if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then
  475. return 'html', 21
  476. end
  477. if mtype ~= 'text' then
  478. -- Do not treat non text patterns as text
  479. return nil
  480. end
  481. return 'txt', 40
  482. end
  483. end
  484. end
  485. exports.pdf_format_heuristic = function(input, log_obj, pos, part)
  486. local weight = 10
  487. local ext = string.match(part:get_filename() or '', '%.([^.]+)$')
  488. -- If we found a pattern at the beginning
  489. if pos <= 10 then
  490. weight = weight + 30
  491. end
  492. -- If the announced extension is `pdf`
  493. if ext and ext:lower() == 'pdf' then
  494. weight = weight + 30
  495. end
  496. return 'pdf', weight
  497. end
  498. exports.pe_part_heuristic = function(input, log_obj, pos, part)
  499. if not input then
  500. return
  501. end
  502. -- pe header should start at the offset that is placed in msdos header at position 60..64
  503. local pe_ptr_bin = input:sub(60, 64)
  504. if #pe_ptr_bin ~= 4 then
  505. return
  506. end
  507. -- it is an LE 32 bit integer
  508. local pe_ptr = rspamd_util.unpack("<I4", pe_ptr_bin)
  509. -- if pe header magic matches the offset, it is definitely a PE file
  510. if pe_ptr ~= pos then
  511. return
  512. end
  513. return 'exe', 30
  514. end
  515. return exports