You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

heuristics.lua 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_magic/heuristics
  15. -- This module contains heuristics for some specific cases
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local lua_util = require "lua_util"
  20. local bit = require "bit"
  21. local fun = require "fun"
  22. local N = "lua_magic"
  -- Trie over OLE directory entry names; filled in by compile_tries()
  local msoffice_trie
  -- Extension -> directory entry names that identify the document type.
  -- compile_tries() anchors each name at the start and appends a zero byte
  -- after every character before compiling it.
  local msoffice_patterns = {
    doc = { [[WordDocument]] },
    xls = {
      [[Workbook]],
      [[Book]],
    },
    ppt = {
      [[PowerPoint Document]],
      [[Current User]],
    },
    vsd = { [[VisioDocument]] },
  }

  -- Trie over class ids; filled in by compile_tries()
  local msoffice_trie_clsid
  -- Extension -> class ids written as 32 hex digits (16 bytes).
  -- compile_tries() turns every hex pair into a `\x{..}` escape.
  local msoffice_clsids = {
    doc = { [[0609020000000000c000000000000046]] },
    xls = {
      [[1008020000000000c000000000000046]],
      [[2008020000000000c000000000000046]],
    },
    ppt = { [[108d81649b4fcf1186ea00aa00b929e8]] },
    msg = {
      [[46f0060000000000c000000000000046]],
      [[0b0d020000000000c000000000000046]],
    },
    msi = { [[84100c0000000000c000000000000046]] },
  }
  -- Trie over the patterns below; filled in by compile_tries()
  local zip_trie
  -- Extension -> regular expressions looked up in the first bytes of a zip
  -- archive (see detect_archive_flaw)
  local zip_patterns = {
    -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
    odt = {
      [[mimetypeapplication/vnd\.oasis\.opendocument\.text]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.image]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]],
    },
    ods = {
      [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.formula]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]],
    },
    odp = {
      [[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]],
    },
    epub = {
      [[epub\+zip]],
    },
    asice = {
      [[mimetypeapplication/vnd\.etsi\.asic-e\+zipPK]],
    },
    asics = {
      [[mimetypeapplication/vnd\.etsi\.asic-s\+zipPK]],
    },
  }
  -- Trie over the patterns below; filled in by compile_tries()
  local txt_trie
  -- Extension -> list of { regexp, weight } pairs.
  -- text_part_heuristic adds `weight` once per reported match position and
  -- compares the per-extension totals.
  local txt_patterns = {
    html = {
      { [[(?i)<html\b]], 32 },
      -- Commonly used by spammers
      { [[(?i)<script\b]], 20 },
      -- Another spammy pattern
      { [[<script\s+type="text\/javascript">]], 31 },
      { [[(?i)<\!DOCTYPE HTML\b]], 33 },
      { [[(?i)<body\b]], 20 },
      { [[(?i)<table\b]], 20 },
      { [[(?i)<a\b]], 10 },
      { [[(?i)<p\b]], 10 },
      { [[(?i)<div\b]], 10 },
      { [[(?i)<span\b]], 10 },
    },
    csv = {
      { [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20 },
    },
    ics = {
      { [[^BEGIN:VCALENDAR\r?\n]], 40 },
    },
    vcf = {
      { [[^BEGIN:VCARD\r?\n]], 40 },
    },
    xml = {
      { [[<\?xml\b.+\?>]], 31 },
    },
  }
  -- Lookup tables filled by compile_tries(): entry `n` holds the
  -- { extension, original pattern } pair of the n-th pattern added to the
  -- corresponding trie, so a trie match index can be mapped to an extension
  local msoffice_clsid_indexes = {}
  local msoffice_patterns_indexes = {}
  local zip_patterns_indexes = {}
  local txt_patterns_indexes = {}

  -- Table returned by this module
  local exports = {}
  --- Compiles all pattern tables of this module into tries.
  -- Fills `msoffice_trie`, `msoffice_trie_clsid`, `zip_trie` and `txt_trie`
  -- together with their `*_indexes` lookup tables. Does nothing when
  -- `msoffice_trie` is already set.
  local function compile_tries()
    if msoffice_trie then
      -- Everything has been compiled already
      return
    end

    local trie_flags = rspamd_trie.flags
    local default_compile_flags = bit.bor(trie_flags.re,
        trie_flags.dot_all,
        trie_flags.single_match,
        trie_flags.no_start)
    -- Same as the default set but without `single_match`; the text heuristic
    -- multiplies a pattern weight by the number of positions reported
    local txt_compile_flags = bit.bor(trie_flags.re,
        trie_flags.dot_all,
        trie_flags.no_start)

    -- Transforms every pattern of `patterns` (extension -> list) with
    -- `to_regexp`, records { extension, pattern } in `indexes` under the same
    -- position and creates a trie out of the transformed strings
    local function build_trie(patterns, indexes, to_regexp, flags)
      local regexps = {}

      for ext, ext_patterns in pairs(patterns) do
        for _, pattern in ipairs(ext_patterns) do
          regexps[#regexps + 1] = to_regexp(pattern)
          indexes[#indexes + 1] = { ext, pattern }
        end
      end

      return rspamd_trie.create(regexps, flags or default_compile_flags)
    end

    -- Directory names: these are utf16 strings in fact, so each character is
    -- followed by a zero byte
    local function directory_name_regexp(name)
      local units = {}

      for i = 1, #name do
        units[i] = name:sub(i, i) .. [[\x{00}]]
      end

      return '^' .. table.concat(units)
    end

    -- Class ids: every pair of hex digits becomes one `\x{..}` escape
    local function clsid_regexp(clsid_hex)
      local escaped = {}

      for first = 1, #clsid_hex, 2 do
        escaped[#escaped + 1] = ('\\x{%s}'):format(clsid_hex:sub(first, first + 1))
      end

      return '^' .. table.concat(escaped) .. '$'
    end

    -- Zip patterns are used verbatim
    local function verbatim(pattern)
      return pattern
    end

    -- Text patterns are { regexp, weight } pairs, only the regexp is compiled
    local function first_element(pattern_with_weight)
      return pattern_with_weight[1]
    end

    -- Directory entries
    msoffice_trie = build_trie(msoffice_patterns, msoffice_patterns_indexes,
        directory_name_regexp)
    -- Clsids
    msoffice_trie_clsid = build_trie(msoffice_clsids, msoffice_clsid_indexes,
        clsid_regexp)
    -- Misc zip patterns at the initial fragment
    zip_trie = build_trie(zip_patterns, zip_patterns_indexes, verbatim)
    -- Text patterns at the initial fragment
    txt_trie = build_trie(txt_patterns, txt_patterns_indexes, first_element,
        txt_compile_flags)
  end

  -- Call immediately on require
  compile_tries()
  --- Tries to tell which kind of document an OLE container holds.
  -- Reads the byte order marker and the sector size exponent from the header,
  -- locates the first directory sector and walks the 128 byte directory
  -- entries found there, matching class ids (entry type 5) and entry names
  -- (entry type 2) against the precompiled tries.
  -- @param input content to inspect; must support `#`, `:span()` and `:byte()`
  -- @param log_obj object passed to `lua_util.debugm` for debug logging
  -- @param _ unused
  -- @param part unused
  -- @return extension and weight 60 when an entry is recognised, nil otherwise
  local function detect_ole_format(input, log_obj, _, part)
    local inplen = #input
    if inplen < 0x31 + 4 then
      lua_util.debugm(N, log_obj, "short length: %s", inplen)
      return nil
    end

    -- Two 16 bit fields: byte order marker and sector size exponent
    local bom, sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
    if bom == 0xFFFE then
      bom = '<'
    else
      lua_util.debugm(N, log_obj, "bom file!: %s", bom)
      bom = '>'
      -- `sec_size` has been read as a little endian 16 bit value. bit.bswap
      -- reverses all four bytes of a 32 bit integer, which leaves the swapped
      -- 16 bit value in the upper half: shift it back down. Without the shift
      -- the result is always 0 or a multiple of 65536 and never passes the
      -- range check below.
      sec_size = bit.rshift(bit.bswap(sec_size), 16)
    end

    if sec_size < 7 or sec_size > 31 then
      lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size)
      return nil
    end

    sec_size = 2 ^ sec_size

    -- SecID of first sector of the directory stream
    -- (512 skips the header, 1 converts to a 1-based position)
    local directory_offset = (rspamd_util.unpack(bom .. 'I4', input:span(0x31, 4)))
        * sec_size + 512 + 1
    lua_util.debugm(N, log_obj, "directory: %s", directory_offset)

    if inplen < directory_offset then
      lua_util.debugm(N, log_obj, "short length: %s", inplen)
      return nil
    end

    -- Examines the directory entry starting at `offset`.
    -- Returns `continue, ext`: `continue` is false when the walk must stop,
    -- `ext` is the detected extension (or nil)
    local function process_dir_entry(offset)
      local dtype = input:byte(offset + 66)
      lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset)

      if dtype then
        if dtype == 5 then
          -- Extract clsid, but only when all of its 16 bytes are inside the
          -- input: a truncated entry cannot match the anchored 16 byte
          -- patterns anyway.
          -- NOTE(review): span() presumably rejects out-of-range requests,
          -- hence the explicit check - confirm against rspamd_text
          if offset + 80 + 16 - 1 <= inplen then
            local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
            if matches then
              for n, _ in pairs(matches) do
                if msoffice_clsid_indexes[n] then
                  lua_util.debugm(N, log_obj, "found valid clsid for %s",
                      msoffice_clsid_indexes[n][1])
                  return true, msoffice_clsid_indexes[n][1]
                end
              end
            end
          end
          return true, nil
        elseif dtype == 2 then
          -- The first 64 bytes of an entry are matched against known names
          local matches = msoffice_trie:match(input:span(offset, 64))
          if matches then
            for n, _ in pairs(matches) do
              if msoffice_patterns_indexes[n] then
                return true, msoffice_patterns_indexes[n][1]
              end
            end
          end
          return true, nil
        elseif dtype >= 0 and dtype < 5 then
          -- Bad type
          return true, nil
        end
      end

      return false, nil
    end

    repeat
      local res, ext = process_dir_entry(directory_offset)

      if res and ext then
        return ext, 60
      end

      if not res then
        break
      end

      -- Entries are 128 bytes apart
      directory_offset = directory_offset + 128
    until directory_offset >= inplen
  end
  exports.ole_format_heuristic = detect_ole_format
  --- Picks the extension with the largest weight.
  -- @param res table mapping extension -> weight
  -- @return the extension with the largest weight and that weight, or nil
  -- when `res` has no keys
  local function process_top_detected(res)
    local candidates = lua_util.keys(res)

    if #candidates == 0 then
      return nil
    end

    -- Heaviest candidate first
    table.sort(candidates, function(lhs, rhs)
      return res[lhs] > res[rhs]
    end)

    local best = candidates[1]
    return best, res[best]
  end
  --- Guesses the real type of an archive part.
  -- Non-zip archives are reported under their own (lowercased) type. For zip
  -- archives the names of up to 100 files are scored first; if no candidate
  -- reaches weight 40, the first 128 bytes of the content are matched against
  -- `zip_trie`.
  -- @param part mime part, used for `part:get_content()`
  -- @param arch archive object, used for `arch:get_type()` and `arch:get_files()`
  -- @param log_obj object passed to `lua_util.debugm` for debug logging
  -- @param _ unused
  -- @return extension and weight
  local function detect_archive_flaw(part, arch, log_obj, _)
    local arch_type = arch:get_type()

    if arch_type ~= 'zip' then
      return arch_type:lower(), 40
    end

    -- ext + confidence pairs
    local res = {
      docx = 0,
      xlsx = 0,
      pptx = 0,
      jar = 0,
      odt = 0,
      odp = 0,
      ods = 0,
      apk = 0,
    }

    local function bump(ext, incr)
      res[ext] = res[ext] + incr
    end

    local function starts_with(name, prefix)
      return name:sub(1, #prefix) == prefix
    end

    -- Find specific files/folders in zip file
    for _, file in ipairs(arch:get_files(100) or {}) do
      if file == '[Content_Types].xml' then
        -- General msoffice pattern: shared by all three formats
        bump('docx', 10)
        bump('xlsx', 10)
        bump('pptx', 10)
      elseif starts_with(file, 'xl/') then
        bump('xlsx', 30)
      elseif starts_with(file, 'word/') then
        bump('docx', 30)
      elseif starts_with(file, 'ppt/') then
        bump('pptx', 30)
      elseif file == 'META-INF/MANIFEST.MF' then
        bump('jar', 40)
      elseif file == 'AndroidManifest.xml' then
        bump('apk', 60)
      end
    end

    local ext, weight = process_top_detected(res)
    if weight >= 40 then
      return ext, weight
    end

    -- Apply misc Zip detection logic
    local content = part:get_content()
    if #content > 128 then
      local matches = zip_trie:match(content:span(1, 128))

      if matches then
        for n, _ in pairs(matches) do
          local found = zip_patterns_indexes[n]

          if found then
            lua_util.debugm(N, log_obj, "found zip pattern for %s",
                found[1])
            return found[1], 40
          end
        end
      end
    end

    return arch_type:lower(), 40
  end
  -- Cached grammar, built on the first get_csv_grammar() call
  local csv_grammar

  --- Returns the (lazily built) LPeg grammar used to check a single csv line.
  -- A line is a field followed by one or more `,` or tab separated fields and
  -- terminated by `\r`, `\n` or the end of input. Every field yields one
  -- capture and the fold adds 1 to a zero accumulator per capture; matching
  -- a line that does not fit returns nil.
  local function get_csv_grammar()
    if csv_grammar then
      return csv_grammar
    end

    local lpeg = require 'lpeg'
    local P, S, C, Cs, Cc, Cf = lpeg.P, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cf

    -- "..." where a doubled quote stands for a literal quote
    local quoted_field = '"' * Cs(((P(1) - '"') + P('""') / '"') ^ 0) * '"'
    -- Anything up to a comma, a newline or a quote
    local plain_field = C((1 - S(',\n"')) ^ 0)
    local field = quoted_field + plain_field
    local separator = P(',') + P('\t')
    local line_end = S('\r\n') + -1

    csv_grammar = Cf(Cc(0) * field * (separator * field) ^ 1 * line_end,
        function(acc)
          return acc + 1
        end)

    return csv_grammar
  end
  --- Checks that the beginning of `content` looks like consistent csv.
  -- Only the first 32768 bytes are considered. Every examined line has to
  -- match the csv grammar and to produce the same count as the first line,
  -- which must be non-zero. Checking stops once more than 10 lines matched.
  -- @param part unused
  -- @param content content to check; must support `:sub()`, the result `:lines()`
  -- @param log_obj object passed to `lua_util.debugm` for debug logging
  -- @return true when all examined lines are consistent, false otherwise
  local function validate_csv(part, content, log_obj)
    local max_chunk = 32768
    local max_matched_lines = 10

    local chunk = content:sub(1, max_chunk)
    local expected_commas
    local matched_lines = 0

    lua_util.debugm(N, log_obj, "check for csv pattern")

    for line in chunk:lines() do
      local ncommas = get_csv_grammar():match(line)

      if not ncommas then
        lua_util.debugm(N, log_obj, "not a csv line at line number %s",
            matched_lines)
        return false
      end

      if not expected_commas then
        -- First line defines the expected count
        if ncommas == 0 then
          lua_util.debugm(N, log_obj, "no commas in the first line")
          return false
        end

        expected_commas = ncommas
      elseif ncommas ~= expected_commas then
        -- Mismatched commas
        lua_util.debugm(N, log_obj, "missmatched commas on line %s: %s != %s",
            matched_lines, ncommas, expected_commas)
        return false
      end

      matched_lines = matched_lines + 1

      if matched_lines > max_matched_lines then
        break
      end
    end

    lua_util.debugm(N, log_obj, "csv content is sane: %s fields; %s lines checked",
        expected_commas, matched_lines)

    return true
  end
  --- Heuristic for generic mime parts: only archives are examined.
  -- @param part mime part
  -- @param log_obj object used for debug logging
  -- @param _ unused
  -- @return result of detect_archive_flaw for archive parts, nil otherwise
  exports.mime_part_heuristic = function(part, log_obj, _)
    if not part:is_archive() then
      return nil
    end

    return detect_archive_flaw(part, part:get_archive(), log_obj)
  end
  --- Heuristic that decides whether a part is some kind of text.
  -- Samples the content to see if it is mostly printable, then scores the
  -- first (up to) 4096 bytes against `txt_trie` and finally falls back to the
  -- announced content type and the file name.
  -- @param part mime part
  -- @param log_obj object passed to `lua_util.debugm` for debug logging
  -- @param _ unused
  -- @return extension and weight when something is detected; false when the
  -- parent part is multipart/encrypted; nil otherwise
  exports.text_part_heuristic = function(part, log_obj, _)
    -- We get some span of data and check it
    local function is_span_text(span)
      -- We examine 8 bit content, and we assume it might be localized text
      -- if it has more than 3 subsequent 8 bit characters
      -- Returns `ok, nskip`: whether the bytes at `idx` look like text and
      -- how many extra bytes the caller may skip
      local function rough_8bit_check(bytes, idx, remain, len)
        local b = bytes[idx]
        local n8bit = 0

        while b >= 127 and idx < len do
          -- utf8 part
          if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
              bit.band(bytes[idx + 1], 0xc0) == 0x80 then
            return true, 1
          elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
              bit.band(bytes[idx + 1], 0xc0) == 0x80 and
              bit.band(bytes[idx + 2], 0xc0) == 0x80 then
            return true, 2
          elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
              bit.band(bytes[idx + 1], 0xc0) == 0x80 and
              bit.band(bytes[idx + 2], 0xc0) == 0x80 and
              bit.band(bytes[idx + 3], 0xc0) == 0x80 then
            return true, 3
          end

          n8bit = n8bit + 1
          idx = idx + 1
          b = bytes[idx]
          remain = remain - 1
        end

        if n8bit >= 3 then
          return true, n8bit
        end

        return false, 0
      end

      -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
      local tlen = #span
      local non_printable = 0
      local bytes = span:bytes()
      local i = 1

      repeat
        local b = bytes[i]

        if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
          -- Control character other than CR, LF or TAB
          non_printable = non_printable + 1
        elseif b >= 127 then
          local c, nskip = rough_8bit_check(bytes, i, tlen - i, tlen)

          if not c then
            non_printable = non_printable + 1
          else
            i = i + nskip
          end
        end

        i = i + 1
      until i > tlen

      lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
          tlen - non_printable, non_printable, tlen)

      -- Tolerate at most 1/128 of non-printable bytes
      if non_printable / tlen > 0.0078125 then
        return false
      end

      return true
    end

    local parent = part:get_parent()

    if parent then
      local parent_type, parent_subtype = parent:get_type()

      if parent_type == 'multipart' and parent_subtype == 'encrypted' then
        -- Skip text heuristics for encrypted parts
        lua_util.debugm(N, log_obj, "text part check: parent is encrypted, not a text part")
        return false
      end
    end

    local content = part:get_content()
    local mtype, msubtype = part:get_type()
    local clen = #content
    local is_text

    if clen > 0 then
      if clen > 80 * 3 then
        -- Use chunks
        -- NOTE(review): the tail sample starts at clen - 80, so it ends one
        -- byte before the end of the content - confirm this is intended
        is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80))
      else
        is_text = is_span_text(content)
      end

      if is_text and mtype ~= 'message' then
        -- Try patterns
        local span_len = math.min(4096, clen)
        local start_span = content:span(1, span_len)
        local matches = txt_trie:match(start_span)
        local res = {}
        local fname = part:get_filename()

        if matches then
          -- Sum up weight * number of positions for every matched pattern
          for n, positions in pairs(matches) do
            local ext, weight = txt_patterns_indexes[n][1], txt_patterns_indexes[n][2][2]

            if ext then
              res[ext] = (res[ext] or 0) + weight * #positions
              lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced",
                  ext, weight * #positions, res[ext], mtype, msubtype)
            end
          end

          if res.html and res.html >= 40 then
            -- HTML has priority over something like js...
            return 'html', res.html
          end

          local ext, weight = process_top_detected(res)

          if weight then
            if weight >= 40 then
              -- Extra validation for csv extension
              if ext ~= 'csv' or validate_csv(part, content, log_obj) then
                return ext, weight
              end
            elseif fname and weight >= 20 then
              return ext, weight
            end
          end
        end

        -- Content type stuff
        if (mtype == 'text' or mtype == 'application') and
            (msubtype == 'html' or msubtype == 'xhtml+xml') then
          return 'html', 21
        end

        -- The subtype is compared against nil-tolerant checks everywhere
        -- else, so do not call a method on it unless it is present
        if msubtype and msubtype:lower() == 'csv' then
          if validate_csv(part, content, log_obj) then
            return 'csv', 40
          end
        end

        -- Extension stuff
        -- True when `file` ends with `.` followed by `ext` (case insensitive)
        -- and has at least one character in front of the dot
        local function has_extension(file, ext)
          local ext_len = ext:len()
          return file:len() > ext_len + 1
              and file:sub(-ext_len):lower() == ext
              and file:sub(-ext_len - 1, -ext_len - 1) == '.'
        end

        if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then
          return 'html', 21
        end

        if mtype ~= 'text' then
          -- Do not treat non text patterns as text
          return nil
        end

        return 'txt', 40
      end
    end
  end
  --- Weighs a pdf pattern match.
  -- Starts from weight 10, adds 30 when the match position is at most 10 and
  -- another 30 when the part's file name ends with a `pdf` extension
  -- (case insensitive).
  -- @param input unused
  -- @param log_obj unused
  -- @param pos position of the match
  -- @param part mime part, used for `part:get_filename()`
  -- @return 'pdf' and the computed weight
  exports.pdf_format_heuristic = function(input, log_obj, pos, part)
    local fname = part:get_filename() or ''
    local announced_ext = string.match(fname, '%.([^.]+)$')

    -- If we found a pattern at the beginning
    local early_bonus = 0
    if pos <= 10 then
      early_bonus = 30
    end

    -- If the announced extension is `pdf`
    local name_bonus = 0
    if announced_ext and announced_ext:lower() == 'pdf' then
      name_bonus = 30
    end

    return 'pdf', 10 + early_bonus + name_bonus
  end
  --- Confirms a PE signature match using the pointer stored in the MS-DOS header.
  -- @param input content to inspect; must support `:sub()` and `#`
  -- @param log_obj unused
  -- @param pos position of the PE signature match as reported by the caller
  -- @param part unused
  -- @return 'exe' and weight 30 when the stored pointer equals `pos`,
  -- nothing otherwise
  exports.pe_part_heuristic = function(input, log_obj, pos, part)
    if not input then
      return
    end

    -- The PE header pointer is the 4 byte field at file offset 0x3c, which
    -- is bytes 61..64 with 1-based indexing. The former range 60..64 covers
    -- five bytes with an inclusive end, so the length check below could
    -- never pass.
    -- NOTE(review): assumes input:sub() has string.sub semantics - confirm
    local pe_ptr_bin = input:sub(0x3c + 1, 0x3c + 4)
    if #pe_ptr_bin ~= 4 then
      return
    end

    -- it is an LE 32 bit integer
    local pe_ptr = rspamd_util.unpack("<I4", pe_ptr_bin)

    -- if pe header magic matches the offset, it is definitely a PE file
    -- NOTE(review): verify which offset base the caller uses for `pos`
    if pe_ptr ~= pos then
      return
    end

    return 'exe', 30
  end
  506. return exports