You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

heuristics.lua 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. --[[
  2. Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_magic/heuristics
  15. -- This module contains heuristics for some specific cases
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local lua_util = require "lua_util"
  20. local bit = require "bit"
  21. local fun = require "fun"
  22. local N = "lua_magic"
  -- Both tries are filled in by compile_tries()
  local msoffice_trie, msoffice_trie_clsid

  -- Names looked up in OLE directory entries; the key is the extension
  -- reported when one of its names matches.
  local msoffice_patterns = {
    doc = { 'WordDocument' },
    xls = { 'Workbook', 'Book' },
    ppt = { 'PowerPoint Document', 'Current User' },
    vsd = { 'VisioDocument' },
  }

  -- Hex encoded clsids (two hex digits per byte) compared with the 16 clsid
  -- bytes of a directory entry; the key is the extension reported on match.
  local msoffice_clsids = {
    doc = { '0609020000000000c000000000000046' },
    xls = {
      '1008020000000000c000000000000046',
      '2008020000000000c000000000000046',
    },
    ppt = { '108d81649b4fcf1186ea00aa00b929e8' },
    msg = {
      '46f0060000000000c000000000000046',
      '0b0d020000000000c000000000000046',
    },
    msi = { '84100c0000000000c000000000000046' },
  }
  -- Trie over zip_patterns; filled in by compile_tries()
  local zip_trie
  -- Regexps searched in the leading bytes of a zip container; the key is the
  -- extension reported on match. They are compiled in regexp mode, so every
  -- literal dot has to be escaped: an unescaped `.` matches any byte.
  local zip_patterns = {
    -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
    odt = {
      [[mimetypeapplication/vnd\.oasis\.opendocument\.text]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.image]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]
    },
    ods = {
      [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.formula]],
      [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]
    },
    odp = {[[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]]},
    epub = {[[epub\+zip]]}
  }
  -- Trie over txt_patterns; filled in by compile_tries()
  local txt_trie

  -- Builds a {regexp, weight} pair for a case insensitive `<tag` opener that
  -- may be preceded by whitespace.
  local function html_tag_pattern(tag, weight)
    return { [[(?i)\s*<]] .. tag .. [[\b]], weight }
  end

  -- Text patterns as {regexp, weight} pairs, keyed by the extension they vote
  -- for. The order inside each list is kept as pattern numbering relies on it.
  local txt_patterns = {
    html = {
      html_tag_pattern('html', 30),
      html_tag_pattern('script', 20), -- Commonly used by spammers
      html_tag_pattern([[\!DOCTYPE HTML]], 30),
      html_tag_pattern('xml', 20),
      html_tag_pattern('body', 20),
      html_tag_pattern('table', 20),
      html_tag_pattern('a', 10),
      html_tag_pattern('p', 10),
      html_tag_pattern('div', 10),
      html_tag_pattern('span', 10),
    },
    csv = {
      { [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20 },
    },
    ics = {
      { [[^BEGIN:VCALENDAR\r?\n]], 40 },
    },
    vcf = {
      { [[^BEGIN:VCARD\r?\n]], 40 },
    },
  }
  -- Per-trie lists that map a pattern number back to {extension, pattern};
  -- they start empty and are filled while the tries are compiled.
  local msoffice_clsid_indexes, msoffice_patterns_indexes = {}, {}
  local zip_patterns_indexes, txt_patterns_indexes = {}, {}
  -- Module table returned at the end of the file
  local exports = {}
  --- Compiles every trie used by this module from its pattern table.
  -- While compiling, the matching `*_indexes` list receives one
  -- `{extension, pattern}` entry per compiled regexp, in the same order as the
  -- regexps are handed to the trie. Does nothing once `msoffice_trie` is set.
  local function compile_tries()
    if msoffice_trie then
      return
    end

    local flags = rspamd_trie.flags
    local default_compile_flags = bit.bor(flags.re, flags.dot_all,
        flags.single_match, flags.no_start)

    -- Turns each pattern into a regexp via `transform_func`, records its
    -- origin in `indexes` and builds a trie from the resulting list.
    local function compile_pats(patterns, indexes, transform_func, compile_flags)
      local regexps = {}
      for ext, ext_patterns in pairs(patterns) do
        for _, pat in ipairs(ext_patterns) do
          table.insert(regexps, transform_func(pat))
          table.insert(indexes, {ext, pat})
        end
      end
      return rspamd_trie.create(regexps, compile_flags or default_compile_flags)
    end

    -- Directory names: these are utf16 strings in fact, so every character
    -- is followed by a zero byte and the regexp is anchored at the start.
    local function msoffice_pattern_transform(pat)
      local wide_chars = fun.totable(
          fun.map(function(c) return c .. [[\x{00}]] end, fun.iter(pat)))
      return '^' .. table.concat(wide_chars)
    end

    -- Clsids: each pair of hex digits becomes one `\x{..}` escape and the
    -- whole regexp is anchored on both sides.
    local function msoffice_clsid_transform(pat)
      local escaped = pat:gsub('..?', '\\x{%0}')
      return '^' .. escaped .. '$'
    end

    local function identity(pat)
      return pat
    end

    local function first_field(pat_tbl)
      return pat_tbl[1]
    end

    -- Directory entries
    msoffice_trie = compile_pats(msoffice_patterns, msoffice_patterns_indexes,
        msoffice_pattern_transform)
    -- Clsids
    msoffice_trie_clsid = compile_pats(msoffice_clsids, msoffice_clsid_indexes,
        msoffice_clsid_transform)
    -- Misc zip patterns at the initial fragment
    zip_trie = compile_pats(zip_patterns, zip_patterns_indexes, identity)
    -- Text patterns at the initial fragment; compiled without single_match
    txt_trie = compile_pats(txt_patterns, txt_patterns_indexes, first_field,
        bit.bor(flags.re, flags.dot_all, flags.no_start))
  end

  -- Call immediately on require
  compile_tries()
  --- Heuristic for OLE compound files.
  -- Reads the byte order mark and the sector size exponent from the header,
  -- computes where the first directory sector starts and walks the 128 byte
  -- directory entries from there. Entries of type 5 are checked by clsid,
  -- entries of type 2 by name.
  -- @param input text object with the content; must support `#`, `:span()`
  --   and `:at()`
  -- @param log_obj logging context handed to `lua_util.debugm`
  -- @param _ unused
  -- @param part unused
  -- @return extension and the weight 60 when an entry is recognised;
  --   nil (or nothing) otherwise
  local function detect_ole_format(input, log_obj, _, part)
    local inplen = #input
    if inplen < 0x31 + 4 then
      lua_util.debugm(N, log_obj, "short length: %s", inplen)
      return nil
    end

    local bom,sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
    -- Byte order prefix used for the remaining unpack calls
    local endian
    if bom == 0xFFFE then
      endian = '<'
    else
      lua_util.debugm(N, log_obj, "bom file!: %s", bom)
      endian = '>'
      -- sec_size was read as a little endian 16 bit value. bit.bswap swaps
      -- all four bytes of a 32 bit integer, which leaves the swapped value in
      -- the upper half, so shift it back down.
      sec_size = bit.rshift(bit.bswap(sec_size), 16)
    end

    if sec_size < 7 or sec_size > 31 then
      lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size)
      return nil
    end
    sec_size = 2 ^ sec_size

    -- SecID of first sector of the directory stream
    local dir_secid = rspamd_util.unpack(endian .. 'I4', input:span(0x31, 4))
    -- presumably 512 skips the header and +1 converts to the 1-based
    -- positions used by span/at -- TODO confirm
    local directory_offset = dir_secid * sec_size + 512 + 1
    lua_util.debugm(N, log_obj, "directory: %s", directory_offset)

    if inplen < directory_offset then
      lua_util.debugm(N, log_obj, "short length: %s", inplen)
      return nil
    end

    -- Returns `continue, ext`: `continue` is false when scanning must stop,
    -- `ext` is set when the entry identifies a known format.
    local function process_dir_entry(offset)
      local dtype = input:at(offset + 66)
      lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset)

      if dtype then
        if dtype == 5 then
          if offset + 80 + 16 - 1 > inplen then
            -- Entry is cut before the end of its clsid: nothing to compare,
            -- and no further entry can follow.
            -- NOTE(review): avoids asking span() for bytes past the end;
            -- confirm how span() behaves in that case.
            return false,nil
          end
          -- Extract clsid
          local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
          if matches then
            for n,_ in pairs(matches) do
              if msoffice_clsid_indexes[n] then
                lua_util.debugm(N, log_obj, "found valid clsid for %s",
                    msoffice_clsid_indexes[n][1])
                return true,msoffice_clsid_indexes[n][1]
              end
            end
          end
          return true,nil
        elseif dtype == 2 then
          -- The name occupies the first 64 bytes of the entry
          local matches = msoffice_trie:match(input:span(offset, 64))
          if matches then
            for n,_ in pairs(matches) do
              if msoffice_patterns_indexes[n] then
                return true,msoffice_patterns_indexes[n][1]
              end
            end
          end
          return true,nil
        elseif dtype >= 0 and dtype < 5 then
          -- Type with nothing to match: keep scanning
          return true,nil
        end
      end

      -- No type byte or an unexpected value: stop scanning
      return false,nil
    end

    repeat
      local res,ext = process_dir_entry(directory_offset)

      if res and ext then
        return ext,60
      end

      if not res then
        break
      end

      directory_offset = directory_offset + 128
    until directory_offset >= inplen
  end
  exports.ole_format_heuristic = detect_ole_format
  --- Selects the extension with the highest confidence.
  -- A single pass is enough to find the maximum, so the keys are not sorted.
  -- Equal weights are resolved in favour of the lexicographically smaller
  -- extension, which makes the result independent of `pairs` ordering.
  -- @param res table mapping extension (string) to numeric weight
  -- @return extension and its weight, or nil when `res` is empty
  local function process_top_detected(res)
    local best_ext, best_weight

    for ext, weight in pairs(res) do
      if best_ext == nil or weight > best_weight or
          (weight == best_weight and ext < best_ext) then
        best_ext, best_weight = ext, weight
      end
    end

    return best_ext, best_weight
  end
  --- Guesses a more specific type for an archive part.
  -- For zip archives the names of up to 100 contained files vote for
  -- office/jar/apk types; when no vote reaches 40, the first 128 bytes of the
  -- part content are matched against the zip patterns. Anything else falls
  -- back to the lowercased archive type.
  -- @param part mime part; `get_content()` is used
  -- @param arch archive object; `get_type()` and `get_files()` are used
  -- @param log_obj logging context handed to `lua_util.debugm`
  -- @param _ unused
  -- @return extension and weight
  local function detect_archive_flaw(part, arch, log_obj, _)
    local arch_type = arch:get_type()

    if arch_type ~= 'zip' then
      return arch_type:lower(),40
    end

    -- ext + confidence pairs
    local res = {
      docx = 0, xlsx = 0, pptx = 0,
      jar = 0,
      odt = 0, odp = 0, ods = 0,
      apk = 0,
    }
    local function bump(ext, incr)
      res[ext] = res[ext] + incr
    end

    -- File names that vote for one extension when matched exactly
    local exact_names = {
      ['META-INF/MANIFEST.MF'] = {'jar', 40},
      ['AndroidManifest.xml'] = {'apk', 60},
    }
    -- Folder prefixes, each worth 30 for its extension
    local folder_prefixes = {
      {'xl/', 'xlsx'},
      {'word/', 'docx'},
      {'ppt/', 'pptx'},
    }

    -- Find specific files/folders in zip file
    for _,file in ipairs(arch:get_files(100) or {}) do
      local exact = exact_names[file]
      if file == '[Content_Types].xml' then
        -- General msoffice marker: counts for all three formats
        bump('docx', 10)
        bump('xlsx', 10)
        bump('pptx', 10)
      elseif exact then
        bump(exact[1], exact[2])
      else
        for _,entry in ipairs(folder_prefixes) do
          local prefix = entry[1]
          if file:sub(1, #prefix) == prefix then
            bump(entry[2], 30)
            break
          end
        end
      end
    end

    local ext,weight = process_top_detected(res)
    if weight >= 40 then
      return ext,weight
    end

    -- Apply misc Zip detection logic
    local content = part:get_content()
    if #content > 128 then
      local matches = zip_trie:match(content:span(1, 128))
      if matches then
        for n,_ in pairs(matches) do
          local found = zip_patterns_indexes[n]
          if found then
            lua_util.debugm(N, log_obj, "found zip pattern for %s",
                found[1])
            return found[1],40
          end
        end
      end
    end

    return arch_type:lower(),40
  end
  --- Heuristic applied to a mime part: archives are inspected by
  -- detect_archive_flaw, every other part yields nil.
  -- @param part mime part; `is_archive()` and `get_archive()` are used
  -- @param log_obj logging context forwarded to detect_archive_flaw
  -- @param _ unused
  -- @return whatever detect_archive_flaw returns, or nil
  exports.mime_part_heuristic = function(part, log_obj, _)
    if not part:is_archive() then
      return nil
    end

    return detect_archive_flaw(part, part:get_archive(), log_obj)
  end
  --- Decides whether a part looks like text and, if so, which kind.
  -- The content (or its first 160 and last 80 bytes when longer than 240
  -- bytes) must consist almost entirely of printable bytes and well formed
  -- UTF-8 sequences. The first 160 bytes are then matched against the text
  -- patterns; the content type and the file name are used as fallbacks.
  -- @param part mime part; `get_parent()`, `get_content()`, `get_type()` and
  --   `get_filename()` are used
  -- @param log_obj logging context handed to `lua_util.debugm`
  -- @param _ unused
  -- @return extension and weight when the part looks like text; false when
  --   the parent is multipart/encrypted; nothing otherwise
  exports.text_part_heuristic = function(part, log_obj, _)
    -- We get some span of data and check it
    local function is_span_text(span)
      -- Checks whether bytes[idx] starts a well formed 2..4 byte sequence.
      -- `remain` is the number of bytes that follow idx, so a sequence with
      -- k continuation bytes needs remain >= k.
      -- Returns true and the number of continuation bytes to skip, or false.
      local function rough_utf8_check(bytes, idx, remain)
        local b = bytes[idx]
        if b >= 127 then
          if bit.band(b, 0xe0) == 0xc0 and remain >= 1 and
              bit.band(bytes[idx + 1], 0xc0) == 0x80 then
            return true,1
          elseif bit.band(b, 0xf0) == 0xe0 and remain >= 2 and
              bit.band(bytes[idx + 1], 0xc0) == 0x80 and
              bit.band(bytes[idx + 2], 0xc0) == 0x80 then
            return true,2
          elseif bit.band(b, 0xf8) == 0xf0 and remain >= 3 and
              bit.band(bytes[idx + 1], 0xc0) == 0x80 and
              bit.band(bytes[idx + 2], 0xc0) == 0x80 and
              bit.band(bytes[idx + 3], 0xc0) == 0x80 then
            return true,3
          end
          return false
        else
          return true,0
        end
      end

      -- Work on the table of byte values returned by span:bytes()
      local tlen = #span
      local non_printable = 0
      local bytes = span:bytes()
      local i = 1
      repeat
        local b = bytes[i]

        if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
          -- Control character other than CR, LF or TAB
          non_printable = non_printable + 1
        elseif b >= 127 then
          local c,nskip = rough_utf8_check(bytes, i, tlen - i)

          if not c then
            non_printable = non_printable + 1
          else
            i = i + nskip
          end
        end
        i = i + 1
      until i > tlen

      lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
          tlen - non_printable, non_printable, tlen)
      -- Tolerate at most 1/128 of non-printable bytes
      if non_printable / tlen > 0.0078125 then
        return false
      end

      return true
    end

    local parent = part:get_parent()

    if parent then
      local parent_type,parent_subtype = parent:get_type()

      if parent_type == 'multipart' and parent_subtype == 'encrypted' then
        -- Skip text heuristics for encrypted parts
        lua_util.debugm(N, log_obj, "text part check: parent is encrypted, not a text part")

        return false
      end
    end

    local content = part:get_content()
    local mtype,msubtype = part:get_type()
    local clen = #content
    local is_text

    if clen > 0 then
      if clen > 80 * 3 then
        -- Use chunks: the first 160 bytes and the last 80 bytes; the tail
        -- starts at clen - 79 so that it ends on the very last byte
        is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 79, 80))
      else
        is_text = is_span_text(content)
      end

      if is_text then
        -- Try patterns
        local span_len = math.min(160, clen)
        local start_span = content:span(1, span_len)
        local matches = txt_trie:match(start_span)
        local res = {}

        if matches then
          -- Every occurrence adds the weight of its pattern; an extension is
          -- accepted once its total reaches 40
          for n,positions in pairs(matches) do
            local found = txt_patterns_indexes[n]

            if found then
              local ext,weight = found[1], found[2][2]
              res[ext] = (res[ext] or 0) + weight * #positions
              lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s",
                  ext, weight * #positions, res[ext])
            end
          end

          if res.html and res.html >= 40 then
            -- HTML has priority over something like js...
            return 'html',res.html
          end

          local ext,weight = process_top_detected(res)

          if weight and weight >= 40 then
            return ext,weight
          end
        end

        -- Content type stuff
        if (mtype == 'text' or mtype == 'application') and (msubtype == 'html' or msubtype == 'xhtml+xml') then
          return 'html',21
        end

        -- Extension stuff
        local fname = part:get_filename()
        if fname and fname:match('html?$') then
          return 'html',21
        end

        return 'txt',40
      end
    end
  end
  --- Weighs a pdf detection.
  -- Starts at 10 and adds 30 when the pattern position is within the first
  -- 10 bytes, plus another 30 when the file name ends in `.pdf` (any case).
  -- @param input unused
  -- @param log_obj unused
  -- @param pos position at which the pdf pattern was found
  -- @param part mime part; `get_filename()` is used
  -- @return 'pdf' and the computed weight
  exports.pdf_format_heuristic = function(input, log_obj, pos, part)
    local fname = part:get_filename() or ''
    local announced_ext = string.match(fname, '%.([^.]+)$')
    local at_start = pos <= 10
    local named_pdf = announced_ext and announced_ext:lower() == 'pdf'

    local weight = 10
    if at_start then
      weight = weight + 30
    end
    if named_pdf then
      weight = weight + 30
    end

    return 'pdf',weight
  end
  404. return exports