123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596 |
- --[[
- Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ]]--
-
- --[[[
- -- @module lua_magic/heuristics
- -- This module contains heuristics for some specific cases
- --]]
-
- local rspamd_trie = require "rspamd_trie"
- local rspamd_util = require "rspamd_util"
- local lua_util = require "lua_util"
- local bit = require "bit"
- local fun = require "fun"
-
- local N = "lua_magic"
- local msoffice_trie
- local msoffice_patterns = {
- doc = {[[WordDocument]]},
- xls = {[[Workbook]], [[Book]]},
- ppt = {[[PowerPoint Document]], [[Current User]]},
- vsd = {[[VisioDocument]]},
- }
- local msoffice_trie_clsid
- local msoffice_clsids = {
- doc = {[[0609020000000000c000000000000046]]},
- xls = {[[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]]},
- ppt = {[[108d81649b4fcf1186ea00aa00b929e8]]},
- msg = {[[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]]},
- msi = {[[84100c0000000000c000000000000046]]},
- }
- local zip_trie
- local zip_patterns = {
- -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
- odt = {
- [[mimetypeapplication/vnd\.oasis\.opendocument\.text]],
- [[mimetypeapplication/vnd\.oasis\.opendocument\.image]],
- [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]
- },
- ods = {
- [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]],
- [[mimetypeapplication/vnd\.oasis\.opendocument\.formula]],
- [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]
- },
- odp = {[[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]]},
- epub = {[[epub\+zip]]},
- asice = {[[mimetypeapplication/vnd\.etsi\.asic-e\+zipPK]]},
- asics = {[[mimetypeapplication/vnd\.etsi\.asic-s\+zipPK]]},
- }
-
- local txt_trie
- local txt_patterns = {
- html = {
- {[[(?i)<html\b]], 32},
- {[[(?i)<script\b]], 20}, -- Commonly used by spammers
- {[[<script\s+type="text\/javascript">]], 31}, -- Another spammy pattern
- {[[(?i)<\!DOCTYPE HTML\b]], 33},
- {[[(?i)<body\b]], 20},
- {[[(?i)<table\b]], 20},
- {[[(?i)<a\b]], 10},
- {[[(?i)<p\b]], 10},
- {[[(?i)<div\b]], 10},
- {[[(?i)<span\b]], 10},
- },
- csv = {
- {[[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20}
- },
- ics = {
- {[[^BEGIN:VCALENDAR\r?\n]], 40},
- },
- vcf = {
- {[[^BEGIN:VCARD\r?\n]], 40},
- },
- xml = {
- {[[<\?xml\b.+\?>]], 31},
- }
- }
-
- -- Used to match pattern index and extension
- local msoffice_clsid_indexes = {}
- local msoffice_patterns_indexes = {}
- local zip_patterns_indexes = {}
- local txt_patterns_indexes = {}
-
- local exports = {}
-
- local function compile_tries()
- local default_compile_flags = bit.bor(rspamd_trie.flags.re,
- rspamd_trie.flags.dot_all,
- rspamd_trie.flags.single_match,
- rspamd_trie.flags.no_start)
- local function compile_pats(patterns, indexes, transform_func, compile_flags)
- local strs = {}
- for ext,pats in pairs(patterns) do
- for _,pat in ipairs(pats) do
- -- These are utf16 strings in fact...
- strs[#strs + 1] = transform_func(pat)
- indexes[#indexes + 1] = {ext, pat}
- end
- end
-
- return rspamd_trie.create(strs, compile_flags or default_compile_flags)
- end
-
- if not msoffice_trie then
- -- Directory names
- local function msoffice_pattern_transform(pat)
- return '^' ..
- table.concat(
- fun.totable(
- fun.map(function(c) return c .. [[\x{00}]] end,
- fun.iter(pat))))
- end
- local function msoffice_clsid_transform(pat)
- local hex_table = {}
- for i=1,#pat,2 do
- local subc = pat:sub(i, i + 1)
- hex_table[#hex_table + 1] = string.format('\\x{%s}', subc)
- end
-
- return '^' .. table.concat(hex_table) .. '$'
- end
- -- Directory entries
- msoffice_trie = compile_pats(msoffice_patterns, msoffice_patterns_indexes,
- msoffice_pattern_transform)
- -- Clsids
- msoffice_trie_clsid = compile_pats(msoffice_clsids, msoffice_clsid_indexes,
- msoffice_clsid_transform)
- -- Misc zip patterns at the initial fragment
- zip_trie = compile_pats(zip_patterns, zip_patterns_indexes,
- function(pat) return pat end)
- -- Text patterns at the initial fragment
- txt_trie = compile_pats(txt_patterns, txt_patterns_indexes,
- function(pat_tbl) return pat_tbl[1] end,
- bit.bor(rspamd_trie.flags.re,
- rspamd_trie.flags.dot_all,
- rspamd_trie.flags.no_start))
- end
- end
-
- -- Call immediately on require
- compile_tries()
-
- local function detect_ole_format(input, log_obj, _, part)
- local inplen = #input
- if inplen < 0x31 + 4 then
- lua_util.debugm(N, log_obj, "short length: %s", inplen)
- return nil
- end
-
- local bom,sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
- if bom == 0xFFFE then
- bom = '<'
- else
- lua_util.debugm(N, log_obj, "bom file!: %s", bom)
- bom = '>'; sec_size = bit.bswap(sec_size)
- end
-
- if sec_size < 7 or sec_size > 31 then
- lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size)
- return nil
- end
-
- sec_size = 2 ^ sec_size
-
- -- SecID of first sector of the directory stream
- local directory_offset = (rspamd_util.unpack(bom .. 'I4', input:span(0x31, 4)))
- * sec_size + 512 + 1
- lua_util.debugm(N, log_obj, "directory: %s", directory_offset)
-
- if inplen < directory_offset then
- lua_util.debugm(N, log_obj, "short length: %s", inplen)
- return nil
- end
-
- local function process_dir_entry(offset)
- local dtype = input:byte(offset + 66)
- lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset)
-
- if dtype then
- if dtype == 5 then
- -- Extract clsid
- local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
- if matches then
- for n,_ in pairs(matches) do
- if msoffice_clsid_indexes[n] then
- lua_util.debugm(N, log_obj, "found valid clsid for %s",
- msoffice_clsid_indexes[n][1])
- return true,msoffice_clsid_indexes[n][1]
- end
- end
- end
- return true,nil
- elseif dtype == 2 then
- local matches = msoffice_trie:match(input:span(offset, 64))
- if matches then
- for n,_ in pairs(matches) do
- if msoffice_patterns_indexes[n] then
- return true,msoffice_patterns_indexes[n][1]
- end
- end
- end
- return true,nil
- elseif dtype >= 0 and dtype < 5 then
- -- Bad type
- return true,nil
- end
- end
-
- return false,nil
- end
-
- repeat
- local res,ext = process_dir_entry(directory_offset)
-
- if res and ext then
- return ext,60
- end
-
- if not res then
- break
- end
-
- directory_offset = directory_offset + 128
- until directory_offset >= inplen
- end
-
- exports.ole_format_heuristic = detect_ole_format
-
- local function process_top_detected(res)
- local extensions = lua_util.keys(res)
-
- if #extensions > 0 then
- table.sort(extensions, function(ex1, ex2)
- return res[ex1] > res[ex2]
- end)
-
- return extensions[1],res[extensions[1]]
- end
-
- return nil
- end
-
- local function detect_archive_flaw(part, arch, log_obj, _)
- local arch_type = arch:get_type()
- local res = {
- docx = 0,
- xlsx = 0,
- pptx = 0,
- jar = 0,
- odt = 0,
- odp = 0,
- ods = 0,
- apk = 0,
- } -- ext + confidence pairs
-
- -- General msoffice patterns
- local function add_msoffice_confidence(incr)
- res.docx = res.docx + incr
- res.xlsx = res.xlsx + incr
- res.pptx = res.pptx + incr
- end
-
- if arch_type == 'zip' then
- -- Find specific files/folders in zip file
- local files = arch:get_files(100) or {}
- for _,file in ipairs(files) do
- if file == '[Content_Types].xml' then
- add_msoffice_confidence(10)
- elseif file:sub(1, 3) == 'xl/' then
- res.xlsx = res.xlsx + 30
- elseif file:sub(1, 5) == 'word/' then
- res.docx = res.docx + 30
- elseif file:sub(1, 4) == 'ppt/' then
- res.pptx = res.pptx + 30
- elseif file == 'META-INF/MANIFEST.MF' then
- res.jar = res.jar + 40
- elseif file == 'AndroidManifest.xml' then
- res.apk = res.apk + 60
- end
- end
-
- local ext,weight = process_top_detected(res)
-
- if weight >= 40 then
- return ext,weight
- end
-
- -- Apply misc Zip detection logic
- local content = part:get_content()
-
- if #content > 128 then
- local start_span = content:span(1, 128)
-
- local matches = zip_trie:match(start_span)
- if matches then
- for n,_ in pairs(matches) do
- if zip_patterns_indexes[n] then
- lua_util.debugm(N, log_obj, "found zip pattern for %s",
- zip_patterns_indexes[n][1])
- return zip_patterns_indexes[n][1],40
- end
- end
- end
- end
- end
-
- return arch_type:lower(),40
- end
-
- local csv_grammar
- -- Returns a grammar that will count commas
- local function get_csv_grammar()
- if not csv_grammar then
- local lpeg = require'lpeg'
-
- local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' +
- lpeg.C((1 - lpeg.S',\n"')^0)
-
- csv_grammar = lpeg.Cf(lpeg.Cc(0) * field * lpeg.P( (lpeg.P(',') +
- lpeg.P('\t')) * field)^1 * (lpeg.S'\r\n' + -1),
- function(acc) return acc + 1 end)
- end
-
- return csv_grammar
- end
- local function validate_csv(part, content, log_obj)
- local max_chunk = 32768
- local chunk = content:sub(1, max_chunk)
-
- local expected_commas
- local matched_lines = 0
- local max_matched_lines = 10
-
- lua_util.debugm(N, log_obj, "check for csv pattern")
-
- for s in chunk:lines() do
- local ncommas = get_csv_grammar():match(s)
-
- if not ncommas then
- lua_util.debugm(N, log_obj, "not a csv line at line number %s",
- matched_lines)
- return false
- end
-
- if expected_commas and ncommas ~= expected_commas then
- -- Mismatched commas
- lua_util.debugm(N, log_obj, "missmatched commas on line %s: %s != %s",
- matched_lines, ncommas, expected_commas)
- return false
- elseif not expected_commas then
- if ncommas == 0 then
- lua_util.debugm(N, log_obj, "no commas in the first line")
- return false
- end
- expected_commas = ncommas
- end
-
- matched_lines = matched_lines + 1
-
- if matched_lines > max_matched_lines then
- break
- end
- end
-
- lua_util.debugm(N, log_obj, "csv content is sane: %s fields; %s lines checked",
- expected_commas, matched_lines)
-
- return true
- end
-
- exports.mime_part_heuristic = function(part, log_obj, _)
- if part:is_archive() then
- local arch = part:get_archive()
- return detect_archive_flaw(part, arch, log_obj)
- end
-
- return nil
- end
-
- exports.text_part_heuristic = function(part, log_obj, _)
- -- We get some span of data and check it
- local function is_span_text(span)
- -- We examine 8 bit content, and we assume it might be localized text
- -- if it has more than 3 subsequent 8 bit characters
- local function rough_8bit_check(bytes, idx, remain, len)
- local b = bytes[idx]
- local n8bit = 0
-
- while b >= 127 and idx < len do
- -- utf8 part
- if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
- bit.band(bytes[idx + 1], 0xc0) == 0x80 then
- return true,1
- elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
- bit.band(bytes[idx + 1], 0xc0) == 0x80 and
- bit.band(bytes[idx + 2], 0xc0) == 0x80 then
- return true,2
- elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
- bit.band(bytes[idx + 1], 0xc0) == 0x80 and
- bit.band(bytes[idx + 2], 0xc0) == 0x80 and
- bit.band(bytes[idx + 3], 0xc0) == 0x80 then
- return true,3
- end
-
- n8bit = n8bit + 1
- idx = idx + 1
- b = bytes[idx]
- remain = remain - 1
- end
-
- if n8bit >= 3 then
- return true,n8bit
- end
-
- return false,0
- end
-
- -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
- local tlen = #span
- local non_printable = 0
- local bytes = span:bytes()
- local i = 1
- repeat
- local b = bytes[i]
-
- if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
- non_printable = non_printable + 1
- elseif b >= 127 then
- local c,nskip = rough_8bit_check(bytes, i, tlen - i, tlen)
-
- if not c then
- non_printable = non_printable + 1
- else
- i = i + nskip
- end
- end
- i = i + 1
- until i > tlen
-
- lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
- tlen - non_printable, non_printable, tlen)
- if non_printable / tlen > 0.0078125 then
- return false
- end
-
- return true
- end
-
- local parent = part:get_parent()
-
- if parent then
- local parent_type,parent_subtype = parent:get_type()
-
- if parent_type == 'multipart' and parent_subtype == 'encrypted' then
- -- Skip text heuristics for encrypted parts
- lua_util.debugm(N, log_obj, "text part check: parent is encrypted, not a text part")
-
- return false
- end
- end
-
- local content = part:get_content()
- local mtype,msubtype = part:get_type()
- local clen = #content
- local is_text
-
- if clen > 0 then
- if clen > 80 * 3 then
- -- Use chunks
- is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80))
- else
- is_text = is_span_text(content)
- end
-
- if is_text and mtype ~= 'message' then
- -- Try patterns
- local span_len = math.min(4096, clen)
- local start_span = content:span(1, span_len)
- local matches = txt_trie:match(start_span)
- local res = {}
- local fname = part:get_filename()
-
- if matches then
- -- Require at least 2 occurrences of those patterns
- for n,positions in pairs(matches) do
- local ext,weight = txt_patterns_indexes[n][1], txt_patterns_indexes[n][2][2]
- if ext then
- res[ext] = (res[ext] or 0) + weight * #positions
- lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced",
- ext, weight * #positions, res[ext], mtype, msubtype)
- end
- end
-
- if res.html and res.html >= 40 then
- -- HTML has priority over something like js...
- return 'html', res.html
- end
-
- local ext, weight = process_top_detected(res)
-
- if weight then
- if weight >= 40 then
- -- Extra validation for csv extension
- if ext ~= 'csv' or validate_csv(part, content, log_obj) then
- return ext, weight
- end
- elseif fname and weight >= 20 then
- return ext, weight
- end
- end
- end
-
- -- Content type stuff
- if (mtype == 'text' or mtype == 'application') and
- (msubtype == 'html' or msubtype == 'xhtml+xml') then
- return 'html', 21
- end
-
- if msubtype:lower() == 'csv' then
- if validate_csv(part, content, log_obj) then
- return 'csv', 40
- end
- end
-
- -- Extension stuff
- local function has_extension(file, ext)
- local ext_len = ext:len()
- return file:len() > ext_len + 1
- and file:sub(-ext_len):lower() == ext
- and file:sub(-ext_len - 1, -ext_len - 1) == '.'
- end
-
- if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then
- return 'html',21
- end
-
- if mtype ~= 'text' then
- -- Do not treat non text patterns as text
- return nil
- end
-
- return 'txt',40
- end
- end
- end
-
- exports.pdf_format_heuristic = function(input, log_obj, pos, part)
- local weight = 10
- local ext = string.match(part:get_filename() or '', '%.([^.]+)$')
- -- If we found a pattern at the beginning
- if pos <= 10 then
- weight = weight + 30
- end
- -- If the announced extension is `pdf`
- if ext and ext:lower() == 'pdf' then
- weight = weight + 30
- end
-
- return 'pdf',weight
- end
-
- exports.pe_part_heuristic = function(input, log_obj, pos, part)
- if not input then
- return
- end
-
- -- pe header should start at the offset that is placed in msdos header at position 60..64
- local pe_ptr_bin = input:sub(60, 64)
- if #pe_ptr_bin ~= 4 then
- return
- end
-
- -- it is an LE 32 bit integer
- local pe_ptr = rspamd_util.unpack("<I4", pe_ptr_bin)
- -- if pe header magic matches the offset, it is definitely a PE file
- if pe_ptr ~= pos then
- return
- end
-
- return 'exe',30
- end
-
- return exports
|