summaryrefslogtreecommitdiffstats
path: root/lualib/lua_magic/heuristics.lua
diff options
context:
space:
mode:
Diffstat (limited to 'lualib/lua_magic/heuristics.lua')
-rw-r--r--lualib/lua_magic/heuristics.lua175
1 files changed, 92 insertions, 83 deletions
diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
index 98cfb0eee..b8a1b4188 100644
--- a/lualib/lua_magic/heuristics.lua
+++ b/lualib/lua_magic/heuristics.lua
@@ -28,18 +28,18 @@ local fun = require "fun"
local N = "lua_magic"
local msoffice_trie
local msoffice_patterns = {
- doc = {[[WordDocument]]},
- xls = {[[Workbook]], [[Book]]},
- ppt = {[[PowerPoint Document]], [[Current User]]},
- vsd = {[[VisioDocument]]},
+ doc = { [[WordDocument]] },
+ xls = { [[Workbook]], [[Book]] },
+ ppt = { [[PowerPoint Document]], [[Current User]] },
+ vsd = { [[VisioDocument]] },
}
local msoffice_trie_clsid
local msoffice_clsids = {
- doc = {[[0609020000000000c000000000000046]]},
- xls = {[[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]]},
- ppt = {[[108d81649b4fcf1186ea00aa00b929e8]]},
- msg = {[[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]]},
- msi = {[[84100c0000000000c000000000000046]]},
+ doc = { [[0609020000000000c000000000000046]] },
+ xls = { [[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]] },
+ ppt = { [[108d81649b4fcf1186ea00aa00b929e8]] },
+ msg = { [[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]] },
+ msi = { [[84100c0000000000c000000000000046]] },
}
local zip_trie
local zip_patterns = {
@@ -54,37 +54,37 @@ local zip_patterns = {
[[mimetypeapplication/vnd\.oasis\.opendocument\.formula]],
[[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]
},
- odp = {[[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]]},
- epub = {[[epub\+zip]]},
- asice = {[[mimetypeapplication/vnd\.etsi\.asic-e\+zipPK]]},
- asics = {[[mimetypeapplication/vnd\.etsi\.asic-s\+zipPK]]},
+ odp = { [[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]] },
+ epub = { [[epub\+zip]] },
+ asice = { [[mimetypeapplication/vnd\.etsi\.asic-e\+zipPK]] },
+ asics = { [[mimetypeapplication/vnd\.etsi\.asic-s\+zipPK]] },
}
local txt_trie
local txt_patterns = {
html = {
- {[=[(?i)<html[\s>]]=], 32},
- {[[(?i)<script\b]], 20}, -- Commonly used by spammers
- {[[<script\s+type="text\/javascript">]], 31}, -- Another spammy pattern
- {[[(?i)<\!DOCTYPE HTML\b]], 33},
- {[[(?i)<body\b]], 20},
- {[[(?i)<table\b]], 20},
- {[[(?i)<a\s]], 10},
- {[[(?i)<p\b]], 10},
- {[[(?i)<div\b]], 10},
- {[[(?i)<span\b]], 10},
+ { [=[(?i)<html[\s>]]=], 32 },
+ { [[(?i)<script\b]], 20 }, -- Commonly used by spammers
+ { [[<script\s+type="text\/javascript">]], 31 }, -- Another spammy pattern
+ { [[(?i)<\!DOCTYPE HTML\b]], 33 },
+ { [[(?i)<body\b]], 20 },
+ { [[(?i)<table\b]], 20 },
+ { [[(?i)<a\s]], 10 },
+ { [[(?i)<p\b]], 10 },
+ { [[(?i)<div\b]], 10 },
+ { [[(?i)<span\b]], 10 },
},
csv = {
- {[[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20}
+ { [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20 }
},
ics = {
- {[[^BEGIN:VCALENDAR\r?\n]], 40},
+ { [[^BEGIN:VCALENDAR\r?\n]], 40 },
},
vcf = {
- {[[^BEGIN:VCARD\r?\n]], 40},
+ { [[^BEGIN:VCARD\r?\n]], 40 },
},
xml = {
- {[[<\?xml\b.+\?>]], 31},
+ { [[<\?xml\b.+\?>]], 31 },
}
}
@@ -103,11 +103,11 @@ local function compile_tries()
rspamd_trie.flags.no_start)
local function compile_pats(patterns, indexes, transform_func, compile_flags)
local strs = {}
- for ext,pats in pairs(patterns) do
- for _,pat in ipairs(pats) do
+ for ext, pats in pairs(patterns) do
+ for _, pat in ipairs(pats) do
-- These are utf16 strings in fact...
strs[#strs + 1] = transform_func(pat)
- indexes[#indexes + 1] = {ext, pat}
+ indexes[#indexes + 1] = { ext, pat }
end
end
@@ -120,12 +120,14 @@ local function compile_tries()
return '^' ..
table.concat(
fun.totable(
- fun.map(function(c) return c .. [[\x{00}]] end,
+ fun.map(function(c)
+ return c .. [[\x{00}]]
+ end,
fun.iter(pat))))
end
local function msoffice_clsid_transform(pat)
local hex_table = {}
- for i=1,#pat,2 do
+ for i = 1, #pat, 2 do
local subc = pat:sub(i, i + 1)
hex_table[#hex_table + 1] = string.format('\\x{%s}', subc)
end
@@ -140,10 +142,14 @@ local function compile_tries()
msoffice_clsid_transform)
-- Misc zip patterns at the initial fragment
zip_trie = compile_pats(zip_patterns, zip_patterns_indexes,
- function(pat) return pat end)
+ function(pat)
+ return pat
+ end)
-- Text patterns at the initial fragment
txt_trie = compile_pats(txt_patterns, txt_patterns_indexes,
- function(pat_tbl) return pat_tbl[1] end,
+ function(pat_tbl)
+ return pat_tbl[1]
+ end,
bit.bor(rspamd_trie.flags.re,
rspamd_trie.flags.dot_all,
rspamd_trie.flags.no_start))
@@ -160,12 +166,13 @@ local function detect_ole_format(input, log_obj, _, part)
return nil
end
- local bom,sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
+ local bom, sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
if bom == 0xFFFE then
bom = '<'
else
lua_util.debugm(N, log_obj, "bom file!: %s", bom)
- bom = '>'; sec_size = bit.bswap(sec_size)
+ bom = '>';
+ sec_size = bit.bswap(sec_size)
end
if sec_size < 7 or sec_size > 31 then
@@ -194,39 +201,39 @@ local function detect_ole_format(input, log_obj, _, part)
-- Extract clsid
local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
if matches then
- for n,_ in pairs(matches) do
+ for n, _ in pairs(matches) do
if msoffice_clsid_indexes[n] then
lua_util.debugm(N, log_obj, "found valid clsid for %s",
msoffice_clsid_indexes[n][1])
- return true,msoffice_clsid_indexes[n][1]
+ return true, msoffice_clsid_indexes[n][1]
end
end
end
- return true,nil
+ return true, nil
elseif dtype == 2 then
local matches = msoffice_trie:match(input:span(offset, 64))
if matches then
- for n,_ in pairs(matches) do
+ for n, _ in pairs(matches) do
if msoffice_patterns_indexes[n] then
- return true,msoffice_patterns_indexes[n][1]
+ return true, msoffice_patterns_indexes[n][1]
end
end
end
- return true,nil
+ return true, nil
elseif dtype >= 0 and dtype < 5 then
-- Bad type
- return true,nil
+ return true, nil
end
end
- return false,nil
+ return false, nil
end
repeat
- local res,ext = process_dir_entry(directory_offset)
+ local res, ext = process_dir_entry(directory_offset)
if res and ext then
- return ext,60
+ return ext, 60
end
if not res then
@@ -247,7 +254,7 @@ local function process_top_detected(res)
return res[ex1] > res[ex2]
end)
- return extensions[1],res[extensions[1]]
+ return extensions[1], res[extensions[1]]
end
return nil
@@ -276,7 +283,7 @@ local function detect_archive_flaw(part, arch, log_obj, _)
if arch_type == 'zip' then
-- Find specific files/folders in zip file
local files = arch:get_files(100) or {}
- for _,file in ipairs(files) do
+ for _, file in ipairs(files) do
if file == '[Content_Types].xml' then
add_msoffice_confidence(10)
elseif file:sub(1, 3) == 'xl/' then
@@ -292,10 +299,10 @@ local function detect_archive_flaw(part, arch, log_obj, _)
end
end
- local ext,weight = process_top_detected(res)
+ local ext, weight = process_top_detected(res)
if weight >= 40 then
- return ext,weight
+ return ext, weight
end
-- Apply misc Zip detection logic
@@ -306,32 +313,34 @@ local function detect_archive_flaw(part, arch, log_obj, _)
local matches = zip_trie:match(start_span)
if matches then
- for n,_ in pairs(matches) do
+ for n, _ in pairs(matches) do
if zip_patterns_indexes[n] then
lua_util.debugm(N, log_obj, "found zip pattern for %s",
zip_patterns_indexes[n][1])
- return zip_patterns_indexes[n][1],40
+ return zip_patterns_indexes[n][1], 40
end
end
end
end
end
- return arch_type:lower(),40
+ return arch_type:lower(), 40
end
local csv_grammar
-- Returns a grammar that will count commas
local function get_csv_grammar()
if not csv_grammar then
- local lpeg = require'lpeg'
+ local lpeg = require 'lpeg'
- local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' +
- lpeg.C((1 - lpeg.S',\n"')^0)
+ local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P '""' / '"') ^ 0) * '"' +
+ lpeg.C((1 - lpeg.S ',\n"') ^ 0)
- csv_grammar = lpeg.Cf(lpeg.Cc(0) * field * lpeg.P( (lpeg.P(',') +
- lpeg.P('\t')) * field)^1 * (lpeg.S'\r\n' + -1),
- function(acc) return acc + 1 end)
+ csv_grammar = lpeg.Cf(lpeg.Cc(0) * field * lpeg.P((lpeg.P(',') +
+ lpeg.P('\t')) * field) ^ 1 * (lpeg.S '\r\n' + -1),
+ function(acc)
+ return acc + 1
+ end)
end
return csv_grammar
@@ -402,17 +411,17 @@ exports.text_part_heuristic = function(part, log_obj, _)
while b >= 127 and idx < len do
-- utf8 part
if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
- bit.band(bytes[idx + 1], 0xc0) == 0x80 then
- return true,1
+ bit.band(bytes[idx + 1], 0xc0) == 0x80 then
+ return true, 1
elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
- bit.band(bytes[idx + 1], 0xc0) == 0x80 and
- bit.band(bytes[idx + 2], 0xc0) == 0x80 then
- return true,2
+ bit.band(bytes[idx + 1], 0xc0) == 0x80 and
+ bit.band(bytes[idx + 2], 0xc0) == 0x80 then
+ return true, 2
elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
- bit.band(bytes[idx + 1], 0xc0) == 0x80 and
- bit.band(bytes[idx + 2], 0xc0) == 0x80 and
- bit.band(bytes[idx + 3], 0xc0) == 0x80 then
- return true,3
+ bit.band(bytes[idx + 1], 0xc0) == 0x80 and
+ bit.band(bytes[idx + 2], 0xc0) == 0x80 and
+ bit.band(bytes[idx + 3], 0xc0) == 0x80 then
+ return true, 3
end
n8bit = n8bit + 1
@@ -422,10 +431,10 @@ exports.text_part_heuristic = function(part, log_obj, _)
end
if n8bit >= 3 then
- return true,n8bit
+ return true, n8bit
end
- return false,0
+ return false, 0
end
-- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
@@ -439,7 +448,7 @@ exports.text_part_heuristic = function(part, log_obj, _)
if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
non_printable = non_printable + 1
elseif b >= 127 then
- local c,nskip = rough_8bit_check(bytes, i, tlen - i, tlen)
+ local c, nskip = rough_8bit_check(bytes, i, tlen - i, tlen)
if not c then
non_printable = non_printable + 1
@@ -462,7 +471,7 @@ exports.text_part_heuristic = function(part, log_obj, _)
local parent = part:get_parent()
if parent then
- local parent_type,parent_subtype = parent:get_type()
+ local parent_type, parent_subtype = parent:get_type()
if parent_type == 'multipart' and parent_subtype == 'encrypted' then
-- Skip text heuristics for encrypted parts
@@ -473,7 +482,7 @@ exports.text_part_heuristic = function(part, log_obj, _)
end
local content = part:get_content()
- local mtype,msubtype = part:get_type()
+ local mtype, msubtype = part:get_type()
local clen = #content
local is_text
@@ -495,8 +504,8 @@ exports.text_part_heuristic = function(part, log_obj, _)
if matches then
-- Require at least 2 occurrences of those patterns
- for n,positions in pairs(matches) do
- local ext,weight = txt_patterns_indexes[n][1], txt_patterns_indexes[n][2][2]
+ for n, positions in pairs(matches) do
+ local ext, weight = txt_patterns_indexes[n][1], txt_patterns_indexes[n][2][2]
if ext then
res[ext] = (res[ext] or 0) + weight * #positions
lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced",
@@ -504,7 +513,7 @@ exports.text_part_heuristic = function(part, log_obj, _)
end
end
- if res.html and res.html >= 40 then
+ if res.html and res.html >= 40 then
-- HTML has priority over something like js...
return 'html', res.html
end
@@ -525,7 +534,7 @@ exports.text_part_heuristic = function(part, log_obj, _)
-- Content type stuff
if (mtype == 'text' or mtype == 'application') and
- (msubtype == 'html' or msubtype == 'xhtml+xml') then
+ (msubtype == 'html' or msubtype == 'xhtml+xml') then
return 'html', 21
end
@@ -539,12 +548,12 @@ exports.text_part_heuristic = function(part, log_obj, _)
local function has_extension(file, ext)
local ext_len = ext:len()
return file:len() > ext_len + 1
- and file:sub(-ext_len):lower() == ext
- and file:sub(-ext_len - 1, -ext_len - 1) == '.'
+ and file:sub(-ext_len):lower() == ext
+ and file:sub(-ext_len - 1, -ext_len - 1) == '.'
end
if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then
- return 'html',21
+ return 'html', 21
end
if mtype ~= 'text' then
@@ -552,7 +561,7 @@ exports.text_part_heuristic = function(part, log_obj, _)
return nil
end
- return 'txt',40
+ return 'txt', 40
end
end
end
@@ -569,7 +578,7 @@ exports.pdf_format_heuristic = function(input, log_obj, pos, part)
weight = weight + 30
end
- return 'pdf',weight
+ return 'pdf', weight
end
exports.pe_part_heuristic = function(input, log_obj, pos, part)
@@ -590,7 +599,7 @@ exports.pe_part_heuristic = function(input, log_obj, pos, part)
return
end
- return 'exe',30
+ return 'exe', 30
end
return exports