diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-09-09 15:01:28 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-09-09 15:01:28 +0100 |
commit | da99d2d9118c0e67e40d8e568fcda998ad329d45 (patch) | |
tree | fc86e285f41896081265a57f421377ab4fd5c30b /lualib | |
parent | ccea1d01a88980a47104b13371c2cec88f47fb8b (diff) | |
download | rspamd-da99d2d9118c0e67e40d8e568fcda998ad329d45.tar.gz rspamd-da99d2d9118c0e67e40d8e568fcda998ad329d45.zip |
[Project] Lua_magic: Add heuristics for text parts
Diffstat (limited to 'lualib')
-rw-r--r-- | lualib/lua_magic/heuristics.lua | 126 | ||||
-rw-r--r-- | lualib/lua_magic/init.lua | 12 | ||||
-rw-r--r-- | lualib/lua_magic/types.lua | 12 |
3 files changed, 136 insertions, 14 deletions
diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua index 8469fa9f8..d8c134e57 100644 --- a/lualib/lua_magic/heuristics.lua +++ b/lualib/lua_magic/heuristics.lua @@ -44,20 +44,46 @@ local msoffice_clsids = { local zip_trie local zip_patterns = { -- https://lists.oasis-open.org/archives/office/200505/msg00006.html - odt = {[[mimetypeapplication/vnd\.oasis\.opendocument.text]], - [[mimetypeapplication/vnd\.oasis.opendocument\.image]], - [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]}, - ods = {[[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]], - [[mimetypeapplication/vnd\.oasis\.opendocument.formula]], - [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]}, + odt = { + [[mimetypeapplication/vnd\.oasis\.opendocument.text]], + [[mimetypeapplication/vnd\.oasis.opendocument\.image]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]] + }, + ods = { + [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]], + [[mimetypeapplication/vnd\.oasis\.opendocument.formula]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]] + }, odp = {[[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]]}, epub = {[[epub\+zip]]} } +local txt_trie +local txt_patterns = { + html = { + [[(?i)\s*<html]], + [[(?i)\s*<\!DOCTYPE HTML]], + [[(?i)\s*<xml]], + [[(?i)\s*<body]], + [[(?i)\s*<table]], + [[(?i)\s*<a]], + [[(?i)\s*<p]], + [[(?i)\s*<div]], + [[(?i)\s*<span]], + }, + csv = { + [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+[\r\n])]] + }, + js = { + [[\s*function\s*\(]], + }, +} + -- Used to match pattern index and extension local msoffice_clsid_indexes = {} local msoffice_patterns_indexes = {} local zip_patterns_indexes = {} +local txt_patterns_indexes = {} local exports = {} @@ -102,6 +128,9 @@ local function compile_tries() -- Misc zip patterns at the initial fragment zip_trie = compile_pats(zip_patterns, zip_patterns_indexes, function(pat) return pat end) + -- Text patterns at the initial fragment + txt_trie = compile_pats(txt_patterns, txt_patterns_indexes, + function(pat) return pat end) end end @@ -271,13 +300,6 @@ local function detect_archive_flaw(part, arch, log_obj) end exports.mime_part_heuristic = function(part, log_obj) - if part:is_text() then - if part:get_text():is_html() then - return 'html',60 - else - return 'txt',60 - end - end if part:is_image() then local img = part:get_image() @@ -292,4 +314,82 @@ exports.mime_part_heuristic = function(part, log_obj) return nil end +exports.text_part_heuristic = function(part, log_obj) + -- We get some span of data and check it + local function is_span_text(span) + local function rough_utf8_check(b) + if b >= 127 then + if bit.band(b, 0xe0) == 0xc0 or bit.band(b, 0xf0) == 0xe0 or bit.band(b, 0xf8) == 0xf0 then + return true + end + return false + else + return true + end + end + + -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls + local tlen = #span + local non_printable = 0 + for _,b in ipairs(span:bytes()) do + if ((b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09)) + or (not rough_utf8_check(b)) then + non_printable = non_printable + 1 + end + end + lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total", + tlen - non_printable, non_printable, tlen) + if non_printable / tlen > 0.0625 then + return false + end + + return true + end + + local content = part:get_content() + local clen = #content + local is_text + + if clen > 0 then + if clen > 80 * 3 then + -- Use chunks + is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80)) + else + is_text = is_span_text(content) + end + + if is_text then + -- Try patterns + local span_len = math.min(160, clen) + local start_span = content:span(1, span_len) + local matches = txt_trie:match(start_span) + local res = {} + if matches then + -- Require at least 2 occurrences of those patterns + for n,positions in pairs(matches) do + local ext = txt_patterns_indexes[n] + if ext then + res[ext] = (res[ext] or 0) + 20 * #positions + lua_util.debugm(N, log_obj, "found txt pattern for %s: %s", + ext, #positions) + end + end + + if res.html and res.html >= 40 then + -- HTML has priority over something like js... + return 'html',res.html + end + + local ext,weight = process_top_detected(res) + + if weight and weight >= 40 then + return ext,weight + end + end + + return 'txt',40 + end + end +end + return exports
\ No newline at end of file diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua index e9e0297e9..27f968149 100644 --- a/lualib/lua_magic/init.lua +++ b/lualib/lua_magic/init.lua @@ -326,7 +326,17 @@ exports.detect_mime_part = function(part, log_obj) return ext,types[ext] end - return exports.detect(part:get_content(), log_obj) + ext,weight = exports.detect(part:get_content(), log_obj) + + if ext and weight and weight > 20 then + return ext,types[ext] + end + + -- Text/html and other parts + ext,weight = heuristics.text_part_heuristic(part, log_obj) + if ext and weight and weight > 20 then + return ext,types[ext] + end end -- This parameter specifies how many bytes are checked in the input diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index 3ecd0575a..93bfa6641 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -250,6 +250,18 @@ local types = { type = 'text', ct = 'text/html', }, + csv = { + type = 'text', + ct = 'text/csv', + }, + eml = { + type = 'message', + ct = 'message/rfc822', + }, + js = { + type = 'application', + ct = 'application/javascript', + }, } return types
\ No newline at end of file |