From: Vsevolod Stakhov Date: Mon, 9 Sep 2019 14:01:28 +0000 (+0100) Subject: [Project] Lua_magic: Add heuristics for text parts X-Git-Tag: 2.0~240 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=da99d2d9118c0e67e40d8e568fcda998ad329d45;p=rspamd.git [Project] Lua_magic: Add heuristics for text parts --- diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua index 8469fa9f8..d8c134e57 100644 --- a/lualib/lua_magic/heuristics.lua +++ b/lualib/lua_magic/heuristics.lua @@ -44,20 +44,46 @@ local msoffice_clsids = { local zip_trie local zip_patterns = { -- https://lists.oasis-open.org/archives/office/200505/msg00006.html - odt = {[[mimetypeapplication/vnd\.oasis\.opendocument.text]], - [[mimetypeapplication/vnd\.oasis.opendocument\.image]], - [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]]}, - ods = {[[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]], - [[mimetypeapplication/vnd\.oasis\.opendocument.formula]], - [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]]}, + odt = { + [[mimetypeapplication/vnd\.oasis\.opendocument.text]], + [[mimetypeapplication/vnd\.oasis.opendocument\.image]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]] + }, + ods = { + [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]], + [[mimetypeapplication/vnd\.oasis\.opendocument.formula]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]] + }, odp = {[[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]]}, epub = {[[epub\+zip]]} } +local txt_trie +local txt_patterns = { + html = { + [[(?i)\s*= 127 then + if bit.band(b, 0xe0) == 0xc0 or bit.band(b, 0xf0) == 0xe0 or bit.band(b, 0xf8) == 0xf0 then + return true + end + return false + else + return true + end + end + + -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls + local tlen = #span + local non_printable = 0 + for _,b in ipairs(span:bytes()) do + if ((b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09)) + or (not rough_utf8_check(b)) then + non_printable = non_printable + 1 + end + end + lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total", + tlen - non_printable, non_printable, tlen) + if non_printable / tlen > 0.0625 then + return false + end + + return true + end + + local content = part:get_content() + local clen = #content + local is_text + + if clen > 0 then + if clen > 80 * 3 then + -- Use chunks + is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80)) + else + is_text = is_span_text(content) + end + + if is_text then + -- Try patterns + local span_len = math.min(160, clen) + local start_span = content:span(1, span_len) + local matches = txt_trie:match(start_span) + local res = {} + if matches then + -- Require at least 2 occurrences of those patterns + for n,positions in pairs(matches) do + local ext = txt_patterns_indexes[n] + if ext then + res[ext] = (res[ext] or 0) + 20 * #positions + lua_util.debugm(N, log_obj, "found txt pattern for %s: %s", + ext, #positions) + end + end + + if res.html and res.html >= 40 then + -- HTML has priority over something like js... + return 'html',res.html + end + + local ext,weight = process_top_detected(res) + + if weight and weight >= 40 then + return ext,weight + end + end + + return 'txt',40 + end + end +end + return exports \ No newline at end of file diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua index e9e0297e9..27f968149 100644 --- a/lualib/lua_magic/init.lua +++ b/lualib/lua_magic/init.lua @@ -326,7 +326,17 @@ exports.detect_mime_part = function(part, log_obj) return ext,types[ext] end - return exports.detect(part:get_content(), log_obj) + ext,weight = exports.detect(part:get_content(), log_obj) + + if ext and weight and weight > 20 then + return ext,types[ext] + end + + -- Text/html and other parts + ext,weight = heuristics.text_part_heuristic(part, log_obj) + if ext and weight and weight > 20 then + return ext,types[ext] + end end -- This parameter specifies how many bytes are checked in the input diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index 3ecd0575a..93bfa6641 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -250,6 +250,18 @@ local types = { type = 'text', ct = 'text/html', }, + csv = { + type = 'text', + ct = 'text/csv', + }, + eml = { + type = 'message', + ct = 'message/rfc822', + }, + js = { + type = 'application', + ct = 'application/javascript', + }, } return types \ No newline at end of file