-- Ranks the keys of a `key -> weight` table from the highest weight to the
-- lowest one.
--
-- @param res table mapping a key (file extensions in this file) to a
--   comparable confidence weight
-- @return the sorted list of keys and the weight of the first (best) key;
--   a single nil if `res` has no keys
--
-- NOTE(review): the first return value is the whole sorted list, not only the
-- best key. The visible caller forwards it as-is, while its fallback path
-- returns a plain string - confirm that a table is really expected there.
local function process_detected(res)
  local extensions = lua_util.keys(res)

  if #extensions > 0 then
    table.sort(extensions, function(ex1, ex2)
      local w1, w2 = res[ex1], res[ex2]

      if w1 ~= w2 then
        return w1 > w2
      end

      -- Equal weights: order by key so that the outcome does not depend on
      -- the order in which the keys were collected nor on table.sort, which
      -- is not a stable sort
      return tostring(ex1) < tostring(ex2)
    end)

    return extensions,res[extensions[1]]
  end

  return nil
end
40 + elseif spec_type:find('vnd.oasis.opendocument.presentation') then + res.odp = 40 + elseif spec_type:find('vnd.oasis.opendocument.image') then + -- Assume image as odt + res.odt = 40 + elseif spec_type:find('vnd.oasis.opendocument.graphics') then + -- Assume image as odt + res.odt = 40 + end + end + end end end + + local ext,weight = process_detected(res) + + if weight >= 40 then + return ext,weight + end end return arch_type:lower(),40 diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index c5de552c8..299dc1924 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -133,23 +133,23 @@ local types = { -- Ole files ole = { ct = 'application/octet-stream', - type = 'msoffice' + type = 'office' }, doc = { ct = 'application/msword', - type = 'msoffice' + type = 'office' }, xls = { ct = 'application/vnd.ms-excel', - type = 'msoffice' + type = 'office' }, ppt = { ct = 'application/vnd.ms-powerpoint', - type = 'msoffice' + type = 'office' }, vsd = { ct = 'application/vnd.visio', - type = 'msoffice' + type = 'office' }, msi = { ct = 'application/x-msi', @@ -157,20 +157,33 @@ local types = { }, msg = { ct = 'application/vnd.ms-outlook', - type = 'msoffice' + type = 'office' }, -- newer office (2007+) docx = { ct = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - type = 'msoffice' + type = 'office' }, xlsx = { ct = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - type = 'msoffice' + type = 'office' }, pptx = { ct = 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - type = 'msoffice' + type = 'office' + }, + -- OpenOffice formats + odt = { + ct = 'application/vnd.oasis.opendocument.text', + type = 'office' + }, + ods = { + ct = 'application/vnd.oasis.opendocument.spreadsheet', + type = 'office' + }, + odp = { + ct = 'application/vnd.oasis.opendocument.presentation', + type = 'office' }, -- other pgp = { -- 2.39.5