Ver código fonte

[Project] Lua_magic: Add heuristics for Office 2007+

tags/2.0
Vsevolod Stakhov 4 anos atrás
pai
commit
eb120f830e
2 arquivos alterados com 49 adições e 4 exclusões
  1. 35
    3
      lualib/lua_magic/heuristics.lua
  2. 14
    1
      lualib/lua_magic/types.lua

+ 35
- 3
lualib/lua_magic/heuristics.lua Ver arquivo

@@ -165,9 +165,42 @@ local function detect_ole_format(input, log_obj)
until directory_offset >= inplen
end


-- Publish the OLE detector on the module's export table as `ole_format_heuristic`
exports.ole_format_heuristic = detect_ole_format

--- Refine the detected type of an archive by looking at the files it contains.
-- For zip archives, well-known Office 2007+ entries raise a per-extension
-- confidence score; the best-scoring extension is returned once its score
-- reaches the weight of the generic fallback. Otherwise the lower-cased
-- archive type is returned.
-- @param part mime part owning the archive (not used by this function)
-- @param arch archive object; must provide `get_type()` and `get_files()`
-- @return extension (string) and its confidence weight (number)
local function detect_archive_flaw(part, arch)
  local arch_type = arch:get_type()
  -- Weight reported for the plain archive type; a specific extension must
  -- score at least this much to be preferred over it
  local fallback_weight = 40
  local res = {
    docx = 0,
    xlsx = 0,
    pptx = 0,
    jar = 0,
  } -- ext + confidence pairs
  -- Fixed evaluation order: `pairs(res)` has no defined order, so ties would
  -- otherwise be resolved differently from run to run
  local candidates = { 'docx', 'xlsx', 'pptx', 'jar' }
  -- Top-level folders that identify a specific Office document kind
  local dir_markers = {
    { prefix = 'xl/', ext = 'xlsx' },
    { prefix = 'word/', ext = 'docx' },
    { prefix = 'ppt/', ext = 'pptx' },
  }

  -- General msoffice patterns
  local function add_msoffice_confidence(incr)
    res.docx = res.docx + incr
    res.xlsx = res.xlsx + incr
    res.pptx = res.pptx + incr
  end

  if arch_type == 'zip' then
    -- Find specific files/folders in zip file
    local files = arch:get_files() or {}
    -- Each marker contributes once, so the score does not grow with the
    -- number of files stored under the same folder
    local seen = {}

    for _, file in ipairs(files) do
      if file == '[Content_Types].xml' then
        if not seen.content_types then
          seen.content_types = true
          add_msoffice_confidence(10)
        end
      else
        for _, marker in ipairs(dir_markers) do
          -- Prefix match: many zip writers store `word/document.xml` without
          -- a separate `word/` directory entry
          if file:sub(1, #marker.prefix) == marker.prefix then
            if not seen[marker.ext] then
              seen[marker.ext] = true
              res[marker.ext] = res[marker.ext] + 30
            end
            break
          end
        end
      end
    end

    local best_ext, best_weight = nil, 0
    for _, ext in ipairs(candidates) do
      if res[ext] > best_weight then
        best_ext, best_weight = ext, res[ext]
      end
    end

    if best_ext and best_weight >= fallback_weight then
      return best_ext, best_weight
    end
  end

  return arch_type:lower(), fallback_weight
end
exports.mime_part_heuristic = function(part)
if part:is_text() then
if part:get_text():is_html() then
@@ -184,8 +217,7 @@ exports.mime_part_heuristic = function(part)

if part:is_archive() then
local arch = part:get_archive()
-- TODO: add files heuristics
return arch:get_type():lower(),60
return detect_archive_flaw(part, arch)
end

return nil

+ 14
- 1
lualib/lua_magic/types.lua Ver arquivo

@@ -157,7 +157,20 @@ local types = {
},
msg = {
ct = 'application/vnd.ms-outlook',
type = 'executable'
type = 'msoffice'
},
-- newer office (2007+)
docx = {
ct = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
type = 'msoffice'
},
xlsx = {
ct = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
type = 'msoffice'
},
pptx = {
ct = 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
type = 'msoffice'
},
-- other
pgp = {

Carregando…
Cancelar
Salvar