From 7cf7a889a60efb651fdf2062b1773ad17c9eec7f Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 7 Sep 2019 16:37:58 +0100 Subject: [Project] Lua_magic: Add mime parts detection function --- lualib/lua_magic/heuristics.lua | 26 ++++++++++++++++++++++++ lualib/lua_magic/init.lua | 11 +++++++++++ lualib/lua_magic/types.lua | 44 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) (limited to 'lualib') diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua index d60c87162..6a407f5e9 100644 --- a/lualib/lua_magic/heuristics.lua +++ b/lualib/lua_magic/heuristics.lua @@ -52,6 +52,7 @@ local function compile_msoffice_trie(log_obj) local strs = {} for ext,pats in pairs(msoffice_patterns) do for _,pat in ipairs(pats) do + -- These are utf16 strings in fact... strs[#strs + 1] = '^' .. table.concat( fun.totable( @@ -66,6 +67,7 @@ local function compile_msoffice_trie(log_obj) strs = {} for ext,pats in pairs(msoffice_clsids) do for _,pat in ipairs(pats) do + -- Convert hex to re local hex_table = {} for i=1,#pat,2 do local subc = pat:sub(i, i + 1) @@ -163,6 +165,30 @@ local function detect_ole_format(input, log_obj) until directory_offset >= inplen end + exports.ole_format_heuristic = detect_ole_format +exports.mime_part_heuristic = function(part) + if part:is_text() then + if part:get_text():is_html() then + return 'html',60 + else + return 'txt',60 + end + end + + if part:is_image() then + local img = part:get_image() + return img:get_type():lower(),60 + end + + if part:is_archive() then + local arch = part:get_archive() + -- TODO: add files heuristics + return arch:get_type():lower(),60 + end + + return nil +end + return exports \ No newline at end of file diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua index 59e2a6e36..8b5064bfe 100644 --- a/lualib/lua_magic/init.lua +++ b/lualib/lua_magic/init.lua @@ -21,6 +21,7 @@ limitations under the License. 
local patterns = require "lua_magic/patterns" local types = require "lua_magic/types" +local heuristics = require "lua_magic/heuristics" local fun = require "fun" local lua_util = require "lua_util" @@ -317,6 +318,16 @@ exports.detect = function(input, log_obj) return nil end +exports.detect_mime_part = function(part, log_obj) + local ext,weight = heuristics.mime_part_heuristic(part) + + if ext and weight and weight > 20 then + return ext,types[ext] + end + + return exports.detect(part:get_content(), log_obj) +end + -- This parameter specifies how many bytes are checked in the input -- Rspamd checks 2 chunks at start and 1 chunk at the end exports.chunk_size = 32768 diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index 8255af663..c8850cd18 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -168,6 +168,50 @@ local types = { ct = 'application/x-uuencoded', type = 'binary', }, + -- Types that are detected by Rspamd itself + -- Archives + zip = { + ct = 'application/zip', + type = 'archive', + }, + rar = { + ct = 'application/x-rar', + type = 'archive', + }, + ['7z'] = { + ct = 'application/x-7z-compressed', + type = 'archive', + }, + gz = { + ct = 'application/gzip', + type = 'archive', + }, + -- Images + png = { + ct = 'image/png', + type = 'image', + }, + gif = { + ct = 'image/gif', + type = 'image', + }, + jpg = { + ct = 'image/jpeg', + type = 'image', + }, + bmp = { + type = 'image', + ct = 'image/bmp', + }, + -- Text + txt = { + type = 'text', + ct = 'text/plain', + }, + html = { + type = 'text', + ct = 'text/html', + }, } return types \ No newline at end of file -- cgit v1.2.3