diff options
author | korgoth1 <vladislav.stakhov@gmail.com> | 2019-09-08 19:43:42 +0300 |
---|---|---|
committer | korgoth1 <vladislav.stakhov@gmail.com> | 2019-09-08 19:43:42 +0300 |
commit | a6d26a6ac0c6cd16718f85f398fab4541439edbd (patch) | |
tree | e2cf4a252eb2e22beddbd5b9b4a2e4cfb8350a67 | |
parent | b38a8298263120c857a95522decd31d09cb42504 (diff) | |
parent | 76b863e79e39ec5e1c99cee6e4263abe02f6f8d0 (diff) | |
download | rspamd-a6d26a6ac0c6cd16718f85f398fab4541439edbd.tar.gz rspamd-a6d26a6ac0c6cd16718f85f398fab4541439edbd.zip |
Test
-rw-r--r-- | lualib/lua_magic/heuristics.lua | 279 | ||||
-rw-r--r-- | lualib/lua_magic/init.lua | 119 | ||||
-rw-r--r-- | lualib/lua_magic/patterns.lua | 22 | ||||
-rw-r--r-- | lualib/lua_magic/types.lua | 103 | ||||
-rw-r--r-- | src/lua/lua_text.c | 30 | ||||
-rw-r--r-- | src/lua/lua_util.c | 21 | ||||
-rw-r--r-- | src/plugins/lua/clickhouse.lua | 2 |
7 files changed, 529 insertions, 47 deletions
-- File: lualib/lua_magic/heuristics.lua (added as a new file by this commit)
--[[
Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--

--[[[
-- @module lua_magic/heuristics
-- This module contains heuristics for some specific cases
--]]

local rspamd_trie = require "rspamd_trie"
local rspamd_util = require "rspamd_util"
local lua_util = require "lua_util"
local bit = require "bit"
local fun = require "fun"

local N = "lua_magic"

-- Stream names looked up in OLE directory entries, keyed by the extension
-- they imply
local msoffice_trie
local msoffice_patterns = {
  doc = {[[WordDocument]]},
  xls = {[[Workbook]], [[Book]]},
  ppt = {[[PowerPoint Document]], [[Current User]]},
  vsd = {[[VisioDocument]]},
}

-- Hex encoded 16 byte CLSIDs looked up in the root directory entry, keyed by
-- the extension they imply
local msoffice_trie_clsid
local msoffice_clsids = {
  doc = {[[0609020000000000c000000000000046]]},
  xls = {[[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]]},
  ppt = {[[108d81649b4fcf1186ea00aa00b929e8]]},
  msg = {[[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]]},
  msi = {[[84100c0000000000c000000000000046]]},
}

-- Pattern number (as reported by the trie) -> extension
local msoffice_clsid_indexes = {}
local msoffice_patterns_indexes = {}

-- OpenDocument mime type fragments and the extension each one maps to.
-- Checked in order, the first fragment found wins.
local odf_mime_types = {
  {'vnd.oasis.opendocument.text', 'odt'},
  {'vnd.oasis.opendocument.spreadsheet', 'ods'},
  {'vnd.oasis.opendocument.formula', 'ods'},
  {'vnd.oasis.opendocument.chart', 'ods'},
  {'vnd.oasis.opendocument.presentation', 'odp'},
  -- Assume image as odt
  {'vnd.oasis.opendocument.image', 'odt'},
  {'vnd.oasis.opendocument.graphics', 'odt'},
}

local exports = {}

-- Lazily builds both tries (stream names and CLSIDs) on the first call and
-- fills the pattern number -> extension index tables alongside them.
-- Subsequent calls are no-ops.
-- @param log_obj logger object (currently unused here)
local function compile_msoffice_trie(log_obj)
  if msoffice_trie then
    return
  end

  -- Directory names
  local strs = {}
  for ext,pats in pairs(msoffice_patterns) do
    for _,pat in ipairs(pats) do
      -- These are utf16 strings in fact, so every character is followed by
      -- a zero byte in the regexp
      strs[#strs + 1] = '^' ..
          table.concat(
              fun.totable(
                  fun.map(function(c) return c .. [[\x{00}]] end,
                      fun.iter(pat))))
      msoffice_patterns_indexes[#msoffice_patterns_indexes + 1] = ext
    end
  end
  msoffice_trie = rspamd_trie.create(strs, rspamd_trie.flags.re)

  -- Clsids
  strs = {}
  for ext,pats in pairs(msoffice_clsids) do
    for _,pat in ipairs(pats) do
      -- Convert hex to re: each pair of hex digits becomes one \x{..} escape
      local hex_table = {}
      for i=1,#pat,2 do
        local subc = pat:sub(i, i + 1)
        hex_table[#hex_table + 1] = string.format('\\x{%s}', subc)
      end
      strs[#strs + 1] = '^' .. table.concat(hex_table) .. '$'
      msoffice_clsid_indexes[#msoffice_clsid_indexes + 1] = ext
    end
  end
  msoffice_trie_clsid = rspamd_trie.create(strs, rspamd_trie.flags.re)
end

-- Tries to refine a generic OLE container into a specific extension by
-- walking its directory entries.
-- @param input rspamd_text with the whole file content
-- @param log_obj logger object passed to lua_util.debugm
-- @return extension and weight (60) on success, nothing otherwise
local function detect_ole_format(input, log_obj)
  local inplen = #input
  if inplen < 0x31 + 4 then
    lua_util.debugm(N, log_obj, "short length: %s", inplen)
    return nil
  end

  compile_msoffice_trie(log_obj)

  -- Byte order mark and sector size exponent, both 16 bit
  local bom,sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
  if bom == 0xFFFE then
    bom = '<'
  else
    lua_util.debugm(N, log_obj, "bom file!: %s", bom)
    bom = '>'
    -- bit.bswap reverses all four bytes of a 32 bit value, so the swapped
    -- 16 bit quantity ends up in the upper half and must be shifted back
    sec_size = bit.rshift(bit.bswap(sec_size), 16)
  end

  if sec_size < 7 or sec_size > 31 then
    lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size)
    return nil
  end

  sec_size = 2 ^ sec_size

  -- SecID of first sector of the directory stream; 512 skips the header and
  -- 1 converts to the 1-based positions used by span/at
  local directory_offset = (rspamd_util.unpack(bom .. 'I4', input:span(0x31, 4)))
      * sec_size + 512 + 1
  lua_util.debugm(N, log_obj, "directory: %s", directory_offset)

  if inplen < directory_offset then
    lua_util.debugm(N, log_obj, "short length: %s", inplen)
    return nil
  end

  -- Examines a single 128 byte directory entry starting at `offset`.
  -- Returns `continue, ext`: `continue` is false when walking must stop,
  -- `ext` is the detected extension if this entry identified the format.
  local function process_dir_entry(offset)
    local dtype = input:at(offset + 66)
    lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset)

    if not dtype then
      -- text:at yields nil out of bounds: the entry is truncated, stop here
      -- instead of comparing nil with a number below
      return false,nil
    end

    if dtype == 5 then
      -- Extract clsid (16 bytes at entry offset 80), if it is fully present
      if offset + 95 <= inplen then
        local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
        if matches then
          for n,_ in pairs(matches) do
            if msoffice_clsid_indexes[n] then
              lua_util.debugm(N, log_obj, "found valid clsid for %s",
                  msoffice_clsid_indexes[n])
              return true,msoffice_clsid_indexes[n]
            end
          end
        end
      end
      return true,nil
    elseif dtype == 2 then
      -- Entry name occupies the first 64 bytes; they are in bounds because
      -- the type byte at offset + 66 was readable
      local matches = msoffice_trie:match(input:span(offset, 64))
      if matches then
        for n,_ in pairs(matches) do
          if msoffice_patterns_indexes[n] then
            return true,msoffice_patterns_indexes[n]
          end
        end
      end
      return true,nil
    elseif dtype >= 0 and dtype < 5 then
      -- Bad type
      return true,nil
    end

    return false,nil
  end

  repeat
    local res,ext = process_dir_entry(directory_offset)

    if res and ext then
      return ext,60
    end

    if not res then
      break
    end

    directory_offset = directory_offset + 128
  until directory_offset >= inplen
end

exports.ole_format_heuristic = detect_ole_format

-- Orders detected extensions by confidence.
-- @param res table of extension -> weight
-- @return list of extensions sorted by descending weight and the top weight,
-- or nil if `res` is empty
local function process_detected(res)
  local extensions = lua_util.keys(res)

  if #extensions > 0 then
    table.sort(extensions, function(ex1, ex2)
      return res[ex1] > res[ex2]
    end)

    return extensions,res[extensions[1]]
  end

  return nil
end

-- Guesses a more specific type for an archive part (office documents stored
-- inside zip containers).
-- @param part mime part that holds the archive
-- @param arch archive object obtained from the part
-- @return extension (string) and weight; falls back to the lowercased
-- archive type with weight 40
local function detect_archive_flaw(part, arch)
  local arch_type = arch:get_type()
  local res = {
    docx = 0,
    xlsx = 0,
    pptx = 0,
    jar = 0,
    odt = 0,
    odp = 0,
    ods = 0
  } -- ext + confidence pairs

  -- General msoffice patterns
  local function add_msoffice_confidence(incr)
    res.docx = res.docx + incr
    res.xlsx = res.xlsx + incr
    res.pptx = res.pptx + incr
  end

  if arch_type == 'zip' then
    -- Find specific files/folders in zip file
    local files = arch:get_files() or {}
    for _,file in ipairs(files) do
      if file == '[Content_Types].xml' then
        add_msoffice_confidence(10)
      elseif file == 'xl/' then
        res.xlsx = res.xlsx + 30
      elseif file == 'word/' then
        -- Each folder must boost its own format, not xlsx
        res.docx = res.docx + 30
      elseif file == 'ppt/' then
        res.pptx = res.pptx + 30
      elseif file == 'META-INF/manifest.xml' then
        -- Apply ODT detection logic
        local content = part:get_content()

        if #content > 80 then
          -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
          -- NOTE(review): a zip local file header is 30 bytes long, so with
          -- 1-based span positions the file name would start at 31 - confirm
          -- that 30 is the intended start here
          local start_span = content:span(30, 50)

          local mp = tostring(start_span:span(1, 8))
          if mp == 'mimetype' then
            local spec_type = tostring(start_span:span(9))
            for _,mt in ipairs(odf_mime_types) do
              -- Plain find: the dots in mime types are not pattern wildcards
              if spec_type:find(mt[1], 1, true) then
                res[mt[2]] = 40
                break
              end
            end
          end
        end
      end
    end

    local exts,weight = process_detected(res)

    if exts and weight >= 40 then
      -- Callers expect a single extension, as in the text/image cases
      return exts[1],weight
    end
  end

  return arch_type:lower(),40
end

-- Detects a type using what Rspamd has already learned about a mime part.
-- @param part mime part to inspect
-- @return extension and weight, or nil if no heuristic applies
exports.mime_part_heuristic = function(part)
  if part:is_text() then
    if part:get_text():is_html() then
      return 'html',60
    else
      return 'txt',60
    end
  end

  if part:is_image() then
    local img = part:get_image()
    return img:get_type():lower(),60
  end

  if part:is_archive() then
    local arch = part:get_archive()
    return detect_archive_flaw(part, arch)
  end

  return nil
end

return exports
\ No newline at end of file diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua index 5a4154c79..8b5064bfe 100644 --- a/lualib/lua_magic/init.lua +++ b/lualib/lua_magic/init.lua @@ -21,6 +21,7 @@ limitations under the License. local patterns = require "lua_magic/patterns" local types = require "lua_magic/types" +local heuristics = require "lua_magic/heuristics" local fun = require "fun" local lua_util = require "lua_util" @@ -131,23 +132,48 @@ local function process_patterns(log_obj) end end -local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, res) - local matches = trie:match(input) +local function match_chunk(chunk, input, tlen, offset, trie, processed_tbl, log_obj, res) + local matches = trie:match(chunk) local last = tlen - local function add_result(match, pattern) - if not res[pattern.ext] then - res[pattern.ext] = 0 + local function add_result(weight, ext) + if not res[ext] then + res[ext] = 0 end - if match.weight then - res[pattern.ext] = res[pattern.ext] + match.weight + if weight then + res[ext] = res[ext] + weight else - res[pattern.ext] = res[pattern.ext] + 1 + res[ext] = res[ext] + 1 end lua_util.debugm(N, log_obj,'add pattern for %s, weight %s, total weight %s', - pattern.ext, match.weight, res[pattern.ext]) + ext, weight, res[ext]) + end + + local function match_position(pos, expected) + local cmp = function(a, b) return a == b end + if type(expected) == 'table' then + -- Something like {'>', 0} + if expected[1] == '>' then + cmp = function(a, b) return a > b end + elseif expected[1] == '>=' then + cmp = function(a, b) return a >= b end + elseif expected[1] == '<' then + cmp = function(a, b) return a < b end + elseif expected[1] == '<=' then + cmp = function(a, b) return a <= b end + elseif expected[1] == '!=' then + cmp = function(a, b) return a ~= b end + end + expected = expected[2] + end + + -- Tail match + if expected < 0 then + expected = last + expected + 1 + end + return cmp(pos, expected) end for 
npat,matched_positions in pairs(matches) do @@ -155,30 +181,6 @@ local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, re local pattern = pat_data[3] local match = pat_data[2] - local function match_position(pos, expected) - local cmp = function(a, b) return a == b end - if type(expected) == 'table' then - -- Something like {'>', 0} - if expected[1] == '>' then - cmp = function(a, b) return a > b end - elseif expected[1] == '>=' then - cmp = function(a, b) return a >= b end - elseif expected[1] == '<' then - cmp = function(a, b) return a < b end - elseif expected[1] == '<=' then - cmp = function(a, b) return a <= b end - elseif expected[1] == '!=' then - cmp = function(a, b) return a ~= b end - end - expected = expected[2] - end - - -- Tail match - if expected < 0 then - expected = last + expected + 1 - end - return cmp(pos, expected) - end -- Single position if match.position then local position = match.position @@ -187,13 +189,21 @@ local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, re lua_util.debugm(N, log_obj, 'found match %s at offset %s(from %s)', pattern.ext, pos, offset) if match_position(pos + offset, position) then - add_result(match, pattern) - break + if match.heuristic then + local ext,weight = match.heuristic(input, log_obj) + + if ext then + add_result(weight, ext) + break + end + else + add_result(match.weight, pattern.ext) + break + end end end - end - -- Match all positions - if match.positions then + elseif match.positions then + -- Match all positions local all_right = true for _,position in ipairs(match.positions) do local matched = false @@ -210,10 +220,21 @@ local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, re end if all_right then - add_result(match, pattern) + if match.heuristic then + local ext,weight = match.heuristic(input, log_obj) + + if ext then + add_result(weight, ext) + break + end + else + add_result(match.weight, pattern.ext) + break + end end end end 
+ end local function process_detected(res) @@ -248,13 +269,13 @@ exports.detect = function(input, log_obj) -- Check tail matches if inplen > min_tail_offset then local tail = input:span(inplen - min_tail_offset, min_tail_offset) - match_chunk(tail, inplen, inplen - min_tail_offset, + match_chunk(tail, input, inplen, inplen - min_tail_offset, compiled_tail_patterns, tail_patterns, log_obj, res) end -- Try short match local head = input:span(1, math.min(max_short_offset, inplen)) - match_chunk(head, inplen, 0, + match_chunk(head, input, inplen, 0, compiled_short_patterns, short_patterns, log_obj, res) -- Check if we have enough data or go to long patterns @@ -273,13 +294,13 @@ exports.detect = function(input, log_obj) input:span(inplen - exports.chunk_size, exports.chunk_size) local offset1, offset2 = 0, inplen - exports.chunk_size - match_chunk(chunk1, inplen, + match_chunk(chunk1, input, inplen, offset1, compiled_patterns, processed_patterns, log_obj, res) - match_chunk(chunk2, inplen, + match_chunk(chunk2, input, inplen, offset2, compiled_patterns, processed_patterns, log_obj, res) else -- Input is short enough to match it at all - match_chunk(input, inplen, 0, + match_chunk(input, input, inplen, 0, compiled_patterns, processed_patterns, log_obj, res) end else @@ -297,6 +318,16 @@ exports.detect = function(input, log_obj) return nil end +exports.detect_mime_part = function(part, log_obj) + local ext,weight = heuristics.mime_part_heuristic(part) + + if ext and weight and weight > 20 then + return ext,types[ext] + end + + return exports.detect(part:get_content(), log_obj) +end + -- This parameter specifies how many bytes are checked in the input -- Rspamd checks 2 chunks at start and 1 chunk at the end exports.chunk_size = 32768 diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua index 003073cab..87ed3c0e9 100644 --- a/lualib/lua_magic/patterns.lua +++ b/lualib/lua_magic/patterns.lua @@ -19,6 +19,8 @@ limitations under the License. 
-- This module contains most common patterns --]] +local heuristics = require "lua_magic/heuristics" + local patterns = { pdf = { -- These are alternatives @@ -82,6 +84,17 @@ local patterns = { } } }, + -- MS Office format, needs heuristic + ole = { + matches = { + { + hex = [[d0cf11e0a1b11ae1]], + relative_position = 0, + weight = 60, + heuristic = heuristics.ole_format_heuristic + } + } + }, -- MS Exe file exe = { matches = { @@ -202,6 +215,15 @@ local patterns = { }, } }, + zoo = { + matches = { + { + hex = [[dca7c4fd]], + relative_position = 20, + weight = 60, + }, + } + }, iso = { matches = { { diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index b3af668c8..e8e4e45e2 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -101,6 +101,10 @@ local types = { ct = 'application/x-iso', type = 'archive', }, + zoo = { + ct = 'application/x-zoo', + type = 'archive', + }, szdd = { -- in fact, their MSDOS extension is like FOO.TX_ or FOO.TX$ ct = 'application/x-compressed', type = 'archive', @@ -130,6 +134,61 @@ local types = { ct = 'application/x-shockwave-flash', type = 'image', }, + -- Ole files + ole = { + ct = 'application/octet-stream', + type = 'office' + }, + doc = { + ct = 'application/msword', + type = 'office' + }, + xls = { + ct = 'application/vnd.ms-excel', + type = 'office' + }, + ppt = { + ct = 'application/vnd.ms-powerpoint', + type = 'office' + }, + vsd = { + ct = 'application/vnd.visio', + type = 'office' + }, + msi = { + ct = 'application/x-msi', + type = 'executable' + }, + msg = { + ct = 'application/vnd.ms-outlook', + type = 'office' + }, + -- newer office (2007+) + docx = { + ct = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + type = 'office' + }, + xlsx = { + ct = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + type = 'office' + }, + pptx = { + ct = 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + type = 'office' + }, + -- 
OpenOffice formats + odt = { + ct = 'application/vnd.oasis.opendocument.text', + type = 'office' + }, + ods = { + ct = 'application/vnd.oasis.opendocument.spreadsheet', + type = 'office' + }, + odp = { + ct = 'application/vnd.oasis.opendocument.presentation', + type = 'office' + }, -- other pgp = { ct = 'application/encrypted', @@ -139,6 +198,50 @@ local types = { ct = 'application/x-uuencoded', type = 'binary', }, + -- Types that are detected by Rspamd itself + -- Archives + zip = { + ct = 'application/zip', + type = 'archive', + }, + rar = { + ct = 'application/x-rar', + type = 'archive', + }, + ['7z'] = { + ct = 'x-7z-compressed', + type = 'archive', + }, + gz = { + ct = 'application/gzip', + type = 'archive', + }, + -- Images + png = { + ct = 'image/png', + type = 'image', + }, + gif = { + ct = 'image/gif', + type = 'image', + }, + jpg = { + ct = 'image/jpeg', + type = 'image', + }, + bmp = { + type = 'image', + ct = 'image/bmp', + }, + -- Text + txt = { + type = 'text', + ct = 'text/plain', + }, + html = { + type = 'text', + ct = 'text/html', + }, } return types
\ No newline at end of file diff --git a/src/lua/lua_text.c b/src/lua/lua_text.c index a41775230..68897019d 100644 --- a/src/lua/lua_text.c +++ b/src/lua/lua_text.c @@ -74,6 +74,13 @@ LUA_FUNCTION_DEF (text, save_in_file); * @return {rspamd_text} new rspamd_text with span (must be careful when using with owned texts...) */ LUA_FUNCTION_DEF (text, span); +/*** + * @method rspamd_text:at(pos) + * Returns a byte at the position `pos` + * @param {integer} pos index + * @return {integer} byte at the position `pos` or nil if pos out of bound + */ +LUA_FUNCTION_DEF (text, at); LUA_FUNCTION_DEF (text, take_ownership); LUA_FUNCTION_DEF (text, gc); LUA_FUNCTION_DEF (text, eq); @@ -91,6 +98,7 @@ static const struct luaL_reg textlib_m[] = { LUA_INTERFACE_DEF (text, take_ownership), LUA_INTERFACE_DEF (text, save_in_file), LUA_INTERFACE_DEF (text, span), + LUA_INTERFACE_DEF (text, at), {"write", lua_text_save_in_file}, {"__len", lua_text_len}, {"__tostring", lua_text_str}, @@ -342,6 +350,28 @@ lua_text_span (lua_State *L) } static gint +lua_text_at (lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_text *t = lua_check_text (L, 1); + gint pos = lua_tointeger (L, 2); + + if (t) { + if (pos > 0 && pos <= t->len) { + lua_pushinteger (L, t->start[pos - 1]); + } + else { + lua_pushnil (L); + } + } + else { + return luaL_error (L, "invalid arguments"); + } + + return 1; +} + +static gint lua_text_save_in_file (lua_State *L) { LUA_TRACE_POINT; diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index e18912e9a..c25d20471 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -3715,11 +3715,28 @@ lua_util_unpack (lua_State *L) Header h; const char *fmt = luaL_checkstring(L, 1); size_t ld; - const char *data = luaL_checklstring (L, 2, &ld); - size_t pos = (size_t) posrelat (luaL_optinteger (L, 3, 1), ld) - 1; + const char *data; int n = 0; /* number of results */ + + if (lua_type (L, 2) == LUA_TUSERDATA) { + struct rspamd_lua_text *t = lua_check_text (L, 2); + + if (!t) { + 
return luaL_error (L, "invalid arguments"); + } + + data = t->start; + ld = t->len; + } + else { + data = luaL_checklstring (L, 2, &ld); + } + + size_t pos = (size_t) posrelat (luaL_optinteger (L, 3, 1), ld) - 1; luaL_argcheck(L, pos <= ld, 3, "initial position out of string"); + initheader (L, &h); + while (*fmt != '\0') { int size, ntoalign; KOption opt = getdetails (&h, pos, &fmt, &size, &ntoalign); diff --git a/src/plugins/lua/clickhouse.lua b/src/plugins/lua/clickhouse.lua index 5c8eb5d4d..1505b51aa 100644 --- a/src/plugins/lua/clickhouse.lua +++ b/src/plugins/lua/clickhouse.lua @@ -841,7 +841,7 @@ end local function do_remove_partition(ev_base, cfg, table_name, partition_id) lua_util.debugm(N, rspamd_config, "removing partition %s.%s", table_name, partition_id) local upstream = settings.upstream:get_upstream_round_robin() - local remove_partition_sql = "ALTER TABLE ${table_name} ${remove_method} PARTITION ${partition_id}" + local remove_partition_sql = "ALTER TABLE ${table_name} ${remove_method} PARTITION '${partition_id}'" local remove_method = (settings.retention.method == 'drop') and 'DROP' or 'DETACH' local sql_params = { ['table_name'] = table_name, |