diff options
author korgoth1 <vladislav.stakhov@gmail.com> 2019-09-08 19:43:42 +0300
committer korgoth1 <vladislav.stakhov@gmail.com> 2019-09-08 19:43:42 +0300
commit a6d26a6ac0c6cd16718f85f398fab4541439edbd (patch)
parent b38a8298263120c857a95522decd31d09cb42504 (diff)
parent 76b863e79e39ec5e1c99cee6e4263abe02f6f8d0 (diff)
7 files changed, 529 insertions, 47 deletions
diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
new file mode 100644
index 000000000..b30f95794
--- /dev/null
+++ b/lualib/lua_magic/heuristics.lua
@@ -0,0 +1,279 @@
+Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+See the License for the specific language governing permissions and
+limitations under the License.
+-- @module lua_magic/heuristics
+-- This module contains heuristics for some specific cases
+local rspamd_trie = require "rspamd_trie"
+local rspamd_util = require "rspamd_util"
+local lua_util = require "lua_util"
+local bit = require "bit"
+local fun = require "fun"
+local N = "lua_magic"
+local msoffice_trie -- trie over directory entry names; built on first use by compile_msoffice_trie
+local msoffice_patterns = { -- extension -> entry names looked up in OLE directory entries of type 2
+ doc = {[[WordDocument]]},
+ xls = {[[Workbook]], [[Book]]},
+ ppt = {[[PowerPoint Document]], [[Current User]]},
+ vsd = {[[VisioDocument]]},
+local msoffice_trie_clsid -- trie over clsids; built together with msoffice_trie
+local msoffice_clsids = { -- extension -> clsids written as 32 hex digits (16 bytes), checked for entries of type 5
+ doc = {[[0609020000000000c000000000000046]]},
+ xls = {[[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]]},
+ ppt = {[[108d81649b4fcf1186ea00aa00b929e8]]},
+ msg = {[[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]]},
+ msi = {[[84100c0000000000c000000000000046]]},
+local msoffice_clsid_indexes = {} -- clsid pattern number -> extension, filled in trie pattern order
+local msoffice_patterns_indexes = {} -- name pattern number -> extension, filled in trie pattern order
+local exports = {} -- module table returned at the end of the file
+local function compile_msoffice_trie(log_obj) -- Builds both OLE tries on first call; log_obj is accepted but not used here
+ if not msoffice_trie then -- later calls find msoffice_trie already set and do nothing
+ -- Directory names
+ local strs = {}
+ for ext,pats in pairs(msoffice_patterns) do
+ for _,pat in ipairs(pats) do
+ -- These are utf16 strings in fact...
+ strs[#strs + 1] = '^' .. -- anchored regexp: each character of the name followed by a \x{00} byte
+ table.concat(
+ fun.totable(
+ fun.map(function(c) return c .. [[\x{00}]] end,
+ fun.iter(pat))))
+ msoffice_patterns_indexes[#msoffice_patterns_indexes + 1] = ext -- same position as the pattern just added to strs
+ end
+ end
+ msoffice_trie = rspamd_trie.create(strs, rspamd_trie.flags.re)
+ -- Clsids
+ strs = {}
+ for ext,pats in pairs(msoffice_clsids) do
+ for _,pat in ipairs(pats) do
+ -- Convert hex to re
+ local hex_table = {}
+ for i=1,#pat,2 do
+ local subc = pat:sub(i, i + 1) -- two hex digits describe one byte
+ hex_table[#hex_table + 1] = string.format('\\x{%s}', subc)
+ end
+ strs[#strs + 1] = '^' .. table.concat(hex_table) .. '$' -- anchored at both ends: the matched text must be exactly the clsid
+ msoffice_clsid_indexes[#msoffice_clsid_indexes + 1] = ext -- same position as the pattern just added to strs
+ end
+ end
+ msoffice_trie_clsid = rspamd_trie.create(strs, rspamd_trie.flags.re)
+ end
+-- Refines a generic OLE2 container match into a concrete type by walking
+-- the entries of its directory stream.
+-- @param input text object with the whole content (accessed via :span and :at)
+-- @param log_obj logger object passed to the debug messages
+-- @return ext,60 when a known clsid or stream name is found, nil otherwise
+local function detect_ole_format(input, log_obj)
+  local inplen = #input
+  if inplen < 0x31 + 4 then
+    lua_util.debugm(N, log_obj, "short length: %s", inplen)
+    return nil
+  end
+
+  compile_msoffice_trie(log_obj)
+
+  -- Byte order mark and sector size exponent, both read as little endian 16 bit values
+  local bom,sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4))
+  if bom == 0xFFFE then
+    bom = '<'
+  else
+    lua_util.debugm(N, log_obj, "bom file!: %s", bom)
+    bom = '>'
+    -- bit.bswap swaps all 4 bytes of a 32 bit value, so the swapped 16 bit
+    -- quantity ends up in the high half and has to be shifted back down
+    sec_size = bit.rshift(bit.bswap(sec_size), 16)
+  end
+
+  if sec_size < 7 or sec_size > 31 then
+    lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size)
+    return nil
+  end
+
+  sec_size = 2 ^ sec_size
+
+  -- SecID of first sector of the directory stream
+  local directory_offset = (rspamd_util.unpack(bom .. 'I4', input:span(0x31, 4)))
+      * sec_size + 512 + 1
+  lua_util.debugm(N, log_obj, "directory: %s", directory_offset)
+
+  if inplen < directory_offset then
+    lua_util.debugm(N, log_obj, "short length: %s", inplen)
+    return nil
+  end
+
+  -- Inspects one directory entry starting at `offset`.
+  -- Returns continue_flag,ext: continue_flag is false when the walk must stop
+  local function process_dir_entry(offset)
+    local dtype = input:at(offset + 66)
+    lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset)
+
+    if not dtype then
+      -- Truncated entry: `at` yields nil outside of the input, and comparing
+      -- nil with a number below would raise an error
+      return false,nil
+    end
+
+    if dtype == 5 then
+      if offset + 95 > inplen then
+        -- The 16 bytes of the clsid are not fully available
+        return false,nil
+      end
+      -- Extract clsid
+      local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16))
+      if matches then
+        for n,_ in pairs(matches) do
+          if msoffice_clsid_indexes[n] then
+            lua_util.debugm(N, log_obj, "found valid clsid for %s",
+                msoffice_clsid_indexes[n])
+            return true,msoffice_clsid_indexes[n]
+          end
+        end
+      end
+      return true,nil
+    elseif dtype == 2 then
+      local matches = msoffice_trie:match(input:span(offset, 64))
+      if matches then
+        for n,_ in pairs(matches) do
+          if msoffice_patterns_indexes[n] then
+            return true,msoffice_patterns_indexes[n]
+          end
+        end
+      end
+      return true,nil
+    elseif dtype >= 0 and dtype < 5 then
+      -- Bad type
+      return true,nil
+    end
+
+    return false,nil
+  end
+
+  -- Entries are 128 bytes apart; stop on the first hit, on an unknown entry
+  -- type or at the end of the input
+  repeat
+    local res,ext = process_dir_entry(directory_offset)
+    if res and ext then
+      return ext,60
+    end
+    if not res then
+      break
+    end
+    directory_offset = directory_offset + 128
+  until directory_offset >= inplen
+
+  return nil
+end
+
+exports.ole_format_heuristic = detect_ole_format
+local function process_detected(res) -- Ranks the keys of res (extension -> confidence) by descending confidence
+ local extensions = lua_util.keys(res)
+ if #extensions > 0 then
+ table.sort(extensions, function(ex1, ex2)
+ return res[ex1] > res[ex2] -- strict comparison: higher confidence sorts first
+ end)
+ return extensions,res[extensions[1]] -- NOTE: first result is the whole sorted list, second is the top confidence
+ end
+ return nil -- res has no keys
+-- Refines the type of an archive part using the names of the files it holds.
+-- Zip archives are checked for OOXML (docx/xlsx/pptx) and OpenDocument
+-- markers; anything else is reported by its lowercased archive type.
+-- @param part mime part, its content is read for the OpenDocument check
+-- @param arch archive object of the part (get_type and get_files are used)
+-- @return ext,weight where ext is a single extension string
+local function detect_archive_flaw(part, arch)
+  local arch_type = arch:get_type()
+  local res = {
+    docx = 0,
+    xlsx = 0,
+    pptx = 0,
+    jar = 0,
+    odt = 0,
+    odp = 0,
+    ods = 0
+  } -- ext + confidence pairs
+
+  -- General msoffice patterns
+  local function add_msoffice_confidence(incr)
+    res.docx = res.docx + incr
+    res.xlsx = res.xlsx + incr
+    res.pptx = res.pptx + incr
+  end
+
+  if arch_type == 'zip' then
+    -- Find specific files/folders in zip file
+    local files = arch:get_files() or {}
+    for _,file in ipairs(files) do
+      if file == '[Content_Types].xml' then
+        add_msoffice_confidence(10)
+      elseif file == 'xl/' then
+        res.xlsx = res.xlsx + 30
+      elseif file == 'word/' then
+        -- Each folder must raise its own type (this used to overwrite xlsx)
+        res.docx = res.docx + 30
+      elseif file == 'ppt/' then
+        res.pptx = res.pptx + 30
+      elseif file == 'META-INF/manifest.xml' then
+        -- Apply ODT detection logic
+        local content = part:get_content()
+        if #content > 80 then
+          -- https://lists.oasis-open.org/archives/office/200505/msg00006.html
+          local start_span = content:span(30, 50)
+          local mp = tostring(start_span:span(1, 8))
+          if mp == 'mimetype' then
+            local spec_type = tostring(start_span:span(9))
+            if spec_type:find('vnd.oasis.opendocument.text') then
+              res.odt = 40
+            elseif spec_type:find('vnd.oasis.opendocument.spreadsheet') then
+              res.ods = 40
+            elseif spec_type:find('vnd.oasis.opendocument.formula') then
+              res.ods = 40
+            elseif spec_type:find('vnd.oasis.opendocument.chart') then
+              res.ods = 40
+            elseif spec_type:find('vnd.oasis.opendocument.presentation') then
+              res.odp = 40
+            elseif spec_type:find('vnd.oasis.opendocument.image') then
+              -- Assume image as odt
+              res.odt = 40
+            elseif spec_type:find('vnd.oasis.opendocument.graphics') then
+              -- Assume image as odt
+              res.odt = 40
+            end
+          end
+        end
+      end
+    end
+
+    local ext,weight = process_detected(res)
+    if type(ext) == 'table' then
+      -- process_detected hands back the whole sorted list; callers of this
+      -- function index the types table with a single extension string
+      ext = ext[1]
+    end
+    if ext and weight >= 40 then
+      return ext,weight
+    end
+  end
+
+  return arch_type:lower(),40
+end
+exports.mime_part_heuristic = function(part) -- Guesses ext,weight from what the mime part already reports about itself
+ if part:is_text() then
+ if part:get_text():is_html() then
+ return 'html',60
+ else
+ return 'txt',60
+ end
+ end
+ if part:is_image() then
+ local img = part:get_image()
+ return img:get_type():lower(),60 -- extension is the lowercased image type
+ end
+ if part:is_archive() then
+ local arch = part:get_archive()
+ return detect_archive_flaw(part, arch) -- inspects the archive file list to refine the type
+ end
+ return nil -- neither text, image nor archive: no guess
+return exports \ No newline at end of file
diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua
index 5a4154c79..8b5064bfe 100644
--- a/lualib/lua_magic/init.lua
+++ b/lualib/lua_magic/init.lua
@@ -21,6 +21,7 @@ limitations under the License.
local patterns = require "lua_magic/patterns"
local types = require "lua_magic/types"
+local heuristics = require "lua_magic/heuristics"
local fun = require "fun"
local lua_util = require "lua_util"
@@ -131,23 +132,48 @@ local function process_patterns(log_obj)
-local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, res)
- local matches = trie:match(input)
+local function match_chunk(chunk, input, tlen, offset, trie, processed_tbl, log_obj, res)
+ local matches = trie:match(chunk)
local last = tlen
- local function add_result(match, pattern)
- if not res[pattern.ext] then
- res[pattern.ext] = 0
+ local function add_result(weight, ext)
+ if not res[ext] then
+ res[ext] = 0
- if match.weight then
- res[pattern.ext] = res[pattern.ext] + match.weight
+ if weight then
+ res[ext] = res[ext] + weight
- res[pattern.ext] = res[pattern.ext] + 1
+ res[ext] = res[ext] + 1
lua_util.debugm(N, log_obj,'add pattern for %s, weight %s, total weight %s',
- pattern.ext, match.weight, res[pattern.ext])
+ ext, weight, res[ext])
+ end
+ local function match_position(pos, expected)
+ local cmp = function(a, b) return a == b end
+ if type(expected) == 'table' then
+ -- Something like {'>', 0}
+ if expected[1] == '>' then
+ cmp = function(a, b) return a > b end
+ elseif expected[1] == '>=' then
+ cmp = function(a, b) return a >= b end
+ elseif expected[1] == '<' then
+ cmp = function(a, b) return a < b end
+ elseif expected[1] == '<=' then
+ cmp = function(a, b) return a <= b end
+ elseif expected[1] == '!=' then
+ cmp = function(a, b) return a ~= b end
+ end
+ expected = expected[2]
+ end
+ -- Tail match
+ if expected < 0 then
+ expected = last + expected + 1
+ end
+ return cmp(pos, expected)
for npat,matched_positions in pairs(matches) do
@@ -155,30 +181,6 @@ local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, re
local pattern = pat_data[3]
local match = pat_data[2]
- local function match_position(pos, expected)
- local cmp = function(a, b) return a == b end
- if type(expected) == 'table' then
- -- Something like {'>', 0}
- if expected[1] == '>' then
- cmp = function(a, b) return a > b end
- elseif expected[1] == '>=' then
- cmp = function(a, b) return a >= b end
- elseif expected[1] == '<' then
- cmp = function(a, b) return a < b end
- elseif expected[1] == '<=' then
- cmp = function(a, b) return a <= b end
- elseif expected[1] == '!=' then
- cmp = function(a, b) return a ~= b end
- end
- expected = expected[2]
- end
- -- Tail match
- if expected < 0 then
- expected = last + expected + 1
- end
- return cmp(pos, expected)
- end
-- Single position
if match.position then
local position = match.position
@@ -187,13 +189,21 @@ local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, re
lua_util.debugm(N, log_obj, 'found match %s at offset %s(from %s)',
pattern.ext, pos, offset)
if match_position(pos + offset, position) then
- add_result(match, pattern)
- break
+ if match.heuristic then
+ local ext,weight = match.heuristic(input, log_obj)
+ if ext then
+ add_result(weight, ext)
+ break
+ end
+ else
+ add_result(match.weight, pattern.ext)
+ break
+ end
- end
- -- Match all positions
- if match.positions then
+ elseif match.positions then
+ -- Match all positions
local all_right = true
for _,position in ipairs(match.positions) do
local matched = false
@@ -210,10 +220,21 @@ local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, re
if all_right then
- add_result(match, pattern)
+ if match.heuristic then
+ local ext,weight = match.heuristic(input, log_obj)
+ if ext then
+ add_result(weight, ext)
+ break
+ end
+ else
+ add_result(match.weight, pattern.ext)
+ break
+ end
local function process_detected(res)
@@ -248,13 +269,13 @@ exports.detect = function(input, log_obj)
-- Check tail matches
if inplen > min_tail_offset then
local tail = input:span(inplen - min_tail_offset, min_tail_offset)
- match_chunk(tail, inplen, inplen - min_tail_offset,
+ match_chunk(tail, input, inplen, inplen - min_tail_offset,
compiled_tail_patterns, tail_patterns, log_obj, res)
-- Try short match
local head = input:span(1, math.min(max_short_offset, inplen))
- match_chunk(head, inplen, 0,
+ match_chunk(head, input, inplen, 0,
compiled_short_patterns, short_patterns, log_obj, res)
-- Check if we have enough data or go to long patterns
@@ -273,13 +294,13 @@ exports.detect = function(input, log_obj)
input:span(inplen - exports.chunk_size, exports.chunk_size)
local offset1, offset2 = 0, inplen - exports.chunk_size
- match_chunk(chunk1, inplen,
+ match_chunk(chunk1, input, inplen,
offset1, compiled_patterns, processed_patterns, log_obj, res)
- match_chunk(chunk2, inplen,
+ match_chunk(chunk2, input, inplen,
offset2, compiled_patterns, processed_patterns, log_obj, res)
-- Input is short enough to match it at all
- match_chunk(input, inplen, 0,
+ match_chunk(input, input, inplen, 0,
compiled_patterns, processed_patterns, log_obj, res)
@@ -297,6 +318,16 @@ exports.detect = function(input, log_obj)
return nil
+exports.detect_mime_part = function(part, log_obj) -- Detects the type of a mime part: part heuristics first, content patterns second
+ local ext,weight = heuristics.mime_part_heuristic(part)
+ if ext and weight and weight > 20 then -- the heuristic result is used only when its weight exceeds 20
+ return ext,types[ext] -- second result is the entry of the types table for that extension
+ end
+ return exports.detect(part:get_content(), log_obj) -- otherwise run the pattern based detection on the raw content
-- This parameter specifies how many bytes are checked in the input
-- Rspamd checks 2 chunks at start and 1 chunk at the end
exports.chunk_size = 32768
diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua
index 003073cab..87ed3c0e9 100644
--- a/lualib/lua_magic/patterns.lua
+++ b/lualib/lua_magic/patterns.lua
@@ -19,6 +19,8 @@ limitations under the License.
-- This module contains most common patterns
+local heuristics = require "lua_magic/heuristics"
local patterns = {
pdf = {
-- These are alternatives
@@ -82,6 +84,17 @@ local patterns = {
+ -- MS Office format, needs heuristic
+ ole = {
+ matches = {
+ {
+ hex = [[d0cf11e0a1b11ae1]],
+ relative_position = 0,
+ weight = 60,
+ heuristic = heuristics.ole_format_heuristic
+ }
+ }
+ },
-- MS Exe file
exe = {
matches = {
@@ -202,6 +215,15 @@ local patterns = {
+ zoo = {
+ matches = {
+ {
+ hex = [[dca7c4fd]],
+ relative_position = 20,
+ weight = 60,
+ },
+ }
+ },
iso = {
matches = {
diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua
index b3af668c8..e8e4e45e2 100644
--- a/lualib/lua_magic/types.lua
+++ b/lualib/lua_magic/types.lua
@@ -101,6 +101,10 @@ local types = {
ct = 'application/x-iso',
type = 'archive',
+ zoo = {
+ ct = 'application/x-zoo',
+ type = 'archive',
+ },
szdd = { -- in fact, their MSDOS extension is like FOO.TX_ or FOO.TX$
ct = 'application/x-compressed',
type = 'archive',
@@ -130,6 +134,61 @@ local types = {
ct = 'application/x-shockwave-flash',
type = 'image',
+ -- Ole files
+ ole = {
+ ct = 'application/octet-stream',
+ type = 'office'
+ },
+ doc = {
+ ct = 'application/msword',
+ type = 'office'
+ },
+ xls = {
+ ct = 'application/vnd.ms-excel',
+ type = 'office'
+ },
+ ppt = {
+ ct = 'application/vnd.ms-powerpoint',
+ type = 'office'
+ },
+ vsd = {
+ ct = 'application/vnd.visio',
+ type = 'office'
+ },
+ msi = {
+ ct = 'application/x-msi',
+ type = 'executable'
+ },
+ msg = {
+ ct = 'application/vnd.ms-outlook',
+ type = 'office'
+ },
+ -- newer office (2007+)
+ docx = {
+ ct = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ type = 'office'
+ },
+ xlsx = {
+ ct = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ type = 'office'
+ },
+ pptx = {
+ ct = 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ type = 'office'
+ },
+ -- OpenOffice formats
+ odt = {
+ ct = 'application/vnd.oasis.opendocument.text',
+ type = 'office'
+ },
+ ods = {
+ ct = 'application/vnd.oasis.opendocument.spreadsheet',
+ type = 'office'
+ },
+ odp = {
+ ct = 'application/vnd.oasis.opendocument.presentation',
+ type = 'office'
+ },
-- other
pgp = {
ct = 'application/encrypted',
@@ -139,6 +198,50 @@ local types = {
ct = 'application/x-uuencoded',
type = 'binary',
+ -- Types that are detected by Rspamd itself
+ -- Archives
+ zip = {
+ ct = 'application/zip',
+ type = 'archive',
+ },
+ rar = {
+ ct = 'application/x-rar',
+ type = 'archive',
+ },
+ ['7z'] = {
+ ct = 'application/x-7z-compressed', -- full type/subtype form, like every other entry in this table
+ type = 'archive',
+ },
+ gz = {
+ ct = 'application/gzip',
+ type = 'archive',
+ },
+ -- Images
+ png = {
+ ct = 'image/png',
+ type = 'image',
+ },
+ gif = {
+ ct = 'image/gif',
+ type = 'image',
+ },
+ jpg = {
+ ct = 'image/jpeg',
+ type = 'image',
+ },
+ bmp = {
+ type = 'image',
+ ct = 'image/bmp',
+ },
+ -- Text
+ txt = {
+ type = 'text',
+ ct = 'text/plain',
+ },
+ html = {
+ type = 'text',
+ ct = 'text/html',
+ },
return types \ No newline at end of file
diff --git a/src/lua/lua_text.c b/src/lua/lua_text.c
index a41775230..68897019d 100644
--- a/src/lua/lua_text.c
+++ b/src/lua/lua_text.c
@@ -74,6 +74,13 @@ LUA_FUNCTION_DEF (text, save_in_file);
* @return {rspamd_text} new rspamd_text with span (must be careful when using with owned texts...)
LUA_FUNCTION_DEF (text, span);
+ * @method rspamd_text:at(pos)
+ * Returns a byte at the position `pos`
+ * @param {integer} pos index
+ * @return {integer} byte at position `pos`, or nil if `pos` is out of bounds
+ */
+LUA_FUNCTION_DEF (text, at);
LUA_FUNCTION_DEF (text, take_ownership);
LUA_FUNCTION_DEF (text, gc);
LUA_FUNCTION_DEF (text, eq);
@@ -91,6 +98,7 @@ static const struct luaL_reg textlib_m[] = {
LUA_INTERFACE_DEF (text, take_ownership),
LUA_INTERFACE_DEF (text, save_in_file),
LUA_INTERFACE_DEF (text, span),
+ LUA_INTERFACE_DEF (text, at),
{"write", lua_text_save_in_file},
{"__len", lua_text_len},
{"__tostring", lua_text_str},
@@ -342,6 +350,28 @@ lua_text_span (lua_State *L)
static gint
+lua_text_at (lua_State *L)
+{
+	/* Lua method text:at(pos): pushes the byte at 1-based position `pos`, or nil when out of range */
+	struct rspamd_lua_text *t = lua_check_text (L, 1);
+	gint pos = lua_tointeger (L, 2);
+
+	if (t) {
+		if (pos > 0 && pos <= t->len) {
+			/*
+			 * Convert through an unsigned type: where plain char is signed,
+			 * bytes >= 0x80 would otherwise be pushed as negative integers
+			 */
+			lua_pushinteger (L, (guchar)t->start[pos - 1]);
+		}
+		else {
+			lua_pushnil (L);
+		}
+	}
+	else {
+		return luaL_error (L, "invalid arguments");
+	}
+
+	return 1;
+}
+static gint
lua_text_save_in_file (lua_State *L)
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index e18912e9a..c25d20471 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -3715,11 +3715,28 @@ lua_util_unpack (lua_State *L)
Header h;
const char *fmt = luaL_checkstring(L, 1);
size_t ld;
- const char *data = luaL_checklstring (L, 2, &ld);
- size_t pos = (size_t) posrelat (luaL_optinteger (L, 3, 1), ld) - 1;
+ const char *data;
int n = 0; /* number of results */
+ if (lua_type (L, 2) == LUA_TUSERDATA) {
+ struct rspamd_lua_text *t = lua_check_text (L, 2);
+ if (!t) {
+ return luaL_error (L, "invalid arguments");
+ }
+ data = t->start;
+ ld = t->len;
+ }
+ else {
+ data = luaL_checklstring (L, 2, &ld);
+ }
+ size_t pos = (size_t) posrelat (luaL_optinteger (L, 3, 1), ld) - 1;
luaL_argcheck(L, pos <= ld, 3, "initial position out of string");
initheader (L, &h);
while (*fmt != '\0') {
int size, ntoalign;
KOption opt = getdetails (&h, pos, &fmt, &size, &ntoalign);
diff --git a/src/plugins/lua/clickhouse.lua b/src/plugins/lua/clickhouse.lua
index 5c8eb5d4d..1505b51aa 100644
--- a/src/plugins/lua/clickhouse.lua
+++ b/src/plugins/lua/clickhouse.lua
@@ -841,7 +841,7 @@ end
local function do_remove_partition(ev_base, cfg, table_name, partition_id)
lua_util.debugm(N, rspamd_config, "removing partition %s.%s", table_name, partition_id)
local upstream = settings.upstream:get_upstream_round_robin()
- local remove_partition_sql = "ALTER TABLE ${table_name} ${remove_method} PARTITION ${partition_id}"
+ local remove_partition_sql = "ALTER TABLE ${table_name} ${remove_method} PARTITION '${partition_id}'"
local remove_method = (settings.retention.method == 'drop') and 'DROP' or 'DETACH'
local sql_params = {
['table_name'] = table_name,