diff options
author | korgoth1 <vladislav.stakhov@gmail.com> | 2019-09-08 19:40:00 +0300 |
---|---|---|
committer | korgoth1 <vladislav.stakhov@gmail.com> | 2019-09-08 19:40:00 +0300 |
commit | b38a8298263120c857a95522decd31d09cb42504 (patch) | |
tree | 0786d34a6eadbde5c0e5e133c582ae0eb31d93c8 | |
parent | 2b9ee5fee2c7beb9001a6df1b68a11ec724c966d (diff) | |
parent | 859483618be3602ea3c405f944595f8c0d06e720 (diff) | |
download | rspamd-b38a8298263120c857a95522decd31d09cb42504.tar.gz rspamd-b38a8298263120c857a95522decd31d09cb42504.zip |
[Test] WHITELIST_SURBL
-rw-r--r-- | lualib/lua_magic/init.lua | 198 | ||||
-rw-r--r-- | lualib/lua_magic/patterns.lua | 302 | ||||
-rw-r--r-- | lualib/lua_magic/types.lua | 113 | ||||
-rw-r--r-- | src/lua/lua_trie.c | 2 | ||||
-rw-r--r-- | test/functional/cases/340_surbl.robot | 3 | ||||
-rw-r--r-- | test/functional/configs/plugins.conf | 10 | ||||
-rw-r--r-- | test/functional/messages/whitelist.eml | 2 |
7 files changed, 600 insertions, 30 deletions
diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua index 464a10d0a..5a4154c79 100644 --- a/lualib/lua_magic/init.lua +++ b/lualib/lua_magic/init.lua @@ -24,23 +24,89 @@ local types = require "lua_magic/types" local fun = require "fun" local lua_util = require "lua_util" +local rspamd_text = require "rspamd_text" local rspamd_trie = require "rspamd_trie" local N = "lua_magic" local exports = {} --- trie object +-- trie objects local compiled_patterns +local compiled_short_patterns +local compiled_tail_patterns -- {<str>, <match_object>, <pattern_object>} indexed by pattern number local processed_patterns = {} +local short_patterns = {} +local tail_patterns = {} + +local short_match_limit = 128 +local max_short_offset = -1 +local min_tail_offset = math.huge + +local function process_patterns(log_obj) + -- Add pattern to either short patterns or to normal patterns + local function add_processed(str, match, pattern) + if match.position and type(match.position) == 'number' then + if match.tail then + -- Tail pattern + tail_patterns[#tail_patterns + 1] = { + str, match, pattern + } + if min_tail_offset > match.tail then + min_tail_offset = match.tail + end + + lua_util.debugm(N, log_obj, 'add tail pattern %s for ext %s', + str, pattern.ext) + elseif match.position < short_match_limit then + short_patterns[#short_patterns + 1] = { + str, match, pattern + } + lua_util.debugm(N, log_obj, 'add short pattern %s for ext %s', + str, pattern.ext) + + if max_short_offset < match.position then + max_short_offset = match.position + end + else + processed_patterns[#processed_patterns + 1] = { + str, match, pattern + } + + lua_util.debugm(N, log_obj, 'add long pattern %s for ext %s', + str, pattern.ext) + end + else + processed_patterns[#processed_patterns + 1] = { + str, match, pattern + } + + lua_util.debugm(N, log_obj, 'add long pattern %s for ext %s', + str, pattern.ext) + end + end -local function process_patterns() if not compiled_patterns then - for _,pattern in ipairs(patterns) do + for ext,pattern in pairs(patterns) do + assert(types[ext], 'not found type: ' .. ext) + pattern.ext = ext for _,match in ipairs(pattern.matches) do if match.string then - processed_patterns[#processed_patterns + 1] = { - match.string, match, pattern - } + if match.relative_position and not match.position then + match.position = match.relative_position + #match.string + end + add_processed(match.string, match, pattern) + elseif match.hex then + local hex_table = {} + + for i=1,#match.hex,2 do + local subc = match.hex:sub(i, i + 1) + hex_table[#hex_table + 1] = string.format('\\x{%s}', subc) + end + + if match.relative_position and not match.position then + match.position = match.relative_position + #match.hex / 2 + end + add_processed(table.concat(hex_table), match, pattern) end end end @@ -49,18 +115,26 @@ local function process_patterns() fun.map(function(t) return t[1] end, processed_patterns)), rspamd_trie.flags.re ) + compiled_short_patterns = rspamd_trie.create(fun.totable( + fun.map(function(t) return t[1] end, short_patterns)), + rspamd_trie.flags.re + ) + compiled_tail_patterns = rspamd_trie.create(fun.totable( + fun.map(function(t) return t[1] end, tail_patterns)), + rspamd_trie.flags.re + ) - lua_util.debugm(N, rspamd_config, 'compiled %s patterns', - #processed_patterns) + lua_util.debugm(N, log_obj, + 'compiled %s (%s short; %s long; %s tail) patterns', + #processed_patterns + #short_patterns + #tail_patterns, + #short_patterns, #processed_patterns, #tail_patterns) end end -exports.detect = function(input, log_obj) - process_patterns() - local res = {} - local matches = compiled_patterns:match(input) +local function match_chunk(input, tlen, offset, trie, processed_tbl, log_obj, res) + local matches = trie:match(input) - if not log_obj then log_obj = rspamd_config end + local last = tlen local function add_result(match, pattern) if not res[pattern.ext] then @@ -77,7 +151,7 @@ exports.detect = function(input, log_obj) end for npat,matched_positions in pairs(matches) do - local pat_data = processed_patterns[npat] + local pat_data = processed_tbl[npat] local pattern = pat_data[3] local match = pat_data[2] @@ -99,6 +173,10 @@ exports.detect = function(input, log_obj) expected = expected[2] end + -- Tail match + if expected < 0 then + expected = last + expected + 1 + end return cmp(pos, expected) end -- Single position @@ -106,23 +184,39 @@ exports.detect = function(input, log_obj) local position = match.position for _,pos in ipairs(matched_positions) do - if match_position(pos, position) then + lua_util.debugm(N, log_obj, 'found match %s at offset %s(from %s)', + pattern.ext, pos, offset) + if match_position(pos + offset, position) then add_result(match, pattern) + break end end end -- Match all positions if match.positions then + local all_right = true for _,position in ipairs(match.positions) do + local matched = false for _,pos in ipairs(matched_positions) do - if match_position(pos, position) then - add_result(match, pattern) + if not match_position(pos + offset, position) then + matched = true + break end end + if not matched then + all_right = false + break + end + end + + if all_right then + add_result(match, pattern) end end end +end +local function process_detected(res) local extensions = lua_util.keys(res) if #extensions > 0 then @@ -130,6 +224,72 @@ exports.detect = function(input, log_obj) return res[ex1] > res[ex2] end) + return extensions,res[extensions[1]] + end + + return nil +end + +exports.detect = function(input, log_obj) + if not log_obj then log_obj = rspamd_config end + process_patterns(log_obj) + + local res = {} + + if type(input) == 'string' then + -- Convert to rspamd_text + input = rspamd_text.fromstring(input) + end + + + if type(input) == 'userdata' then + local inplen = #input + + -- Check tail matches + if inplen > min_tail_offset then + local tail = input:span(inplen - min_tail_offset, min_tail_offset) + match_chunk(tail, inplen, inplen - min_tail_offset, + compiled_tail_patterns, tail_patterns, log_obj, res) + end + + -- Try short match + local head = input:span(1, math.min(max_short_offset, inplen)) + match_chunk(head, inplen, 0, + compiled_short_patterns, short_patterns, log_obj, res) + + -- Check if we have enough data or go to long patterns + local extensions,confidence = process_detected(res) + + if extensions and #extensions > 0 and confidence > 30 then + -- We are done on short patterns + return extensions[1],types[extensions[1]] + end + + -- No way, let's check data in chunks or just the whole input if it is small enough + if #input > exports.chunk_size * 3 then + -- Chunked version as input is too long + local chunk1, chunk2 = + input:span(1, exports.chunk_size * 2), + input:span(inplen - exports.chunk_size, exports.chunk_size) + local offset1, offset2 = 0, inplen - exports.chunk_size + + match_chunk(chunk1, inplen, + offset1, compiled_patterns, processed_patterns, log_obj, res) + match_chunk(chunk2, inplen, + offset2, compiled_patterns, processed_patterns, log_obj, res) + else + -- Input is short enough to match it at all + match_chunk(input, inplen, 0, + compiled_patterns, processed_patterns, log_obj, res) + end + else + -- Table input is NYI + assert(0, 'table input for match') + end + + local extensions = process_detected(res) + + if extensions and #extensions > 0 then return extensions[1],types[extensions[1]] end @@ -137,4 +297,8 @@ exports.detect = function(input, log_obj) return nil end +-- This parameter specifies how many bytes are checked in the input +-- Rspamd checks 2 chunks at start and 1 chunk at the end +exports.chunk_size = 32768 + return exports
\ No newline at end of file diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua index 354f8ec61..003073cab 100644 --- a/lualib/lua_magic/patterns.lua +++ b/lualib/lua_magic/patterns.lua @@ -20,14 +20,12 @@ limitations under the License. --]] local patterns = { - { - -- MSDOS extension to match types table - ext = 'pdf', + pdf = { -- These are alternatives matches = { { string = [[%PDF-\d]], - position = 6, -- must be end of the match, as that's how hyperscan works + position = 6, -- must be end of the match, as that's how hyperscan works (or use relative_position) weight = 60, }, { @@ -41,7 +39,301 @@ local patterns = { weight = 60, }, }, - } + }, + ps = { + matches = { + { + string = [[%!PS-Adobe]], + relative_position = 0, + weight = 60, + }, + }, + }, + -- RTF document + rtf = { + matches = { + { + string = [[{\\rtf\d]], + position = 6, + weight = 60, + } + } + }, + chm = { + matches = { + { + string = [[ITSF]], + relative_position = 0, + weight = 60, + } + } + }, + djvu = { + matches = { + { + string = [[AT&TFORM]], + relative_position = 0, + weight = 60, + }, + { + string = [[DJVM]], + relative_position = 0x0c, + weight = 60, + } + } + }, + -- MS Exe file + exe = { + matches = { + { + string = [[MZ]], + relative_position = 0, + weight = 10, + }, + -- PE part + { + string = [[PE\x{00}\x{00}]], + position = {'>=', 0x3c + 4}, + weight = 40, + } + } + }, + elf = { + matches = { + { + hex = [[7f454c46]], + relative_position = 0, + weight = 60, + }, + } + }, + lnk = { + matches = { + { + hex = [[4C0000000114020000000000C000000000000046]], + relative_position = 0, + weight = 60, + }, + } + }, + class = { + -- Technically, this also matches MachO files, but I don't care about + -- Apple and their mental health problems here: just consider Java files, + -- Mach object files and all other cafe babes as bad and block them! + matches = { + { + hex = [[cafebabe]], + relative_position = 0, + weight = 60, + }, + } + }, + -- Archives + arj = { + matches = { + { + hex = '60EA', + relative_position = 0, + weight = 60, + }, + } + }, + ace = { + matches = { + { + string = [[\*\*ACE\*\*]], + position = 14, + weight = 60, + }, + } + }, + cab = { + matches = { + { + hex = [[4d53434600000000]], -- Can be anywhere for SFX :( + position = {'>=', 8}, + weight = 60, + }, + } + }, + tar = { + matches = { + { + string = [[ustar]], + relative_position = 257, + weight = 60, + }, + } + }, + bz2 = { + matches = { + { + string = "BZ[h0]", + position = 3, + weight = 60, + }, + } + }, + lz4 = { + matches = { + { + hex = "184d2204", + relative_position = 0, + weight = 60, + }, + { + hex = "184c2103", + relative_position = 0, + weight = 60, + }, + { + hex = "184c2102", + relative_position = 0, + weight = 60, + }, + } + }, + zst = { + matches = { + { + string = [[\x{FD}\x{2F}\x{B5}[\x{22}-\x{40}].]], + position = 5, -- includes last . + weight = 60, + }, + } + }, + iso = { + matches = { + { + string = [[\x{01}CD001\x{01}]], + position = {'>=', 0x8000 + 7}, -- first 32k is unused + weight = 60, + }, + } + }, + -- Apple is a 'special' child: this needs to be matched at the data tail... + dmg = { + matches = { + { + string = [[koly]], + position = -512 + 4, + weight = 61, + tail = 512, + }, + } + }, + szdd = { + matches = { + { + hex = [[535a4444]], + relative_position = 0, + weight = 60, + }, + } + }, + xz = { + matches = { + { + hex = [[FD377A585A00]], + relative_position = 0, + weight = 60, + }, + } + }, + -- Images + psd = { + matches = { + { + string = [[8BPS]], + relative_position = 0, + weight = 60, + }, + } + }, + ico = { + matches = { + { + hex = [[00000100]], + relative_position = 0, + weight = 60, + }, + } + }, + pcx = { + matches = { + { + hex = [[0A050108]], + relative_position = 0, + weight = 60, + }, + } + }, + pic = { + matches = { + { + hex = [[FF80C9C71A00]], + relative_position = 0, + weight = 60, + }, + } + }, + swf = { + matches = { + { + hex = [[5a5753]], -- LZMA + relative_position = 0, + weight = 60, + }, + { + hex = [[435753]], -- Zlib + relative_position = 0, + weight = 60, + }, + { + hex = [[465753]], -- Uncompressed + relative_position = 0, + weight = 60, + }, + } + }, + tiff = { + matches = { + { + hex = [[49492a00]], -- LE encoded + relative_position = 0, + weight = 60, + }, + { + hex = [[4d4d]], -- BE tiff + relative_position = 0, + weight = 60, + }, + } + }, + -- Other + pgp = { + matches = { + { + hex = [[A803504750]], + relative_position = 0, + weight = 60, + }, + { + hex = [[2D424547494E20504750204D4553534147452D]], + relative_position = 0, + weight = 60, + }, + } + }, + uue = { + matches = { + { + hex = [[626567696e20]], + relative_position = 0, + weight = 60, + }, + } + }, } return patterns
\ No newline at end of file diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index 746c87400..b3af668c8 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -22,18 +22,123 @@ limitations under the License. -- This table is indexed by msdos extension for convenience local types = { + -- exe + exe = { + ct = 'application/x-ms-application', + type = 'executable', + }, + elf = { + ct = 'application/x-elf-executable', + type = 'executable', + }, + lnk = { + ct = 'application/x-ms-application', + type = 'executable', + }, + class = { + ct = 'application/x-java-applet', + type = 'executable', + }, + -- text + rtf = { + ct = "application/rtf", + type = 'text', + }, pdf = { ct = 'application/pdf', type = 'binary', }, - exe = { - ct = 'application/x-ms-application', - type = 'executable', + ps = { + ct = 'application/postscript', + type = 'binary', + }, + chm = { + ct = 'application/x-chm', + type = 'binary', + }, + djvu = { + ct = 'application/x-djvu', + type = 'binary', + }, + -- archives + arj = { + ct = 'application/x-arj', + type = 'archive', + }, + cab = { + ct = 'application/x-cab', + type = 'archive', + }, + ace = { + ct = 'application/x-ace', + type = 'archive', + }, + tar = { + ct = 'application/x-tar', + type = 'archive', + }, + bz2 = { + ct = 'application/x-bzip', + type = 'archive', + }, + xz = { + ct = 'application/x-xz', + type = 'archive', + }, + lz4 = { + ct = 'application/x-lz4', + type = 'archive', + }, + zst = { + ct = 'application/x-zstandard', + type = 'archive', + }, + dmg = { + ct = 'application/x-dmg', + type = 'archive', + }, + iso = { + ct = 'application/x-iso', + type = 'archive', + }, + szdd = { -- in fact, their MSDOS extension is like FOO.TX_ or FOO.TX$ + ct = 'application/x-compressed', + type = 'archive', + }, + -- images + psd = { + ct = 'image/psd', + type = 'image', + }, + pcx = { + ct = 'image/pcx', + type = 'image', + }, + pic = { + ct = 'image/pic', + type = 'image', }, tiff = { ct = 'image/tiff', type = 'image', - } + }, + ico = { + ct = 'image/ico', + type = 'image', + }, + swf = { + ct = 'application/x-shockwave-flash', + type = 'image', + }, + -- other + pgp = { + ct = 'application/encrypted', + type = 'encrypted' + }, + uue = { + ct = 'application/x-uuencoded', + type = 'binary', + }, } return types
\ No newline at end of file diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c index b030c735a..82d11c50c 100644 --- a/src/lua/lua_trie.c +++ b/src/lua/lua_trie.c @@ -290,7 +290,7 @@ lua_trie_match (lua_State *L) } } else if (lua_type (L, 2) == LUA_TUSERDATA) { - t = lua_check_text (L, -1); + t = lua_check_text (L, 2); if (t && lua_trie_search_str (L, trie, t->start, t->len, cb)) { found = TRUE; diff --git a/test/functional/cases/340_surbl.robot b/test/functional/cases/340_surbl.robot index 232914932..ee80577cf 100644 --- a/test/functional/cases/340_surbl.robot +++ b/test/functional/cases/340_surbl.robot @@ -94,7 +94,8 @@ SURBL example.com encoded url in subject WHITELIST ${result} = Scan Message With Rspamc ${TESTDIR}/messages/whitelist.eml - Should Contain ${result.stdout} RSPAMD_URIBL ( + Should Not Contain ${result.stdout} RSPAMD_URIBL ( + Should Not Contain ${result.stdout} DBL_SPAM ( *** Keywords *** Surbl Setup diff --git a/test/functional/configs/plugins.conf b/test/functional/configs/plugins.conf index ac68ec5cb..839e14257 100644 --- a/test/functional/configs/plugins.conf +++ b/test/functional/configs/plugins.conf @@ -580,6 +580,16 @@ options = { replies = ["127.0.0.4", "127.0.0.11"]; }, { + name = "rspamd-test.com.test.uribl"; + type = a; + replies = ["127.0.0.2"]; + }, + { + name = "rspamd-test.com.test2.uribl"; + type = a; + replies = ["127.0.1.2"]; + }, + { name = "9.8.8.8.test4.uribl"; type = a; replies = ["127.0.0.3"]; diff --git a/test/functional/messages/whitelist.eml b/test/functional/messages/whitelist.eml index 24686a247..aa19512a1 100644 --- a/test/functional/messages/whitelist.eml +++ b/test/functional/messages/whitelist.eml @@ -1,5 +1,3 @@ Content-Type: text/plain -http://rspamd.com -http://test.rspamd.example.com http://rspamd-test.com |