summary refs log tree commit diff stats
path: root/utils
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2019-11-01 15:46:35 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2019-11-01 15:46:35 +0000
commit 63f47bc68b8025ead4549fc04fa9c554b7db4eab (patch)
tree 0b1c95c8459f266c2f034fdd8c498aebf81a8014 /utils
parent 4f21ecbd4a343aa79b9184ab3a04f663268de6a9 (diff)
download rspamd-63f47bc68b8025ead4549fc04fa9c554b7db4eab.tar.gz
rspamd-63f47bc68b8025ead4549fc04fa9c554b7db4eab.zip
[Minor] Add tool to convert trivial SA rules to multimap
Diffstat (limited to 'utils')
-rw-r--r-- utils/sa_trivial_convert.lua 460
1 files changed, 460 insertions, 0 deletions
diff --git a/utils/sa_trivial_convert.lua b/utils/sa_trivial_convert.lua
new file mode 100644
index 000000000..8cf0b9137
--- /dev/null
+++ b/utils/sa_trivial_convert.lua
@@ -0,0 +1,460 @@
+local fun = require "fun"
+local rspamd_logger = require "rspamd_logger"
+local util = require "rspamd_util"
+local lua_util = require "lua_util"
+local rspamd_regexp = require "rspamd_regexp"
+local ucl = require "ucl"
+
+local complicated = {} -- SA config lines that were not converted; written to auto_sa.conf at the end
+local rules = {} -- converted rules keyed by rule type (header rules are further keyed by header name)
+local scores = {} -- symbol name -> numeric score collected from `score` lines
+
+-- Joins the words that follow the first `start` entries of `words` back into
+-- a single space-separated string (used to rebuild a regexp or description
+-- that was split on whitespace).
+-- @param words array of tokens
+-- @param start number of leading tokens to drop
+-- @return the remaining tokens joined with a single space
+local function words_to_re(words, start)
+  local remaining = fun.totable(fun.drop_n(start, words))
+
+  return table.concat(remaining, " ")
+end
+
+-- Collects every match of a Lua pattern in `str`.
+-- @param str string to tokenize
+-- @param delim Lua pattern describing one token; when omitted, tokens are
+--   runs of non-whitespace characters
+-- @return array of matched tokens, in order of appearance
+local function split(str, delim)
+  local pattern = delim or '[^%s]+'
+  local tokens = {}
+
+  for piece in string.gmatch(str, pattern) do
+    tokens[#tokens + 1] = piece
+  end
+
+  return tokens
+end
+
+-- Parses the header selector of an SA `header` rule (for example `From`,
+-- `From:addr` or `To|Cc`) and stores the outcome in `cur_rule`:
+--   cur_rule['header']   - array with one parameter table per accepted header
+--                          (fields: header, strong, raw and optionally function)
+--   cur_rule['ordinary'] - true only when the selector is a plain header name
+--                          list without modifiers or special pseudo-headers
+-- @param hline header selector, headers separated by `|`, modifiers by `:`
+-- @param cur_rule rule table being filled in (its 'symbol' is used in warnings)
+local function handle_header_def(hline, cur_rule)
+  -- Builds a function returning the given field (`addr` or `name`) of every
+  -- address that util.parse_addr extracts from a header value
+  local function addr_field_extractor(field)
+    return function(str)
+      local ret = {}
+      local addr_parsed = util.parse_addr(str)
+
+      if addr_parsed then
+        for _, elt in ipairs(addr_parsed) do
+          if elt[field] then
+            table.insert(ret, elt[field])
+          end
+        end
+      end
+
+      return ret
+    end
+  end
+
+  local hdr_params = {}
+  -- Check if an re is an ordinary re
+  local ordinary = true
+
+  for _, h in ipairs(split(hline, '[^|]+')) do
+    if h == 'ALL' or h == 'ALL:raw' then
+      ordinary = false
+    else
+      local args = split(h, '[^:]+')
+      -- A fresh table per header: sharing one table between iterations made
+      -- every entry of hdr_params describe the last header of the list
+      local cur_param = {
+        strong = false,
+        raw = false,
+        header = args[1],
+      }
+
+      if args[2] then
+        -- We have some ops that are required for the header, so it's not ordinary
+        ordinary = false
+      end
+
+      -- Everything after the header name is a modifier
+      for i = 2, #args do
+        local func = args[i]
+
+        if func == 'addr' or func == 'name' then
+          cur_param['function'] = addr_field_extractor(func)
+        elseif func == 'raw' then
+          cur_param['raw'] = true
+        elseif func == 'case' then
+          cur_param['strong'] = true
+        else
+          rspamd_logger.warnx(rspamd_config, 'Function %1 is not supported in %2',
+            func, cur_rule['symbol'])
+        end
+      end
+
+      if cur_param['header'] == 'MESSAGEID' or cur_param['header'] == 'ToCc' then
+        -- SpamAssassin pseudo-headers: not recorded and the rule is not ordinary
+        ordinary = false
+      else
+        table.insert(hdr_params, cur_param)
+      end
+    end
+  end
+
+  -- A selector that produced no usable header can never be ordinary
+  cur_rule['ordinary'] = ordinary and #hdr_params > 0
+  cur_rule['header'] = hdr_params
+end
+
+-- Reads one SpamAssassin configuration file line by line. Rules that can be
+-- expressed as a plain regexp (`header`, `body`, `rawbody`, `full`, `uri`) are
+-- stored in `rules`, `score` lines are stored in `scores`, and every other
+-- line (conditionals, meta rules, unsupported or unparsable rules) is appended
+-- to `complicated`.
+-- @param f open file handle; only `f:lines()` is used and the handle is not closed here
+local function process_sa_conf(f)
+  -- Module name handed to lua_util.debugm
+  local N = 'sa_trivial_convert'
+  -- Body-like SA rule keywords and the rule type they are stored under
+  local body_rule_types = {
+    body = 'sabody',
+    rawbody = 'sarawbody',
+    full = 'message',
+  }
+
+  local cur_rule = {}
+  local valid_rule = false
+  local skip_to_endif = false
+  local if_nested = 0
+
+  -- Remembers a line that is not converted
+  local function keep_line(l)
+    table.insert(complicated, l)
+  end
+
+  -- Files the pending rule into `rules` and always starts a clean one, so a
+  -- rejected rule cannot leak its fields or its `valid` flag into the next
+  local function insert_cur_rule()
+    local rule = cur_rule
+    cur_rule = {}
+    valid_rule = false
+
+    if not rule.type or not rule['symbol'] then
+      rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', rule)
+      return
+    end
+
+    local hdr_name
+    if rule.type == 'header' then
+      -- Header rules are grouped by the first header they refer to
+      hdr_name = rule.header and rule.header[1] and rule.header[1].header
+      if not hdr_name then
+        rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', rule)
+        return
+      end
+    end
+
+    local target = rules[rule.type]
+    if not target then
+      target = {}
+      rules[rule.type] = target
+    end
+
+    if hdr_name then
+      if not target[hdr_name] then
+        target[hdr_name] = {}
+      end
+      target = target[hdr_name]
+    end
+
+    target[rule['symbol']] = rule
+  end
+
+  -- Called when a new rule definition starts: flushes the pending rule if it
+  -- was valid and begins with an empty rule table
+  local function start_rule()
+    if valid_rule then
+      insert_cur_rule()
+    end
+    cur_rule = {}
+    valid_rule = false
+  end
+
+  -- Extracts the numeric score from the words of a `score` line
+  local function parse_score(words)
+    if #words == 3 then
+      -- score rule <x>
+      lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[3])
+      return tonumber(words[3])
+    elseif #words == 6 then
+      -- score rule <x1> <x2> <x3> <x4>
+      -- we assume here that bayes and network are enabled and select <x4>
+      lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[6])
+      return tonumber(words[6])
+    end
+
+    rspamd_logger.errx(rspamd_config, 'invalid score for %1', words[2])
+    return 0
+  end
+
+  -- Tracks if/ifplugin/else/endif nesting. Returns true when the line was
+  -- consumed (it is then kept exactly once), false when it still has to be
+  -- parsed as a rule.
+  local function handle_conditional(l)
+    -- Unbalanced if/endif
+    if if_nested < 0 then
+      if_nested = 0
+    end
+
+    if skip_to_endif then
+      if string.match(l, '^endif') then
+        if_nested = if_nested - 1
+
+        if if_nested == 0 then
+          skip_to_endif = false
+        end
+      elseif string.match(l, '^if') then
+        if_nested = if_nested + 1
+      elseif string.match(l, '^else') then
+        -- Else counterpart for if
+        skip_to_endif = false
+      end
+
+      keep_line(l)
+      return true
+    end
+
+    if string.match(l, '^if') then
+      -- `ifplugin`, `if !plugin(...)` and any other `if` are treated alike
+      skip_to_endif = true
+      if_nested = if_nested + 1
+    elseif string.match(l, '^else') then
+      -- Else counterpart for if
+      skip_to_endif = true
+    elseif string.match(l, '^endif') then
+      if_nested = if_nested - 1
+    else
+      return false
+    end
+
+    -- Return here so the directive does not reach the generic fallback of the
+    -- rule parser and get stored a second time
+    keep_line(l)
+    return true
+  end
+
+  -- Handles `header SYMBOL Header =~ /regexp/`
+  local function parse_header_rule(l, words)
+    start_rule()
+
+    if not (words[4] and (words[4] == '=~' or words[4] == '!~')) then
+      keep_line(l)
+      return
+    end
+
+    cur_rule['type'] = 'header'
+    cur_rule['symbol'] = words[2]
+
+    if words[4] == '!~' then
+      -- Negated matches are not converted
+      keep_line(l)
+      return
+    end
+
+    cur_rule['re_expr'] = words_to_re(words, 4)
+    if string.find(cur_rule['re_expr'], '%s+%[if%-unset:') then
+      keep_line(l)
+      return
+    end
+
+    cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+    if not cur_rule['re'] then
+      rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+        cur_rule['re_expr'], cur_rule['symbol'])
+      keep_line(l)
+      return
+    end
+
+    handle_header_def(words[3], cur_rule)
+    if not cur_rule['ordinary'] then
+      keep_line(l)
+      return
+    end
+
+    valid_rule = true
+  end
+
+  -- Handles `body|rawbody|full SYMBOL /regexp/`
+  local function parse_body_rule(l, words)
+    start_rule()
+
+    cur_rule['symbol'] = words[2]
+    local first = words[3] and string.sub(words[3], 1, 1)
+
+    if first ~= '/' and first ~= 'm' then
+      -- might be function
+      keep_line(l)
+      return
+    end
+
+    cur_rule['type'] = body_rule_types[words[1]]
+    cur_rule['re_expr'] = words_to_re(words, 2)
+    cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+    if words[1] == 'full' then
+      cur_rule['raw'] = true
+    end
+
+    if cur_rule['re'] then
+      valid_rule = true
+    else
+      -- Keep the line instead of silently losing the rule
+      rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+        cur_rule['re_expr'], cur_rule['symbol'])
+      keep_line(l)
+    end
+  end
+
+  -- Handles `uri SYMBOL /regexp/`
+  local function parse_uri_rule(l, words)
+    start_rule()
+
+    cur_rule['type'] = 'uri'
+    cur_rule['symbol'] = words[2]
+    cur_rule['re_expr'] = words_to_re(words, 2)
+    cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+
+    if cur_rule['re'] and cur_rule['symbol'] then
+      valid_rule = true
+    else
+      keep_line(l)
+    end
+  end
+
+  local function process_line(l)
+    l = lua_util.rspamd_str_trim(l)
+    -- Replace bla=~/re/ with bla =~ /re/ (#2372)
+    l = l:gsub('([^%s])%s*([=!]~)%s*([^%s])', '%1 %2 %3')
+
+    if string.len(l) == 0 or string.sub(l, 1, 1) == '#' then
+      return
+    end
+
+    if handle_conditional(l) then
+      return
+    end
+
+    -- Skip comments: drop everything from the first word starting with `#`
+    local words = fun.totable(fun.take_while(
+      function(w) return string.sub(w, 1, 1) ~= '#' end,
+      fun.filter(function(w) return w ~= "" end, fun.iter(split(l)))))
+    local keyword = words[1]
+
+    if keyword == "header" then
+      parse_header_rule(l, words)
+    elseif keyword and body_rule_types[keyword] then
+      parse_body_rule(l, words)
+    elseif keyword == "uri" then
+      parse_uri_rule(l, words)
+    elseif keyword == "meta" then
+      -- meta SYMBOL expression: never converted
+      start_rule()
+      keep_line(l)
+    elseif keyword == "describe" and valid_rule
+        and words[2] == cur_rule['symbol'] then
+      -- Only a description of the pending rule is attached to it
+      cur_rule['description'] = words_to_re(words, 2)
+    elseif keyword == "score" and words[2] then
+      -- NOTE(review): score lines are consumed even when the rule itself stays
+      -- in the leftover config - confirm that this is intended
+      scores[words[2]] = parse_score(words)
+    else
+      keep_line(l)
+    end
+  end
+
+  for l in f:lines() do
+    process_line(l)
+  end
+
+  if valid_rule then
+    insert_cur_rule()
+  end
+end
+
+-- Parse every SA configuration file named on the command line
+for _, matched in ipairs(arg) do
+  local f, err = io.open(matched, "r")
+  if f then
+    rspamd_logger.messagex(rspamd_config, 'loading SA rules from %s', matched)
+    process_sa_conf(f)
+    -- Release the handle as soon as the file has been parsed
+    f:close()
+  else
+    rspamd_logger.errx(rspamd_config, "cannot open %1: %2", matched, err)
+  end
+end
+
+-- Generated multimap rule definitions, keyed by the lower-cased map symbol suffix
+local multimap_conf = {}
+
+-- Writes the regexp map file for one group of converted rules and registers
+-- the matching multimap rule in `multimap_conf`.
+-- @param what rule type: 'sabody', 'sarawbody', 'full'/'message', 'uri' or 'header'
+-- @param syms table symbol name -> rule (only the rule's `re` field is used)
+-- @param hdr header name, used only when `what` is 'header'
+local function handle_rule(what, syms, hdr)
+  -- All of these are locals: a global `header` would leak the header name of
+  -- one map into every map generated after it
+  local mtype, filter, fname, header
+  local sym = what:upper()
+
+  if what == 'sabody' then
+    mtype = 'content'
+    fname = 'body_re.map'
+    filter = 'oneline'
+  elseif what == 'sarawbody' then
+    fname = 'raw_body_re.map'
+    mtype = 'content'
+    filter = 'rawtext'
+  elseif what == 'full' or what == 'message' then
+    -- process_sa_conf files SA `full` rules under the 'message' type
+    sym = 'FULL'
+    fname = 'full_re.map'
+    mtype = 'content'
+    filter = 'full'
+  elseif what == 'uri' then
+    fname = 'uri_re.map'
+    mtype = 'url'
+    filter = 'full'
+  elseif what == 'header' then
+    fname = ('hdr_' .. hdr .. '_re.map'):lower()
+    mtype = 'header'
+    header = hdr
+    sym = sym .. '_' .. hdr:upper()
+  else
+    rspamd_logger.errx('unknown type: %s', what)
+    return
+  end
+
+  local conf = {
+    type = mtype,
+    filter = filter,
+    symbol = 'SA_MAP_AUTO_' .. sym,
+    regexp = true,
+    map = fname,
+    header = header,
+    symbols = {}
+  }
+
+  local re_file, err = io.open(fname, 'w')
+  if not re_file then
+    rspamd_logger.errx('cannot open %s for writing: %s', fname, err)
+    return
+  end
+
+  -- Sort the symbol names so repeated runs produce identical files
+  local names = {}
+  for k in pairs(syms) do
+    names[#names + 1] = k
+  end
+  table.sort(names)
+
+  for _, k in ipairs(names) do
+    -- Rules without a `score` line get a zero score
+    local score = scores[k] or 0.0
+    re_file:write(string.format('/%s/ %s:%f\n', tostring(syms[k].re), k, score))
+    table.insert(conf.symbols, k)
+  end
+
+  re_file:close()
+
+  multimap_conf[sym:lower()] = conf
+  rspamd_logger.messagex('stored %s regexp in %s', sym:lower(), fname)
+end
+
+-- Generate one map per rule type; header rules get one map per header name
+for rule_type, bucket in pairs(rules) do
+  if rule_type ~= 'header' then
+    handle_rule(rule_type, bucket)
+  else
+    for hdr_name, hdr_rules in pairs(bucket) do
+      handle_rule(rule_type, hdr_rules, hdr_name)
+    end
+  end
+end
+
+-- Serialise the collected multimap definitions in UCL format
+local out = ucl.to_format(multimap_conf, 'ucl')
+-- assert() turns a failed open into an error carrying the reason reported by
+-- io.open instead of an "attempt to index a nil value" on the next line
+local mmap_conf = assert(io.open('auto_multimap.conf', 'w'))
+mmap_conf:write(out)
+mmap_conf:close()
+rspamd_logger.messagex('stored multimap conf in %s', 'auto_multimap.conf')
+
+-- Write every line that was not converted to a separate SA configuration file
+local sa_remain = assert(io.open('auto_sa.conf', 'w'))
+for _, l in ipairs(complicated) do
+  if not string.match(l, '^%s+$') then
+    -- Lines were stored without their line terminator, so add it back;
+    -- otherwise the whole file collapses into a single line
+    sa_remain:write(l, '\n')
+  end
+end
+sa_remain:close()
+rspamd_logger.messagex('stored sa remains conf in %s', 'auto_sa.conf')