123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668 |
- --[[
- Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ]]--
-
- -- This module contains 'selectors' implementation: code to extract data
- -- from Rspamd tasks and compose those together
- --
- -- Read more at https://rspamd.com/doc/configuration/selectors.html
-
- --[[[
- -- @module lua_selectors
- -- This module contains 'selectors' implementation: code to extract data
- -- from Rspamd tasks and compose those together.
- -- Typical selector looks like this: header(User).lower.substring(1, 2):ip
- --]]
-
- local exports = {
- maps = require "lua_selectors/maps"
- }
-
- local logger = require 'rspamd_logger'
- local fun = require 'fun'
- local lua_util = require "lua_util"
- local M = "selectors"
- local rspamd_text = require "rspamd_text"
- local unpack_function = table.unpack or unpack
- local E = {}
-
- local extractors = require "lua_selectors/extractors"
- local transform_function = require "lua_selectors/transforms"
-
- local text_cookie = rspamd_text.cookie
-
- local function pure_type(ltype)
- return ltype:match('^(.*)_list$')
- end
-
- local function implicit_tostring(t, ud_or_table)
- if t == 'table' then
- -- Table (very special)
- if ud_or_table.value then
- return ud_or_table.value, 'string'
- elseif ud_or_table.addr then
- return ud_or_table.addr, 'string'
- end
-
- return logger.slog("%s", ud_or_table), 'string'
- elseif (t == 'string' or t == 'text') and type(ud_or_table) == 'userdata' then
- if ud_or_table.cookie and ud_or_table.cookie == text_cookie then
- -- Preserve opaque
- return ud_or_table, 'string'
- else
- return tostring(ud_or_table), 'string'
- end
- elseif t ~= 'nil' then
- return tostring(ud_or_table), 'string'
- end
-
- return nil
- end
-
- local function process_selector(task, sel)
- local function allowed_type(t)
- if t == 'string' or t == 'string_list' then
- return true
- end
-
- return false
- end
-
- local function list_type(t)
- return pure_type(t)
- end
-
- local input, etype = sel.selector.get_value(task, sel.selector.args)
-
- if not input then
- lua_util.debugm(M, task, 'no value extracted for %s', sel.selector.name)
- return nil
- end
-
- lua_util.debugm(M, task, 'extracted %s, type %s',
- sel.selector.name, etype)
-
- local pipe = sel.processor_pipe or E
- local first_elt = pipe[1]
-
- if first_elt and (first_elt.method or
- fun.any(function(t)
- return t == 'userdata' or t == 'table'
- end, first_elt.types)) then
- -- Explicit conversion
- local meth = first_elt
-
- if meth.types[etype] then
- lua_util.debugm(M, task, 'apply method `%s` to %s',
- meth.name, etype)
- input, etype = meth.process(input, etype, meth.args)
- else
- local pt = pure_type(etype)
-
- if meth.types[pt] then
- lua_util.debugm(M, task, 'map method `%s` to list of %s',
- meth.name, pt)
- -- Map method to a list of inputs, excluding empty elements
- -- We need to fold it down here to get a proper type resolution
- input = fun.totable(fun.filter(function(map_elt, _)
- return map_elt
- end,
- fun.map(function(list_elt)
- local ret, ty = meth.process(list_elt, pt, meth.args)
- if ret then
- etype = ty
- end
- return ret
- end, input)))
- if input and etype then
- etype = etype .. "_list"
- else
- input = nil
- end
- end
- end
- -- Remove method from the pipeline
- pipe = fun.drop_n(1, pipe)
- elseif etype:match('^userdata') or etype:match('^table') then
- -- Implicit conversion
- local pt = pure_type(etype)
-
- if not pt then
- lua_util.debugm(M, task, 'apply implicit conversion %s->string', etype)
- input = implicit_tostring(etype, input)
- etype = 'string'
- else
- lua_util.debugm(M, task, 'apply implicit map %s->string', pt)
- input = fun.filter(function(map_elt)
- return map_elt
- end,
- fun.map(function(list_elt)
- local ret = implicit_tostring(pt, list_elt)
- return ret
- end, input))
- etype = 'string_list'
- end
- else
- lua_util.debugm(M, task, 'avoid implicit conversion as the transformer accepts complex input')
- end
-
- -- Now we fold elements using left fold
- local function fold_function(acc, x)
- if acc == nil or acc[1] == nil then
- lua_util.debugm(M, task, 'do not apply %s, accumulator is nil', x.name)
- return nil
- end
-
- local value = acc[1]
- local t = acc[2]
-
- if not x.types[t] then
- local pt = pure_type(t)
-
- if pt and x.types['list'] then
- -- Generic list processor
- lua_util.debugm(M, task, 'apply list function `%s` to %s', x.name, t)
- return { x.process(value, t, x.args) }
- elseif pt and x.map_type and x.types[pt] then
- local map_type = x.map_type .. '_list'
- lua_util.debugm(M, task, 'map `%s` to list of %s resulting %s',
- x.name, pt, map_type)
- -- Apply map, filtering empty values
- return {
- fun.filter(function(map_elt)
- return map_elt
- end,
- fun.map(function(list_elt)
- if not list_elt then
- return nil
- end
- local ret, _ = x.process(list_elt, pt, x.args)
- return ret
- end, value)),
- map_type -- Returned type
- }
- end
- logger.errx(task, 'cannot apply transform %s for type %s', x.name, t)
- return nil
- end
-
- lua_util.debugm(M, task, 'apply %s to %s', x.name, t)
- return { x.process(value, t, x.args) }
- end
-
- local res = fun.foldl(fold_function,
- { input, etype },
- pipe)
-
- if not res or not res[1] then
- return nil
- end -- Pipeline failed
-
- if not allowed_type(res[2]) then
- -- Search for implicit conversion
- local pt = pure_type(res[2])
-
- if pt then
- lua_util.debugm(M, task, 'apply implicit map %s->string_list', pt)
- res[1] = fun.map(function(e)
- return implicit_tostring(pt, e)
- end, res[1])
- res[2] = 'string_list'
- else
- res[1] = implicit_tostring(res[2], res[1])
- res[2] = 'string'
- end
- end
-
- if list_type(res[2]) then
- -- Convert to table as it might have a functional form
- res[1] = fun.totable(res[1])
- end
-
- lua_util.debugm(M, task, 'final selector type: %s, value: %s', res[2], res[1])
-
- return res[1]
- end
-
- local function make_grammar()
- local l = require "lpeg"
- local spc = l.S(" \t\n") ^ 0
- local cont = l.R("\128\191") -- continuation byte
- local utf8_high = l.R("\194\223") * cont
- + l.R("\224\239") * cont * cont
- + l.R("\240\244") * cont * cont * cont
- local atom_start = (l.R("az") + l.R("AZ") + l.R("09") + utf8_high + l.S "-") ^ 1
- local atom_end = (l.R("az") + l.R("AZ") + l.R("09") + l.S "-_" + utf8_high) ^ 1
- local atom_mid = (1 - l.S("'\r\n\f\\,)(}{= " .. '"')) ^ 1
- local atom_argument = l.C(atom_start * atom_mid ^ 0 * atom_end ^ 0) -- We allow more characters for the arguments
- local atom = l.C(atom_start * atom_end ^ 0) -- We are more strict about selector names itself
- local singlequoted_string = l.P "'" * l.C(((1 - l.S "'\r\n\f\\") + (l.P '\\' * 1)) ^ 0) * "'"
- local doublequoted_string = l.P '"' * l.C(((1 - l.S '"\r\n\f\\') + (l.P '\\' * 1)) ^ 0) * '"'
- local argument = atom_argument + singlequoted_string + doublequoted_string
- local dot = l.P(".")
- local semicolon = l.P(":")
- local obrace = "(" * spc
- local tbl_obrace = "{" * spc
- local eqsign = spc * "=" * spc
- local tbl_ebrace = spc * "}"
- local ebrace = spc * ")"
- local comma = spc * "," * spc
- local sel_separator = spc * l.S ";*" * spc
-
- return l.P {
- "LIST";
- LIST = l.Ct(l.V("EXPR")) * (sel_separator * l.Ct(l.V("EXPR"))) ^ 0,
- EXPR = l.V("FUNCTION") * (semicolon * l.V("METHOD")) ^ -1 * (dot * l.V("PROCESSOR")) ^ 0,
- PROCESSOR = l.Ct(atom * spc * (obrace * l.V("ARG_LIST") * ebrace) ^ 0),
- FUNCTION = l.Ct(atom * spc * (obrace * l.V("ARG_LIST") * ebrace) ^ 0),
- METHOD = l.Ct(atom / function(e)
- return '__' .. e
- end * spc * (obrace * l.V("ARG_LIST") * ebrace) ^ 0),
- ARG_LIST = l.Ct((l.V("ARG") * comma ^ 0) ^ 0),
- ARG = l.Cf(tbl_obrace * l.V("NAMED_ARG") * tbl_ebrace, rawset) + argument + l.V("LIST_ARGS"),
- NAMED_ARG = (l.Ct("") * l.Cg(argument * eqsign * (argument + l.V("LIST_ARGS")) * comma ^ 0) ^ 0),
- LIST_ARGS = l.Ct(tbl_obrace * l.V("LIST_ARG") * tbl_ebrace),
- LIST_ARG = l.Cg(argument * comma ^ 0) ^ 0,
- }
- end
-
- local parser = make_grammar()
-
- --[[[
- -- @function lua_selectors.parse_selector(cfg, str)
- --]]
- exports.parse_selector = function(cfg, str)
- local parsed = { parser:match(str) }
- local output = {}
-
- if not parsed or not parsed[1] then
- return nil
- end
-
- local function check_args(name, schema, args)
- if schema then
- if getmetatable(schema) then
- -- Schema covers all arguments
- local res, err = schema:transform(args)
- if not res then
- logger.errx(rspamd_config, 'invalid arguments for %s: %s', name, err)
- return false
- else
- for i, elt in ipairs(res) do
- args[i] = elt
- end
- end
- else
- for i, selt in ipairs(schema) do
- local res, err = selt:transform(args[i])
-
- if err then
- logger.errx(rspamd_config, 'invalid arguments for %s: argument number: %s, error: %s', name, i, err)
- return false
- else
- args[i] = res
- end
- end
- end
- end
-
- return true
- end
-
- -- Output AST format is the following:
- -- table of individual selectors
- -- each selector: list of functions
- -- each function: function name + optional list of arguments
- for _, sel in ipairs(parsed) do
- local res = {
- selector = {},
- processor_pipe = {},
- }
-
- local selector_tbl = sel[1]
- if not selector_tbl then
- logger.errx(cfg, 'no selector represented')
- return nil
- end
- if not extractors[selector_tbl[1]] then
- logger.errx(cfg, 'selector %s is unknown', selector_tbl[1])
- return nil
- end
-
- res.selector = lua_util.shallowcopy(extractors[selector_tbl[1]])
- res.selector.name = selector_tbl[1]
- res.selector.args = selector_tbl[2] or E
-
- if not check_args(res.selector.name,
- res.selector.args_schema,
- res.selector.args) then
- return nil
- end
-
- lua_util.debugm(M, cfg, 'processed selector %s, args: %s',
- res.selector.name, res.selector.args)
-
- local pipeline_error = false
- -- Now process processors pipe
- fun.each(function(proc_tbl)
- local proc_name = proc_tbl[1]
-
- if proc_name:match('^__') then
- -- Special case - method
- local method_name = proc_name:match('^__(.*)$')
- -- Check array indexing...
- if tonumber(method_name) then
- method_name = tonumber(method_name)
- end
- local processor = {
- name = tostring(method_name),
- method = true,
- args = proc_tbl[2] or E,
- types = {
- userdata = true,
- table = true,
- string = true,
- },
- map_type = 'string',
- process = function(inp, t, args)
- local ret
- if t == 'table' then
- -- Plain table field
- ret = inp[method_name]
- else
- -- We call method unpacking arguments and dropping all but the first result returned
- ret = (inp[method_name](inp, unpack_function(args or E)))
- end
-
- local ret_type = type(ret)
-
- if ret_type == 'nil' then
- return nil
- end
- -- Now apply types heuristic
- if ret_type == 'string' then
- return ret, 'string'
- elseif ret_type == 'table' then
- -- TODO: we need to ensure that 1) table is numeric 2) table has merely strings
- return ret, 'string_list'
- else
- return implicit_tostring(ret_type, ret)
- end
- end,
- }
- lua_util.debugm(M, cfg, 'attached method %s to selector %s, args: %s',
- proc_name, res.selector.name, processor.args)
- table.insert(res.processor_pipe, processor)
- else
-
- if not transform_function[proc_name] then
- logger.errx(cfg, 'processor %s is unknown', proc_name)
- pipeline_error = proc_name
- return nil
- end
- local processor = lua_util.shallowcopy(transform_function[proc_name])
- processor.name = proc_name
- processor.args = proc_tbl[2] or E
-
- if not check_args(processor.name, processor.args_schema, processor.args) then
- pipeline_error = 'args schema for ' .. proc_name
- return nil
- end
-
- lua_util.debugm(M, cfg, 'attached processor %s to selector %s, args: %s',
- proc_name, res.selector.name, processor.args)
- table.insert(res.processor_pipe, processor)
- end
- end, fun.tail(sel))
-
- if pipeline_error then
- logger.errx(cfg, 'unknown or invalid processor used: "%s", exiting', pipeline_error)
- return nil
- end
-
- table.insert(output, res)
- end
-
- return output
- end
-
- --[[[
- -- @function lua_selectors.register_extractor(cfg, name, selector)
- --]]
- exports.register_extractor = function(cfg, name, selector)
- if selector.get_value then
- if extractors[name] then
- logger.warnx(cfg, 'redefining selector %s', name)
- end
- extractors[name] = selector
-
- return true
- end
-
- logger.errx(cfg, 'bad selector %s', name)
- return false
- end
-
- --[[[
- -- @function lua_selectors.register_transform(cfg, name, transform)
- --]]
- exports.register_transform = function(cfg, name, transform)
- if transform.process and transform.types then
- if transform_function[name] then
- logger.warnx(cfg, 'redefining transform function %s', name)
- end
- transform_function[name] = transform
-
- return true
- end
-
- logger.errx(cfg, 'bad transform function %s', name)
- return false
- end
-
- --[[[
- -- @function lua_selectors.process_selectors(task, selectors_pipe)
- --]]
- exports.process_selectors = function(task, selectors_pipe)
- local ret = {}
-
- for _, sel in ipairs(selectors_pipe) do
- local r = process_selector(task, sel)
-
- -- If any element is nil, then the whole selector is nil
- if not r then
- return nil
- end
- table.insert(ret, r)
- end
-
- return ret
- end
-
- --[[[
- -- @function lua_selectors.combine_selectors(task, selectors, delimiter)
- --]]
- exports.combine_selectors = function(_, selectors, delimiter)
- if not delimiter then
- delimiter = ''
- end
-
- if not selectors then
- return nil
- end
-
- local have_tables, have_userdata
-
- for _, s in ipairs(selectors) do
- if type(s) == 'table' then
- have_tables = true
- elseif type(s) == 'userdata' then
- have_userdata = true
- end
- end
-
- if not have_tables then
- if not have_userdata then
- return table.concat(selectors, delimiter)
- else
- return rspamd_text.fromtable(selectors, delimiter)
- end
- else
- -- We need to do a spill on each table selector and make a cortesian product
- -- e.g. s:tbl:s -> s:telt1:s + s:telt2:s ...
- local tbl = {}
- local res = {}
-
- for i, s in ipairs(selectors) do
- if type(s) == 'string' then
- rawset(tbl, i, fun.duplicate(s))
- elseif type(s) == 'userdata' then
- rawset(tbl, i, fun.duplicate(tostring(s)))
- else
- -- Raw table
- rawset(tbl, i, fun.map(tostring, s))
- end
- end
-
- fun.each(function(...)
- table.insert(res, table.concat({ ... }, delimiter))
- end, fun.zip(lua_util.unpack(tbl)))
-
- return res
- end
- end
-
- --[[[
- -- @function lua_selectors.flatten_selectors(selectors)
- -- Convert selectors to a flat table of elements
- --]]
- exports.flatten_selectors = function(_, selectors, _)
- local res = {}
-
- local function fill(tbl)
- for _, s in ipairs(tbl) do
- if type(s) == 'string' then
- rawset(res, #res + 1, s)
- elseif type(s) == 'userdata' then
- rawset(res, #res + 1, tostring(s))
- else
- fill(s)
- end
- end
- end
-
- fill(selectors)
-
- return res
- end
-
- --[[[
- -- @function lua_selectors.kv_table_from_pairs(selectors)
- -- Convert selectors to a table where the odd elements are keys and even are elements
- -- Similarly to make a map from (k, v) pairs list
- -- To specify the concrete constant keys, one can use the `id` extractor
- --]]
- exports.kv_table_from_pairs = function(log_obj, selectors, _)
- local res = {}
- local rspamd_logger = require "rspamd_logger"
-
- local function fill(tbl)
- local tbl_len = #tbl
- if tbl_len % 2 ~= 0 or tbl_len == 0 then
- rspamd_logger.errx(log_obj, "invalid invocation of the `kv_table_from_pairs`: table length is invalid %s",
- tbl_len)
- return
- end
- for i = 1, tbl_len, 2 do
- local k = tostring(tbl[i])
- local v = tbl[i + 1]
- if type(v) == 'string' then
- res[k] = v
- elseif type(v) == 'userdata' then
- res[k] = tostring(v)
- else
- res[k] = fun.totable(fun.map(function(elt)
- return tostring(elt)
- end, v))
- end
- end
- end
-
- fill(selectors)
-
- return res
- end
-
-
- --[[[
- -- @function lua_selectors.create_closure(log_obj, cfg, selector_str, delimiter, fn)
- -- Creates a closure from a string selector, using the specific combinator function
- --]]
- exports.create_selector_closure_fn = function(log_obj, cfg, selector_str, delimiter, fn)
- local selector = exports.parse_selector(cfg, selector_str)
-
- if not selector then
- return nil
- end
-
- return function(task)
- local res = exports.process_selectors(task, selector)
-
- if res then
- return fn(log_obj, res, delimiter)
- end
-
- return nil
- end
- end
-
- --[[[
- -- @function lua_selectors.create_closure(cfg, selector_str, delimiter='', flatten=false)
- -- Creates a closure from a string selector
- --]]
- exports.create_selector_closure = function(cfg, selector_str, delimiter, flatten)
- local combinator_fn = flatten and exports.flatten_selectors or exports.combine_selectors
-
- return exports.create_selector_closure_fn(nil, cfg, selector_str, delimiter, combinator_fn)
- end
-
- local function display_selectors(tbl)
- return fun.tomap(fun.map(function(k, v)
- return k, fun.tomap(fun.filter(function(kk, vv)
- return type(vv) ~= 'function'
- end, v))
- end, tbl))
- end
-
- exports.list_extractors = function()
- return display_selectors(extractors)
- end
-
- exports.list_transforms = function()
- return display_selectors(transform_function)
- end
-
- exports.add_map = function(name, map)
- if not exports.maps[name] then
- exports.maps[name] = map
- else
- logger.errx(rspamd_config, "duplicate map redefinition for the selectors: %s", name)
- end
- end
-
- -- Publish log target
- exports.M = M
-
- return exports
|