diff options
Diffstat (limited to 'lualib')
-rw-r--r-- | lualib/lua_bayes_redis.lua | 67 | ||||
-rw-r--r-- | lualib/lua_cfg_transform.lua | 22 | ||||
-rw-r--r-- | lualib/lua_dkim_tools.lua | 112 | ||||
-rw-r--r-- | lualib/lua_magic/patterns.lua | 17 | ||||
-rw-r--r-- | lualib/lua_magic/types.lua | 7 | ||||
-rw-r--r-- | lualib/lua_redis.lua | 28 | ||||
-rw-r--r-- | lualib/lua_scanners/icap.lua | 9 | ||||
-rw-r--r-- | lualib/redis_scripts/bayes_cache_learn.lua | 17 | ||||
-rw-r--r-- | lualib/redis_scripts/bayes_classify.lua | 75 | ||||
-rw-r--r-- | lualib/redis_scripts/bayes_learn.lua | 55 | ||||
-rw-r--r-- | lualib/rspamadm/dmarc_report.lua | 18 | ||||
-rw-r--r-- | lualib/rspamadm/mime.lua | 401 |
12 files changed, 491 insertions, 337 deletions
diff --git a/lualib/lua_bayes_redis.lua b/lualib/lua_bayes_redis.lua index 782e6fc47..a7af80bf1 100644 --- a/lualib/lua_bayes_redis.lua +++ b/lualib/lua_bayes_redis.lua @@ -25,27 +25,44 @@ local ucl = require "ucl" local N = "bayes" local function gen_classify_functor(redis_params, classify_script_id) - return function(task, expanded_key, id, is_spam, stat_tokens, callback) - + return function(task, expanded_key, id, class_labels, stat_tokens, callback) local function classify_redis_cb(err, data) lua_util.debugm(N, task, 'classify redis cb: %s, %s', err, data) if err then callback(task, false, err) else - callback(task, true, data[1], data[2], data[3], data[4]) + -- Pass the raw data table to the C++ callback for processing + -- The C++ callback will handle both binary and multi-class formats + callback(task, true, data) + end + end + + -- Determine class labels to send to Redis script + local script_class_labels + if type(class_labels) == "table" then + -- Use simple comma-separated string instead of messagepack + script_class_labels = "TABLE:" .. table.concat(class_labels, ",") + else + -- Single class label or boolean compatibility + if class_labels == true or class_labels == "true" then + script_class_labels = "S" -- spam + elseif class_labels == false or class_labels == "false" then + script_class_labels = "H" -- ham + else + script_class_labels = class_labels -- string class label end end lua_redis.exec_redis_script(classify_script_id, { task = task, is_write = false, key = expanded_key }, - classify_redis_cb, { expanded_key, stat_tokens }) + classify_redis_cb, { expanded_key, script_class_labels, stat_tokens }) end end local function gen_learn_functor(redis_params, learn_script_id) - return function(task, expanded_key, id, is_spam, symbol, is_unlearn, stat_tokens, callback, maybe_text_tokens) + return function(task, expanded_key, id, class_label, symbol, is_unlearn, stat_tokens, callback, maybe_text_tokens) local function learn_redis_cb(err, data) - lua_util.debugm(N, task, 'learn redis cb: %s, %s', err, data) + lua_util.debugm(N, task, 'learn redis cb: %s, %s for class %s', err, data, class_label) if err then callback(task, false, err) else @@ -53,17 +70,24 @@ local function gen_learn_functor(redis_params, learn_script_id) end end + -- Convert class_label for backward compatibility + local script_class_label = class_label + if class_label == true or class_label == "true" then + script_class_label = "S" -- spam + elseif class_label == false or class_label == "false" then + script_class_label = "H" -- ham + end + if maybe_text_tokens then lua_redis.exec_redis_script(learn_script_id, { task = task, is_write = true, key = expanded_key }, learn_redis_cb, - { expanded_key, tostring(is_spam), symbol, tostring(is_unlearn), stat_tokens, maybe_text_tokens }) + { expanded_key, script_class_label, symbol, tostring(is_unlearn), stat_tokens, maybe_text_tokens }) else lua_redis.exec_redis_script(learn_script_id, { task = task, is_write = true, key = expanded_key }, - learn_redis_cb, { expanded_key, tostring(is_spam), symbol, tostring(is_unlearn), stat_tokens }) + learn_redis_cb, { expanded_key, script_class_label, symbol, tostring(is_unlearn), stat_tokens }) end - end end @@ -112,8 +136,7 @@ end --- @param classifier_ucl ucl of the classifier config --- @param statfile_ucl ucl of the statfile config --- @return a pair of (classify_functor, learn_functor) or `nil` in case of error -exports.lua_bayes_init_statfile = function(classifier_ucl, statfile_ucl, symbol, is_spam, ev_base, stat_periodic_cb) - +exports.lua_bayes_init_statfile = function(classifier_ucl, statfile_ucl, symbol, class_label, ev_base, stat_periodic_cb) local redis_params = load_redis_params(classifier_ucl, statfile_ucl) if not redis_params then @@ -137,7 +160,6 @@ exports.lua_bayes_init_statfile = function(classifier_ucl, statfile_ucl, symbol, if ev_base then rspamd_config:add_periodic(ev_base, 0.0, function(cfg, _) - local function stat_redis_cb(err, data) lua_util.debugm(N, cfg, 'stat redis cb: %s, %s', err, data) @@ -162,11 +184,22 @@ exports.lua_bayes_init_statfile = function(classifier_ucl, statfile_ucl, symbol, end end + -- Convert class_label to learn key + local learn_key + if class_label == true or class_label == "true" or class_label == "S" then + learn_key = "learns_spam" + elseif class_label == false or class_label == "false" or class_label == "H" then + learn_key = "learns_ham" + else + -- For other class labels, use learns_<class_label> + learn_key = "learns_" .. string.lower(tostring(class_label)) + end + lua_redis.exec_redis_script(stat_script_id, { ev_base = ev_base, cfg = cfg, is_write = false }, stat_redis_cb, { tostring(cursor), symbol, - is_spam and "learns_spam" or "learns_ham", + learn_key, tostring(max_users) }) return statfile_ucl.monitor_timeout or classifier_ucl.monitor_timeout or 30.0 end) @@ -178,7 +211,6 @@ end local function gen_cache_check_functor(redis_params, check_script_id, conf) local packed_conf = ucl.to_format(conf, 'msgpack') return function(task, cache_id, callback) - local function classify_redis_cb(err, data) lua_util.debugm(N, task, 'check cache redis cb: %s, %s (%s)', err, data, type(data)) if err then @@ -201,17 +233,16 @@ end local function gen_cache_learn_functor(redis_params, learn_script_id, conf) local packed_conf = ucl.to_format(conf, 'msgpack') - return function(task, cache_id, is_spam) + return function(task, cache_id, class_name, class_id) local function learn_redis_cb(err, data) lua_util.debugm(N, task, 'learn_cache redis cb: %s, %s', err, data) end - lua_util.debugm(N, task, 'try to learn cache: %s', cache_id) + lua_util.debugm(N, task, 'try to learn cache: %s as %s (id=%s)', cache_id, class_name, class_id) lua_redis.exec_redis_script(learn_script_id, { task = task, is_write = true, key = cache_id }, learn_redis_cb, - { cache_id, is_spam and "1" or "0", packed_conf }) - + { cache_id, tostring(class_id), packed_conf }) end end diff --git a/lualib/lua_cfg_transform.lua b/lualib/lua_cfg_transform.lua index 265ca34c0..ec11ef299 100644 --- a/lualib/lua_cfg_transform.lua +++ b/lualib/lua_cfg_transform.lua @@ -198,20 +198,22 @@ end local function symbol_transform(cfg, k, v) local groups = cfg:at('group') - -- first try to find any group where there is a definition of this symbol - for gr_n, gr in groups:pairs() do - local symbols = gr:at('symbols') - if symbols and symbols:at(k) then - -- We override group symbol with ungrouped symbol - logger.infox("overriding group symbol %s in the group %s", k, gr_n) - symbols[k] = lua_util.override_defaults(symbols:at(k):unwrap(), v:unwrap()) - return + if groups then + -- first try to find any group where there is a definition of this symbol + for gr_n, gr in groups:pairs() do + local symbols = gr:at('symbols') + if symbols and symbols:at(k) then + -- We override group symbol with ungrouped symbol + logger.infox("overriding group symbol %s in the group %s", k, gr_n) + symbols[k] = lua_util.override_defaults(symbols:at(k):unwrap(), v:unwrap()) + return + end end end -- Now check what Rspamd knows about this symbol local sym = rspamd_config:get_symbol(k) - if not sym or not sym.group then + if groups and (not sym or not sym.group) then -- Otherwise we just use group 'ungrouped' if not groups:at('ungrouped') then groups.ungrouped = { @@ -374,7 +376,7 @@ return function(cfg) local next_act = actions_order[j] if actions:at(next_act) and actions:at(next_act):type() == 'number' then local next_score = actions:at(next_act):unwrap() - if next_score <= score then + if type(score) == 'number' and type(next_score) == 'number' and next_score <= score then logger.errx(rspamd_config, 'invalid actions thresholds order: action %s (%s) must have lower ' .. 'score than action %s (%s)', act, score, next_act, next_score) ret = false diff --git a/lualib/lua_dkim_tools.lua b/lualib/lua_dkim_tools.lua index b7f520fae..69c9462b5 100644 --- a/lualib/lua_dkim_tools.lua +++ b/lualib/lua_dkim_tools.lua @@ -13,7 +13,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -]]-- +]] -- local exports = {} @@ -33,7 +33,7 @@ local function check_violation(N, task, domain) if task:has_symbol(sym_check) then local sym = task:get_symbol(sym_check)[1] logger.infox(task, 'skip signing for %s: violation %s found: %s', - domain, sym_check, sym.options) + domain, sym_check, sym.options) return false end @@ -92,7 +92,6 @@ local function parse_dkim_http_headers(N, task, settings) local key = task:get_request_header(headers.key_header) if not (domain and selector and key) then - logger.errx(task, 'missing required headers to sign email') return false, {} end @@ -258,14 +257,14 @@ local function prepare_dkim_signing(N, task, settings) -- OpenDKIM style if is_skip_sign() then lua_util.debugm(N, task, - 'skip signing: is_sign_network: %s, is_authed: %s, is_local: %s', - is_sign_networks, is_authed, is_local) + 'skip signing: is_sign_network: %s, is_authed: %s, is_local: %s', + is_sign_networks, is_authed, is_local) return false, {} end if not hfrom or not hfrom[1] or not hfrom[1].addr then lua_util.debugm(N, task, - 'signing_table: cannot get data when no header from is presented') + 'signing_table: cannot get data when no header from is presented') return false, {} end local sign_entry = settings.signing_table:get_key(hfrom[1].addr:lower()) @@ -273,7 +272,7 @@ local function prepare_dkim_signing(N, task, settings) if sign_entry then -- Check opendkim style entries lua_util.debugm(N, task, - 'signing_table: found entry for %s: %s', hfrom[1].addr, sign_entry) + 'signing_table: found entry for %s: %s', hfrom[1].addr, sign_entry) if sign_entry == '%' then sign_entry = hdom end @@ -291,7 +290,7 @@ local function prepare_dkim_signing(N, task, settings) if not selector then logger.errx(task, 'no selector defined for sign_entry %s, key_entry %s', - sign_entry, key_entry) + sign_entry, key_entry) return false, {} end @@ -305,11 +304,11 @@ local function prepare_dkim_signing(N, task, settings) if st:sub(1, 1) == '/' or st == './' or st == '..' then res.key = parts[2]:gsub('%%', hdom) lua_util.debugm(N, task, 'perform dkim signing for %s, selector=%s, domain=%s, key file=%s', - hdom, selector, res.domain, res.key) + hdom, selector, res.domain, res.key) else res.rawkey = parts[2] -- No sanity check here lua_util.debugm(N, task, 'perform dkim signing for %s, selector=%s, domain=%s, raw key used', - hdom, selector, res.domain) + hdom, selector, res.domain) end return true, { res } @@ -327,56 +326,56 @@ local function prepare_dkim_signing(N, task, settings) if st:sub(1, 1) == '/' or st == './' or st == '..' then res.key = parts[3]:gsub('%%', hdom) lua_util.debugm(N, task, 'perform dkim signing for %s, selector=%s, domain=%s, key file=%s', - hdom, selector, res.domain, res.key) + hdom, selector, res.domain, res.key) else res.rawkey = parts[3] -- No sanity check here lua_util.debugm(N, task, 'perform dkim signing for %s, selector=%s, domain=%s, raw key used', - hdom, selector, res.domain) + hdom, selector, res.domain) end return true, { res } else logger.errx(task, 'invalid key entry for sign entry %s: %s; when signing %s domain', - sign_entry, key_entry, hdom) + sign_entry, key_entry, hdom) return false, {} end elseif settings.use_vault then -- Sign table is presented, the rest is covered by vault lua_util.debugm(N, task, 'check vault for %s, by sign entry %s, key entry is missing', - hdom, sign_entry) + hdom, sign_entry) return true, { domain = sign_entry, vault = true } else logger.errx(task, 'missing key entry for sign entry %s; when signing %s domain', - sign_entry, hdom) + sign_entry, hdom) return false, {} end else logger.errx(task, 'cannot get key entry for signing entry %s, when signing %s domain', - sign_entry, hdom) + sign_entry, hdom) return false, {} end else lua_util.debugm(N, task, - 'signing_table: no entry for %s', hfrom[1].addr) + 'signing_table: no entry for %s', hfrom[1].addr) return false, {} end else if settings.use_domain_sign_networks and is_sign_networks then dkim_domain = get_dkim_domain('use_domain_sign_networks') lua_util.debugm(N, task, - 'sign_networks: use domain(%s) for signature: %s', - settings.use_domain_sign_networks, dkim_domain) + 'sign_networks: use domain(%s) for signature: %s', + settings.use_domain_sign_networks, dkim_domain) elseif settings.use_domain_sign_local and is_local then dkim_domain = get_dkim_domain('use_domain_sign_local') lua_util.debugm(N, task, 'local: use domain(%s) for signature: %s', - settings.use_domain_sign_local, dkim_domain) + settings.use_domain_sign_local, dkim_domain) elseif settings.use_domain_sign_inbound and not is_local and not auser then dkim_domain = get_dkim_domain('use_domain_sign_inbound') lua_util.debugm(N, task, 'inbound: use domain(%s) for signature: %s', - settings.use_domain_sign_inbound, dkim_domain) + settings.use_domain_sign_inbound, dkim_domain) elseif settings.use_domain_custom then if type(settings.use_domain_custom) == 'string' then -- Load custom function @@ -387,10 +386,10 @@ local function prepare_dkim_signing(N, task, settings) settings.use_domain_custom = res_or_err dkim_domain = settings.use_domain_custom(task) lua_util.debugm(N, task, 'use custom domain for signing: %s', - dkim_domain) + dkim_domain) else logger.errx(task, 'cannot load dkim domain custom script: invalid type: %s, expected function', - type(res_or_err)) + type(res_or_err)) settings.use_domain_custom = nil end else @@ -400,12 +399,12 @@ local function prepare_dkim_signing(N, task, settings) else dkim_domain = settings.use_domain_custom(task) lua_util.debugm(N, task, 'use custom domain for signing: %s', - dkim_domain) + dkim_domain) end else dkim_domain = get_dkim_domain('use_domain') lua_util.debugm(N, task, 'use domain(%s) for signature: %s', - settings.use_domain, dkim_domain) + settings.use_domain, dkim_domain) end end @@ -467,7 +466,7 @@ local function prepare_dkim_signing(N, task, settings) }) else lua_util.debugm(N, task, 'domain %s is not designated for vault', - dkim_domain) + dkim_domain) end else -- TODO: try every domain in the vault @@ -501,7 +500,7 @@ local function prepare_dkim_signing(N, task, settings) if ret then table.insert(p, k) lua_util.debugm(N, task, 'using mempool selector %s with key %s', - k.selector, k.key) + k.selector, k.key) end end @@ -530,11 +529,11 @@ local function prepare_dkim_signing(N, task, settings) if not settings.use_redis then insert_or_update_prop(N, task, p, 'key', - 'default path', settings.path) + 'default path', settings.path) end insert_or_update_prop(N, task, p, 'selector', - 'default selector', settings.selector) + 'default selector', settings.selector) if settings.check_violation then if not check_violation(N, task, p.domain) then @@ -543,7 +542,7 @@ local function prepare_dkim_signing(N, task, settings) end insert_or_update_prop(N, task, p, 'domain', 'dkim_domain', - dkim_domain) + dkim_domain) return #p > 0 and true or false, p end @@ -560,53 +559,53 @@ exports.sign_using_redis = function(N, task, settings, selectors, sign_func, err local function redis_key_cb(err, data) if err then err_func(string.format("cannot make request to load DKIM key for %s: %s", - rk, err)) + rk, err)) elseif type(data) ~= 'string' then lua_util.debugm(N, task, "missing DKIM key for %s", rk) else p.rawkey = data lua_util.debugm(N, task, 'found and parsed key for %s:%s in Redis', - p.domain, p.selector) + p.domain, p.selector) sign_func(task, p) end end local rret = lua_redis.redis_make_request(task, - settings.redis_params, -- connect params - rk, -- hash key - false, -- is write - redis_key_cb, --callback - 'HGET', -- command - { settings.key_prefix, rk } -- arguments + settings.redis_params, -- connect params + rk, -- hash key + false, -- is write + redis_key_cb, --callback + 'HGET', -- command + { settings.key_prefix, rk } -- arguments ) if not rret then err_func(task, - string.format("cannot make request to load DKIM key for %s", rk)) + string.format("cannot make request to load DKIM key for %s", rk)) end end for _, p in ipairs(selectors) do if settings.selector_prefix then logger.infox(task, "using selector prefix '%s' for domain '%s'", - settings.selector_prefix, p.domain); + settings.selector_prefix, p.domain); local function redis_selector_cb(err, data) if err or type(data) ~= 'string' then err_func(task, string.format("cannot make request to load DKIM selector for domain %s: %s", - p.domain, err)) + p.domain, err)) else try_redis_key(data, p) end end local rret = lua_redis.redis_make_request(task, - settings.redis_params, -- connect params - p.domain, -- hash key - false, -- is write - redis_selector_cb, --callback - 'HGET', -- command - { settings.selector_prefix, p.domain } -- arguments + settings.redis_params, -- connect params + p.domain, -- hash key + false, -- is write + redis_selector_cb, --callback + 'HGET', -- command + { settings.selector_prefix, p.domain } -- arguments ) if not rret then err_func(task, string.format("cannot make Redis request to load DKIM selector for domain %s", - p.domain)) + p.domain)) end else try_redis_key(p.selector, p) @@ -619,25 +618,25 @@ exports.sign_using_vault = function(N, task, settings, selector, sign_func, err_ local ucl = require "ucl" local full_url = string.format('%s/v1/%s/%s', - settings.vault_url, settings.vault_path or 'dkim', selector.domain) + settings.vault_url, settings.vault_path or 'dkim', selector.domain) local upstream_list = lua_util.http_upstreams_by_url(rspamd_config:get_mempool(), settings.vault_url) local function vault_callback(err, code, body, _) if code ~= 200 then err_func(task, string.format('cannot request data from the vault url: %s; %s (%s)', - full_url, err, body)) + full_url, err, body)) else local parser = ucl.parser() local res, parser_err = parser:parse_string(body) if not res then err_func(task, string.format('vault reply for %s (data=%s) cannot be parsed: %s', - full_url, body, parser_err)) + full_url, body, parser_err)) else local obj = parser:get_object() if not obj or not obj.data then err_func(task, string.format('vault reply for %s (data=%s) is invalid, no data', - full_url, body)) + full_url, body)) else local elts = obj.data.selectors or {} local errs = {} @@ -675,13 +674,13 @@ exports.sign_using_vault = function(N, task, settings, selector, sign_func, err_ alg = p.alg, } lua_util.debugm(N, task, 'found and parsed key for %s:%s in Vault', - dkim_sign_data.domain, dkim_sign_data.selector) + dkim_sign_data.domain, dkim_sign_data.selector) nvalid = nvalid + 1 sign_func(task, dkim_sign_data) end, fun.filter(is_selector_valid, elts)) for _, e in errs do lua_util.debugm(N, task, 'error found during processing Vault selectors: %s:%s', - e[1], e[2]) + e[1], e[2]) end if nvalid == 0 then @@ -707,7 +706,7 @@ exports.sign_using_vault = function(N, task, settings, selector, sign_func, err_ if not ret then err_func(task, string.format("cannot make HTTP request to load DKIM data domain %s", - selector.domain)) + selector.domain)) end end @@ -732,8 +731,7 @@ exports.process_signing_settings = function(N, settings, opts) selector_map = { 'map', 'DKIM selectors' }, signing_table = { 'glob', 'DKIM signing table' }, key_table = { 'glob', 'DKIM keys table' }, - vault_domains = { 'glob', 'DKIM signing domains in vault' }, - whitelisted_signers_map = { 'set', 'ARC trusted signers domains' } + vault_domains = { 'glob', 'DKIM signing domains in vault' } } for k, v in pairs(opts) do local maybe_map = maps_opts[k] diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua index 971ddd95f..4a5abd8ce 100644 --- a/lualib/lua_magic/patterns.lua +++ b/lualib/lua_magic/patterns.lua @@ -466,6 +466,23 @@ local patterns = { }, } }, + heic = { + matches = { + { + -- HEIC/HEIF file format signature + -- Starts with ftyp followed by specific brand identifiers + string = "^....ftyphe[im][cs]", + position = 12, + weight = 60, + }, + { + -- Alternative signature for HEIC/HEIF + string = [[^....ftypmif1]], + position = 12, + weight = 60, + }, + } + }, } return patterns diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index 3dce2e1f8..ad4ae4349 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -279,6 +279,11 @@ local types = { ct = 'image/bmp', av_check = false, }, + heic = { + type = 'image', + ct = 'image/heic', + av_check = false, + }, dwg = { type = 'image', ct = 'image/vnd.dwg', @@ -324,4 +329,4 @@ local types = { }, } -return types
\ No newline at end of file +return types diff --git a/lualib/lua_redis.lua b/lualib/lua_redis.lua index a21b97f89..195b7759f 100644 --- a/lualib/lua_redis.lua +++ b/lualib/lua_redis.lua @@ -1129,9 +1129,9 @@ local function redis_make_request_taskless(ev_base, cfg, redis_params, key, end --[[[ --- @function lua_redis.redis_make_request_taskless(ev_base, redis_params, key, is_write, callback, command, args) +-- @function lua_redis.redis_make_request_taskless(ev_base, cfg, redis_params, key, is_write, callback, command, args) -- Sends a request to Redis in context where `task` is not available for some specific use-cases --- Identical to redis_make_request() except in that first parameter is an `event base` object +-- Identical to redis_make_request() except in that first parameter is an `event base` object and the second one is the 'config' object --]] exports.rspamd_redis_make_request_taskless = redis_make_request_taskless @@ -1207,15 +1207,13 @@ local function prepare_redis_call(script) return options end -local function is_all_servers_ready(script) +local function is_any_server_ready(script) for _, s in ipairs(script.servers_ready) do - if s == "unsent" or s == "tempfail" then - return false + if s == "done" then + return true end end - - -- We assume that permanent errors are not recoverable, so we will just skip those servers - return true + return false end local function is_all_servers_failed(script) @@ -1269,7 +1267,7 @@ local function load_script_task(script, task, is_write) script.sha = data -- We assume that sha is the same on all servers script.servers_ready[idx] = "done" end - if is_all_servers_ready(script) then + if is_any_server_ready(script) then script_set_loaded(script) elseif is_all_servers_failed(script) then script.pending_upload = false @@ -1287,7 +1285,7 @@ local function load_script_task(script, task, is_write) end end - if is_all_servers_ready(script) then + if is_any_server_ready(script) then script_set_loaded(script) elseif is_all_servers_failed(script) then script.pending_upload = false @@ -1314,7 +1312,6 @@ local function load_script_taskless(script, cfg, ev_base, is_write) err, script.caller.short_src, script.caller.currentline) opt.upstream:fail() script.servers_ready[idx] = "failed" - return else -- Assume temporary error logger.infox(cfg, 'temporary error uploading script %s to %s: %s; registered from: %s:%s', @@ -1322,7 +1319,6 @@ local function load_script_taskless(script, cfg, ev_base, is_write) opt.upstream:get_addr():to_string(true), err, script.caller.short_src, script.caller.currentline) script.servers_ready[idx] = "tempfail" - return end else opt.upstream:ok() @@ -1335,7 +1331,7 @@ local function load_script_taskless(script, cfg, ev_base, is_write) script.servers_ready[idx] = "done" end - if is_all_servers_ready(script) then + if is_any_server_ready(script) then script_set_loaded(script) elseif is_all_servers_failed(script) then script.pending_upload = false @@ -1353,7 +1349,7 @@ local function load_script_taskless(script, cfg, ev_base, is_write) end end - if is_all_servers_ready(script) then + if is_any_server_ready(script) then script_set_loaded(script) elseif is_all_servers_failed(script) then script.pending_upload = false @@ -1482,6 +1478,10 @@ local function exec_redis_script(id, params, callback, keys, args) script.sha = nil script.loaded = nil script.pending_upload = true + -- We must initialize all servers as we don't know here which one failed + for i, _ in ipairs(script.servers_ready) do + script.servers_ready[i] = "unsent" + end -- Reload scripts if this has not been initiated yet if params.task then load_script_task(script, params.task) diff --git a/lualib/lua_scanners/icap.lua b/lualib/lua_scanners/icap.lua index 2e3ced034..532858793 100644 --- a/lualib/lua_scanners/icap.lua +++ b/lualib/lua_scanners/icap.lua @@ -239,13 +239,16 @@ local function icap_check(task, content, digest, rule, maybe_part) end end - local function get_req_headers() - + local function get_req_headers() local in_client_ip = task:get_from_ip() + local in_client_ip_str = in_client_ip:to_string() local req_hlen = 2 + if in_client_ip:get_version() == 6 then + in_client_ip_str = "ip6-" .. string.gsub(in_client_ip_str, ":", "-") + end if maybe_part then table.insert(req_headers, - string.format('GET http://%s/%s HTTP/1.0\r\n', in_client_ip, lua_util.url_encode_string(maybe_part:get_filename()))) + string.format('GET http://%s/%s HTTP/1.0\r\n', in_client_ip_str, lua_util.url_encode_string(maybe_part:get_filename()))) if rule.use_specific_content_type then table.insert(http_headers, string.format('Content-Type: %s/%s\r\n', maybe_part:get_detected_type())) --else diff --git a/lualib/redis_scripts/bayes_cache_learn.lua b/lualib/redis_scripts/bayes_cache_learn.lua index d8a2d878e..a7c9ac443 100644 --- a/lualib/redis_scripts/bayes_cache_learn.lua +++ b/lualib/redis_scripts/bayes_cache_learn.lua @@ -1,12 +1,15 @@ --- Lua script to perform cache checking for bayes classification +-- Lua script to perform cache checking for bayes classification (multi-class) -- This script accepts the following parameters: -- key1 - cache id --- key3 - is spam (1 or 0) +-- key2 - class_id (numeric hash of class name, computed by C side) -- key3 - configuration table in message pack local cache_id = KEYS[1] -local is_spam = KEYS[2] +local class_id = KEYS[2] local conf = cmsgpack.unpack(KEYS[3]) + +-- Use class_id directly as cache value +local cache_value = tostring(class_id) cache_id = string.sub(cache_id, 1, conf.cache_elt_len) -- Try each prefix that is in Redis (as some other instance might have set it) @@ -15,8 +18,8 @@ for i = 0, conf.cache_max_keys do local have = redis.call('HGET', prefix, cache_id) if have then - -- Already in cache, but is_spam changes when relearning - redis.call('HSET', prefix, cache_id, is_spam) + -- Already in cache, but cache_value changes when relearning + redis.call('HSET', prefix, cache_id, cache_value) return false end end @@ -30,7 +33,7 @@ for i = 0, conf.cache_max_keys do if count < lim then -- We can add it to this prefix - redis.call('HSET', prefix, cache_id, is_spam) + redis.call('HSET', prefix, cache_id, cache_value) added = true end end @@ -46,7 +49,7 @@ if not added then if exists then if not expired then redis.call('DEL', prefix) - redis.call('HSET', prefix, cache_id, is_spam) + redis.call('HSET', prefix, cache_id, cache_value) -- Do not expire anything else expired = true diff --git a/lualib/redis_scripts/bayes_classify.lua b/lualib/redis_scripts/bayes_classify.lua index e94f645fd..d6132e631 100644 --- a/lualib/redis_scripts/bayes_classify.lua +++ b/lualib/redis_scripts/bayes_classify.lua @@ -1,37 +1,68 @@ --- Lua script to perform bayes classification +-- Lua script to perform bayes classification (multi-class) -- This script accepts the following parameters: -- key1 - prefix for bayes tokens (e.g. for per-user classification) --- key2 - set of tokens encoded in messagepack array of strings +-- key2 - class labels: table of all class labels as "TABLE:label1,label2,..." +-- key3 - set of tokens encoded in messagepack array of strings local prefix = KEYS[1] -local output_spam = {} -local output_ham = {} +local class_labels_arg = KEYS[2] +local input_tokens = cmsgpack.unpack(KEYS[3]) -local learned_ham = tonumber(redis.call('HGET', prefix, 'learns_ham')) or 0 -local learned_spam = tonumber(redis.call('HGET', prefix, 'learns_spam')) or 0 +-- Parse class labels (always expect TABLE: format) +local class_labels = {} +if string.match(class_labels_arg, "^TABLE:") then + local labels_str = string.sub(class_labels_arg, 7) -- Remove "TABLE:" prefix + for label in string.gmatch(labels_str, "([^,]+)") do + table.insert(class_labels, label) + end +else + -- Legacy single class - convert to array + class_labels = { class_labels_arg } +end --- Output is a set of pairs (token_index, token_count), tokens that are not --- found are not filled. --- This optimisation will save a lot of space for sparse tokens, and in Bayes that assumption is normally held +-- Get learned counts for all classes (ordered) +local learned_counts = {} +for _, label in ipairs(class_labels) do + local key = 'learns_' .. string.lower(label) + -- Handle legacy keys for backward compatibility + if label == 'H' then + key = 'learns_ham' + elseif label == 'S' then + key = 'learns_spam' + end + table.insert(learned_counts, tonumber(redis.call('HGET', prefix, key)) or 0) +end -if learned_ham > 0 and learned_spam > 0 then - local input_tokens = cmsgpack.unpack(KEYS[2]) - for i, token in ipairs(input_tokens) do - local token_data = redis.call('HMGET', token, 'H', 'S') +-- Get token data for all classes (ordered) +local token_results = {} +for i, _ in ipairs(class_labels) do + token_results[i] = {} +end - if token_data then - local ham_count = token_data[1] - local spam_count = token_data[2] +-- Check if we have any learning data +local has_learns = false +for _, count in ipairs(learned_counts) do + if count > 0 then + has_learns = true + break + end +end - if ham_count then - table.insert(output_ham, { i, tonumber(ham_count) }) - end +if has_learns then + -- Process each token + for i, token in ipairs(input_tokens) do + local token_data = redis.call('HMGET', token, unpack(class_labels)) - if spam_count then - table.insert(output_spam, { i, tonumber(spam_count) }) + if token_data then + for j, _ in ipairs(class_labels) do + local count = token_data[j] + if count and tonumber(count) > 0 then + table.insert(token_results[j], { i, tonumber(count) }) + end end end end end -return { learned_ham, learned_spam, output_ham, output_spam }
\ No newline at end of file +-- Always return ordered arrays: [learned_counts_array, token_results_array] +return { learned_counts, token_results } diff --git a/lualib/redis_scripts/bayes_learn.lua b/lualib/redis_scripts/bayes_learn.lua index 5456165b6..ebc798fe0 100644 --- a/lualib/redis_scripts/bayes_learn.lua +++ b/lualib/redis_scripts/bayes_learn.lua @@ -1,14 +1,14 @@ --- Lua script to perform bayes learning +-- Lua script to perform bayes learning (multi-class) -- This script accepts the following parameters: -- key1 - prefix for bayes tokens (e.g. for per-user classification) --- key2 - boolean is_spam +-- key2 - class label string (e.g. "S", "H", "T") -- key3 - string symbol -- key4 - boolean is_unlearn -- key5 - set of tokens encoded in messagepack array of strings -- key6 - set of text tokens (if any) encoded in messagepack array of strings (size must be twice of `KEYS[5]`) local prefix = KEYS[1] -local is_spam = KEYS[2] == 'true' and true or false +local class_label = KEYS[2] local symbol = KEYS[3] local is_unlearn = KEYS[4] == 'true' and true or false local input_tokens = cmsgpack.unpack(KEYS[5]) @@ -18,15 +18,47 @@ if KEYS[6] then text_tokens = cmsgpack.unpack(KEYS[6]) end -local hash_key = is_spam and 'S' or 'H' -local learned_key = is_spam and 'learns_spam' or 'learns_ham' +-- Handle backward compatibility for boolean values +if class_label == 'true' then + class_label = 'S' -- spam +elseif class_label == 'false' then + class_label = 'H' -- ham +end + +local hash_key = class_label +local learned_key = 'learns_' .. string.lower(class_label) + +-- Handle legacy keys for backward compatibility +if class_label == 'S' then + learned_key = 'learns_spam' +elseif class_label == 'H' then + learned_key = 'learns_ham' +end redis.call('SADD', symbol .. '_keys', prefix) redis.call('HSET', prefix, 'version', '2') -- new schema -redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count + +-- Update learned count, but prevent it from going negative +if is_unlearn then + local current_count = tonumber(redis.call('HGET', prefix, learned_key)) or 0 + if current_count > 0 then + redis.call('HINCRBY', prefix, learned_key, -1) + end +else + redis.call('HINCRBY', prefix, learned_key, 1) +end for i, token in ipairs(input_tokens) do - redis.call('HINCRBY', token, hash_key, is_unlearn and -1 or 1) + -- Update token count, but prevent it from going negative + if is_unlearn then + local current_token_count = tonumber(redis.call('HGET', token, hash_key)) or 0 + if current_token_count > 0 then + redis.call('HINCRBY', token, hash_key, -1) + end + else + redis.call('HINCRBY', token, hash_key, 1) + end + if text_tokens then local tok1 = text_tokens[i * 2 - 1] local tok2 = text_tokens[i * 2] @@ -38,7 +70,14 @@ for i, token in ipairs(input_tokens) do redis.call('HSET', token, 'tokens', tok1) end - redis.call('ZINCRBY', prefix .. '_z', is_unlearn and -1 or 1, token) + if is_unlearn then + local current_z_score = tonumber(redis.call('ZSCORE', prefix .. '_z', token)) or 0 + if current_z_score > 0 then + redis.call('ZINCRBY', prefix .. '_z', -1, token) + end + else + redis.call('ZINCRBY', prefix .. '_z', 1, token) + end end end end diff --git a/lualib/rspamadm/dmarc_report.lua b/lualib/rspamadm/dmarc_report.lua index 71ff5d163..fb28a9264 100644 --- a/lualib/rspamadm/dmarc_report.lua +++ b/lualib/rspamadm/dmarc_report.lua @@ -99,6 +99,8 @@ local redis_attrs = { log_obj = rspamd_config, resolver = rspamadm_dns_resolver, } +local redis_attrs_write = lua_util.shallowcopy(redis_attrs) +redis_attrs_write['is_write'] = true local pool local function load_config(opts) @@ -481,7 +483,7 @@ local function prepare_report(opts, start_time, end_time, rep_key) -- Rename report key to avoid races if not opts.no_opt then - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'RENAME', rep_key, rep_key .. '_processing' }) rep_key = rep_key .. '_processing' end @@ -491,7 +493,7 @@ local function prepare_report(opts, start_time, end_time, rep_key) if not dmarc_record then if not opts.no_opt then - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'DEL', rep_key }) end logger.messagex('Cannot process reports for domain %s; invalid dmarc record', reporting_domain) @@ -554,7 +556,7 @@ local function prepare_report(opts, start_time, end_time, rep_key) lua_util.debugm(N, 'got final message: %s', message) if not opts.no_opt then - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'DEL', rep_key }) end @@ -585,7 +587,7 @@ local function process_report_date(opts, start_time, end_time, date) -- Rename index key to avoid races if not opts.no_opt then - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'RENAME', idx_key, idx_key .. '_processing' }) idx_key = idx_key .. '_processing' end @@ -595,7 +597,7 @@ local function process_report_date(opts, start_time, end_time, date) if not ret or not results then -- Remove bad key if not opts.no_opt then - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'DEL', idx_key }) end logger.messagex('Cannot get reports for %s', date) @@ -615,7 +617,7 @@ local function process_report_date(opts, start_time, end_time, date) lua_util.shuffle(reports) -- Remove processed key if not opts.no_opt then - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'DEL', idx_key }) end @@ -715,11 +717,11 @@ local function handler(args) if not opts.no_opt then lua_util.debugm(N, 'set last report date to %s', start_collection) -- Hack to avoid coroutines + async functions mess: we use async redis call here - redis_attrs.callback = function() + redis_attrs_write.callback = function() logger.messagex('Reporting collection has finished %s dates processed, %s reports: %s completed, %s failed', ndates, nreports, nsuccess, nfail) end - lua_redis.request(redis_params, redis_attrs, + lua_redis.request(redis_params, redis_attrs_write, { 'SETEX', 'rspamd_dmarc_last_collection', dmarc_settings.reporting.keys_expire * 2, tostring(start_collection) }) else diff --git a/lualib/rspamadm/mime.lua b/lualib/rspamadm/mime.lua index e0b23e16c..a20e47e23 100644 --- a/lualib/rspamadm/mime.lua +++ b/lualib/rspamadm/mime.lua @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -]]-- +]] -- local argparse = require "argparse" local ansicolors = require "ansicolors" @@ -35,94 +35,94 @@ local parser = argparse() :require_command(true) parser:option "-c --config" - :description "Path to config file" - :argname("<cfg>") - :default(rspamd_paths["CONFDIR"] .. "/" .. "rspamd.conf") + :description "Path to config file" + :argname("<cfg>") + :default(rspamd_paths["CONFDIR"] .. "/" .. "rspamd.conf") parser:mutex( - parser:flag "-j --json" - :description "JSON output", - parser:flag "-U --ucl" - :description "UCL output", - parser:flag "-M --messagepack" - :description "MessagePack output" + parser:flag "-j --json" + :description "JSON output", + parser:flag "-U --ucl" + :description "UCL output", + parser:flag "-M --messagepack" + :description "MessagePack output" ) parser:flag "-C --compact" - :description "Use compact format" + :description "Use compact format" parser:flag "--no-file" - :description "Do not print filename" + :description "Do not print filename" -- Extract subcommand local extract = parser:command "extract ex e" - :description "Extracts data from MIME messages" + :description "Extracts data from MIME messages" extract:argument "file" - :description "File to process" - :argname "<file>" - :args "+" + :description "File to process" + :argname "<file>" + :args "+" extract:flag "-t --text" - :description "Extracts plain text data from a message" + :description "Extracts plain text data from a message" extract:flag "-H --html" - :description "Extracts htm data from a message" + :description "Extracts htm data from a message" extract:option "-o --output" - :description "Output format ('raw', 'content', 'oneline', 'decoded', 'decoded_utf')" - :argname("<type>") - :convert { - raw = "raw", - content = "content", - oneline = "content_oneline", - decoded = "raw_parsed", - decoded_utf = "raw_utf" -} - :default "content" + :description "Output format ('raw', 'content', 'oneline', 'decoded', 'decoded_utf')" + :argname("<type>") + :convert { + raw = "raw", + content = "content", + oneline = "content_oneline", + decoded = "raw_parsed", + decoded_utf = "raw_utf" + } + :default "content" extract:flag "-w --words" - :description "Extracts words" + :description "Extracts words" extract:flag "-p --part" - :description "Show part info" + :description "Show part info" extract:flag "-s --structure" - :description "Show structure info (e.g. HTML tags)" + :description "Show structure info (e.g. HTML tags)" extract:flag "-i --invisible" - :description "Show invisible content for HTML parts" + :description "Show invisible content for HTML parts" extract:option "-F --words-format" - :description "Words format ('stem', 'norm', 'raw', 'full')" - :argname("<type>") - :convert { - stem = "stem", - norm = "norm", - raw = "raw", - full = "full", -} - :default "stem" + :description "Words format ('stem', 'norm', 'raw', 'full')" + :argname("<type>") + :convert { + stem = "stem", + norm = "norm", + raw = "raw", + full = "full", + } + :default "stem" local stat = parser:command "stat st s" - :description "Extracts statistical data from MIME messages" + :description "Extracts statistical data from MIME messages" stat:argument "file" :description "File to process" :argname "<file>" :args "+" stat:mutex( - stat:flag "-m --meta" - :description "Lua metatokens", - stat:flag "-b --bayes" - :description "Bayes tokens", - stat:flag "-F --fuzzy" - :description "Fuzzy hashes" + stat:flag "-m --meta" + :description "Lua metatokens", + stat:flag "-b --bayes" + :description "Bayes tokens", + stat:flag "-F --fuzzy" + :description "Fuzzy hashes" ) stat:flag "-s --shingles" :description "Show shingles for fuzzy hashes" local urls = parser:command "urls url u" - :description "Extracts URLs from MIME messages" + :description "Extracts URLs from MIME messages" urls:argument "file" :description "File to process" :argname "<file>" :args "+" urls:mutex( - urls:flag "-t --tld" - :description "Get TLDs only", - urls:flag "-H --host" - :description "Get hosts only", - urls:flag "-f --full" - :description "Show piecewise urls as processed by Rspamd" + urls:flag "-t --tld" + :description "Get TLDs only", + urls:flag "-H --host" + :description "Get hosts only", + urls:flag "-f --full" + :description "Show piecewise urls as processed by Rspamd" ) urls:flag "-u --unique" @@ -135,75 +135,75 @@ urls:flag "-r --reverse" :description "Reverse sort order" local modify = parser:command "modify mod m" - :description "Modifies MIME message" + :description "Modifies MIME message" modify:argument "file" - :description "File to process" - :argname "<file>" - :args "+" + :description "File to process" + :argname "<file>" + :args "+" modify:option "-a --add-header" - :description "Adds specific header" - :argname "<header=value>" - :count "*" + :description "Adds specific header" + :argname "<header=value>" + :count "*" modify:option "-r --remove-header" - :description "Removes specific header (all occurrences)" - :argname "<header>" - :count "*" + :description "Removes specific header (all occurrences)" + :argname "<header>" + :count "*" modify:option "-R --rewrite-header" - :description "Rewrites specific header, uses Lua string.format pattern" - :argname "<header=pattern>" - :count "*" + :description "Rewrites specific header, uses Lua string.format pattern" + :argname "<header=pattern>" + :count "*" modify:option "-t --text-footer" - :description "Adds footer to text/plain parts from a specific file" - :argname "<file>" + :description "Adds footer to text/plain parts from a specific file" + :argname "<file>" modify:option "-H --html-footer" - :description "Adds footer to text/html parts from a specific file" - :argname "<file>" + :description "Adds footer to text/html parts from a specific file" + :argname "<file>" local strip = parser:command "strip" - :description "Strip attachments from a message" + :description "Strip attachments from a message" strip:argument "file" - :description "File to process" - :argname "<file>" - :args "+" + :description "File to process" + :argname "<file>" + :args "+" strip:flag "-i --keep-images" - :description "Keep images" + :description "Keep images" strip:option "--min-text-size" - :description "Minimal text size to keep" - :argname "<size>" - :convert(tonumber) - :default(0) + :description "Minimal text size to keep" + :argname "<size>" + :convert(tonumber) + :default(0) strip:option "--max-text-size" - :description "Max text size to keep" - :argname "<size>" - :convert(tonumber) - :default(math.huge) + :description "Max text size to keep" + :argname "<size>" + :convert(tonumber) + :default(math.huge) local anonymize = parser:command "anonymize" - :description "Try to remove sensitive information from a message" + :description "Try to remove sensitive information from a message" anonymize:argument "file" - :description "File to process" - :argname "<file>" - :args "+" + :description "File to process" + :argname "<file>" + :args "+" anonymize:option "--exclude-header -X" - :description "Exclude specific headers from anonymization" - :argname "<header>" - :count "*" + :description "Exclude specific headers from anonymization" + :argname "<header>" + :count "*" anonymize:option "--include-header -I" - :description "Include specific headers from anonymization" - :argname "<header>" - :count "*" + :description "Include specific headers from anonymization" + :argname "<header>" + :count "*" anonymize:flag "--gpt" - :description "Use LLM model for anonymization (requires GPT plugin to be configured)" + :description "Use LLM model for anonymization (requires GPT plugin to be configured)" anonymize:option "--model" - :description "Model to use for anonymization" - :argname "<model>" + :description "Model to use for anonymization" + :argname "<model>" anonymize:option "--prompt" - :description "Prompt to use for anonymization" - :argname "<prompt>" + :description "Prompt to use for anonymization" + :argname "<prompt>" local sign = parser:command "sign" - :description "Performs DKIM signing" + :description "Performs DKIM signing" sign:argument "file" :description "File to process" :argname "<file>" @@ -225,33 +225,33 @@ sign:option "-t --type" :description "ARC or DKIM signing" :argname("<arc|dkim>") :convert { - ['arc'] = 'arc', - ['dkim'] = 'dkim', -} + ['arc'] = 'arc', + ['dkim'] = 'dkim', + } :default 'dkim' sign:option "-o --output" :description "Output format" :argname("<message|signature>") :convert { - ['message'] = 'message', - ['signature'] = 'signature', -} + ['message'] = 'message', + ['signature'] = 'signature', + } :default 'message' local dump = parser:command "dump" - :description "Dumps a raw message in different formats" + :description "Dumps a raw message in different formats" dump:argument "file" :description "File to process" :argname "<file>" :args "+" -- Duplicate format for convenience dump:mutex( - parser:flag "-j --json" - :description "JSON output", - parser:flag "-U --ucl" - :description "UCL output", - parser:flag "-M --messagepack" - :description "MessagePack output" + parser:flag "-j --json" + :description "JSON output", + parser:flag "-U --ucl" + :description "UCL output", + parser:flag "-M --messagepack" + :description "MessagePack output" ) dump:flag "-s --split" :description "Split the output file contents such that no content is embedded" @@ -260,7 +260,7 @@ dump:option "-o --outdir" :description "Output directory" :argname("<directory>") -local function load_config(opts) +local function load_config(opts, load_tokenizers) local _r, err = rspamd_config:load_ucl(opts['config']) if not _r then @@ -273,6 +273,23 @@ local function load_config(opts) rspamd_logger.errx('cannot process %s: %s', opts['config'], err) os.exit(1) end + + -- Load custom tokenizers if requested + if load_tokenizers then + local success, tokenizer_err = rspamd_config:load_custom_tokenizers() + if not success then + rspamd_logger.errx('cannot load custom tokenizers: %s', tokenizer_err or 'unknown error') + -- Don't exit here as custom tokenizers are optional + rspamd_logger.warnx('proceeding without custom tokenizers') + end + end +end + +-- Helper function to ensure proper cleanup of tokenizers +local function cleanup_tokenizers() + if rspamd_config then + rspamd_config:unload_custom_tokenizers() + end end local function load_task(_, fname) @@ -288,13 +305,13 @@ local function load_task(_, fname) if not res then parser:error(string.format('cannot read message from %s: %s', fname, - task)) + task)) return nil end if not task:process_message() then parser:error(string.format('cannot read message from %s: %s', fname, - 'failed to parse')) + 'failed to parse')) return nil end @@ -335,7 +352,6 @@ local function print_elts(elts, opts, func) io.write(ucl.to_format(elts, output_fmt(opts))) else fun.each(function(fname, elt) - if not opts.json and not opts.ucl then if func then elt = fun.map(func, elt) @@ -357,7 +373,7 @@ local function extract_handler(opts) if opts.words then -- Enable stemming and urls detection - load_config(opts) + load_config(opts, true) -- Load with custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) rspamd_config:init_subsystem('langdet') end @@ -372,39 +388,38 @@ local function extract_handler(opts) if not opts.json and not opts.ucl then table.insert(out, - rspamd_logger.slog('Part: %s: %s, language: %s, size: %s (%s raw), words: %s', - part:get_mimepart():get_digest():sub(1, 8), - t, - part:get_language(), - part:get_length(), part:get_raw_length(), - part:get_words_count())) + rspamd_logger.slog('Part: %s: %s, language: %s, size: %s (%s raw), words: %s', + part:get_mimepart():get_digest():sub(1, 8), + t, + part:get_language(), + part:get_length(), part:get_raw_length(), + part:get_words_count())) table.insert(out, - rspamd_logger.slog('Stats: %s', - fun.foldl(function(acc, k, v) - if acc ~= '' then - return string.format('%s, %s:%s', acc, k, v) - else - return string.format('%s:%s', k, v) - end - end, '', part:get_stats()))) + rspamd_logger.slog('Stats: %s', + fun.foldl(function(acc, k, v) + if acc ~= '' then + return string.format('%s, %s:%s', acc, k, v) + else + return string.format('%s:%s', k, v) + end + end, '', part:get_stats()))) end end end local function maybe_print_mime_part_info(part, out) if opts.part then - if not opts.json and not opts.ucl then local mtype, msubtype = part:get_type() local det_mtype, det_msubtype = part:get_detected_type() table.insert(out, - rspamd_logger.slog('Mime Part: %s: %s/%s (%s/%s detected), filename: %s (%s detected ext), size: %s', - part:get_digest():sub(1, 8), - mtype, msubtype, - det_mtype, det_msubtype, - part:get_filename(), - part:get_detected_ext(), - part:get_length())) + rspamd_logger.slog('Mime Part: %s: %s/%s (%s/%s detected), filename: %s (%s detected ext), size: %s', + part:get_digest():sub(1, 8), + mtype, msubtype, + det_mtype, det_msubtype, + part:get_filename(), + part:get_detected_ext(), + part:get_length())) end end end @@ -416,17 +431,17 @@ local function extract_handler(opts) return table.concat(words, ' ') else return table.concat( - fun.totable( - fun.map(function(w) - -- [1] - stemmed word - -- [2] - normalised word - -- [3] - raw word - -- [4] - flags (table of strings) - return string.format('%s|%s|%s(%s)', - w[3], w[2], w[1], table.concat(w[4], ',')) - end, words) - ), - ' ' + fun.totable( + fun.map(function(w) + -- [1] - stemmed word + -- [2] - normalised word + -- [3] - raw word + -- [4] - flags (table of strings) + return string.format('%s|%s|%s(%s)', + w[3], w[2], w[1], table.concat(w[4], ',')) + end, words) + ), + ' ' ) end end @@ -443,7 +458,7 @@ local function extract_handler(opts) if opts.words then local how_words = opts['words_format'] or 'stem' table.insert(out_elts[fname], 'meta_words: ' .. - print_words(task:get_meta_words(how_words), how_words == 'full')) + print_words(task:get_meta_words(how_words), how_words == 'full')) end if opts.text or opts.html then @@ -466,7 +481,7 @@ local function extract_handler(opts) if opts.words then local how_words = opts['words_format'] or 'stem' table.insert(out_elts[fname], print_words(part:get_words(how_words), - how_words == 'full')) + how_words == 'full')) else table.insert(out_elts[fname], tostring(part:get_content(how))) end @@ -480,7 +495,7 @@ local function extract_handler(opts) if opts.words then local how_words = opts['words_format'] or 'stem' table.insert(out_elts[fname], print_words(part:get_words(how_words), - how_words == 'full')) + how_words == 'full')) else if opts.structure then local hc = part:get_html() @@ -489,11 +504,11 @@ local function extract_handler(opts) local fun = require "fun" if type(elt) == 'table' then return table.concat(fun.totable( - fun.map( - function(t) - return rspamd_logger.slog("%s", t) - end, - elt)), '\n') + fun.map( + function(t) + return rspamd_logger.slog("%s", t) + end, + elt)), '\n') else return rspamd_logger.slog("%s", elt) end @@ -524,7 +539,7 @@ local function extract_handler(opts) if opts.invisible then local hc = part:get_html() table.insert(out_elts[fname], string.format('invisible content: %s', - tostring(hc:get_invisible()))) + tostring(hc:get_invisible()))) end end end @@ -544,13 +559,18 @@ local function extract_handler(opts) for _, task in ipairs(tasks) do task:destroy() end + + -- Cleanup custom tokenizers if they were loaded + if opts.words then + cleanup_tokenizers() + end end local function stat_handler(opts) local fun = require "fun" local out_elts = {} - load_config(opts) + load_config(opts, true) -- Load with custom tokenizers for stat generation rspamd_url.init(rspamd_config:get_tld_path()) rspamd_config:init_subsystem('langdet,stat') -- Needed to gen stat tokens @@ -571,10 +591,10 @@ local function stat_handler(opts) out_elts[fname] = bt process_func = function(e) return string.format('%s (%d): "%s"+"%s", [%s]', e.data, e.win, e.t1 or "", - e.t2 or "", table.concat(fun.totable( - fun.map(function(k) - return k - end, e.flags)), ",")) + e.t2 or "", table.concat(fun.totable( + fun.map(function(k) + return k + end, e.flags)), ",")) end elseif opts.fuzzy then local parts = task:get_parts() or {} @@ -601,16 +621,16 @@ local function stat_handler(opts) digest = digest, shingles = shingles, type = string.format('%s/%s', - ({ part:get_type() })[1], - ({ part:get_type() })[2]) + ({ part:get_type() })[1], + ({ part:get_type() })[2]) }) else table.insert(out_elts[fname], { digest = part:get_digest(), file = part:get_filename(), type = string.format('%s/%s', - ({ part:get_type() })[1], - ({ part:get_type() })[2]) + ({ part:get_type() })[1], + ({ part:get_type() })[2]) }) end end @@ -621,10 +641,13 @@ local function stat_handler(opts) end print_elts(out_elts, opts, process_func) + + -- Cleanup custom tokenizers + cleanup_tokenizers() end local function urls_handler(opts) - load_config(opts) + load_config(opts, false) -- URLs don't need custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) local out_elts = {} @@ -764,7 +787,7 @@ local function newline(task) end local function modify_handler(opts) - load_config(opts) + load_config(opts, false) -- Modification doesn't need custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) local function read_file(file) @@ -804,10 +827,10 @@ local function modify_handler(opts) if hname == name then local new_value = string.format(hpattern, hdr.decoded) new_value = string.format('%s:%s%s', - name, hdr.separator, - rspamd_util.fold_header(name, - rspamd_util.mime_header_encode(new_value), - task:get_newlines_type())) + name, hdr.separator, + rspamd_util.fold_header(name, + rspamd_util.mime_header_encode(new_value), + task:get_newlines_type())) out[#out + 1] = new_value return end @@ -816,12 +839,12 @@ local function modify_handler(opts) if rewrite.need_rewrite_ct then if name:lower() == 'content-type' then local nct = string.format('%s: %s/%s; charset=utf-8', - 'Content-Type', rewrite.new_ct.type, rewrite.new_ct.subtype) + 'Content-Type', rewrite.new_ct.type, rewrite.new_ct.subtype) out[#out + 1] = nct return elseif name:lower() == 'content-transfer-encoding' then out[#out + 1] = string.format('%s: %s', - 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable') + 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable') seen_cte = true return end @@ -837,13 +860,13 @@ local function modify_handler(opts) if hname and hvalue then out[#out + 1] = string.format('%s: %s', hname, - rspamd_util.fold_header(hname, hvalue, task:get_newlines_type())) + rspamd_util.fold_header(hname, hvalue, task:get_newlines_type())) end end if not seen_cte and rewrite.need_rewrite_ct then out[#out + 1] = string.format('%s: %s', - 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable') + 'Content-Transfer-Encoding', rewrite.new_cte or 'quoted-printable') end -- End of headers @@ -883,7 +906,7 @@ local function modify_handler(opts) end local function sign_handler(opts) - load_config(opts) + load_config(opts, false) -- Signing doesn't need custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) local lua_dkim = require("lua_ffi").dkim @@ -927,11 +950,11 @@ local function sign_handler(opts) io.flush() else local dkim_hdr = string.format('%s: %s%s', - 'DKIM-Signature', - rspamd_util.fold_header('DKIM-Signature', - rspamd_util.mime_header_encode(sig), - task:get_newlines_type()), - newline(task)) + 'DKIM-Signature', + rspamd_util.fold_header('DKIM-Signature', + rspamd_util.mime_header_encode(sig), + task:get_newlines_type()), + newline(task)) io.write(dkim_hdr) io.flush() task:get_content():save_in_file(1) @@ -942,7 +965,7 @@ local function sign_handler(opts) end local function strip_handler(opts) - load_config(opts) + load_config(opts, false) -- Stripping doesn't need custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) for _, fname in ipairs(opts.file) do @@ -998,7 +1021,7 @@ local function strip_handler(opts) end local function anonymize_handler(opts) - load_config(opts) + load_config(opts, false) -- Anonymization doesn't need custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) for _, fname in ipairs(opts.file) do @@ -1103,7 +1126,7 @@ local function get_dump_content(task, opts, fname) end local function dump_handler(opts) - load_config(opts) + load_config(opts, false) -- Dumping doesn't need custom tokenizers rspamd_url.init(rspamd_config:get_tld_path()) for _, fname in ipairs(opts.file) do |