-- rspamd/lualib/rspamadm/corpus_test.lua
-- 187 lines, 4.7 KiB, Lua

local rspamd_logger = require "rspamd_logger"
-- 2017-06-02 01:07:28 +02:00
local ucl = require "ucl"
local lua_util = require "lua_util"
local argparse = require "argparse"
-- Command-line definition for `rspamadm corpus_test`.
local parser = argparse()
    :name("rspamadm corpus_test")
    :description("Create logs files from email corpus")
    :help_description_margin(32)

-- Corpus locations
parser:option("-H --ham")
    :description("Ham directory")
    :argname("<dir>")
parser:option("-S --spam")
    :description("Spam directory")
    :argname("<dir>")

-- Scanning parameters
parser:option("-n --conns")
    :description("Number of parallel connections")
    :argname("<N>")
    :convert(tonumber)
    :default(10)
parser:option("-o --output")
    :description("Output file")
    :argname("<file>")
    :default('results.log')
parser:option("-t --timeout")
    :description("Timeout for client connections")
    :argname("<sec>")
    :convert(tonumber)
    :default(60)

-- How to reach the scanner
parser:option("-c --connect")
    :description("Connect to specific host")
    :argname("<host>")
    :default('localhost:11334')
parser:option("-r --rspamc")
    :description("Use specific rspamc path")
    :argname("<path>")
    :default('rspamc')
-- 2017-06-02 01:07:28 +02:00
-- Labels written as the first field of every log line.
local HAM, SPAM = "HAM", "SPAM"
-- Parsed command-line options; assigned in handler() and read by scan_email().
local opts
-- 2017-06-02 01:07:28 +02:00
-- 2018-03-19 13:11:24 +01:00
--- Runs rspamc over `path` and returns everything it wrote to stdout.
-- The executable and the `--connect` target are taken from the module-level
-- `opts` table (`opts.rspamc`, `opts.connect`), so `opts` must be set first.
-- NOTE(review): the command line is built with plain string formatting and
-- executed through io.popen; no argument is quoted or escaped, so values
-- containing spaces or shell metacharacters will be interpreted by the shell.
-- @param n_parallel value passed to rspamc's `-n` flag
-- @param path value appended as the last rspamc argument
-- @param timeout number passed to rspamc's `-t` flag (formatted with %.3f)
-- @return string with the complete output of the command
local function scan_email(n_parallel, path, timeout)
  local rspamc_command = string.format("%s --connect %s -j --compact -n %s -t %.3f %s",
      opts.rspamc, opts.connect, n_parallel, timeout, path)
  local pipe = assert(io.popen(rspamc_command))
  local output = pipe:read("*all")
  -- Release the pipe right away instead of leaving it to the garbage
  -- collector.
  pipe:close()
  return output
end
-- 2017-06-02 01:07:28 +02:00
--- Writes one log line per scan result to `file`.
-- Line layout (space separated, terminated by "\r\n"):
--   <type> <score, 2 decimals> <action> <symbol>... <scan_time> <file>:<filename>
-- where <file> is the output path passed to this function.
-- @param results sequence of tables with the fields `type`, `score`,
--   `action`, `symbols` (sequence of strings), `scan_time` and `filename`
-- @param file path of the log file; it is opened in 'w' mode
-- Raises an error if the file cannot be opened.
local function write_results(results, file)
  -- Fail with the message from io.open rather than an obscure
  -- "attempt to index a nil value" further down.
  local f = assert(io.open(file, 'w'))

  for _, result in ipairs(results) do
    -- Collect the fields and join them once instead of growing a string
    -- with repeated concatenation.
    local fields = {
      string.format("%s %.2f %s", result.type, result.score, result.action),
    }
    for _, sym in ipairs(result.symbols) do
      fields[#fields + 1] = sym
    end
    fields[#fields + 1] = result.scan_time .. " " .. file .. ':' .. result.filename

    f:write(table.concat(fields, " "), "\r\n")
  end

  f:close()
end
--- Parses one encoded scan reply and extracts the fields used for logging.
-- @param result string handed to the UCL parser
-- @return table with `score`, `action` (runs of whitespace replaced by "_"),
--   `symbols` (list of the keys of the reply's `symbols` object), `filename`
--   and `scan_time`; or nil when the input cannot be parsed or has no usable
--   `action` (the problem is logged in both cases)
local function encoded_json_to_log(result)
  local ucl_parser = ucl.parser()

  local is_good, err = ucl_parser:parse_string(result)
  if not is_good then
    rspamd_logger.errx("Parser error: %1", err)
    return nil
  end

  local reply = ucl_parser:get_object()
  -- Reject anything that is not an object with a string action; indexing a
  -- non-table or calling :gsub on a non-string would raise an error.
  if type(reply) ~= 'table' or type(reply.action) ~= 'string' then
    rspamd_logger.errx("Bad JSON: %1", reply)
    return nil
  end

  -- Only the symbol names are kept. A reply without a `symbols` field yields
  -- an empty list instead of failing in pairs(nil).
  local symbols = {}
  for sym in pairs(reply.symbols or {}) do
    symbols[#symbols + 1] = sym
  end

  return {
    score = reply.score,
    -- gsub returns (string, count); the parentheses keep only the string
    action = (reply.action:gsub("%s+", "_")),
    symbols = symbols,
    filename = reply.filename,
    scan_time = reply.scan_time,
  }
end
--- Converts raw scanner output into a list of log entries.
-- The output is split on "\n" and each piece is decoded with
-- encoded_json_to_log; pieces that fail to decode are skipped.
-- @param results string with the scanner output
-- @param actual_email_type value stored in the `type` field of every entry
-- @return list of decoded entries
local function scan_results_to_logs(results, actual_email_type)
  local lines = lua_util.rspamd_str_split(results, "\n")

  -- Drop a trailing empty piece, if the split produced one
  if lines[#lines] == "" then
    lines[#lines] = nil
  end

  local logs = {}
  for _, line in pairs(lines) do
    local entry = encoded_json_to_log(line)
    if entry then
      entry.type = actual_email_type
      logs[#logs + 1] = entry
    end
  end

  return logs
end
--- Entry point of the command: scans the ham and/or spam corpora, writes the
-- combined results to the output file and logs basic statistics.
-- Stores the parsed options in the module-level `opts`, which scan_email reads.
-- @param args argument list handed to the argparse parser
local function handler(args)
  opts = parser:parse(args)

  local output = opts["output"]
  local results = {}
  local start_time = os.time()

  -- Scans one corpus directory (if given), appends its entries to `results`
  -- and returns how many entries were added.
  local function scan_corpus(directory, email_type, label)
    if not directory then
      return 0
    end

    rspamd_logger.messagex("Scanning " .. label .. " corpus...")
    local raw = scan_email(opts["conns"], directory, opts["timeout"])
    local entries = scan_results_to_logs(raw, email_type)

    for _, entry in ipairs(entries) do
      results[#results + 1] = entry
    end

    return #entries
  end

  local no_of_ham = scan_corpus(opts['ham'], HAM, "ham")
  local no_of_spam = scan_corpus(opts['spam'], SPAM, "spam")

  rspamd_logger.messagex("Writing results to %s", output)
  write_results(results, output)

  rspamd_logger.messagex("Stats: ")
  local elapsed_time = os.time() - start_time
  local total_msgs = no_of_ham + no_of_spam
  rspamd_logger.messagex("Elapsed time: %ss", elapsed_time)
  rspamd_logger.messagex("No of ham: %s", no_of_ham)
  rspamd_logger.messagex("No of spam: %s", no_of_spam)
  -- A run that finishes within the same os.time() tick measures 0 elapsed
  -- seconds; clamp the divisor so the rate is a finite number rather than
  -- inf/nan.
  rspamd_logger.messagex("Messages/sec: %s", (total_msgs / math.max(elapsed_time, 1)))
end
-- Command descriptor returned to whoever loads this module: the command name,
-- its aliases, the function to run and the description taken from the parser.
local command = {
  name = 'corpustest',
  aliases = { 'corpus_test', 'corpus' },
  handler = handler,
  description = parser._description,
}

return command