2017-06-02 01:07:28 +02:00
|
|
|
local lua_util = require "lua_util"
|
|
|
|
local rspamd_util = require "rspamd_util"
|
2018-03-01 17:00:39 +01:00
|
|
|
local fun = require "fun"
|
2017-06-02 01:07:28 +02:00
|
|
|
|
|
|
|
-- Module table: the functions defined below are attached to it.
local utility = {}
|
|
|
|
|
2018-03-03 14:36:07 +01:00
|
|
|
function utility.get_all_symbols(logs, ignore_symbols)
  -- Builds a sorted list of the distinct symbol names found in `logs`.
  --
  -- Each log entry is split on spaces; fields 4 .. (last - 1) are taken as
  -- symbol names (with any whitespace stripped). Names that are truthy keys
  -- of `ignore_symbols` are left out of the result.

  -- Pass 1: gather distinct names as keys of a set
  local seen = {}
  for _, entry in pairs(logs) do
    local fields = lua_util.rspamd_str_split(entry, " ")
    for idx = 4, #fields - 1 do
      -- Extra parentheses keep only gsub's first result (the string)
      local name = (fields[idx]:gsub("%s+", ""))
      seen[name] = true
    end
  end

  -- Pass 2: drop ignored names and flatten the set into a list
  local result = {}
  for name in pairs(seen) do
    if not ignore_symbols[name] then
      table.insert(result, name)
    end
  end

  table.sort(result)

  return result
end
|
|
|
|
|
|
|
|
function utility.read_log_file(file)
  -- Reads a log file and splits every line that contains "<basename>:",
  -- where <basename> is `file` with its leading directory part removed.
  --
  -- Returns two parallel lists:
  --   lines    - the text from the start of the line up to and including
  --              the first character of the "<basename>:" marker
  --   messages - the text following the marker
  -- Lines that do not contain the marker are skipped.
  -- Raises (via assert) if the file cannot be opened.

  local lines = {}
  local messages = {}

  local fd = assert(io.open(file, "r"))

  -- Keep only what follows the last "/" (the whole path if there is none)
  local fname = string.gsub(file, "(.*/)(.*)", "%2")
  local marker = fname .. ':'

  for line in fd:lines() do
    -- Plain search (4th argument): the file name is literal text, so
    -- characters such as "." or "-" in it must not act as pattern magic.
    local start, stop = string.find(line, marker, 1, true)

    if start then
      -- NOTE(review): the end index `start` is inclusive, so the first
      -- character of the file name is kept in the prefix; confirm this is
      -- intended (vs. `start - 1`) against the log format.
      lines[#lines + 1] = string.sub(line, 1, start)
      messages[#messages + 1] = string.sub(line, stop + 1, -1)
    end
  end

  fd:close()

  return lines, messages
end
|
|
|
|
|
2018-05-30 14:39:31 +02:00
|
|
|
function utility.get_all_logs(dirs)
  -- Collects log entries from one or more locations.
  --
  -- `dirs` is either a single path string or a list of paths. A path that
  -- ends with "/" is globbed for "*.log" files, each of which is read with
  -- utility.read_log_file; any other path is read directly as a log file.
  -- Returns two parallel lists: all log prefixes and all messages.

  if type(dirs) == 'string' then
    dirs = {dirs}
  end

  local all_logs, all_messages = {}, {}

  -- Reads one file and appends its entries to the accumulated lists
  local function collect(path)
    local logs, messages = utility.read_log_file(path)
    for idx = 1, #logs do
      all_logs[#all_logs + 1] = logs[idx]
      all_messages[#all_messages + 1] = messages[idx]
    end
  end

  for _, dir in ipairs(dirs) do
    if dir:sub(-1, -1) ~= "/" then
      -- No trailing slash: the path itself is read as a log file
      collect(dir)
    else
      local pattern = dir:sub(1, -2) .. "/*.log"
      for _, file in pairs(rspamd_util.glob(pattern)) do
        collect(file)
      end
    end
  end

  return all_logs, all_messages
end
|
|
|
|
|
2018-03-03 14:36:07 +01:00
|
|
|
function utility.get_all_symbol_scores(conf, ignore_symbols)
  -- Returns a map of symbol name -> score, built from
  -- conf:get_symbols_scores(), leaving out every name that is a truthy key
  -- of `ignore_symbols`.

  local symbols = conf:get_symbols_scores()

  -- Keeps only symbols that are not listed in ignore_symbols
  local function is_kept(name)
    return not ignore_symbols[name]
  end

  -- Reduces each entry to its name and its 'score' field
  local function to_score(name, elt)
    return name, elt['score']
  end

  local kept = fun.filter(is_kept, symbols)

  return fun.tomap(fun.map(to_score, kept))
end
|
|
|
|
|
2018-05-30 15:54:41 +02:00
|
|
|
function utility.generate_statistics_from_logs(logs, messages, threshold)
  -- Computes classification statistics from parsed log entries.
  --
  -- logs      - list of log strings; after trimming and splitting on spaces,
  --             field 1 is compared with "SPAM", field 2 is the numeric
  --             score, fields 4 .. (last - 1) are symbol names and the last
  --             field is read as the scan time
  -- messages  - list parallel to `logs`; messages[i] identifies entry i
  -- threshold - a score >= threshold counts as "classified as spam"
  --
  -- Returns four values:
  --   file_stats        - table of totals, percentages, rates and fscore
  --   all_symbols_stats - map symbol -> per-symbol hit statistics
  --   all_fps           - messages of false positives
  --   all_fns           - messages of false negatives
  --
  -- An entry whose second field is not numeric makes the score comparison
  -- raise an error. Ratios whose denominator is zero are reported as 0.

  local file_stats = {
    no_of_emails = 0,
    no_of_spam = 0,
    no_of_ham = 0,
    spam_percent = 0,
    ham_percent = 0,
    true_positives = 0,
    true_negatives = 0,
    false_negative_rate = 0,
    false_positive_rate = 0,
    overall_accuracy = 0,
    fscore = 0,
    avg_scan_time = 0, -- never updated by this function
    slowest_file = nil,
    slowest = 0
  }

  local all_symbols_stats = {}
  local all_fps = {}
  local all_fns = {}

  local false_positives = 0
  local false_negatives = 0
  local true_positives = 0
  local true_negatives = 0
  local no_of_emails = 0
  local no_of_spam = 0
  local no_of_ham = 0

  -- part / whole as a percentage; 0 when there is nothing to divide by
  local function percent(part, whole)
    if whole > 0 then
      return part * 100 / whole
    end
    return 0
  end

  for i, log in ipairs(logs) do
    log = lua_util.rspamd_str_trim(log)
    log = lua_util.rspamd_str_split(log, " ")
    local message = messages[i]

    local is_spam = (log[1] == "SPAM")
    local score = tonumber(log[2])

    no_of_emails = no_of_emails + 1

    if is_spam then
      no_of_spam = no_of_spam + 1
    else
      no_of_ham = no_of_ham + 1
    end

    if is_spam and (score >= threshold) then
      true_positives = true_positives + 1
    elseif is_spam and (score < threshold) then
      false_negatives = false_negatives + 1
      table.insert(all_fns, message)
    elseif not is_spam and (score >= threshold) then
      false_positives = false_positives + 1
      table.insert(all_fps, message)
    else
      true_negatives = true_negatives + 1
    end

    for j = 4, (#log - 1) do
      local sym = log[j]
      local stats = all_symbols_stats[sym]

      if stats == nil then
        stats = {
          -- NOTE(review): `name` holds the message of the first entry in
          -- which the symbol was seen, not the symbol itself; confirm this
          -- is what callers expect.
          name = message,
          no_of_hits = 0,
          spam_hits = 0,
          ham_hits = 0,
          spam_overall = 0
        }
        all_symbols_stats[sym] = stats
      end

      stats.no_of_hits = stats.no_of_hits + 1

      if is_spam then
        stats.spam_hits = stats.spam_hits + 1
      else
        stats.ham_hits = stats.ham_hits + 1
      end
    end

    -- Find slowest message. Checked once per message (not once per symbol)
    -- so that messages without any symbol are considered too; the length
    -- guard makes sure the last field is not one of the three leading ones.
    if #log >= 4 then
      local scan_time = tonumber(log[#log]) or 0
      if scan_time > file_stats.slowest then
        file_stats.slowest = scan_time
        file_stats.slowest_file = message
      end
    end
  end

  -- Calculating file stats

  file_stats.no_of_ham = no_of_ham
  file_stats.no_of_spam = no_of_spam
  file_stats.no_of_emails = no_of_emails
  file_stats.true_positives = true_positives
  file_stats.true_negatives = true_negatives

  file_stats.spam_percent = percent(no_of_spam, no_of_emails)
  file_stats.ham_percent = percent(no_of_ham, no_of_emails)
  file_stats.overall_accuracy = percent(true_positives + true_negatives,
      no_of_emails)
  file_stats.false_positive_rate = percent(false_positives, no_of_ham)
  file_stats.false_negative_rate = percent(false_negatives, no_of_spam)

  -- fscore stays 0 when there are no positives at all (avoids 0/0 = NaN)
  local fscore_denominator = 2 * true_positives
      + false_positives
      + false_negatives
  if fscore_denominator > 0 then
    file_stats.fscore = 2 * true_positives / fscore_denominator
  end

  -- Calculating symbol stats

  for _, symbol_stats in pairs(all_symbols_stats) do
    symbol_stats.spam_percent = percent(symbol_stats.spam_hits, no_of_spam)
    symbol_stats.ham_percent = percent(symbol_stats.ham_hits, no_of_ham)
    symbol_stats.overall = percent(symbol_stats.no_of_hits, no_of_emails)

    local total_percent = symbol_stats.spam_percent + symbol_stats.ham_percent
    if total_percent > 0 then
      symbol_stats.spam_overall = symbol_stats.spam_percent / total_percent
    end
  end

  return file_stats, all_symbols_stats, all_fps, all_fns
end
|
|
|
|
|
|
|
|
-- Export the module table.
return utility
|