diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-05-24 14:13:32 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-05-24 19:56:05 +0100 |
commit | bb638f7c7d851f20071f5f9ee77224c0173e73ae (patch) | |
tree | 25e5dd540a9064e1c189ad17e1c76ef2027db05a /lualib/rescore_utility.lua | |
parent | 08e99bfde4713e6253ce705926851c6639f65437 (diff) | |
download | rspamd-bb638f7c7d851f20071f5f9ee77224c0173e73ae.tar.gz rspamd-bb638f7c7d851f20071f5f9ee77224c0173e73ae.zip |
[Project] Move rspamadm libraries to a standard place
Diffstat (limited to 'lualib/rescore_utility.lua')
-rw-r--r-- | lualib/rescore_utility.lua | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/lualib/rescore_utility.lua b/lualib/rescore_utility.lua new file mode 100644 index 000000000..2a9372d4e --- /dev/null +++ b/lualib/rescore_utility.lua @@ -0,0 +1,211 @@ +local lua_util = require "lua_util" +local rspamd_util = require "rspamd_util" +local fun = require "fun" + +local utility = {} + +function utility.get_all_symbols(logs, ignore_symbols) + -- Returns a list of all symbols + + local symbols_set = {} + + for _, line in pairs(logs) do + line = lua_util.rspamd_str_split(line, " ") + for i=4,(#line-2) do + line[i] = line[i]:gsub("%s+", "") + if not symbols_set[line[i]] then + symbols_set[line[i]] = true + end + end + end + + local all_symbols = {} + + for symbol, _ in pairs(symbols_set) do + if not ignore_symbols[symbol] then + all_symbols[#all_symbols + 1] = symbol + end + end + + table.sort(all_symbols) + + return all_symbols +end + +function utility.read_log_file(file) + + local lines = {} + + file = assert(io.open(file, "r")) + + for line in file:lines() do + lines[#lines + 1] = line + end + + io.close(file) + + return lines +end + +function utility.get_all_logs(dir_path) + -- Reads all log files in the directory and returns a list of logs. + + if dir_path:sub(#dir_path, #dir_path) == "/" then + dir_path = dir_path:sub(1, #dir_path -1) + end + + local files = rspamd_util.glob(dir_path .. "/*.log") + local all_logs = {} + + for _, file in pairs(files) do + local logs = utility.read_log_file(file) + for _, log_line in pairs(logs) do + all_logs[#all_logs + 1] = log_line + end + end + + return all_logs +end + +function utility.get_all_symbol_scores(conf, ignore_symbols) + local counters = conf:get_symbols_counters() + + return fun.tomap(fun.map(function(elt) + return elt['symbol'],elt['weight'] + end, fun.filter(function(elt) + return not ignore_symbols[elt['symbol']] + end, counters))) +end + +function utility.generate_statistics_from_logs(logs, threshold) + + -- Returns file_stats table and list of symbol_stats table. + + local file_stats = { + no_of_emails = 0, + no_of_spam = 0, + no_of_ham = 0, + spam_percent = 0, + ham_percent = 0, + true_positives = 0, + true_negatives = 0, + false_negative_rate = 0, + false_positive_rate = 0, + overall_accuracy = 0, + fscore = 0, + avg_scan_time = 0, + slowest_file = nil, + slowest = 0 + } + + local all_symbols_stats = {} + local all_fps = {} + local all_fns = {} + + local false_positives = 0 + local false_negatives = 0 + local true_positives = 0 + local true_negatives = 0 + local no_of_emails = 0 + local no_of_spam = 0 + local no_of_ham = 0 + + for _, log in pairs(logs) do + log = lua_util.rspamd_str_trim(log) + log = lua_util.rspamd_str_split(log, " ") + + local is_spam = (log[1] == "SPAM") + local score = tonumber(log[2]) + + no_of_emails = no_of_emails + 1 + + if is_spam then + no_of_spam = no_of_spam + 1 + else + no_of_ham = no_of_ham + 1 + end + + if is_spam and (score >= threshold) then + true_positives = true_positives + 1 + elseif is_spam and (score < threshold) then + false_negatives = false_negatives + 1 + table.insert(all_fns, log[#log]) + elseif not is_spam and (score >= threshold) then + false_positives = false_positives + 1 + table.insert(all_fps, log[#log]) + else + true_negatives = true_negatives + 1 + end + + for i=4, (#log-2) do + if all_symbols_stats[log[i]] == nil then + all_symbols_stats[log[i]] = { + name = log[i], + no_of_hits = 0, + spam_hits = 0, + ham_hits = 0, + spam_overall = 0 + } + end + + all_symbols_stats[log[i]].no_of_hits = + all_symbols_stats[log[i]].no_of_hits + 1 + + if is_spam then + all_symbols_stats[log[i]].spam_hits = + all_symbols_stats[log[i]].spam_hits + 1 + else + all_symbols_stats[log[i]].ham_hits = + all_symbols_stats[log[i]].ham_hits + 1 + end + + -- Find slowest message + if (tonumber(log[#log-1]) > tonumber(file_stats.slowest)) then + file_stats.slowest = tostring(tonumber(log[#log-1])) + file_stats.slowest_file = log[#log] + end + end + end + + -- Calculating file stats + + file_stats.no_of_ham = no_of_ham + file_stats.no_of_spam = no_of_spam + file_stats.no_of_emails = no_of_emails + file_stats.true_positives = true_positives + file_stats.true_negatives = true_negatives + + if no_of_emails > 0 then + file_stats.spam_percent = no_of_spam * 100 / no_of_emails + file_stats.ham_percent = no_of_ham * 100 / no_of_emails + file_stats.overall_accuracy = (true_positives + true_negatives) * 100 / + no_of_emails + end + + if no_of_ham > 0 then + file_stats.false_positive_rate = false_positives * 100 / no_of_ham + end + + if no_of_spam > 0 then + file_stats.false_negative_rate = false_negatives * 100 / no_of_spam + end + + file_stats.fscore = 2 * true_positives / (2 + * true_positives + + false_positives + + false_negatives) + + -- Calculating symbol stats + + for _, symbol_stats in pairs(all_symbols_stats) do + symbol_stats.spam_percent = symbol_stats.spam_hits * 100 / no_of_spam + symbol_stats.ham_percent = symbol_stats.ham_hits * 100 / no_of_ham + symbol_stats.overall = symbol_stats.no_of_hits * 100 / no_of_emails + symbol_stats.spam_overall = symbol_stats.spam_percent / + (symbol_stats.spam_percent + symbol_stats.ham_percent) + end + + return file_stats, all_symbols_stats, all_fps, all_fns +end + +return utility |