diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-08-07 12:19:35 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-08-07 12:19:35 +0100 |
commit | 416ba555dd278fa771ca862e174b578ad4bba958 (patch) | |
tree | e8ba5b1041770761aabd36556072295551794285 /lualib/rescore_utility.lua | |
parent | 460a82484915f5fcbf34f65194c3437fa2a4e0c7 (diff) | |
download | rspamd-416ba555dd278fa771ca862e174b578ad4bba958.tar.gz rspamd-416ba555dd278fa771ca862e174b578ad4bba958.zip |
[Minor] Remove unused utility, as it has been broken for ages
Diffstat (limited to 'lualib/rescore_utility.lua')
-rw-r--r-- | lualib/rescore_utility.lua | 230 |
1 files changed, 0 insertions, 230 deletions
diff --git a/lualib/rescore_utility.lua b/lualib/rescore_utility.lua deleted file mode 100644 index 01e5aabcb..000000000 --- a/lualib/rescore_utility.lua +++ /dev/null @@ -1,230 +0,0 @@ -local lua_util = require "lua_util" -local rspamd_util = require "rspamd_util" -local fun = require "fun" - -local utility = {} - -function utility.get_all_symbols(logs, ignore_symbols) - -- Returns a list of all symbols - - local symbols_set = {} - - for _, line in pairs(logs) do - line = lua_util.rspamd_str_split(line, " ") - for i = 4, (#line - 1) do - line[i] = line[i]:gsub("%s+", "") - if not symbols_set[line[i]] then - symbols_set[line[i]] = true - end - end - end - - local all_symbols = {} - - for symbol, _ in pairs(symbols_set) do - if not ignore_symbols[symbol] then - all_symbols[#all_symbols + 1] = symbol - end - end - - table.sort(all_symbols) - - return all_symbols -end - -function utility.read_log_file(file) - - local lines = {} - local messages = {} - - local fd = assert(io.open(file, "r")) - local fname = string.gsub(file, "(.*/)(.*)", "%2") - - for line in fd:lines() do - local start, stop = string.find(line, fname .. ':') - - if start and stop then - table.insert(lines, string.sub(line, 1, start)) - table.insert(messages, string.sub(line, stop + 1, -1)) - end - end - - io.close(fd) - - return lines, messages -end - -function utility.get_all_logs(dirs) - -- Reads all log files in the directory and returns a list of logs. - - if type(dirs) == 'string' then - dirs = { dirs } - end - - local all_logs = {} - local all_messages = {} - - for _, dir in ipairs(dirs) do - if dir:sub(-1, -1) == "/" then - dir = dir:sub(1, -2) - local files = rspamd_util.glob(dir .. "/*.log") - for _, file in pairs(files) do - local logs, messages = utility.read_log_file(file) - for i = 1, #logs do - table.insert(all_logs, logs[i]) - table.insert(all_messages, messages[i]) - end - end - else - local logs, messages = utility.read_log_file(dir) - for i = 1, #logs do - table.insert(all_logs, logs[i]) - table.insert(all_messages, messages[i]) - end - end - end - - return all_logs, all_messages -end - -function utility.get_all_symbol_scores(conf, ignore_symbols) - local symbols = conf:get_symbols_scores() - - return fun.tomap(fun.map(function(name, elt) - return name, elt['score'] - end, fun.filter(function(name, elt) - return not ignore_symbols[name] - end, symbols))) -end - -function utility.generate_statistics_from_logs(logs, messages, threshold) - - -- Returns file_stats table and list of symbol_stats table. - - local file_stats = { - no_of_emails = 0, - no_of_spam = 0, - no_of_ham = 0, - spam_percent = 0, - ham_percent = 0, - true_positives = 0, - true_negatives = 0, - false_negative_rate = 0, - false_positive_rate = 0, - overall_accuracy = 0, - fscore = 0, - avg_scan_time = 0, - slowest_file = nil, - slowest = 0 - } - - local all_symbols_stats = {} - local all_fps = {} - local all_fns = {} - - local false_positives = 0 - local false_negatives = 0 - local true_positives = 0 - local true_negatives = 0 - local no_of_emails = 0 - local no_of_spam = 0 - local no_of_ham = 0 - - for i, log in ipairs(logs) do - log = lua_util.rspamd_str_trim(log) - log = lua_util.rspamd_str_split(log, " ") - local message = messages[i] - - local is_spam = (log[1] == "SPAM") - local score = tonumber(log[2]) - - no_of_emails = no_of_emails + 1 - - if is_spam then - no_of_spam = no_of_spam + 1 - else - no_of_ham = no_of_ham + 1 - end - - if is_spam and (score >= threshold) then - true_positives = true_positives + 1 - elseif is_spam and (score < threshold) then - false_negatives = false_negatives + 1 - table.insert(all_fns, message) - elseif not is_spam and (score >= threshold) then - false_positives = false_positives + 1 - table.insert(all_fps, message) - else - true_negatives = true_negatives + 1 - end - - for j = 4, (#log - 1) do - if all_symbols_stats[log[j]] == nil then - all_symbols_stats[log[j]] = { - name = message, - no_of_hits = 0, - spam_hits = 0, - ham_hits = 0, - spam_overall = 0 - } - end - local sym = log[j] - - all_symbols_stats[sym].no_of_hits = all_symbols_stats[sym].no_of_hits + 1 - - if is_spam then - all_symbols_stats[sym].spam_hits = all_symbols_stats[sym].spam_hits + 1 - else - all_symbols_stats[sym].ham_hits = all_symbols_stats[sym].ham_hits + 1 - end - - -- Find slowest message - if ((tonumber(log[#log]) or 0) > file_stats.slowest) then - file_stats.slowest = tonumber(log[#log]) - file_stats.slowest_file = message - end - end - end - - -- Calculating file stats - - file_stats.no_of_ham = no_of_ham - file_stats.no_of_spam = no_of_spam - file_stats.no_of_emails = no_of_emails - file_stats.true_positives = true_positives - file_stats.true_negatives = true_negatives - - if no_of_emails > 0 then - file_stats.spam_percent = no_of_spam * 100 / no_of_emails - file_stats.ham_percent = no_of_ham * 100 / no_of_emails - file_stats.overall_accuracy = (true_positives + true_negatives) * 100 / - no_of_emails - end - - if no_of_ham > 0 then - file_stats.false_positive_rate = false_positives * 100 / no_of_ham - end - - if no_of_spam > 0 then - file_stats.false_negative_rate = false_negatives * 100 / no_of_spam - end - - file_stats.fscore = 2 * true_positives / (2 - * true_positives - + false_positives - + false_negatives) - - -- Calculating symbol stats - - for _, symbol_stats in pairs(all_symbols_stats) do - symbol_stats.spam_percent = symbol_stats.spam_hits * 100 / no_of_spam - symbol_stats.ham_percent = symbol_stats.ham_hits * 100 / no_of_ham - symbol_stats.overall = symbol_stats.no_of_hits * 100 / no_of_emails - symbol_stats.spam_overall = symbol_stats.spam_percent / - (symbol_stats.spam_percent + symbol_stats.ham_percent) - end - - return file_stats, all_symbols_stats, all_fps, all_fns -end - -return utility |