diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-03-19 14:16:02 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-03-19 14:16:02 +0000 |
commit | bef67cce03e24465a7f7ea1ed2689727fd83a859 (patch) | |
tree | e20ba11120adffdd05600247d003ccd756f93f03 | |
parent | e9c35582e7b22661975f5898891d02bbe9651ebf (diff) | |
parent | c8aef97e5c42a85c64d20c0b13230d6d1094aeef (diff) | |
download | rspamd-bef67cce03e24465a7f7ea1ed2689727fd83a859.tar.gz rspamd-bef67cce03e24465a7f7ea1ed2689727fd83a859.zip |
Merge pull request #2089 from smfreegard/rules_20180301
Corpus/Rescore improvements, new and improved rules
-rw-r--r-- | lualib/rspamadm/corpus_test.lua | 13 | ||||
-rw-r--r-- | lualib/rspamadm/rescore.lua | 95 | ||||
-rw-r--r-- | lualib/rspamadm/rescore_utility.lua | 23 | ||||
-rw-r--r-- | rules/headers_checks.lua | 18 | ||||
-rw-r--r-- | rules/regexp/compromised_hosts.lua | 23 | ||||
-rw-r--r-- | rules/regexp/headers.lua | 49 |
6 files changed, 195 insertions, 26 deletions
diff --git a/lualib/rspamadm/corpus_test.lua b/lualib/rspamadm/corpus_test.lua index eb93d586c..b71f96e9e 100644 --- a/lualib/rspamadm/corpus_test.lua +++ b/lualib/rspamadm/corpus_test.lua @@ -5,10 +5,10 @@ local lua_util = require "lua_util" local HAM = "HAM" local SPAM = "SPAM" -local function scan_email(n_parellel, path, timeout) +local function scan_email(n_parallel, path, timeout) local rspamc_command = string.format("rspamc -j --compact -n %s -t %.3f %s", - n_parellel, timeout, path) + n_parallel, timeout, path) local result = assert(io.popen(rspamc_command)) result = result:read("*all") return result @@ -25,6 +25,8 @@ local function write_results(results, file) log_line = log_line .. " " .. sym end + log_line = log_line .. " " .. result.scan_time .. " " .. file .. ':' .. result.filename + log_line = log_line .. "\r\n" f:write(log_line) @@ -62,6 +64,9 @@ local function encoded_json_to_log(result) table.insert(filtered_result.symbols, sym) end + filtered_result.filename = result.filename + filtered_result.scan_time = result.scan_time + return filtered_result end @@ -127,8 +132,10 @@ return function (_, res) write_results(results, output) io.write("\nStats: \n") + local elapsed_time = os.time() - start_time + local total_msgs = no_of_ham + no_of_spam io.write(string.format("Elapsed time: %ds\n", os.time() - start_time)) io.write(string.format("No of ham: %d\n", no_of_ham)) io.write(string.format("No of spam: %d\n", no_of_spam)) - + io.write(string.format("Messages/sec: %-.2f\n", (total_msgs/elapsed_time))) end diff --git a/lualib/rspamadm/rescore.lua b/lualib/rspamadm/rescore.lua index c8348caa3..87e0ea2c5 100644 --- a/lualib/rspamadm/rescore.lua +++ b/lualib/rspamadm/rescore.lua @@ -182,9 +182,9 @@ local function calculate_fscore_from_weights(logs, all_symbols, weights, thresho logs = update_logs(logs, new_symbol_scores) - local file_stats, _ = rescore_utility.generate_statistics_from_logs(logs, threshold) + local file_stats, _, all_fps, all_fns = rescore_utility.generate_statistics_from_logs(logs, threshold) - return file_stats.fscore + return file_stats.fscore, all_fps, all_fns end local function print_stats(logs, threshold) @@ -196,6 +196,7 @@ F-score: %.2f False positive rate: %.2f %% False negative rate: %.2f %% Overall accuracy: %.2f %% +Slowest message: %.2f (%s) ]] logger.message("\nStatistics at threshold: " .. threshold) @@ -204,7 +205,9 @@ Overall accuracy: %.2f %% file_stats.fscore, file_stats.false_positive_rate, file_stats.false_negative_rate, - file_stats.overall_accuracy)) + file_stats.overall_accuracy, + file_stats.slowest, + file_stats.slowest_file)) end @@ -463,6 +466,67 @@ return function (args, cfg) local original_symbol_scores = rescore_utility.get_all_symbol_scores(rspamd_config, ignore_symbols) + -- Display hit frequencies + if opts['z'] then + local file_stats, all_symbols_stats = rescore_utility.generate_statistics_from_logs(logs, threshold) + local t = {} + for _, symbol_stats in pairs(all_symbols_stats) do table.insert(t, symbol_stats) end + function compare_symbols (a, b) + if (a.spam_overall ~= b.spam_overall) then + return b.spam_overall < a.spam_overall + end + if (b.spam_hits ~= a.spam_hits) then + return b.spam_hits < a.spam_hits + end + return b.ham_hits < a.ham_hits + end + table.sort(t, compare_symbols) + logger.message(string.format("%-40s %6s %6s %6s %6s %6s %6s %6s", + "NAME", "HITS", "HAM", "HAM%", "SPAM", "SPAM%", "S/O", "OVER%")) + for _, symbol_stats in pairs(t) do + logger.message( + string.format("%-40s %6d %6d %6.2f %6d %6.2f %6.2f %6.2f", + symbol_stats.name, + symbol_stats.no_of_hits, + symbol_stats.ham_hits, + lua_util.round(symbol_stats.ham_percent,2), + symbol_stats.spam_hits, + lua_util.round(symbol_stats.spam_percent,2), + lua_util.round(symbol_stats.spam_overall,2), + lua_util.round(symbol_stats.overall, 2) + ) + ) + end + + -- Print file statistics + print_stats(logs, threshold) + + -- Work out how many symbols weren't seen in the corpus + local symbols_no_hits = {} + local total_symbols = 0 + for sym in pairs(original_symbol_scores) do + total_symbols = total_symbols + 1 + if (all_symbols_stats[sym] == nil) then + table.insert(symbols_no_hits, sym) + end + end + if (#symbols_no_hits > 0) then + table.sort(symbols_no_hits) + -- Calculate percentage of rules with no hits + local nhpct = lua_util.round((#symbols_no_hits/total_symbols)*100,2) + logger.message( + string.format('\nFound %s (%-.2f%%) symbols out of %s with no hits in corpus:', + #symbols_no_hits, nhpct, total_symbols + ) + ) + for _, symbol in pairs(symbols_no_hits) do + logger.messagex('%s', symbol) + end + end + + return + end + shuffle(logs) torch.setdefaulttensortype('torch.FloatTensor') @@ -471,7 +535,6 @@ return function (args, cfg) local dataset = make_dataset_from_logs(train_logs, all_symbols, reject_score) - -- Start of perceptron training local input_size = #all_symbols torch.setnumthreads(opts['threads']) @@ -490,6 +553,8 @@ return function (args, cfg) local best_weights = linear_module.weight[1]:clone() local best_learning_rate local best_weight_decay + local all_fps + local all_fns for _,lr in ipairs(learning_rates) do for _,wd in ipairs(penalty_weights) do @@ -502,7 +567,7 @@ return function (args, cfg) initial_weights) end - local fscore = calculate_fscore_from_weights(cv_logs, + local fscore, fps, fns = calculate_fscore_from_weights(cv_logs, all_symbols, linear_module.weight[1], threshold) @@ -515,6 +580,8 @@ return function (args, cfg) best_weight_decay = wd best_fscore = fscore best_weights = linear_module.weight[1]:clone() + all_fps = fps + all_fns = fns end end end @@ -533,7 +600,6 @@ return function (args, cfg) print_score_diff(new_symbol_scores, original_symbol_scores) end - -- Pre-rescore test stats logger.message("\n\nPre-rescore test stats\n") test_logs = update_logs(test_logs, original_symbol_scores) @@ -546,4 +612,19 @@ return function (args, cfg) logger.messagex('Best fscore=%s, best learning rate=%s, best weight decay=%s', best_fscore, best_learning_rate, best_weight_decay) -end
\ No newline at end of file + + -- Show all FPs/FNs, useful for corpus checking and rule creation/modification + if (all_fps and #all_fps > 0) then + logger.message("\nFalse-Positives:") + for _, fp in pairs(all_fps) do + logger.messagex('%s', fp) + end + end + + if (all_fns and #all_fns > 0) then + logger.message("\nFalse-Negatives:") + for _, fn in pairs(all_fns) do + logger.messagex('%s', fn) + end + end +end diff --git a/lualib/rspamadm/rescore_utility.lua b/lualib/rspamadm/rescore_utility.lua index 7f3f40078..2a9372d4e 100644 --- a/lualib/rspamadm/rescore_utility.lua +++ b/lualib/rspamadm/rescore_utility.lua @@ -11,7 +11,7 @@ function utility.get_all_symbols(logs, ignore_symbols) for _, line in pairs(logs) do line = lua_util.rspamd_str_split(line, " ") - for i=4,#line do + for i=4,(#line-2) do line[i] = line[i]:gsub("%s+", "") if not symbols_set[line[i]] then symbols_set[line[i]] = true @@ -54,7 +54,7 @@ function utility.get_all_logs(dir_path) dir_path = dir_path:sub(1, #dir_path -1) end - local files = rspamd_util.glob(dir_path .. "/*") + local files = rspamd_util.glob(dir_path .. "/*.log") local all_logs = {} for _, file in pairs(files) do @@ -92,10 +92,15 @@ function utility.generate_statistics_from_logs(logs, threshold) false_negative_rate = 0, false_positive_rate = 0, overall_accuracy = 0, - fscore = 0 + fscore = 0, + avg_scan_time = 0, + slowest_file = nil, + slowest = 0 } local all_symbols_stats = {} + local all_fps = {} + local all_fns = {} local false_positives = 0 local false_negatives = 0 @@ -124,13 +129,15 @@ function utility.generate_statistics_from_logs(logs, threshold) true_positives = true_positives + 1 elseif is_spam and (score < threshold) then false_negatives = false_negatives + 1 + table.insert(all_fns, log[#log]) elseif not is_spam and (score >= threshold) then false_positives = false_positives + 1 + table.insert(all_fps, log[#log]) else true_negatives = true_negatives + 1 end - for i=4, #log do + for i=4, (#log-2) do if all_symbols_stats[log[i]] == nil then all_symbols_stats[log[i]] = { name = log[i], @@ -151,6 +158,12 @@ function utility.generate_statistics_from_logs(logs, threshold) all_symbols_stats[log[i]].ham_hits = all_symbols_stats[log[i]].ham_hits + 1 end + + -- Find slowest message + if (tonumber(log[#log-1]) > tonumber(file_stats.slowest)) then + file_stats.slowest = tostring(tonumber(log[#log-1])) + file_stats.slowest_file = log[#log] + end end end @@ -192,7 +205,7 @@ function utility.generate_statistics_from_logs(logs, threshold) (symbol_stats.spam_percent + symbol_stats.ham_percent) end - return file_stats, all_symbols_stats + return file_stats, all_symbols_stats, all_fps, all_fns end return utility diff --git a/rules/headers_checks.lua b/rules/headers_checks.lua index 2d2d8ec3d..a97d7483f 100644 --- a/rules/headers_checks.lua +++ b/rules/headers_checks.lua @@ -221,7 +221,13 @@ local check_replyto_id = rspamd_config:register_callback_symbol('CHECK_REPLYTO', -- See if Reply-To matches the To address local to = task:get_recipients(2) if (to and to[1] and to[1].addr:lower() == rt[1].addr:lower()) then - task:insert_result('REPLYTO_EQ_TO_ADDR', 1.0) + -- Ignore this for mailing-lists and automatic submissions + if (not (task:get_header('List-Unsubscribe') or + task:get_header('X-List') or + task:get_header('Auto-Submitted'))) + then + task:insert_result('REPLYTO_EQ_TO_ADDR', 1.0) + end else task:insert_result('REPLYTO_DOM_NEQ_FROM_DOM', 1.0) end @@ -1027,3 +1033,13 @@ rspamd_config.INVALID_RCPT_8BIT = { score = 6.0, group = 'headers' } + +rspamd_config.XM_CASE = { + callback = function (task) + local xm = task:get_header('X-mailer', true) + if (xm) then return true end + end, + description = 'X-mailer .vs. X-Mailer', + score = 0.5, + group = 'headers', +} diff --git a/rules/regexp/compromised_hosts.lua b/rules/regexp/compromised_hosts.lua index f6427a7cd..67101a80d 100644 --- a/rules/regexp/compromised_hosts.lua +++ b/rules/regexp/compromised_hosts.lua @@ -138,12 +138,25 @@ reconf['HIDDEN_SOURCE_OBJ'] = { group = "compromised_hosts" } -reconf['URI_HIDDEN_PATH'] = { - re = "/\\/\\..+/U", - description = "URL contains a UNIX hidden file/directory", +local hidden_uri_re = rspamd_regexp.create_cached('/(?!\\/\\.well[-_]known\\/)(?:^\\.[A-Za-z0-9]|\\/\\.[A-Za-z0-9]|\\/\\.\\.\\/)/i') +rspamd_config.URI_HIDDEN_PATH = { + callback = function (task) + local urls = task:get_urls(false) + if (urls) then + for _, url in ipairs(urls) do + if (not (url:is_subject() and url:is_html_displayed())) then + local path = url:get_path() + if (hidden_uri_re:match(path)) then + -- TODO: need url:is_schemeless() to improve this + return true, 1.0, url:get_text() + end + end + end + end + end, + description = 'Message contains URI with a hidden path', score = 1.0, - one_shot = true, - group = "compromised_hosts" + group = 'compromised_hosts', } reconf['MID_RHS_WWW'] = { diff --git a/rules/regexp/headers.lua b/rules/regexp/headers.lua index e80380197..75ed91fd1 100644 --- a/rules/regexp/headers.lua +++ b/rules/regexp/headers.lua @@ -62,17 +62,23 @@ reconf['R_NO_SPACE_IN_FROM'] = { group = 'header' } -rspamd_config.MISSING_SUBJECT = { +-- Detects missing Subject header +reconf['MISSING_SUBJECT'] = { + re = '!raw_header_exists(Subject)', score = 2.0, - description = 'Subject is missing inside message', + description = 'Subject header is missing', + group = 'header' +} + +rspamd_config.EMPTY_SUBJECT = { + score = 1.0, + description = 'Subject header is empty', group = 'header', callback = function(task) local hdr = task:get_header('Subject') - - if not hdr or #hdr == 0 then + if hdr and #hdr == 0 then return true end - return false end } @@ -917,3 +923,36 @@ reconf['HAS_XOIP'] = { score = 0.0, group = 'headers' } + +reconf['HAS_LIST_UNSUB'] = { + re = string.format('%s', 'header_exists(List-Unsubscribe)'), + description = 'Has List-Unsubscribe header', + score = -0.01, + group = 'headers' +} + +reconf['HAS_GUC_PROXY_URI'] = { + re = '/\\.googleusercontent\\.com\\/proxy/{url}i', + description = 'Has googleusercontent.com proxy URI', + score = 0.01, + group = 'experimental' +} + +reconf['HAS_GOOGLE_REDIR'] = { + re = '/\\.google\\.com\\/url\\?/{url}i', + description = 'Has google.com/url redirection', + score = 0.01, + group = 'experimental' +} + +reconf['XM_UA_NO_VERSION'] = { + re = string.format('(!%s && !%s) && (%s || %s)', + 'X-Mailer=/https?:/H', + 'User-Agent=/https?:/H', + 'X-Mailer=/^[^0-9]+$/H', + 'User-Agent=/^[^0-9]+$/H'), + description = 'X-Mailer/User-Agent has no version', + score = 0.01, + group = 'experimental' +} + |