about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-03-19 14:16:02 +0000
committer GitHub <noreply@github.com> 2018-03-19 14:16:02 +0000
commit bef67cce03e24465a7f7ea1ed2689727fd83a859 (patch)
tree e20ba11120adffdd05600247d003ccd756f93f03
parent e9c35582e7b22661975f5898891d02bbe9651ebf (diff)
parent c8aef97e5c42a85c64d20c0b13230d6d1094aeef (diff)
download rspamd-bef67cce03e24465a7f7ea1ed2689727fd83a859.tar.gz
rspamd-bef67cce03e24465a7f7ea1ed2689727fd83a859.zip
Merge pull request #2089 from smfreegard/rules_20180301
Corpus/Rescore improvements, new and improved rules
-rw-r--r-- lualib/rspamadm/corpus_test.lua 13
-rw-r--r-- lualib/rspamadm/rescore.lua 95
-rw-r--r-- lualib/rspamadm/rescore_utility.lua 23
-rw-r--r-- rules/headers_checks.lua 18
-rw-r--r-- rules/regexp/compromised_hosts.lua 23
-rw-r--r-- rules/regexp/headers.lua 49
6 files changed, 195 insertions, 26 deletions
diff --git a/lualib/rspamadm/corpus_test.lua b/lualib/rspamadm/corpus_test.lua
index eb93d586c..b71f96e9e 100644
--- a/lualib/rspamadm/corpus_test.lua
+++ b/lualib/rspamadm/corpus_test.lua
@@ -5,10 +5,10 @@ local lua_util = require "lua_util"
local HAM = "HAM"
local SPAM = "SPAM"
-local function scan_email(n_parellel, path, timeout)
+local function scan_email(n_parallel, path, timeout)
local rspamc_command = string.format("rspamc -j --compact -n %s -t %.3f %s",
- n_parellel, timeout, path)
+ n_parallel, timeout, path)
local result = assert(io.popen(rspamc_command))
result = result:read("*all")
return result
@@ -25,6 +25,8 @@ local function write_results(results, file)
log_line = log_line .. " " .. sym
end
+ log_line = log_line .. " " .. result.scan_time .. " " .. file .. ':' .. result.filename
+
log_line = log_line .. "\r\n"
f:write(log_line)
@@ -62,6 +64,9 @@ local function encoded_json_to_log(result)
table.insert(filtered_result.symbols, sym)
end
+ filtered_result.filename = result.filename
+ filtered_result.scan_time = result.scan_time
+
return filtered_result
end
@@ -127,8 +132,10 @@ return function (_, res)
write_results(results, output)
io.write("\nStats: \n")
+ local elapsed_time = os.time() - start_time
+ local total_msgs = no_of_ham + no_of_spam
io.write(string.format("Elapsed time: %ds\n", os.time() - start_time))
io.write(string.format("No of ham: %d\n", no_of_ham))
io.write(string.format("No of spam: %d\n", no_of_spam))
-
+ io.write(string.format("Messages/sec: %-.2f\n", (total_msgs/elapsed_time)))
end
diff --git a/lualib/rspamadm/rescore.lua b/lualib/rspamadm/rescore.lua
index c8348caa3..87e0ea2c5 100644
--- a/lualib/rspamadm/rescore.lua
+++ b/lualib/rspamadm/rescore.lua
@@ -182,9 +182,9 @@ local function calculate_fscore_from_weights(logs, all_symbols, weights, thresho
logs = update_logs(logs, new_symbol_scores)
- local file_stats, _ = rescore_utility.generate_statistics_from_logs(logs, threshold)
+ local file_stats, _, all_fps, all_fns = rescore_utility.generate_statistics_from_logs(logs, threshold)
- return file_stats.fscore
+ return file_stats.fscore, all_fps, all_fns
end
local function print_stats(logs, threshold)
@@ -196,6 +196,7 @@ F-score: %.2f
False positive rate: %.2f %%
False negative rate: %.2f %%
Overall accuracy: %.2f %%
+Slowest message: %.2f (%s)
]]
logger.message("\nStatistics at threshold: " .. threshold)
@@ -204,7 +205,9 @@ Overall accuracy: %.2f %%
file_stats.fscore,
file_stats.false_positive_rate,
file_stats.false_negative_rate,
- file_stats.overall_accuracy))
+ file_stats.overall_accuracy,
+ file_stats.slowest,
+ file_stats.slowest_file))
end
@@ -463,6 +466,67 @@ return function (args, cfg)
local original_symbol_scores = rescore_utility.get_all_symbol_scores(rspamd_config,
ignore_symbols)
+  -- Display hit frequencies (option 'z'): per-symbol table, file stats, unseen symbols
+  if opts['z'] then
+    local _, all_symbols_stats = rescore_utility.generate_statistics_from_logs(logs, threshold)
+    local t = {}
+    for _, symbol_stats in pairs(all_symbols_stats) do table.insert(t, symbol_stats) end
+    local function compare_symbols (a, b) -- local: must not leak into the global table
+      if (a.spam_overall ~= b.spam_overall) then
+        return b.spam_overall < a.spam_overall -- highest S/O first
+      end
+      if (b.spam_hits ~= a.spam_hits) then
+        return b.spam_hits < a.spam_hits -- then most spam hits
+      end
+      return b.ham_hits < a.ham_hits -- then most ham hits
+    end
+    table.sort(t, compare_symbols)
+    logger.message(string.format("%-40s %6s %6s %6s %6s %6s %6s %6s",
+      "NAME", "HITS", "HAM", "HAM%", "SPAM", "SPAM%", "S/O", "OVER%"))
+    for _, symbol_stats in ipairs(t) do -- ipairs keeps the sorted order
+      logger.message(
+        string.format("%-40s %6d %6d %6.2f %6d %6.2f %6.2f %6.2f",
+          symbol_stats.name,
+          symbol_stats.no_of_hits,
+          symbol_stats.ham_hits,
+          lua_util.round(symbol_stats.ham_percent,2),
+          symbol_stats.spam_hits,
+          lua_util.round(symbol_stats.spam_percent,2),
+          lua_util.round(symbol_stats.spam_overall,2),
+          lua_util.round(symbol_stats.overall, 2)
+        )
+      )
+    end
+
+    -- Print file statistics
+    print_stats(logs, threshold)
+
+    -- Work out how many symbols weren't seen in the corpus
+    local symbols_no_hits = {}
+    local total_symbols = 0
+    for sym in pairs(original_symbol_scores) do
+      total_symbols = total_symbols + 1
+      if (all_symbols_stats[sym] == nil) then
+        table.insert(symbols_no_hits, sym)
+      end
+    end
+    if (#symbols_no_hits > 0) then
+      table.sort(symbols_no_hits)
+      -- Calculate percentage of rules with no hits
+      local nhpct = lua_util.round((#symbols_no_hits/total_symbols)*100,2)
+      logger.message(
+        string.format('\nFound %s (%-.2f%%) symbols out of %s with no hits in corpus:',
+          #symbols_no_hits, nhpct, total_symbols
+        )
+      )
+      for _, symbol in ipairs(symbols_no_hits) do -- ipairs keeps alphabetical order
+        logger.messagex('%s', symbol)
+      end
+    end
+
+    return
+  end
+
shuffle(logs)
torch.setdefaulttensortype('torch.FloatTensor')
@@ -471,7 +535,6 @@ return function (args, cfg)
local dataset = make_dataset_from_logs(train_logs, all_symbols, reject_score)
-
-- Start of perceptron training
local input_size = #all_symbols
torch.setnumthreads(opts['threads'])
@@ -490,6 +553,8 @@ return function (args, cfg)
local best_weights = linear_module.weight[1]:clone()
local best_learning_rate
local best_weight_decay
+ local all_fps
+ local all_fns
for _,lr in ipairs(learning_rates) do
for _,wd in ipairs(penalty_weights) do
@@ -502,7 +567,7 @@ return function (args, cfg)
initial_weights)
end
- local fscore = calculate_fscore_from_weights(cv_logs,
+ local fscore, fps, fns = calculate_fscore_from_weights(cv_logs,
all_symbols,
linear_module.weight[1],
threshold)
@@ -515,6 +580,8 @@ return function (args, cfg)
best_weight_decay = wd
best_fscore = fscore
best_weights = linear_module.weight[1]:clone()
+ all_fps = fps
+ all_fns = fns
end
end
end
@@ -533,7 +600,6 @@ return function (args, cfg)
print_score_diff(new_symbol_scores, original_symbol_scores)
end
-
-- Pre-rescore test stats
logger.message("\n\nPre-rescore test stats\n")
test_logs = update_logs(test_logs, original_symbol_scores)
@@ -546,4 +612,19 @@ return function (args, cfg)
logger.messagex('Best fscore=%s, best learning rate=%s, best weight decay=%s',
best_fscore, best_learning_rate, best_weight_decay)
-end \ No newline at end of file
+
+  -- Show all FPs/FNs, useful for corpus checking and rule creation/modification
+  if (all_fps and #all_fps > 0) then
+    logger.message("\nFalse-Positives:")
+    for _, fp in ipairs(all_fps) do -- ipairs: list in insertion order
+      logger.messagex('%s', fp)
+    end
+  end
+
+  if (all_fns and #all_fns > 0) then
+    logger.message("\nFalse-Negatives:")
+    for _, fn in ipairs(all_fns) do -- ipairs: list in insertion order
+      logger.messagex('%s', fn)
+    end
+  end
+end
diff --git a/lualib/rspamadm/rescore_utility.lua b/lualib/rspamadm/rescore_utility.lua
index 7f3f40078..2a9372d4e 100644
--- a/lualib/rspamadm/rescore_utility.lua
+++ b/lualib/rspamadm/rescore_utility.lua
@@ -11,7 +11,7 @@ function utility.get_all_symbols(logs, ignore_symbols)
for _, line in pairs(logs) do
line = lua_util.rspamd_str_split(line, " ")
- for i=4,#line do
+ for i=4,(#line-2) do
line[i] = line[i]:gsub("%s+", "")
if not symbols_set[line[i]] then
symbols_set[line[i]] = true
@@ -54,7 +54,7 @@ function utility.get_all_logs(dir_path)
dir_path = dir_path:sub(1, #dir_path -1)
end
- local files = rspamd_util.glob(dir_path .. "/*")
+ local files = rspamd_util.glob(dir_path .. "/*.log")
local all_logs = {}
for _, file in pairs(files) do
@@ -92,10 +92,15 @@ function utility.generate_statistics_from_logs(logs, threshold)
false_negative_rate = 0,
false_positive_rate = 0,
overall_accuracy = 0,
- fscore = 0
+ fscore = 0,
+ avg_scan_time = 0,
+ slowest_file = nil,
+ slowest = 0
}
local all_symbols_stats = {}
+ local all_fps = {}
+ local all_fns = {}
local false_positives = 0
local false_negatives = 0
@@ -124,13 +129,15 @@ function utility.generate_statistics_from_logs(logs, threshold)
true_positives = true_positives + 1
elseif is_spam and (score < threshold) then
false_negatives = false_negatives + 1
+ table.insert(all_fns, log[#log])
elseif not is_spam and (score >= threshold) then
false_positives = false_positives + 1
+ table.insert(all_fps, log[#log])
else
true_negatives = true_negatives + 1
end
- for i=4, #log do
+ for i=4, (#log-2) do
if all_symbols_stats[log[i]] == nil then
all_symbols_stats[log[i]] = {
name = log[i],
@@ -151,6 +158,12 @@ function utility.generate_statistics_from_logs(logs, threshold)
all_symbols_stats[log[i]].ham_hits =
all_symbols_stats[log[i]].ham_hits + 1
end
+      -- Find slowest message; skip lines whose scan-time field is not numeric
+      local scan_time = tonumber(log[#log-1])
+      if scan_time and scan_time > tonumber(file_stats.slowest) then
+        file_stats.slowest = tostring(scan_time)
+        file_stats.slowest_file = log[#log]
+      end
end
end
@@ -192,7 +205,7 @@ function utility.generate_statistics_from_logs(logs, threshold)
(symbol_stats.spam_percent + symbol_stats.ham_percent)
end
- return file_stats, all_symbols_stats
+ return file_stats, all_symbols_stats, all_fps, all_fns
end
return utility
diff --git a/rules/headers_checks.lua b/rules/headers_checks.lua
index 2d2d8ec3d..a97d7483f 100644
--- a/rules/headers_checks.lua
+++ b/rules/headers_checks.lua
@@ -221,7 +221,13 @@ local check_replyto_id = rspamd_config:register_callback_symbol('CHECK_REPLYTO',
-- See if Reply-To matches the To address
local to = task:get_recipients(2)
if (to and to[1] and to[1].addr:lower() == rt[1].addr:lower()) then
- task:insert_result('REPLYTO_EQ_TO_ADDR', 1.0)
+ -- Ignore this for mailing-lists and automatic submissions
+ if (not (task:get_header('List-Unsubscribe') or
+ task:get_header('X-List') or
+ task:get_header('Auto-Submitted')))
+ then
+ task:insert_result('REPLYTO_EQ_TO_ADDR', 1.0)
+ end
else
task:insert_result('REPLYTO_DOM_NEQ_FROM_DOM', 1.0)
end
@@ -1027,3 +1033,13 @@ rspamd_config.INVALID_RCPT_8BIT = {
score = 6.0,
group = 'headers'
}
+
+rspamd_config.XM_CASE = {
+ callback = function (task)
+ local xm = task:get_header('X-mailer', true)
+ if (xm) then return true end
+ end,
+ description = 'X-mailer .vs. X-Mailer',
+ score = 0.5,
+ group = 'headers',
+}
diff --git a/rules/regexp/compromised_hosts.lua b/rules/regexp/compromised_hosts.lua
index f6427a7cd..67101a80d 100644
--- a/rules/regexp/compromised_hosts.lua
+++ b/rules/regexp/compromised_hosts.lua
@@ -138,12 +138,25 @@ reconf['HIDDEN_SOURCE_OBJ'] = {
group = "compromised_hosts"
}
-reconf['URI_HIDDEN_PATH'] = {
- re = "/\\/\\..+/U",
- description = "URL contains a UNIX hidden file/directory",
+local hidden_uri_re = rspamd_regexp.create_cached('/(?!\\/\\.well[-_]known\\/)(?:^\\.[A-Za-z0-9]|\\/\\.[A-Za-z0-9]|\\/\\.\\.\\/)/i')
+rspamd_config.URI_HIDDEN_PATH = {
+ callback = function (task)
+ local urls = task:get_urls(false)
+ if (urls) then
+ for _, url in ipairs(urls) do
+ if (not (url:is_subject() and url:is_html_displayed())) then
+ local path = url:get_path()
+ if (hidden_uri_re:match(path)) then
+ -- TODO: need url:is_schemeless() to improve this
+ return true, 1.0, url:get_text()
+ end
+ end
+ end
+ end
+ end,
+ description = 'Message contains URI with a hidden path',
score = 1.0,
- one_shot = true,
- group = "compromised_hosts"
+ group = 'compromised_hosts',
}
reconf['MID_RHS_WWW'] = {
diff --git a/rules/regexp/headers.lua b/rules/regexp/headers.lua
index e80380197..75ed91fd1 100644
--- a/rules/regexp/headers.lua
+++ b/rules/regexp/headers.lua
@@ -62,17 +62,23 @@ reconf['R_NO_SPACE_IN_FROM'] = {
group = 'header'
}
-rspamd_config.MISSING_SUBJECT = {
+-- Detects missing Subject header
+reconf['MISSING_SUBJECT'] = {
+ re = '!raw_header_exists(Subject)',
score = 2.0,
- description = 'Subject is missing inside message',
+ description = 'Subject header is missing',
+ group = 'header'
+}
+
+rspamd_config.EMPTY_SUBJECT = {
+ score = 1.0,
+ description = 'Subject header is empty',
group = 'header',
callback = function(task)
local hdr = task:get_header('Subject')
-
- if not hdr or #hdr == 0 then
+ if hdr and #hdr == 0 then
return true
end
-
return false
end
}
@@ -917,3 +923,36 @@ reconf['HAS_XOIP'] = {
score = 0.0,
group = 'headers'
}
+
+reconf['HAS_LIST_UNSUB'] = {
+  re = 'header_exists(List-Unsubscribe)', -- plain literal; no formatting needed
+  description = 'Has List-Unsubscribe header',
+  score = -0.01,
+  group = 'headers'
+}
+
+reconf['HAS_GUC_PROXY_URI'] = {
+ re = '/\\.googleusercontent\\.com\\/proxy/{url}i',
+ description = 'Has googleusercontent.com proxy URI',
+ score = 0.01,
+ group = 'experimental'
+}
+
+reconf['HAS_GOOGLE_REDIR'] = {
+ re = '/\\.google\\.com\\/url\\?/{url}i',
+ description = 'Has google.com/url redirection',
+ score = 0.01,
+ group = 'experimental'
+}
+
+reconf['XM_UA_NO_VERSION'] = {
+ re = string.format('(!%s && !%s) && (%s || %s)',
+ 'X-Mailer=/https?:/H',
+ 'User-Agent=/https?:/H',
+ 'X-Mailer=/^[^0-9]+$/H',
+ 'User-Agent=/^[^0-9]+$/H'),
+ description = 'X-Mailer/User-Agent has no version',
+ score = 0.01,
+ group = 'experimental'
+}
+