1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
|
local lua_util = require "lua_util"
local rspamd_util = require "rspamd_util"
local fun = require "fun"
local utility = {}
--- Collect the sorted list of distinct symbol names found in log lines.
-- Every line is split on single spaces; the tokens from the 4th up to the
-- second-to-last one are taken as symbol names, with any whitespace left
-- inside a token removed.
-- @param logs table of log line strings
-- @param ignore_symbols table keyed by symbol name; symbols with a truthy
--   entry are left out of the result
-- @return array of symbol names sorted with table.sort's default order
function utility.get_all_symbols(logs, ignore_symbols)
  local seen = {}

  for _, raw_line in pairs(logs) do
    local tokens = lua_util.rspamd_str_split(raw_line, " ")
    local last_symbol_idx = #tokens - 1
    for idx = 4, last_symbol_idx do
      -- Parentheses keep only the first gsub result (the cleaned string)
      local sym = (tokens[idx]:gsub("%s+", ""))
      seen[sym] = true
    end
  end

  local result = {}
  for sym in pairs(seen) do
    if not ignore_symbols[sym] then
      table.insert(result, sym)
    end
  end

  table.sort(result)
  return result
end
--- Read a log file and split each matching line into a log part and a
-- message part.
-- A line is kept only when it contains the base name of `file` (the text
-- after the last '/') immediately followed by ':'. Lines without that marker
-- are skipped.
-- @param file path of the log file to read
-- @return array with, for every kept line, the text from the line start up to
--   and including the first character of the marker
-- @return array with, for every kept line, the text following the marker
-- Raises an error (via assert) when the file cannot be opened.
function utility.read_log_file(file)
  local lines = {}
  local messages = {}

  local fd = assert(io.open(file, "r"))
  -- Base name: drop everything up to and including the last '/'
  local fname = string.gsub(file, "(.*/)(.*)", "%2")
  local marker = fname .. ':'

  for line in fd:lines() do
    -- Plain search: the file name is literal text, not a Lua pattern, so
    -- characters such as '-', '.', '+' or '%' in it must not be interpreted.
    local start, stop = string.find(line, marker, 1, true)

    if start then
      -- NOTE(review): the log part ends at `start`, i.e. it keeps the first
      -- character of the file name; confirm this boundary is intended.
      table.insert(lines, string.sub(line, 1, start))
      table.insert(messages, string.sub(line, stop + 1, -1))
    end
  end

  fd:close()

  return lines, messages
end
--- Gather log entries from a list of log files and/or directories.
-- An entry ending with '/' is treated as a directory: every file matching
-- "*.log" directly inside it is read. Any other entry is read as a single
-- log file. Reading is delegated to utility.read_log_file.
-- @param dirs a single path string or an array of path strings
-- @return array of log parts from all files, in reading order
-- @return array of the corresponding message parts
function utility.get_all_logs(dirs)
  local sources = dirs
  if type(sources) == 'string' then
    sources = { sources }
  end

  local all_logs, all_messages = {}, {}

  -- Read one log file and append its entries to the accumulators
  local function collect(path)
    local logs, messages = utility.read_log_file(path)
    for idx = 1, #logs do
      table.insert(all_logs, logs[idx])
      table.insert(all_messages, messages[idx])
    end
  end

  for _, entry in ipairs(sources) do
    if entry:sub(-1) ~= "/" then
      collect(entry)
    else
      -- Strip the trailing slash before building the glob expression
      local base = entry:sub(1, -2)
      for _, file in pairs(rspamd_util.glob(base .. "/*.log")) do
        collect(file)
      end
    end
  end

  return all_logs, all_messages
end
--- Build a map from symbol name to score for all non-ignored symbols.
-- @param conf object whose get_symbols_scores() method supplies the symbols
--   (presumably a table keyed by symbol name whose values carry a `score`
--   field; verify against the caller)
-- @param ignore_symbols table keyed by symbol name; symbols with a truthy
--   entry are excluded
-- @return table mapping symbol name to its `score` field
function utility.get_all_symbol_scores(conf, ignore_symbols)
  local symbols = conf:get_symbols_scores()

  -- Keep only symbols that are not listed in ignore_symbols
  local function is_kept(name)
    return not ignore_symbols[name]
  end

  -- Project each entry to a (name, score) pair
  local function name_and_score(name, elt)
    return name, elt.score
  end

  return fun.tomap(fun.map(name_and_score, fun.filter(is_kept, symbols)))
end
--- Compute classification statistics from log lines.
--
-- Each log line is trimmed and split on spaces. Token 1 is compared with
-- "SPAM" to get the actual class, token 2 is the numeric score, tokens
-- 4 .. (n - 1) are counted as symbol hits and the last token is read as the
-- scan time (treated as 0 when it is not a number).
--
-- A message is predicted spam when its score is >= threshold.
--
-- @param logs array of log line strings
-- @param messages array parallel to `logs`; messages[i] identifies the
--   message of logs[i]
-- @param threshold numeric score threshold
-- @return file_stats table with overall counters and rates (rates are
--   percentages, fscore is a 0..1 ratio); rates whose denominator is zero
--   stay at 0. `avg_scan_time` is not computed here and stays 0.
-- @return table keyed by symbol with per-symbol hit counters and rates
-- @return array of messages classified as false positives
-- @return array of messages classified as false negatives
function utility.generate_statistics_from_logs(logs, messages, threshold)
  -- Percentage of `part` in `total`; 0 when `total` is 0 so that empty
  -- classes never produce NaN.
  local function percent(part, total)
    if total > 0 then
      return part * 100 / total
    end
    return 0
  end

  local file_stats = {
    no_of_emails = 0,
    no_of_spam = 0,
    no_of_ham = 0,
    spam_percent = 0,
    ham_percent = 0,
    true_positives = 0,
    true_negatives = 0,
    false_negative_rate = 0,
    false_positive_rate = 0,
    overall_accuracy = 0,
    fscore = 0,
    avg_scan_time = 0,
    slowest_file = nil,
    slowest = 0
  }

  local all_symbols_stats = {}
  local all_fps = {}
  local all_fns = {}

  local false_positives = 0
  local false_negatives = 0
  local true_positives = 0
  local true_negatives = 0
  local no_of_emails = 0
  local no_of_spam = 0
  local no_of_ham = 0

  for i, log in ipairs(logs) do
    log = lua_util.rspamd_str_trim(log)
    log = lua_util.rspamd_str_split(log, " ")
    local message = messages[i]

    local is_spam = (log[1] == "SPAM")
    local score = tonumber(log[2])

    no_of_emails = no_of_emails + 1

    if is_spam then
      no_of_spam = no_of_spam + 1
    else
      no_of_ham = no_of_ham + 1
    end

    if is_spam and (score >= threshold) then
      true_positives = true_positives + 1
    elseif is_spam and (score < threshold) then
      false_negatives = false_negatives + 1
      table.insert(all_fns, message)
    elseif not is_spam and (score >= threshold) then
      false_positives = false_positives + 1
      table.insert(all_fps, message)
    else
      true_negatives = true_negatives + 1
    end

    for j = 4, (#log - 1) do
      local sym = log[j]
      local stats = all_symbols_stats[sym]

      if stats == nil then
        stats = {
          -- NOTE(review): this stores the first message that hit the
          -- symbol, not the symbol name itself; confirm against consumers.
          name = message,
          no_of_hits = 0,
          spam_hits = 0,
          ham_hits = 0,
          spam_overall = 0
        }
        all_symbols_stats[sym] = stats
      end

      stats.no_of_hits = stats.no_of_hits + 1

      if is_spam then
        stats.spam_hits = stats.spam_hits + 1
      else
        stats.ham_hits = stats.ham_hits + 1
      end
    end

    -- Find slowest message. Done once per message (not once per symbol) so
    -- that messages without any symbol token are considered as well.
    local scan_time = tonumber(log[#log]) or 0
    if scan_time > file_stats.slowest then
      file_stats.slowest = scan_time
      file_stats.slowest_file = message
    end
  end

  -- Calculating file stats
  file_stats.no_of_ham = no_of_ham
  file_stats.no_of_spam = no_of_spam
  file_stats.no_of_emails = no_of_emails
  file_stats.true_positives = true_positives
  file_stats.true_negatives = true_negatives

  file_stats.spam_percent = percent(no_of_spam, no_of_emails)
  file_stats.ham_percent = percent(no_of_ham, no_of_emails)
  file_stats.overall_accuracy = percent(true_positives + true_negatives,
      no_of_emails)
  file_stats.false_positive_rate = percent(false_positives, no_of_ham)
  file_stats.false_negative_rate = percent(false_negatives, no_of_spam)

  -- F-score is left at 0 when there are no positives of any kind, instead
  -- of evaluating 0/0.
  local fscore_denominator = 2 * true_positives
      + false_positives
      + false_negatives
  if fscore_denominator > 0 then
    file_stats.fscore = 2 * true_positives / fscore_denominator
  end

  -- Calculating symbol stats
  for _, symbol_stats in pairs(all_symbols_stats) do
    symbol_stats.spam_percent = percent(symbol_stats.spam_hits, no_of_spam)
    symbol_stats.ham_percent = percent(symbol_stats.ham_hits, no_of_ham)
    symbol_stats.overall = percent(symbol_stats.no_of_hits, no_of_emails)

    -- Share of the spam rate in the combined spam + ham rate; stays 0 when
    -- both rates are 0.
    local combined = symbol_stats.spam_percent + symbol_stats.ham_percent
    if combined > 0 then
      symbol_stats.spam_overall = symbol_stats.spam_percent / combined
    end
  end

  return file_stats, all_symbols_stats, all_fps, all_fns
end
return utility
|