1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
|
local lua_util = require "lua_util"
local rspamd_util = require "rspamd_util"
local fun = require "fun"
-- Module table: the public functions defined below are attached to it and it
-- is the value returned at the end of the file.
local utility = {}
--- Collects the distinct symbol names that occur in a list of log lines.
-- Each line is split on single spaces and fields 4 .. n-2 are taken as
-- symbols, with any remaining whitespace stripped from each of them.
-- @param logs table of log line strings
-- @param ignore_symbols table keyed by symbol name; truthy entries are omitted
-- @return alphabetically sorted list of symbol names
function utility.get_all_symbols(logs, ignore_symbols)
  local seen = {}

  for _, raw_line in pairs(logs) do
    local fields = lua_util.rspamd_str_split(raw_line, " ")
    for idx = 4, #fields - 2 do
      -- The extra parentheses keep only gsub's first result (the new string)
      local symbol = (fields[idx]:gsub("%s+", ""))
      seen[symbol] = true
    end
  end

  -- Filtering happens only here, after every line has been scanned
  local names = {}
  for symbol in pairs(seen) do
    if not ignore_symbols[symbol] then
      table.insert(names, symbol)
    end
  end

  table.sort(names)
  return names
end
--- Reads a file into a list holding one entry per line.
-- Raises an error (through assert) when the file cannot be opened.
-- @param file path of the file to read
-- @return list of the lines in the order they were read
function utility.read_log_file(file)
  local handle = assert(io.open(file, "r"))
  local contents = {}

  for text in handle:lines() do
    table.insert(contents, text)
  end

  handle:close()
  return contents
end
--- Gathers the lines of every file matching "<dir_path>/*.log".
-- The pattern is expanded by rspamd_util.glob and each matched file is read
-- with utility.read_log_file.
-- @param dir_path directory to scan; one trailing "/" is removed if present
-- @return flat list containing the lines of all matched files
function utility.get_all_logs(dir_path)
  -- Strip a single trailing slash so the pattern does not end up with "//"
  if dir_path:sub(-1) == "/" then
    dir_path = dir_path:sub(1, -2)
  end

  local matched = rspamd_util.glob(dir_path .. "/*.log")
  local collected = {}

  for _, path in pairs(matched) do
    for _, entry in pairs(utility.read_log_file(path)) do
      table.insert(collected, entry)
    end
  end

  return collected
end
--- Builds a symbol -> weight map from the configuration's symbol counters.
-- @param conf object exposing a get_symbols_counters() method
-- @param ignore_symbols table keyed by symbol name; truthy entries are omitted
-- @return table mapping every remaining symbol name to its weight
function utility.get_all_symbol_scores(conf, ignore_symbols)
  -- Keeps only counters whose symbol is not on the ignore list
  local function is_kept(entry)
    return not ignore_symbols[entry.symbol]
  end

  -- Turns a counter entry into the key/value pair consumed by fun.tomap
  local function to_pair(entry)
    return entry.symbol, entry.weight
  end

  return fun.tomap(fun.map(to_pair,
    fun.filter(is_kept, conf:get_symbols_counters())))
end
--- Computes corpus-wide and per-symbol statistics from scan log lines.
--
-- Every line is trimmed and split on single spaces. Field 1 is the label
-- ("SPAM" means spam, anything else is counted as ham), field 2 is the score,
-- fields 4 .. n-2 are symbol names, field n-1 is treated as the scan time and
-- field n as the message file name.
--
-- Lines whose second field is not a number are skipped. All ratios fall back
-- to 0 when their denominator is zero, so no NaN/inf values are returned.
--
-- @param logs table of log line strings
-- @param threshold number; a score >= threshold counts as a spam verdict
-- @return file_stats table with counters, percentages, fscore and timing data
-- @return table mapping symbol name -> per-symbol statistics
-- @return list of file names that were false positives
-- @return list of file names that were false negatives
function utility.generate_statistics_from_logs(logs, threshold)
  -- Percentage of `part` within `whole`; 0 for an empty population so that a
  -- division by zero never leaks NaN or inf into the results.
  local function percent(part, whole)
    if whole > 0 then
      return part * 100 / whole
    end
    return 0
  end

  local file_stats = {
    no_of_emails = 0,
    no_of_spam = 0,
    no_of_ham = 0,
    spam_percent = 0,
    ham_percent = 0,
    true_positives = 0,
    true_negatives = 0,
    false_negative_rate = 0,
    false_positive_rate = 0,
    overall_accuracy = 0,
    fscore = 0,
    avg_scan_time = 0,
    slowest_file = nil,
    slowest = 0
  }

  local all_symbols_stats = {}
  local all_fps = {}
  local all_fns = {}

  local false_positives = 0
  local false_negatives = 0
  local true_positives = 0
  local true_negatives = 0
  local no_of_emails = 0
  local no_of_spam = 0
  local no_of_ham = 0

  -- Timing accumulators, kept numeric; file_stats.slowest itself stays a
  -- string once set, as in the previous implementation.
  local slowest = 0
  local total_scan_time = 0
  local timed_messages = 0

  for _, log in pairs(logs) do
    log = lua_util.rspamd_str_split(lua_util.rspamd_str_trim(log), " ")

    local is_spam = (log[1] == "SPAM")
    local score = tonumber(log[2])

    -- A line without a numeric score (blank or truncated) cannot be
    -- classified; skip it instead of failing on a nil comparison.
    if score then
      local flagged = score >= threshold

      no_of_emails = no_of_emails + 1

      if is_spam then
        no_of_spam = no_of_spam + 1
        if flagged then
          true_positives = true_positives + 1
        else
          false_negatives = false_negatives + 1
          table.insert(all_fns, log[#log])
        end
      else
        no_of_ham = no_of_ham + 1
        if flagged then
          false_positives = false_positives + 1
          table.insert(all_fps, log[#log])
        else
          true_negatives = true_negatives + 1
        end
      end

      for i = 4, (#log - 2) do
        local stats = all_symbols_stats[log[i]]
        if stats == nil then
          stats = {
            name = log[i],
            no_of_hits = 0,
            spam_hits = 0,
            ham_hits = 0,
            spam_overall = 0
          }
          all_symbols_stats[log[i]] = stats
        end

        stats.no_of_hits = stats.no_of_hits + 1
        if is_spam then
          stats.spam_hits = stats.spam_hits + 1
        else
          stats.ham_hits = stats.ham_hits + 1
        end
      end

      -- Timing is a per-message property, so it is evaluated once per line
      -- and outside the symbol loop: messages without any symbol are taken
      -- into account too. Three fields precede the symbols and two follow
      -- them, hence a line needs at least 5 fields to carry a scan time.
      if #log >= 5 then
        local scan_time = tonumber(log[#log - 1])
        if scan_time then
          total_scan_time = total_scan_time + scan_time
          timed_messages = timed_messages + 1
          if scan_time > slowest then
            slowest = scan_time
            file_stats.slowest = tostring(scan_time)
            file_stats.slowest_file = log[#log]
          end
        end
      end
    end
  end

  -- Calculating file stats
  file_stats.no_of_ham = no_of_ham
  file_stats.no_of_spam = no_of_spam
  file_stats.no_of_emails = no_of_emails
  file_stats.true_positives = true_positives
  file_stats.true_negatives = true_negatives

  file_stats.spam_percent = percent(no_of_spam, no_of_emails)
  file_stats.ham_percent = percent(no_of_ham, no_of_emails)
  file_stats.overall_accuracy = percent(true_positives + true_negatives,
      no_of_emails)
  file_stats.false_positive_rate = percent(false_positives, no_of_ham)
  file_stats.false_negative_rate = percent(false_negatives, no_of_spam)

  -- F1 = 2TP / (2TP + FP + FN); left at 0 when nothing was (or should have
  -- been) flagged, which would otherwise evaluate to 0/0.
  local fscore_denominator = 2 * true_positives
      + false_positives
      + false_negatives
  if fscore_denominator > 0 then
    file_stats.fscore = 2 * true_positives / fscore_denominator
  end

  if timed_messages > 0 then
    file_stats.avg_scan_time = total_scan_time / timed_messages
  end

  -- Calculating symbol stats
  for _, symbol_stats in pairs(all_symbols_stats) do
    symbol_stats.spam_percent = percent(symbol_stats.spam_hits, no_of_spam)
    symbol_stats.ham_percent = percent(symbol_stats.ham_hits, no_of_ham)
    symbol_stats.overall = percent(symbol_stats.no_of_hits, no_of_emails)

    -- Share of the spam hit rate in the combined hit rate; keeps its
    -- initial 0 if both rates are 0.
    local combined = symbol_stats.spam_percent + symbol_stats.ham_percent
    if combined > 0 then
      symbol_stats.spam_overall = symbol_stats.spam_percent / combined
    end
  end

  return file_stats, all_symbols_stats, all_fps, all_fns
end
-- Hand the module table back to whoever loads this file.
return utility
|