-- path: root/lualib/rescore_utility.lua
-- blob: 2a9372d4e3f12f010f1d345c874b7bafc2c1de3b
local lua_util = require "lua_util"
local rspamd_util = require "rspamd_util"
local fun = require "fun"

-- Module table: the functions defined below are attached to it, and it is
-- returned at the end of the file.
local utility = {}

function utility.get_all_symbols(logs, ignore_symbols)
  -- Returns a sorted list of the distinct symbol names found in `logs`.
  --
  -- logs: table of log lines (strings). Each line is split on single spaces
  --   and fields 4 .. n-2 (n = number of fields) are taken as symbol names.
  -- ignore_symbols: optional set-like table (symbol name -> truthy value);
  --   symbols present in it are left out of the result. nil means that
  --   nothing is ignored.

  ignore_symbols = ignore_symbols or {}

  local symbols_set = {}

  for _, line in pairs(logs) do
    -- Trim before splitting, exactly as generate_statistics_from_logs does,
    -- so both functions agree on which fields are symbols.
    -- NOTE(review): presumably stray leading/trailing whitespace would
    -- otherwise shift the field positions -- confirm against lua_util.
    local fields = lua_util.rspamd_str_split(
        lua_util.rspamd_str_trim(line), " ")

    for i = 4, (#fields - 2) do
      -- Parentheses keep only gsub's first result (the cleaned string) and
      -- drop the substitution count.
      local symbol = (fields[i]:gsub("%s+", ""))
      symbols_set[symbol] = true
    end
  end

  local all_symbols = {}

  for symbol, _ in pairs(symbols_set) do
    if not ignore_symbols[symbol] then
      all_symbols[#all_symbols + 1] = symbol
    end
  end

  -- pairs() order is unspecified, so sort to make the output deterministic.
  table.sort(all_symbols)

  return all_symbols
end

function utility.read_log_file(file)
  -- Reads the file at path `file` and returns its lines as a list of
  -- strings. Raises (via assert) when the file cannot be opened.

  local handle = assert(io.open(file, "r"))

  local contents = {}
  local count = 0

  for text in handle:lines() do
    count = count + 1
    contents[count] = text
  end

  handle:close()

  return contents
end

function utility.get_all_logs(dir_path)
  -- Collects the lines of every "*.log" file found directly in `dir_path`
  -- into one flat list and returns it.

  -- Drop a single trailing slash so the glob pattern has exactly one.
  if dir_path:sub(-1) == "/" then
    dir_path = dir_path:sub(1, -2)
  end

  local collected = {}
  local log_files = rspamd_util.glob(dir_path .. "/*.log")

  for _, path in pairs(log_files) do
    for _, entry in pairs(utility.read_log_file(path)) do
      table.insert(collected, entry)
    end
  end

  return collected
end

function utility.get_all_symbol_scores(conf, ignore_symbols)
  -- Returns a map (symbol name -> weight) built from the entries returned by
  -- conf:get_symbols_counters(), reading each entry's 'symbol' and 'weight'
  -- fields.
  --
  -- conf: object providing a get_symbols_counters() method.
  -- ignore_symbols: optional set-like table (symbol name -> truthy value);
  --   matching entries are left out. nil means that nothing is ignored.

  ignore_symbols = ignore_symbols or {}

  local counters = conf:get_symbols_counters()

  local function is_wanted(elt)
    return not ignore_symbols[elt['symbol']]
  end

  local function to_pair(elt)
    return elt['symbol'], elt['weight']
  end

  return fun.tomap(fun.map(to_pair, fun.filter(is_wanted, counters)))
end

function utility.generate_statistics_from_logs(logs, threshold)

  -- Computes classification statistics from scan log lines.
  --
  -- logs: table of log lines (strings). Each line is trimmed and split on
  --   single spaces into n fields, which are read as follows:
  --     field 1       "SPAM" marks a spam message, anything else is ham
  --     field 2       numeric score, compared against `threshold`
  --     fields 4..n-2 names of the symbols that fired
  --     field n-1     scan time, used to find the slowest message
  --     field n       message identifier, reported for false positives,
  --                   false negatives and the slowest message
  --   Lines whose score field is not a number are skipped entirely.
  -- threshold: a message counts as detected spam when score >= threshold.
  --
  -- Returns four values:
  --   file_stats        table of corpus-wide counters and rates
  --   all_symbols_stats map: symbol name -> per-symbol statistics table
  --   all_fps           list of field-n values of the false positives
  --   all_fns           list of field-n values of the false negatives
  --
  -- Every rate falls back to 0 instead of NaN when its denominator is 0.

  local file_stats = {
    no_of_emails = 0,
    no_of_spam = 0,
    no_of_ham = 0,
    spam_percent = 0,
    ham_percent = 0,
    true_positives = 0,
    true_negatives = 0,
    false_negative_rate = 0,
    false_positive_rate = 0,
    overall_accuracy = 0,
    fscore = 0,
    -- NOTE(review): avg_scan_time is never updated by this function and
    -- therefore always stays 0 -- confirm whether callers expect a value.
    avg_scan_time = 0,
    slowest_file = nil,
    slowest = 0
  }

  local all_symbols_stats = {}
  local all_fps = {}
  local all_fns = {}

  local false_positives = 0
  local false_negatives = 0
  local true_positives = 0
  local true_negatives = 0
  local no_of_emails = 0
  local no_of_spam = 0
  local no_of_ham = 0

  -- Numeric copy of the largest scan time seen so far; file_stats.slowest
  -- holds the same value as a string (see below).
  local slowest_time = 0

  -- part / total as a percentage, 0 when total is 0.
  local function percent(part, total)
    if total > 0 then
      return part * 100 / total
    end
    return 0
  end

  for _, log in pairs(logs) do
    log = lua_util.rspamd_str_trim(log)
    log = lua_util.rspamd_str_split(log, " ")

    local is_spam = (log[1] == "SPAM")
    local score = tonumber(log[2])

    -- A line without a numeric score (e.g. a blank line) cannot be
    -- classified; skip it instead of failing on a nil comparison.
    if score then
      no_of_emails = no_of_emails + 1

      if is_spam then
        no_of_spam = no_of_spam + 1
      else
        no_of_ham = no_of_ham + 1
      end

      if is_spam and (score >= threshold) then
        true_positives = true_positives + 1
      elseif is_spam and (score < threshold) then
        false_negatives = false_negatives + 1
        table.insert(all_fns, log[#log])
      elseif not is_spam and (score >= threshold) then
        false_positives = false_positives + 1
        table.insert(all_fps, log[#log])
      else
        true_negatives = true_negatives + 1
      end

      for i = 4, (#log - 2) do
        local symbol_stats = all_symbols_stats[log[i]]

        if symbol_stats == nil then
          symbol_stats = {
            name = log[i],
            no_of_hits = 0,
            spam_hits = 0,
            ham_hits = 0,
            spam_overall = 0
          }
          all_symbols_stats[log[i]] = symbol_stats
        end

        symbol_stats.no_of_hits = symbol_stats.no_of_hits + 1

        if is_spam then
          symbol_stats.spam_hits = symbol_stats.spam_hits + 1
        else
          symbol_stats.ham_hits = symbol_stats.ham_hits + 1
        end
      end

      -- Find slowest message. Done once per message, outside the symbol
      -- loop, so messages that triggered no symbols are considered too.
      -- At least 5 fields are needed for field n-1 to lie past the three
      -- leading fields.
      if #log >= 5 then
        local scan_time = tonumber(log[#log - 1])

        if scan_time and scan_time > slowest_time then
          slowest_time = scan_time
          -- Stored as a string, as before, to keep the returned value's
          -- type unchanged for callers.
          file_stats.slowest = tostring(scan_time)
          file_stats.slowest_file = log[#log]
        end
      end
    end
  end

  -- Calculating file stats

  file_stats.no_of_ham = no_of_ham
  file_stats.no_of_spam = no_of_spam
  file_stats.no_of_emails = no_of_emails
  file_stats.true_positives = true_positives
  file_stats.true_negatives = true_negatives

  file_stats.spam_percent = percent(no_of_spam, no_of_emails)
  file_stats.ham_percent = percent(no_of_ham, no_of_emails)
  file_stats.overall_accuracy = percent(true_positives + true_negatives,
      no_of_emails)
  file_stats.false_positive_rate = percent(false_positives, no_of_ham)
  file_stats.false_negative_rate = percent(false_negatives, no_of_spam)

  -- F-score = 2*TP / (2*TP + FP + FN); keep the default of 0 when the
  -- denominator is 0 (the quotient would be NaN).
  local fscore_denominator = 2 * true_positives
      + false_positives
      + false_negatives

  if fscore_denominator > 0 then
    file_stats.fscore = 2 * true_positives / fscore_denominator
  end

  -- Calculating symbol stats

  for _, symbol_stats in pairs(all_symbols_stats) do
    symbol_stats.spam_percent = percent(symbol_stats.spam_hits, no_of_spam)
    symbol_stats.ham_percent = percent(symbol_stats.ham_hits, no_of_ham)
    symbol_stats.overall = percent(symbol_stats.no_of_hits, no_of_emails)

    -- Share of the symbol's hit rate that comes from spam; stays 0 if both
    -- rates are 0.
    local combined = symbol_stats.spam_percent + symbol_stats.ham_percent

    if combined > 0 then
      symbol_stats.spam_overall = symbol_stats.spam_percent / combined
    end
  end

  return file_stats, all_symbols_stats, all_fps, all_fns
end

-- Hand the populated module table back to whoever loads this file.
return utility