diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-11-30 14:27:39 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-11-30 14:27:39 +0000 |
commit | 0d8245f8339014dec90c004ce5bd09c764f182cd (patch) | |
tree | 6deb11129f7990f1e7849aaaf29ff436e3beabbb /lualib | |
parent | a092f57efaf01e16fcb090d5b3b610f0dc4f7180 (diff) | |
download | rspamd-0d8245f8339014dec90c004ce5bd09c764f182cd.tar.gz rspamd-0d8245f8339014dec90c004ce5bd09c764f182cd.zip |
[Rework] Clickhouse: Improve performance
Diffstat (limited to 'lualib')
-rw-r--r-- | lualib/rspamadm/clickhouse.lua | 125 |
1 files changed, 66 insertions, 59 deletions
diff --git a/lualib/rspamadm/clickhouse.lua b/lualib/rspamadm/clickhouse.lua index d1bbbef1a..4388b8ce0 100644 --- a/lualib/rspamadm/clickhouse.lua +++ b/lualib/rspamadm/clickhouse.lua @@ -86,43 +86,10 @@ local function load_config(config_file) end end -local function get_excluded_symbols(res) +local function get_excluded_symbols(known_symbols, correlations, seen_total) -- Walk results once to collect all symbols & count ocurrences - local known_symbols, remove = {}, {} - local symbols_count, seen_total = 1, 0 - for _, r in ipairs(res) do - local is_spam = true - if r['Action'] == 'no action' or r['Action'] == 'greylist' then - is_spam = false - end - seen_total = seen_total + 1 - for _, sym in ipairs(r['Symbols.Names']) do - local t = known_symbols[sym] - if not t then - local spam_count, ham_count = 0, 0 - if is_spam then - spam_count = spam_count + 1 - else - ham_count = ham_count + 1 - end - known_symbols[sym] = { - id = symbols_count, - seen = 1, - seen_ham = ham_count, - seen_spam = spam_count, - } - symbols_count = symbols_count + 1 - else - known_symbols[sym].seen = known_symbols[sym].seen + 1 - if is_spam then - known_symbols[sym].seen_spam = known_symbols[sym].seen_spam + 1 - else - known_symbols[sym].seen_ham = known_symbols[sym].seen_ham + 1 - end - end - end - end + local remove = {} local known_symbols_list = {} local composites = rspamd_config:get_all_opt('composites') for k, v in pairs(known_symbols) do @@ -147,27 +114,6 @@ local function get_excluded_symbols(res) } end - -- Walk results again & count correlations - local correlations = {} - for _, r in ipairs(res) do - for _, sym in ipairs(r['Symbols.Names']) do - for _, inner_sym_name in ipairs(r['Symbols.Names']) do - if inner_sym_name ~= sym then - local known_sym = known_symbols[sym] - local inner_sym = known_symbols[inner_sym_name] - if known_sym and inner_sym then - if not correlations[known_sym.id] then - correlations[known_sym.id] = {} - end - local n = correlations[known_sym.id][inner_sym.id] or 0 - n = n + 1 - correlations[known_sym.id][inner_sym.id] = n - end - end - end - end - end - -- Walk correlation matrix and check total counts for sym_id, row in pairs(correlations) do for inner_sym_id, count in pairs(row) do @@ -180,7 +126,7 @@ local function get_excluded_symbols(res) end end - return remove, known_symbols + return remove end local function handle_neural_profile(args) @@ -190,13 +136,74 @@ local function handle_neural_profile(args) local query = string.format( "SELECT Action, Symbols.Names FROM rspamd %s", args.where or '') local upstream = args.upstream:get_upstream_round_robin() - local err, res = lua_clickhouse.select_sync(upstream, args, http_params, query) + local known_symbols = {} + local symbols_count, seen_total = 1, 0 + local correlations = {} + + local function process_row(r) + local is_spam = true + if r['Action'] == 'no action' or r['Action'] == 'greylist' then + is_spam = false + end + seen_total = seen_total + 1 + + local nsym = #r['Symbols.Names'] + + for i = 1,nsym do + local sym = r['Symbols.Names'][i] + local t = known_symbols[sym] + if not t then + local spam_count, ham_count = 0, 0 + if is_spam then + spam_count = spam_count + 1 + else + ham_count = ham_count + 1 + end + known_symbols[sym] = { + id = symbols_count, + seen = 1, + seen_ham = ham_count, + seen_spam = spam_count, + } + symbols_count = symbols_count + 1 + else + known_symbols[sym].seen = known_symbols[sym].seen + 1 + if is_spam then + known_symbols[sym].seen_spam = known_symbols[sym].seen_spam + 1 + else + known_symbols[sym].seen_ham = known_symbols[sym].seen_ham + 1 + end + end + end + + -- Fill correlations + for i = 1,nsym do + for j = 1,nsym do + if i ~= j then + local sym = r['Symbols.Names'][i] + local inner_sym_name = r['Symbols.Names'][j] + local known_sym = known_symbols[sym] + local inner_sym = known_symbols[inner_sym_name] + if known_sym and inner_sym then + if not correlations[known_sym.id] then + correlations[known_sym.id] = {} + end + local n = correlations[known_sym.id][inner_sym.id] or 0 + n = n + 1 + correlations[known_sym.id][inner_sym.id] = n + end + end + end + end + end + + local err, _ = lua_clickhouse.select_sync(upstream, args, http_params, query, process_row) if err ~= nil then io.stderr:write(string.format('Error querying Clickhouse: %s\n', err)) os.exit(1) end - local remove, known_symbols = get_excluded_symbols(res) + local remove = get_excluded_symbols(known_symbols, correlations, seen_total) if not args.json then for k in pairs(known_symbols) do if not remove[k] then |