diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-02-22 13:46:47 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-02-22 13:46:47 +0000 |
commit | 0f5ed273b4fd6a9fb4efc44e3de334d5487e5be7 (patch) | |
tree | d41a184185f1d689abf1c310717d0e922ae1e6f4 /src | |
parent | d2ceb9d5560a68dff834abcb28608f9be42982c3 (diff) | |
download | rspamd-0f5ed273b4fd6a9fb4efc44e3de334d5487e5be7.tar.gz rspamd-0f5ed273b4fd6a9fb4efc44e3de334d5487e5be7.zip |
[Minor] Clickhouse: Restore old behaviour for full_urls
Diffstat (limited to 'src')
-rw-r--r-- | src/plugins/lua/clickhouse.lua | 51 |
1 files changed, 45 insertions, 6 deletions
```diff
diff --git a/src/plugins/lua/clickhouse.lua b/src/plugins/lua/clickhouse.lua
index fcf9177a4..23ed65a85 100644
--- a/src/plugins/lua/clickhouse.lua
+++ b/src/plugins/lua/clickhouse.lua
@@ -767,16 +767,55 @@ local function clickhouse_collect(task)
   local urls_tlds = {}
   local urls_flags = {}
 
-  for i,u in ipairs(task_urls) do
-    if settings['full_urls'] then
+  if settings.full_urls then
+    for i,u in ipairs(task_urls) do
       urls_urls[i] = u:get_text()
-    else
-      urls_urls[i] = u:get_host()
+      urls_tlds[i] = u:get_tld() or u:get_host()
+      urls_flags[i] = u:get_flags_num()
+    end
+  else
+    -- We need to store unique
+    local mt = {
+      ord_tbl = {}, -- ordered list of urls
+      idx_tbl = {}, -- indexed by host + flags, reference to an index in ord_tbl
+      __newindex = function(t, k, v)
+        local idx = getmetatable(t).idx_tbl
+        local ord = getmetatable(t).ord_tbl
+        local key = k:get_host() .. tostring(k:get_flags_num())
+        if idx[key] then
+          ord[idx[key]] = v -- replace
+        else
+          ord[#ord + 1] = v
+          idx[key] = #ord
+        end
+      end,
+      __index = function(t, k)
+        local ord = getmetatable(t).ord_tbl
+        if type(k) == 'number' then
+          return ord[k]
+        else
+          local idx = getmetatable(t).idx_tbl
+          local key = k:get_host() .. tostring(k:get_flags_num())
+          if idx[key] then
+            return ord[idx[key]]
+          end
+        end
+      end,
+    }
+    -- Extra index needed for making this unique
+    local urls_idx = {}
+    setmetatable(urls_idx, mt)
+    for _,u in ipairs(task_urls) do
+      if not urls_idx[u] then
+        urls_idx[u] = u
+        urls_urls[#urls_urls + 1] = u:get_host()
+        urls_tlds[#urls_tlds + 1] = u:get_tld() or u:get_host()
+        urls_flags[#urls_flags + 1] = u:get_flags_num()
+      end
     end
-    urls_tlds[i] = u:get_tld() or u:get_host()
-    urls_flags[i] = u:get_flags_num()
   end
 
+
   -- Get tlds
   table.insert(row, urls_tlds)
   -- Get hosts/full urls
```