diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-08-19 16:49:19 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-08-19 16:49:19 +0100 |
commit | ddbbe07faf9e5f01f3b9c984a01551e82633af42 (patch) | |
tree | 19827e54bb949c1dc9b6eb8b01748fb68e5d1aea /lualib/lua_util.lua | |
parent | 11eac45b86c46e594184c90ef0fdd16c4cdd8f4b (diff) | |
download | rspamd-ddbbe07faf9e5f01f3b9c984a01551e82633af42.tar.gz rspamd-ddbbe07faf9e5f01f3b9c984a01551e82633af42.zip |
[Fix] More fixes to extract_specific_urls
Diffstat (limited to 'lualib/lua_util.lua')
-rw-r--r-- | lualib/lua_util.lua | 55 |
1 files changed, 38 insertions, 17 deletions
```diff
diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua
index 9bc42fd62..cde09ad6a 100644
--- a/lualib/lua_util.lua
+++ b/lualib/lua_util.lua
@@ -668,6 +668,18 @@ exports.filter_specific_urls = function (urls, params)
   local ntlds, neslds = 0, 0

   local res = {}
+  local nres = 0
+
+  local function insert_url(str, u)
+    if not res[str] then
+      res[str] = u
+      nres = nres + 1
+
+      return true
+    end
+
+    return false
+  end

   local function process_single_url(u)
     local esld = u:get_tld()
@@ -682,13 +694,15 @@ exports.filter_specific_urls = function (urls, params)
       end
     end

+    local str_hash = tostring(u)
+
     if esld then
       if not eslds[esld] then
-        eslds[esld] = {u}
+        eslds[esld] = {{str_hash, u}}
         neslds = neslds + 1
       else
         if #eslds[esld] < params.esld_limit then
-          table.insert(eslds[esld], u)
+          table.insert(eslds[esld], {str_hash, u})
         end
       end

@@ -696,21 +710,21 @@ exports.filter_specific_urls = function (urls, params)
       local tld = table.concat(fun.totable(fun.tail(parts)), '.')

       if not tlds[tld] then
-        tlds[tld] = {u}
+        tlds[tld] = {{str_hash, u}}
         ntlds = ntlds + 1
       else
-        table.insert(tlds[tld], u)
+        table.insert(tlds[tld], {str_hash, u})
       end

-      -- Extract priority urls that are proven to be malicious
-      if not u:is_html_displayed() then
+      -- Special cases
+      if not u:get_protocol() == 'mailto' and not u:is_html_displayed() then
         if u:is_obscured() then
-          table.insert(res, u)
+          insert_url(str_hash, u)
         else
           if u:get_user() then
-            table.insert(res, u)
+            insert_url(str_hash, u)
           elseif u:is_subject() or u:is_phished() then
-            table.insert(res, u)
+            insert_url(str_hash, u)
           end
         end
       end
@@ -722,7 +736,7 @@ exports.filter_specific_urls = function (urls, params)
   end

   local limit = params.limit
-  limit = limit - #res
+  limit = limit - nres
   if limit <= 0 then limit = 1 end

   if neslds <= limit then
@@ -732,7 +746,8 @@ exports.filter_specific_urls = function (urls, params)

       for _,lurls in pairs(eslds) do
         if #lurls > 0 then
-          table.insert(res, table.remove(lurls))
+          local last = table.remove(lurls)
+          insert_url(last[1], last[2])
           limit = limit - 1
           item_found = true
         end
@@ -740,8 +755,9 @@ exports.filter_specific_urls = function (urls, params)

     until limit <= 0 or not item_found

+    res = exports.values(res)
     if params.task and not params.no_cache then
-      params.task:cache_set(cache_key, urls)
+      params.task:cache_set(cache_key, res)
     end
     return res
   end
@@ -750,14 +766,16 @@ exports.filter_specific_urls = function (urls, params)
     while limit > 0 do
       for _,lurls in pairs(tlds) do
         if #lurls > 0 then
-          table.insert(res, table.remove(lurls))
+          local last = table.remove(lurls)
+          insert_url(last[1], last[2])
           limit = limit - 1
         end
       end
     end

+    res = exports.values(res)
     if params.task and not params.no_cache then
-      params.task:cache_set(cache_key, urls)
+      params.task:cache_set(cache_key, res)
     end
     return res
   end
@@ -774,11 +792,13 @@ exports.filter_specific_urls = function (urls, params)
     local tld1 = tlds[tlds_keys[i]]
     local tld2 = tlds[tlds_keys[ntlds - i]]
     if #tld1 > 0 then
-      table.insert(res, table.remove(tld1))
+      local last = table.remove(tld1)
+      insert_url(last[1], last[2])
       limit = limit - 1
     end
     if #tld2 > 0 then
-      table.insert(res, table.remove(tld2))
+      local last = table.remove(tld2)
+      insert_url(last[1], last[2])
       limit = limit - 1
     end

@@ -787,8 +807,9 @@ exports.filter_specific_urls = function (urls, params)
     end
   end

+  res = exports.values(res)
   if params.task and not params.no_cache then
-    params.task:cache_set(cache_key, urls)
+    params.task:cache_set(cache_key, res)
   end

   return res
```