From ddbbe07faf9e5f01f3b9c984a01551e82633af42 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 19 Aug 2019 16:49:19 +0100 Subject: [PATCH] [Fix] More fixes to extract_specific_urls --- lualib/lua_selectors/extractors.lua | 1 + lualib/lua_util.lua | 55 ++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/lualib/lua_selectors/extractors.lua b/lualib/lua_selectors/extractors.lua index 993169708..1d06794a1 100644 --- a/lualib/lua_selectors/extractors.lua +++ b/lualib/lua_selectors/extractors.lua @@ -283,6 +283,7 @@ e.g. `get_tld`]], ['get_value'] = function(task, args) local params = args[1] or {} params.task = task + params.no_cache = true local urls = lua_util.extract_specific_urls(params) return urls,'userdata_list' end, diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua index 9bc42fd62..cde09ad6a 100644 --- a/lualib/lua_util.lua +++ b/lualib/lua_util.lua @@ -668,6 +668,18 @@ exports.filter_specific_urls = function (urls, params) local ntlds, neslds = 0, 0 local res = {} + local nres = 0 + + local function insert_url(str, u) + if not res[str] then + res[str] = u + nres = nres + 1 + + return true + end + + return false + end local function process_single_url(u) local esld = u:get_tld() @@ -682,13 +694,15 @@ exports.filter_specific_urls = function (urls, params) end end + local str_hash = tostring(u) + if esld then if not eslds[esld] then - eslds[esld] = {u} + eslds[esld] = {{str_hash, u}} neslds = neslds + 1 else if #eslds[esld] < params.esld_limit then - table.insert(eslds[esld], u) + table.insert(eslds[esld], {str_hash, u}) end end @@ -696,21 +710,21 @@ exports.filter_specific_urls = function (urls, params) local tld = table.concat(fun.totable(fun.tail(parts)), '.') if not tlds[tld] then - tlds[tld] = {u} + tlds[tld] = {{str_hash, u}} ntlds = ntlds + 1 else - table.insert(tlds[tld], u) + table.insert(tlds[tld], {str_hash, u}) end - -- Extract priority urls that are proven to be malicious - if not 
u:is_html_displayed() then + -- Special cases + if u:get_protocol() ~= 'mailto' and not u:is_html_displayed() then if u:is_obscured() then - table.insert(res, u) + insert_url(str_hash, u) else if u:get_user() then - table.insert(res, u) + insert_url(str_hash, u) elseif u:is_subject() or u:is_phished() then - table.insert(res, u) + insert_url(str_hash, u) end end end @@ -722,7 +736,7 @@ exports.filter_specific_urls = function (urls, params) end local limit = params.limit - limit = limit - #res + limit = limit - nres if limit <= 0 then limit = 1 end if neslds <= limit then @@ -732,7 +746,8 @@ exports.filter_specific_urls = function (urls, params) for _,lurls in pairs(eslds) do if #lurls > 0 then - table.insert(res, table.remove(lurls)) + local last = table.remove(lurls) + insert_url(last[1], last[2]) limit = limit - 1 item_found = true end @@ -740,8 +755,9 @@ exports.filter_specific_urls = function (urls, params) until limit <= 0 or not item_found + res = exports.values(res) if params.task and not params.no_cache then - params.task:cache_set(cache_key, urls) + params.task:cache_set(cache_key, res) end return res end @@ -750,14 +766,16 @@ exports.filter_specific_urls = function (urls, params) while limit > 0 do for _,lurls in pairs(tlds) do if #lurls > 0 then - table.insert(res, table.remove(lurls)) + local last = table.remove(lurls) + insert_url(last[1], last[2]) limit = limit - 1 end end end + res = exports.values(res) if params.task and not params.no_cache then - params.task:cache_set(cache_key, urls) + params.task:cache_set(cache_key, res) end return res end @@ -774,11 +792,13 @@ exports.filter_specific_urls = function (urls, params) local tld1 = tlds[tlds_keys[i]] local tld2 = tlds[tlds_keys[ntlds - i]] if #tld1 > 0 then - table.insert(res, table.remove(tld1)) + local last = table.remove(tld1) + insert_url(last[1], last[2]) limit = limit - 1 end if #tld2 > 0 then - table.insert(res, table.remove(tld2)) + local last = table.remove(tld2) + insert_url(last[1], 
last[2]) limit = limit - 1 end @@ -787,8 +807,9 @@ exports.filter_specific_urls = function (urls, params) end end + res = exports.values(res) if params.task and not params.no_cache then - params.task:cache_set(cache_key, urls) + params.task:cache_set(cache_key, res) end return res -- 2.39.5