local ntlds, neslds = 0, 0
local res = {}
+ local nres = 0
+
+ -- Stores `u` in `res` under the key `str` unless that key is already taken.
+ -- Returns true when a new entry was stored (and `nres` was incremented),
+ -- false when `res[str]` already held a value.
+ local function insert_url(str, u)
+ if res[str] then
+ return false
+ end
+
+ res[str] = u
+ nres = nres + 1
+
+ return true
+ end
local function process_single_url(u)
local esld = u:get_tld()
end
end
+ local str_hash = tostring(u)
+
if esld then
if not eslds[esld] then
- eslds[esld] = {u}
+ eslds[esld] = {{str_hash, u}}
neslds = neslds + 1
else
if #eslds[esld] < params.esld_limit then
- table.insert(eslds[esld], u)
+ table.insert(eslds[esld], {str_hash, u})
end
end
local tld = table.concat(fun.totable(fun.tail(parts)), '.')
if not tlds[tld] then
- tlds[tld] = {u}
+ tlds[tld] = {{str_hash, u}}
ntlds = ntlds + 1
else
- table.insert(tlds[tld], u)
+ table.insert(tlds[tld], {str_hash, u})
end
- -- Extract priority urls that are proven to be malicious
- if not u:is_html_displayed() then
+ -- Special cases: for non-mailto urls where is_html_displayed() is false,
+ -- insert those that are obscured, have a user part, or are flagged by
+ -- is_subject()/is_phished().
+ -- Note: `not x == y` parses as `(not x) == y` (always false here), so the
+ -- protocol check must be spelled with `~=`.
+ if u:get_protocol() ~= 'mailto' and not u:is_html_displayed() then
if u:is_obscured() then
- table.insert(res, u)
+ insert_url(str_hash, u)
else
if u:get_user() then
- table.insert(res, u)
+ insert_url(str_hash, u)
elseif u:is_subject() or u:is_phished() then
- table.insert(res, u)
+ insert_url(str_hash, u)
end
end
end
end
local limit = params.limit
- limit = limit - #res
+ limit = limit - nres
if limit <= 0 then limit = 1 end
if neslds <= limit then
for _,lurls in pairs(eslds) do
if #lurls > 0 then
- table.insert(res, table.remove(lurls))
+ local last = table.remove(lurls)
+ insert_url(last[1], last[2])
limit = limit - 1
item_found = true
end
until limit <= 0 or not item_found
+ res = exports.values(res)
if params.task and not params.no_cache then
- params.task:cache_set(cache_key, urls)
+ params.task:cache_set(cache_key, res)
end
return res
end
while limit > 0 do
for _,lurls in pairs(tlds) do
if #lurls > 0 then
- table.insert(res, table.remove(lurls))
+ local last = table.remove(lurls)
+ insert_url(last[1], last[2])
limit = limit - 1
end
end
end
+ res = exports.values(res)
if params.task and not params.no_cache then
- params.task:cache_set(cache_key, urls)
+ params.task:cache_set(cache_key, res)
end
return res
end
local tld1 = tlds[tlds_keys[i]]
local tld2 = tlds[tlds_keys[ntlds - i]]
if #tld1 > 0 then
- table.insert(res, table.remove(tld1))
+ local last = table.remove(tld1)
+ insert_url(last[1], last[2])
limit = limit - 1
end
if #tld2 > 0 then
- table.insert(res, table.remove(tld2))
+ local last = table.remove(tld2)
+ insert_url(last[1], last[2])
limit = limit - 1
end
end
end
+ res = exports.values(res)
if params.task and not params.no_cache then
- params.task:cache_set(cache_key, urls)
+ params.task:cache_set(cache_key, res)
end
return res