Browse Source

[Fix] More fixes to extract_specific_urls

tags/2.0
Vsevolod Stakhov 4 years ago
parent
commit
ddbbe07faf
2 changed files with 39 additions and 17 deletions
  1. 1
    0
      lualib/lua_selectors/extractors.lua
  2. 38
    17
      lualib/lua_util.lua

+ 1
- 0
lualib/lua_selectors/extractors.lua View File

['get_value'] = function(task, args) ['get_value'] = function(task, args)
local params = args[1] or {} local params = args[1] or {}
params.task = task params.task = task
params.no_cache = true
local urls = lua_util.extract_specific_urls(params) local urls = lua_util.extract_specific_urls(params)
return urls,'userdata_list' return urls,'userdata_list'
end, end,

+ 38
- 17
lualib/lua_util.lua View File

local ntlds, neslds = 0, 0 local ntlds, neslds = 0, 0


local res = {} local res = {}
local nres = 0

-- Deduplicating insert into the enclosing `res` map (upvalue declared above),
-- keyed by the URL's string form so the same URL is never counted twice.
-- `str` is the stringified URL key, `u` is the URL object to store.
-- Returns true when the URL was newly inserted, false if already present.
-- NOTE(review): also bumps the `nres` upvalue counter, which the enclosing
-- extract_specific_urls logic uses in place of #res now that `res` is a
-- hash table rather than an array — confirm against the full function body.
local function insert_url(str, u)
if not res[str] then
res[str] = u
nres = nres + 1

return true
end

return false
end


local function process_single_url(u) local function process_single_url(u)
local esld = u:get_tld() local esld = u:get_tld()
end end
end end


local str_hash = tostring(u)

if esld then if esld then
if not eslds[esld] then if not eslds[esld] then
eslds[esld] = {u}
eslds[esld] = {{str_hash, u}}
neslds = neslds + 1 neslds = neslds + 1
else else
if #eslds[esld] < params.esld_limit then if #eslds[esld] < params.esld_limit then
table.insert(eslds[esld], u)
table.insert(eslds[esld], {str_hash, u})
end end
end end


local tld = table.concat(fun.totable(fun.tail(parts)), '.') local tld = table.concat(fun.totable(fun.tail(parts)), '.')


if not tlds[tld] then if not tlds[tld] then
tlds[tld] = {u}
tlds[tld] = {{str_hash, u}}
ntlds = ntlds + 1 ntlds = ntlds + 1
else else
table.insert(tlds[tld], u)
table.insert(tlds[tld], {str_hash, u})
end end


-- Extract priority urls that are proven to be malicious
if not u:is_html_displayed() then
-- Special cases
if not u:get_protocol() == 'mailto' and not u:is_html_displayed() then
if u:is_obscured() then if u:is_obscured() then
table.insert(res, u)
insert_url(str_hash, u)
else else
if u:get_user() then if u:get_user() then
table.insert(res, u)
insert_url(str_hash, u)
elseif u:is_subject() or u:is_phished() then elseif u:is_subject() or u:is_phished() then
table.insert(res, u)
insert_url(str_hash, u)
end end
end end
end end
end end


local limit = params.limit local limit = params.limit
limit = limit - #res
limit = limit - nres
if limit <= 0 then limit = 1 end if limit <= 0 then limit = 1 end


if neslds <= limit then if neslds <= limit then


for _,lurls in pairs(eslds) do for _,lurls in pairs(eslds) do
if #lurls > 0 then if #lurls > 0 then
table.insert(res, table.remove(lurls))
local last = table.remove(lurls)
insert_url(last[1], last[2])
limit = limit - 1 limit = limit - 1
item_found = true item_found = true
end end


until limit <= 0 or not item_found until limit <= 0 or not item_found


res = exports.values(res)
if params.task and not params.no_cache then if params.task and not params.no_cache then
params.task:cache_set(cache_key, urls)
params.task:cache_set(cache_key, res)
end end
return res return res
end end
while limit > 0 do while limit > 0 do
for _,lurls in pairs(tlds) do for _,lurls in pairs(tlds) do
if #lurls > 0 then if #lurls > 0 then
table.insert(res, table.remove(lurls))
local last = table.remove(lurls)
insert_url(last[1], last[2])
limit = limit - 1 limit = limit - 1
end end
end end
end end


res = exports.values(res)
if params.task and not params.no_cache then if params.task and not params.no_cache then
params.task:cache_set(cache_key, urls)
params.task:cache_set(cache_key, res)
end end
return res return res
end end
local tld1 = tlds[tlds_keys[i]] local tld1 = tlds[tlds_keys[i]]
local tld2 = tlds[tlds_keys[ntlds - i]] local tld2 = tlds[tlds_keys[ntlds - i]]
if #tld1 > 0 then if #tld1 > 0 then
table.insert(res, table.remove(tld1))
local last = table.remove(tld1)
insert_url(last[1], last[2])
limit = limit - 1 limit = limit - 1
end end
if #tld2 > 0 then if #tld2 > 0 then
table.insert(res, table.remove(tld2))
local last = table.remove(tld2)
insert_url(last[1], last[2])
limit = limit - 1 limit = limit - 1
end end


end end
end end


res = exports.values(res)
if params.task and not params.no_cache then if params.task and not params.no_cache then
params.task:cache_set(cache_key, urls)
params.task:cache_set(cache_key, res)
end end


return res return res

Loading…
Cancel
Save