You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

url_redirector.lua 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  -- Skip the rest of the module when the host has set `confighelp`
  if confighelp then return end
  16. local rspamd_logger = require "rspamd_logger"
  17. local rspamd_http = require "rspamd_http"
  18. local hash = require "rspamd_cryptobox_hash"
  19. local rspamd_url = require "rspamd_url"
  20. local lua_util = require "lua_util"
  21. local lua_redis = require "lua_redis"
  22. local N = "url_redirector"
  -- User agents used for outgoing requests unless `user_agent` is overridden
  local default_ua = {
    'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
    'Wget/1.9.1',
    'Mozilla/5.0 (Android; Linux armv7l; rv:9.0) Gecko/20111216 Firefox/9.0 Fennec/9.0',
    'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
    'W3C-checklink/4.5 [4.160] libwww-perl/5.823',
    'Lynx/2.8.8dev.3 libwww-FM/2.14 SSL-MM/1.4.1',
  }

  -- Redis connection parameters, assigned when the module options are parsed
  local redis_params

  -- Module defaults, grouped by purpose
  local settings = {
    -- Redis storage
    key_prefix = 'rdr:', -- prefix of the per-url cache keys
    expire = 86400, -- lifetime of a cached result (1 day by default)
    top_urls_key = 'rdr:top_urls', -- key of the sorted set with top urls
    top_urls_count = 200, -- how many top urls to keep

    -- Outgoing HTTP requests
    timeout = 10, -- 10 seconds by default
    max_size = 10 * 1024, -- maximum body to process
    check_ssl = false, -- verify ssl certificates
    user_agent = default_ua, -- a single string or a list to pick from
    --proxy = "http://example.com:3128", -- Send request through proxy

    -- What to follow and how far
    max_urls = 5, -- how many urls to check per task
    nested_limit = 5, -- how many redirects to follow
    redirectors_only = true, -- keep following only while the target is a known redirector
    redirector_hosts_map = nil, -- map of redirector hosts; the module is disabled without it

    -- Symbols
    redirector_symbol = nil, -- inserted when a redirected url has been found
    redirector_symbol_nested = "URL_REDIRECTOR_NESTED", -- inserted when nested limit has been reached
  }
  --- Record `redir_url` as the redirect target of `orig_url` and add it to the task.
  -- @param task rspamd task being processed
  -- @param orig_url URL object the redirect was found for
  -- @param redir_url redirect target: a URL object, or a string that is parsed
  --        into one with the 'redirect_target' flag
  local function adjust_url(task, orig_url, redir_url)
    local mempool = task:get_mempool()
    -- Remember what the caller passed: a failed parse below turns `redir_url`
    -- into nil, and the log line must still name the offending value
    local requested = redir_url

    if type(redir_url) == 'string' then
      redir_url = rspamd_url.create(mempool, redir_url, { 'redirect_target' })
    end

    if not redir_url then
      rspamd_logger.infox(task, 'bad url %s as redirection for %s', requested, orig_url)
      return
    end

    orig_url:set_redirected(redir_url, mempool)
    task:inject_url(redir_url)

    if settings.redirector_symbol then
      task:insert_result(settings.redirector_symbol, 1.0,
          string.format('%s->%s', orig_url:get_host(), redir_url:get_host()))
    end
  end
  --- Cache the resolution result for a URL in Redis and account it in the top urls set.
  -- The value is written with SETEX under `key`; the same connection then gets a
  -- ZINCRBY on `settings.top_urls_key`. Once the SETEX reply arrives the set size
  -- is read with ZCARD and, when it exceeds twice `settings.top_urls_count`, the
  -- set is trimmed with ZREMRANGEBYRANK.
  -- @param task rspamd task
  -- @param orig_url original URL object
  -- @param url URL the original one has been resolved to
  -- @param key Redis key used for the cached value
  -- @param prefix optional tag; when given the value is stored as `^<prefix>:<url>`
  local function cache_url(task, orig_url, url, key, prefix)
    -- String representation
    local str_orig_url = tostring(orig_url)
    local str_url = tostring(url)

    if str_url ~= str_orig_url then
      -- Set redirected url
      adjust_url(task, orig_url, url)
    end

    local function redis_trim_cb(err, _)
      if err then
        rspamd_logger.errx(task, 'got error while trimming top urls: %s', err)
      else
        rspamd_logger.infox(task, 'trimmed url set to %s elements',
            settings.top_urls_count)
      end
    end

    -- Cleanup logic
    local function redis_card_cb(err, data)
      if err then
        rspamd_logger.errx(task, 'got error while getting top urls count: %s', err)
        return
      end

      -- A missing or non-numeric reply is ignored instead of raising on comparison
      local count = tonumber(data)
      if count and count > settings.top_urls_count * 2 then
        local ret = lua_redis.redis_make_request(task,
            redis_params, -- connect params
            key, -- hash key
            true, -- is write
            redis_trim_cb, --callback
            'ZREMRANGEBYRANK', -- command
            { settings.top_urls_key, '0',
              tostring(-(settings.top_urls_count + 1)) } -- arguments
        )
        if not ret then
          rspamd_logger.errx(task, 'cannot trim top urls set')
        else
          rspamd_logger.infox(task, 'need to trim urls set from %s to %s elements',
              data,
              settings.top_urls_count)
        end
      end
    end

    local function redis_set_cb(err, _)
      if err then
        rspamd_logger.errx(task, 'got error while setting redirect keys: %s', err)
        return
      end

      local ret = lua_redis.redis_make_request(task,
          redis_params, -- connect params
          key, -- hash key
          false, -- is write
          redis_card_cb, --callback
          'ZCARD', -- command
          { settings.top_urls_key } -- arguments
      )
      if not ret then
        rspamd_logger.errx(task, 'cannot make redis request to cache results')
      end
    end

    if prefix then
      -- Save url with prefix
      str_url = string.format('^%s:%s', prefix, str_url)
    end

    local ret, conn = lua_redis.redis_make_request(task,
        redis_params, -- connect params
        key, -- hash key
        true, -- is write
        redis_set_cb, --callback
        'SETEX', -- command
        { key, tostring(settings.expire), str_url } -- arguments
    )

    if not ret then
      rspamd_logger.errx(task, 'cannot make redis request to cache results')
    else
      conn:add_cmd('ZINCRBY', { settings.top_urls_key, '1', str_url })
    end
  end
  -- Shorten a string for display: values longer than `limit` bytes (16 when no
  -- limit is given) are cut to `limit` bytes and get a '...' suffix
  local function maybe_trim_url(url, limit)
    local max_len = limit or 16

    if #url <= max_len then
      return url
    end

    return string.sub(url, 1, max_len) .. '...'
  end
  --- Resolve a possibly cached redirect for a URL.
  -- The Redis key is read first. A cached value (anything but 'processing') is
  -- applied to `orig_url` right away. Otherwise the first hop reserves the key
  -- with `SET ... NX` and then follows redirects with HTTP HEAD requests; later
  -- hops continue without reserving again.
  -- @param task rspamd task
  -- @param orig_url the original url object
  -- @param url url object of the current hop (the original one on the first call)
  -- @param key Redis key used to cache the result
  -- @param ntries number of the current attempt, starting from 1
  local function resolve_cached(task, orig_url, url, key, ntries)
    local str_url = tostring(url or "")

    -- Perform one HTTP hop and decide whether to cache the result or go deeper
    local function resolve_url()
      if ntries > settings.nested_limit then
        -- We cannot resolve more, stop
        rspamd_logger.debugm(N, task, 'cannot get more requests to resolve %s, stop on %s after %s attempts',
            orig_url, url, ntries)
        cache_url(task, orig_url, url, key, 'nested')
        local str_orig_url = tostring(orig_url)
        task:insert_result(settings.redirector_symbol_nested, 1.0,
            string.format('%s->%s:%d', maybe_trim_url(str_orig_url), maybe_trim_url(str_url), ntries))
        return
      end

      local redirection_codes = {
        [301] = true, -- moved permanently
        [302] = true, -- found
        [303] = true, -- see other
        [307] = true, -- temporary redirect
        [308] = true, -- permanent redirect
      }

      local function http_callback(err, code, _, headers)
        if err then
          rspamd_logger.infox(task, 'found redirect error from %s to %s, err message: %s',
              orig_url, url, err)
          cache_url(task, orig_url, url, key)
        elseif code == 200 then
          if orig_url == url then
            rspamd_logger.infox(task, 'direct url %s, err code 200',
                url)
          else
            rspamd_logger.infox(task, 'found redirect from %s to %s, err code 200',
                orig_url, url)
          end
          cache_url(task, orig_url, url, key)
        elseif redirection_codes[code] then
          -- Guard against a reply without a headers table
          local loc = headers and headers['location']
          local redir_url
          if loc then
            redir_url = rspamd_url.create(task:get_mempool(), loc)
          end
          rspamd_logger.debugm(N, task, 'found redirect from %s to %s, err code %s',
              orig_url, loc, code)

          if not redir_url then
            rspamd_logger.debugm(N, task, "no location, headers: %s", headers)
            cache_url(task, orig_url, url, key)
          elseif settings.redirectors_only
              and not settings.redirector_hosts_map:get_key(redir_url:get_host()) then
            -- The target is not a known redirector: it is the final destination
            lua_util.debugm(N, task,
                "stop resolving redirects as %s is not a redirector", loc)
            cache_url(task, orig_url, redir_url, key)
          else
            resolve_cached(task, orig_url, redir_url, key, ntries + 1)
          end
        else
          rspamd_logger.debugm(N, task, 'found redirect error from %s to %s, err code: %s',
              orig_url, url, code)
          cache_url(task, orig_url, url, key)
        end
      end

      local ua
      if type(settings.user_agent) == 'string' then
        ua = settings.user_agent
      else
        local agents = settings.user_agent
        -- An empty or non-list setting would make math.random fail; use defaults
        if type(agents) ~= 'table' or #agents == 0 then
          agents = default_ua
        end
        ua = agents[math.random(#agents)]
      end

      lua_util.debugm(N, task, 'select user agent %s', ua)

      rspamd_http.request {
        headers = {
          ['User-Agent'] = ua,
        },
        url = str_url,
        task = task,
        method = 'head',
        max_size = settings.max_size,
        timeout = settings.timeout,
        opaque_body = true,
        no_ssl_verify = not settings.check_ssl,
        callback = http_callback
      }
    end

    local function redis_get_cb(err, data)
      if not err and type(data) == 'string' and data ~= 'processing' then
        -- Got cached result
        rspamd_logger.debugm(N, task, 'found cached redirect from %s to %s',
            url, data)
        if data:sub(1, 1) == '^' then
          -- Prefixed url stored
          local prefix, new_url = data:match('^%^(%a+):(.+)$')
          if not new_url then
            -- Value does not follow the `^<prefix>:<url>` layout: nothing usable
            rspamd_logger.errx(task, 'malformed cached redirect for %s: %s', url, data)
            return
          end
          if prefix == 'nested' then
            task:insert_result(settings.redirector_symbol_nested, 1.0,
                string.format('%s->%s:cached', maybe_trim_url(str_url), maybe_trim_url(new_url)))
          end
          data = new_url
        end
        if data ~= tostring(orig_url) then
          adjust_url(task, orig_url, data)
        end
        return
      end

      -- No usable cached value (error, miss or 'processing' marker)
      local function redis_reserve_cb(nerr, ndata)
        if nerr then
          rspamd_logger.errx(task, 'got error while setting redirect keys: %s', nerr)
        elseif ndata == 'OK' then
          resolve_url()
        end
      end

      if ntries == 1 then
        -- Reserve key in Redis that we are processing this redirection
        local ret = lua_redis.redis_make_request(task,
            redis_params, -- connect params
            key, -- hash key
            true, -- is write
            redis_reserve_cb, --callback
            'SET', -- command
            { key, 'processing', 'EX', tostring(settings.timeout * 2), 'NX' } -- arguments
        )
        if not ret then
          rspamd_logger.errx(task, 'Couldn\'t schedule SET')
        end
      else
        -- Just continue resolving
        resolve_url()
      end
    end

    local ret = lua_redis.redis_make_request(task,
        redis_params, -- connect params
        key, -- hash key
        false, -- is write
        redis_get_cb, --callback
        'GET', -- command
        { key } -- arguments
    )
    if not ret then
      rspamd_logger.errx(task, 'cannot make redis request to check results')
    end
  end
  -- Derive the cache key for a url and start resolving it as the first attempt
  local function url_redirector_process_url(task, url)
    -- Only the first 32 base32 characters of the digest of the raw url are kept
    -- (roughly 20 bytes of data or 160 bits)
    local digest = hash.create(url:get_raw()):base32()
    local cache_key = settings.key_prefix .. digest:sub(1, 32)

    resolve_cached(task, url, url, cache_key, 1)
  end
  -- Symbol callback: select urls hosted on known redirectors and resolve each one
  local function url_redirector_handler(task)
    -- Filter passed to the url extractor: accept hosts present in the redirectors map
    local function is_redirector(url)
      if settings.redirector_hosts_map:get_key(url:get_host()) then
        lua_util.debugm(N, task, 'check url %s', tostring(url))
        return true
      end
    end

    local candidates = lua_util.extract_specific_urls({
      task = task,
      limit = settings.max_urls,
      filter = is_redirector,
      no_cache = true,
      need_content = true,
    })

    if not candidates then
      return
    end

    for _, candidate in ipairs(candidates) do
      url_redirector_process_url(task, candidate)
    end
  end
  -- Module setup: merge the `url_redirector` options into the defaults, then
  -- register the Redis prefixes and the symbols. The module is disabled when no
  -- Redis servers are configured, when `redirector_hosts_map` is not set, or
  -- when that map cannot be created.
  local opts = rspamd_config:get_all_opt('url_redirector')
  if opts then
    settings = lua_util.override_defaults(settings, opts)
    redis_params = lua_redis.parse_redis_server('url_redirector', settings)

    if not redis_params then
      rspamd_logger.infox(rspamd_config, 'no servers are specified, disabling module')
      lua_util.disable_module(N, "redis")
    elseif not settings.redirector_hosts_map then
      rspamd_logger.infox(rspamd_config, 'no redirector_hosts_map option is specified, disabling module')
      lua_util.disable_module(N, "config")
    else
      local lua_maps = require "lua_maps"
      local hosts_map = lua_maps.map_add_from_ucl(settings.redirector_hosts_map,
          'set', 'Redirectors definitions')

      if not hosts_map then
        -- Without a usable map every later `:get_key` lookup would raise
        rspamd_logger.errx(rspamd_config, 'cannot load redirector_hosts_map, disabling module')
        lua_util.disable_module(N, "config")
      else
        settings.redirector_hosts_map = hosts_map

        lua_redis.register_prefix(settings.key_prefix .. '[a-z0-9]{32}', N,
            'URL redirector hashes', {
              type = 'string',
            })
        if settings.top_urls_key then
          lua_redis.register_prefix(settings.top_urls_key, N,
              'URL redirector top urls', {
                type = 'zlist',
              })
        end

        local id = rspamd_config:register_symbol {
          name = 'URL_REDIRECTOR_CHECK',
          type = 'callback,prefilter',
          priority = lua_util.symbols_priorities.medium,
          callback = url_redirector_handler,
          -- In fact, the real timeout is nested_limit * timeout...
          augmentations = { string.format("timeout=%f", settings.timeout) }
        }

        -- Result symbols are virtual children of the callback symbol
        rspamd_config:register_symbol {
          name = settings.redirector_symbol_nested,
          type = 'virtual',
          parent = id,
          score = 0,
        }

        if settings.redirector_symbol then
          rspamd_config:register_symbol {
            name = settings.redirector_symbol,
            type = 'virtual',
            parent = id,
            score = 0,
          }
        end
      end
    end
  end