You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

url_redirector.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. --[[
  2. Copyright (c) 2017, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  -- Skip the rest of this file whenever the `confighelp` global is set
  if confighelp then return end
  16. local rspamd_logger = require "rspamd_logger"
  17. local rspamd_http = require "rspamd_http"
  18. local hash = require "rspamd_cryptobox_hash"
  19. local rspamd_url = require "rspamd_url"
  20. local lua_util = require "lua_util"
  21. local lua_redis = require "lua_redis"
  22. local N = "url_redirector"
  -- Pool of well-known User-Agent strings; used as the default value of
  -- `settings.user_agent`
  local default_ua = {
    -- Crawlers
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    -- Command line client
    "Wget/1.9.1",
    -- Browsers
    "Mozilla/5.0 (Android; Linux armv7l; rv:9.0) Gecko/20111216 Firefox/9.0 Fennec/9.0",
    "Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    -- Link checker and text browser
    "W3C-checklink/4.5 [4.160] libwww-perl/5.823",
    "Lynx/2.8.8dev.3 libwww-FM/2.14 SSL-MM/1.4.1",
  }
  -- Redis connection parameters; assigned once the module options are parsed
  local redis_params

  -- Default module options
  local settings = {
    -- Lifetime of a cached resolution, in seconds (1 day by default)
    expire = 86400,
    -- Timeout in seconds (10 by default)
    timeout = 10,
    -- How many redirects to follow
    nested_limit = 5,
    -- Send request through proxy
    --proxy = "http://example.com:3128",
    -- Default hash name
    key_prefix = 'rdr:',
    -- Check ssl certificates
    check_ssl = false,
    -- How many urls to check
    max_urls = 5,
    -- Maximum body to process
    max_size = 10 * 1024,
    -- Either a single string or a list to pick from
    user_agent = default_ua,
    -- Insert symbol if redirected url has been found
    redirector_symbol = nil,
    -- Follow merely redirectors
    redirectors_only = true,
    -- Key for top urls
    top_urls_key = 'rdr:top_urls',
    -- How many top urls to save
    top_urls_count = 200,
    -- Check only those redirectors
    redirector_hosts_map = nil
  }
  --- Mark `redir_url` as the redirection target of `orig_url`.
  -- On success the target is attached to `orig_url`, injected into the task
  -- and, when `settings.redirector_symbol` is set, reported as a result with
  -- an `orig_host->target_host` option.
  -- @param task task object the urls belong to
  -- @param orig_url url object that was being resolved
  -- @param redir_url resolved target: a url object or its string form
  local function adjust_url(task, orig_url, redir_url)
    -- Keep the caller's value intact: when a string fails to parse we still
    -- want to log that string instead of `nil`
    local target = redir_url

    if type(target) == 'string' then
      target = rspamd_url.create(task:get_mempool(), target, {'redirect_target'})
    end

    if not target then
      rspamd_logger.infox(task, 'bad url %s as redirection for %s', redir_url, orig_url)
      return
    end

    orig_url:set_redirected(target)
    task:inject_url(target)

    if settings.redirector_symbol then
      task:insert_result(settings.redirector_symbol, 1.0,
        string.format('%s->%s', orig_url:get_host(), target:get_host()))
    end
  end
  --- Remember the outcome of a resolution in Redis.
  -- Applies the redirection to `orig_url` when the final url differs from it,
  -- stores the final url under `key` with `settings.expire` as TTL and, when
  -- `settings.top_urls_key` is set, bumps the url's score in that sorted set.
  -- The set is trimmed back to `settings.top_urls_count` entries once its
  -- cardinality exceeds twice that amount.
  -- @param task task object
  -- @param orig_url url object that was being resolved
  -- @param url final url object
  -- @param key Redis key of the cache entry
  -- @param param unused; kept so the signature stays unchanged
  local function cache_url(task, orig_url, url, key, param)
    -- String representation
    local str_orig_url = tostring(orig_url)
    local str_url = tostring(url)

    if str_url ~= str_orig_url then
      -- Set redirected url
      adjust_url(task, orig_url, url)
    end

    local function redis_trim_cb(err, _)
      if err then
        rspamd_logger.errx(task, 'got error while trimming top urls set: %s', err)
      else
        rspamd_logger.infox(task, 'trimmed url set to %s elements',
          settings.top_urls_count)
      end
    end

    -- Cleanup logic
    local function redis_card_cb(err, data)
      if err then
        rspamd_logger.errx(task, 'got error while getting top urls count: %s', err)
        return
      end

      -- A missing or non-numeric reply must not raise on the comparison
      local count = data and tonumber(data)
      if not count or count <= settings.top_urls_count * 2 then
        return
      end

      local ret = lua_redis.redis_make_request(task,
        redis_params, -- connect params
        key, -- hash key
        true, -- is write
        redis_trim_cb, --callback
        'ZREMRANGEBYRANK', -- command
        -- Drop everything except the `top_urls_count` highest ranked members
        {settings.top_urls_key, '0',
          tostring(-(settings.top_urls_count + 1))} -- arguments
      )

      if not ret then
        rspamd_logger.errx(task, 'cannot trim top urls set')
      else
        rspamd_logger.infox(task, 'need to trim urls set from %s to %s elements',
          data,
          settings.top_urls_count)
      end
    end

    local function redis_set_cb(err, _)
      if err then
        rspamd_logger.errx(task, 'got error while setting redirect keys: %s', err)
        return
      end

      if not settings.top_urls_key then
        -- Top urls accounting is switched off, nothing to clean up
        return
      end

      local ret = lua_redis.redis_make_request(task,
        redis_params, -- connect params
        key, -- hash key
        false, -- is write
        redis_card_cb, --callback
        'ZCARD', -- command
        {settings.top_urls_key} -- arguments
      )

      if not ret then
        rspamd_logger.errx(task, 'cannot make redis request to cache results')
      end
    end

    local ret, conn = lua_redis.redis_make_request(task,
      redis_params, -- connect params
      key, -- hash key
      true, -- is write
      redis_set_cb, --callback
      'SETEX', -- command
      {key, tostring(settings.expire), str_url} -- arguments
    )

    if not ret then
      rspamd_logger.errx(task, 'cannot make redis request to cache results')
    elseif settings.top_urls_key then
      conn:add_cmd('ZINCRBY', {settings.top_urls_key, '1', str_url})
    end
  end
  -- HTTP status codes whose `Location` header names the next hop to follow
  local redirect_codes = {
    [301] = true,
    [302] = true,
    [303] = true,
    [307] = true,
    [308] = true,
  }

  --- Resolve maybe cached url.
  -- The Redis entry `key` is looked up first. A stored string other than
  -- 'processing' is taken as the final target and applied right away.
  -- Otherwise a HEAD request is sent to `url`; on the first attempt
  -- (`ntries == 1`) this only happens after `key` has been reserved with
  -- `SET ... NX`, later attempts go straight to the request. Redirect replies
  -- are followed by calling this function again with `ntries + 1` until
  -- `settings.nested_limit` is exceeded; every terminal outcome is passed to
  -- `cache_url`.
  -- @param task task object
  -- @param orig_url the original url object
  -- @param url url object to request now (same as `orig_url` on the first call)
  -- @param key Redis key of the cache entry for `orig_url`
  -- @param ntries 1-based attempt counter
  local function resolve_cached(task, orig_url, url, key, ntries)
    local function resolve_url()
      if ntries > settings.nested_limit then
        -- We cannot resolve more, stop
        rspamd_logger.debugm(N, task, 'cannot get more requests to resolve %s, stop on %s after %s attempts',
          orig_url, url, ntries)
        cache_url(task, orig_url, url, key)
        return
      end

      local function http_callback(err, code, _, headers)
        if err then
          rspamd_logger.infox(task, 'found redirect error from %s to %s, err message: %s',
            orig_url, url, err)
          cache_url(task, orig_url, url, key)
        elseif code == 200 then
          if orig_url == url then
            rspamd_logger.infox(task, 'direct url %s, err code 200',
              url)
          else
            rspamd_logger.infox(task, 'found redirect from %s to %s, err code 200',
              orig_url, url)
          end
          cache_url(task, orig_url, url, key)
        elseif redirect_codes[code] then
          -- Tolerate a reply that comes without any headers table
          local loc = headers and headers['location']
          local redir_url
          if loc then
            redir_url = rspamd_url.create(task:get_mempool(), loc)
          end
          rspamd_logger.debugm(N, task, 'found redirect from %s to %s, err code %s',
            orig_url, loc, code)

          if not redir_url then
            rspamd_logger.debugm(N, task, "no location, headers: %s", headers)
            cache_url(task, orig_url, url, key)
          elseif settings.redirectors_only and
              not settings.redirector_hosts_map:get_key(redir_url:get_host()) then
            -- The target is not a known redirector: it is the final url
            lua_util.debugm(N, task,
              "stop resolving redirects as %s is not a redirector", loc)
            cache_url(task, orig_url, redir_url, key)
          else
            resolve_cached(task, orig_url, redir_url, key, ntries + 1)
          end
        else
          rspamd_logger.debugm(N, task, 'found redirect error from %s to %s, err code: %s',
            orig_url, url, code)
          cache_url(task, orig_url, url, key)
        end
      end

      -- A string is used verbatim, a list yields a random entry per request
      local ua
      if type(settings.user_agent) == 'string' then
        ua = settings.user_agent
      else
        ua = settings.user_agent[math.random(#settings.user_agent)]
      end

      lua_util.debugm(N, task, 'select user agent %s', ua)

      rspamd_http.request{
        headers = {
          ['User-Agent'] = ua,
        },
        url = tostring(url),
        task = task,
        method = 'head',
        max_size = settings.max_size,
        timeout = settings.timeout,
        opaque_body = true,
        no_ssl_verify = not settings.check_ssl,
        callback = http_callback
      }
    end

    local function redis_get_cb(err, data)
      if not err and type(data) == 'string' and data ~= 'processing' then
        -- Got cached result
        rspamd_logger.debugm(N, task, 'found cached redirect from %s to %s',
          url, data)
        if data ~= tostring(orig_url) then
          adjust_url(task, orig_url, data)
        end
        return
      end

      local function redis_reserve_cb(nerr, ndata)
        if nerr then
          rspamd_logger.errx(task, 'got error while setting redirect keys: %s', nerr)
        elseif ndata == 'OK' then
          -- Only a reply of 'OK' starts the resolution
          resolve_url()
        end
      end

      if ntries == 1 then
        -- Reserve key in Redis that we are processing this redirection
        local ret = lua_redis.redis_make_request(task,
          redis_params, -- connect params
          key, -- hash key
          true, -- is write
          redis_reserve_cb, --callback
          'SET', -- command
          {key, 'processing', 'EX', tostring(settings.timeout * 2), 'NX'} -- arguments
        )
        if not ret then
          rspamd_logger.errx(task, 'Couldn\'t schedule SET')
        end
      else
        -- Just continue resolving
        resolve_url()
      end
    end

    local ret = lua_redis.redis_make_request(task,
      redis_params, -- connect params
      key, -- hash key
      false, -- is write
      redis_get_cb, --callback
      'GET', -- command
      {key} -- arguments
    )
    if not ret then
      rspamd_logger.errx(task, 'cannot make redis request to check results')
    end
  end
  --- Start resolving one url: derive its cache key and hand it over to
  -- `resolve_cached` as the first attempt.
  -- @param task task object
  -- @param url url object to resolve
  local function url_redirector_process_url(task, url)
    local digest = hash.create(url:get_raw()):base32()
    -- 32 base32 characters are roughly 20 bytes of data or 160 bits
    local redis_key = settings.key_prefix .. digest:sub(1, 32)

    resolve_cached(task, url, url, redis_key, 1)
  end
  --- Symbol callback: pick up to `settings.max_urls` urls whose host is listed
  -- in `settings.redirector_hosts_map` and start resolving each of them.
  -- @param task task object
  local function url_redirector_handler(task)
    -- Accept only urls that point to a known redirector host
    local function is_redirector(url)
      if settings.redirector_hosts_map:get_key(url:get_host()) then
        lua_util.debugm(N, task, 'check url %s', tostring(url))
        return true
      end
    end

    local candidates = lua_util.extract_specific_urls({
      task = task,
      limit = settings.max_urls,
      filter = is_redirector,
      no_cache = true,
    })

    for _, candidate in ipairs(candidates or {}) do
      url_redirector_process_url(task, candidate)
    end
  end
  -- Module setup: merge the `url_redirector` options over the defaults, then
  -- register the Redis prefixes and the symbols. The module is disabled when
  -- no Redis servers are configured, when `redirector_hosts_map` is missing
  -- or when that map cannot be turned into a map object.
  local opts = rspamd_config:get_all_opt('url_redirector')
  if opts then
    settings = lua_util.override_defaults(settings, opts)
    redis_params = lua_redis.parse_redis_server('url_redirector', settings)

    if not redis_params then
      rspamd_logger.infox(rspamd_config, 'no servers are specified, disabling module')
      lua_util.disable_module(N, "redis")
    elseif not settings.redirector_hosts_map then
      rspamd_logger.infox(rspamd_config, 'no redirector_hosts_map option is specified, disabling module')
      lua_util.disable_module(N, "config")
    else
      local lua_maps = require "lua_maps"
      local hosts_map = lua_maps.map_add_from_ucl(settings.redirector_hosts_map,
        'set', 'Redirectors definitions')

      if not hosts_map then
        -- The handler calls `:get_key` on this map for every url, so running
        -- without a map object would fail at scan time
        rspamd_logger.errx(rspamd_config, 'cannot load redirector_hosts_map, disabling module')
        lua_util.disable_module(N, "config")
      else
        settings.redirector_hosts_map = hosts_map

        lua_redis.register_prefix(settings.key_prefix .. '[a-z0-9]{32}', N,
          'URL redirector hashes', {
            type = 'string',
          })
        if settings.top_urls_key then
          lua_redis.register_prefix(settings.top_urls_key, N,
            'URL redirector top urls', {
              type = 'zlist',
            })
        end

        local id = rspamd_config:register_symbol{
          name = 'URL_REDIRECTOR_CHECK',
          type = 'callback,prefilter',
          callback = url_redirector_handler,
        }

        if settings.redirector_symbol then
          -- Virtual child symbol, inserted when a redirection has been found
          rspamd_config:register_symbol{
            name = settings.redirector_symbol,
            type = 'virtual',
            parent = id,
            score = 0,
          }
        end
      end
    end
  end