You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

extractors.lua 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local fun = require 'fun'
  14. local meta_functions = require "lua_meta"
  15. local lua_util = require "lua_util"
  16. local rspamd_url = require "rspamd_url"
  17. local common = require "lua_selectors/common"
  18. local ts = require("tableshape").types
  19. local maps = require "lua_selectors/maps"
  -- Shared empty table: lets `(x or E)[1]` style lookups tolerate nil values
  local E = {}
  -- Module name handed to lua_util.debugm
  local M = "selectors"
  -- Schema for an optional array whose items are keys of rspamd_url.flags
  local known_url_flags = lua_util.keys(rspamd_url.flags)
  local url_flags_ts = ts.array_of(ts.one_of(known_url_flags)):is_optional()
  --- Build a predicate that rejects URLs carrying any of the given flags.
  -- @param exclude_flags array of flag names to test for
  -- @return function(u) returning false as soon as `u:get_flags()` has one of
  --         the listed flags set, true otherwise
  local function gen_exclude_flags_filter(exclude_flags)
    local function url_allowed(u)
      local url_flags = u:get_flags()
      for _, name in ipairs(exclude_flags) do
        if url_flags[name] then
          return false
        end
      end
      return true
    end

    return url_allowed
  end
  32. local extractors = {
  -- Identity extractor: echoes its first argument back as a string
  ['id'] = {
    -- Returns args[1] when it is set, otherwise the empty string
    get_value = function(_, args)
      local value = args[1]
      if not value then
        return '', 'string'
      end
      return value, 'string'
    end,
    description = [[Return value from function's argument or an empty string,
For example, `id('Something')` returns a string 'Something']],
    args_schema = { ts.string:is_optional() },
  },
  -- List extractor: builds a string list out of the arguments
  ['list'] = {
    get_value = function(_, args)
      if not args[1] then
        -- No arguments at all: empty list
        return {}, 'string_list'
      end
      -- Apply tostring to every argument via fun.map
      return fun.map(tostring, args), 'string_list'
    end,
    description = [[Return a list from function's arguments or an empty list,
For example, `list('foo', 'bar')` returns a list {'foo', 'bar'}]],
  },
  -- Source IP address of the task (only when present and valid)
  ['ip'] = {
    get_value = function(task)
      local addr = task:get_ip()
      if not addr or not addr:is_valid() then
        return nil
      end
      return addr, 'userdata'
    end,
    description = [[Get source IP address]],
  },
  -- Sender address: first element returned by task:get_from
  ['from'] = {
    get_value = function(task, args)
      -- Forward the argument table when there is one, otherwise ask for type 0
      local selector = 0
      if type(args) == 'table' then
        selector = args
      end

      local from = task:get_from(selector)
      -- E guards against a nil result and against an empty list
      local first = (from or E)[1] or E
      if first.addr then
        return from[1], 'table'
      end
      return nil
    end,
    description = [[Get MIME or SMTP from (e.g. `from('smtp')` or `from('mime')`,
uses any type by default)]],
  },
  -- Recipients: the whole list returned by task:get_recipients
  ['rcpts'] = {
    get_value = function(task, args)
      -- Forward the argument table when there is one, otherwise ask for type 0
      local selector = 0
      if type(args) == 'table' then
        selector = args
      end

      local rcpts = task:get_recipients(selector)
      -- Only report the list when its first element has an address
      local first = (rcpts or E)[1] or E
      if first.addr then
        return rcpts, 'table_list'
      end
      return nil
    end,
    description = [[Get MIME or SMTP rcpts (e.g. `rcpts('smtp')` or `rcpts('mime')`,
uses any type by default)]],
  },
  -- Country, read from the `country` mempool variable
  -- (ASN module must be executed first)
  ['country'] = {
    get_value = function(task)
      local country = task:get_mempool():get_variable('country')
      if country then
        return country, 'string'
      end
      return nil
    end,
    description = [[Get country (ASN module must be executed first)]],
  },
  -- AS number, read from the `asn` mempool variable
  -- (ASN module must be executed first)
  ['asn'] = {
    type = 'string',
    get_value = function(task)
      local asn = task:get_mempool():get_variable('asn')
      if asn then
        return asn, 'string'
      end
      return nil
    end,
    description = [[Get AS number (ASN module must be executed first)]],
  },
  -- Authenticated user name as reported by task:get_user
  ['user'] = {
    get_value = function(task)
      local auser = task:get_user()
      if auser then
        return auser, 'string'
      end
      return nil
    end,
    description = 'Get authenticated user name',
  },
  -- Principal recipient as reported by task:get_principal_recipient
  ['to'] = {
    get_value = function(task)
      -- The result is passed through unchecked, so it may be nil
      local rcpt = task:get_principal_recipient()
      return rcpt, 'string'
    end,
    description = 'Get principal recipient',
  },
  -- Content digest as reported by task:get_digest
  ['digest'] = {
    get_value = function(task)
      local digest = task:get_digest()
      return digest, 'string'
    end,
    description = 'Get content digest',
  },
  -- Digests of every part that reports itself as an attachment
  ['attachments'] = {
    get_value = function(task, args)
      local digests = {}
      for idx, part in ipairs(task:get_parts() or E) do
        if part:is_attachment() then
          -- The part index is handed to the helper together with the part
          table.insert(digests, common.get_cached_or_raw_digest(task, idx, part, args))
        end
      end

      if #digests == 0 then
        return nil
      end
      return digests, 'string_list'
    end,
    description = [[Get list of all attachments digests.
The first optional argument is encoding (`hex`, `base32` (and forms `bleach32`, `rbase32`), `base64`),
the second optional argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`, `md5`)]],
    args_schema = common.digest_schema(),
  },
  -- File names of all parts that have one
  ['files'] = {
    get_value = function(task)
      local names = {}
      for _, part in ipairs(task:get_parts() or E) do
        local fname = part:get_filename()
        if fname then
          names[#names + 1] = fname
        end
      end

      if #names == 0 then
        return nil
      end
      return names, 'string_list'
    end,
    description = 'Get all attachments files',
  },
  -- Languages reported by the text parts (parts without one are skipped)
  ['languages'] = {
    get_value = function(task)
      local langs = {}
      for _, part in ipairs(task:get_text_parts() or E) do
        local lang = part:get_language()
        if lang then
          langs[#langs + 1] = lang
        end
      end

      if #langs == 0 then
        return nil
      end
      return langs, 'string_list'
    end,
    description = 'Get languages for text parts',
  },
  -- HELO value as reported by task:get_helo
  ['helo'] = {
    get_value = function(task)
      local helo = task:get_helo()
      return helo, 'string'
    end,
    description = 'Get helo value',
  },
  -- Header lookup by name (first argument). The optional second argument is a
  -- flags string: `full` switches to task:get_header_full, `strong` is passed
  -- on as the second argument of the lookup
  ['header'] = {
    get_value = function(task, args)
      local name, flags = args[1], args[2]
      if not flags then
        -- No flags given: plain single-argument lookup
        return task:get_header(name), 'string'
      end

      local strong = flags:match('strong') ~= nil
      if flags:match('full') then
        return task:get_header_full(name, strong), 'table_list'
      end
      return task:get_header(name, strong), 'string'
    end,
    description = [[Get header with the name that is expected as an argument.
The optional second argument accepts list of flags:
- `full`: returns all headers with this name with all data (like task:get_header_full())
- `strong`: use case sensitive match when matching header's name]],
    args_schema = {
      ts.string,
      (ts.pattern("strong") + ts.pattern("full")):is_optional(),
    },
  },
  -- Received headers: either the raw list of tables or one field of each
  ['received'] = {
    get_value = function(task, args)
      local rh = task:get_received_headers()
      if not rh[1] then
        return nil
      end

      local field = args[1]
      if not field then
        return rh, 'table_list'
      end
      -- Project every received table onto the requested field
      local function pick(r)
        return r[field]
      end
      return fun.map(pick, rh), 'string_list'
    end,
    description = [[Get list of received headers.
If no arguments specified, returns list of tables. Otherwise, selects a specific element,
e.g. `by_hostname`]],
  },
  -- All urls: either the url objects or the result of one method on each
  ['urls'] = {
    get_value = function(task, args)
      local urls = task:get_urls()
      if not urls[1] then
        return nil
      end

      local method = args[1]
      if not method then
        return urls, 'userdata_list'
      end
      -- Call the named method on every url object
      local function call_method(u)
        return u[method](u)
      end
      return fun.map(call_method, urls), 'string_list'
    end,
    description = [[Get list of all urls.
If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
e.g. `get_tld`]],
  },
  -- Urls selected by lua_util.extract_specific_urls; the first argument is the
  -- parameter table for that function
  ['specific_urls'] = {
    get_value = function(task, args)
      local params = args[1] or {}
      params.task = task
      params.no_cache = true

      -- Translate `exclude_flags` into a filter callback
      local excluded = params.exclude_flags
      if excluded then
        params.filter = gen_exclude_flags_filter(excluded)
      end

      local urls = lua_util.extract_specific_urls(params)
      if urls[1] then
        return urls, 'userdata_list'
      end
      return nil
    end,
    description = [[Get most specific urls. Arguments are equal to the Lua API function]],
    args_schema = {
      ts.shape({
        limit = ts.number + ts.string / tonumber,
        esld_limit = (ts.number + ts.string / tonumber):is_optional(),
        exclude_flags = url_flags_ts,
        flags = url_flags_ts,
        flags_mode = ts.one_of({ 'explicit' }):is_optional(),
        prefix = ts.string:is_optional(),
        need_content = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
        need_emails = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
        need_images = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
        ignore_redirected = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      }),
    },
  },
  -- Urls selected by lua_util.extract_specific_urls and then filtered through a
  -- named map: the first argument is the map name (looked up in `maps`), the
  -- second one is the parameter table for extract_specific_urls
  ['specific_urls_filter_map'] = {
    ['get_value'] = function(task, args)
      local map = maps[args[1]]
      if not map then
        lua_util.debugm(M, "invalid/unknown map: %s", args[1])
        -- Without a map there is nothing to filter against; bail out here
        -- instead of failing later on `map:get_key`
        return nil
      end

      local params = args[2] or {}
      params.task = task
      params.no_cache = true
      -- Translate `exclude_flags` into a filter callback
      if params.exclude_flags then
        params.filter = gen_exclude_flags_filter(params.exclude_flags)
      end

      local urls = lua_util.extract_specific_urls(params)
      if not urls[1] then
        return nil
      end
      -- Keep only the urls whose string form is found in the map
      return fun.filter(function(u) return map:get_key(tostring(u)) end, urls),'userdata_list'
    end,
    ['description'] = [[Get most specific urls, filtered by some map. Arguments are equal to the Lua API function]],
    ['args_schema'] = {ts.string, ts.shape{
      limit = ts.number + ts.string / tonumber,
      esld_limit = (ts.number + ts.string / tonumber):is_optional(),
      exclude_flags = url_flags_ts,
      flags = url_flags_ts,
      flags_mode = ts.one_of{'explicit'}:is_optional(),
      prefix = ts.string:is_optional(),
      need_content = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_emails = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_images = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      ignore_redirected = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
    }}
  },
  -- Urls filtered by include/exclude flag lists (task:get_urls_filtered)
  ['urls_filtered'] = {
    get_value = function(task, args)
      local include, exclude = args[1], args[2]
      local urls = task:get_urls_filtered(include, exclude)
      if urls[1] then
        return urls, 'userdata_list'
      end
      return nil
    end,
    description = [[Get list of all urls filtered by flags_include/exclude
(see rspamd_task:get_urls_filtered for description)]],
    args_schema = {
      ts.array_of({
        url_flags_ts:is_optional(), url_flags_ts:is_optional()
      }),
    },
  },
  -- All emails: either the url objects or the result of one method on each
  ['emails'] = {
    get_value = function(task, args)
      local emails = task:get_emails()
      if not emails[1] then
        return nil
      end

      local method = args[1]
      if not method then
        return emails, 'userdata_list'
      end
      -- Call the named method on every email object
      local function call_method(e)
        return e[method](e)
      end
      return fun.map(call_method, emails), 'string_list'
    end,
    description = [[Get list of all emails.
If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
e.g. `get_user`]],
  },
  -- Mempool variable lookup: first argument is the variable name, the optional
  -- second one is its type ('string' when omitted)
  ['pool_var'] = {
    get_value = function(task, args)
      local var_type = args[2] or 'string'
      local value = task:get_mempool():get_variable(args[1], var_type)
      -- The requested type doubles as the reported result type
      return value, var_type
    end,
    description = [[Get specific pool var. The first argument must be variable name,
the second argument is optional and defines the type (string by default)]],
    args_schema = { ts.string, ts.string:is_optional() },
  },
  -- Task cache lookup by key name (first argument)
  ['task_cache'] = {
    get_value = function(task, args)
      local cached = task:cache_get(args[1])
      if not cached then
        return
      end

      -- Scalars are reported as a single string
      if type(cached) ~= 'table' then
        return cached, 'string'
      end
      -- Tables are reported as a list, unless they have no first element
      if cached[1] then
        return cached, 'string_list'
      end
      return
    end,
    description = [[Get value of specific key from task cache. The first argument must be
the key name]],
    args_schema = { ts.string },
  },
  -- HTTP request header lookup by name (first argument)
  ['request_header'] = {
    get_value = function(task, args)
      local hdr = task:get_request_header(args[1])
      if not hdr then
        return nil
      end
      return hdr, 'string'
    end,
    description = [[Get specific HTTP request header.
The first argument must be header name.]],
    args_schema = { ts.string },
  },
  -- Task date, optionally formatted.
  -- args[1] selects which timestamp task:get_date should report and falls back
  -- to 'message' when omitted; args[2] is an optional os.date format string.
  ['time'] = {
    ['get_value'] = function(task, args)
      local what = args[1] or 'message'
      local dt = task:get_date{format = what, gmt = true}
      if dt then
        if args[2] then
          -- Should be in format !xxx, as dt is in GMT
          return os.date(args[2], dt),'string'
        end
        return tostring(dt),'string'
      end
      return nil
    end,
    -- The "(default)" marker sits on `message` because that is the value the
    -- code above uses when no type is given
    ['description'] = [[Get task timestamp. The first argument is type:
- `connect`: connection timestamp
- `message`: timestamp as defined by `Date` header (default)
The second argument is optional time format, see [os.date](http://pgl.yoyo.org/luai/i/os.date) description]],
    ['args_schema'] = {ts.one_of{'connect', 'message'}:is_optional(),
                       ts.string:is_optional()}
  },
  -- Words of all text parts, flattened into one list
  ['words'] = {
    get_value = function(task, args)
      local how = args[1] or 'stem'
      local tp = task:get_text_parts()
      if not tp then
        return nil
      end

      -- `full` is reported as a list of tables, every other mode as strings
      local rtype
      if how == 'full' then
        rtype = 'table_list'
      else
        rtype = 'string_list'
      end

      local function part_words(p)
        return p:get_words(how)
      end
      return lua_util.flatten(fun.map(part_words, tp)), rtype
    end,
    description = [[Get words from text parts
- `stem`: stemmed words (default)
- `raw`: raw words
- `norm`: normalised words (lowercased)
- `full`: list of tables
]],
    args_schema = { ts.one_of({ 'stem', 'raw', 'norm', 'full' }):is_optional() },
  },
  -- Queue ID as reported by task:get_queue_id
  ['queueid'] = {
    get_value = function(task)
      local queueid = task:get_queue_id()
      if not queueid then
        return nil
      end
      return queueid, 'string'
    end,
    description = [[Get queue ID]],
  },
  -- ID of the task being processed (task:get_uid)
  ['uid'] = {
    get_value = function(task)
      local uid = task:get_uid()
      if not uid then
        return nil
      end
      return uid, 'string'
    end,
    description = [[Get ID of the task being processed]],
  },
  -- Message ID of the task being processed (task:get_message_id)
  ['messageid'] = {
    get_value = function(task)
      local mid = task:get_message_id()
      if not mid then
        return nil
      end
      return mid, 'string'
    end,
    description = [[Get message ID]],
  },
  -- Symbol lookup: symbol name plus optional shadow result name
  ['symbol'] = {
    get_value = function(task, args)
      local found = task:get_symbol(args[1], args[2])
      if not found then
        return
      end
      -- Only the first element of the lookup result is reported
      return found[1], 'table'
    end,
    description = 'Get specific symbol. The first argument must be the symbol name. ' ..
        'The second argument is an optional shadow result name. ' ..
        'Returns the symbol table. See task:get_symbol()',
    args_schema = { ts.string, ts.string:is_optional() },
  },
  -- Full scan result: args[1] is an optional result name passed to
  -- task:get_metric_result
  ['scan_result'] = {
    ['get_value'] = function(task, args)
      local res = task:get_metric_result(args[1])
      if res then
        return res,'table'
      end
    end,
    -- The two literals are joined with '. ' so the sentences do not run together
    ['description'] = 'Get full scan result (either default or shadow if shadow result name is specified). ' ..
        'Returns the result table. See task:get_metric_result()',
    ['args_schema'] = {ts.string:is_optional()}
  },
  -- Metatokens of the message, each converted with tostring
  ['metatokens'] = {
    get_value = function(task)
      local tokens = meta_functions.gen_metatokens(task)
      if not tokens[1] then
        return nil
      end

      local res = {}
      for i, tok in ipairs(tokens) do
        res[i] = tostring(tok)
      end
      return res, 'string_list'
    end,
    description = 'Get metatokens for a message as strings',
  },
  516. }
  517. return extractors