You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

extractors.lua 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local fun = require 'fun'
  14. local meta_functions = require "lua_meta"
  15. local lua_util = require "lua_util"
  16. local rspamd_url = require "rspamd_url"
  17. local common = require "lua_selectors/common"
  18. local ts = require("tableshape").types
  19. local maps = require "lua_selectors/maps"
  20. local E = {}
  21. local M = "selectors"
  22. local url_flags_ts = ts.array_of(ts.one_of(lua_util.keys(rspamd_url.flags))):is_optional()
  --- Build a URL predicate that rejects URLs carrying any excluded flag.
  -- @param exclude_flags sequence of flag names; a URL having any of them is rejected
  -- @return function(u) returning `false` when `u:get_flags()` contains one of
  --         the excluded names and `true` otherwise
  local function gen_exclude_flags_filter(exclude_flags)
    local function url_is_allowed(u)
      local present = u:get_flags()
      for _, name in ipairs(exclude_flags) do
        if present[name] then
          return false
        end
      end
      return true
    end

    return url_is_allowed
  end
  34. local extractors = {
  -- Identity extractor: hands its first argument back unchanged
  ['id'] = {
    ['get_value'] = function(_, args)
      -- A missing first argument degrades to the empty string
      return args[1] or '', 'string'
    end,
    ['description'] = [[Return value from function's argument or an empty string,
For example, `id('Something')` returns a string 'Something']],
    ['args_schema'] = { ts.string:is_optional() }
  },
  -- Same idea as `id`, but produces a list built from all arguments
  ['list'] = {
    ['get_value'] = function(_, args)
      if not args[1] then
        -- No arguments at all: empty list
        return {}, 'string_list'
      end
      -- Every argument is passed through tostring
      return fun.map(tostring, args), 'string_list'
    end,
    ['description'] = [[Return a list from function's arguments or an empty list,
For example, `list('foo', 'bar')` returns a list {'foo', 'bar'}]],
  },
  -- Source IP address of the task (only when present and valid)
  ['ip'] = {
    ['get_value'] = function(task)
      local addr = task:get_ip()
      if not (addr and addr:is_valid()) then
        return nil
      end
      return addr, 'userdata'
    end,
    ['description'] = [[Get source IP address]],
  },
  -- Get MIME or SMTP from (first address only)
  ['from'] = {
    ['get_value'] = function(task, args)
      -- A table of arguments is forwarded as is; anything else falls back to 0
      local how = 0
      if type(args) == 'table' then
        how = args
      end

      local addrs = task:get_from(how)
      local first = addrs and addrs[1]
      if first and first.addr then
        return first, 'table'
      end
      return nil
    end,
    ['description'] = [[Get MIME or SMTP from (e.g. `from('smtp')` or `from('mime')`,
uses any type by default)]],
  },
  -- Get MIME or SMTP recipients (whole list)
  ['rcpts'] = {
    ['get_value'] = function(task, args)
      -- A table of arguments is forwarded as is; anything else falls back to 0
      local how = 0
      if type(args) == 'table' then
        how = args
      end

      local rcpts = task:get_recipients(how)
      local first = rcpts and rcpts[1]
      -- The list is only returned when its first entry carries an address
      if first and first.addr then
        return rcpts, 'table_list'
      end
      return nil
    end,
    ['description'] = [[Get MIME or SMTP rcpts (e.g. `rcpts('smtp')` or `rcpts('mime')`,
uses any type by default)]],
  },
  -- Country read from the `country` mempool variable
  -- (ASN module must be executed first)
  ['country'] = {
    ['get_value'] = function(task)
      local value = task:get_mempool():get_variable('country')
      if value then
        return value, 'string'
      end
      return nil
    end,
    ['description'] = [[Get country (ASN module must be executed first)]],
  },
  -- AS number read from the `asn` mempool variable
  ['asn'] = {
    ['type'] = 'string',
    ['get_value'] = function(task)
      local value = task:get_mempool():get_variable('asn')
      if value then
        return value, 'string'
      end
      return nil
    end,
    ['description'] = [[Get AS number (ASN module must be executed first)]],
  },
  -- Authenticated user name, if the task has one
  ['user'] = {
    ['get_value'] = function(task)
      local name = task:get_user()
      if name then
        return name, 'string'
      end
      return nil
    end,
    ['description'] = 'Get authenticated user name',
  },
  -- Principal recipient of the task
  ['to'] = {
    ['get_value'] = function(task)
      local rcpt = task:get_principal_recipient()
      return rcpt, 'string'
    end,
    ['description'] = 'Get principal recipient',
  },
  -- Content digest of the task
  ['digest'] = {
    ['get_value'] = function(task)
      local value = task:get_digest()
      return value, 'string'
    end,
    ['description'] = 'Get content digest',
  },
  -- Digests of every part that reports itself as an attachment
  ['attachments'] = {
    ['get_value'] = function(task, args)
      local collected = {}
      for idx, part in ipairs(task:get_parts() or E) do
        if part:is_attachment() then
          -- `args` (encoding / hash type) is passed through to the common helper
          table.insert(collected, common.get_cached_or_raw_digest(task, idx, part, args))
        end
      end

      if #collected == 0 then
        return nil
      end
      return collected, 'string_list'
    end,
    ['description'] = [[Get list of all attachments digests.
The first optional argument is encoding (`hex`, `base32` (and forms `bleach32`, `rbase32`), `base64`),
the second optional argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`, `md5`)]],
    ['args_schema'] = common.digest_schema()
  },
  -- File names of all parts that have one
  ['files'] = {
    ['get_value'] = function(task)
      local names = {}
      for _, part in ipairs(task:get_parts() or E) do
        local fname = part:get_filename()
        if fname then
          names[#names + 1] = fname
        end
      end

      if #names == 0 then
        return nil
      end
      return names, 'string_list'
    end,
    ['description'] = 'Get all attachments files',
  },
  -- Languages reported by the text parts
  ['languages'] = {
    ['get_value'] = function(task)
      local found = {}
      for _, part in ipairs(task:get_text_parts() or E) do
        local lang = part:get_language()
        if lang then
          found[#found + 1] = lang
        end
      end

      if #found == 0 then
        return nil
      end
      return found, 'string_list'
    end,
    ['description'] = 'Get languages for text parts',
  },
  -- HELO value of the task
  ['helo'] = {
    ['get_value'] = function(task)
      local value = task:get_helo()
      return value, 'string'
    end,
    ['description'] = 'Get helo value',
  },
  -- Get the header whose name is given as the first argument.
  -- The optional second argument is a flags string (`strong` and/or `full`).
  ['header'] = {
    ['get_value'] = function(task, args)
      local name, flags = args[1], args[2]
      if not flags then
        return task:get_header(name), 'string'
      end

      -- `strong` requests case sensitive header name matching
      local strong = flags:match('strong') and true or false
      if flags:match('full') then
        return task:get_header_full(name, strong), 'table_list'
      end
      return task:get_header(name, strong), 'string'
    end,
    ['description'] = [[Get header with the name that is expected as an argument.
The optional second argument accepts list of flags:
- `full`: returns all headers with this name with all data (like task:get_header_full())
- `strong`: use case sensitive match when matching header's name]],
    ['args_schema'] = { ts.string,
                        (ts.pattern("strong") + ts.pattern("full")):is_optional() }
  },
  -- Received headers: the raw list of tables, or one field picked from each
  ['received'] = {
    ['get_value'] = function(task, args)
      local hdrs = task:get_received_headers()
      if not hdrs[1] then
        return nil
      end

      local field = args[1]
      if not field then
        return hdrs, 'table_list'
      end

      -- Project the requested field out of every received header
      local function pick(r)
        return r[field]
      end
      return fun.map(pick, hdrs), 'string_list'
    end,
    ['description'] = [[Get list of received headers.
If no arguments specified, returns list of tables. Otherwise, selects a specific element,
e.g. `by_hostname`]],
  },
  -- All urls: the url objects themselves, or the result of one method on each
  ['urls'] = {
    ['get_value'] = function(task, args)
      local found = task:get_urls()
      if not found[1] then
        return nil
      end

      local method = args[1]
      if not method then
        return found, 'userdata_list'
      end

      -- Invoke the named method on every url object
      local function call_method(r)
        return r[method](r)
      end
      return fun.map(call_method, found), 'string_list'
    end,
    ['description'] = [[Get list of all urls.
If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
e.g. `get_tld`]],
  },
  -- Get specific urls via lua_util.extract_specific_urls
  ['specific_urls'] = {
    ['get_value'] = function(task, args)
      -- Work on a per-call copy: writing `task`/`no_cache`/`filter` straight
      -- into args[1] would mutate the caller's table and leave a task reference
      -- in it after this call returns.
      -- NOTE(review): args presumably come from the parsed selector and are
      -- reused across tasks - confirm against the selectors core.
      local params = {}
      for k, v in pairs(args[1] or E) do
        params[k] = v
      end
      params.task = task
      params.no_cache = true

      if params.exclude_flags then
        -- Translate the exclude list into a predicate stored as `filter`
        params.filter = gen_exclude_flags_filter(params.exclude_flags)
      end

      local urls = lua_util.extract_specific_urls(params)
      if not urls[1] then
        return nil
      end
      return urls, 'userdata_list'
    end,
    ['description'] = [[Get most specific urls. Arguments are equal to the Lua API function]],
    ['args_schema'] = { ts.shape {
      limit = ts.number + ts.string / tonumber,
      esld_limit = (ts.number + ts.string / tonumber):is_optional(),
      exclude_flags = url_flags_ts,
      flags = url_flags_ts,
      flags_mode = ts.one_of { 'explicit' }:is_optional(),
      prefix = ts.string:is_optional(),
      need_content = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_emails = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_images = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      ignore_redirected = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
    } }
  },
  -- Get specific urls, keeping only those found in the map named by args[1]
  ['specific_urls_filter_map'] = {
    ['get_value'] = function(task, args)
      local map = maps[args[1]]
      if not map then
        lua_util.debugm(M, "invalid/unknown map: %s", args[1])
        -- Without a map there is nothing to filter against; bail out instead
        -- of failing later on `map:get_key`
        return nil
      end

      -- Work on a per-call copy so the caller's args[2] table is not mutated
      -- and does not keep a task reference after this call returns
      local params = {}
      for k, v in pairs(args[2] or E) do
        params[k] = v
      end
      params.task = task
      params.no_cache = true

      if params.exclude_flags then
        -- Translate the exclude list into a predicate stored as `filter`
        params.filter = gen_exclude_flags_filter(params.exclude_flags)
      end

      local urls = lua_util.extract_specific_urls(params)
      if not urls[1] then
        return nil
      end

      -- Keep only urls whose string form is a key of the map
      return fun.filter(function(u)
        return map:get_key(tostring(u))
      end, urls), 'userdata_list'
    end,
    ['description'] = [[Get most specific urls, filtered by some map. Arguments are equal to the Lua API function]],
    ['args_schema'] = { ts.string, ts.shape {
      limit = ts.number + ts.string / tonumber,
      esld_limit = (ts.number + ts.string / tonumber):is_optional(),
      exclude_flags = url_flags_ts,
      flags = url_flags_ts,
      flags_mode = ts.one_of { 'explicit' }:is_optional(),
      prefix = ts.string:is_optional(),
      need_content = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_emails = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_images = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      ignore_redirected = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
    } }
  },
  -- Urls selected through task:get_urls_filtered with the two flag lists
  ['urls_filtered'] = {
    ['get_value'] = function(task, args)
      local first_flags, second_flags = args[1], args[2]
      local found = task:get_urls_filtered(first_flags, second_flags)
      if found[1] then
        return found, 'userdata_list'
      end
      return nil
    end,
    ['description'] = [[Get list of all urls filtered by flags_include/exclude
(see rspamd_task:get_urls_filtered for description)]],
    ['args_schema'] = { ts.array_of {
      url_flags_ts:is_optional(), url_flags_ts:is_optional()
    } }
  },
  -- All emails: the url objects themselves, or the result of one method on each
  ['emails'] = {
    ['get_value'] = function(task, args)
      local found = task:get_emails()
      if not found[1] then
        return nil
      end

      local method = args[1]
      if not method then
        return found, 'userdata_list'
      end

      -- Invoke the named method on every email object
      local function call_method(r)
        return r[method](r)
      end
      return fun.map(call_method, found), 'string_list'
    end,
    ['description'] = [[Get list of all emails.
If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
e.g. `get_user`]],
  },
  -- Read a mempool variable. args[1] is the variable name, args[2] the
  -- optional variable type ('string' when omitted); the type is also used as
  -- the returned value type
  ['pool_var'] = {
    ['get_value'] = function(task, args)
      local var_type = args[2] or 'string'
      local value = task:get_mempool():get_variable(args[1], var_type)
      return value, var_type
    end,
    ['description'] = [[Get specific pool var. The first argument must be variable name,
the second argument is optional and defines the type (string by default)]],
    ['args_schema'] = { ts.string, ts.string:is_optional() }
  },
  -- Look up a key in the task cache
  ['task_cache'] = {
    ['get_value'] = function(task, args)
      local cached = task:cache_get(args[1])
      if not cached then
        return
      end

      if type(cached) ~= 'table' then
        return cached, 'string'
      end

      -- Tables are only returned when they have a first element
      if cached[1] then
        return cached, 'string_list'
      end
      return
    end,
    ['description'] = [[Get value of specific key from task cache. The first argument must be
the key name]],
    ['args_schema'] = { ts.string }
  },
  -- HTTP request header named by the first argument
  ['request_header'] = {
    ['get_value'] = function(task, args)
      local value = task:get_request_header(args[1])
      if not value then
        return nil
      end
      return value, 'string'
    end,
    ['description'] = [[Get specific HTTP request header.
The first argument must be header name.]],
    ['args_schema'] = { ts.string }
  },
  -- Get task date, optionally formatted
  ['time'] = {
    ['get_value'] = function(task, args)
      -- With no first argument the `Date` header time ('message') is used
      local what = args[1] or 'message'
      local dt = task:get_date { format = what, gmt = true }
      if dt then
        if args[2] then
          -- Should be in format !xxx, as dt is in GMT
          return os.date(args[2], dt), 'string'
        end
        return tostring(dt), 'string'
      end
      return nil
    end,
    -- The description states the default that the code above actually applies
    ['description'] = [[Get task timestamp. The first argument is type:
- `connect`: connection timestamp
- `message`: timestamp as defined by `Date` header (default)
The second argument is optional time format, see [os.date](http://pgl.yoyo.org/luai/i/os.date) description]],
    ['args_schema'] = { ts.one_of { 'connect', 'message' }:is_optional(),
                        ts.string:is_optional() }
  },
  -- Words of all text parts, flattened into a single list
  ['words'] = {
    ['get_value'] = function(task, args)
      local mode = args[1] or 'stem'
      local parts = task:get_text_parts()
      if not parts then
        return nil
      end

      -- Only the `full` mode yields tables instead of strings
      local rtype = 'string_list'
      if mode == 'full' then
        rtype = 'table_list'
      end

      local per_part = fun.map(function(p)
        return p:get_words(mode)
      end, parts)
      return lua_util.flatten(per_part), rtype
    end,
    ['description'] = [[Get words from text parts
- `stem`: stemmed words (default)
- `raw`: raw words
- `norm`: normalised words (lowercased)
- `full`: list of tables
]],
    ['args_schema'] = { ts.one_of { 'stem', 'raw', 'norm', 'full' }:is_optional() },
  },
  -- Queue ID of the task, if any
  ['queueid'] = {
    ['get_value'] = function(task)
      local qid = task:get_queue_id()
      if not qid then
        return nil
      end
      return qid, 'string'
    end,
    ['description'] = [[Get queue ID]],
  },
  -- ID of the task being processed, if any
  ['uid'] = {
    ['get_value'] = function(task)
      local task_uid = task:get_uid()
      if not task_uid then
        return nil
      end
      return task_uid, 'string'
    end,
    ['description'] = [[Get ID of the task being processed]],
  },
  -- Message ID of the task being processed, if any
  ['messageid'] = {
    ['get_value'] = function(task)
      local msg_id = task:get_message_id()
      if not msg_id then
        return nil
      end
      return msg_id, 'string'
    end,
    ['description'] = [[Get message ID]],
  },
  -- A single symbol result: args[1] is the symbol name, args[2] the optional
  -- shadow result name
  ['symbol'] = {
    ['get_value'] = function(task, args)
      local found = task:get_symbol(args[1], args[2])
      if not found then
        return
      end
      -- Only the first entry of the returned list is exposed
      return found[1], 'table'
    end,
    ['description'] = 'Get specific symbol. The first argument must be the symbol name. ' ..
        'The second argument is an optional shadow result name. ' ..
        'Returns the symbol table. See task:get_symbol()',
    ['args_schema'] = { ts.string, ts.string:is_optional() }
  },
  -- Get full scan result; args[1] is the optional shadow result name
  ['scan_result'] = {
    ['get_value'] = function(task, args)
      local res = task:get_metric_result(args[1])
      if res then
        return res, 'table'
      end
    end,
    -- The two description pieces are joined with an explicit ". " so the
    -- sentences no longer run together
    ['description'] = 'Get full scan result (either default or shadow if shadow result name is specified). ' ..
        'Returns the result table. See task:get_metric_result()',
    ['args_schema'] = { ts.string:is_optional() }
  },
  -- Metatokens of the message, each converted with tostring
  ['metatokens'] = {
    ['get_value'] = function(task)
      local tokens = meta_functions.gen_metatokens(task)
      if not tokens[1] then
        return nil
      end

      local as_strings = {}
      for idx, tok in ipairs(tokens) do
        as_strings[idx] = tostring(tok)
      end
      return as_strings, 'string_list'
    end,
    ['description'] = 'Get metatokens for a message as strings',
  },
  534. }
  535. return extractors