You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

extractors.lua 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local fun = require 'fun'
  14. local meta_functions = require "lua_meta"
  15. local lua_util = require "lua_util"
  16. local rspamd_util = require "rspamd_util"
  17. local rspamd_url = require "rspamd_url"
  18. local common = require "lua_selectors/common"
  19. local ts = require("tableshape").types
  20. local maps = require "lua_selectors/maps"
  -- Shared empty table: nil-safe fallback for `(x or E)[1]` style lookups
  local E = {}
  -- Module tag passed to lua_util.debugm
  local M = 'selectors'
  -- Value served by the `rspamd_hostname` extractor; read once at load time
  local HOSTNAME = rspamd_util.get_hostname()
  -- Schema for an optional array whose items must be keys of rspamd_url.flags
  local known_url_flags = lua_util.keys(rspamd_url.flags)
  local url_flags_ts = ts.array_of(ts.one_of(known_url_flags)):is_optional()
  --- Build a predicate that rejects urls carrying any of the listed flags.
  -- @param exclude_flags array of flag names checked against `u:get_flags()`
  -- @return function(u): `false` when `u` has at least one listed flag, `true` otherwise
  local function gen_exclude_flags_filter(exclude_flags)
    return function(u)
      local present = u:get_flags()
      local allowed = true
      for _, name in ipairs(exclude_flags) do
        if present[name] then
          -- One excluded flag is enough to drop the url
          allowed = false
          break
        end
      end
      return allowed
    end
  end
  36. local extractors = {
  -- Identity extractor: hands its first argument back unchanged
  ['id'] = {
    -- Returns args[1] typed as 'string', or '' when no argument was given
    get_value = function(_, args)
      local value = args[1]
      if not value then
        value = ''
      end
      return value, 'string'
    end,
    description = [[Return value from function's argument or an empty string,
For example, `id('Something')` returns a string 'Something']],
    args_schema = { ts.string:is_optional() }
  },
  -- Same idea as `id`, but produces a list out of all arguments
  ['list'] = {
    -- Returns the arguments mapped through tostring, or an empty list when
    -- there is no first argument
    get_value = function(_, args)
      if not args[1] then
        return {}, 'string_list'
      end
      return fun.map(tostring, args), 'string_list'
    end,
    description = [[Return a list from function's arguments or an empty list,
For example, `list('foo', 'bar')` returns a list {'foo', 'bar'}]],
  },
  -- Source IP address of the task
  ['ip'] = {
    -- Returns the address object typed as 'userdata'; nil when it is missing
    -- or fails `is_valid()`
    get_value = function(task)
      local addr = task:get_ip()
      if not addr or not addr:is_valid() then
        return nil
      end
      return addr, 'userdata'
    end,
    description = 'Get source IP address',
  },
  -- Sender address (MIME or SMTP)
  ['from'] = {
    -- A table of arguments is handed to task:get_from() as is; anything else
    -- falls back to 0. Returns the first address entry typed as 'table', or
    -- nil when there is no first entry with an `addr` field.
    get_value = function(task, args)
      local selector = 0
      if type(args) == 'table' then
        selector = args
      end
      local senders = task:get_from(selector)
      local first = (senders or E)[1]
      if first and first.addr then
        return first, 'table'
      end
      return nil
    end,
    description = [[Get MIME or SMTP from (e.g. `from('smtp')` or `from('mime')`,
uses any type by default)]],
  },
  -- Recipient addresses (MIME or SMTP)
  ['rcpts'] = {
    -- A table of arguments is handed to task:get_recipients() as is; anything
    -- else falls back to 0. Returns the whole recipients list typed as
    -- 'table_list', or nil when the first entry is missing or has no `addr`.
    get_value = function(task, args)
      local selector = 0
      if type(args) == 'table' then
        selector = args
      end
      local recipients = task:get_recipients(selector)
      local first = (recipients or E)[1]
      if first and first.addr then
        return recipients, 'table_list'
      end
      return nil
    end,
    description = [[Get MIME or SMTP rcpts (e.g. `rcpts('smtp')` or `rcpts('mime')`,
uses any type by default)]],
  },
  -- Country code stored in the task mempool (ASN module must be executed first)
  ['country'] = {
    -- Reads the 'country' mempool variable; nil when it is not set
    get_value = function(task)
      local country = task:get_mempool():get_variable('country')
      if country then
        return country, 'string'
      end
      return nil
    end,
    description = 'Get country (ASN module must be executed first)',
  },
  -- AS number stored in the task mempool (ASN module must be executed first)
  ['asn'] = {
    type = 'string',
    -- Reads the 'asn' mempool variable; nil when it is not set
    get_value = function(task)
      local as_number = task:get_mempool():get_variable('asn')
      if as_number then
        return as_number, 'string'
      end
      return nil
    end,
    description = 'Get AS number (ASN module must be executed first)',
  },
  -- Authenticated user name
  ['user'] = {
    -- Returns task:get_user() typed as 'string'; nil when there is no user
    get_value = function(task)
      local username = task:get_user()
      if username then
        return username, 'string'
      end
      return nil
    end,
    description = 'Get authenticated user name',
  },
  -- Principal recipient of the task
  ['to'] = {
    -- Always reports type 'string'; the value is whatever
    -- task:get_principal_recipient() yields (first result only)
    get_value = function(task)
      local recipient = task:get_principal_recipient()
      return recipient, 'string'
    end,
    description = 'Get principal recipient',
  },
  -- Content digest of the task
  ['digest'] = {
    -- Always reports type 'string'; the value is the first result of
    -- task:get_digest()
    get_value = function(task)
      local digest = task:get_digest()
      return digest, 'string'
    end,
    description = 'Get content digest',
  },
  -- Digests of every part flagged as an attachment
  ['attachments'] = {
    -- Walks task:get_parts() and collects one digest per attachment through
    -- common.get_cached_or_raw_digest(); `args` is forwarded untouched.
    -- Returns nil when nothing was collected.
    get_value = function(task, args)
      local digests = {}
      for idx, part in ipairs(task:get_parts() or E) do
        if part:is_attachment() then
          table.insert(digests, common.get_cached_or_raw_digest(task, idx, part, args))
        end
      end
      if #digests == 0 then
        return nil
      end
      return digests, 'string_list'
    end,
    description = [[Get list of all attachments digests.
The first optional argument is encoding (`hex`, `base32` (and forms `bleach32`, `rbase32`), `base64`),
the second optional argument is optional hash type (`blake2`, `sha256`, `sha1`, `sha512`, `md5`)]],
    args_schema = common.digest_schema()
  },
  -- File names of all parts that have one
  ['files'] = {
    -- Collects p:get_filename() for every part; nil when no part has a name
    get_value = function(task)
      local names = {}
      for _, part in ipairs(task:get_parts() or E) do
        local name = part:get_filename()
        if name then
          names[#names + 1] = name
        end
      end
      if #names == 0 then
        return nil
      end
      return names, 'string_list'
    end,
    description = 'Get all attachments files',
  },
  -- Languages reported by the text parts
  ['languages'] = {
    -- Collects p:get_language() for every text part; nil when none is reported
    get_value = function(task)
      local found = {}
      for _, part in ipairs(task:get_text_parts() or E) do
        local lang = part:get_language()
        if lang then
          found[#found + 1] = lang
        end
      end
      if #found == 0 then
        return nil
      end
      return found, 'string_list'
    end,
    description = 'Get languages for text parts',
  },
  -- HELO value of the task
  ['helo'] = {
    -- Always reports type 'string'; the value is the first result of
    -- task:get_helo()
    get_value = function(task)
      local helo = task:get_helo()
      return helo, 'string'
    end,
    description = 'Get helo value',
  },
  -- Header lookup by name (args[1]); args[2] optionally carries the
  -- `strong` and/or `full` flags
  ['header'] = {
    -- Without flags: task:get_header(name) typed as 'string'.
    -- With `full` in the flags: task:get_header_full(name, strong) typed as
    -- 'table_list'. Otherwise: task:get_header(name, strong) typed as 'string'.
    get_value = function(task, args)
      local name, flags = args[1], args[2]
      if not flags then
        return task:get_header(name), 'string'
      end
      -- `strong` is true only when the flags string contains 'strong'
      local strong = flags:match('strong') ~= nil
      if flags:match('full') then
        return task:get_header_full(name, strong), 'table_list'
      end
      return task:get_header(name, strong), 'string'
    end,
    description = [[Get header with the name that is expected as an argument.
The optional second argument accepts list of flags:
- `full`: returns all headers with this name with all data (like task:get_header_full())
- `strong`: use case sensitive match when matching header's name]],
    args_schema = {
      ts.string,
      (ts.pattern("strong") + ts.pattern("full")):is_optional()
    }
  },
  -- Received headers, either as tables or projected onto a single field
  ['received'] = {
    -- nil when the list is empty; the raw list typed as 'table_list' when no
    -- argument is given; otherwise a fun.map over the entries picking field
    -- args[1], typed as 'string_list'
    get_value = function(task, args)
      local hops = task:get_received_headers()
      if not hops[1] then
        return nil
      end
      local field = args[1]
      if not field then
        return hops, 'table_list'
      end
      return fun.map(function(hop)
        return hop[field]
      end, hops), 'string_list'
    end,
    description = [[Get list of received headers.
If no arguments specified, returns list of tables. Otherwise, selects a specific element,
e.g. `by_hostname`]],
  },
  -- All urls of the task, optionally mapped through a url method
  ['urls'] = {
    -- nil when there are no urls; the url objects typed as 'userdata_list'
    -- when no argument is given; otherwise a fun.map calling the method named
    -- by args[1] on each url, typed as 'string_list'
    get_value = function(task, args)
      local found = task:get_urls()
      if not found[1] then
        return nil
      end
      local method = args[1]
      if not method then
        return found, 'userdata_list'
      end
      return fun.map(function(u)
        return u[method](u)
      end, found), 'string_list'
    end,
    description = [[Get list of all urls.
If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
e.g. `get_tld`]],
  },
  -- Most specific urls, as selected by lua_util.extract_specific_urls()
  ['specific_urls'] = {
    -- args[1] (or a fresh table) is used as the parameter table: `task` and
    -- `no_cache` are set on it, and `exclude_flags`, when present, is turned
    -- into a `filter` predicate. Returns nil when no url was extracted.
    get_value = function(task, args)
      local opts = args[1] or {}
      opts.no_cache = true
      opts.task = task
      local excluded = opts.exclude_flags
      if excluded then
        opts.filter = gen_exclude_flags_filter(excluded)
      end
      local found = lua_util.extract_specific_urls(opts)
      if found[1] then
        return found, 'userdata_list'
      end
      return nil
    end,
    description = 'Get most specific urls. Arguments are equal to the Lua API function',
    args_schema = {
      ts.shape {
        limit = ts.number + ts.string / tonumber,
        esld_limit = (ts.number + ts.string / tonumber):is_optional(),
        exclude_flags = url_flags_ts,
        flags = url_flags_ts,
        flags_mode = ts.one_of { 'explicit' }:is_optional(),
        prefix = ts.string:is_optional(),
        need_content = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
        need_emails = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
        need_images = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
        ignore_redirected = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      }
    }
  },
  -- Most specific urls, restricted to those present in a named map
  ['specific_urls_filter_map'] = {
    -- args[1]: map name looked up in `maps`.
    -- args[2]: optional parameter table for lua_util.extract_specific_urls();
    --          `task`, `no_cache` and (when `exclude_flags` is set) `filter`
    --          are written into it.
    -- Returns a fun.filter over the extracted urls keeping those for which
    -- map:get_key(tostring(url)) is truthy, typed as 'userdata_list'; nil when
    -- the map is unknown or no url was extracted.
    ['get_value'] = function(task, args)
      local map = maps[args[1]]
      if not map then
        lua_util.debugm(M, "invalid/unknown map: %s", args[1])
        -- Nothing to filter against: stop here instead of indexing a nil map
        -- inside the filter callback below
        return nil
      end
      local params = args[2] or {}
      params.task = task
      params.no_cache = true
      if params.exclude_flags then
        params.filter = gen_exclude_flags_filter(params.exclude_flags)
      end
      local urls = lua_util.extract_specific_urls(params)
      if not urls[1] then
        return nil
      end
      return fun.filter(function(u)
        return map:get_key(tostring(u))
      end, urls), 'userdata_list'
    end,
    ['description'] = [[Get most specific urls, filtered by some map. Arguments are equal to the Lua API function]],
    ['args_schema'] = { ts.string, ts.shape {
      limit = ts.number + ts.string / tonumber,
      esld_limit = (ts.number + ts.string / tonumber):is_optional(),
      exclude_flags = url_flags_ts,
      flags = url_flags_ts,
      flags_mode = ts.one_of { 'explicit' }:is_optional(),
      prefix = ts.string:is_optional(),
      need_content = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_emails = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      need_images = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
      ignore_redirected = (ts.boolean + ts.string / lua_util.toboolean):is_optional(),
    } }
  },
  -- Urls selected by include/exclude flag lists
  ['urls_filtered'] = {
    -- Forwards args[1] and args[2] to task:get_urls_filtered(); nil when the
    -- result has no first element
    get_value = function(task, args)
      local found = task:get_urls_filtered(args[1], args[2])
      if found[1] then
        return found, 'userdata_list'
      end
      return nil
    end,
    description = [[Get list of all urls filtered by flags_include/exclude
(see rspamd_task:get_urls_filtered for description)]],
    -- NOTE(review): the schema declares a single array argument while
    -- get_value reads args[1] and args[2] - confirm this is intended
    args_schema = {
      ts.array_of {
        url_flags_ts:is_optional(), url_flags_ts:is_optional()
      }
    }
  },
  -- All emails of the task, optionally mapped through a url method
  ['emails'] = {
    -- nil when there are no emails; the objects typed as 'userdata_list' when
    -- no argument is given; otherwise a fun.map calling the method named by
    -- args[1] on each of them, typed as 'string_list'
    get_value = function(task, args)
      local found = task:get_emails()
      if not found[1] then
        return nil
      end
      local method = args[1]
      if not method then
        return found, 'userdata_list'
      end
      return fun.map(function(addr)
        return addr[method](addr)
      end, found), 'string_list'
    end,
    description = [[Get list of all emails.
If no arguments specified, returns list of url objects. Otherwise, calls a specific method,
e.g. `get_user`]],
  },
  -- Mempool variable lookup: args[1] is the variable name, args[2] the
  -- optional type ('string' when omitted)
  ['pool_var'] = {
    -- The requested type is passed to get_variable() and also reported as the
    -- type of the extracted value
    get_value = function(task, args)
      -- Named `var_type` so the builtin `type` function is not shadowed
      local var_type = args[2] or 'string'
      local value = task:get_mempool():get_variable(args[1], var_type)
      return value, var_type
    end,
    description = [[Get specific pool var. The first argument must be variable name,
the second argument is optional and defines the type (string by default)]],
    args_schema = { ts.string, ts.string:is_optional() }
  },
  -- Task cache lookup by key (args[1])
  ['task_cache'] = {
    -- Returns nothing for a missing value or a table without a first element;
    -- a table with elements is typed 'string_list', anything else 'string'
    get_value = function(task, args)
      local cached = task:cache_get(args[1])
      if not cached then
        return
      end
      if type(cached) ~= 'table' then
        return cached, 'string'
      end
      if cached[1] then
        return cached, 'string_list'
      end
      return
    end,
    description = [[Get value of specific key from task cache. The first argument must be
the key name]],
    args_schema = { ts.string }
  },
  -- HTTP request header lookup by name (args[1])
  ['request_header'] = {
    -- Returns task:get_request_header(name) typed as 'string'; nil when absent
    get_value = function(task, args)
      local value = task:get_request_header(args[1])
      if not value then
        return nil
      end
      return value, 'string'
    end,
    description = [[Get specific HTTP request header.
The first argument must be header name.]],
    args_schema = { ts.string }
  },
  -- Get task date, optionally formatted
  ['time'] = {
    -- args[1]: timestamp kind handed to task:get_date() as `format`
    --          ('message' when omitted).
    -- args[2]: optional os.date() format string.
    -- Returns the formatted date, or tostring() of the timestamp when no
    -- format is given; nil when task:get_date() yields nothing.
    ['get_value'] = function(task, args)
      local what = args[1] or 'message'
      local dt = task:get_date { format = what, gmt = true }
      if dt then
        if args[2] then
          -- Should be in format !xxx, as dt is in GMT
          return os.date(args[2], dt), 'string'
        end
        return tostring(dt), 'string'
      end
      return nil
    end,
    -- The default marker sits on `message` because that is what get_value
    -- falls back to when no type is supplied
    ['description'] = [[Get task timestamp. The first argument is type:
- `connect`: connection timestamp
- `message`: timestamp as defined by `Date` header (default)
The second argument is optional time format, see [os.date](http://pgl.yoyo.org/luai/i/os.date) description]],
    ['args_schema'] = { ts.one_of { 'connect', 'message' }:is_optional(),
                        ts.string:is_optional() }
  },
  -- Words of all text parts
  ['words'] = {
    -- args[1] selects the word form passed to p:get_words() ('stem' when
    -- omitted). The per-part results are flattened into one list, typed
    -- 'table_list' for 'full' and 'string_list' otherwise.
    get_value = function(task, args)
      local how = args[1] or 'stem'
      local parts = task:get_text_parts()
      if not parts then
        return nil
      end
      local rtype
      if how == 'full' then
        rtype = 'table_list'
      else
        rtype = 'string_list'
      end
      local per_part = fun.map(function(part)
        return part:get_words(how)
      end, parts)
      return lua_util.flatten(per_part), rtype
    end,
    description = [[Get words from text parts
- `stem`: stemmed words (default)
- `raw`: raw words
- `norm`: normalised words (lowercased)
- `full`: list of tables
]],
    args_schema = { ts.one_of { 'stem', 'raw', 'norm', 'full' }:is_optional() },
  },
  -- Queue ID of the task
  ['queueid'] = {
    -- Returns task:get_queue_id() typed as 'string'; nil when absent
    get_value = function(task)
      local id = task:get_queue_id()
      if not id then
        return nil
      end
      return id, 'string'
    end,
    description = 'Get queue ID',
  },
  -- ID of the task being processed
  ['uid'] = {
    -- Returns task:get_uid() typed as 'string'; nil when absent
    get_value = function(task)
      local id = task:get_uid()
      if not id then
        return nil
      end
      return id, 'string'
    end,
    description = 'Get ID of the task being processed',
  },
  -- Message ID of the task being processed
  ['messageid'] = {
    -- Returns task:get_message_id() typed as 'string'; nil when absent
    get_value = function(task)
      local id = task:get_message_id()
      if not id then
        return nil
      end
      return id, 'string'
    end,
    description = 'Get message ID',
  },
  -- Single symbol lookup
  ['symbol'] = {
    -- Forwards args[1] and args[2] to task:get_symbol() and returns the first
    -- element of the result typed as 'table'; returns nothing when the lookup
    -- yields no result
    get_value = function(task, args)
      local found = task:get_symbol(args[1], args[2])
      if not found then
        return
      end
      return found[1], 'table'
    end,
    description = 'Get specific symbol. The first argument must be the symbol name. ' ..
        'The second argument is an optional shadow result name. ' ..
        'Returns the symbol table. See task:get_symbol()',
    args_schema = { ts.string, ts.string:is_optional() }
  },
  -- Get full scan result
  ['scan_result'] = {
    -- Forwards args[1] to task:get_metric_result() and returns the result
    -- typed as 'table'; returns nothing when there is no result
    ['get_value'] = function(task, args)
      local res = task:get_metric_result(args[1])
      if res then
        return res, 'table'
      end
    end,
    -- The two string pieces are joined with '. ' so the sentences do not run
    -- together in the rendered description
    ['description'] = 'Get full scan result (either default or shadow if shadow result name is specified). ' ..
        'Returns the result table. See task:get_metric_result()',
    ['args_schema'] = { ts.string:is_optional() }
  },
  -- Metatokens of the message, rendered as strings
  ['metatokens'] = {
    -- Runs meta_functions.gen_metatokens(task) and converts every token with
    -- tostring(); nil when the token list is empty
    get_value = function(task)
      local tokens = meta_functions.gen_metatokens(task)
      if not tokens[1] then
        return nil
      end
      local as_strings = {}
      for _, token in ipairs(tokens) do
        as_strings[#as_strings + 1] = tostring(token)
      end
      return as_strings, 'string_list'
    end,
    description = 'Get metatokens for a message as strings',
  },
  -- Hostname captured in HOSTNAME when the module was loaded
  ['rspamd_hostname'] = {
    -- The task argument is not used; the value does not depend on the message
    get_value = function(_)
      return HOSTNAME, 'string'
    end,
    description = 'Get hostname of the filter server',
  },
  542. }
  543. return extractors