You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lua_meta.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local exports = {}
  14. local N = "metatokens"
  15. local ts = require("tableshape").types
  16. local logger = require "rspamd_logger"
  17. -- Metafunctions
  --- Bucket the message size and return the bucket position scaled to (0, 1].
  -- The first threshold that is not smaller than the message size selects the
  -- bucket; its 1-based index divided by the number of thresholds is returned.
  -- Messages larger than the last threshold yield 0.
  -- @param task object providing `get_size()`
  -- @return single-element array with the scaled bucket index
  local function meta_size_function(task)
    local thresholds = {
      100, 200, 500, 1000, 2000, 4000,
      10000, 20000, 30000,
      100000, 200000, 400000, 800000,
      1000000, 2000000, 8000000,
    }
    local nbuckets = #thresholds
    local msg_size = task:get_size()

    for idx, limit in ipairs(thresholds) do
      if limit >= msg_size then
        return { idx / nbuckets }
      end
    end

    -- Nothing matched: the message exceeds every threshold
    return { 0 }
  end
  --- Compute image related metatokens.
  -- The values are returned in the order declared by the `names` list of the
  -- corresponding `metafunctions` entry:
  --   1. total number of images (raw count, not normalised)
  --   2. share of png images
  --   3. share of jpeg images
  --   4. share of "large" images (width + height > 256)
  --   5. share of "small" images (positive dimensions, width + height <= 256)
  -- Images with a non-positive width or height count towards the total only.
  --
  -- NOTE(review): the previous implementation returned the jpeg share before
  -- the png share, so by-name lookups for 'npng_images' / 'njpeg_images' were
  -- swapped. Positions 2 and 3 of the positional vector change with this fix;
  -- consider bumping `exports.version` when it is deployed.
  -- @param task object providing `get_images()`
  -- @return array of five numbers as described above
  local function meta_images_function(task)
    local images = task:get_images()
    local ntotal = 0
    local npng = 0
    local njpg = 0
    local nlarge = 0
    local nsmall = 0

    if images then
      for _, img in ipairs(images) do
        local img_type = img:get_type()
        if img_type == 'png' then
          npng = npng + 1
        elseif img_type == 'jpeg' then
          njpg = njpg + 1
        end

        -- Treat missing dimensions as zero so such images are only counted
        local w = img:get_width() or 0
        local h = img:get_height() or 0
        if w > 0 and h > 0 then
          if w + h > 256 then
            nlarge = nlarge + 1
          else
            nsmall = nsmall + 1
          end
        end

        ntotal = ntotal + 1
      end
    end

    if ntotal > 0 then
      npng = npng / ntotal
      njpg = njpg / ntotal
      nlarge = nlarge / ntotal
      nsmall = nsmall / ntotal
    end

    return { ntotal, npng, njpg, nlarge, nsmall }
  end
  --- Compute the share of text parts and attachments among the MIME parts.
  -- The denominator is the number of entries returned by `get_parts()` plus
  -- one, so it is never zero even for a message without parts.
  -- @param task object providing `get_text_parts()` and `get_parts()`
  -- @return { text parts share, attachments share }
  local function meta_nparts_function(task)
    local text_parts = task:get_text_parts()
    local text_count = text_parts and #text_parts or 0

    local attachment_count = 0
    local denominator = 1
    for _, part in ipairs(task:get_parts() or {}) do
      if part:is_attachment() then
        attachment_count = attachment_count + 1
      end
      denominator = denominator + 1
    end

    return { text_count / denominator, attachment_count / denominator }
  end
  --- Compute the share of UTF and non-UTF text parts.
  -- @param task object providing `get_text_parts()`; each part must provide
  --   `is_utf()`
  -- @return { utf share, non-utf share }, or { 0, 0 } without text parts
  local function meta_encoding_function(task)
    local text_parts = task:get_text_parts()
    if not text_parts or #text_parts == 0 then
      return { 0, 0 }
    end

    local utf_count, other_count = 0, 0
    for _, part in ipairs(text_parts) do
      if part:is_utf() then
        utf_count = utf_count + 1
      else
        other_count = other_count + 1
      end
    end

    local total = #text_parts
    return { utf_count / total, other_count / total }
  end
  --- Compute inverse recipient counts for the MIME and SMTP recipient lists.
  -- A list that is absent or empty yields 0, otherwise 1 / (number of entries).
  -- @param task object providing `has_recipients(kind)` and
  --   `get_recipients(kind)`
  -- @return { inverse mime count, inverse smtp count }
  local function meta_recipients_function(task)
    local function inverse_count(kind)
      if not task:has_recipients(kind) then
        return 0
      end
      local count = #(task:get_recipients(kind))
      if count > 0 then
        return 1.0 / count
      end
      return count
    end

    -- Evaluate in a fixed order: mime first, then smtp
    local mime_factor = inverse_count('mime')
    local smtp_factor = inverse_count('smtp')
    return { mime_factor, smtp_factor }
  end
  --- Compute metatokens derived from the Received headers of a message.
  -- Headers flagged as 'artificial' are ignored. For the remaining ones:
  --   1. count factor: 1 / number of headers
  --   2. invalid factor: share of headers without `by_hostname`
  --   3. time factor: 1 / sum of absolute differences between consecutive
  --      timestamps (0 when that sum is 0)
  --   4. secure factor: share of headers flagged 'ssl' or 'authenticated'
  -- @param task object providing `get_received_headers()`
  -- @return { count factor, invalid factor, time factor, secure factor }
  local function meta_received_function(task)
    local received = task:get_received_headers()
    local fun = require "fun"

    local count_factor = 0
    local invalid_factor = 0
    local time_factor = 0
    local secure_factor = 0

    if not received or #received == 0 then
      return { count_factor, invalid_factor, time_factor, secure_factor }
    end

    local seen = 0.0
    local prev_stamp = 0

    local function is_genuine(hdr)
      return not (hdr.flags and hdr.flags['artificial'])
    end

    local function account(hdr)
      seen = seen + 1.0

      if not hdr.by_hostname then
        invalid_factor = invalid_factor + 1.0
      end

      local stamp = hdr.timestamp
      if stamp then
        -- The very first timestamp only initialises the reference point
        if prev_stamp ~= 0 then
          time_factor = time_factor + math.abs(prev_stamp - stamp)
        end
        prev_stamp = stamp
      end

      local flags = hdr.flags
      if flags and (flags['ssl'] or flags['authenticated']) then
        secure_factor = secure_factor + 1.0
      end
    end

    -- The filter call must stay the last argument so that everything it
    -- returns is forwarded to each()
    fun.each(account, fun.filter(is_genuine, received))

    if seen > 0 then
      invalid_factor = invalid_factor / seen
      secure_factor = secure_factor / seen
      count_factor = 1.0 / seen
    end

    if time_factor ~= 0 then
      time_factor = 1.0 / time_factor
    end

    return { count_factor, invalid_factor, time_factor, secure_factor }
  end
  --- Compute the inverse of the number of URLs in a message.
  -- @param task object whose `has_urls()` returns a flag and a count
  -- @return { 1 / count } when URLs are present, { 0 } otherwise
  local function meta_urls_function(task)
    local present, url_count = task:has_urls()

    if not present or not (url_count > 0) then
      return { 0 }
    end

    return { 1.0 / url_count }
  end
  --- Compute word and character statistics metatokens.
  -- The values are returned in the order declared by the `names` list of the
  -- corresponding `metafunctions` entry:
  --   1. average word length bucket, scaled to (0, 1] (0 if above all buckets)
  --   2. short words value read from the task mempool
  --   3..9. per-character rates (spaces, double spaces, non spaces, ascii,
  --         non-ascii, capital letters, numeric), averaged over text parts
  --
  -- NOTE(review): the previous implementation returned the short words value
  -- before the average length bucket, so by-name lookups for 'avg_words_len' /
  -- 'nshort_words' were swapped. Positions 1 and 2 of the positional vector
  -- change with this fix; consider bumping `exports.version` when deployed.
  -- @param task object providing `get_mempool()` and `get_text_parts()`
  -- @return array of nine numbers as described above
  local function meta_words_function(task)
    local pool = task:get_mempool()
    local avg_len = pool:get_variable("avg_words_len", "double") or 0.0
    local short_words = pool:get_variable("short_words_cnt", "double") or 0.0

    local lens = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20 }
    local ret_len = 0
    for i = 1, #lens do
      if lens[i] >= avg_len then
        ret_len = i / #lens
        break
      end
    end

    -- Keys of the per-part statistics table, in output order
    local stat_keys = {
      'spaces',
      'double_spaces',
      'non_spaces',
      'ascii_characters',
      'non_ascii_characters',
      'capital_letters',
      'numeric_characters',
    }

    -- A task without text parts must not raise an error here
    local tp = task:get_text_parts() or {}

    local wres = {}
    for i = 1, #stat_keys do
      wres[i] = 0
    end

    for _, p in ipairs(tp) do
      local stats = p:get_stats()
      local len = p:get_length()
      if stats and len and len > 0 then
        for i, key in ipairs(stat_keys) do
          wres[i] = wres[i] + (stats[key] or 0) / len
        end
      end
    end

    -- Average over all text parts (including empty ones)
    local divisor = 1.0
    if #tp > 0 then
      divisor = #tp
    end

    local ret = { ret_len, short_words }
    for _, wr in ipairs(wres) do
      ret[#ret + 1] = wr / divisor
    end

    return ret
  end
  -- Registry of metafunctions. Each entry has:
  --   cb          - callback taking a task and returning an array of numbers
  --   ninputs     - number of values the callback contributes
  --   names       - names of those values; names[i] is paired with the i-th
  --                 value returned by cb, so the order here must follow the
  --                 order of the callback results
  --   description - human readable description
  local metafunctions = {
    {
      cb = meta_size_function,
      ninputs = 1,
      names = {
        "size"
      },
      description = 'Describes size of the message',
    },
    {
      cb = meta_images_function,
      ninputs = 5,
      -- 1 - number of images,
      -- 2 - number of png images,
      -- 3 - number of jpeg images
      -- 4 - number of large images (> 128 x 128)
      -- 5 - number of small images (< 128 x 128)
      names = {
        'nimages',
        'npng_images',
        'njpeg_images',
        'nlarge_images',
        'nsmall_images'
      },
      description = [[Functions for images matching:
- number of images,
- number of png images,
- number of jpeg images
- number of large images (> 128 x 128)
- number of small images (< 128 x 128)
]]
    },
    {
      cb = meta_nparts_function,
      ninputs = 2,
      -- 1 - number of text parts
      -- 2 - number of attachments
      names = {
        'ntext_parts',
        'nattachments'
      },
      -- Previously said "images matching", copied from the entry above
      description = [[Functions for parts matching:
- number of text parts
- number of attachments
]]
    },
    {
      cb = meta_encoding_function,
      ninputs = 2,
      -- 1 - number of utf parts
      -- 2 - number of non-utf parts
      names = {
        'nutf_parts',
        'nascii_parts'
      },
      description = [[Functions for encoding matching:
- number of utf parts
- number of non-utf parts
]]
    },
    {
      cb = meta_recipients_function,
      ninputs = 2,
      -- 1 - number of mime rcpt
      -- 2 - number of smtp rcpt
      names = {
        'nmime_rcpt',
        'nsmtp_rcpt'
      },
      description = [[Functions for recipients data matching:
- number of mime rcpt
- number of smtp rcpt
]]
    },
    {
      cb = meta_received_function,
      ninputs = 4,
      names = {
        'nreceived',
        'nreceived_invalid',
        'nreceived_bad_time',
        'nreceived_secure'
      },
      description = [[Functions for received headers data matching:
- number of received headers
- number of bad received headers
- number of skewed time received headers
- number of received via secured relays
]]
    },
    {
      cb = meta_urls_function,
      ninputs = 1,
      names = {
        'nurls'
      },
      description = [[Functions for urls data matching:
- number of urls
]]
    },
    {
      cb = meta_words_function,
      ninputs = 9,
      names = {
        'avg_words_len',
        'nshort_words',
        'spaces_rate',
        'double_spaces_rate',
        'non_spaces_rate',
        'ascii_characters_rate',
        'non_ascii_characters_rate',
        'capital_characters_rate',
        'numeric_characters'
      },
      description = [[Functions for words data matching:
- average length of the words
- number of short words
- rate of spaces in the text
- rate of multiple spaces
- rate of non space characters
- rate of ascii characters
- rate of non-ascii characters
- rate of capital letters
- rate of numbers
]]
    },
  }
  -- Shape that a metafunction descriptor must satisfy: a callback, the number
  -- of values it produces, the list of their names and an optional description
  local meta_schema = ts.shape({
    names = ts.array_of(ts.string),
    ninputs = ts.number,
    cb = ts.func,
    description = ts.string:is_optional(),
  })
  -- Maps a metatoken name to a function(task) returning that single value
  local metatokens_by_name = {}

  --- Rebuild `metatokens_by_name` from the current `metafunctions` list.
  -- Every generated accessor runs the whole callback of its metafunction and
  -- picks the value at the position of the requested name.
  local function fill_metatokens_by_name()
    local registry = {}
    -- Publish the fresh table first, then populate it through the alias
    metatokens_by_name = registry

    for _, descriptor in ipairs(metafunctions) do
      for slot = 1, descriptor.ninputs do
        registry[descriptor.names[slot]] = function(task)
          local values = descriptor.cb(task)
          return values[slot]
        end
      end
    end
  end
  --- Recompute `exports.digest` as the hex hash of all metatoken names.
  -- Names are fed to the hash in registration order, the first `ninputs`
  -- names of every metafunction.
  local function calculate_digest()
    local hasher = require("rspamd_cryptobox_hash").create()

    for _, descriptor in ipairs(metafunctions) do
      local names = descriptor.names
      for slot = 1, descriptor.ninputs do
        hasher:update(names[slot])
      end
    end

    exports.digest = hasher:hex()
  end
  --- Generate the array of metatokens for a task.
  -- Without `names` all metafunctions are evaluated in registration order and
  -- the result is stored in (and later served from) the task cache under the
  -- 'metatokens' key. With `names` only the listed metatokens are computed, in
  -- the given order; unknown names are logged and skipped and the task cache
  -- is not used. NaN and +infinity values are logged and replaced with 0.
  -- @param task task object
  -- @param names optional array of metatoken names
  -- @return array of numbers
  local function rspamd_gen_metatokens(task, names)
    local lua_util = require "lua_util"

    if names then
      -- Selective mode: resolve every requested name individually
      local selected = {}
      for _, n in ipairs(names) do
        local getter = metatokens_by_name[n]
        if getter then
          local tok = getter(task)
          if tok ~= tok or tok == math.huge then
            logger.errx(task, 'metatoken %s returned %s; replace it with 0 for sanity',
                n, tok)
            tok = 0.0
          end
          selected[#selected + 1] = tok
        else
          logger.errx(task, 'unknown metatoken: %s', n)
        end
      end
      return selected
    end

    local cached = task:cache_get('metatokens')
    if cached then
      return cached
    end

    local collected = {}
    for _, descriptor in ipairs(metafunctions) do
      for slot, tok in ipairs(descriptor.cb(task)) do
        lua_util.debugm(N, task, "metatoken: %s = %s",
            descriptor.names[slot], tok)
        if tok ~= tok or tok == math.huge then
          logger.errx(task, 'metatoken %s returned %s; replace it with 0 for sanity',
              descriptor.names[slot], tok)
          tok = 0.0
        end
        collected[#collected + 1] = tok
      end
    end
    task:cache_set('metatokens', collected)

    return collected
  end

  exports.rspamd_gen_metatokens = rspamd_gen_metatokens
  exports.gen_metatokens = rspamd_gen_metatokens
  --- Generate all metatokens for a task as a name -> value table.
  -- Every metafunction is evaluated; NaN and +infinity values are logged and
  -- replaced with 0. The task cache is not consulted.
  -- @param task task object
  -- @return table keyed by metatoken name
  local function rspamd_gen_metatokens_table(task)
    local by_name = {}

    for _, descriptor in ipairs(metafunctions) do
      local values = descriptor.cb(task)
      for slot, value in ipairs(values) do
        if value ~= value or value == math.huge then
          logger.errx(task, 'metatoken %s returned %s; replace it with 0 for sanity',
              descriptor.names[slot], value)
          value = 0.0
        end
        by_name[descriptor.names[slot]] = value
      end
    end

    return by_name
  end

  exports.rspamd_gen_metatokens_table = rspamd_gen_metatokens_table
  exports.gen_metatokens_table = rspamd_gen_metatokens_table
  --- Return the total number of metatokens, i.e. the sum of `ninputs` over
  -- all registered metafunctions.
  -- @return number of metatokens
  local function rspamd_count_metatokens()
    local sum = 0
    for idx = 1, #metafunctions do
      sum = sum + metafunctions[idx].ninputs
    end
    return sum
  end

  exports.rspamd_count_metatokens = rspamd_count_metatokens
  exports.count_metatokens = rspamd_count_metatokens
  exports.version = 1 -- MUST be increased on each change of metatokens

  --- Register an additional metafunction.
  -- The descriptor is validated against `meta_schema` and must supply at
  -- least `ninputs` names. An invalid descriptor is logged and ignored, and
  -- the registry is left untouched. On success the by-name lookup table and
  -- the digest are rebuilt.
  -- @param tbl descriptor with `cb`, `ninputs`, `names` and optional
  --   `description`
  exports.add_metafunction = function(tbl)
    local ret, err = meta_schema(tbl)
    if not ret then
      logger.errx('cannot add metafunction: %s', err)
      return
    end

    -- The schema does not relate `names` to `ninputs`. A short names list
    -- would otherwise be inserted and then break the rebuild below with a
    -- nil table index, after the lookup table has already been reset.
    if #tbl.names < tbl.ninputs then
      logger.errx('cannot add metafunction: %s names given for %s inputs',
          #tbl.names, tbl.ninputs)
      return
    end

    table.insert(metafunctions, tbl)
    fill_metatokens_by_name()
    calculate_digest()
  end

  -- Initialise the derived state for the built-in metafunctions
  fill_metatokens_by_name()
  calculate_digest()

  return exports