Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

lua_meta.lua 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local exports = {}
  14. local N = "metatokens"
  15. local ts = require("tableshape").types
  16. local logger = require "rspamd_logger"
  17. -- Metafunctions
  -- Maps the message size onto a bucketed rank.
  -- `task` must provide `get_size()`.
  -- Returns a one-element table: the 1-based index of the first threshold
  -- that is >= the size, divided by the number of thresholds; {0} when the
  -- size exceeds every threshold.
  local function meta_size_function(task)
    local thresholds = {
      100, 200, 500, 1000,
      2000, 4000, 10000, 20000,
      30000, 100000, 200000, 400000,
      800000, 1000000, 2000000, 8000000,
    }
    local nthresholds = #thresholds
    local msg_size = task:get_size()

    for rank, limit in ipairs(thresholds) do
      if limit >= msg_size then
        return { (1.0 * rank) / nthresholds }
      end
    end

    -- Larger than the biggest bucket
    return { 0 }
  end
  -- Summarises the images found in a message.
  -- `task` must provide `get_images()`, which may return nil.
  -- Returns a five-element table, in this order:
  --   1. number of images
  --   2. share of images whose type is 'jpeg'
  --   3. share of images whose type is 'png'
  --   4. share of images with width + height above 256
  --   5. share of the remaining images with positive width and height
  -- Images with a non-positive width or height count towards the total only.
  -- When there are no images every element is 0.
  local function meta_images_function(task)
    local total, jpeg, png, large, small = 0, 0, 0, 0, 0

    for _, image in ipairs(task:get_images() or {}) do
      local kind = image:get_type()
      if kind == 'png' then
        png = png + 1
      elseif kind == 'jpeg' then
        jpeg = jpeg + 1
      end

      local width = image:get_width()
      local height = image:get_height()
      if width > 0 and height > 0 then
        if width + height > 256 then
          large = large + 1
        else
          small = small + 1
        end
      end

      total = total + 1
    end

    if total == 0 then
      return { total, jpeg, png, large, small }
    end

    -- Everything but the total is reported as a fraction of the total
    return {
      total,
      1.0 * jpeg / total,
      1.0 * png / total,
      1.0 * large / total,
      1.0 * small / total,
    }
  end
  -- Relates the number of text parts and attachments to the number of parts.
  -- `task` must provide `get_text_parts()` and `get_parts()`; either may
  -- return nil.
  -- Returns { text parts / d, attachments / d } where d is the number of
  -- parts plus one (the counter starts at 1, so d is never zero).
  local function meta_nparts_function(task)
    local text_parts = task:get_text_parts()
    local text_count = 0
    if text_parts then
      text_count = #text_parts
    end

    local attachment_count = 0
    local denominator = 1
    local all_parts = task:get_parts()
    if all_parts then
      for _, part in ipairs(all_parts) do
        denominator = denominator + 1
        if part:is_attachment() then
          attachment_count = attachment_count + 1
        end
      end
    end

    return {
      (1.0 * text_count) / denominator,
      (1.0 * attachment_count) / denominator,
    }
  end
  -- Reports which share of the text parts is UTF and which is not.
  -- `task` must provide `get_text_parts()`; each part must provide
  -- `is_utf()`.
  -- Returns { utf share, non-utf share }, or {0, 0} when there are no text
  -- parts.
  local function meta_encoding_function(task)
    local text_parts = task:get_text_parts()
    if not (text_parts and #text_parts > 0) then
      return { 0, 0 }
    end

    local utf_count, other_count = 0, 0
    for _, part in ipairs(text_parts) do
      if part:is_utf() then
        utf_count = utf_count + 1
      else
        other_count = other_count + 1
      end
    end

    local nparts = #text_parts
    return { utf_count / nparts, other_count / nparts }
  end
  -- Reports the inverse of the MIME and SMTP recipient counts.
  -- `task` must provide `has_recipients(kind)` and `get_recipients(kind)`.
  -- Returns { 1 / mime count, 1 / smtp count }; an entry is 0 when the task
  -- has no recipients of that kind.
  local function meta_recipients_function(task)
    local function inverse_count(kind)
      if not task:has_recipients(kind) then
        return 0
      end
      local count = #(task:get_recipients(kind))
      if count > 0 then
        return 1.0 / count
      end
      return count
    end

    -- Evaluate in a fixed order: mime first, then smtp
    local mime_factor = inverse_count('mime')
    local smtp_factor = inverse_count('smtp')

    return { mime_factor, smtp_factor }
  end
  -- Derives four factors from the Received headers of a message.
  -- `task` must provide `get_received_headers()`. Headers whose
  -- `flags.artificial` is set are ignored.
  -- Returns { count, invalid, time, secure }:
  --   count   - 1 / number of considered headers
  --   invalid - share of headers without `by_hostname`
  --   time    - 1 / sum of absolute differences between consecutive
  --             timestamps (0 when that sum is 0)
  --   secure  - share of headers flagged `ssl` or `authenticated`
  -- All four are 0 when there are no headers.
  local function meta_received_function(task)
    local received = task:get_received_headers()
    local fun = require "fun"

    if not (received and #received > 0) then
      return { 0, 0, 0, 0 }
    end

    local hops = 0.0
    local invalid_hops = 0
    local secure_hops = 0
    local time_skew = 0
    local previous_ts = 0

    local function is_real(rc)
      return not (rc.flags and rc.flags['artificial'])
    end

    local function account(rc)
      hops = hops + 1.0

      if not rc.by_hostname then
        invalid_hops = invalid_hops + 1.0
      end

      local ts = rc.timestamp
      if ts then
        -- The first timestamp seen only seeds the comparison
        if previous_ts ~= 0 then
          time_skew = time_skew + math.abs(previous_ts - ts)
        end
        previous_ts = ts
      end

      local flags = rc.flags
      if flags and (flags['ssl'] or flags['authenticated']) then
        secure_hops = secure_hops + 1.0
      end
    end

    fun.each(account, fun.filter(is_real, received))

    local count_factor = 0
    if hops > 0 then
      invalid_hops = invalid_hops / hops
      secure_hops = secure_hops / hops
      count_factor = 1.0 / hops
    end

    if time_skew ~= 0 then
      time_skew = 1.0 / time_skew
    end

    return { count_factor, invalid_hops, time_skew, secure_hops }
  end
  -- Reports the inverse of the number of URLs in a message.
  -- `task` must provide `has_urls()`, which returns a flag and a count.
  -- Returns { 1 / count }, or {0} when there are no URLs.
  local function meta_urls_function(task)
    local found, url_count = task:has_urls()

    if not (found and url_count > 0) then
      return { 0 }
    end

    return { 1.0 / url_count }
  end
  -- Computes word and character statistics for the text parts of a message.
  --
  -- `task` must provide `get_mempool()` and `get_text_parts()`; each text
  -- part must provide `get_stats()` and `get_length()`.
  --
  -- Returns a nine-element table, in this order:
  --   1. the "short_words_cnt" mempool variable (0.0 when unset)
  --   2. a bucketed rank of the "avg_words_len" mempool variable: the index
  --      of the first threshold >= the value divided by the number of
  --      thresholds, or 0 when the value exceeds every threshold
  --   3. spaces rate
  --   4. double spaces rate
  --   5. non-spaces rate
  --   6. ascii characters rate
  --   7. non-ascii characters rate
  --   8. capital letters rate
  --   9. numeric characters rate
  -- Rates are computed per part (stat / part length, parts of zero length
  -- contribute nothing) and then averaged over the number of text parts.
  --
  -- A task whose `get_text_parts()` returns nil is treated like a task with
  -- no text parts (all rates are 0) instead of raising an error.
  local function meta_words_function(task)
    local mempool = task:get_mempool()
    local avg_len = mempool:get_variable("avg_words_len", "double") or 0.0
    local short_words = mempool:get_variable("short_words_cnt", "double") or 0.0

    local lens = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20 }
    local ret_len = 0
    for i = 1, #lens do
      if lens[i] >= avg_len then
        ret_len = (1.0 * i) / #lens
        break
      end
    end

    -- Keys of the per-part stats table, in output order
    local stat_keys = {
      'spaces',
      'double_spaces',
      'non_spaces',
      'ascii_characters',
      'non_ascii_characters',
      'capital_letters',
      'numeric_characters',
    }

    local wres = {}
    for i = 1, #stat_keys do
      wres[i] = 0
    end

    -- Guard against a nil part list, as the other metafunctions do
    local tp = task:get_text_parts() or {}

    for _, p in ipairs(tp) do
      local stats = p:get_stats()
      local len = p:get_length()
      if len > 0 then
        for i, key in ipairs(stat_keys) do
          wres[i] = wres[i] + stats[key] / len
        end
      end
    end

    local divisor = 1.0
    if #tp > 0 then
      divisor = #tp
    end

    local ret = { short_words, ret_len }
    for _, wr in ipairs(wres) do
      table.insert(ret, wr / divisor)
    end

    return ret
  end
  -- Registry of metafunctions. Each entry has:
  --   cb          - callback taking a task and returning a list of numbers
  --   ninputs     - how many numbers the callback contributes
  --   names       - one label per returned number, in the order the callback
  --                 returns them
  --   description - human readable summary
  --
  -- The labels must follow the order in which the callback returns its
  -- values, because by-name lookups pair names[i] with result[i]. Note that
  -- the module digest is computed from these names, so renaming or reordering
  -- them changes `exports.digest`.
  local metafunctions = {
    {
      cb = meta_size_function,
      ninputs = 1,
      names = {
        "size"
      },
      description = 'Describes size of the message',
    },
    {
      cb = meta_images_function,
      ninputs = 5,
      -- 1 - number of images,
      -- 2 - rate of jpeg images
      -- 3 - rate of png images
      -- 4 - rate of large images (> 128 x 128)
      -- 5 - rate of small images (< 128 x 128)
      names = {
        'nimages',
        'njpeg_images',
        'npng_images',
        'nlarge_images',
        'nsmall_images'
      },
      description = 'Functions for images matching:\n' ..
        '- number of images,\n' ..
        '- number of jpeg images,\n' ..
        '- number of png images\n' ..
        '- number of large images (> 128 x 128)\n' ..
        '- number of small images (< 128 x 128)\n'
    },
    {
      cb = meta_nparts_function,
      ninputs = 2,
      -- 1 - number of text parts
      -- 2 - number of attachments
      names = {
        'ntext_parts',
        'nattachments'
      },
      description = 'Functions for parts matching:\n' ..
        '- number of text parts\n' ..
        '- number of attachments\n'
    },
    {
      cb = meta_encoding_function,
      ninputs = 2,
      -- 1 - number of utf parts
      -- 2 - number of non-utf parts
      names = {
        'nutf_parts',
        'nascii_parts'
      },
      description = 'Functions for encoding matching:\n' ..
        '- number of utf parts\n' ..
        '- number of non-utf parts\n'
    },
    {
      cb = meta_recipients_function,
      ninputs = 2,
      -- 1 - number of mime rcpt
      -- 2 - number of smtp rcpt
      names = {
        'nmime_rcpt',
        'nsmtp_rcpt'
      },
      description = 'Functions for recipients data matching:\n' ..
        '- number of mime rcpt\n' ..
        '- number of smtp rcpt\n'
    },
    {
      cb = meta_received_function,
      ninputs = 4,
      names = {
        'nreceived',
        'nreceived_invalid',
        'nreceived_bad_time',
        'nreceived_secure'
      },
      description = 'Functions for received headers data matching:\n' ..
        '- number of received headers\n' ..
        '- number of bad received headers\n' ..
        '- number of skewed time received headers\n' ..
        '- number of received via secured relays\n'
    },
    {
      cb = meta_urls_function,
      ninputs = 1,
      names = {
        'nurls'
      },
      description = 'Functions for urls data matching:\n' ..
        '- number of urls\n'
    },
    {
      cb = meta_words_function,
      ninputs = 9,
      -- The callback returns the short words value first and the bucketed
      -- average word length second
      names = {
        'nshort_words',
        'avg_words_len',
        'spaces_rate',
        'double_spaces_rate',
        'non_spaces_rate',
        'ascii_characters_rate',
        'non_ascii_characters_rate',
        'capital_characters_rate',
        'numeric_characters'
      },
      description = 'Functions for words data matching:\n' ..
        '- number of short words\n' ..
        '- average length of the words\n' ..
        '- rate of spaces in the text\n' ..
        '- rate of multiple spaces\n' ..
        '- rate of non space characters\n' ..
        '- rate of ascii characters\n' ..
        '- rate of non-ascii characters\n' ..
        '- rate of capital letters\n' ..
        '- rate of numbers\n'
    },
  }
  -- Shape a metafunction descriptor has to satisfy: a callback, the number
  -- of values it contributes, a list of string labels and an optional
  -- description.
  local meta_schema = ts.shape({
    cb = ts.func,
    names = ts.array_of(ts.string),
    ninputs = ts.number,
    description = ts.string:is_optional(),
  })
  -- Lookup table: metatoken name -> function(task) returning that one value
  local metatokens_by_name = {}

  -- Rebuilds `metatokens_by_name` from the registered metafunctions.
  -- The getter stored for the name at position `pos` of a metafunction runs
  -- the whole callback and picks result number `pos`.
  local function fill_metatokens_by_name()
    metatokens_by_name = {}

    for _, mt in ipairs(metafunctions) do
      for pos = 1, mt.ninputs do
        local label = mt.names[pos]
        metatokens_by_name[label] = function(task)
          return mt.cb(task)[pos]
        end
      end
    end
  end
  -- Recomputes `exports.digest`: a hex hash fed with the first `ninputs`
  -- names of every registered metafunction, in registration order.
  local function calculate_digest()
    local hasher = require("rspamd_cryptobox_hash").create()

    for _, mt in ipairs(metafunctions) do
      local labels = mt.names
      for pos = 1, mt.ninputs do
        hasher:update(labels[pos])
      end
    end

    exports.digest = hasher:hex()
  end
  -- Generates metatokens for a task.
  --
  -- With `names` (a list of metatoken names) only those tokens are computed,
  -- in the given order; unknown names are logged and skipped, and nothing is
  -- cached.
  -- Without `names` the values of all registered metafunctions are returned
  -- as one flat list; the list is stored in the task cache under
  -- 'metatokens' and reused on later calls.
  --
  -- In both modes a NaN or +infinity value is logged and replaced by 0.
  local function rspamd_gen_metatokens(task, names)
    local lua_util = require "lua_util"

    if names then
      local selected = {}
      for _, name in ipairs(names) do
        local getter = metatokens_by_name[name]
        if getter then
          local value = getter(task)
          -- value ~= value is true only for NaN
          if value ~= value or value == math.huge then
            logger.errx(task, 'metatoken %s returned %s; replace it with 0 for sanity',
                name, value)
            value = 0.0
          end
          table.insert(selected, value)
        else
          logger.errx(task, 'unknown metatoken: %s', name)
        end
      end
      return selected
    end

    local cached = task:cache_get('metatokens')
    if cached then
      return cached
    end

    local all_tokens = {}
    for _, mt in ipairs(metafunctions) do
      for pos, value in ipairs(mt.cb(task)) do
        lua_util.debugm(N, task, "metatoken: %s = %s",
            mt.names[pos], value)
        if value ~= value or value == math.huge then
          logger.errx(task, 'metatoken %s returned %s; replace it with 0 for sanity',
              mt.names[pos], value)
          value = 0.0
        end
        table.insert(all_tokens, value)
      end
    end
    task:cache_set('metatokens', all_tokens)

    return all_tokens
  end
  -- The generator is exported under a short and a prefixed name
  exports.gen_metatokens = rspamd_gen_metatokens
  exports.rspamd_gen_metatokens = rspamd_gen_metatokens
  -- Generates all metatokens for a task as a name -> value table.
  -- Every value returned by a metafunction is stored under the label found
  -- at the same position of that metafunction's `names`; a NaN or +infinity
  -- value is logged and replaced by 0. Nothing is cached.
  local function rspamd_gen_metatokens_table(task)
    local by_name = {}

    for _, mt in ipairs(metafunctions) do
      for pos, value in ipairs(mt.cb(task)) do
        local label = mt.names[pos]
        -- value ~= value is true only for NaN
        if value ~= value or value == math.huge then
          logger.errx(task, 'metatoken %s returned %s; replace it with 0 for sanity',
              label, value)
          value = 0.0
        end
        by_name[label] = value
      end
    end

    return by_name
  end
  -- The table generator is exported under a short and a prefixed name
  exports.gen_metatokens_table = rspamd_gen_metatokens_table
  exports.rspamd_gen_metatokens_table = rspamd_gen_metatokens_table
  -- Returns the total number of metatokens, i.e. the sum of `ninputs` over
  -- all registered metafunctions.
  local function rspamd_count_metatokens()
    local sum = 0

    for _, descriptor in ipairs(metafunctions) do
      sum = descriptor.ninputs + sum
    end

    return sum
  end
  -- The counter is exported under a short and a prefixed name
  exports.count_metatokens = rspamd_count_metatokens
  exports.rspamd_count_metatokens = rspamd_count_metatokens

  -- MUST be increased on each change of metatokens
  exports.version = 1
  -- Registers an additional metafunction.
  --
  -- `tbl` must match `meta_schema` (cb, ninputs, names, optional
  -- description) and must carry at least `ninputs` names: every input needs
  -- a label, otherwise rebuilding the by-name index would hit a nil table
  -- key after the descriptor had already been stored.
  --
  -- On success the descriptor is appended to the registry and the by-name
  -- index and the digest are rebuilt. On failure an error is logged and the
  -- registry is left untouched. Nothing is returned in either case.
  exports.add_metafunction = function(tbl)
    local ok, err = meta_schema(tbl)
    if not ok then
      logger.errx('cannot add metafunction: %s', err)
      return
    end

    if #tbl.names < tbl.ninputs then
      logger.errx('cannot add metafunction: %s',
          string.format('%d names given for %d inputs', #tbl.names, tbl.ninputs))
      return
    end

    table.insert(metafunctions, tbl)
    fill_metatokens_by_name()
    calculate_digest()
  end
  -- Build the by-name index and the digest for the built-in metafunctions
  -- once at load time, then hand the module table to the caller.
  fill_metatokens_by_name()
  calculate_digest()

  return exports