You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lua_meta.lua 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. --[[
  2. Copyright (c) 2017, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. local exports = {}
  14. local N = "metatokens"
  15. local ts = require("tableshape").types
  16. -- Metafunctions
  --- Maps the message size onto a fixed ladder of size buckets.
  -- @param task object exposing `get_size()`
  -- @return single-element table holding the 1-based index of the first
  -- bucket whose limit is not below the size, divided by the number of
  -- buckets; `{0}` when the size is above the largest bucket
  local function meta_size_function(task)
    local buckets = {
      100, 200, 500, 1000,
      2000, 4000, 10000, 20000,
      30000, 100000, 200000, 400000,
      800000, 1000000, 2000000, 8000000,
    }
    local nbuckets = #buckets
    local msg_size = task:get_size()

    for idx, limit in ipairs(buckets) do
      if limit >= msg_size then
        return { (1.0 * idx) / nbuckets }
      end
    end

    -- Nothing matched: the message is larger than every bucket
    return { 0 }
  end
  --- Collects image statistics for a message.
  -- @param task object exposing `get_images()`
  -- @return table of five numbers, in this order: image count, share of
  -- 'jpeg' images, share of 'png' images, share of images whose
  -- width + height exceeds 256, share of the remaining images with positive
  -- dimensions; all zeros when there are no images
  local function meta_images_function(task)
    local by_type = { jpeg = 0, png = 0 }
    local total, large, small = 0, 0, 0

    for _, image in ipairs(task:get_images() or {}) do
      local kind = image:get_type()
      if by_type[kind] then
        by_type[kind] = by_type[kind] + 1
      end

      local width = image:get_width()
      local height = image:get_height()
      -- Images without usable dimensions count towards the total only
      if width > 0 and height > 0 then
        if width + height > 256 then
          large = large + 1
        else
          small = small + 1
        end
      end

      total = total + 1
    end

    if total == 0 then
      return { 0, 0, 0, 0, 0 }
    end

    return {
      total,
      1.0 * by_type.jpeg / total,
      1.0 * by_type.png / total,
      1.0 * large / total,
      1.0 * small / total,
    }
  end
  --- Relates the number of text parts and attachments to the part count.
  -- @param task object exposing `get_text_parts()` and `get_parts()`
  -- @return table `{text_parts / denominator, attachments / denominator}`
  -- where the denominator is the number of parts plus one
  local function meta_nparts_function(task)
    local text_parts = task:get_text_parts()
    local text_count = 0
    if text_parts then
      text_count = #text_parts
    end

    local attachment_count = 0
    -- Starts at one, so the divisions below are defined for empty messages
    local denominator = 1
    for _, part in ipairs(task:get_parts() or {}) do
      if part:is_attachment() then
        attachment_count = attachment_count + 1
      end
      denominator = denominator + 1
    end

    return {
      (1.0 * text_count) / denominator,
      (1.0 * attachment_count) / denominator,
    }
  end
  --- Computes the shares of UTF and non-UTF text parts.
  -- @param task object exposing `get_text_parts()`
  -- @return table `{utf_share, other_share}`; `{0, 0}` when the message has
  -- no text parts
  local function meta_encoding_function(task)
    local parts = task:get_text_parts()
    if not parts or #parts == 0 then
      return { 0, 0 }
    end

    local utf_count, other_count = 0, 0
    for _, part in ipairs(parts) do
      if part:is_utf() then
        utf_count = utf_count + 1
      else
        other_count = other_count + 1
      end
    end

    local nparts = #parts
    return { utf_count / nparts, other_count / nparts }
  end
  --- Produces inverse recipient counts for the MIME and SMTP recipients.
  -- @param task object exposing `has_recipients(kind)` and
  -- `get_recipients(kind)`
  -- @return table `{1 / n_mime, 1 / n_smtp}`; an entry is 0 when the task
  -- reports no recipients of that kind or the list is empty
  local function meta_recipients_function(task)
    local function inverse_count(kind)
      if not task:has_recipients(kind) then
        return 0
      end

      local count = #(task:get_recipients(kind))
      if count > 0 then
        return 1.0 / count
      end
      return count
    end

    local mime_factor = inverse_count('mime')
    local smtp_factor = inverse_count('smtp')

    return { mime_factor, smtp_factor }
  end
  --- Derives features from the Received headers of a message.
  -- Headers whose `flags.artificial` is set are ignored.
  -- @param task object exposing `get_received_headers()`
  -- @return table of four numbers:
  --   1. 1 / number of considered headers
  --   2. share of considered headers without `by_hostname`
  --   3. 1 / sum of absolute differences between consecutive timestamps
  --      (0 when that sum is 0)
  --   4. share of considered headers flagged `ssl` or `authenticated`
  -- All four values are 0 when there is no header to consider.
  local function meta_received_function(task)
    local count_factor = 0
    local invalid_factor = 0
    local time_factor = 0
    local secure_factor = 0

    local rh = task:get_received_headers()

    if rh and #rh > 0 then
      local ntotal = 0.0
      local init_time = 0

      for _, rc in ipairs(rh) do
        local flags = rc.flags

        if not flags or not flags['artificial'] then
          ntotal = ntotal + 1.0

          if not rc.by_hostname then
            invalid_factor = invalid_factor + 1.0
          end

          if init_time == 0 and rc.timestamp then
            -- First usable timestamp only seeds the comparison
            init_time = rc.timestamp
          elseif rc.timestamp then
            time_factor = time_factor + math.abs(init_time - rc.timestamp)
            init_time = rc.timestamp
          end

          if flags and (flags['ssl'] or flags['authenticated']) then
            secure_factor = secure_factor + 1.0
          end
        end
      end

      -- Every header may have been skipped as artificial; dividing by zero
      -- here would leak NaN/inf into the returned values
      if ntotal > 0 then
        invalid_factor = invalid_factor / ntotal
        secure_factor = secure_factor / ntotal
        count_factor = 1.0 / ntotal
      end

      if time_factor ~= 0 then
        time_factor = 1.0 / time_factor
      end
    end

    return { count_factor, invalid_factor, time_factor, secure_factor }
  end
  --- Produces the inverse of the number of URLs in a message.
  -- @param task object exposing `has_urls()` and `get_urls()`
  -- @return `{1 / number_of_urls}`, or `{0}` when the task reports no URLs
  local function meta_urls_function(task)
    if not task:has_urls() then
      return { 0 }
    end

    local urls = task:get_urls()
    return { 1.0 / #urls }
  end
  --- Derives word and character statistics from the text parts.
  -- Reads the "avg_words_len" and "short_words_cnt" variables from the task
  -- memory pool (each defaults to 0.0 when unset) and the per-part stats
  -- returned by `get_stats()`.
  -- @param task object exposing `get_mempool()` and `get_text_parts()`
  -- @return table of nine numbers, in this order:
  --   1. the "short_words_cnt" pool variable
  --   2. average word length mapped onto a ladder of length buckets
  --      (bucket index / bucket count, 0 when above the largest bucket)
  --   3-9. per-part rates (stat / part length) averaged over all text parts:
  --      spaces, double spaces, non spaces, ascii characters, non-ascii
  --      characters, capital letters, numeric characters
  local function meta_words_function(task)
    local pool = task:get_mempool()
    local avg_len = pool:get_variable("avg_words_len", "double") or 0.0
    local short_words = pool:get_variable("short_words_cnt", "double") or 0.0
    local ret_len = 0

    local lens = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20 }

    for i = 1, #lens do
      if lens[i] >= avg_len then
        ret_len = (1.0 * i) / #lens
        break
      end
    end

    -- Keys of the per-part stats table, in output order
    local stat_keys = {
      'spaces',
      'double_spaces',
      'non_spaces',
      'ascii_characters',
      'non_ascii_characters',
      'capital_letters',
      'numeric_characters',
    }

    local wres = {}
    for i = 1, #stat_keys do
      wres[i] = 0
    end

    -- A task without text parts may yield nil here; treat it as "no parts"
    -- instead of failing in ipairs()
    local tp = task:get_text_parts() or {}

    for _, p in ipairs(tp) do
      local stats = p:get_stats()
      local len = p:get_length()

      -- Empty parts contribute nothing but still count in the divisor
      if len > 0 then
        for i, key in ipairs(stat_keys) do
          wres[i] = wres[i] + stats[key] / len
        end
      end
    end

    local ret = {
      short_words,
      ret_len,
    }

    local divisor = 1.0
    if #tp > 0 then
      divisor = #tp
    end

    for _, wr in ipairs(wres) do
      ret[#ret + 1] = wr / divisor
    end

    return ret
  end
  -- Registry of the built-in metafunctions.
  -- Each entry has:
  --   cb          - callback taking a task and returning an array of numbers
  --   ninputs     - how many values the callback yields
  --   names       - one name per yielded value; names[i] labels the i-th value
  --                 of the callback, so the order here must follow the order
  --                 in which the callback returns its values
  --   description - human readable summary
  local metafunctions = {
    {
      cb = meta_size_function,
      ninputs = 1,
      names = {
        "size"
      },
      description = 'Describes size of the message',
    },
    {
      cb = meta_images_function,
      ninputs = 5,
      -- 1 - number of images,
      -- 2 - share of jpeg images
      -- 3 - share of png images
      -- 4 - share of large images (width + height > 256)
      -- 5 - share of small images (width + height <= 256)
      -- The callback yields the jpeg value before the png one, hence
      -- 'njpeg_images' precedes 'npng_images'
      names = {
        'nimages',
        'njpeg_images',
        'npng_images',
        'nlarge_images',
        'nsmall_images'
      },
      description = [[Functions for images matching:
      - number of images,
      - number of jpeg images
      - number of png images
      - number of large images (> 128 x 128)
      - number of small images (< 128 x 128)
      ]]
    },
    {
      cb = meta_nparts_function,
      ninputs = 2,
      -- 1 - number of text parts
      -- 2 - number of attachments
      names = {
        'ntext_parts',
        'nattachments'
      },
      description = [[Functions for parts matching:
      - number of text parts
      - number of attachments
      ]]
    },
    {
      cb = meta_encoding_function,
      ninputs = 2,
      -- 1 - number of utf parts
      -- 2 - number of non-utf parts
      names = {
        'nutf_parts',
        'nascii_parts'
      },
      description = [[Functions for encoding matching:
      - number of utf parts
      - number of non-utf parts
      ]]
    },
    {
      cb = meta_recipients_function,
      ninputs = 2,
      -- 1 - number of mime rcpt
      -- 2 - number of smtp rcpt
      names = {
        'nmime_rcpt',
        'nsmtp_rcpt'
      },
      description = [[Functions for recipients data matching:
      - number of mime rcpt
      - number of smtp rcpt
      ]]
    },
    {
      cb = meta_received_function,
      ninputs = 4,
      names = {
        'nreceived',
        'nreceived_invalid',
        'nreceived_bad_time',
        'nreceived_secure'
      },
      description = [[Functions for received headers data matching:
      - number of received headers
      - number of bad received headers
      - number of skewed time received headers
      - number of received via secured relays
      ]]
    },
    {
      cb = meta_urls_function,
      ninputs = 1,
      names = {
        'nurls'
      },
      description = [[Functions for urls data matching:
      - number of urls
      ]]
    },
    {
      cb = meta_words_function,
      ninputs = 9,
      -- The callback yields the short words value first and the average
      -- word length bucket second, hence 'nshort_words' comes first.
      -- 'numeric_cahracters' is misspelled but kept as is: it is a lookup
      -- key that existing consumers may already use
      names = {
        'nshort_words',
        'avg_words_len',
        'spaces_rate',
        'double_spaces_rate',
        'non_spaces_rate',
        'ascii_characters_rate',
        'non_ascii_characters_rate',
        'capital_characters_rate',
        'numeric_cahracters'
      },
      description = [[Functions for words data matching:
      - number of short words
      - average length of the words
      - rate of spaces in the text
      - rate of multiple spaces
      - rate of non space characters
      - rate of ascii characters
      - rate of non-ascii characters
      - rate of capital letters
      - rate of numbers
      ]]
    },
  }
  -- Shape a metafunction descriptor has to satisfy: a callback, the number of
  -- values it yields, a list of names and an optional description
  local meta_schema = ts.shape({
    description = ts.string:is_optional(),
    names = ts.array_of(ts.string),
    ninputs = ts.number,
    cb = ts.func,
  })
  -- Maps a metatoken name to a function(task) returning that single value
  local metatokens_by_name = {}

  --- Rebuilds `metatokens_by_name` from the registered metafunctions.
  -- Each getter runs the whole owning callback and picks one value from its
  -- result.
  local function fill_metatokens_by_name()
    local function make_getter(descriptor, position)
      return function(task)
        return descriptor.cb(task)[position]
      end
    end

    metatokens_by_name = {}

    for _, descriptor in ipairs(metafunctions) do
      for position = 1, descriptor.ninputs do
        metatokens_by_name[descriptor.names[position]] =
            make_getter(descriptor, position)
      end
    end
  end
  --- Recomputes `exports.digest` from the metatoken names.
  -- Feeds the first `ninputs` names of every registered metafunction, in
  -- registration order, into an rspamd_cryptobox_hash state and stores the
  -- hex output.
  local function calculate_digest()
    local hasher = require("rspamd_cryptobox_hash").create()

    for _, descriptor in ipairs(metafunctions) do
      local names = descriptor.names
      for position = 1, descriptor.ninputs do
        hasher:update(names[position])
      end
    end

    exports.digest = hasher:hex()
  end
  --- Produces metatoken values for a task.
  -- Without `names`, every registered metafunction is evaluated, the values
  -- are concatenated in registration order, and the result is stored in and
  -- served from the task cache under the 'metatokens' key.
  -- With `names`, only the listed metatokens are evaluated, in the given
  -- order; unknown names are logged and skipped, and the cache is not used.
  -- @param task task object
  -- @param names optional array of metatoken names
  -- @return array of metatoken values
  local function rspamd_gen_metatokens(task, names)
    local lua_util = require "lua_util"

    if names then
      local logger = require "rspamd_logger"
      local selected = {}

      for _, wanted in ipairs(names) do
        local getter = metatokens_by_name[wanted]
        if getter then
          selected[#selected + 1] = getter(task)
        else
          logger.errx(task, 'unknown metatoken: %s', wanted)
        end
      end

      return selected
    end

    local cached = task:cache_get('metatokens')
    if cached then
      return cached
    end

    local collected = {}
    for _, descriptor in ipairs(metafunctions) do
      local values = descriptor.cb(task)
      for position, value in ipairs(values) do
        lua_util.debugm(N, task, "metatoken: %s = %s",
            descriptor.names[position], value)
        collected[#collected + 1] = value
      end
    end

    task:cache_set('metatokens', collected)
    return collected
  end

  -- The same function is published under two names
  exports.rspamd_gen_metatokens = rspamd_gen_metatokens
  exports.gen_metatokens = rspamd_gen_metatokens
  --- Evaluates every registered metafunction and keys the values by name.
  -- @param task task object
  -- @return table mapping metatoken name to its value
  local function rspamd_gen_metatokens_table(task)
    local by_name = {}

    for _, descriptor in ipairs(metafunctions) do
      local names = descriptor.names
      for position, value in ipairs(descriptor.cb(task)) do
        by_name[names[position]] = value
      end
    end

    return by_name
  end

  -- The same function is published under two names
  exports.rspamd_gen_metatokens_table = rspamd_gen_metatokens_table
  exports.gen_metatokens_table = rspamd_gen_metatokens_table
  --- Sums `ninputs` over all registered metafunctions.
  -- @return total number of metatokens
  local function rspamd_count_metatokens()
    local count = 0

    for idx = 1, #metafunctions do
      count = count + metafunctions[idx].ninputs
    end

    return count
  end

  -- The same function is published under two names
  exports.rspamd_count_metatokens = rspamd_count_metatokens
  exports.count_metatokens = rspamd_count_metatokens
  --- Registers an additional metafunction.
  -- The descriptor is checked against `meta_schema` and must name every
  -- value it yields. A rejected descriptor is logged and leaves the registry
  -- untouched; an accepted one is appended, after which the by-name index
  -- and the digest are rebuilt.
  -- @param tbl descriptor with `cb`, `ninputs`, `names` and an optional
  -- `description`
  exports.add_metafunction = function(tbl)
    local ret, err = meta_schema(tbl)

    -- The schema does not relate `names` to `ninputs`. A descriptor with
    -- fewer names than inputs would be appended and then break the by-name
    -- index rebuild with a nil table key, so reject it up front
    if ret and #tbl.names < tbl.ninputs then
      ret = false
      err = string.format('%s names given for %s inputs',
          tostring(#tbl.names), tostring(tbl.ninputs))
    end

    if not ret then
      local logger = require "rspamd_logger"
      logger.errx('cannot add metafunction: %s', err)
    else
      table.insert(metafunctions, tbl)
      fill_metatokens_by_name()
      calculate_digest()
    end
  end
  449. fill_metatokens_by_name()
  450. calculate_digest()
  451. return exports