You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lua_fuzzy.lua 9.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_fuzzy
  15. -- This module contains helper functions for supporting fuzzy check module
  16. --]]
  17. local N = "lua_fuzzy"
  18. local lua_util = require "lua_util"
  19. local rspamd_regexp = require "rspamd_regexp"
  20. local fun = require "fun"
  21. local rspamd_logger = require "rspamd_logger"
  22. local ts = require("tableshape").types
  -- Processed rules, addressed by their numeric index in this table
  -- (per the original note, the entries are filled on behalf of C code)
  local rules = {}

  -- Pre-defined sets of rule options, keyed by policy name
  local policies = {}

  policies.recommended = {
    -- Content selection switches
    scan_archives = true,
    short_text_direct_hash = true,
    text_shingles = true,
    skip_images = false,
    mime_types = { "application/*" },
    -- Size thresholds
    min_bytes = 1024,
    min_length = 64,
    min_height = 500,
    min_width = 500,
    text_multiplier = 4.0, -- divide min_bytes by 4 for texts
  }

  -- Policy applied to rules that do not name one explicitly
  local default_policy = policies.recommended
  -- Field validators shared by the strict and the open policy schemas
  local schema_fields = {
    text_multiplier = ts.number,
    mime_types = ts.array_of(ts.string),
  }

  -- Numeric limits are accepted as numbers or as strings converted by `tonumber`
  for _, field in ipairs({ 'min_bytes', 'min_height', 'min_width', 'min_length' }) do
    schema_fields[field] = ts.number + ts.string / tonumber
  end

  -- Plain boolean switches
  for _, field in ipairs({ 'scan_archives', 'short_text_direct_hash',
                           'text_shingles', 'skip_images' }) do
    schema_fields[field] = ts.boolean
  end

  -- Strict schema: used to validate policies passed to `register_policy`
  local policy_schema = ts.shape(schema_fields)

  -- Open schema: same fields, built with `open = true`; used for whole rules
  local open_shape_options = { open = true }
  local policy_schema_open = ts.shape(schema_fields, open_shape_options)

  -- Table of functions returned by this module
  local exports = {}
  --[[[
  -- @function lua_fuzzy.register_policy(name, policy)
  -- Adds a new policy with name `name`. Must be valid, checked using policy_schema
  -- @param {string} name key under which the policy is stored
  -- @param {table} policy policy options, validated by `policy_schema`
  -- A warning is logged when an existing policy name is reused; an invalid
  -- policy is logged as an error and not stored. Returns nothing.
  --]]
  exports.register_policy = function(name, policy)
    if policies[name] then
      rspamd_logger.warnx(rspamd_config, "overriding policy %s", name)
    end

    local parsed_policy, err = policy_schema:transform(policy)

    if not parsed_policy then
      rspamd_logger.errx(rspamd_config, 'invalid fuzzy rule policy %s: %s',
          name, err)
      return
    end

    -- Index by the value of `name`: `policies.name` would store every policy
    -- under the literal key "name", so lookups by the real name would fail
    policies[name] = parsed_policy
  end
  --[[[
  -- @function lua_fuzzy.process_rule(rule)
  -- Works on a shallow copy of `rule`: merges it with the policy it names (or
  -- with the default policy when none is named), validates the result with the
  -- open policy schema, converts `mime_types` globs to regexps and stores the
  -- result in the rules table.
  -- @param {table} rule fuzzy rule options
  -- @return {number} index of the stored rule (policy id)
  --]]
  exports.process_rule = function(rule)
    local result = lua_util.shallowcopy(rule)

    -- An explicitly named policy replaces the default one; an unknown name
    -- leaves `base` unset
    local base
    if result.policy then
      base = policies[result.policy]
    else
      base = default_policy
    end

    if not base then
      rspamd_logger.warnx(rspamd_config, "unknown policy %s", result.policy)
    else
      result = lua_util.override_defaults(base, result)

      local checked, err = policy_schema_open:transform(result)
      if checked then
        result = checked
      else
        -- Keep the merged but unvalidated rule, only report the problem
        rspamd_logger.errx(rspamd_config, 'invalid fuzzy rule default fields: %s', err)
      end
    end

    local globs = result.mime_types
    if globs then
      -- Each glob becomes a regexp imported with the 'i' flag
      local function glob_to_regexp(gl)
        return rspamd_regexp.import_glob(gl, 'i')
      end

      result.mime_types = fun.totable(fun.map(glob_to_regexp, globs))
    end

    rules[#rules + 1] = result

    return #rules
  end
  -- Checks whether a part is long enough to be hashed under `rule`.
  -- @param task task object (used for logging and for the subject)
  -- @param part mime part to check
  -- @param rule processed rule; `min_bytes`, `text_multiplier` and
  --   `no_subject` are read here
  -- @return false for empty parts and for parts whose adjusted length is below
  --   `rule.min_bytes`, true otherwise
  local function check_length(task, part, rule)
    local bytes = part:get_length()
    local length_ok = bytes > 0
    local id = part:get_id()
    -- Rules stored without a known policy may have no `min_bytes` at all;
    -- treat that as "no limit" instead of failing on a nil comparison
    local min_bytes = rule.min_bytes or 0

    lua_util.debugm(N, task, 'check size of part %s', id)

    if length_ok and min_bytes > 0 then
      local adjusted_bytes = bytes

      if part:is_text() then
        -- Fuzzy plugin uses stripped utf content to get an exact hash, that
        -- corresponds to `get_content_oneline()`
        -- However, in the case of empty parts this method returns `nil`, so extra
        -- sanity check is required.
        bytes = #(part:get_text():get_content_oneline() or '')

        -- Short hashing algorithm also use subject unless explicitly denied
        if not rule.no_subject then
          local subject = task:get_subject() or ''
          bytes = bytes + #subject
        end

        -- Always derive the adjusted size from the recomputed text length, so
        -- that it does not keep the raw part length when no multiplier is set
        adjusted_bytes = bytes * (rule.text_multiplier or 1)
      end

      if min_bytes > adjusted_bytes then
        lua_util.debugm(N, task, 'skip part of length %s (%s adjusted) ' ..
            'as it has less than %s bytes',
            bytes, adjusted_bytes, min_bytes)
        length_ok = false
      else
        -- The format string has two placeholders, so pass exactly two values
        lua_util.debugm(N, task, 'allow part of length %s (%s adjusted)',
            bytes, adjusted_bytes)
      end
    else
      lua_util.debugm(N, task, 'allow part %s, no length limits', id)
    end

    return length_ok
  end
  -- Decides which hashes may be produced for a text part.
  -- @param task task object (used for logging)
  -- @param part mime part being checked
  -- @param rule processed rule; `text_shingles`, `min_length` and
  --   `short_text_direct_hash` are read here
  -- @param text text part object providing `get_words_count()`
  -- @return allow_direct, allow_shingles (two booleans)
  local function check_text_part(task, part, rule, text)
    local allow_direct, allow_shingles = false, false

    local id = part:get_id()
    lua_util.debugm(N, task, 'check text part %s', id)
    local wcnt = text:get_words_count()

    if not rule.text_shingles then
      lua_util.debugm(N, task,
          'disable shingles in text %s', id)
      return check_length(task, part, rule), false
    end

    -- Check number of words
    local min_words = rule.min_length or 0
    if min_words < 32 then
      min_words = 32 -- Minimum for shingles
    end

    if wcnt < min_words then
      -- Report the threshold that was actually applied (`min_words`), not
      -- `rule.min_length`, which can be unset or smaller than 32
      lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
          min_words, wcnt)
    else
      lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
          id, wcnt)
      allow_shingles = true
    end

    if allow_shingles then
      allow_direct = wcnt > 0
    elseif rule.short_text_direct_hash then
      -- Too short for shingles: a direct hash is still subject to length limits
      lua_util.debugm(N, task,
          'allow direct hash for short text %s, %s words',
          id, wcnt)
      allow_direct = check_length(task, part, rule)
    end
    -- Otherwise both flags stay false

    return allow_direct, allow_shingles
  end
  179. --local function has_sane_text_parts(task)
  180. -- local text_parts = task:get_text_parts() or {}
  181. -- return fun.any(function(tp) return tp:get_words_count() > 32 end, text_parts)
  182. --end
  -- Decides whether an image part may be hashed.
  -- @param task task object (used for logging)
  -- @param part mime part being checked
  -- @param rule processed rule; `skip_images`, `min_width` and `min_height`
  --   are read here
  -- @param image image object providing `get_width()` and `get_height()`
  -- @return allow_direct, allow_shingles; the second value is always false
  local function check_image_part(task, part, rule, image)
    if rule.skip_images then
      lua_util.debugm(N, task, 'skip image part as images are disabled')
      return false, false
    end

    local id = part:get_id()
    lua_util.debugm(N, task, 'check image part %s', id)

    -- Resolve the limits before comparing them: a missing dimension falls back
    -- to the other one, and two missing dimensions mean "no limit". Comparing
    -- `rule.min_width > 0` first would raise an error on a nil field and make
    -- these fallbacks unreachable.
    local min_width = rule.min_width or rule.min_height or 0
    local min_height = rule.min_height or rule.min_width or 0

    if min_width > 0 or min_height > 0 then
      -- Check dimensions
      local height = image:get_height()
      local width = image:get_width()

      -- Images with unknown dimensions are only subject to the length check
      if height and width then
        if height < min_height or width < min_width then
          lua_util.debugm(N, task, 'skip image part %s as it does not meet minimum sizes: %sx%s < %sx%s',
              id, width, height, min_width, min_height)
          return false, false
        else
          lua_util.debugm(N, task, 'allow image part %s: %sx%s',
              id, width, height)
        end
      end
    end

    return check_length(task, part, rule), false
  end
  -- Decides whether a binary part may be hashed, based on its MIME type.
  -- A part is length-checked when the MIME_TRACE symbol flags it with '-' or
  -- when its declared or detected content type matches one of
  -- `rule.mime_types`; every other part is rejected.
  -- @param task task object (used for logging and for the MIME_TRACE symbol)
  -- @param part mime part being checked
  -- @param rule processed rule; `mime_types` is expected to hold objects with
  --   a `match` method
  -- @return allow_direct, allow_shingles; the second value is always false
  local function mime_types_check(task, part, rule)
    local t, st = part:get_type()

    if not t then
      return false, false
    end

    local ct = string.format('%s/%s', t, st)
    -- Fall back to the declared type when no type has been detected
    local detected_ct = ct
    t, st = part:get_detected_type()
    if t then
      detected_ct = string.format('%s/%s', t, st)
    end

    local id = part:get_id()
    lua_util.debugm(N, task, 'check binary part %s: %s', id, ct)

    -- For bad mime parts we implicitly enable fuzzy check
    local mime_trace = (task:get_symbol('MIME_TRACE') or {})[1]
    local opts = {}

    if mime_trace then
      opts = mime_trace.options or opts
    end

    -- Each option is split on ':' into a key and a value
    opts = fun.tomap(fun.map(function(opt)
      local elts = lua_util.str_split(opt, ':')
      return elts[1], elts[2]
    end, opts))

    -- NOTE(review): the keys come from `str_split`, so they are presumably
    -- strings; if `part:get_id()` returns a number the plain lookup can never
    -- match. Try the string form as well - confirm the id type against the API.
    local trace_flag = opts[id] or opts[tostring(id)]

    if trace_flag == '-' then
      lua_util.debugm(N, task, 'explicitly check binary part %s: bad mime type %s', id, ct)
      return check_length(task, part, rule), false
    end

    if rule.mime_types then
      local matched = fun.any(function(gl_re)
        if gl_re:match(ct) or gl_re:match(detected_ct) then
          return true
        end

        return false
      end, rule.mime_types)

      if matched then
        lua_util.debugm(N, task, 'found mime type match for part %s: %s (%s detected)',
            id, ct, detected_ct)
        return check_length(task, part, rule), false
      end
    end

    return false, false
  end
  --[[[
  -- @function lua_fuzzy.check_mime_part(task, part, rule_id)
  -- Decides whether `part` should be hashed under the rule stored at `rule_id`.
  -- Text, image, archive, specific and attachment parts are tried in this
  -- order; anything else is rejected.
  -- @param task task object
  -- @param part mime part to check
  -- @param {number} rule_id index returned by `process_rule`
  -- @return two booleans: allow direct hash, allow shingles
  --]]
  exports.check_mime_part = function(task, part, rule_id)
    local selected = rules[rule_id]

    if not selected then
      rspamd_logger.errx(task, 'cannot find rule with id %s', rule_id)
      return false, false
    end

    if part:is_text() then
      local text = part:get_text()
      return check_text_part(task, part, selected, text)
    end

    if part:is_image() then
      local image = part:get_image()
      return check_image_part(task, part, selected, image)
    end

    if part:is_archive() then
      if selected.scan_archives then
        -- Always send archives
        lua_util.debugm(N, task, 'check archive part %s', part:get_id())
        return true, false
      end
    end

    -- Specific parts qualify only when they carry a `fuzzy_hashes` field
    local specific = part:is_specific() and part:get_specific()
    if type(specific) == 'table' and specific.fuzzy_hashes then
      lua_util.debugm(N, task, 'check specific part %s', part:get_id())
      return true, false
    end

    if not part:is_attachment() then
      return false, false
    end

    return mime_types_check(task, part, selected)
  end
  --[[[
  -- @function lua_fuzzy.cleanup_rules()
  -- Drops all processed rules by replacing the rules table with an empty one
  --]]
  function exports.cleanup_rules()
    rules = {}
  end

  return exports