You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lua_fuzzy.lua 9.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. --[[
  2. Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_fuzzy
  15. -- This module contains helper functions for supporting fuzzy check module
  16. --]]
  17. local N = "lua_fuzzy"
  18. local lua_util = require "lua_util"
  19. local rspamd_regexp = require "rspamd_regexp"
  20. local fun = require "fun"
  21. local rspamd_logger = require "rspamd_logger"
  22. local ts = require("tableshape").types
  -- Processed rules, addressed by their numeric position in this table
  -- (filled by C code through `process_rule`)
  local rules = {}

  -- Named presets of rule options
  local policies = {}

  policies.recommended = {
    -- Size limits
    min_bytes = 1024,
    min_length = 64,
    text_multiplier = 4.0, -- divide min_bytes by 4 for texts
    -- Image dimension limits
    min_height = 500,
    min_width = 500,
    skip_images = false,
    -- Attachments
    mime_types = { "application/*" },
    scan_archives = true,
    -- Text hashing modes
    short_text_direct_hash = true,
    text_shingles = true,
  }

  -- Preset applied to rules that do not name a policy
  local default_policy = policies.recommended
  -- A numeric option: either a number, or a string passed through `tonumber`
  local numeric_option = ts.number + ts.string / tonumber

  -- Shape that a policy table is transformed against in `register_policy`
  local policy_schema = ts.shape {
    -- numeric limits
    min_bytes = numeric_option,
    min_height = numeric_option,
    min_width = numeric_option,
    min_length = numeric_option,
    text_multiplier = ts.number,
    -- attachment selection
    mime_types = ts.array_of(ts.string),
    -- feature switches
    scan_archives = ts.boolean,
    short_text_direct_hash = ts.boolean,
    text_shingles = ts.boolean,
    skip_images = ts.boolean,
  }

  -- Public interface returned by this module
  local exports = {}
  --[[[
  -- @function lua_fuzzy.register_policy(name, policy)
  -- Adds a new policy with name `name`. Must be valid, checked using policy_schema.
  -- A policy that fails validation is logged as an error and not registered.
  -- Registering a name that already exists logs a warning and replaces the
  -- previous policy.
  -- @param {string} name key under which the policy is stored
  -- @param {table} policy options table, transformed with `policy_schema`
  --]]
  exports.register_policy = function(name, policy)
    if policies[name] then
      rspamd_logger.warnx(rspamd_config, "overriding policy %s", name)
    end

    local parsed_policy, err = policy_schema:transform(policy)

    if not parsed_policy then
      rspamd_logger.errx(rspamd_config, 'invalid fuzzy rule policy %s: %s',
          name, err)
      return
    end

    -- Index by the *value* of `name`: `policies.name` would store every policy
    -- under the literal key "name", so `process_rule` could never find it
    policies[name] = parsed_policy
  end
  --[[[
  -- @function lua_fuzzy.process_rule(rule)
  -- Processes fuzzy rule (applying policies or defaults if needed): the rule is
  -- copied, merged with its policy, its mime type globs are imported as
  -- regexps and the result is appended to the module's rule table.
  -- Returns the index of the stored rule in that table
  --]]
  exports.process_rule = function(rule)
    local prepared = lua_util.shallowcopy(rule)

    -- A rule without `policy` uses the default one; a named policy is looked up
    -- and may be missing
    local selected
    if prepared.policy then
      selected = policies[prepared.policy]
    else
      selected = default_policy
    end

    if not selected then
      rspamd_logger.warnx(rspamd_config, "unknown policy %s", prepared.policy)
    else
      prepared = lua_util.override_defaults(selected, prepared)
    end

    local globs = prepared.mime_types
    if globs then
      local function glob_to_regexp(gl)
        return rspamd_regexp.import_glob(gl, 'i')
      end

      prepared.mime_types = fun.totable(fun.map(glob_to_regexp, globs))
    end

    rules[#rules + 1] = prepared

    return #rules
  end
  -- Checks whether `part` is long enough to be hashed according to `rule`.
  -- An empty part (zero length) is never accepted. For text parts the length is
  -- measured on the one-line content (plus the subject unless
  -- `rule.no_subject` is set) and multiplied by `rule.text_multiplier` before
  -- being compared with `rule.min_bytes`.
  -- @param task task being processed (used for logging and the subject)
  -- @param part mime part to check
  -- @param rule processed rule table
  -- @return {boolean} true if the part passes the length check
  local function check_length(task, part, rule)
    local bytes = part:get_length()
    local length_ok = bytes > 0
    local id = part:get_id()
    -- `min_bytes` is absent when the rule named an unknown policy and therefore
    -- received no defaults; treat that as "no limit" instead of raising on a
    -- comparison with nil
    local min_bytes = rule.min_bytes or 0

    lua_util.debugm(N, task, 'check size of part %s', id)

    if length_ok and min_bytes > 0 then
      local adjusted_bytes = bytes

      if part:is_text() then
        -- Fuzzy plugin uses stripped utf content to get an exact hash, that
        -- corresponds to `get_content_oneline()`
        -- However, in the case of empty parts this method returns `nil`, so extra
        -- sanity check is required.
        bytes = #(part:get_text():get_content_oneline() or '')

        -- Short hashing algorithm also use subject unless explicitly denied
        if not rule.no_subject then
          local subject = task:get_subject() or ''
          bytes = bytes + #subject
        end

        if rule.text_multiplier then
          adjusted_bytes = bytes * rule.text_multiplier
        end
      end

      if min_bytes > adjusted_bytes then
        lua_util.debugm(N, task, 'skip part of length %s (%s adjusted) ' ..
            'as it has less than %s bytes',
            bytes, adjusted_bytes, min_bytes)
        length_ok = false
      else
        -- Only two placeholders here, so only two values are passed
        lua_util.debugm(N, task, 'allow part of length %s (%s adjusted)',
            bytes, adjusted_bytes)
      end
    else
      lua_util.debugm(N, task, 'allow part %s, no length limits', id)
    end

    return length_ok
  end
  -- Decides which hashing modes are allowed for a text part.
  -- With `rule.text_shingles` set, shingles are allowed when the text has at
  -- least `rule.min_length` words (never fewer than 32). A shorter text may
  -- still get a direct hash when `rule.short_text_direct_hash` is set and the
  -- part passes `check_length`. Without shingles only `check_length` decides.
  -- @param task task being processed
  -- @param part mime part that holds the text
  -- @param rule processed rule table
  -- @param text text part object of `part`
  -- @return {boolean},{boolean} allow_direct, allow_shingles
  local function check_text_part(task, part, rule, text)
    local allow_direct, allow_shingles = false, false
    local id = part:get_id()

    lua_util.debugm(N, task, 'check text part %s', id)
    local wcnt = text:get_words_count()

    if rule.text_shingles then
      -- Check number of words
      local min_words = rule.min_length or 0
      if min_words < 32 then
        min_words = 32 -- Minimum for shingles
      end

      if wcnt < min_words then
        -- Report the threshold actually applied: `rule.min_length` may be nil
        -- or below the 32 words floor
        lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
            min_words, wcnt)
        allow_shingles = false
      else
        lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
            id, wcnt)
        allow_shingles = true
      end

      if not rule.short_text_direct_hash and not allow_shingles then
        allow_direct = false
      else
        if not allow_shingles then
          lua_util.debugm(N, task,
              'allow direct hash for short text %s, %s words',
              id, wcnt)
          allow_direct = check_length(task, part, rule)
        else
          allow_direct = wcnt > 0
        end
      end
    else
      lua_util.debugm(N, task,
          'disable shingles in text %s', id)
      allow_direct = check_length(task, part, rule)
    end

    return allow_direct, allow_shingles
  end
  169. --local function has_sane_text_parts(task)
  170. -- local text_parts = task:get_text_parts() or {}
  171. -- return fun.any(function(tp) return tp:get_words_count() > 32 end, text_parts)
  172. --end
  -- Decides whether an image part may be hashed.
  -- Images are rejected when `rule.skip_images` is set or when the image is
  -- smaller than the minimum dimensions; otherwise `check_length` decides.
  -- Shingles are never allowed for images.
  -- @param task task being processed
  -- @param part mime part that holds the image
  -- @param rule processed rule table
  -- @param image image object of `part`
  -- @return {boolean},{boolean} allow_direct, allow_shingles (always false)
  local function check_image_part(task, part, rule, image)
    if rule.skip_images then
      lua_util.debugm(N, task, 'skip image part as images are disabled')
      return false, false
    end

    local id = part:get_id()
    lua_util.debugm(N, task, 'check image part %s', id)

    -- A missing limit falls back to the other one, and to 0 (no limit) when both
    -- are missing. Resolving this before the comparison avoids raising on a
    -- nil `min_width`/`min_height`
    local min_width = rule.min_width or rule.min_height or 0
    local min_height = rule.min_height or rule.min_width or 0

    if min_width > 0 or min_height > 0 then
      -- Check dimensions
      local height = image:get_height()
      local width = image:get_width()

      if height and width then
        if height < min_height or width < min_width then
          lua_util.debugm(N, task, 'skip image part %s as it does not meet minimum sizes: %sx%s < %sx%s',
              id, width, height, min_width, min_height)
          return false, false
        else
          lua_util.debugm(N, task, 'allow image part %s: %sx%s',
              id, width, height)
        end
      end
    end

    return check_length(task, part, rule), false
  end
  -- Decides whether a binary attachment may be hashed.
  -- A part flagged with '-' in the options of the MIME_TRACE symbol is checked
  -- regardless of its type; otherwise the declared or detected content type
  -- must match one of `rule.mime_types`. In both cases `check_length` makes the
  -- final decision. Shingles are never allowed here.
  -- @param task task being processed
  -- @param part mime part to check
  -- @param rule processed rule table (`mime_types` holds regexp objects)
  -- @return {boolean},{boolean} allow_direct, allow_shingles (always false)
  local function mime_types_check(task, part, rule)
    local t, st = part:get_type()

    if not t then return false, false end

    local ct = string.format('%s/%s', t, st)
    -- The detected type defaults to the declared one
    local detected_ct = ct
    t, st = part:get_detected_type()
    if t then
      detected_ct = string.format('%s/%s', t, st)
    end

    local id = part:get_id()
    lua_util.debugm(N, task, 'check binary part %s: %s', id, ct)

    -- For bad mime parts we implicitly enable fuzzy check
    local mime_trace = (task:get_symbol('MIME_TRACE') or {})[1]
    local opts = {}

    if mime_trace then
      opts = mime_trace.options or opts
    end
    -- Each option is split on ':' into a key and a flag
    opts = fun.tomap(fun.map(function(opt)
      local elts = lua_util.str_split(opt, ':')
      return elts[1], elts[2]
    end, opts))

    -- Keys produced by splitting a string are strings, so also try the string
    -- form of the id in case `get_id` returns a number
    -- NOTE(review): confirm the return type of `part:get_id()`
    local flag = opts[id]
    if flag == nil then
      flag = opts[tostring(id)]
    end

    if flag == '-' then
      lua_util.debugm(N, task, 'explicitly check binary part %s: bad mime type %s', id, ct)
      return check_length(task, part, rule), false
    end

    if rule.mime_types then
      local matched = fun.any(function(gl_re)
        return (gl_re:match(ct) or gl_re:match(detected_ct)) and true or false
      end, rule.mime_types)

      if matched then
        lua_util.debugm(N, task, 'found mime type match for part %s: %s (%s detected)',
            id, ct, detected_ct)
        return check_length(task, part, rule), false
      end
    end

    return false, false
  end
  -- Entry point for a single mime part: picks the check that matches the kind
  -- of the part and returns two booleans, `allow_direct` and `allow_shingles`.
  -- `rule_id` is an index previously returned by `process_rule`; an unknown id
  -- is logged and rejects the part.
  exports.check_mime_part = function(task, part, rule_id)
    local active_rule = rules[rule_id]

    if active_rule == nil then
      rspamd_logger.errx(task, 'cannot find rule with id %s', rule_id)
      return false, false
    end

    if part:is_text() then
      local text_part = part:get_text()
      return check_text_part(task, part, active_rule, text_part)
    elseif part:is_image() then
      local image = part:get_image()
      return check_image_part(task, part, active_rule, image)
    elseif part:is_archive() and active_rule.scan_archives then
      -- Always send archives
      lua_util.debugm(N, task, 'check archive part %s', part:get_id())
      return true, false
    end

    -- Specific parts qualify only when they carry their own fuzzy hashes;
    -- otherwise fall through to the attachment check
    if part:is_specific() then
      local specific = part:get_specific()
      local has_hashes = type(specific) == 'table' and specific.fuzzy_hashes

      if has_hashes then
        lua_util.debugm(N, task, 'check specific part %s', part:get_id())
        return true, false
      end
    end

    if not part:is_attachment() then
      return false, false
    end

    return mime_types_check(task, part, active_rule)
  end
  -- Drops every processed rule by replacing the rule table with an empty one;
  -- ids returned earlier by `process_rule` no longer resolve afterwards
  function exports.cleanup_rules()
    rules = {}
  end

  return exports