diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-12-11 13:55:03 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-12-11 13:55:03 +0000 |
commit | 7e6bec3c411cf368f37c1701141a5f65895b741a (patch) | |
tree | aedb460be0ec8b5bbec6b271656af979ddeb25c1 | |
parent | b0e0367901b4223f68a6de8edf1fede2c7eedcd0 (diff) | |
download | rspamd-7e6bec3c411cf368f37c1701141a5f65895b741a.tar.gz rspamd-7e6bec3c411cf368f37c1701141a5f65895b741a.zip |
[Rework] Multimap should use only distinct text parts for content matching
Issue: #5248
-rw-r--r-- | lualib/lua_mime.lua | 43 |
-rw-r--r-- | src/plugins/lua/multimap.lua | 7 |
2 files changed, 47 insertions, 3 deletions
diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index 1135f2b63..f68758ec9 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -957,6 +957,49 @@ exports.get_displayed_text_part = function(task) end --[[[ +-- @function lua_mime.get_distinct_text_parts(task) +-- Returns the list of parts that are visible or have a distinct content +-- @param {task} task Rspamd task object +-- @return array of {text_part} a selected part +--]] +exports.get_distinct_text_parts = function(task) + local text_parts = task:get_text_parts() + if not text_parts then + return {} + end + + local text_part_idx + + local distance = task:get_mempool():get_variable('parts_distance', 'double') + if not distance then + return text_parts + end + distance = tonumber(distance) + + if distance > 0.5 then + -- Parts are distinct + return text_parts + end + + -- First pass: categorize parts + for i, part in ipairs(text_parts) do + local mp = part:get_mimepart() + if not mp:is_attachment() then + if not part:is_html() then + -- Found text part that is similar to html part + text_part_idx = i + end + end + end + + if text_part_idx then + table.remove(text_parts, text_part_idx) + end + + return text_parts +end + +--[[[ -- @function lua_mime.anonymize_message(task, settings) -- Anonymizes message content by replacing sensitive data -- @param {task} task Rspamd task object diff --git a/src/plugins/lua/multimap.lua b/src/plugins/lua/multimap.lua index e852ce15e..a61da606b 100644 --- a/src/plugins/lua/multimap.lua +++ b/src/plugins/lua/multimap.lua @@ -29,6 +29,7 @@ local rspamd_ip = require "rspamd_ip" local lua_util = require "lua_util" local lua_selectors = require "lua_selectors" local lua_maps = require "lua_maps" +local lua_mime = require "lua_mime" local redis_params local fun = require "fun" local N = 'multimap' @@ -453,19 +454,19 @@ local function apply_content_filter(task, filter) return { task:get_raw_headers() } elseif filter == 'text' then local ret = {} - for _, p in 
ipairs(task:get_text_parts()) do + for _, p in ipairs(lua_mime.get_distinct_text_parts(task)) do table.insert(ret, p:get_content()) end return ret elseif filter == 'rawtext' then local ret = {} - for _, p in ipairs(task:get_text_parts()) do + for _, p in ipairs(lua_mime.get_distinct_text_parts(task)) do table.insert(ret, p:get_content('raw_parsed')) end return ret elseif filter == 'oneline' then local ret = {} - for _, p in ipairs(task:get_text_parts()) do + for _, p in ipairs(lua_mime.get_distinct_text_parts(task)) do table.insert(ret, p:get_content_oneline()) end return ret |