diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-11-30 14:56:37 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-11-30 14:56:37 +0000 |
commit | f0e20efae58e69ed49980cd186b0623ee3615ee4 (patch) | |
tree | 720b5f9cbb16f92deda5fab3138fb87d21508747 /lualib/lua_mime.lua | |
parent | 8d48eb47dfcc2fef3ad4ce7fe166a1cf36d3ffe6 (diff) | |
download | rspamd-f0e20efae58e69ed49980cd186b0623ee3615ee4.tar.gz rspamd-f0e20efae58e69ed49980cd186b0623ee3615ee4.zip |
[Feature] Unify displayed part selection
Diffstat (limited to 'lualib/lua_mime.lua')
-rw-r--r-- | lualib/lua_mime.lua | 86 |
1 files changed, 62 insertions, 24 deletions
diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index ce14a49f3..2cbdda804 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -898,6 +898,65 @@ exports.remove_attachments = function(task, settings) end --[[[ +-- @function lua_mime.get_displayed_text_part(task) +-- Returns the most relevant displayed content from an email +-- @param {task} task Rspamd task object +-- @return {text_part} a selected part +--]] +exports.get_displayed_text_part = function(task) + local text_parts = task:get_text_parts() + if not text_parts then + return nil + end + + local html_part + local text_part + local html_attachment + + -- First pass: categorize parts + for _, part in ipairs(text_parts) do + local mp = part:get_mimepart() + if not mp:is_attachment() then + if part:is_html() then + html_part = part + else + text_part = text_part or part + end + else + -- Check for HTML attachments + if part:is_html() and mp:get_length() < 102400 then + -- 100KB limit, as long ones are likely not something that we should check + html_attachment = part + end + end + end + + -- Decision logic + if html_part then + local word_count = html_part:get_words_count() or 0 + if word_count >= 10 then + -- Arbitrary minimum threshold, e.g. I believe it's minimum sane + return html_part + end + end + + if text_part then + local word_count = html_part:get_words_count() or 0 + if word_count >= 10 then + -- Arbitrary minimum threshold, e.g. I believe it's minimum sane + return text_part + end + end + + if html_attachment then + return html_attachment + end + + -- Only short parts, but still let's try our best + return html_part or text_part +end + +--[[[ -- @function lua_mime.anonymize_message(task, settings) -- Anonymizes message content by replacing sensitive data -- @param {task} task Rspamd task object @@ -976,31 +1035,10 @@ exports.anonymize_message = function(task, settings) local urls = {} local emails = {} - -- Extract text content, URLs and emails - local text_parts = task:get_text_parts() - for _, part in ipairs(text_parts) do - if not part:get_mimepart():is_attachment() then - if part:is_html() then - local words = part:get_words('norm') - if words then - text_content = words - end - break -- Use only first HTML part - end - end - end + local sel_part = exports.get_displayed_text_part(task) - -- If no HTML parts found, use first text part - if #text_content == 0 then - for _, part in ipairs(text_parts) do - if not part:is_html() then - local words = part:get_words('norm') - if words then - text_content = words - end - break - end - end + if sel_part then + text_content = sel_part:get_words('norm') end -- Process URLs |