From 871bd66a50dd8139373d13154c6107cf0940a7fb Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 26 Nov 2024 17:13:08 +0000 Subject: [Project] Initial stuff to implement messages anonymization --- lualib/lua_mime.lua | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index 795a803e5..167939189 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -897,4 +897,182 @@ exports.remove_attachments = function(task, settings) return state end +--[[[ +-- @function lua_mime.anonymize_message(task, settings) +-- Anonymizes message content by replacing sensitive data +-- @param {task} task Rspamd task object +-- @param {table} settings Table with the following fields: +-- * strip_attachments: boolean, whether to strip all attachments +-- * custom_header_process: table of header_name => function(orig_header) pairs +-- @return {table} modified message state similar to other modification functions +--]] +exports.anonymize_message = function(task, settings) + local newline_s = newline(task) + local state = { + newline_s = newline_s + } + local out = {} + + -- Default header processors + local function anonymize_email_header(hdr) + local addrs = rspamd_util.parse_mail_address(hdr.value, task:get_mempool()) + if addrs and addrs[1] then + local modified = {} + for _, addr in ipairs(addrs) do + table.insert(modified, string.format('anonymous@%s', addr.domain or 'example.com')) + end + + return table.concat(modified, ',') + end + return 'anonymous@example.com' + end + + local function anonymize_received_header(hdr) + local processed = string.gsub(hdr.value, '%d+%.%d+%.%d+%.%d+', 'x.x.x.x') + processed = string.gsub(processed, '%x+:%x+:%x+:%x+:%x+:%x+:%x+:%x+', 'x:x:x:x:x:x:x:x') + return processed + end + + local default_header_process = { + ['from'] = anonymize_email_header, + ['to'] = anonymize_email_header, + ['cc'] = anonymize_email_header, + ['bcc'] = anonymize_email_header, + ['received'] = anonymize_received_header, + } + + -- Merge with custom processors + local header_processors = settings.custom_header_process or {} + for k, v in pairs(default_header_process) do + if not header_processors[k] then + header_processors[k] = v + end + end + + -- Process headers + local modified_headers = {} + for name, processor in pairs(header_processors) do + local hdrs = task:get_header_full(name, true) + if hdrs then + for _, hdr in ipairs(hdrs) do + local new_value = processor(hdr) + if new_value then + table.insert(modified_headers, { + name = name, + value = new_value + }) + end + end + end + end + + -- Create new text content + local text_content = {} + local urls = {} + local emails = {} + + -- Extract text content, URLs and emails + local text_parts = task:get_text_parts() + for _, part in ipairs(text_parts) do + if part:is_html() then + local words = part:get_words('norm') + if words then + text_content = words + end + break -- Use only first HTML part + end + end + + -- If no HTML parts found, use first text part + if #text_content == 0 then + for _, part in ipairs(text_parts) do + if not part:is_html() then + local words = part:get_words('norm') + if words then + text_content = words + end + break + end + end + end + + -- Process URLs + local function process_url(url) + local clean_url = url:get_host() + local path = url:get_path() + if path and path ~= "/" then + clean_url = clean_url .. path + end + return string.format('https://%s', clean_url) + end + + for _, url in ipairs(task:get_urls(true)) do + table.insert(urls, process_url(url)) + end + + -- Process emails + local function process_email(email) + return string.format('nobody@%s', email.domain or 'example.com') + end + + for _, email in ipairs(task:get_emails()) do + table.insert(emails, process_email(email)) + end + + -- Construct new message + table.insert(text_content, '\nurls: ') + table.insert(text_content, table.concat(urls, ', ')) + table.insert(text_content, '\nemails: ') + table.insert(text_content, table.concat(emails, ', ')) + local new_text = table.concat(text_content, ' ') + + -- Create new message structure + local boundaries = {} + local cur_boundary = '--XXX' + boundaries[1] = cur_boundary + + -- Add headers + out[#out + 1] = { + string.format('Content-Type: multipart/mixed; boundary="%s"', cur_boundary), + true + } + for _, hdr in ipairs(modified_headers) do + out[#out + 1] = { + string.format('%s: %s', hdr.name, hdr.value), + true + } + end + out[#out + 1] = { '', true } + + -- Add text part + out[#out + 1] = { + string.format('--%s', cur_boundary), + true + } + out[#out + 1] = { + 'Content-Type: text/plain; charset=utf-8\nContent-Transfer-Encoding: quoted-printable', + true + } + out[#out + 1] = { '', true } + out[#out + 1] = { + rspamd_util.encode_qp(new_text, 76, task:get_newlines_type()), + false + } + + -- Close boundaries + out[#out + 1] = { + string.format('--%s--', cur_boundary), + true + } + + state.out = out + state.need_rewrite_ct = true + state.new_ct = { + type = 'multipart', + subtype = 'mixed' + } + + return state +end + return exports -- cgit v1.2.3 From 8d48eb47dfcc2fef3ad4ce7fe166a1cf36d3ffe6 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 29 Nov 2024 13:40:11 +0000 Subject: [Project] Add tool to rspamadm --- lualib/lua_mime.lua | 41 +++++++++++++++++++++++------------------ lualib/rspamadm/mime.lua | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index 167939189..ce14a49f3 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -951,21 +951,26 @@ exports.anonymize_message = function(task, settings) -- Process headers local modified_headers = {} - for name, processor in pairs(header_processors) do - local hdrs = task:get_header_full(name, true) - if hdrs then - for _, hdr in ipairs(hdrs) do - local new_value = processor(hdr) - if new_value then - table.insert(modified_headers, { - name = name, - value = new_value - }) - end + local function process_hdr(name, hdr) + local processor = header_processors[name:lower()] + if processor then + local new_value = processor(hdr) + if new_value then + table.insert(modified_headers, { + name = name, + value = new_value + }) end + else + table.insert(modified_headers, { + name = name, + value = hdr.value, + }) end end + task:headers_foreach(process_hdr, { full = true }) + -- Create new text content local text_content = {} local urls = {} @@ -974,12 +979,14 @@ exports.anonymize_message = function(task, settings) -- Extract text content, URLs and emails local text_parts = task:get_text_parts() for _, part in ipairs(text_parts) do - if part:is_html() then - local words = part:get_words('norm') - if words then - text_content = words + if not part:get_mimepart():is_attachment() then + if part:is_html() then + local words = part:get_words('norm') + if words then + text_content = words + end + break -- Use only first HTML part end - break -- Use only first HTML part end end @@ -1027,9 +1034,7 @@ exports.anonymize_message = function(task, settings) local new_text = table.concat(text_content, ' ') -- Create new message structure - local boundaries = {} local cur_boundary = '--XXX' - boundaries[1] = cur_boundary -- Add headers out[#out + 1] = { diff --git a/lualib/rspamadm/mime.lua b/lualib/rspamadm/mime.lua index 7750c5a78..617f57a77 100644 --- a/lualib/rspamadm/mime.lua +++ b/lualib/rspamadm/mime.lua @@ -179,6 +179,13 @@ strip:option "--max-text-size" :convert(tonumber) :default(math.huge) +local anonymize = parser:command "anonymize" + :description "Try to remove sensitive information from a message" +anonymize:argument "file" + :description "File to process" + :argname "" + :args "+" + local sign = parser:command "sign" :description "Performs DKIM signing" sign:argument "file" @@ -968,6 +975,41 @@ local function strip_handler(opts) end end +local function anonymize_handler(opts) + load_config(opts) + rspamd_url.init(rspamd_config:get_tld_path()) + + for _, fname in ipairs(opts.file) do + local task = load_task(opts, fname) + local newline_s = newline(task) + + local rewrite = lua_mime.anonymize_message(task, opts) or {} + + for _, o in ipairs(rewrite.out) do + if type(o) == 'string' then + io.write(o) + io.write(newline_s) + elseif type(o) == 'table' then + io.flush() + if type(o[1]) == 'string' then + io.write(o[1]) + else + o[1]:save_in_file(1) + end + + if o[2] then + io.write(newline_s) + end + else + o:save_in_file(1) + io.write(newline_s) + end + end + + task:destroy() -- No automatic dtor + end +end + -- Strips directories and .extensions (if present) from a filepath local function filename_only(filepath) local filename = filepath:match(".*%/([^%.]+)") @@ -1076,6 +1118,8 @@ local function handler(args) sign_handler(opts) elseif command == 'dump' then dump_handler(opts) + elseif command == 'anonymize' then + anonymize_handler(opts) else parser:error('command %s is not implemented', command) end -- cgit v1.2.3 From f0e20efae58e69ed49980cd186b0623ee3615ee4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 30 Nov 2024 14:56:37 +0000 Subject: [Feature] Unify displayed part selection --- lualib/lua_mime.lua | 86 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index ce14a49f3..2cbdda804 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -897,6 +897,65 @@ exports.remove_attachments = function(task, settings) return state end +--[[[ +-- @function lua_mime.get_displayed_text_part(task) +-- Returns the most relevant displayed content from an email +-- @param {task} task Rspamd task object +-- @return {text_part} a selected part +--]] +exports.get_displayed_text_part = function(task) + local text_parts = task:get_text_parts() + if not text_parts then + return nil + end + + local html_part + local text_part + local html_attachment + + -- First pass: categorize parts + for _, part in ipairs(text_parts) do + local mp = part:get_mimepart() + if not mp:is_attachment() then + if part:is_html() then + html_part = part + else + text_part = text_part or part + end + else + -- Check for HTML attachments + if part:is_html() and mp:get_length() < 102400 then + -- 100KB limit, as long ones are likely not something that we should check + html_attachment = part + end + end + end + + -- Decision logic + if html_part then + local word_count = html_part:get_words_count() or 0 + if word_count >= 10 then + -- Arbitrary minimum threshold, e.g. I believe it's minimum sane + return html_part + end + end + + if text_part then + local word_count = html_part:get_words_count() or 0 + if word_count >= 10 then + -- Arbitrary minimum threshold, e.g. I believe it's minimum sane + return text_part + end + end + + if html_attachment then + return html_attachment + end + + -- Only short parts, but still let's try our best + return html_part or text_part +end + --[[[ -- @function lua_mime.anonymize_message(task, settings) -- Anonymizes message content by replacing sensitive data @@ -976,31 +1035,10 @@ exports.anonymize_message = function(task, settings) local urls = {} local emails = {} - -- Extract text content, URLs and emails - local text_parts = task:get_text_parts() - for _, part in ipairs(text_parts) do - if not part:get_mimepart():is_attachment() then - if part:is_html() then - local words = part:get_words('norm') - if words then - text_content = words - end - break -- Use only first HTML part - end - end - end + local sel_part = exports.get_displayed_text_part(task) - -- If no HTML parts found, use first text part - if #text_content == 0 then - for _, part in ipairs(text_parts) do - if not part:is_html() then - local words = part:get_words('norm') - if words then - text_content = words - end - break - end - end + if sel_part then + text_content = sel_part:get_words('norm') end -- Process URLs -- cgit v1.2.3 From 11e153c0147690a9649966263b393ec1ad750c1b Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 2 Dec 2024 13:39:22 +0000 Subject: [Minor] Some more adjustments --- lualib/lua_mime.lua | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index 2cbdda804..75e5d11f8 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -966,6 +966,9 @@ end -- @return {table} modified message state similar to other modification functions --]] exports.anonymize_message = function(task, settings) + local rspamd_re = require "rspamd_regexp" + -- We exclude words with digits, currency symbols and so on + local exclude_words_re = rspamd_re.create_cached([[/^(?:\d+|\d+\D{1,3}|\p{Sc}.*|(\+?\d{1,3}[\s\-]?)?)$/u]]) local newline_s = newline(task) local state = { newline_s = newline_s @@ -1039,6 +1042,11 @@ exports.anonymize_message = function(task, settings) if sel_part then text_content = sel_part:get_words('norm') + for i, w in ipairs(text_content) do + if exclude_words_re:match(w) then + text_content[i] = string.rep('x', #w) + end + end end -- Process URLs @@ -1080,10 +1088,12 @@ exports.anonymize_message = function(task, settings) true } for _, hdr in ipairs(modified_headers) do - out[#out + 1] = { - string.format('%s: %s', hdr.name, hdr.value), - true - } + if hdr.name ~= 'Content-Type' then + out[#out + 1] = { + string.format('%s: %s', hdr.name, hdr.value), + true + } + end end out[#out + 1] = { '', true } @@ -1099,7 +1109,7 @@ exports.anonymize_message = function(task, settings) out[#out + 1] = { '', true } out[#out + 1] = { rspamd_util.encode_qp(new_text, 76, task:get_newlines_type()), - false + true } -- Close boundaries -- cgit v1.2.3 From 53f73e02a4daf884a31a69e3f55ad1328c3bd1d3 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 5 Dec 2024 11:43:31 +0000 Subject: [Minor] Fix misprint --- lualib/lua_mime.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index 75e5d11f8..bb65dc9ba 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -941,7 +941,7 @@ exports.get_displayed_text_part = function(task) end if text_part then - local word_count = html_part:get_words_count() or 0 + local word_count = text_part:get_words_count() or 0 if word_count >= 10 then -- Arbitrary minimum threshold, e.g. I believe it's minimum sane return text_part -- cgit v1.2.3 From 14b7742ea4de1bc258bf4031da637d9caf27c23f Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 7 Dec 2024 16:48:55 +0000 Subject: [Feature] Add include/exclude logic for headers --- lualib/lua_mime.lua | 66 ++++++++++++++++++++++++++++++++++++++++-------- lualib/rspamadm/mime.lua | 8 ++++++ 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index bb65dc9ba..de5c6db33 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -1012,22 +1012,68 @@ exports.anonymize_message = function(task, settings) end -- Process headers + local all_include = true + local all_exclude = false + + -- Convert strings list to a list of globs where possible + local function process_exceptions_list(list) + if list and #list > 0 then + for i, hdr in ipairs(list) do + local gl = rspamd_re.import_glob(hdr, 'i') + if gl then + list[i] = gl + end + end + return true + end + end + + local function maybe_match_header(hdr, list) + if not list then + return false + end + for _, expr in ipairs(list) do + if type(expr) == 'userdata' then + if expr:match(hdr) then + return true + end + else + if expr:lower() == hdr:lower() then + return true + end + end + end + return false + end + + if process_exceptions_list(settings.include_header) then + all_include = false + all_exclude = true + end + if process_exceptions_list(settings.exclude_header) then + all_exclude = true + end + local modified_headers = {} local function process_hdr(name, hdr) - local processor = header_processors[name:lower()] - if processor then - local new_value = processor(hdr) - if new_value then + local include_hdr = (all_include and not maybe_match_header(name, settings.exclude_header)) or + (all_exclude and maybe_match_header(name, settings.include_header)) + if include_hdr then + local processor = header_processors[name:lower()] + if processor then + local new_value = processor(hdr) + if new_value then + table.insert(modified_headers, { + name = name, + value = new_value + }) + end + else table.insert(modified_headers, { name = name, - value = new_value + value = hdr.value }) end - else - table.insert(modified_headers, { - name = name, - value = hdr.value, - }) end end diff --git a/lualib/rspamadm/mime.lua b/lualib/rspamadm/mime.lua index 617f57a77..f8c7fc4f7 100644 --- a/lualib/rspamadm/mime.lua +++ b/lualib/rspamadm/mime.lua @@ -185,6 +185,14 @@ anonymize:argument "file" :description "File to process" :argname "" :args "+" +anonymize:option "--exclude-header -X" + :description "Exclude specific headers from anonymization" + :argname "
" + :count "*" +anonymize:option "--include-header -I" + :description "Include specific headers from anonymization" + :argname "
" + :count "*" local sign = parser:command "sign" :description "Performs DKIM signing" -- cgit v1.2.3 From 2f181c45db9c0dd324ffd7c1873d1d752761377b Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 9 Dec 2024 15:23:05 +0000 Subject: [Minor] Fix urls path issue --- lualib/lua_mime.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index de5c6db33..d6a8a70bf 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -1100,7 +1100,7 @@ exports.anonymize_message = function(task, settings) local clean_url = url:get_host() local path = url:get_path() if path and path ~= "/" then - clean_url = clean_url .. path + clean_url = string.format("%s/%s", clean_url, path) end return string.format('https://%s', clean_url) end -- cgit v1.2.3 From 717cfba84f7a80e0d164a03a2bf45b06b2f07075 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 9 Dec 2024 15:25:26 +0000 Subject: [Minor] Make urls and emails unique --- lualib/lua_mime.lua | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index d6a8a70bf..24d02c304 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -967,6 +967,7 @@ end --]] exports.anonymize_message = function(task, settings) local rspamd_re = require "rspamd_regexp" + local lua_util = require "lua_util" -- We exclude words with digits, currency symbols and so on local exclude_words_re = rspamd_re.create_cached([[/^(?:\d+|\d+\D{1,3}|\p{Sc}.*|(\+?\d{1,3}[\s\-]?)?)$/u]]) local newline_s = newline(task) @@ -1106,7 +1107,7 @@ exports.anonymize_message = function(task, settings) end for _, url in ipairs(task:get_urls(true)) do - table.insert(urls, process_url(url)) + urls[process_url(url)] = true end -- Process emails @@ -1115,14 +1116,14 @@ exports.anonymize_message = function(task, settings) end for _, email in ipairs(task:get_emails()) do - table.insert(emails, process_email(email)) + emails[process_email(email)] = true end -- Construct new message table.insert(text_content, '\nurls: ') - table.insert(text_content, table.concat(urls, ', ')) + table.insert(text_content, table.concat(lua_util.keys(urls), ', ')) table.insert(text_content, '\nemails: ') - table.insert(text_content, table.concat(emails, ', ')) + table.insert(text_content, table.concat(lua_util.keys(emails), ', ')) local new_text = table.concat(text_content, ' ') -- Create new message structure -- cgit v1.2.3 From ad9a444a525e51290a1845927e84188d2e9f18bf Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 10 Dec 2024 11:36:41 +0000 Subject: [Minor] Some adjustments --- lualib/lua_mime.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lualib/lua_mime.lua b/lualib/lua_mime.lua index 24d02c304..1135f2b63 100644 --- a/lualib/lua_mime.lua +++ b/lualib/lua_mime.lua @@ -969,7 +969,7 @@ exports.anonymize_message = function(task, settings) local rspamd_re = require "rspamd_regexp" local lua_util = require "lua_util" -- We exclude words with digits, currency symbols and so on - local exclude_words_re = rspamd_re.create_cached([[/^(?:\d+|\d+\D{1,3}|\p{Sc}.*|(\+?\d{1,3}[\s\-]?)?)$/u]]) + local exclude_words_re = rspamd_re.create_cached([=[/^(?:\d+|\d+\D{1,3}|\p{Sc}.*|(\+?\d{1,3}[\s\-]?)?)$/(:?^[[:alpha:]]*\d{4,}.*$)/u]=]) local newline_s = newline(task) local state = { newline_s = newline_s @@ -1120,9 +1120,9 @@ exports.anonymize_message = function(task, settings) end -- Construct new message - table.insert(text_content, '\nurls: ') + table.insert(text_content, '\nurls:') table.insert(text_content, table.concat(lua_util.keys(urls), ', ')) - table.insert(text_content, '\nemails: ') + table.insert(text_content, '\nemails:') table.insert(text_content, table.concat(lua_util.keys(emails), ', ')) local new_text = table.concat(text_content, ' ') -- cgit v1.2.3