diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-06-25 16:28:43 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-06-25 16:28:43 +0100 |
commit | 592ce3ab0a40fe016b0d90424d8eacc1e3025412 (patch) | |
tree | 249cdb4b5d6d2117c697edcf8ae33966a25fd21c | |
parent | 3c353232c9f9c56efa5b9e7f58402111cc3524b8 (diff) | |
download | rspamd-592ce3ab0a40fe016b0d90424d8eacc1e3025412.tar.gz rspamd-592ce3ab0a40fe016b0d90424d8eacc1e3025412.zip |
[Feature] Allow to extract words in `rspamadm mime`
-rw-r--r-- | lualib/rspamadm/mime.lua | 24 |
1 file changed, 22 insertions, 2 deletions
diff --git a/lualib/rspamadm/mime.lua b/lualib/rspamadm/mime.lua index 300dd43ae..ceb2894c5 100644 --- a/lualib/rspamadm/mime.lua +++ b/lualib/rspamadm/mime.lua @@ -72,6 +72,8 @@ extract:option "-o --output" decoded_utf = "raw_utf" } :default "content" +extract:flag "-w --words" + :description "Extracts words" local stat = parser:command "stat st s" @@ -185,19 +187,37 @@ end local function extract_handler(opts) local out_elts = {} + + if opts.words then + -- Enable stemming + rspamd_config:init_subsystem('langdet') + end + for _,fname in ipairs(opts.file) do local task = load_task(opts, fname) out_elts[fname] = {} + if not opts.text and not opts.html then + parser:error('please select html or text part to be extracted') + end + if opts.text or opts.html then local tp = task:get_text_parts() or {} for _,part in ipairs(tp) do local how = opts.output if opts.text and not part:is_html() then - table.insert(out_elts[fname], tostring(part:get_content(how))) + if opts.words then + table.insert(out_elts[fname], table.concat(part:get_words(), ' ')) + else + table.insert(out_elts[fname], tostring(part:get_content(how))) + end elseif opts.html and part:is_html() then - table.insert(out_elts[fname], tostring(part:get_content(how))) + if opts.words then + table.insert(out_elts[fname], table.concat(part:get_words(), ' ')) + else + table.insert(out_elts[fname], tostring(part:get_content(how))) + end end end end |