summary | refs | log | tree | commit | diff | stats
path: root/lualib/lua_content/pdf.lua
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-01-22 11:40:33 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-01-22 11:40:33 +0000
commit: d5f70b34e0a4f01722ea1020d1e7dac03b134bf2 (patch)
tree: 92a3219c452c677074912b1cb7310a118425668a /lualib/lua_content/pdf.lua
parent: f3633a995f251bba75faddc25097c92cb4d299af (diff)
download: rspamd-d5f70b34e0a4f01722ea1020d1e7dac03b134bf2.tar.gz
rspamd-d5f70b34e0a4f01722ea1020d1e7dac03b134bf2.zip
[Project] Lua_content: Add preliminary support of fuzzy hashes from PDF scripts
Diffstat (limited to 'lualib/lua_content/pdf.lua')
-rw-r--r-- lualib/lua_content/pdf.lua 38
1 files changed, 36 insertions, 2 deletions
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 53625065a..d4ad892e0 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -113,6 +113,9 @@ local config = {
text_extraction = false, -- NYI feature
url_extraction = true,
enabled = true,
+ js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
+ min_js_fuzzy = 32, -- Minimum size of js to be considered as a fuzzy
+ openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
}
-- Used to process patterns found in PDF
@@ -529,6 +532,7 @@ local function process_javascript(task, pdf, js)
local njs = {
data = js,
hash = rspamd_util.encode_base32(bin_hash),
+ bin_hash = bin_hash,
}
pdf.scripts[bin_hash] = njs
return njs
@@ -555,7 +559,7 @@ local function process_action(task, pdf, obj)
if js then
obj.js = js
lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
- obj.major, obj.minor, obj.js)
+ obj.major, obj.minor, obj.js.data)
else
lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
obj.major, obj.minor, js)
@@ -759,7 +763,7 @@ process_dict = function(task, pdf, obj, dict)
if js then
obj.js = js
lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
- obj.major, obj.minor, obj.js)
+ obj.major, obj.minor, obj.js.data)
else
lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
obj.major, obj.minor, js)
@@ -1154,6 +1158,36 @@ local function process_pdf(input, _, task)
if config.url_extraction then
search_urls(task, pdf_output)
end
+
+ if config.js_fuzzy and pdf_output.scripts then
+ pdf_output.fuzzy_hashes = {}
+ if config.openaction_fuzzy_only then
+ -- OpenAction only
+ if pdf_output.openaction and pdf_output.openaction.bin_hash then
+ if config.min_js_fuzzy and #pdf_output.openaction.data >= config.min_js_fuzzy then
+ lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
+ pdf_output.openaction.hash)
+ table.insert(pdf_output.fuzzy_hashes, pdf_output.openaction.bin_hash)
+ else
+ lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
+ pdf_output.openaction.hash, #pdf_output.openaction.data)
+ end
+ end
+ else
+ -- All hashes
+ for h,sc in pairs(pdf_output.scripts) do
+ if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then
+ lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
+ sc.hash)
+ table.insert(pdf_output.fuzzy_hashes, h)
+ else
+ lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
+ sc.hash, #sc.data)
+ end
+ end
+
+ end
+ end
else
pdf_output.flags.no_objects = true
end