summary | refs | log | tree | commit | diff | stats
path: root/lualib/lua_content/pdf.lua
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-01-20 10:11:30 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-01-20 10:11:30 +0000
commit: 4822d25fb38a5e40c0437df9643bfec23d249c3d (patch)
tree: 55f565f44bdb0690169b79250d3b6a05062eb64d /lualib/lua_content/pdf.lua
parent: e585d5c85ba970bebd884ebaec35d23cea0581fb (diff)
download: rspamd-4822d25fb38a5e40c0437df9643bfec23d249c3d.tar.gz
download: rspamd-4822d25fb38a5e40c0437df9643bfec23d249c3d.zip
[Project] Lua_content: Rework JS parsing
Diffstat (limited to 'lualib/lua_content/pdf.lua')
-rw-r--r-- lualib/lua_content/pdf.lua 114
1 files changed, 53 insertions, 61 deletions
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index f1f9badda..e64ac86e4 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -35,18 +35,6 @@ local pdf_patterns = {
[[\ntrailer\r?\n]]
}
},
- javascript = {
- patterns = {
- [[\/JS(?:[\s/><])]],
- [[\/JavaScript(?:[\s/><])]],
- }
- },
- openaction = {
- patterns = {
- [[\/OpenAction(?:[\s/><])]],
- [[\/AA(?:[\s/><])]],
- }
- },
suspicious = {
patterns = {
[[netsh\s]],
@@ -471,13 +459,14 @@ local function parse_object_grammar(obj, task, pdf)
obj.major, obj.minor, obj_or_err)
else
-- Direct object
- pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
if type(obj_or_err) == 'table' then
obj.dict = obj_or_err
obj.uncompressed = obj_or_err
lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
obj.major, obj.minor, obj_or_err)
+ pdf.ref[obj_ref(obj.major, obj.minor)] = obj
else
+ pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
obj.dict = {}
obj.uncompressed = obj_or_err
end
@@ -511,9 +500,12 @@ local function process_font(task, pdf, font, fname)
end
end
--- Extract interesting stuff, e.g. javascript
+-- Forward declaration
+local process_dict
+
+-- Extract interesting stuff from /Action, e.g. javascript
local function process_action(task, pdf, obj)
- if obj.dict and obj.dict.JS then
+ if not obj.js and (obj.dict and obj.dict.JS) then
local js = maybe_dereference_object(obj.dict.JS, pdf, task)
if js then
@@ -529,19 +521,21 @@ local function process_action(task, pdf, obj)
end
if type(js) == 'string' then
- lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
- obj.major, obj.minor, js)
if not pdf.scripts then
pdf.scripts = {}
end
- pdf.scripts[#pdf.scripts + 1] = rspamd_text.fromstring(js)
- elseif type(js) == 'userdata' then
+ obj.js = rspamd_text.fromstring(js)
+ pdf.scripts[#pdf.scripts + 1] = obj.js
lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
- obj.major, obj.minor, js)
+ obj.major, obj.minor, obj.js)
+ elseif type(js) == 'userdata' then
if not pdf.scripts then
pdf.scripts = {}
end
+ obj.js = js
pdf.scripts[#pdf.scripts + 1] = js
+ lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
+ obj.major, obj.minor, js)
else
lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
obj.major, obj.minor, js)
@@ -553,40 +547,46 @@ local function process_action(task, pdf, obj)
end
end
-local function process_dict(task, pdf, obj, dict)
+-- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
+local function process_catalog(task, pdf, obj)
+ if obj.dict and obj.dict.OpenAction then
+ local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
+
+ if action and type(action) == 'table' then
+ -- This also processes action js (if not already processed)
+ process_dict(task, pdf, action, action.dict)
+ if action.js then
+ lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
+ obj.major, obj.minor, action.js)
+ pdf.openaction = action.js
+ else
+ lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
+ obj.major, obj.minor, action)
+ end
+ else
+ lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
+ obj.major, obj.minor, obj.dict.OpenAction, action)
+ end
+ else
+ lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
+ obj.major, obj.minor)
+ end
+end
+
+process_dict = function(task, pdf, obj, dict)
if not obj.type and type(dict) == 'table' then
if dict.Type and type(dict.Type) == 'string' then
-- Common stuff
obj.type = dict.Type
- else
- -- Fucking pdf, we need to guess a type (or ignore that crap)...
- lua_util.debugm(N, task, 'no explicit type for %s:%s',
- obj.major, obj.minor)
- if dict.Parent then
- -- Guess by parent
- local parent = dereference_object(dict.Parent, pdf)
-
- if parent and parent.type then
- if parent.type == 'Catalog' then
- obj.type = 'Pages'
- elseif parent.type == 'Pages' then
- obj.type = 'Page'
- end
-
- if obj.type then
- lua_util.debugm(N, task, 'guessed type for %s:%s (%s) from parent %s:%s (%s)',
- obj.major, obj.minor, obj.type, parent.major, parent.minor, parent.type)
-
- end
- end
- end
end
if not obj.type then
+ lua_util.debugm(N, task, 'no type for %s:%s',
+ obj.major, obj.minor)
return
end
- lua_util.debugm(N, task, 'process stream dictionary for object %s:%s -> %s',
+ lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
obj.major, obj.minor, obj.type)
local contents = dict.Contents
if contents and type(contents) == 'table' then
@@ -617,6 +617,8 @@ local function process_dict(task, pdf, obj, dict)
rspamd_logger.infox(task, 'cannot parse resources from pdf: %s returned by grammar',
obj.resources)
obj.resources = {}
+ elseif obj.resources.dict then
+ obj.resources = obj.resources.dict
end
else
-- Fucking pdf: we need to inherit from parent
@@ -648,19 +650,17 @@ local function process_dict(task, pdf, obj, dict)
if config.text_extraction then
process_font(task, pdf, font, k)
+ lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
+ k, obj.major, obj.minor, font)
end
-
- lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
- k, obj.major, obj.minor, font)
end
end
end
- lua_util.debugm(N, task, 'found resources for object %s:%s: %s',
- obj.major, obj.minor, obj.resources)
+ lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
+ obj.major, obj.minor, obj.type, obj.resources)
if obj.type == 'FontDescriptor' then
-
lua_util.debugm(N, task, "obj %s:%s is a font descriptor",
obj.major, obj.minor)
@@ -687,8 +687,10 @@ local function process_dict(task, pdf, obj, dict)
end
elseif obj.type == 'Action' then
process_action(task, pdf, obj)
+ elseif obj.type == 'Catalog' then
+ process_catalog(task, pdf, obj)
end
- end
+ end -- Already processed dict (obj.type is not empty)
end
-- This function is intended to unpack objects from ObjStm crappy structure
@@ -1098,16 +1100,6 @@ processors.trailer = function(input, task, positions, output)
end
end
-processors.javascript = function(_, task, _, output)
- lua_util.debugm(N, task, "pdf: found javascript tag")
- output.javascript = true
-end
-
-processors.openaction = function(_, task, _, output)
- lua_util.debugm(N, task, "pdf: found openaction tag")
- output.openaction = true
-end
-
processors.suspicious = function(_, task, _, output)
lua_util.debugm(N, task, "pdf: found a suspicious pattern")
output.suspicious = true