12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207 |
- --[[
- Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ]]--
-
- --[[[
- -- @module lua_content/pdf
- -- This module contains some heuristics for PDF files
- --]]
-
- local rspamd_trie = require "rspamd_trie"
- local rspamd_util = require "rspamd_util"
- local rspamd_text = require "rspamd_text"
- local rspamd_url = require "rspamd_url"
- local rspamd_logger = require "rspamd_logger"
- local bit = require "bit"
- local N = "lua_content"
- local lua_util = require "lua_util"
- local rspamd_regexp = require "rspamd_regexp"
- local lpeg = require "lpeg"
-- Patterns matched over the whole raw PDF input through a single
-- multi-pattern trie (compiled in compile_tries). Match positions are fed to
-- the processors table to slice the file into objects and streams.
local pdf_patterns = {
  -- End-of-file trailer dictionary marker
  trailer = {
    patterns = {
      [[\ntrailer\r?\n]]
    }
  },
  -- Heuristic indicators of potentially malicious embedded content
  suspicious = {
    patterns = {
      [[netsh\s]],
      [[echo\s]],
      [[\/[A-Za-z]*#\d\d(?:[#A-Za-z<>/\s])]], -- Hex encode obfuscation
    }
  },
  -- `<major> <minor> obj` object headers
  start_object = {
    patterns = {
      [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=]
    }
  },
  end_object = {
    patterns = {
      [=[endobj[\r\n]]=]
    }
  },
  -- `stream` keyword following a dictionary close (`>>`)
  start_stream = {
    patterns = {
      [=[>\s*stream[\r\n]]=],
    }
  },
  end_stream = {
    patterns = {
      [=[endstream[\r\n]]=]
    }
  }
}
-
-- Delimiters of text blocks inside content streams: BT ... ET
-- (used by search_text when text extraction is enabled)
local pdf_text_patterns = {
  start = {
    patterns = {
      [[\sBT\s]]
    }
  },
  stop = {
    patterns = {
      [[\sET\b]]
    }
  }
}
-
-- Delimiters of /ToUnicode CMap sections (used for text extraction, NYI).
-- Per the PDF specification the operator pairs are
-- beginbfchar/endbfchar and beginbfrange/endbfrange.
local pdf_cmap_patterns = {
  start = {
    patterns = {
      [[\d\s+beginbfchar\s]],
      [[\d\s+beginbfrange\s]]
    }
  },
  stop = {
    patterns = {
      [[\sendbfrange\b]],
      -- Fixed: previous pattern `\sendbchar\b` matched a non-existent
      -- operator and therefore never terminated a bfchar section
      [[\sendbfchar\b]]
    }
  }
}
-
-- index[n] -> (layout matches compile_pats in compile_tries, which stores
-- {what, data, pat, i}; consumers read index[1] as the processor key and
-- index[4] as the local pattern number)
-- t[1] - key in patterns table (processor name)
-- t[2] - value in patterns table (the pattern group table)
-- t[3] - pattern string
-- t[4] - local pattern index within its group
local pdf_indexes = {}
local pdf_text_indexes = {}
local pdf_cmap_indexes = {}

-- Compiled tries; filled exactly once by compile_tries
local pdf_trie
local pdf_text_trie
local pdf_cmap_trie

-- Module table returned to callers
local exports = {}

-- Defaults; may be overridden by `lua_content.pdf` options (see config_module)
local config = {
  max_extraction_size = 512 * 1024,
  max_processing_size = 32 * 1024,
  text_extraction = false, -- NYI feature
  url_extraction = true,
  enabled = true,
  js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
  min_js_fuzzy = 32, -- Minimum size of js to be considered as a fuzzy
  openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
}

-- Used to process patterns found in PDF
-- positions for functional processors should be a iter/table from trie matcher in form
---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
---- pat_idxn is pattern index and n1 ... nn are match positions
local processors = {}
-- PDF objects outer grammar in LPEG style (performing table captures)
local pdf_outer_grammar
local pdf_text_grammar

-- Used to match objects (captures major and minor object numbers)
local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=])
-
--- Merge user options from the `lua_content.pdf` configuration section
-- into the module defaults (in-place replacement of `config`).
local function config_module()
  local all_opts = rspamd_config:get_all_opt('lua_content')
  local pdf_opts = all_opts and all_opts.pdf

  if pdf_opts then
    config = lua_util.override_defaults(config, pdf_opts)
  end
end
-
--- Compile the three pattern tables into multi-pattern tries (idempotent).
-- Also fills the parallel index tables so a trie match number can be
-- mapped back to its pattern group and local pattern index.
local function compile_tries()
  local default_flags = bit.bor(rspamd_trie.flags.re,
      rspamd_trie.flags.dot_all,
      rspamd_trie.flags.no_start)

  -- Flatten a patterns table into a string list plus a parallel index
  -- table, then compile everything into one trie
  local function compile_pats(patterns, indexes, compile_flags)
    local strs = {}

    for what, data in pairs(patterns) do
      for i, pat in ipairs(data.patterns) do
        strs[#strs + 1] = pat
        indexes[#indexes + 1] = { what, data, pat, i }
      end
    end

    return rspamd_trie.create(strs, compile_flags or default_flags)
  end

  -- Tries are userdata (never false), so `or` is a safe lazy-init here
  pdf_trie = pdf_trie or compile_pats(pdf_patterns, pdf_indexes)
  pdf_text_trie = pdf_text_trie or compile_pats(pdf_text_patterns, pdf_text_indexes)
  pdf_cmap_trie = pdf_cmap_trie or compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
end
-
-- Returns a table with generic grammar elements for PDF:
-- ws, comment, number, str, hexstr, id, boolean and ref lpeg patterns
-- shared by the outer, text and compound-object grammars.
local function generic_grammar_elts()
  local P = lpeg.P
  local R = lpeg.R
  local S = lpeg.S
  local V = lpeg.V
  local C = lpeg.C
  local D = R'09' -- Digits

  local grammar_elts = {}

  -- Helper functions
  -- Decode the payload of a hex string (the part between `<` and `>`)
  local function pdf_hexstring_unescape(s)
    local function ue(cc)
      return string.char(tonumber(cc, 16))
    end
    if #s % 2 == 0 then
      -- Sane hex string
      return s:gsub('..', ue)
    end

    -- WTF hex string
    -- Append '0' to it and unescape...
    return s:sub(1, #s - 1):gsub('..' , ue) .. (s:sub(#s) .. '0'):gsub('..' , ue)
  end

  -- Decode a PDF literal string: \r, \n, generic \X escapes and octal \ddd
  local function pdf_string_unescape(s)
    local function ue_single(cc)
      if cc == '\\r' then
        return '\r'
      elseif cc == '\\n' then
        return '\n'
      else
        -- Drop the backslash, keep the escaped character.
        -- BUGFIX: was `cc:gsub(2, 2)` which substitutes "2" with "2" and
        -- returns the input unchanged, leaving the backslash in place.
        return cc:sub(2, 2)
      end
    end
    -- simple unescape \char
    s = s:gsub('\\[^%d]', ue_single)
    -- unescape octal
    local function ue_octal(cc)
      -- Replace unknown stuff with '?'
      return string.char(tonumber(cc:sub(2), 8) or 63)
    end
    s = s:gsub('\\%d%d?%d?', ue_octal)

    return s
  end

  -- Decode `#XX` sequences in PDF name objects
  -- NOTE(review): the pattern `#%d%d` only matches decimal digits, so
  -- names escaped with hex letters (e.g. `#2f`) are not decoded -- confirm
  -- whether that is intentional
  local function pdf_id_unescape(s)
    return (s:gsub('#%d%d', function (cc)
      return string.char(tonumber(cc:sub(2), 16))
    end))
  end

  local delim = S'()<>[]{}/%'
  grammar_elts.ws = S'\0 \r\n\t\f'
  local hex = R'af' + R'AF' + D
  -- Comments.
  local eol = P'\r\n' + '\n'
  local line = (1 - S'\r\n\f')^0 * eol^-1
  grammar_elts.comment = P'%' * line

  -- Numbers.
  local sign = S'+-'^-1
  local decimal = D^1
  local float = D^1 * P'.' * D^0 + P'.' * D^1
  grammar_elts.number = C(sign * (float + decimal)) / tonumber

  -- String
  grammar_elts.str = P{ "(" * C(((1 - S"()\\") + (P '\\' * 1) + V(1))^0) / pdf_string_unescape * ")" }
  grammar_elts.hexstr = P{"<" * C(hex^0) / pdf_hexstring_unescape * ">"}

  -- Identifier
  grammar_elts.id = P{'/' * C((1-(delim + grammar_elts.ws))^1) / pdf_id_unescape}

  -- Booleans (who care about them?)
  grammar_elts.boolean = C(P("true") + P("false"))

  -- Stupid references: `<num> <num> R`, captured as {'%REF%', major, minor}
  grammar_elts.ref = lpeg.Ct{lpeg.Cc("%REF%") * C(D^1) * " " * C(D^1) * " " * "R"}

  return grammar_elts
end
-
-
- -- Generates a grammar to parse outer elements (external objects in PDF notation)
-- Generates a grammar to parse outer elements (external objects in PDF notation).
-- Produces table captures: dictionaries become Lua tables keyed by /Name,
-- arrays become sequence tables, references become {'%REF%', major, minor}.
local function gen_outer_grammar()
  local V = lpeg.V
  local gen = generic_grammar_elts()

  return lpeg.P{
    "EXPR";
    EXPR = gen.ws^0 * V("ELT")^0 * gen.ws^0,
    ELT = V("ARRAY") + V("DICT") + V("ATOM"),
    ATOM = gen.ws^0 * (gen.comment + gen.boolean + gen.ref +
        gen.number + V("STRING") + gen.id) * gen.ws^0,
    -- Fold captured key/value pairs into a fresh table via rawset
    DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
    KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ELT") * gen.ws^0),
    ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ELT")^0) * gen.ws^0 * "]",
    STRING = lpeg.P{gen.str + gen.hexstr},
  }
end
-
- -- Graphic state in PDF
-- Graphic state in PDF
-- Graphics operators taking no operands (state save/restore, path close,
-- path-painting operators)
local function gen_graphics_unary()
  local P = lpeg.P
  local S = lpeg.S

  return P("q") + P("Q") + P("h")
      + S("WSsFfBb") * P("*")^0 + P("n")

end
-- Graphics operators taking one operand
-- NOTE(review): P("M") appears redundant since 'M' is already in the
-- S("gGwJjMi") set -- harmless, ordered choice never reaches it
local function gen_graphics_binary()
  local P = lpeg.P
  local S = lpeg.S

  return S("gGwJjMi") +
      P("M") + P("ri") + P("gs") +
      P("CS") + P("cs") + P("sh")
end
-- Graphics operators taking two operands (dash pattern, line segments)
local function gen_graphics_ternary()
  local P = lpeg.P
  local S = lpeg.S

  return P("d") + P("m") + S("lm")
end
-- Graphics operators taking a variable number of numeric operands
-- (colour setting, rectangles, Bezier curves)
local function gen_graphics_nary()
  local P = lpeg.P
  local S = lpeg.S

  return P("SC") + P("sc") + P("SCN") + P("scn") + P("k") + P("K") + P("re") + S("cvy") +
      P("RG") + P("rg")
end
-
- -- Generates a grammar to parse text blocks (between BT and ET)
- local function gen_text_grammar()
- local V = lpeg.V
- local P = lpeg.P
- local C = lpeg.C
- local gen = generic_grammar_elts()
-
- local empty = ""
- local unary_ops = C("T*") / "\n" +
- C(gen_graphics_unary()) / empty
- local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
- gen_graphics_binary()
- local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
- local nary_op = P("Tm") + gen_graphics_nary()
- local text_binary_op = P("Tj") + P("TJ") + P("'")
- local text_quote_op = P('"')
- local font_op = P("Tf")
-
- return lpeg.P{
- "EXPR";
- EXPR = gen.ws^0 * lpeg.Ct(V("COMMAND")^0),
- COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
- V("FONT") + gen.comment) * gen.ws^0,
- UNARY = unary_ops,
- BINARY = V("ARG") / empty * gen.ws^1 * binary_ops,
- TERNARY = V("ARG") / empty * gen.ws^1 * V("ARG") / empty * gen.ws^1 * ternary_ops,
- NARY = (gen.number / 0 * gen.ws^1)^1 * (gen.id / empty * gen.ws^0)^-1 * nary_op,
- ARG = V("ARRAY") + V("DICT") + V("ATOM"),
- ATOM = (gen.comment + gen.boolean + gen.ref +
- gen.number + V("STRING") + gen.id),
- DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
- KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ARG") * gen.ws^0),
- ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ARG")^0) * gen.ws^0 * "]",
- STRING = lpeg.P{gen.str + gen.hexstr},
- TEXT = (V("TEXT_ARG") * gen.ws^1 * text_binary_op) +
- (V("ARG") / 0 * gen.ws^1 * V("ARG") / 0 * gen.ws^1 * V("TEXT_ARG") * gen.ws^1 * text_quote_op),
- FONT = (V("FONT_ARG") * gen.ws^1 * (gen.number / 0) * gen.ws^1 * font_op),
- FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
- TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
- TEXT_ARRAY = "[" *
- lpeg.Ct(((gen.ws^0 * (gen.ws^0 * (gen.number / 0)^0 * gen.ws^0 * (gen.str + gen.hexstr)))^1)) * gen.ws^0 * "]",
- }
- end
-
-
-- Call immediately on require
-- Module initialisation: compile the tries, load configuration overrides
-- and build the lpeg grammars exactly once.
compile_tries()
config_module()
pdf_outer_grammar = gen_outer_grammar()
pdf_text_grammar = gen_text_grammar()
-
--- Stub for full text extraction from a PDF part.
-- Not implemented yet: always yields nil regardless of the argument.
local function extract_text_data(specific) -- luacheck: no unused args
  return nil
end
-
- -- Generates index for major/minor pair
- local function obj_ref(major, minor)
- return major * 10.0 + 1.0 / (minor + 1.0)
- end
-
- -- Return indirect object reference (if needed)
- local function maybe_dereference_object(elt, pdf, task)
- if type(elt) == 'table' and elt[1] == '%REF%' then
- local ref = obj_ref(elt[2], elt[3])
-
- if pdf.ref[ref] then
- -- No recursion!
- return pdf.ref[ref]
- else
- lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s',
- elt[2], elt[3], obj_ref(elt[2], elt[3]))
- return nil
- end
- end
-
- return elt
- end
-
- -- Apply PDF stream filter
- local function apply_pdf_filter(input, filt)
- if filt == 'FlateDecode' then
- return rspamd_util.inflate(input, config.max_extraction_size)
- end
-
- return nil
- end
-
- -- Conditionally apply a pipeline of stream filters and return uncompressed data
- local function maybe_apply_filter(dict, data)
- local uncompressed = data
-
- if dict.Filter then
- local filt = dict.Filter
- if type(filt) == 'string' then
- filt = {filt}
- end
-
- for _,f in ipairs(filt) do
- uncompressed = apply_pdf_filter(uncompressed, f)
-
- if not uncompressed then break end
- end
- end
-
- return uncompressed
- end
-
- -- Conditionally extract stream data from object and attach it as obj.uncompressed
- local function maybe_extract_object_stream(obj, pdf, task)
- local dict = obj.dict
- if dict.Length then
- local len = math.min(obj.stream.len,
- tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
- local real_stream = obj.stream.data:span(1, len)
-
- local uncompressed = maybe_apply_filter(dict, real_stream)
-
- if uncompressed then
- obj.uncompressed = uncompressed
- lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
- obj.major, obj.minor, len, uncompressed:len())
- return obj.uncompressed
- else
- lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
- obj.major, obj.minor, len, dict.Filter)
- end
- end
- end
-
-
-- Parses an object's dictionary part (or the whole object if it has no
-- stream) with the outer grammar and stores the result in obj.dict and/or
-- obj.uncompressed. Directly parsed objects are registered in pdf.ref so
-- later references can be resolved.
local function parse_object_grammar(obj, task, pdf)
  -- Parse grammar
  local obj_dict_span
  if obj.stream then
    -- Only the part before the stream contains the dictionary
    obj_dict_span = obj.data:span(1, obj.stream.start - obj.start)
  else
    obj_dict_span = obj.data
  end

  if obj_dict_span:len() < config.max_processing_size then
    -- pcall: grammar matching can raise on pathological input
    local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)

    if ret then
      if obj.stream then
        obj.dict = obj_or_err
        lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
            obj.major, obj.minor, obj_or_err)
      else
        -- Direct object
        if type(obj_or_err) == 'table' then
          obj.dict = obj_or_err
          obj.uncompressed = obj_or_err
          lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
              obj.major, obj.minor, obj_or_err)
          pdf.ref[obj_ref(obj.major, obj.minor)] = obj
        else
          -- Scalar direct object: the value itself becomes the reference target
          pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
          obj.dict = {}
          obj.uncompressed = obj_or_err
        end
      end
    else
      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
          obj.major, obj.minor, obj_or_err)
    end
  else
    lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
        obj.major, obj.minor, obj_dict_span:len())
  end
end
-
- -- Extracts font data and process /ToUnicode mappings
- -- NYI in fact as cmap is ridiculously stupid and complicated
- local function process_font(task, pdf, font, fname)
- local dict = font
- if font.dict then
- dict = font.dict
- end
-
- if type(dict) == 'table' and dict.ToUnicode then
- local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)
-
- if cmap and cmap.dict then
- maybe_extract_object_stream(cmap, pdf, task)
- lua_util.debugm(N, task, 'found cmap for font %s: %s',
- fname, cmap.uncompressed)
- end
- end
- end
-
- -- Forward declaration
- local process_dict
-
- -- This function processes javascript string and returns JS hash and JS rspamd_text
- local function process_javascript(task, pdf, js)
- local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"
- if type(js) == 'string' then
- js = rspamd_text.fromstring(js):oneline()
- elseif type(js) == 'userdata' then
- js = js:oneline()
- else
- return nil
- end
-
- local hash = rspamd_cryptobox_hash.create(js)
- local bin_hash = hash:bin()
-
- if not pdf.scripts then
- pdf.scripts = {}
- end
-
- if pdf.scripts[bin_hash] then
- -- Duplicate
- return pdf.scripts[bin_hash]
- end
-
- local njs = {
- data = js,
- hash = rspamd_util.encode_base32(bin_hash),
- bin_hash = bin_hash,
- }
- pdf.scripts[bin_hash] = njs
- return njs
- end
-
- -- Extract interesting stuff from /Action, e.g. javascript
- local function process_action(task, pdf, obj)
- if not (obj.js or obj.launch) and (obj.dict and obj.dict.JS) then
- local js = maybe_dereference_object(obj.dict.JS, pdf, task)
-
- if js then
- if type(js) == 'table' then
- local extracted_js = maybe_extract_object_stream(js, pdf, task)
-
- if not extracted_js then
- lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
- obj.major, obj.minor, js)
- else
- js = extracted_js
- end
- end
-
- js = process_javascript(task, pdf, js)
- if js then
- obj.js = js
- lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
- obj.major, obj.minor, obj.js.data)
- else
- lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
- obj.major, obj.minor, js)
- end
- elseif obj.dict.F then
- local launch = maybe_dereference_object(obj.dict.F, pdf, task)
-
- if launch then
- if type(launch) == 'string' then
- obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
- lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
- obj.major, obj.minor, obj.launch)
- elseif type(launch) == 'userdata' then
- obj.launch = launch:exclude_chars('%n%c')
- lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
- obj.major, obj.minor, obj.launch)
- else
- lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
- obj.major, obj.minor, launch)
- end
- end
- else
-
- lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
- obj.major, obj.minor)
- end
- end
- end
-
- -- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
- local function process_catalog(task, pdf, obj)
- if obj.dict then
- if obj.dict.OpenAction then
- local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
-
- if action and type(action) == 'table' then
- -- This also processes action js (if not already processed)
- process_dict(task, pdf, action, action.dict)
- if action.js then
- lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
- obj.major, obj.minor, action.js)
- pdf.openaction = action.js
- elseif action.launch then
- lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
- obj.major, obj.minor, action.launch)
- pdf.launch = action.launch
- else
- lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
- obj.major, obj.minor, action)
- end
- else
- lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
- obj.major, obj.minor, obj.dict.OpenAction, action)
- end
- else
- lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
- obj.major, obj.minor)
- end
- end
- end
-
-- Augments an object according to its parsed dictionary: resolves /Type,
-- collects /Contents and /Resources (inheriting from /Parent when absent),
-- dereferences fonts and dispatches to type-specific processors
-- (Action, Catalog, Javascript). Assigned to the forward-declared local.
process_dict = function(task, pdf, obj, dict)
  if not obj.type and type(dict) == 'table' then
    if dict.Type and type(dict.Type) == 'string' then
      -- Common stuff
      obj.type = dict.Type
    end

    if not obj.type then

      -- NOTE(review): reads obj.dict here rather than the dict parameter;
      -- callers normally pass obj.dict so these coincide -- confirm
      if obj.dict.S and obj.dict.JS then
        obj.type = 'Javascript'
        lua_util.debugm(N, task, 'implicit type for Javascript object %s:%s',
            obj.major, obj.minor)
      else
        lua_util.debugm(N, task, 'no type for %s:%s',
            obj.major, obj.minor)
        return
      end
    end

    lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
        obj.major, obj.minor, obj.type)
    local contents = dict.Contents
    if contents and type(contents) == 'table' then
      if contents[1] == '%REF%' then
        -- Single reference
        contents = {contents}
      end
      obj.contents = {}

      -- Dereference each content object and link it back to this object
      for _,c in ipairs(contents) do
        local cobj = maybe_dereference_object(c, pdf, task)
        if cobj and type(cobj) == 'table' then
          obj.contents[#obj.contents + 1] = cobj
          cobj.parent = obj
          cobj.type = 'content'
        end
      end

      lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
          obj.major, obj.minor, #obj.contents)
    end

    local resources = dict.Resources
    if resources and type(resources) == 'table' then
      obj.resources = maybe_dereference_object(resources, pdf, task)

      if type(obj.resources) ~= 'table' then
        rspamd_logger.infox(task, 'cannot parse resources from pdf: %s returned by grammar',
            obj.resources)
        obj.resources = {}
      elseif obj.resources.dict then
        obj.resources = obj.resources.dict
      end
    else
      -- Fucking pdf: we need to inherit from parent
      resources = {}
      if dict.Parent then
        local parent = maybe_dereference_object(dict.Parent, pdf, task)

        if parent and type(parent) == 'table' and parent.dict then
          if parent.resources then
            lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
                parent.major, parent.minor, obj.major, obj.minor)
            resources = parent.resources
          end
        end
      end

      obj.resources = resources
    end

    local fonts = obj.resources.Font

    if fonts and type(fonts) == 'table' then
      obj.fonts = {}
      for k,v in pairs(fonts) do
        obj.fonts[k] = maybe_dereference_object(v, pdf, task)

        if obj.fonts[k] then
          local font = obj.fonts[k]

          -- Font/cmap processing is only useful for (NYI) text extraction
          if config.text_extraction then
            process_font(task, pdf, font, k)
            lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
                k, obj.major, obj.minor, font)
          end
        end
      end
    end

    lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
        obj.major, obj.minor, obj.type, obj.resources)

    -- Type-specific dispatch
    if obj.type == 'Action' then
      process_action(task, pdf, obj)
    elseif obj.type == 'Catalog' then
      process_catalog(task, pdf, obj)
    elseif obj.type == 'Javascript' then
      local js = maybe_dereference_object(obj.dict.JS, pdf, task)

      if js then
        if type(js) == 'table' then
          local extracted_js = maybe_extract_object_stream(js, pdf, task)

          if not extracted_js then
            lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
                obj.major, obj.minor, js)
          else
            js = extracted_js
          end
        end

        js = process_javascript(task, pdf, js)
        if js then
          obj.js = js
          lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
              obj.major, obj.minor, obj.js.data)
        else
          lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
              obj.major, obj.minor, js)
        end
      end
    end
  end -- Already processed dict (obj.type is not empty)
end
-
- -- This function is intended to unpack objects from ObjStm crappy structure
-- Cached grammar for ObjStm index pairs (lazily initialised below)
local compound_obj_grammar
-- Builds (once) the grammar matching the `objnum offset` pairs that open an
-- ObjStm payload, allowing leading whitespace and comments.
local function compound_obj_grammar_gen()
  if compound_obj_grammar then
    return compound_obj_grammar
  end

  local gen = generic_grammar_elts()
  compound_obj_grammar = gen.ws^0 * (gen.comment * gen.ws^1)^0 *
      lpeg.Ct(lpeg.Ct(gen.number * gen.ws^1 * gen.number * gen.ws^0)^1)

  return compound_obj_grammar
end
-- Unpacks objects from an ObjStm (PDF 1.5 compound object stream).
-- `uncompressed` is the decoded payload; `first` is the /First offset where
-- the object bodies begin. The payload opens with `objnum offset` pairs.
local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
  -- First, we need to parse data line by line likely to find a line
  -- that consists of pairs of numbers
  compound_obj_grammar_gen()
  local elts = compound_obj_grammar:match(uncompressed)
  if elts and #elts > 0 then
    lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
        #uncompressed, elts)

    for i,pair in ipairs(elts) do
      local obj_number,offset = pair[1], pair[2]

      -- Offsets in the index are relative to /First
      offset = offset + first
      if offset < #uncompressed then
        local span_len
        if i == #elts then
          -- Last object extends to the end of the payload
          span_len = #uncompressed - offset
        else
          span_len = (elts[i + 1][2] + first) - offset
        end

        if span_len > 0 and offset + span_len < #uncompressed then
          local obj = {
            major = obj_number,
            minor = 0, -- Implicit
            data = uncompressed:span(offset + 1, span_len),
            ref = obj_ref(obj_number, 0)
          }
          parse_object_grammar(obj, task, pdf)

          -- Keep only objects the grammar could parse
          if obj.dict then
            pdf.objects[#pdf.objects + 1] = obj
          end
        end
      end
    end
  end
end
-
- -- PDF 1.5 ObjStmt
- local function extract_pdf_compound_objects(task, pdf)
- for _,obj in ipairs(pdf.objects or {}) do
- if obj.stream and obj.dict and type(obj.dict) == 'table' then
- local t = obj.dict.Type
- if t and t == 'ObjStm' then
- -- We are in troubles sir...
- local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
- local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))
-
- if nobjs and first then
- --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
- lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
- nobjs, first, obj.dict.Extends)
-
- local uncompressed = maybe_extract_object_stream(obj, pdf, task)
-
- if uncompressed then
- pdf_compound_object_unpack(obj, uncompressed, pdf, task, first)
- end
- else
- lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
- obj.major, obj.minor, obj.dict)
- end
- end
- end
- end
- end
-
- -- This function arranges starts and ends of all objects and process them into initial
- -- set of objects
- local function extract_outer_objects(task, input, pdf)
- local start_pos, end_pos = 1, 1
- local obj_count = 0
- while start_pos <= #pdf.start_objects and end_pos <= #pdf.end_objects do
- local first = pdf.start_objects[start_pos]
- local last = pdf.end_objects[end_pos]
-
- -- 7 is length of `endobj\n`
- if first + 7 < last then
- local len = last - first - 7
-
- -- Also get the starting span and try to match it versus obj re to get numbers
- local obj_line_potential = first - 32
- if obj_line_potential < 1 then obj_line_potential = 1 end
- local prev_obj_end = pdf.end_objects[end_pos - 1]
- if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
- obj_line_potential = prev_obj_end + 1
- end
-
- local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1)
- local matches = object_re:search(obj_line_span, true, true)
-
- if matches and matches[1] then
- local nobj = {
- start = first,
- len = len,
- data = input:span(first, len),
- major = tonumber(matches[1][2]),
- minor = tonumber(matches[1][3]),
- }
- pdf.objects[obj_count + 1] = nobj
- if nobj.major and nobj.minor then
- -- Add reference
- local ref = obj_ref(nobj.major, nobj.minor)
- nobj.ref = ref -- Our internal reference
- pdf.ref[ref] = nobj
- end
- end
-
- obj_count = obj_count + 1
- start_pos = start_pos + 1
- end_pos = end_pos + 1
- elseif first > last then
- end_pos = end_pos + 1
- else
- start_pos = start_pos + 1
- end_pos = end_pos + 1
- end
- end
- end
-
- -- This function attaches streams to objects and processes outer pdf grammar
- local function attach_pdf_streams(task, input, pdf)
- if pdf.start_streams and pdf.end_streams then
- local start_pos, end_pos = 1, 1
-
- for _,obj in ipairs(pdf.objects) do
- while start_pos <= #pdf.start_streams and end_pos <= #pdf.end_streams do
- local first = pdf.start_streams[start_pos]
- local last = pdf.end_streams[end_pos]
- last = last - 10 -- Exclude endstream\n pattern
- lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s",
- first, last, obj.start, obj.start + obj.len)
- if first > obj.start and last < obj.start + obj.len and last > first then
- -- In case if we have fake endstream :(
- while pdf.end_streams[end_pos + 1] and pdf.end_streams[end_pos + 1] < obj.start + obj.len do
- end_pos = end_pos + 1
- last = pdf.end_streams[end_pos]
- end
- -- Strip the first \n
- while first < last do
- local chr = input:at(first)
- if chr ~= 13 and chr ~= 10 then break end
- first = first + 1
- end
- local len = last - first
- obj.stream = {
- start = first,
- len = len,
- data = input:span(first, len)
- }
- start_pos = start_pos + 1
- end_pos = end_pos + 1
- break
- elseif first < obj.start then
- start_pos = start_pos + 1
- elseif last > obj.start + obj.len then
- -- Not this object
- break
- else
- start_pos = start_pos + 1
- end_pos = end_pos + 1
- end
- end
- if obj.stream then
- lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length',
- obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len)
- else
- lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
- obj.major, obj.minor, obj.start, obj.len)
- end
- end
- end
- end
-
- -- Processes PDF objects: extracts streams, object numbers, process outer grammar,
- -- augment object types
- local function postprocess_pdf_objects(task, input, pdf)
- pdf.objects = {} -- objects table
- pdf.ref = {} -- references table
- extract_outer_objects(task, input, pdf)
-
- -- Now we have objects and we need to attach streams that are in bounds
- attach_pdf_streams(task, input, pdf)
- -- Parse grammar for outer objects
- for _,obj in ipairs(pdf.objects) do
- if obj.ref then
- parse_object_grammar(obj, task, pdf)
- end
- end
- extract_pdf_compound_objects(task, pdf)
-
- -- Now we might probably have all objects being processed
- for _,obj in ipairs(pdf.objects) do
- if obj.dict then
- -- Types processing
- process_dict(task, pdf, obj, obj.dict)
- end
- end
- end
-
--- Pair up sorted start/end offsets into {start, len} blocks appended to `out`.
-- A stray end offset before the current start is skipped; equal offsets
-- indicate a broken ordering and stop the scan.
local function offsets_to_blocks(starts, ends, out)
  local si, ei = 1, 1
  local ns, ne = #starts, #ends

  while si <= ns and ei <= ne do
    local s, e = starts[si], ends[ei]

    if s < e then
      out[#out + 1] = {
        start = s,
        len = e - s,
      }
      si, ei = si + 1, ei + 1
    elseif s > e then
      ei = ei + 1
    else
      -- Not ordered properly!
      break
    end
  end
end
-
-- Extracts text from Page content streams: locates BT...ET blocks via the
-- text trie, parses them with the text grammar and joins the results into
-- obj.text. Part of the (disabled by default) text extraction feature.
local function search_text(task, pdf)
  for _,obj in ipairs(pdf.objects) do
    if obj.type == 'Page' and obj.contents then
      local text = {}
      for _,tobj in ipairs(obj.contents) do
        maybe_extract_object_stream(tobj, pdf, task)
        local matches = pdf_text_trie:match(tobj.uncompressed or '')
        if matches then
          local text_blocks = {}
          local starts = {}
          local ends = {}

          -- Pattern 1 is the BT (start) marker, others are ET (stop)
          for npat,matched_positions in pairs(matches) do
            if npat == 1 then
              for _,pos in ipairs(matched_positions) do
                starts[#starts + 1] = pos
              end
            else
              for _,pos in ipairs(matched_positions) do
                ends[#ends + 1] = pos
              end
            end
          end

          offsets_to_blocks(starts, ends, text_blocks)
          for _,bl in ipairs(text_blocks) do
            if bl.len > 2 then
              -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
              bl.len = bl.len - 2
            end

            bl.data = tobj.uncompressed:span(bl.start, bl.len)
            --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
            --  tobj.major, tobj.minor, bl.data)

            if bl.len < config.max_processing_size then
              -- pcall: grammar matching can raise on pathological input
              local ret,obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
                  bl.data)

              if ret then
                text[#text + 1] = obj_or_err
                lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
                    obj_or_err, tobj.major, tobj.minor, obj.major, obj.minor)
              else
                lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
                    obj.major, obj.minor, obj_or_err)
              end

            end
          end
        end
      end

      -- Join all text data together
      if #text > 0 then
        obj.text = rspamd_text.fromtable(text)
        lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
            obj.major, obj.minor, obj.text)
      end
    end
  end
end
-
-- This function searches objects for the `/URI` key and parses its content
-- Scans every object dictionary for `/URI` entries and injects any URL
-- found into the task.  Nested tables are descended into with a hard
-- recursion cap to protect against maliciously deep structures.
local function search_urls(task, pdf)
  local function walk(obj, dict, depth)
    if depth > 10 then
      lua_util.debugm(N, task, 'object %s:%s recurses too much',
          obj.major, obj.minor)
      return
    end

    for key, value in pairs(dict) do
      if type(value) == 'table' then
        -- Descend into nested dictionaries/arrays first
        walk(obj, value, depth + 1)
      elseif key == 'URI' then
        -- The URI value may itself be an indirect reference
        value = maybe_dereference_object(value, pdf, task)
        if type(value) == 'string' then
          local url = rspamd_url.create(task:get_mempool(), value)

          if url then
            lua_util.debugm(N, task, 'found url %s in object %s:%s',
                value, obj.major, obj.minor)
            task:inject_url(url)
          end
        end
      end
    end
  end

  for _, obj in ipairs(pdf.objects) do
    if obj.dict and type(obj.dict) == 'table' then
      walk(obj, obj.dict, 0)
    end
  end
end
-
-- Entry point for PDF parts: scans `input` with the multi-pattern trie,
-- dispatches grouped matches to the registered `processors`, then runs
-- object postprocessing, text/URL extraction and fuzzy-hash collection
-- according to `config`.
-- @param input rspamd_text with the raw PDF content
-- @param task rspamd task
-- @return table describing the parsed PDF, or {} when processing is
--         disabled; NOTE(review): returns nil when no trie pattern
--         matches at all -- confirm callers accept nil as well
local function process_pdf(input, _, task)

  if not config.enabled then
    -- Skip processing
    return {}
  end

  -- One pass of the combined pattern trie over the whole input
  local matches = pdf_trie:match(input)

  if matches then
    local pdf_output = {
      tag = 'pdf',
      extract_text = extract_text_data,
    }
    -- Group matched patterns by processor so each processor runs once
    -- with all of its offsets
    local grouped_processors = {}
    for npat,matched_positions in pairs(matches) do
      local index = pdf_indexes[npat]

      -- index[1]: processor name; index[4]: processor-local pattern id
      local proc_key,loc_npat = index[1], index[4]

      if not grouped_processors[proc_key] then
        grouped_processors[proc_key] = {
          processor_func = processors[proc_key],
          offsets = {},
        }
      end
      local proc = grouped_processors[proc_key]
      -- Fill offsets
      for _,pos in ipairs(matched_positions) do
        proc.offsets[#proc.offsets + 1] = {pos, loc_npat}
      end
    end

    for name,processor in pairs(grouped_processors) do
      -- Sort by offset
      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
          name, #processor.offsets)
      table.sort(processor.offsets, function(e1, e2) return e1[1] < e2[1] end)
      processor.processor_func(input, task, processor.offsets, pdf_output)
    end

    pdf_output.flags = {}

    -- Object boundaries are the prerequisite for all deeper analysis
    if pdf_output.start_objects and pdf_output.end_objects then
      -- Postprocess objects
      postprocess_pdf_objects(task, input, pdf_output)
      if config.text_extraction then
        search_text(task, pdf_output)
      end
      if config.url_extraction then
        search_urls(task, pdf_output)
      end

      -- Collect fuzzy hashes of embedded Javascript, either for the
      -- OpenAction script only or for all discovered scripts
      if config.js_fuzzy and pdf_output.scripts then
        pdf_output.fuzzy_hashes = {}
        if config.openaction_fuzzy_only then
          -- OpenAction only
          if pdf_output.openaction and pdf_output.openaction.bin_hash then
            -- Skip scripts shorter than the configured minimum length
            if config.min_js_fuzzy and #pdf_output.openaction.data >= config.min_js_fuzzy then
              lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
                  pdf_output.openaction.hash)
              table.insert(pdf_output.fuzzy_hashes, pdf_output.openaction.bin_hash)
            else
              lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
                  pdf_output.openaction.hash, #pdf_output.openaction.data)
            end
          end
        else
          -- All hashes
          for h,sc in pairs(pdf_output.scripts) do
            if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then
              lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
                  sc.hash)
              table.insert(pdf_output.fuzzy_hashes, h)
            else
              lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
                  sc.hash, #sc.data)
            end
          end

        end
      end
    else
      pdf_output.flags.no_objects = true
    end

    return pdf_output
  end
end
-
-- Processes the PDF trailer: scans from the last trailer occurrence to the
-- end of the input and flags the output as encrypted when an /Encrypt
-- entry is present.
processors.trailer = function(input, task, positions, output)
  local final_match = positions[#positions]
  -- Everything from the last trailer keyword to the end of the document
  local tail = input:span(final_match[1])

  for line in tail:lines(true) do
    if line:find('/Encrypt ') then
      lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
          line)
      output.encrypted = true
    end
  end
end
-
-- Marks the output as suspicious whenever any of the "suspicious"
-- trie patterns has matched anywhere in the input.
processors.suspicious = function(_, task, _, output)
  output.suspicious = true
  lua_util.debugm(N, task, "pdf: found a suspicious pattern")
end
-
-- Appends the first element of every position tuple to the array stored
-- under `output_key` in `output`, creating that array on first use.
-- Existing entries are preserved; new offsets are added after them.
local function generic_table_inserter(positions, output, output_key)
  local dest = output[output_key]
  if not dest then
    dest = {}
    output[output_key] = dest
  end

  for _, pos in ipairs(positions) do
    dest[#dest + 1] = pos[1]
  end
end
-
-- Offset-collecting processors: each of these merely records the match
-- offsets of its pattern under a dedicated key in the output table, to
-- be paired up later during object postprocessing.
for pattern_name, output_key in pairs({
  start_object = 'start_objects',
  end_object = 'end_objects',
  start_stream = 'start_streams',
  end_stream = 'end_streams',
}) do
  processors[pattern_name] = function(_, task, positions, output)
    generic_table_inserter(positions, output, output_key)
  end
end
-
-- Public module interface: the content framework invokes `process` with
-- the raw part content to obtain the parsed PDF representation.
exports.process = process_pdf

return exports
|