-- pdf.lua
  1. --[[
  2. Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_content/pdf
  15. -- This module contains some heuristics for PDF files
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local rspamd_text = require "rspamd_text"
  20. local rspamd_url = require "rspamd_url"
  21. local rspamd_logger = require "rspamd_logger"
  22. local bit = require "bit"
  23. local N = "lua_content"
  24. local lua_util = require "lua_util"
  25. local rspamd_regexp = require "rspamd_regexp"
  26. local lpeg = require "lpeg"
  27. local pdf_patterns = {
  28. trailer = {
  29. patterns = {
  30. [[\ntrailer\r?\n]]
  31. }
  32. },
  33. suspicious = {
  34. patterns = {
  35. [[netsh\s]],
  36. [[echo\s]],
  37. [[\/[A-Za-z]*#\d\d(?:[#A-Za-z<>/\s])]], -- Hex encode obfuscation
  38. }
  39. },
  40. start_object = {
  41. patterns = {
  42. [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=]
  43. }
  44. },
  45. end_object = {
  46. patterns = {
  47. [=[endobj[\r\n]]=]
  48. }
  49. },
  50. start_stream = {
  51. patterns = {
  52. [=[>\s*stream[\r\n]]=],
  53. }
  54. },
  55. end_stream = {
  56. patterns = {
  57. [=[endstream[\r\n]]=]
  58. }
  59. }
  60. }
  61. local pdf_text_patterns = {
  62. start = {
  63. patterns = {
  64. [[\sBT\s]]
  65. }
  66. },
  67. stop = {
  68. patterns = {
  69. [[\sET\b]]
  70. }
  71. }
  72. }
  73. local pdf_cmap_patterns = {
  74. start = {
  75. patterns = {
  76. [[\d\s+beginbfchar\s]],
  77. [[\d\s+beginbfrange\s]]
  78. }
  79. },
  80. stop = {
  81. patterns = {
  82. [[\sendbfrange\b]],
  83. [[\sendbchar\b]]
  84. }
  85. }
  86. }
  87. -- index[n] ->
  88. -- t[1] - pattern,
  89. -- t[2] - key in patterns table,
  90. -- t[3] - value in patterns table
  91. -- t[4] - local pattern index
  92. local pdf_indexes = {}
  93. local pdf_text_indexes = {}
  94. local pdf_cmap_indexes = {}
  95. local pdf_trie
  96. local pdf_text_trie
  97. local pdf_cmap_trie
  98. local exports = {}
  99. local config = {
  100. max_extraction_size = 512 * 1024,
  101. max_processing_size = 32 * 1024,
  102. text_extraction = false, -- NYI feature
  103. url_extraction = true,
  104. enabled = true,
  105. js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
  106. min_js_fuzzy = 32, -- Minimum size of js to be considered as a fuzzy
  107. openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
  108. }
  109. -- Used to process patterns found in PDF
  110. -- positions for functional processors should be a iter/table from trie matcher in form
  111. ---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
  112. ---- pat_idxn is pattern index and n1 ... nn are match positions
  113. local processors = {}
  114. -- PDF objects outer grammar in LPEG style (performing table captures)
  115. local pdf_outer_grammar
  116. local pdf_text_grammar
  117. -- Used to match objects
  118. local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=])
  119. local function config_module()
  120. local opts = rspamd_config:get_all_opt('lua_content')
  121. if opts and opts.pdf then
  122. config = lua_util.override_defaults(config, opts.pdf)
  123. end
  124. end
  125. local function compile_tries()
  126. local default_compile_flags = bit.bor(rspamd_trie.flags.re,
  127. rspamd_trie.flags.dot_all,
  128. rspamd_trie.flags.no_start)
  129. local function compile_pats(patterns, indexes, compile_flags)
  130. local strs = {}
  131. for what,data in pairs(patterns) do
  132. for i,pat in ipairs(data.patterns) do
  133. strs[#strs + 1] = pat
  134. indexes[#indexes + 1] = {what, data, pat, i}
  135. end
  136. end
  137. return rspamd_trie.create(strs, compile_flags or default_compile_flags)
  138. end
  139. if not pdf_trie then
  140. pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
  141. end
  142. if not pdf_text_trie then
  143. pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
  144. end
  145. if not pdf_cmap_trie then
  146. pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
  147. end
  148. end
  149. -- Returns a table with generic grammar elements for PDF
  150. local function generic_grammar_elts()
  151. local P = lpeg.P
  152. local R = lpeg.R
  153. local S = lpeg.S
  154. local V = lpeg.V
  155. local C = lpeg.C
  156. local D = R'09' -- Digits
  157. local grammar_elts = {}
  158. -- Helper functions
  159. local function pdf_hexstring_unescape(s)
  160. local function ue(cc)
  161. return string.char(tonumber(cc, 16))
  162. end
  163. if #s % 2 == 0 then
  164. -- Sane hex string
  165. return s:gsub('..', ue)
  166. end
  167. -- WTF hex string
  168. -- Append '0' to it and unescape...
  169. return s:sub(1, #s - 1):gsub('..' , ue) .. (s:sub(#s) .. '0'):gsub('..' , ue)
  170. end
  171. local function pdf_string_unescape(s)
  172. local function ue_single(cc)
  173. if cc == '\\r' then
  174. return '\r'
  175. elseif cc == '\\n' then
  176. return '\n'
  177. else
  178. return cc:gsub(2, 2)
  179. end
  180. end
  181. -- simple unescape \char
  182. s = s:gsub('\\[^%d]', ue_single)
  183. -- unescape octal
  184. local function ue_octal(cc)
  185. -- Replace unknown stuff with '?'
  186. return string.char(tonumber(cc:sub(2), 8) or 63)
  187. end
  188. s = s:gsub('\\%d%d?%d?', ue_octal)
  189. return s
  190. end
  191. local function pdf_id_unescape(s)
  192. return (s:gsub('#%d%d', function (cc)
  193. return string.char(tonumber(cc:sub(2), 16))
  194. end))
  195. end
  196. local delim = S'()<>[]{}/%'
  197. grammar_elts.ws = S'\0 \r\n\t\f'
  198. local hex = R'af' + R'AF' + D
  199. -- Comments.
  200. local eol = P'\r\n' + '\n'
  201. local line = (1 - S'\r\n\f')^0 * eol^-1
  202. grammar_elts.comment = P'%' * line
  203. -- Numbers.
  204. local sign = S'+-'^-1
  205. local decimal = D^1
  206. local float = D^1 * P'.' * D^0 + P'.' * D^1
  207. grammar_elts.number = C(sign * (float + decimal)) / tonumber
  208. -- String
  209. grammar_elts.str = P{ "(" * C(((1 - S"()\\") + (P '\\' * 1) + V(1))^0) / pdf_string_unescape * ")" }
  210. grammar_elts.hexstr = P{"<" * C(hex^0) / pdf_hexstring_unescape * ">"}
  211. -- Identifier
  212. grammar_elts.id = P{'/' * C((1-(delim + grammar_elts.ws))^1) / pdf_id_unescape}
  213. -- Booleans (who care about them?)
  214. grammar_elts.boolean = C(P("true") + P("false"))
  215. -- Stupid references
  216. grammar_elts.ref = lpeg.Ct{lpeg.Cc("%REF%") * C(D^1) * " " * C(D^1) * " " * "R"}
  217. return grammar_elts
  218. end
  219. -- Generates a grammar to parse outer elements (external objects in PDF notation)
  220. local function gen_outer_grammar()
  221. local V = lpeg.V
  222. local gen = generic_grammar_elts()
  223. return lpeg.P{
  224. "EXPR";
  225. EXPR = gen.ws^0 * V("ELT")^0 * gen.ws^0,
  226. ELT = V("ARRAY") + V("DICT") + V("ATOM"),
  227. ATOM = gen.ws^0 * (gen.comment + gen.boolean + gen.ref +
  228. gen.number + V("STRING") + gen.id) * gen.ws^0,
  229. DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
  230. KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ELT") * gen.ws^0),
  231. ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ELT")^0) * gen.ws^0 * "]",
  232. STRING = lpeg.P{gen.str + gen.hexstr},
  233. }
  234. end
  235. -- Graphic state in PDF
  236. local function gen_graphics_unary()
  237. local P = lpeg.P
  238. local S = lpeg.S
  239. return P("q") + P("Q") + P("h")
  240. + S("WSsFfBb") * P("*")^0 + P("n")
  241. end
  242. local function gen_graphics_binary()
  243. local P = lpeg.P
  244. local S = lpeg.S
  245. return S("gGwJjMi") +
  246. P("M") + P("ri") + P("gs") +
  247. P("CS") + P("cs") + P("sh")
  248. end
  249. local function gen_graphics_ternary()
  250. local P = lpeg.P
  251. local S = lpeg.S
  252. return P("d") + P("m") + S("lm")
  253. end
  254. local function gen_graphics_nary()
  255. local P = lpeg.P
  256. local S = lpeg.S
  257. return P("SC") + P("sc") + P("SCN") + P("scn") + P("k") + P("K") + P("re") + S("cvy") +
  258. P("RG") + P("rg")
  259. end
  260. -- Generates a grammar to parse text blocks (between BT and ET)
  261. local function gen_text_grammar()
  262. local V = lpeg.V
  263. local P = lpeg.P
  264. local C = lpeg.C
  265. local gen = generic_grammar_elts()
  266. local empty = ""
  267. local unary_ops = C("T*") / "\n" +
  268. C(gen_graphics_unary()) / empty
  269. local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
  270. gen_graphics_binary()
  271. local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
  272. local nary_op = P("Tm") + gen_graphics_nary()
  273. local text_binary_op = P("Tj") + P("TJ") + P("'")
  274. local text_quote_op = P('"')
  275. local font_op = P("Tf")
  276. return lpeg.P{
  277. "EXPR";
  278. EXPR = gen.ws^0 * lpeg.Ct(V("COMMAND")^0),
  279. COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
  280. V("FONT") + gen.comment) * gen.ws^0,
  281. UNARY = unary_ops,
  282. BINARY = V("ARG") / empty * gen.ws^1 * binary_ops,
  283. TERNARY = V("ARG") / empty * gen.ws^1 * V("ARG") / empty * gen.ws^1 * ternary_ops,
  284. NARY = (gen.number / 0 * gen.ws^1)^1 * (gen.id / empty * gen.ws^0)^-1 * nary_op,
  285. ARG = V("ARRAY") + V("DICT") + V("ATOM"),
  286. ATOM = (gen.comment + gen.boolean + gen.ref +
  287. gen.number + V("STRING") + gen.id),
  288. DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
  289. KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ARG") * gen.ws^0),
  290. ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ARG")^0) * gen.ws^0 * "]",
  291. STRING = lpeg.P{gen.str + gen.hexstr},
  292. TEXT = (V("TEXT_ARG") * gen.ws^1 * text_binary_op) +
  293. (V("ARG") / 0 * gen.ws^1 * V("ARG") / 0 * gen.ws^1 * V("TEXT_ARG") * gen.ws^1 * text_quote_op),
  294. FONT = (V("FONT_ARG") * gen.ws^1 * (gen.number / 0) * gen.ws^1 * font_op),
  295. FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
  296. TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
  297. TEXT_ARRAY = "[" *
  298. lpeg.Ct(((gen.ws^0 * (gen.ws^0 * (gen.number / 0)^0 * gen.ws^0 * (gen.str + gen.hexstr)))^1)) * gen.ws^0 * "]",
  299. }
  300. end
-- Call immediately on require: compile the pattern tries, apply the
-- `lua_content.pdf` options and build both LPEG grammars once, so that the
-- functions below can use `pdf_outer_grammar` and `pdf_text_grammar`
compile_tries()
config_module()
pdf_outer_grammar = gen_outer_grammar()
pdf_text_grammar = gen_text_grammar()
-- Placeholder for text extraction: the argument is ignored and nil is always
-- returned until the feature is implemented
local function extract_text_data(specific)
  return nil -- NYI
end
  309. -- Generates index for major/minor pair
  310. local function obj_ref(major, minor)
  311. return major * 10.0 + 1.0 / (minor + 1.0)
  312. end
  313. -- Return indirect object reference (if needed)
  314. local function maybe_dereference_object(elt, pdf, task)
  315. if type(elt) == 'table' and elt[1] == '%REF%' then
  316. local ref = obj_ref(elt[2], elt[3])
  317. if pdf.ref[ref] then
  318. -- No recursion!
  319. return pdf.ref[ref]
  320. else
  321. lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s',
  322. elt[2], elt[3], obj_ref(elt[2], elt[3]))
  323. return nil
  324. end
  325. end
  326. return elt
  327. end
  328. -- Apply PDF stream filter
  329. local function apply_pdf_filter(input, filt)
  330. if filt == 'FlateDecode' then
  331. return rspamd_util.inflate(input, config.max_extraction_size)
  332. end
  333. return nil
  334. end
  335. -- Conditionally apply a pipeline of stream filters and return uncompressed data
  336. local function maybe_apply_filter(dict, data)
  337. local uncompressed = data
  338. if dict.Filter then
  339. local filt = dict.Filter
  340. if type(filt) == 'string' then
  341. filt = {filt}
  342. end
  343. for _,f in ipairs(filt) do
  344. uncompressed = apply_pdf_filter(uncompressed, f)
  345. if not uncompressed then break end
  346. end
  347. end
  348. return uncompressed
  349. end
  350. -- Conditionally extract stream data from object and attach it as obj.uncompressed
  351. local function maybe_extract_object_stream(obj, pdf, task)
  352. local dict = obj.dict
  353. if dict.Length then
  354. local len = math.min(obj.stream.len,
  355. tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
  356. local real_stream = obj.stream.data:span(1, len)
  357. local uncompressed = maybe_apply_filter(dict, real_stream)
  358. if uncompressed then
  359. obj.uncompressed = uncompressed
  360. lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
  361. obj.major, obj.minor, len, uncompressed:len())
  362. return obj.uncompressed
  363. else
  364. lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
  365. obj.major, obj.minor, len, dict.Filter)
  366. end
  367. end
  368. end
  369. local function parse_object_grammar(obj, task, pdf)
  370. -- Parse grammar
  371. local obj_dict_span
  372. if obj.stream then
  373. obj_dict_span = obj.data:span(1, obj.stream.start - obj.start)
  374. else
  375. obj_dict_span = obj.data
  376. end
  377. if obj_dict_span:len() < config.max_processing_size then
  378. local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
  379. if ret then
  380. if obj.stream then
  381. obj.dict = obj_or_err
  382. lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
  383. obj.major, obj.minor, obj_or_err)
  384. else
  385. -- Direct object
  386. if type(obj_or_err) == 'table' then
  387. obj.dict = obj_or_err
  388. obj.uncompressed = obj_or_err
  389. lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
  390. obj.major, obj.minor, obj_or_err)
  391. pdf.ref[obj_ref(obj.major, obj.minor)] = obj
  392. else
  393. pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
  394. obj.dict = {}
  395. obj.uncompressed = obj_or_err
  396. end
  397. end
  398. else
  399. lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
  400. obj.major, obj.minor, obj_or_err)
  401. end
  402. else
  403. lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
  404. obj.major, obj.minor, obj_dict_span:len())
  405. end
  406. end
  407. -- Extracts font data and process /ToUnicode mappings
  408. -- NYI in fact as cmap is ridiculously stupid and complicated
  409. local function process_font(task, pdf, font, fname)
  410. local dict = font
  411. if font.dict then
  412. dict = font.dict
  413. end
  414. if type(dict) == 'table' and dict.ToUnicode then
  415. local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)
  416. if cmap and cmap.dict then
  417. maybe_extract_object_stream(cmap, pdf, task)
  418. lua_util.debugm(N, task, 'found cmap for font %s: %s',
  419. fname, cmap.uncompressed)
  420. end
  421. end
  422. end
  423. -- Forward declaration
  424. local process_dict
  425. -- This function processes javascript string and returns JS hash and JS rspamd_text
  426. local function process_javascript(task, pdf, js)
  427. local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"
  428. if type(js) == 'string' then
  429. js = rspamd_text.fromstring(js):oneline()
  430. elseif type(js) == 'userdata' then
  431. js = js:oneline()
  432. else
  433. return nil
  434. end
  435. local hash = rspamd_cryptobox_hash.create(js)
  436. local bin_hash = hash:bin()
  437. if not pdf.scripts then
  438. pdf.scripts = {}
  439. end
  440. if pdf.scripts[bin_hash] then
  441. -- Duplicate
  442. return pdf.scripts[bin_hash]
  443. end
  444. local njs = {
  445. data = js,
  446. hash = rspamd_util.encode_base32(bin_hash),
  447. bin_hash = bin_hash,
  448. }
  449. pdf.scripts[bin_hash] = njs
  450. return njs
  451. end
  452. -- Extract interesting stuff from /Action, e.g. javascript
  453. local function process_action(task, pdf, obj)
  454. if not (obj.js or obj.launch) and (obj.dict and obj.dict.JS) then
  455. local js = maybe_dereference_object(obj.dict.JS, pdf, task)
  456. if js then
  457. if type(js) == 'table' then
  458. local extracted_js = maybe_extract_object_stream(js, pdf, task)
  459. if not extracted_js then
  460. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  461. obj.major, obj.minor, js)
  462. else
  463. js = extracted_js
  464. end
  465. end
  466. js = process_javascript(task, pdf, js)
  467. if js then
  468. obj.js = js
  469. lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
  470. obj.major, obj.minor, obj.js.data)
  471. else
  472. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  473. obj.major, obj.minor, js)
  474. end
  475. elseif obj.dict.F then
  476. local launch = maybe_dereference_object(obj.dict.F, pdf, task)
  477. if launch then
  478. if type(launch) == 'string' then
  479. obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
  480. lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
  481. obj.major, obj.minor, obj.launch)
  482. elseif type(launch) == 'userdata' then
  483. obj.launch = launch:exclude_chars('%n%c')
  484. lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
  485. obj.major, obj.minor, obj.launch)
  486. else
  487. lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
  488. obj.major, obj.minor, launch)
  489. end
  490. end
  491. else
  492. lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
  493. obj.major, obj.minor)
  494. end
  495. end
  496. end
  497. -- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
  498. local function process_catalog(task, pdf, obj)
  499. if obj.dict then
  500. if obj.dict.OpenAction then
  501. local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
  502. if action and type(action) == 'table' then
  503. -- This also processes action js (if not already processed)
  504. process_dict(task, pdf, action, action.dict)
  505. if action.js then
  506. lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
  507. obj.major, obj.minor, action.js)
  508. pdf.openaction = action.js
  509. elseif action.launch then
  510. lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
  511. obj.major, obj.minor, action.launch)
  512. pdf.launch = action.launch
  513. else
  514. lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
  515. obj.major, obj.minor, action)
  516. end
  517. else
  518. lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
  519. obj.major, obj.minor, obj.dict.OpenAction, action)
  520. end
  521. else
  522. lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
  523. obj.major, obj.minor)
  524. end
  525. end
  526. end
  527. process_dict = function(task, pdf, obj, dict)
  528. if not obj.type and type(dict) == 'table' then
  529. if dict.Type and type(dict.Type) == 'string' then
  530. -- Common stuff
  531. obj.type = dict.Type
  532. end
  533. if not obj.type then
  534. if obj.dict.S and obj.dict.JS then
  535. obj.type = 'Javascript'
  536. lua_util.debugm(N, task, 'implicit type for Javascript object %s:%s',
  537. obj.major, obj.minor)
  538. else
  539. lua_util.debugm(N, task, 'no type for %s:%s',
  540. obj.major, obj.minor)
  541. return
  542. end
  543. end
  544. lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
  545. obj.major, obj.minor, obj.type)
  546. local contents = dict.Contents
  547. if contents and type(contents) == 'table' then
  548. if contents[1] == '%REF%' then
  549. -- Single reference
  550. contents = {contents}
  551. end
  552. obj.contents = {}
  553. for _,c in ipairs(contents) do
  554. local cobj = maybe_dereference_object(c, pdf, task)
  555. if cobj and type(cobj) == 'table' then
  556. obj.contents[#obj.contents + 1] = cobj
  557. cobj.parent = obj
  558. cobj.type = 'content'
  559. end
  560. end
  561. lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
  562. obj.major, obj.minor, #obj.contents)
  563. end
  564. local resources = dict.Resources
  565. if resources and type(resources) == 'table' then
  566. obj.resources = maybe_dereference_object(resources, pdf, task)
  567. if type(obj.resources) ~= 'table' then
  568. rspamd_logger.infox(task, 'cannot parse resources from pdf: %s returned by grammar',
  569. obj.resources)
  570. obj.resources = {}
  571. elseif obj.resources.dict then
  572. obj.resources = obj.resources.dict
  573. end
  574. else
  575. -- Fucking pdf: we need to inherit from parent
  576. resources = {}
  577. if dict.Parent then
  578. local parent = maybe_dereference_object(dict.Parent, pdf, task)
  579. if parent and type(parent) == 'table' and parent.dict then
  580. if parent.resources then
  581. lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
  582. parent.major, parent.minor, obj.major, obj.minor)
  583. resources = parent.resources
  584. end
  585. end
  586. end
  587. obj.resources = resources
  588. end
  589. local fonts = obj.resources.Font
  590. if fonts and type(fonts) == 'table' then
  591. obj.fonts = {}
  592. for k,v in pairs(fonts) do
  593. obj.fonts[k] = maybe_dereference_object(v, pdf, task)
  594. if obj.fonts[k] then
  595. local font = obj.fonts[k]
  596. if config.text_extraction then
  597. process_font(task, pdf, font, k)
  598. lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
  599. k, obj.major, obj.minor, font)
  600. end
  601. end
  602. end
  603. end
  604. lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
  605. obj.major, obj.minor, obj.type, obj.resources)
  606. if obj.type == 'Action' then
  607. process_action(task, pdf, obj)
  608. elseif obj.type == 'Catalog' then
  609. process_catalog(task, pdf, obj)
  610. elseif obj.type == 'Javascript' then
  611. local js = maybe_dereference_object(obj.dict.JS, pdf, task)
  612. if js then
  613. if type(js) == 'table' then
  614. local extracted_js = maybe_extract_object_stream(js, pdf, task)
  615. if not extracted_js then
  616. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  617. obj.major, obj.minor, js)
  618. else
  619. js = extracted_js
  620. end
  621. end
  622. js = process_javascript(task, pdf, js)
  623. if js then
  624. obj.js = js
  625. lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
  626. obj.major, obj.minor, obj.js.data)
  627. else
  628. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  629. obj.major, obj.minor, js)
  630. end
  631. end
  632. end
  633. end -- Already processed dict (obj.type is not empty)
  634. end
  635. -- This function is intended to unpack objects from ObjStm crappy structure
  636. local compound_obj_grammar
  637. local function compound_obj_grammar_gen()
  638. if not compound_obj_grammar then
  639. local gen = generic_grammar_elts()
  640. compound_obj_grammar = gen.ws^0 * (gen.comment * gen.ws^1)^0 *
  641. lpeg.Ct(lpeg.Ct(gen.number * gen.ws^1 * gen.number * gen.ws^0)^1)
  642. end
  643. return compound_obj_grammar
  644. end
  645. local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
  646. -- First, we need to parse data line by line likely to find a line
  647. -- that consists of pairs of numbers
  648. compound_obj_grammar_gen()
  649. local elts = compound_obj_grammar:match(uncompressed)
  650. if elts and #elts > 0 then
  651. lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
  652. #uncompressed, elts)
  653. for i,pair in ipairs(elts) do
  654. local obj_number,offset = pair[1], pair[2]
  655. offset = offset + first
  656. if offset < #uncompressed then
  657. local span_len
  658. if i == #elts then
  659. span_len = #uncompressed - offset
  660. else
  661. span_len = (elts[i + 1][2] + first) - offset
  662. end
  663. if span_len > 0 and offset + span_len < #uncompressed then
  664. local obj = {
  665. major = obj_number,
  666. minor = 0, -- Implicit
  667. data = uncompressed:span(offset + 1, span_len),
  668. ref = obj_ref(obj_number, 0)
  669. }
  670. parse_object_grammar(obj, task, pdf)
  671. if obj.dict then
  672. pdf.objects[#pdf.objects + 1] = obj
  673. end
  674. end
  675. end
  676. end
  677. end
  678. end
  679. -- PDF 1.5 ObjStmt
  680. local function extract_pdf_compound_objects(task, pdf)
  681. for _,obj in ipairs(pdf.objects or {}) do
  682. if obj.stream and obj.dict and type(obj.dict) == 'table' then
  683. local t = obj.dict.Type
  684. if t and t == 'ObjStm' then
  685. -- We are in troubles sir...
  686. local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
  687. local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))
  688. if nobjs and first then
  689. --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
  690. lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
  691. nobjs, first, obj.dict.Extends)
  692. local uncompressed = maybe_extract_object_stream(obj, pdf, task)
  693. if uncompressed then
  694. pdf_compound_object_unpack(obj, uncompressed, pdf, task, first)
  695. end
  696. else
  697. lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
  698. obj.major, obj.minor, obj.dict)
  699. end
  700. end
  701. end
  702. end
  703. end
  704. -- This function arranges starts and ends of all objects and process them into initial
  705. -- set of objects
  706. local function extract_outer_objects(task, input, pdf)
  707. local start_pos, end_pos = 1, 1
  708. local obj_count = 0
  709. while start_pos <= #pdf.start_objects and end_pos <= #pdf.end_objects do
  710. local first = pdf.start_objects[start_pos]
  711. local last = pdf.end_objects[end_pos]
  712. -- 7 is length of `endobj\n`
  713. if first + 7 < last then
  714. local len = last - first - 7
  715. -- Also get the starting span and try to match it versus obj re to get numbers
  716. local obj_line_potential = first - 32
  717. if obj_line_potential < 1 then obj_line_potential = 1 end
  718. local prev_obj_end = pdf.end_objects[end_pos - 1]
  719. if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
  720. obj_line_potential = prev_obj_end + 1
  721. end
  722. local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1)
  723. local matches = object_re:search(obj_line_span, true, true)
  724. if matches and matches[1] then
  725. local nobj = {
  726. start = first,
  727. len = len,
  728. data = input:span(first, len),
  729. major = tonumber(matches[1][2]),
  730. minor = tonumber(matches[1][3]),
  731. }
  732. pdf.objects[obj_count + 1] = nobj
  733. if nobj.major and nobj.minor then
  734. -- Add reference
  735. local ref = obj_ref(nobj.major, nobj.minor)
  736. nobj.ref = ref -- Our internal reference
  737. pdf.ref[ref] = nobj
  738. end
  739. end
  740. obj_count = obj_count + 1
  741. start_pos = start_pos + 1
  742. end_pos = end_pos + 1
  743. elseif first > last then
  744. end_pos = end_pos + 1
  745. else
  746. start_pos = start_pos + 1
  747. end_pos = end_pos + 1
  748. end
  749. end
  750. end
  -- Attaches stream payloads to the outer objects that enclose them.
  --
  -- Iterates `pdf.objects` while advancing two cursors over the
  -- `pdf.start_streams` and `pdf.end_streams` offset arrays. When a start/end
  -- pair lies strictly inside an object's span, the enclosed bytes are stored
  -- as `obj.stream = { start, len, data }`. Does nothing when either offset
  -- array is missing.
  --
  -- @param task  passed through to debug logging only
  -- @param input object providing `at(pos)` and `span(start, len)`
  -- @param pdf   table holding `objects`, `start_streams` and `end_streams`
  local function attach_pdf_streams(task, input, pdf)
    if not (pdf.start_streams and pdf.end_streams) then
      return
    end

    -- Raw end offsets include the `endstream\n` pattern; this many bytes are
    -- removed to get the end of the actual payload.
    local endstream_len = 10
    local start_pos, end_pos = 1, 1

    for _, obj in ipairs(pdf.objects) do
      local obj_end = obj.start + obj.len

      while start_pos <= #pdf.start_streams and end_pos <= #pdf.end_streams do
        local first = pdf.start_streams[start_pos]
        local last = pdf.end_streams[end_pos] - endstream_len

        lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s",
            first, last, obj.start, obj_end)

        if first > obj.start and last < obj_end and last > first then
          -- In case we have a fake endstream: keep taking later end markers
          -- while they still fall inside this object. Each candidate must be
          -- adjusted like the first one, otherwise the `endstream` keyword
          -- itself ends up inside the stream data.
          while pdf.end_streams[end_pos + 1]
              and pdf.end_streams[end_pos + 1] < obj_end do
            end_pos = end_pos + 1
            last = pdf.end_streams[end_pos] - endstream_len
          end

          -- Skip leading CR (13) / LF (10) bytes after the start marker
          while first < last do
            local chr = input:at(first)
            if chr ~= 13 and chr ~= 10 then break end
            first = first + 1
          end

          local len = last - first
          obj.stream = {
            start = first,
            len = len,
            data = input:span(first, len)
          }
          start_pos = start_pos + 1
          end_pos = end_pos + 1
          break
        elseif first < obj.start then
          -- Stream start precedes this object: try the next start marker
          start_pos = start_pos + 1
        elseif last > obj_end then
          -- Not this object
          break
        else
          start_pos = start_pos + 1
          end_pos = end_pos + 1
        end
      end

      if obj.stream then
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length',
            obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len)
      else
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
            obj.major, obj.minor, obj.start, obj.len)
      end
    end
  end
  -- Builds the object model for a scanned document.
  --
  -- Resets `pdf.objects` and `pdf.ref`, extracts the outer objects, attaches
  -- the streams that lie within their bounds, parses the grammar of every
  -- object that received a reference, extracts compound objects and finally
  -- runs dictionary (type) processing on each object that has a `dict`.
  local function postprocess_pdf_objects(task, input, pdf)
    -- Start from empty objects and references tables
    pdf.objects, pdf.ref = {}, {}

    extract_outer_objects(task, input, pdf)
    -- Objects are known now: bind the streams that fall inside them
    attach_pdf_streams(task, input, pdf)

    -- Outer grammar is parsed only for objects that got a reference
    for _, entry in ipairs(pdf.objects) do
      local reference = entry.ref
      if reference then
        parse_object_grammar(entry, task, pdf)
      end
    end

    extract_pdf_compound_objects(task, pdf)

    -- By now all objects have probably been processed: handle their types
    for _, entry in ipairs(pdf.objects) do
      local dict = entry.dict
      if dict then
        process_dict(task, pdf, entry, dict)
      end
    end
  end
  -- Pairs start offsets with end offsets and appends `{ start, len }` blocks
  -- to `out`.
  --
  -- Both arrays are consumed with independent cursors: an end offset that
  -- lies before the current start is skipped, and an end offset equal to the
  -- current start is treated as broken ordering and stops the pairing.
  local function offsets_to_blocks(starts, ends, out)
    local si, ei = 1, 1

    while si <= #starts and ei <= #ends do
      local from, to = starts[si], ends[ei]

      if from == to then
        -- Not ordered properly!
        break
      end

      if from < to then
        out[#out + 1] = {
          start = from,
          len = to - from,
        }
        si = si + 1
      end

      -- The end cursor advances both after a pairing and after a stale end
      ei = ei + 1
    end
  end
  -- Extracts text for every `Page` object that has `contents`.
  --
  -- For each content object the stream is (maybe) extracted, the text trie is
  -- matched against its uncompressed data, matches of pattern 1 are used as
  -- block starts and all other matches as block ends. Each resulting block is
  -- fed to the text grammar; successful results are joined into `obj.text`.
  local function search_text(task, pdf)
    for _, page in ipairs(pdf.objects) do
      if page.type == 'Page' and page.contents then
        local pieces = {}

        for _, content in ipairs(page.contents) do
          maybe_extract_object_stream(content, pdf, task)
          local hits = pdf_text_trie:match(content.uncompressed or '')

          if hits then
            -- Split matched offsets: pattern 1 opens a block, others close it
            local begins, finishes = {}, {}
            for pat_idx, offsets in pairs(hits) do
              local bucket = (pat_idx == 1) and begins or finishes
              for _, off in ipairs(offsets) do
                bucket[#bucket + 1] = off
              end
            end

            local blocks = {}
            offsets_to_blocks(begins, finishes, blocks)

            for _, block in ipairs(blocks) do
              if block.len > 2 then
                -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
                block.len = block.len - 2
              end
              block.data = content.uncompressed:span(block.start, block.len)

              -- Blocks at or above the configured size limit are not parsed
              if block.len < config.max_processing_size then
                local ok, parsed = pcall(pdf_text_grammar.match, pdf_text_grammar,
                    block.data)

                if not ok then
                  lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
                      page.major, page.minor, parsed)
                else
                  pieces[#pieces + 1] = parsed
                  lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
                      parsed, content.major, content.minor, page.major, page.minor)
                end
              end
            end
          end
        end

        -- Join all text data together
        if #pieces > 0 then
          page.text = rspamd_text.fromtable(pieces)
          lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
              page.major, page.minor, page.text)
        end
      end
    end
  end
  -- Searches object dictionaries for `URI` keys and injects every value that
  -- can be turned into a URL into the task.
  --
  -- Nested tables are walked recursively; a walk that goes deeper than 10
  -- levels is abandoned with a debug message.
  local function search_urls(task, pdf)
    local function walk(owner, node, depth)
      if depth > 10 then
        lua_util.debugm(N, task, 'object %s:%s recurses too much',
            owner.major, owner.minor)
        return
      end

      for key, value in pairs(node) do
        if type(value) == 'table' then
          -- Tables are always descended into, whatever their key is
          walk(owner, value, depth + 1)
        elseif key == 'URI' then
          local target = maybe_dereference_object(value, pdf, task)
          local url = type(target) == 'string'
              and rspamd_url.create(task:get_mempool(), target)

          if url then
            lua_util.debugm(N, task, 'found url %s in object %s:%s',
                target, owner.major, owner.minor)
            task:inject_url(url)
          end
        end
      end
    end

    for _, candidate in ipairs(pdf.objects) do
      local dict = candidate.dict
      if dict and type(dict) == 'table' then
        walk(candidate, dict, 0)
      end
    end
  end
  -- Scans `input` with the PDF trie and builds the output table for it.
  --
  -- Returns an empty table when processing is disabled in the config and
  -- nothing when the trie finds no matches. Otherwise matched offsets are
  -- grouped per processor, each processor is run on its offsets sorted in
  -- ascending order, and, when both object start and end markers were found,
  -- objects are post-processed and optional text, URL and Javascript fuzzy
  -- hash extraction is performed. Without object markers the result gets
  -- `flags.no_objects = true`.
  local function process_pdf(input, _, task)
    if not config.enabled then
      -- Skip processing
      return {}
    end

    local hits = pdf_trie:match(input)
    if not hits then
      return
    end

    local result = {
      tag = 'pdf',
      extract_text = extract_text_data,
    }

    -- Bucket every matched offset under the processor named by its index entry
    local groups = {}
    for pat_id, positions in pairs(hits) do
      local idx = pdf_indexes[pat_id]
      local key, local_pat = idx[1], idx[4]

      local group = groups[key]
      if not group then
        group = {
          processor_func = processors[key],
          offsets = {},
        }
        groups[key] = group
      end

      -- Fill offsets
      for _, pos in ipairs(positions) do
        group.offsets[#group.offsets + 1] = { pos, local_pat }
      end
    end

    for key, group in pairs(groups) do
      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
          key, #group.offsets)
      -- Sort by offset
      table.sort(group.offsets, function(a, b) return a[1] < b[1] end)
      group.processor_func(input, task, group.offsets, result)
    end

    result.flags = {}

    if not (result.start_objects and result.end_objects) then
      result.flags.no_objects = true
      return result
    end

    -- Postprocess objects
    postprocess_pdf_objects(task, input, result)

    if config.text_extraction then
      search_text(task, result)
    end

    if config.url_extraction then
      search_urls(task, result)
    end

    if config.js_fuzzy and result.scripts then
      local hashes = {}
      result.fuzzy_hashes = hashes

      -- A script qualifies only when a minimum length is configured and met
      local function long_enough(data)
        return config.min_js_fuzzy and #data >= config.min_js_fuzzy
      end

      if config.openaction_fuzzy_only then
        -- OpenAction only
        local action = result.openaction
        if action and action.bin_hash then
          if long_enough(action.data) then
            lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
                action.hash)
            hashes[#hashes + 1] = action.bin_hash
          else
            lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
                action.hash, #action.data)
          end
        end
      else
        -- All hashes
        for hash_key, script in pairs(result.scripts) do
          if long_enough(script.data) then
            lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
                script.hash)
            hashes[#hashes + 1] = hash_key
          else
            lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
                script.hash, #script.data)
          end
        end
      end
    end

    return result
  end
  -- Processes the PDF trailer: takes the span of `input` that begins at the
  -- last matched position and sets `output.encrypted` when any of its lines
  -- contains `/Encrypt `.
  processors.trailer = function(input, task, positions, output)
    local final_match = positions[#positions]
    local tail = input:span(final_match[1])

    for trailer_line in tail:lines(true) do
      local found = trailer_line:find('/Encrypt ')
      if found then
        lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
            trailer_line)
        output.encrypted = true
      end
    end
  end
  -- Marks the document as suspicious. Input and match positions are ignored:
  -- the fact that this processor was invoked is the whole signal.
  processors.suspicious = function(_, task, _, pdf_output)
    lua_util.debugm(
        N, task, "pdf: found a suspicious pattern")
    pdf_output.suspicious = true
  end
  -- Appends the first element of every entry in `positions` to the array
  -- stored at `output[output_key]`, creating that array when it is missing.
  local function generic_table_inserter(positions, output, output_key)
    local dest = output[output_key]
    if not dest then
      dest = {}
      output[output_key] = dest
    end

    -- New values go right after whatever the array already holds
    local base = #dest
    for idx, entry in ipairs(positions) do
      dest[base + idx] = entry[1]
    end
  end
  -- Marker processors: each one appends the matched offsets to the array
  -- stored under the paired key of the output table.
  do
    local marker_outputs = {
      { 'start_object', 'start_objects' },
      { 'end_object', 'end_objects' },
      { 'start_stream', 'start_streams' },
      { 'end_stream', 'end_streams' },
    }

    for _, pair in ipairs(marker_outputs) do
      local proc_name, output_key = pair[1], pair[2]
      -- Input and task are unused by these processors
      processors[proc_name] = function(_, _, positions, output)
        generic_table_inserter(positions, output, output_key)
      end
    end
  end

  -- Module interface: `process` is the PDF processing entry point
  exports.process = process_pdf

  return exports