You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

pdf.lua 42KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389
  1. --[[
  2. Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_content/pdf
  15. -- This module contains some heuristics for PDF files
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local rspamd_text = require "rspamd_text"
  20. local rspamd_url = require "rspamd_url"
  21. local bit = require "bit"
  22. local N = "lua_content"
  23. local lua_util = require "lua_util"
  24. local rspamd_regexp = require "rspamd_regexp"
  25. local lpeg = require "lpeg"
  -- Markers searched for in the raw PDF body. Each key names a marker class
  -- and carries the list of regexps that get compiled into the main PDF trie.
  local pdf_patterns = {
    -- The `trailer` keyword sitting on a line of its own
    trailer = { patterns = { [[\ntrailer\r?\n]] } },
    -- Content that looks out of place inside a document
    suspicious = {
      patterns = {
        [[netsh\s]],
        [[echo\s]],
        [[\/[A-Za-z]*#\d\d(?:[#A-Za-z<>/\s])]], -- Hex encode obfuscation
      },
    },
    -- `<major> <minor> obj` object header
    start_object = { patterns = { [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=] } },
    -- `endobj` followed by a line break
    end_object = { patterns = { [=[endobj[\r\n]]=] } },
    -- `stream` keyword right after the closing `>` of a dictionary
    start_stream = { patterns = { [=[>\s*stream[\r\n]]=] } },
    -- `endstream` followed by a line break
    end_stream = { patterns = { [=[endstream[\r\n]]=] } },
  }
  -- Markers delimiting text blocks inside a content stream: `BT` opens a
  -- block and `ET` closes it.
  local pdf_text_patterns = {
    start = { patterns = { [[\sBT\s]] } },
    stop = { patterns = { [[\sET\b]] } },
  }
  -- Markers delimiting the mapping sections of a CMap stream. A section is
  -- opened by `<count> beginbfchar` / `<count> beginbfrange` and closed by the
  -- matching `endbfchar` / `endbfrange` operator.
  local pdf_cmap_patterns = {
    start = {
      patterns = {
        [[\d\s+beginbfchar\s]],
        [[\d\s+beginbfrange\s]]
      }
    },
    stop = {
      patterns = {
        [[\sendbfrange\b]],
        -- The operator is `endbfchar`; the previous `endbchar` spelling could
        -- never match the end of a bfchar section
        [[\sendbfchar\b]]
      }
    }
  }
  -- index[n] -> t, where (as filled in by compile_pats):
  -- t[1] - key in patterns table,
  -- t[2] - value in patterns table,
  -- t[3] - pattern,
  -- t[4] - local pattern index (position inside value.patterns)
  91. local pdf_indexes = {}
  92. local pdf_text_indexes = {}
  93. local pdf_cmap_indexes = {}
  94. local pdf_trie
  95. local pdf_text_trie
  96. local pdf_cmap_trie
  97. local exports = {}
  -- Default module settings; config_module() overrides them with the `pdf`
  -- section of the `lua_content` options when that section is present.
  local config = {
    enabled = true,
    -- Size limits
    max_extraction_size = 512 * 1024,
    max_processing_size = 32 * 1024,
    max_pdf_objects = 10000, -- Maximum number of objects to be considered
    max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
    max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
    pdf_process_timeout = 1.0, -- Timeout in seconds for processing
    -- Extraction features
    text_extraction = false, -- NYI feature
    url_extraction = true,
    -- Javascript fuzzy hashing
    js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
    min_js_fuzzy = 256, -- Minimum size of js to be considered as a fuzzy
    openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
  }
  112. -- Used to process patterns found in PDF
  -- positions for functional processors should be an iterator/table from the trie matcher in the form
  114. ---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
  115. ---- pat_idxn is pattern index and n1 ... nn are match positions
  116. local processors = {}
  117. -- PDF objects outer grammar in LPEG style (performing table captures)
  118. local pdf_outer_grammar
  119. local pdf_text_grammar
  120. -- Used to match objects
  121. local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=])
  -- Merges the `pdf` section of the `lua_content` options (when present) over
  -- the built-in defaults held in `config`.
  local function config_module()
    local opts = rspamd_config:get_all_opt('lua_content')
    local pdf_opts = opts and opts.pdf
    if not pdf_opts then
      return
    end
    config = lua_util.override_defaults(config, pdf_opts)
  end
  -- Builds the three pattern tries (objects, text blocks, cmaps) once; a trie
  -- that already exists is left untouched, so repeated calls are cheap.
  local function compile_tries()
    local trie_flags = rspamd_trie.flags
    local default_compile_flags = bit.bor(trie_flags.re, trie_flags.dot_all,
        trie_flags.no_start)

    -- Flattens a patterns table into the list handed to the trie and records,
    -- for each flattened slot, {key, value, pattern, local index} in `indexes`
    local function compile_pats(patterns, indexes, compile_flags)
      local flat = {}
      for what, data in pairs(patterns) do
        for i, pat in ipairs(data.patterns) do
          flat[#flat + 1] = pat
          indexes[#indexes + 1] = { what, data, pat, i }
        end
      end
      return rspamd_trie.create(flat, compile_flags or default_compile_flags)
    end

    pdf_trie = pdf_trie or compile_pats(pdf_patterns, pdf_indexes)
    pdf_text_trie = pdf_text_trie or compile_pats(pdf_text_patterns, pdf_text_indexes)
    pdf_cmap_trie = pdf_cmap_trie or compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
  end
  152. -- Returns a table with generic grammar elements for PDF
  153. local function generic_grammar_elts()
  154. local P = lpeg.P
  155. local R = lpeg.R
  156. local S = lpeg.S
  157. local V = lpeg.V
  158. local C = lpeg.C
  159. local D = R'09' -- Digits
  160. local grammar_elts = {}
  161. -- Helper functions
  -- Decodes the digits of a PDF hex string (the part between `<` and `>`)
  -- into raw bytes.
  -- @param s string consisting of hex digits only
  -- @return exactly one value: the decoded byte string
  local function pdf_hexstring_unescape(s)
    local function ue(cc)
      return string.char(tonumber(cc, 16))
    end
    if #s % 2 ~= 0 then
      -- Odd number of digits: the missing final digit is taken to be '0'
      s = s .. '0'
    end
    -- The parentheses drop gsub's second result (the substitution count);
    -- without them the count leaks out as an extra LPeg capture value
    return (s:gsub('..', ue))
  end
  -- Resolves backslash escapes inside a PDF literal string.
  -- Handles the named escapes (\n, \r, \t, \b, \f), escaped literal
  -- characters such as \( \) \\ and octal escapes of one to three digits.
  -- @param s raw string body (without the enclosing parentheses)
  -- @return the unescaped string
  -- NOTE: decoding is done in two passes (named/literal escapes first, octal
  -- second), so a literal backslash followed by digits is still read as an
  -- octal escape; this matches the previous behaviour.
  local function pdf_string_unescape(s)
    local named = {
      n = '\n',
      r = '\r',
      t = '\t',
      b = '\b',
      f = '\f',
    }
    local function ue_single(cc)
      -- Drop the backslash; unknown escapes simply yield the escaped char
      local ch = cc:sub(2, 2)
      return named[ch] or ch
    end
    -- simple unescape \char
    s = s:gsub('\\[^%d]', ue_single)
    -- unescape octal
    local function ue_octal(cc)
      local code = tonumber(cc:sub(2), 8)
      if not code then
        -- Replace unknown stuff (e.g. digits 8 and 9) with '?'
        return '?'
      end
      -- Values above 255 would make string.char raise; keep the low byte
      return string.char(code % 256)
    end
    s = s:gsub('\\%d%d?%d?', ue_octal)
    return s
  end
  -- Resolves `#xx` escapes inside a PDF name, where `xx` is a two-digit
  -- hexadecimal character code (e.g. `#2F` is `/`, `#61` is `a`).
  -- @param s name without the leading slash
  -- @return exactly one value: the unescaped name
  local function pdf_id_unescape(s)
    -- %x (not %d): escapes such as #4A or #2f carry hex digits a-f as well
    return (s:gsub('#%x%x', function (cc)
      return string.char(tonumber(cc:sub(2), 16))
    end))
  end
  199. local delim = S'()<>[]{}/%'
  200. grammar_elts.ws = S'\0 \r\n\t\f'
  201. local hex = R'af' + R'AF' + D
  202. -- Comments.
  203. local eol = P'\r\n' + '\n'
  204. local line = (1 - S'\r\n\f')^0 * eol^-1
  205. grammar_elts.comment = P'%' * line
  206. -- Numbers.
  207. local sign = S'+-'^-1
  208. local decimal = D^1
  209. local float = D^1 * P'.' * D^0 + P'.' * D^1
  210. grammar_elts.number = C(sign * (float + decimal)) / tonumber
  211. -- String
  212. grammar_elts.str = P{ "(" * C(((1 - S"()\\") + (P '\\' * 1) + V(1))^0) / pdf_string_unescape * ")" }
  213. grammar_elts.hexstr = P{"<" * C(hex^0) / pdf_hexstring_unescape * ">"}
  214. -- Identifier
  215. grammar_elts.id = P{'/' * C((1-(delim + grammar_elts.ws))^1) / pdf_id_unescape}
  216. -- Booleans (who care about them?)
  217. grammar_elts.boolean = C(P("true") + P("false"))
  218. -- Stupid references
  219. grammar_elts.ref = lpeg.Ct{lpeg.Cc("%REF%") * C(D^1) * " " * C(D^1) * " " * "R"}
  220. return grammar_elts
  221. end
  222. -- Generates a grammar to parse outer elements (external objects in PDF notation)
  -- Builds the LPeg grammar used for the outer (object-level) PDF syntax:
  -- nested arrays, dictionaries and atoms, captured into Lua tables.
  local function gen_outer_grammar()
    local V, Ct, Cf, Cg = lpeg.V, lpeg.Ct, lpeg.Cf, lpeg.Cg
    local elts = generic_grammar_elts()
    local ws0 = elts.ws^0 -- optional whitespace

    return lpeg.P{
      "EXPR";
      EXPR = ws0 * V("ELT")^0 * ws0,
      ELT = V("ARRAY") + V("DICT") + V("ATOM"),
      ATOM = ws0 * (elts.comment + elts.boolean + elts.ref +
          elts.number + V("STRING") + elts.id) * ws0,
      -- Key/value pairs are folded into a fresh table with rawset
      DICT = "<<" * ws0 * Cf(Ct("") * V("KV_PAIR")^0, rawset) * ws0 * ">>",
      KV_PAIR = Cg(elts.id * ws0 * V("ELT") * ws0),
      ARRAY = "[" * ws0 * Ct(V("ELT")^0) * ws0 * "]",
      STRING = lpeg.P{elts.str + elts.hexstr},
    }
  end
  238. -- Graphic state in PDF
  -- Pattern for graphics operators that take no operands: q, Q, h, n and the
  -- path painting operators (W, S, s, F, f, B, b) with any number of `*`
  -- suffixes.
  local function gen_graphics_unary()
    local P, S = lpeg.P, lpeg.S
    local painting = S("WSsFfBb") * P("*")^0
    return S("qQh") + painting + P("n")
  end
  -- Pattern for graphics operators that take a single operand.
  -- LPeg choice is ordered, so multi-character operators must be tried before
  -- the single-character set: with `g` first, `gs` was consumed only as `g`
  -- and its trailing `s` was left to be parsed as a separate operator.
  local function gen_graphics_binary()
    local P = lpeg.P
    local S = lpeg.S
    return P("gs") + P("ri") +
        P("CS") + P("cs") + P("sh") +
        S("gGwJjMi")
  end
  -- Pattern for graphics operators that take two operands: d, m and l.
  local function gen_graphics_ternary()
    return lpeg.S("dml")
  end
  -- Pattern for graphics operators that take a variable number of operands.
  -- LPeg choice is ordered, so `SCN`/`scn` have to be tried before their
  -- prefixes `SC`/`sc`; otherwise only the prefix is consumed and the
  -- remaining `N`/`n` is left in the input.
  local function gen_graphics_nary()
    local P = lpeg.P
    local S = lpeg.S
    return P("SCN") + P("scn") + P("SC") + P("sc") +
        P("RG") + P("rg") + P("re") +
        P("k") + P("K") + S("cvy")
  end
  263. -- Generates a grammar to parse text blocks (between BT and ET)
  -- Builds the LPeg grammar for the operators found between BT and ET.
  -- The match result is a single table capture holding what the COMMAND
  -- rules capture (text arguments, font markers and replacement strings).
  local function gen_text_grammar()
    local V, P, C = lpeg.V, lpeg.P, lpeg.C
    local Ct, Cc, Cf, Cg = lpeg.Ct, lpeg.Cc, lpeg.Cf, lpeg.Cg
    local elts = generic_grammar_elts()
    local ws0 = elts.ws^0 -- optional whitespace
    local ws1 = elts.ws^1 -- mandatory whitespace
    local empty = ""

    -- Operators grouped by the number of operands they consume
    local unary_ops = C("T*") / "\n" +
        C(gen_graphics_unary()) / empty
    local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
        gen_graphics_binary()
    local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
    local nary_op = P("Tm") + gen_graphics_nary()
    -- Text showing operators
    local text_binary_op = P("Tj") + P("TJ") + P("'")
    local text_quote_op = P('"')
    local font_op = P("Tf")

    return lpeg.P{
      "EXPR";
      EXPR = ws0 * Ct(V("COMMAND")^0),
      COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
          V("FONT") + elts.comment) * ws0,
      UNARY = unary_ops,
      BINARY = V("ARG") / empty * ws1 * binary_ops,
      TERNARY = V("ARG") / empty * ws1 * V("ARG") / empty * ws1 * ternary_ops,
      NARY = (elts.number / 0 * ws1)^1 * (elts.id / empty * ws0)^-1 * nary_op,
      ARG = V("ARRAY") + V("DICT") + V("ATOM"),
      ATOM = (elts.comment + elts.boolean + elts.ref +
          elts.number + V("STRING") + elts.id),
      DICT = "<<" * ws0 * Cf(Ct("") * V("KV_PAIR")^0, rawset) * ws0 * ">>",
      KV_PAIR = Cg(elts.id * ws0 * V("ARG") * ws0),
      ARRAY = "[" * ws0 * Ct(V("ARG")^0) * ws0 * "]",
      STRING = lpeg.P{elts.str + elts.hexstr},
      TEXT = (V("TEXT_ARG") * ws1 * text_binary_op) +
          (V("ARG") / 0 * ws1 * V("ARG") / 0 * ws1 * V("TEXT_ARG") * ws1 * text_quote_op),
      FONT = (V("FONT_ARG") * ws1 * (elts.number / 0) * ws1 * font_op),
      FONT_ARG = Ct(Cc("%font%") * elts.id),
      TEXT_ARG = Ct(V("STRING")) + V("TEXT_ARRAY"),
      TEXT_ARRAY = "[" *
          Ct(((ws0 * (ws0 * (elts.number / 0)^0 * ws0 * (elts.str + elts.hexstr)))^1)) * ws0 * "]",
    }
  end
  304. -- Call immediately on require
  305. compile_tries()
  306. config_module()
  307. pdf_outer_grammar = gen_outer_grammar()
  308. pdf_text_grammar = gen_text_grammar()
  -- Placeholder for text extraction: not yet implemented, so it yields nil
  -- regardless of the argument.
  local function extract_text_data(specific)
    -- NYI
    return nil
  end
  312. -- Generates index for major/minor pair
  -- Folds an object's major/minor numbers into one numeric key:
  -- ten times the major number plus the reciprocal of (minor + 1).
  local function obj_ref(major, minor)
    local minor_part = 1.0 / (minor + 1.0)
    return major * 10.0 + minor_part
  end
  316. -- Return indirect object reference (if needed)
  -- Resolves an indirect reference ({'%REF%', major, minor}) through pdf.ref.
  -- Anything that is not a reference is handed back unchanged; a reference
  -- with no known target yields nil. Only one level is resolved.
  local function maybe_dereference_object(elt, pdf, task)
    local is_reference = type(elt) == 'table' and elt[1] == '%REF%'
    if not is_reference then
      return elt
    end

    local key = obj_ref(elt[2], elt[3])
    local target = pdf.ref[key]
    if target then
      -- No recursion!
      return target
    end

    lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s, no object',
        elt[2], elt[3], key)
    return nil
  end
  331. -- Apply PDF stream filter
  -- Applies a single named stream filter to `input`. Only 'FlateDecode' is
  -- handled (inflate capped at config.max_extraction_size); every other
  -- filter name yields nil.
  local function apply_pdf_filter(input, filt)
    if filt ~= 'FlateDecode' then
      return nil
    end
    return rspamd_util.inflate(input, config.max_extraction_size)
  end
  338. -- Conditionally apply a pipeline of stream filters and return uncompressed data
  -- Runs `data` through the filter chain named by dict.Filter, if there is one.
  -- @return decoded data (or the input when no filter is set), plus an error
  --   string; the error is only set when DecodeParms carries a Predictor.
  --   A filter that cannot be applied yields nil data with a nil error.
  local function maybe_apply_filter(dict, data, pdf, task)
    local filters = dict.Filter
    if not filters then
      return data, nil
    end
    if type(filters) == 'string' then
      -- A single filter name is treated as a one-element chain
      filters = { filters }
    end

    local params = dict.DecodeParms
    if params then
      params = maybe_dereference_object(params, pdf, task)
      if type(params) == 'table' and params.Predictor then
        return nil, 'predictor exists'
      end
    end

    local result = data
    for _, name in ipairs(filters) do
      result = apply_pdf_filter(result, name)
      if not result then
        break
      end
    end
    return result, nil
  end
  361. -- Conditionally extract stream data from object and attach it as obj.uncompressed
  -- Extracts and decodes the stream attached to `obj`, storing the result in
  -- obj.uncompressed.
  -- @param obj object table; obj.dict and obj.stream ({data, len}) are read
  -- @param pdf document state; pdf.encrypted disables extraction
  -- @param task passed through to the debug logger
  -- @return the uncompressed data, or nil when the document is encrypted, the
  --   object has no dictionary/stream, or the filter chain cannot be applied
  local function maybe_extract_object_stream(obj, pdf, task)
    if pdf.encrypted then
      -- TODO add decryption some day
      return nil
    end
    local dict = obj.dict
    if type(dict) ~= 'table' then
      -- Callers may pass a table parsed inline (array or dictionary literal)
      -- rather than a real object; it has no `.dict`, so nothing to extract
      return nil
    end
    if dict.Length and type(obj.stream) == 'table' then
      -- Never read past the bytes actually found, whatever /Length claims
      local len = math.min(obj.stream.len,
          tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
      local real_stream = obj.stream.data:span(1, len)
      local uncompressed,filter_err = maybe_apply_filter(dict, real_stream, pdf, task)
      if uncompressed then
        obj.uncompressed = uncompressed
        lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
            obj.major, obj.minor, len, uncompressed:len())
        return obj.uncompressed
      else
        lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s',
            obj.major, obj.minor, len, dict.Filter, filter_err)
      end
    end
    return nil
  end
  -- Parses the dictionary part of `obj` with the outer grammar and records the
  -- result: stream objects get obj.dict only, direct objects also get
  -- obj.uncompressed and are registered in pdf.ref. Spans of
  -- config.max_processing_size bytes or more are not parsed.
  local function parse_object_grammar(obj, task, pdf)
    -- For stream objects only the part preceding the stream is parsed
    local span
    if obj.stream then
      span = obj.data:span(1, obj.stream.start - obj.start)
    else
      span = obj.data
    end

    if not (span:len() < config.max_processing_size) then
      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
          obj.major, obj.minor, span:len())
      return
    end

    local ok, parsed = pcall(pdf_outer_grammar.match, pdf_outer_grammar, span)
    if not ok then
      -- On failure `parsed` holds the error raised by the grammar
      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
          obj.major, obj.minor, parsed)
      return
    end

    local is_table = type(parsed) == 'table'

    if obj.stream then
      if is_table then
        obj.dict = parsed
      else
        obj.dict = {}
      end
      lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
          obj.major, obj.minor, parsed)
      return
    end

    -- Direct object
    if is_table then
      obj.dict = parsed
      obj.uncompressed = parsed
      lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
          obj.major, obj.minor, parsed)
      pdf.ref[obj_ref(obj.major, obj.minor)] = obj
    else
      lua_util.debugm(N, task, 'direct object %s:%s is parsed to raw data: %s',
          obj.major, obj.minor, parsed)
      -- Raw values are registered directly so references resolve to the value
      pdf.ref[obj_ref(obj.major, obj.minor)] = parsed
      obj.dict = {}
      obj.uncompressed = parsed
    end
  end
  428. -- Extracts font data and process /ToUnicode mappings
  429. -- NYI in fact as cmap is ridiculously stupid and complicated
  430. --[[
  431. local function process_font(task, pdf, font, fname)
  432. local dict = font
  433. if font.dict then
  434. dict = font.dict
  435. end
  436. if type(dict) == 'table' and dict.ToUnicode then
  437. local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)
  438. if cmap and cmap.dict then
  439. maybe_extract_object_stream(cmap, pdf, task)
  440. lua_util.debugm(N, task, 'found cmap for font %s: %s',
  441. fname, cmap.uncompressed)
  442. end
  443. end
  444. end
  445. --]]
  446. -- Forward declaration
  447. local process_dict
  448. -- This function processes javascript string and returns JS hash and JS rspamd_text
  -- Normalises a javascript payload to a one-line rspamd_text, hashes it and
  -- registers it in pdf.scripts keyed by the binary hash.
  -- @return the script record {data, hash, bin_hash}; an already registered
  --   record for duplicates; nil when `js` is neither string nor userdata
  local function process_javascript(task, pdf, js)
    local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"

    local js_type = type(js)
    if js_type == 'string' then
      js = rspamd_text.fromstring(js):oneline()
    elseif js_type == 'userdata' then
      js = js:oneline()
    else
      return nil
    end

    local hash = rspamd_cryptobox_hash.create(js)
    local bin_hash = hash:bin()

    pdf.scripts = pdf.scripts or {}

    local known = pdf.scripts[bin_hash]
    if known then
      -- Duplicate
      return known
    end

    local njs = {
      data = js,
      hash = hash:hex(),
      bin_hash = bin_hash,
    }
    pdf.scripts[bin_hash] = njs
    return njs
  end
  475. -- Extract interesting stuff from /Action, e.g. javascript
  -- Extracts interesting stuff from an /Action object: javascript goes to
  -- obj.js, a launch target to obj.launch. Objects that already carry either
  -- field, or whose dictionary has no /JS key, are left alone.
  -- NOTE(review): the /F (launch) branch is only reached when /JS is present
  -- but cannot be dereferenced - confirm whether launch actions without /JS
  -- are meant to be handled here.
  local function process_action(task, pdf, obj)
    if obj.js or obj.launch then
      return
    end
    if not (obj.dict and obj.dict.JS) then
      return
    end

    local js = maybe_dereference_object(obj.dict.JS, pdf, task)

    if js then
      if type(js) == 'table' then
        -- Javascript stored in a separate stream object
        local extracted_js = maybe_extract_object_stream(js, pdf, task)
        if extracted_js then
          js = extracted_js
        else
          lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
              obj.major, obj.minor, js)
        end
      end

      js = process_javascript(task, pdf, js)
      if js then
        obj.js = js
        lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
            obj.major, obj.minor, obj.js.data)
      else
        lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
            obj.major, obj.minor, js)
      end
      return
    end

    if not obj.dict.F then
      lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
          obj.major, obj.minor)
      return
    end

    local launch = maybe_dereference_object(obj.dict.F, pdf, task)
    if not launch then
      return
    end

    local launch_type = type(launch)
    if launch_type == 'string' then
      obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
      lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
          obj.major, obj.minor, obj.launch)
    elseif launch_type == 'userdata' then
      obj.launch = launch:exclude_chars('%n%c')
      lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
          obj.major, obj.minor, obj.launch)
    else
      lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
          obj.major, obj.minor, launch)
    end
  end
  520. -- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
  -- Extracts interesting stuff from /Catalog: the /OpenAction target is
  -- processed and its javascript (pdf.openaction) or launch target
  -- (pdf.launch) is recorded on the document.
  local function process_catalog(task, pdf, obj)
    if not obj.dict then
      return
    end

    if not obj.dict.OpenAction then
      lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
          obj.major, obj.minor)
      return
    end

    local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
    if not (action and type(action) == 'table') then
      lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
          obj.major, obj.minor, obj.dict.OpenAction, action)
      return
    end

    -- This also processes action js (if not already processed)
    process_dict(task, pdf, action, action.dict)

    if action.js then
      lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
          obj.major, obj.minor, action.js)
      pdf.openaction = action.js
    elseif action.launch then
      lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
          obj.major, obj.minor, action.launch)
      pdf.launch = action.launch
    else
      lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
          obj.major, obj.minor, action)
    end
  end
  -- Inspects an XRef stream dictionary: the presence of /Encrypt marks the
  -- whole document as encrypted (pdf.encrypted = true).
  local function process_xref(task, pdf, obj)
    local dict = obj.dict
    if not (dict and dict.Encrypt) then
      return
    end

    local encrypt = maybe_dereference_object(dict.Encrypt, pdf, task)
    lua_util.debugm(N, task, 'found encrypt: %s in xref object %s:%s',
        encrypt, obj.major, obj.minor)
    pdf.encrypted = true
  end
  -- Processes the dictionary of a PDF object: determines obj.type, links
  -- content objects (obj.contents), resolves or inherits resources
  -- (obj.resources) and dispatches to the type-specific handlers.
  -- Objects that already have a type, and non-table dictionaries, are skipped.
  -- @param task passed through to helpers and the debug logger
  -- @param pdf document state (reference table etc.)
  -- @param obj object being processed; mutated in place
  -- @param dict the object's dictionary
  process_dict = function(task, pdf, obj, dict)
    if obj.type or type(dict) ~= 'table' then
      -- Already processed dict (obj.type is not empty) or nothing to process
      return
    end

    if dict.Type and type(dict.Type) == 'string' then
      -- Common stuff
      obj.type = dict.Type
    end

    if not obj.type then
      -- The validated `dict` argument is used here (not obj.dict), so an
      -- object whose own `.dict` field is unset cannot cause a nil index
      if dict.S and dict.JS then
        obj.type = 'Javascript'
        lua_util.debugm(N, task, 'implicit type for Javascript object %s:%s',
            obj.major, obj.minor)
      else
        lua_util.debugm(N, task, 'no type for %s:%s',
            obj.major, obj.minor)
        return
      end
    end

    lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
        obj.major, obj.minor, obj.type)

    local contents = dict.Contents
    if contents and type(contents) == 'table' then
      if contents[1] == '%REF%' then
        -- Single reference
        contents = {contents}
      end
      obj.contents = {}

      for _,c in ipairs(contents) do
        local cobj = maybe_dereference_object(c, pdf, task)
        if cobj and type(cobj) == 'table' then
          obj.contents[#obj.contents + 1] = cobj
          cobj.parent = obj
          cobj.type = 'content'
        end
      end

      lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
          obj.major, obj.minor, #obj.contents)
    end

    local resources = dict.Resources
    if resources and type(resources) == 'table' then
      local res_ref = maybe_dereference_object(resources, pdf, task)

      if type(res_ref) ~= 'table' then
        lua_util.debugm(N, task, 'cannot parse resources from pdf: %s',
            resources)
        obj.resources = {}
      elseif res_ref.dict then
        obj.resources = res_ref.dict
      else
        obj.resources = {}
      end
    else
      -- No resources of our own: inherit them from the parent when possible
      resources = {}
      if dict.Parent then
        local parent = maybe_dereference_object(dict.Parent, pdf, task)
        if parent and type(parent) == 'table' and parent.dict then
          if parent.resources then
            lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
                parent.major, parent.minor, obj.major, obj.minor)
            resources = parent.resources
          end
        end
      end

      obj.resources = resources
    end

    --[[Disabled fonts extraction
    local fonts = obj.resources.Font
    if fonts and type(fonts) == 'table' then
      obj.fonts = {}
      for k,v in pairs(fonts) do
        obj.fonts[k] = maybe_dereference_object(v, pdf, task)

        if obj.fonts[k] then
          local font = obj.fonts[k]

          if config.text_extraction then
            process_font(task, pdf, font, k)
            lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
                k, obj.major, obj.minor, font)
          end
        end
      end
    end
    ]]

    lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
        obj.major, obj.minor, obj.type, obj.resources)

    if obj.type == 'Action' then
      process_action(task, pdf, obj)
    elseif obj.type == 'Catalog' then
      process_catalog(task, pdf, obj)
    elseif obj.type == 'XRef' then
      -- XRef stream instead of trailer from PDF 1.5 (thanks Adobe)
      process_xref(task, pdf, obj)
    elseif obj.type == 'Javascript' then
      local js = maybe_dereference_object(dict.JS, pdf, task)

      if js then
        if type(js) == 'table' then
          -- Javascript stored in a separate stream object
          local extracted_js = maybe_extract_object_stream(js, pdf, task)

          if not extracted_js then
            lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
                obj.major, obj.minor, js)
          else
            js = extracted_js
          end
        end

        js = process_javascript(task, pdf, js)

        if js then
          obj.js = js
          lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
              obj.major, obj.minor, obj.js.data)
        else
          lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
              obj.major, obj.minor, js)
        end
      end
    end
  end
  675. -- This function is intended to unpack objects from ObjStm crappy structure
  -- Cached grammar for the header of an object stream: optional leading
  -- whitespace and comments, then one or more `<number> <number>` pairs,
  -- captured as a table of two-element tables.
  local compound_obj_grammar
  local function compound_obj_grammar_gen()
    if compound_obj_grammar then
      return compound_obj_grammar
    end

    local elts = generic_grammar_elts()
    local ws0, ws1 = elts.ws^0, elts.ws^1
    local pair = lpeg.Ct(elts.number * ws1 * elts.number * ws0)

    compound_obj_grammar = ws0 * (elts.comment * ws1)^0 * lpeg.Ct(pair^1)
    return compound_obj_grammar
  end
  -- Unpacks the objects embedded in a decoded object stream (ObjStm) and
  -- appends every successfully parsed one to pdf.objects.
  -- @param _ the enclosing ObjStm object (unused)
  -- @param uncompressed decoded stream data
  -- @param pdf document state; pdf.objects is extended
  -- @param task passed through to helpers and the debug logger
  -- @param first value of /First: offset of the first embedded object
  local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
    -- The stream starts with pairs of numbers: object number and the offset
    -- of that object relative to `first`
    local grammar = compound_obj_grammar_gen()
    local elts = grammar:match(uncompressed)
    if not elts or #elts == 0 then
      return
    end

    local total = #uncompressed
    lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
        total, elts)

    for i,pair in ipairs(elts) do
      local obj_number,offset = pair[1], pair[2]

      offset = offset + first
      -- Negative offsets would make span() raise, so they are skipped too
      if offset >= 0 and offset < total then
        local span_len
        if i == #elts then
          -- The last object runs up to the end of the stream
          span_len = total - offset
        else
          span_len = (elts[i + 1][2] + first) - offset
        end

        -- `<=`: the last object ends exactly at the end of the data, so a
        -- strict comparison would always drop it
        if span_len > 0 and offset + span_len <= total then
          local obj = {
            major = obj_number,
            minor = 0, -- Implicit
            data = uncompressed:span(offset + 1, span_len),
            ref = obj_ref(obj_number, 0)
          }
          parse_object_grammar(obj, task, pdf)

          if obj.dict then
            pdf.objects[#pdf.objects + 1] = obj
          end
        end
      end
    end
  end
  719. -- PDF 1.5 ObjStmt
  -- PDF 1.5 ObjStm: walks pdf.objects, decodes every object stream and unpacks
  -- the objects embedded in it. The deadline (pdf.end_timestamp) is checked
  -- every 100 objects; on expiry pdf.timeout_processing is set and the walk
  -- stops.
  local function extract_pdf_compound_objects(task, pdf)
    -- Handles a single object; does nothing unless it is an ObjStm stream
    local function unpack_if_objstm(obj)
      if not (obj.stream and obj.dict and type(obj.dict) == 'table') then
        return
      end

      local t = obj.dict.Type
      if not (t and t == 'ObjStm') then
        return
      end

      -- We are in troubles sir...
      local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
      local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))

      if not (nobjs and first) then
        lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
            obj.major, obj.minor, obj.dict)
        return
      end

      --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
      lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
          nobjs, first, obj.dict.Extends)

      local uncompressed = maybe_extract_object_stream(obj, pdf, task)
      if uncompressed then
        pdf_compound_object_unpack(obj, uncompressed, pdf, task, first)
      end
    end

    for i,obj in ipairs(pdf.objects or {}) do
      if i % 100 == 0 then
        local now = rspamd_util.get_ticks()

        if now >= pdf.end_timestamp then
          pdf.timeout_processing = now - pdf.start_timestamp

          lua_util.debugm(N, task, 'pdf: timeout processing compound objects after spending %s seconds, ' ..
              '%s elements processed',
              pdf.timeout_processing, i)
          break
        end
      end

      unpack_if_objstm(obj)
    end
  end
  -- Pairs up the start and end offsets of all objects and builds the initial
  -- set of objects.
  --
  -- Reads `pdf.start_objects` / `pdf.end_objects` (at most
  -- `config.max_pdf_objects` entries of each) and fills:
  --   * `pdf.objects` - sequence of `{start, len, data, major, minor[, ref]}`
  --   * `pdf.ref`     - lookup table from internal reference to object
  --
  -- @param task  task used for debug logging
  -- @param input text of the whole PDF; must support `:span(start, len)`
  -- @param pdf   shared PDF state table
  local function extract_outer_objects(task, input, pdf)
    local start_pos, end_pos = 1, 1
    -- Number of objects actually stored in pdf.objects
    local obj_count = 0
    local max_start_pos = math.min(config.max_pdf_objects, #pdf.start_objects)
    local max_end_pos = math.min(config.max_pdf_objects, #pdf.end_objects)

    lua_util.debugm(N, task, "pdf: extract objects from %s start positions and %s end positions",
        max_start_pos, max_end_pos)

    while start_pos <= max_start_pos and end_pos <= max_end_pos do
      local first = pdf.start_objects[start_pos]
      local last = pdf.end_objects[end_pos]

      -- NOTE(review): 6 is presumably the length of the `endobj` keyword that
      -- precedes the end offset (the previous comment mentioned 7 for
      -- `endobj\n`, which did not match the code) - TODO confirm
      if first + 6 < last then
        local len = last - first - 6

        -- Also get the starting span and try to match it versus obj re to get numbers
        local obj_line_potential = first - 32
        if obj_line_potential < 1 then
          obj_line_potential = 1
        end

        -- Do not look behind the end of the previous object
        local prev_obj_end = pdf.end_objects[end_pos - 1]
        if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
          obj_line_potential = prev_obj_end + 1
        end

        local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1)
        local matches = object_re:search(obj_line_span, true, true)

        if matches and matches[1] then
          local nobj = {
            start = first,
            len = len,
            data = input:span(first, len),
            major = tonumber(matches[1][2]),
            minor = tonumber(matches[1][3]),
          }

          -- Count only the objects that are really stored: bumping the counter
          -- for unmatched headers left nil holes in pdf.objects, and every
          -- later ipairs(pdf.objects) silently stopped at the first hole
          obj_count = obj_count + 1
          pdf.objects[obj_count] = nobj

          if nobj.major and nobj.minor then
            -- Add reference
            local ref = obj_ref(nobj.major, nobj.minor)
            nobj.ref = ref -- Our internal reference
            pdf.ref[ref] = nobj
          end
        end

        start_pos = start_pos + 1
        end_pos = end_pos + 1
      elseif first > last then
        -- End offset lies before the current start: skip this end
        end_pos = end_pos + 1
      else
        -- Start and end are too close to hold an object: skip both
        start_pos = start_pos + 1
        end_pos = end_pos + 1
      end
    end
  end
  -- Attaches streams to the objects that enclose them.
  --
  -- Walks `pdf.objects` in order and, for each object, advances through
  -- `pdf.start_streams` / `pdf.end_streams` (at most `config.max_pdf_objects`
  -- entries of each) looking for a stream that lies inside the object bounds.
  -- On success `obj.stream = {start, len, data}` is set.
  --
  -- @param task  task used for debug logging
  -- @param input text of the whole PDF; must support `:span` and `:byte`
  -- @param pdf   shared PDF state table
  local function attach_pdf_streams(task, input, pdf)
    if pdf.start_streams and pdf.end_streams then
      -- Bytes subtracted from every end offset to exclude the `endstream\n` pattern
      local endstream_len = 10
      local start_pos, end_pos = 1, 1
      local max_start_pos = math.min(config.max_pdf_objects, #pdf.start_streams)
      local max_end_pos = math.min(config.max_pdf_objects, #pdf.end_streams)

      for _,obj in ipairs(pdf.objects) do
        local obj_end = obj.start + obj.len

        while start_pos <= max_start_pos and end_pos <= max_end_pos do
          local first = pdf.start_streams[start_pos]
          local last = pdf.end_streams[end_pos] - endstream_len

          lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s",
              first, last, obj.start, obj_end)

          if first > obj.start and last < obj_end and last > first then
            -- In case if we have fake endstream :(
            -- Take the last endstream that still belongs to this object. The
            -- same `endstream\n` adjustment must be applied here: previously
            -- the raw offset was used, so the stream data included the
            -- terminator itself.
            while pdf.end_streams[end_pos + 1] and pdf.end_streams[end_pos + 1] < obj_end do
              end_pos = end_pos + 1
              last = pdf.end_streams[end_pos] - endstream_len
            end

            -- Strip the leading CR/LF characters
            while first < last do
              local chr = input:byte(first)
              if chr ~= 13 and chr ~= 10 then break end
              first = first + 1
            end

            local len = last - first
            obj.stream = {
              start = first,
              len = len,
              data = input:span(first, len)
            }
            start_pos = start_pos + 1
            end_pos = end_pos + 1
            break
          elseif first < obj.start then
            -- Stream starts before this object: try the next start
            start_pos = start_pos + 1
          elseif last > obj_end then
            -- Not this object
            break
          else
            start_pos = start_pos + 1
            end_pos = end_pos + 1
          end
        end

        if obj.stream then
          lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length',
              obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len)
        else
          lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
              obj.major, obj.minor, obj.start, obj.len)
        end
      end
    end
  end
  -- Drives object post-processing: builds the object list, attaches streams,
  -- parses the grammar of every referenced object (XRef objects are handled
  -- right away), expands compound objects and finally processes dictionaries.
  -- Every stage gives up once `pdf.end_timestamp` is reached.
  local function postprocess_pdf_objects(task, input, pdf)
    -- Returns true (after recording the spent time) when the time budget is
    -- over; the clock is consulted only on every 100th element
    local function deadline_hit(idx, stage)
      if idx % 100 == 0 then
        local ticks = rspamd_util.get_ticks()

        if ticks >= pdf.end_timestamp then
          pdf.timeout_processing = ticks - pdf.start_timestamp
          lua_util.debugm(N, task, 'pdf: timeout processing ' .. stage .. ' after spending %s seconds, ' ..
              '%s elements processed',
              pdf.timeout_processing, idx)
          return true
        end
      end

      return false
    end

    pdf.objects = {} -- objects table
    pdf.ref = {} -- references table

    extract_outer_objects(task, input, pdf)
    -- Streams are attached to the objects whose bounds contain them
    attach_pdf_streams(task, input, pdf)

    -- Outer grammar pass
    for idx, outer in ipairs(pdf.objects) do
      if deadline_hit(idx, 'grammars') then
        break
      end

      if outer.ref then
        parse_object_grammar(outer, task, pdf)

        -- Special early handling
        local dict = outer.dict
        if dict and dict.Type and dict.Type == 'XRef' then
          process_xref(task, pdf, outer)
        end
      end
    end

    if pdf.timeout_processing then
      -- ENOTIME
      return
    end

    extract_pdf_compound_objects(task, pdf)

    -- By now all objects should have been collected: process their types
    for idx, parsed in ipairs(pdf.objects) do
      if parsed.dict then
        if deadline_hit(idx, 'dicts') then
          break
        end

        process_dict(task, pdf, parsed, parsed.dict)
      end
    end
  end
  -- Combines two ascending offset lists into `{start, len}` blocks appended to
  -- `out`. An end offset that precedes the current start is skipped; a start
  -- equal to an end means the input is not ordered properly and stops the scan.
  local function offsets_to_blocks(starts, ends, out)
    local si, ei = 1, 1

    while si <= #starts and ei <= #ends do
      local block_start, block_end = starts[si], ends[ei]

      if block_start == block_end then
        -- Not ordered properly!
        break
      end

      if block_start < block_end then
        out[#out + 1] = {
          start = block_start,
          len = block_end - block_start,
        }
        si = si + 1
      end

      -- The end offset is consumed in both remaining cases
      ei = ei + 1
    end
  end
  -- Collects text of every `Page` object: each content stream is extracted,
  -- scanned with `pdf_text_trie` for text block boundaries, and every block
  -- shorter than `config.max_processing_size` is parsed with
  -- `pdf_text_grammar`. The parsed pieces are joined into `obj.text`.
  local function search_text(task, pdf)
    for _, page in ipairs(pdf.objects) do
      if page.type == 'Page' and page.contents then
        local pieces = {}

        for _, content in ipairs(page.contents) do
          maybe_extract_object_stream(content, pdf, task)
          local found = pdf_text_trie:match(content.uncompressed or '')

          if found then
            -- Pattern 1 marks block starts, any other pattern marks block ends
            local begins, finishes = {}, {}

            for pattern_id, positions in pairs(found) do
              local bucket = (pattern_id == 1) and begins or finishes

              for _, pos in ipairs(positions) do
                bucket[#bucket + 1] = pos
              end
            end

            local blocks = {}
            offsets_to_blocks(begins, finishes, blocks)

            for _, block in ipairs(blocks) do
              if block.len > 2 then
                -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
                block.len = block.len - 2
              end

              block.data = content.uncompressed:span(block.start, block.len)

              if block.len < config.max_processing_size then
                -- The grammar may raise on malformed input, hence pcall
                local ok, parsed_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
                    block.data)

                if ok then
                  pieces[#pieces + 1] = parsed_or_err
                  lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
                      parsed_or_err, content.major, content.minor, page.major, page.minor)
                else
                  lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
                      page.major, page.minor, parsed_or_err)
                end
              end
            end
          end
        end

        -- Join all text data together
        if #pieces > 0 then
          page.text = rspamd_text.fromtable(pieces)
          lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
              page.major, page.minor, page.text)
        end
      end
    end
  end
  -- Searches object dictionaries for the `/URI` key and parses its content;
  -- every URL that can be created is injected into the task for `mpart`.
  local function search_urls(task, pdf, mpart)
    -- Turns a non-table `/URI` value into an URL and injects it
    local function inject_uri(owner, value)
      local target = maybe_dereference_object(value, pdf, task)

      if type(target) ~= 'string' then
        return
      end

      local url = rspamd_url.create(task:get_mempool(), target, {'content'})

      if url then
        lua_util.debugm(N, task, 'found url %s in object %s:%s',
            target, owner.major, owner.minor)
        task:inject_url(url, mpart)
      end
    end

    -- Depth-limited walk over nested tables of a dictionary
    local function recursive_object_traverse(obj, dict, rec)
      if rec > 10 then
        lua_util.debugm(N, task, 'object %s:%s recurses too much',
            obj.major, obj.minor)
        return
      end

      for key, value in pairs(dict) do
        if type(value) == 'table' then
          recursive_object_traverse(obj, value, rec + 1)
        elseif key == 'URI' then
          inject_uri(obj, value)
        end
      end
    end

    for _, candidate in ipairs(pdf.objects) do
      local dict = candidate.dict

      if dict and type(dict) == 'table' then
        recursive_object_traverse(candidate, dict, 0)
      end
    end
  end
  -- Entry point of the PDF processor.
  --
  -- Returns an empty table when processing is disabled, nothing when
  -- `pdf_trie` finds no match in `input`, and otherwise the output table
  -- (tag, timestamps, flags and whatever the processors have added).
  local function process_pdf(input, mpart, task)
    if not config.enabled then
      -- Skip processing
      return {}
    end

    local matches = pdf_trie:match(input)

    if not matches then
      return
    end

    local start_ts = rspamd_util.get_ticks()
    -- Temp object used to share data between pdf extraction methods
    local pdf_object = {
      tag = 'pdf',
      extract_text = extract_text_data,
      start_timestamp = start_ts,
      end_timestamp = start_ts + config.pdf_process_timeout,
    }
    -- Output object that excludes all internal stuff
    local pdf_output = lua_util.shallowcopy(pdf_object)

    -- Group matched offsets by the processor that is responsible for them
    local grouped_processors = {}

    for npat, matched_positions in pairs(matches) do
      local index = pdf_indexes[npat]
      local proc_key, loc_npat = index[1], index[4]
      local group = grouped_processors[proc_key]

      if not group then
        group = {
          processor_func = processors[proc_key],
          offsets = {},
        }
        grouped_processors[proc_key] = group
      end

      local offsets = group.offsets
      for _, pos in ipairs(matched_positions) do
        offsets[#offsets + 1] = {pos, loc_npat}
      end
    end

    for name, group in pairs(grouped_processors) do
      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
          name, #group.offsets)
      -- Processors expect their offsets in ascending order
      table.sort(group.offsets, function(e1, e2) return e1[1] < e2[1] end)
      group.processor_func(input, task, group.offsets, pdf_object, pdf_output)
    end

    -- Fills pdf_output.fuzzy_hashes either from the OpenAction script only or
    -- from all scripts, skipping the ones shorter than config.min_js_fuzzy
    local function collect_fuzzy_hashes()
      local hashes = {}
      pdf_output.fuzzy_hashes = hashes

      if config.openaction_fuzzy_only then
        -- OpenAction only
        local action = pdf_object.openaction

        if action and action.bin_hash then
          if config.min_js_fuzzy and #action.data >= config.min_js_fuzzy then
            lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
                action.hash)
            hashes[#hashes + 1] = action.bin_hash
          else
            lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
                action.hash, #action.data)
          end
        end

        return
      end

      -- All hashes
      for bin_hash, script in pairs(pdf_object.scripts) do
        if config.min_js_fuzzy and #script.data >= config.min_js_fuzzy then
          lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
              script.hash)
          hashes[#hashes + 1] = bin_hash
        else
          lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
              script.hash, #script.data)
        end
      end
    end

    pdf_output.flags = {}

    if not (pdf_object.start_objects and pdf_object.end_objects) then
      pdf_output.flags.no_objects = true
    else
      if #pdf_object.start_objects > config.max_pdf_objects then
        -- Only reported here; the extraction stages cap the counts themselves
        pdf_output.many_objects = #pdf_object.start_objects
      end

      -- Postprocess objects
      postprocess_pdf_objects(task, input, pdf_object)

      if config.text_extraction then
        search_text(task, pdf_object, pdf_output)
      end

      if config.url_extraction then
        search_urls(task, pdf_object, mpart, pdf_output)
      end

      if config.js_fuzzy and pdf_object.scripts then
        collect_fuzzy_hashes()
      end
    end

    -- Propagate from object to output
    if pdf_object.encrypted then
      pdf_output.encrypted = true
    end

    if pdf_object.scripts then
      pdf_output.scripts = true
    end

    return pdf_output
  end
  -- Processes the PDF trailer: scans the lines that follow the last trailer
  -- match for `/Encrypt ` and marks both the shared object and the output as
  -- encrypted. Sets `pdf_output.long_trailer` when the scan is skipped or cut
  -- short.
  processors.trailer = function(input, task, positions, pdf_object, pdf_output)
    local last_pos = positions[#positions]
    local trailer_offset = last_pos[1]

    lua_util.debugm(N, task, 'pdf: process trailer at position %s (%s total length)',
        last_pos, #input)

    -- NOTE(review): the guard compares the trailer *offset* with
    -- config.max_pdf_trailer, while the value stored below is the trailer
    -- *length* (#input - offset) - confirm which of the two is intended
    if trailer_offset > config.max_pdf_trailer then
      pdf_output.long_trailer = #input - trailer_offset
      return
    end

    local trailer = input:span(trailer_offset)
    local seen_lines = 0

    for line in trailer:lines(true) do
      if line:find('/Encrypt ') then
        lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
            line)
        pdf_output.encrypted = true
        pdf_object.encrypted = true
        break
      end

      seen_lines = seen_lines + 1

      if seen_lines > config.max_pdf_trailer_lines then
        lua_util.debugm(N, task, "pdf: trailer has too many lines, stop checking")
        pdf_output.long_trailer = #input - trailer_offset
        break
      end
    end
  end
  -- Computes `pdf_output.suspicious`, a score capped at 1.0, from the matched
  -- suspicious patterns. `match[2]` selects the pattern kind: 1 adds 0.5 per
  -- match, 2 is counted as an exec, anything else as an encoded character.
  processors.suspicious = function(input, task, positions, pdf_object, pdf_output)
    local score = 0.0
    local exec_count = 0
    local encoded_count = 0
    local adjacent_encoded = 0
    local prev_encoded_pos

    for _, match in ipairs(positions) do
      local offset, kind = match[1], match[2]

      if kind == 1 then
        -- netsh
        score = score + 0.5
      elseif kind == 2 then
        exec_count = exec_count + 1
      else
        encoded_count = encoded_count + 1

        -- Less than 8 bytes apart: likely consecutive encoded chars
        if prev_encoded_pos and offset - prev_encoded_pos < 8 then
          adjacent_encoded = adjacent_encoded + 1
        end

        prev_encoded_pos = offset
      end
    end

    if encoded_count > 10 then
      score = score + encoded_count / 10
    end

    if exec_count > 1 then
      score = score + exec_count / 2.0
    end

    if adjacent_encoded > 4 and encoded_count - adjacent_encoded < 5 then
      -- Too many close encoded comparing to the total number of encoded characters
      score = score + 0.5
    end

    -- The uncapped value is what gets logged
    lua_util.debugm(N, task, 'pdf: found a suspicious patterns: %s exec, %s encoded (%s close), ' ..
        '%s final factor',
        exec_count, encoded_count, adjacent_encoded, score)

    pdf_output.suspicious = math.min(score, 1.0)
  end
  -- Appends the offset (first element) of every entry in `positions` to the
  -- list stored at `pdf_object[output_key]`, creating the list when missing.
  local function generic_table_inserter(positions, pdf_object, output_key)
    local sink = pdf_object[output_key]

    if not sink then
      sink = {}
      pdf_object[output_key] = sink
    end

    -- New entries go right after the ones that are already stored
    local base = #sink

    for idx, entry in ipairs(positions) do
      sink[base + idx] = entry[1]
    end
  end
  -- The object/stream boundary processors only accumulate the matched offsets
  -- under the corresponding key of the shared pdf object.
  local function make_offsets_collector(output_key)
    return function(_, _, positions, pdf_object)
      generic_table_inserter(positions, pdf_object, output_key)
    end
  end

  processors.start_object = make_offsets_collector('start_objects')
  processors.end_object = make_offsets_collector('end_objects')
  processors.start_stream = make_offsets_collector('start_streams')
  processors.end_stream = make_offsets_collector('end_streams')
  -- Public entry point of the module: `process(input, mpart, task)`
  exports.process = process_pdf

  return exports