--[[
Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--

--[[[
-- @module lua_content/pdf
-- This module contains some heuristics for PDF files
--]]

local rspamd_trie = require "rspamd_trie"
local rspamd_util = require "rspamd_util"
local rspamd_text = require "rspamd_text"
local rspamd_url = require "rspamd_url"
local bit = require "bit"
local N = "lua_content"
local lua_util = require "lua_util"
local rspamd_regexp = require "rspamd_regexp"
local lpeg = require "lpeg"
local pdf_patterns = {
  trailer = {
    patterns = {
      [[\ntrailer\r?\n]]
    }
  },
  suspicious = {
    patterns = {
      [[netsh\s]],
      [[echo\s]],
      [=[\/[A-Za-z]*#\d\d[#A-Za-z<>/\s]]=], -- Hex encode obfuscation
    }
  },
  start_object = {
    patterns = {
      [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=]
    }
  },
  end_object = {
    patterns = {
      [=[endobj[\r\n]]=]
    }
  },
  start_stream = {
    patterns = {
      [=[>\s*stream[\r\n]]=],
    }
  },
  end_stream = {
    patterns = {
      [=[endstream[\r\n]]=]
    }
  }
}

local pdf_text_patterns = {
  start = {
    patterns = {
      [[\sBT\s]]
    }
  },
  stop = {
    patterns = {
      [[\sET\b]]
    }
  }
}

local pdf_cmap_patterns = {
  start = {
    patterns = {
      [[\d\s+beginbfchar\s]],
      [[\d\s+beginbfrange\s]]
    }
  },
  stop = {
    patterns = {
      [[\sendbfrange\b]],
      [[\sendbfchar\b]]
    }
  }
}
-- index[n] ->
--  t[1] - key in patterns table,
--  t[2] - value in patterns table (the patterns group),
--  t[3] - pattern string,
--  t[4] - local pattern index
local pdf_indexes = {}
local pdf_text_indexes = {}
local pdf_cmap_indexes = {}

local pdf_trie
local pdf_text_trie
local pdf_cmap_trie

local exports = {}

local config = {
  max_extraction_size = 512 * 1024,
  max_processing_size = 32 * 1024,
  text_extraction = false, -- NYI feature
  url_extraction = true,
  enabled = true,
  js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
  min_js_fuzzy = 256, -- Minimum size of js to be considered for fuzzy hashing
  openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
  max_pdf_objects = 10000, -- Maximum number of objects to be considered
  max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
  max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
  pdf_process_timeout = 1.0, -- Timeout in seconds for processing
}
-- Used to process patterns found in PDF
-- positions for functional processors should be an iter/table from trie matcher in form
---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
---- pat_idxn is pattern index and n1 ... nn are match positions
local processors = {}

-- PDF objects outer grammar in LPEG style (performing table captures)
local pdf_outer_grammar
local pdf_text_grammar

-- Used to match objects
local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=])
local function config_module()
  local opts = rspamd_config:get_all_opt('lua_content')

  if opts and opts.pdf then
    config = lua_util.override_defaults(config, opts.pdf)
  end
end
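-- Illustrative sketch (not part of the module): how config_module() merges
-- user options over the defaults above via lua_util.override_defaults();
-- the option values below are made up for the example.
--[[
local lua_util = require "lua_util"
local defaults = { max_pdf_objects = 10000, js_fuzzy = true }
local opts = { pdf = { max_pdf_objects = 2000 } }
local merged = lua_util.override_defaults(defaults, opts.pdf)
-- merged.max_pdf_objects == 2000, merged.js_fuzzy == true
--]]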
local function compile_tries()
  local default_compile_flags = bit.bor(rspamd_trie.flags.re,
      rspamd_trie.flags.dot_all,
      rspamd_trie.flags.no_start)
  local function compile_pats(patterns, indexes, compile_flags)
    local strs = {}
    for what, data in pairs(patterns) do
      for i, pat in ipairs(data.patterns) do
        strs[#strs + 1] = pat
        indexes[#indexes + 1] = { what, data, pat, i }
      end
    end

    return rspamd_trie.create(strs, compile_flags or default_compile_flags)
  end

  if not pdf_trie then
    pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
  end
  if not pdf_text_trie then
    pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
  end
  if not pdf_cmap_trie then
    pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
  end
end
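-- Illustrative note (not part of the module): after compile_tries(),
-- pdf_indexes[n] holds { key, patterns_group, pattern_string, local_index },
-- e.g. roughly { 'end_object', pdf_patterns.end_object, 'endobj[\r\n]', 1 },
-- and pdf_trie:match(input) yields a map of global pattern index -> list of
-- match positions, which process_pdf() below regroups per key.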
-- Returns a table with generic grammar elements for PDF
local function generic_grammar_elts()
  local P = lpeg.P
  local R = lpeg.R
  local S = lpeg.S
  local V = lpeg.V
  local C = lpeg.C
  local D = R '09' -- Digits

  local grammar_elts = {}

  -- Helper functions
  local function pdf_hexstring_unescape(s)
    if #s % 2 == 0 then
      -- Sane hex string
      return lua_util.unhex(s)
    end

    -- WTF hex string
    -- Append '0' to it and unescape...
    return lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0'))
  end
  local function pdf_string_unescape(s)
    local function ue_single(cc)
      if cc == '\\r' then
        return '\r'
      elseif cc == '\\n' then
        return '\n'
      else
        -- Return the character following the backslash
        return cc:sub(2, 2)
      end
    end
    -- simple unescape \char
    s = s:gsub('\\[^%d]', ue_single)
    -- unescape octal
    local function ue_octal(cc)
      -- Replace unknown stuff with '?'
      return string.char(tonumber(cc:sub(2), 8) or 63)
    end
    s = s:gsub('\\%d%d?%d?', ue_octal)

    return s
  end
  local function pdf_id_unescape(s)
    return (s:gsub('#%d%d', function(cc)
      return string.char(tonumber(cc:sub(2), 16))
    end))
  end

  local delim = S '()<>[]{}/%'
  grammar_elts.ws = S '\0 \r\n\t\f'
  local hex = R 'af' + R 'AF' + D

  -- Comments.
  local eol = P '\r\n' + '\n'
  local line = (1 - S '\r\n\f') ^ 0 * eol ^ -1
  grammar_elts.comment = P '%' * line

  -- Numbers.
  local sign = S '+-' ^ -1
  local decimal = D ^ 1
  local float = D ^ 1 * P '.' * D ^ 0 + P '.' * D ^ 1
  grammar_elts.number = C(sign * (float + decimal)) / tonumber

  -- String
  grammar_elts.str = P { "(" * C(((1 - S "()\\") + (P '\\' * 1) + V(1)) ^ 0) / pdf_string_unescape * ")" }
  grammar_elts.hexstr = P { "<" * C(hex ^ 0) / pdf_hexstring_unescape * ">" }

  -- Identifier
  grammar_elts.id = P { '/' * C((1 - (delim + grammar_elts.ws)) ^ 1) / pdf_id_unescape }

  -- Booleans (who cares about them?)
  grammar_elts.boolean = C(P("true") + P("false"))

  -- Stupid references
  grammar_elts.ref = lpeg.Ct { lpeg.Cc("%REF%") * C(D ^ 1) * " " * C(D ^ 1) * " " * "R" }

  return grammar_elts
end
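-- Illustrative note (not part of the module): what the local unescape helpers
-- above produce for typical PDF tokens:
--   pdf_hexstring_unescape('4869') -> 'Hi' (even-length hex string)
--   pdf_hexstring_unescape('486')  -> 'H' .. '\96' (odd length: last nibble padded with '0')
--   pdf_id_unescape('A#42C')       -> 'ABC' ('#42' is hex 0x42, i.e. 'B')
--   pdf_string_unescape('a\\nb')   -> 'a\nb' (the \n escape becomes a real newline)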
-- Generates a grammar to parse outer elements (external objects in PDF notation)
local function gen_outer_grammar()
  local V = lpeg.V
  local gen = generic_grammar_elts()

  return lpeg.P {
    "EXPR";
    EXPR = gen.ws ^ 0 * V("ELT") ^ 0 * gen.ws ^ 0,
    ELT = V("ARRAY") + V("DICT") + V("ATOM"),
    ATOM = gen.ws ^ 0 * (gen.comment + gen.boolean + gen.ref +
        gen.number + V("STRING") + gen.id) * gen.ws ^ 0,
    DICT = "<<" * gen.ws ^ 0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR") ^ 0, rawset) * gen.ws ^ 0 * ">>",
    KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ELT") * gen.ws ^ 0),
    ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ELT") ^ 0) * gen.ws ^ 0 * "]",
    STRING = lpeg.P { gen.str + gen.hexstr },
  }
end
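-- Illustrative note (not part of the module): the outer grammar turns a PDF
-- dictionary such as
--   << /Type /Catalog /OpenAction 5 0 R /Names [ (a) (b) ] >>
-- into roughly
--   { Type = 'Catalog', OpenAction = { '%REF%', '5', '0' }, Names = { 'a', 'b' } }
-- i.e. dicts become keyed tables, arrays become list tables and `x y R`
-- references become { '%REF%', major, minor } triples that
-- maybe_dereference_object() resolves later.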
-- Graphic state in PDF
local function gen_graphics_unary()
  local P = lpeg.P
  local S = lpeg.S

  return P("q") + P("Q") + P("h")
      + S("WSsFfBb") * P("*") ^ 0 + P("n")
end

local function gen_graphics_binary()
  local P = lpeg.P
  local S = lpeg.S

  return S("gGwJjMi") +
      P("M") + P("ri") + P("gs") +
      P("CS") + P("cs") + P("sh")
end

local function gen_graphics_ternary()
  local P = lpeg.P
  local S = lpeg.S

  return P("d") + P("m") + S("lm")
end

local function gen_graphics_nary()
  local P = lpeg.P
  local S = lpeg.S

  return P("SC") + P("sc") + P("SCN") + P("scn") + P("k") + P("K") + P("re") + S("cvy") +
      P("RG") + P("rg")
end

-- Generates a grammar to parse text blocks (between BT and ET)
local function gen_text_grammar()
  local V = lpeg.V
  local P = lpeg.P
  local C = lpeg.C
  local gen = generic_grammar_elts()

  local empty = ""
  local unary_ops = C("T*") / "\n" +
      C(gen_graphics_unary()) / empty
  local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
      gen_graphics_binary()
  local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
  local nary_op = P("Tm") + gen_graphics_nary()
  local text_binary_op = P("Tj") + P("TJ") + P("'")
  local text_quote_op = P('"')
  local font_op = P("Tf")

  return lpeg.P {
    "EXPR";
    EXPR = gen.ws ^ 0 * lpeg.Ct(V("COMMAND") ^ 0),
    COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
        V("FONT") + gen.comment) * gen.ws ^ 0,
    UNARY = unary_ops,
    BINARY = V("ARG") / empty * gen.ws ^ 1 * binary_ops,
    TERNARY = V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * ternary_ops,
    NARY = (gen.number / 0 * gen.ws ^ 1) ^ 1 * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op,
    ARG = V("ARRAY") + V("DICT") + V("ATOM"),
    ATOM = (gen.comment + gen.boolean + gen.ref +
        gen.number + V("STRING") + gen.id),
    DICT = "<<" * gen.ws ^ 0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR") ^ 0, rawset) * gen.ws ^ 0 * ">>",
    KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ARG") * gen.ws ^ 0),
    ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ARG") ^ 0) * gen.ws ^ 0 * "]",
    STRING = lpeg.P { gen.str + gen.hexstr },
    TEXT = (V("TEXT_ARG") * gen.ws ^ 1 * text_binary_op) +
        (V("ARG") / 0 * gen.ws ^ 1 * V("ARG") / 0 * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 1 * text_quote_op),
    FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / 0) * gen.ws ^ 1 * font_op),
    FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
    TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
    TEXT_ARRAY = "[" *
        lpeg.Ct(((gen.ws ^ 0 * (gen.ws ^ 0 * (gen.number / 0) ^ 0 * gen.ws ^ 0 * (gen.str + gen.hexstr))) ^ 1)) * gen.ws ^ 0 * "]",
  }
end
-- Call immediately on require
compile_tries()
config_module()
pdf_outer_grammar = gen_outer_grammar()
pdf_text_grammar = gen_text_grammar()

local function extract_text_data(specific)
  return nil -- NYI
end

-- Generates index for major/minor pair
local function obj_ref(major, minor)
  return major * 10.0 + 1.0 / (minor + 1.0)
end
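-- Illustrative note (not part of the module): obj_ref(2, 0) == 21.0 and
-- obj_ref(2, 1) == 20.5, so each major/generation pair gets its own numeric
-- key in the pdf.ref table used below.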
-- Return indirect object reference (if needed)
local function maybe_dereference_object(elt, pdf, task)
  if type(elt) == 'table' and elt[1] == '%REF%' then
    local ref = obj_ref(elt[2], elt[3])

    if pdf.ref[ref] then
      -- No recursion!
      return pdf.ref[ref]
    else
      lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s, no object',
          elt[2], elt[3], obj_ref(elt[2], elt[3]))
      return nil
    end
  end

  return elt
end

-- Apply PDF stream filter
local function apply_pdf_filter(input, filt)
  if filt == 'FlateDecode' then
    return rspamd_util.inflate(input, config.max_extraction_size)
  end

  return nil
end
-- Conditionally apply a pipeline of stream filters and return uncompressed data
local function maybe_apply_filter(dict, data, pdf, task)
  local uncompressed = data

  if dict.Filter then
    local filt = dict.Filter
    if type(filt) == 'string' then
      filt = { filt }
    end

    if dict.DecodeParms then
      local decode_params = maybe_dereference_object(dict.DecodeParms, pdf, task)

      if type(decode_params) == 'table' then
        if decode_params.Predictor then
          return nil, 'predictor exists'
        end
      end
    end

    for _, f in ipairs(filt) do
      uncompressed = apply_pdf_filter(uncompressed, f)

      if not uncompressed then
        break
      end
    end
  end

  return uncompressed, nil
end
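-- Illustrative note (not part of the module): a stream dictionary such as
--   << /Length 120 /Filter /FlateDecode >>
-- reaches maybe_apply_filter() as dict = { Length = 120, Filter = 'FlateDecode' };
-- the single string is wrapped into { 'FlateDecode' } and passed through
-- apply_pdf_filter(), which currently understands only FlateDecode (zlib
-- inflate capped at config.max_extraction_size); streams that declare a
-- /DecodeParms predictor are skipped with the 'predictor exists' error.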
-- Conditionally extract stream data from object and attach it as obj.uncompressed
local function maybe_extract_object_stream(obj, pdf, task)
  if pdf.encrypted then
    -- TODO add decryption some day
    return nil
  end
  local dict = obj.dict
  if dict.Length and type(obj.stream) == 'table' then
    local len = math.min(obj.stream.len,
        tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
    if len > 0 then
      local real_stream = obj.stream.data:span(1, len)
      local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task)

      if uncompressed then
        obj.uncompressed = uncompressed
        lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
            obj.major, obj.minor, len, uncompressed:len())
        return obj.uncompressed
      else
        lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s',
            obj.major, obj.minor, len, dict.Filter, filter_err)
      end
    else
      lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s',
          obj.major, obj.minor, len)
    end
  end
end
local function parse_object_grammar(obj, task, pdf)
  -- Parse grammar
  local obj_dict_span
  if obj.stream then
    obj_dict_span = obj.data:span(1, obj.stream.start - obj.start)
  else
    obj_dict_span = obj.data
  end

  if obj_dict_span:len() < config.max_processing_size then
    local ret, obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)

    if ret then
      if obj.stream then
        if type(obj_or_err) == 'table' then
          obj.dict = obj_or_err
        else
          obj.dict = {}
        end

        lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
            obj.major, obj.minor, obj_or_err)
      else
        -- Direct object
        if type(obj_or_err) == 'table' then
          obj.dict = obj_or_err
          obj.uncompressed = obj_or_err
          lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
              obj.major, obj.minor, obj_or_err)
          pdf.ref[obj_ref(obj.major, obj.minor)] = obj
        else
          lua_util.debugm(N, task, 'direct object %s:%s is parsed to raw data: %s',
              obj.major, obj.minor, obj_or_err)
          pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
          obj.dict = {}
          obj.uncompressed = obj_or_err
        end
      end
    else
      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
          obj.major, obj.minor, obj_or_err)
    end
  else
    lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
        obj.major, obj.minor, obj_dict_span:len())
  end
end
-- Extracts font data and processes /ToUnicode mappings
-- NYI in fact as cmap is ridiculously stupid and complicated
--[[
local function process_font(task, pdf, font, fname)
  local dict = font
  if font.dict then
    dict = font.dict
  end

  if type(dict) == 'table' and dict.ToUnicode then
    local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)

    if cmap and cmap.dict then
      maybe_extract_object_stream(cmap, pdf, task)
      lua_util.debugm(N, task, 'found cmap for font %s: %s',
          fname, cmap.uncompressed)
    end
  end
end
--]]
-- Forward declaration
local process_dict

-- This function processes a javascript string and returns the JS hash and the JS rspamd_text
local function process_javascript(task, pdf, js, obj)
  local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"
  if type(js) == 'string' then
    js = rspamd_text.fromstring(js):oneline()
  elseif type(js) == 'userdata' then
    js = js:oneline()
  else
    return nil
  end

  local hash = rspamd_cryptobox_hash.create(js)
  local bin_hash = hash:bin()

  if not pdf.scripts then
    pdf.scripts = {}
  end

  if pdf.scripts[bin_hash] then
    -- Duplicate
    return pdf.scripts[bin_hash]
  end

  local njs = {
    data = js,
    hash = hash:hex(),
    bin_hash = bin_hash,
    object = obj,
  }
  pdf.scripts[bin_hash] = njs
  return njs
end
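-- Illustrative note (not part of the module): each distinct script is stored
-- in pdf.scripts keyed by its binary hash, roughly
--   pdf.scripts[bin_hash] = {
--     data = <one-line rspamd_text of the JS>,
--     hash = '<hex digest>',
--     bin_hash = '<binary digest>',
--     object = <the object the script was found in>,
--   }
-- so the same script referenced from several objects is only recorded once.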
-- Extract interesting stuff from /Action, e.g. javascript
local function process_action(task, pdf, obj)
  if not (obj.js or obj.launch) and (obj.dict and obj.dict.JS) then
    local js = maybe_dereference_object(obj.dict.JS, pdf, task)

    if js then
      if type(js) == 'table' then
        local extracted_js = maybe_extract_object_stream(js, pdf, task)

        if not extracted_js then
          lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
              obj.major, obj.minor, js)
        else
          js = extracted_js
        end
      end

      js = process_javascript(task, pdf, js, obj)
      if js then
        obj.js = js
        lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
            obj.major, obj.minor, obj.js.data)
      else
        lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
            obj.major, obj.minor, js)
      end
    elseif obj.dict.F then
      local launch = maybe_dereference_object(obj.dict.F, pdf, task)

      if launch then
        if type(launch) == 'string' then
          obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
          lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
              obj.major, obj.minor, obj.launch)
        elseif type(launch) == 'userdata' then
          obj.launch = launch:exclude_chars('%n%c')
          lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
              obj.major, obj.minor, obj.launch)
        else
          lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
              obj.major, obj.minor, launch)
        end
      end
    else
      lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
          obj.major, obj.minor)
    end
  end
end
-- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
local function process_catalog(task, pdf, obj)
  if obj.dict then
    if obj.dict.OpenAction then
      local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)

      if action and type(action) == 'table' then
        -- This also processes action js (if not already processed)
        process_dict(task, pdf, action, action.dict)
        if action.js then
          lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
              obj.major, obj.minor, action.js)
          pdf.openaction = action.js
          action.js.object = obj
        elseif action.launch then
          lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
              obj.major, obj.minor, action.launch)
          pdf.launch = action.launch
        else
          lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
              obj.major, obj.minor, action)
        end
      else
        lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
            obj.major, obj.minor, obj.dict.OpenAction, action)
      end
    else
      lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
          obj.major, obj.minor)
    end
  end
end
local function process_xref(task, pdf, obj)
  if obj.dict then
    if obj.dict.Encrypt then
      local encrypt = maybe_dereference_object(obj.dict.Encrypt, pdf, task)
      lua_util.debugm(N, task, 'found encrypt: %s in xref object %s:%s',
          encrypt, obj.major, obj.minor)
      pdf.encrypted = true
    end
  end
end
process_dict = function(task, pdf, obj, dict)
  if not obj.type and type(dict) == 'table' then
    if dict.Type and type(dict.Type) == 'string' then
      -- Common stuff
      obj.type = dict.Type
    end

    if not obj.type then
      if obj.dict.S and obj.dict.JS then
        obj.type = 'Javascript'
        lua_util.debugm(N, task, 'implicit type for JavaScript object %s:%s',
            obj.major, obj.minor)
      else
        lua_util.debugm(N, task, 'no type for %s:%s',
            obj.major, obj.minor)
        return
      end
    end

    lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
        obj.major, obj.minor, obj.type)
    local contents = dict.Contents
    if contents and type(contents) == 'table' then
      if contents[1] == '%REF%' then
        -- Single reference
        contents = { contents }
      end
      obj.contents = {}

      for _, c in ipairs(contents) do
        local cobj = maybe_dereference_object(c, pdf, task)
        if cobj and type(cobj) == 'table' then
          obj.contents[#obj.contents + 1] = cobj
          cobj.parent = obj
          cobj.type = 'content'
        end
      end

      lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
          obj.major, obj.minor, #obj.contents)
    end

    local resources = dict.Resources
    if resources and type(resources) == 'table' then
      local res_ref = maybe_dereference_object(resources, pdf, task)

      if type(res_ref) ~= 'table' then
        lua_util.debugm(N, task, 'cannot parse resources from pdf: %s',
            resources)
        obj.resources = {}
      elseif res_ref.dict then
        obj.resources = res_ref.dict
      else
        obj.resources = {}
      end
    else
      -- Fucking pdf: we need to inherit from parent
      resources = {}
      if dict.Parent then
        local parent = maybe_dereference_object(dict.Parent, pdf, task)

        if parent and type(parent) == 'table' and parent.dict then
          if parent.resources then
            lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
                parent.major, parent.minor, obj.major, obj.minor)
            resources = parent.resources
          end
        end
      end

      obj.resources = resources
    end

    --[[Disabled fonts extraction
    local fonts = obj.resources.Font
    if fonts and type(fonts) == 'table' then
      obj.fonts = {}
      for k,v in pairs(fonts) do
        obj.fonts[k] = maybe_dereference_object(v, pdf, task)
        if obj.fonts[k] then
          local font = obj.fonts[k]
          if config.text_extraction then
            process_font(task, pdf, font, k)
            lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
                k, obj.major, obj.minor, font)
          end
        end
      end
    end
    ]]

    lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
        obj.major, obj.minor, obj.type, obj.resources)

    if obj.type == 'Action' then
      process_action(task, pdf, obj)
    elseif obj.type == 'Catalog' then
      process_catalog(task, pdf, obj)
    elseif obj.type == 'XRef' then
      -- XRef stream instead of trailer from PDF 1.5 (thanks Adobe)
      process_xref(task, pdf, obj)
    elseif obj.type == 'Javascript' then
      local js = maybe_dereference_object(obj.dict.JS, pdf, task)

      if js then
        if type(js) == 'table' then
          local extracted_js = maybe_extract_object_stream(js, pdf, task)

          if not extracted_js then
            lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
                obj.major, obj.minor, js)
          else
            js = extracted_js
          end
        end

        js = process_javascript(task, pdf, js, obj)
        if js then
          obj.js = js
          lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
              obj.major, obj.minor, obj.js.data)
        else
          lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
              obj.major, obj.minor, js)
        end
      end
    end
  end -- Already processed dict (obj.type is not empty)
end
-- This function is intended to unpack objects from ObjStm crappy structure
local compound_obj_grammar
local function compound_obj_grammar_gen()
  if not compound_obj_grammar then
    local gen = generic_grammar_elts()
    compound_obj_grammar = gen.ws ^ 0 * (gen.comment * gen.ws ^ 1) ^ 0 *
        lpeg.Ct(lpeg.Ct(gen.number * gen.ws ^ 1 * gen.number * gen.ws ^ 0) ^ 1)
  end

  return compound_obj_grammar
end

local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
  -- First, we need to parse the leading part of the stream, which consists of
  -- pairs of numbers (object number and relative offset)
  compound_obj_grammar_gen()
  local elts = compound_obj_grammar:match(uncompressed)
  if elts and #elts > 0 then
    lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
        #uncompressed, elts)

    for i, pair in ipairs(elts) do
      local obj_number, offset = pair[1], pair[2]

      offset = offset + first
      if offset < #uncompressed then
        local span_len
        if i == #elts then
          span_len = #uncompressed - offset
        else
          span_len = (elts[i + 1][2] + first) - offset
        end

        if span_len > 0 and offset + span_len <= #uncompressed then
          local obj = {
            major = obj_number,
            minor = 0, -- Implicit
            data = uncompressed:span(offset + 1, span_len),
            ref = obj_ref(obj_number, 0)
          }
          parse_object_grammar(obj, task, pdf)

          if obj.dict then
            pdf.objects[#pdf.objects + 1] = obj
          end
        else
          lua_util.debugm(N, task, 'invalid span_len for compound object %s:%s; offset = %s, len = %s',
              pair[1], pair[2], offset + span_len, #uncompressed)
        end
      end
    end
  end
end
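-- Illustrative note (not part of the module): an ObjStm stream begins with
-- "object-number offset" pairs, e.g. an uncompressed stream starting with
--   11 0 12 58 13 121
-- (with /First pointing just past this header) describes object 11 at
-- relative offset 0, object 12 at offset 58 and object 13 at offset 121;
-- each such span is then fed to parse_object_grammar() as a minor-0 object.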
-- PDF 1.5 ObjStm
local function extract_pdf_compound_objects(task, pdf)
  for i, obj in ipairs(pdf.objects or {}) do
    if i > 0 and i % 100 == 0 then
      local now = rspamd_util.get_ticks()
      if now >= pdf.end_timestamp then
        pdf.timeout_processing = now - pdf.start_timestamp

        lua_util.debugm(N, task, 'pdf: timeout processing compound objects after spending %s seconds, ' ..
            '%s elements processed',
            pdf.timeout_processing, i)
        break
      end
    end
    if obj.stream and obj.dict and type(obj.dict) == 'table' then
      local t = obj.dict.Type
      if t and t == 'ObjStm' then
        -- We are in trouble, sir...
        local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
        local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))

        if nobjs and first then
          --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
          lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
              nobjs, first, obj.dict.Extends)

          local uncompressed = maybe_extract_object_stream(obj, pdf, task)

          if uncompressed then
            pdf_compound_object_unpack(obj, uncompressed, pdf, task, first)
          end
        else
          lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
              obj.major, obj.minor, obj.dict)
        end
      end
    end
  end
end
-- This function arranges starts and ends of all objects and processes them into an
-- initial set of objects
local function extract_outer_objects(task, input, pdf)
  local start_pos, end_pos = 1, 1
  local max_start_pos, max_end_pos
  local obj_count = 0

  max_start_pos = math.min(config.max_pdf_objects, #pdf.start_objects)
  max_end_pos = math.min(config.max_pdf_objects, #pdf.end_objects)
  lua_util.debugm(N, task, "pdf: extract objects from %s start positions and %s end positions",
      max_start_pos, max_end_pos)

  while start_pos <= max_start_pos and end_pos <= max_end_pos do
    local first = pdf.start_objects[start_pos]
    local last = pdf.end_objects[end_pos]

    -- 7 is the length of `endobj\n`
    if first + 6 < last then
      local len = last - first - 6

      -- Also get the starting span and try to match it versus obj re to get numbers
      local obj_line_potential = first - 32
      if obj_line_potential < 1 then
        obj_line_potential = 1
      end
      local prev_obj_end = pdf.end_objects[end_pos - 1]
      if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
        obj_line_potential = prev_obj_end + 1
      end

      local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1)
      local matches = object_re:search(obj_line_span, true, true)

      if matches and matches[1] then
        local nobj = {
          start = first,
          len = len,
          data = input:span(first, len),
          major = tonumber(matches[1][2]),
          minor = tonumber(matches[1][3]),
        }
        pdf.objects[obj_count + 1] = nobj
        if nobj.major and nobj.minor then
          -- Add reference
          local ref = obj_ref(nobj.major, nobj.minor)
          nobj.ref = ref -- Our internal reference
          pdf.ref[ref] = nobj
        end
      end

      obj_count = obj_count + 1
      start_pos = start_pos + 1
      end_pos = end_pos + 1
    elseif first > last then
      end_pos = end_pos + 1
    else
      start_pos = start_pos + 1
      end_pos = end_pos + 1
    end
  end
end
-- This function attaches streams to objects and processes outer pdf grammar
local function attach_pdf_streams(task, input, pdf)
  if pdf.start_streams and pdf.end_streams then
    local start_pos, end_pos = 1, 1
    local max_start_pos, max_end_pos

    max_start_pos = math.min(config.max_pdf_objects, #pdf.start_streams)
    max_end_pos = math.min(config.max_pdf_objects, #pdf.end_streams)

    for _, obj in ipairs(pdf.objects) do
      while start_pos <= max_start_pos and end_pos <= max_end_pos do
        local first = pdf.start_streams[start_pos]
        local last = pdf.end_streams[end_pos]
        last = last - 10 -- Exclude endstream\n pattern
        lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s",
            first, last, obj.start, obj.start + obj.len)
        if first > obj.start and last < obj.start + obj.len and last > first then
          -- In case we have a fake endstream :(
          while pdf.end_streams[end_pos + 1] and pdf.end_streams[end_pos + 1] < obj.start + obj.len do
            end_pos = end_pos + 1
            last = pdf.end_streams[end_pos]
          end
          -- Strip the leading newline(s)
          while first < last do
            local chr = input:byte(first)
            if chr ~= 13 and chr ~= 10 then
              break
            end
            first = first + 1
          end
          local len = last - first
          obj.stream = {
            start = first,
            len = len,
            data = input:span(first, len)
          }
          start_pos = start_pos + 1
          end_pos = end_pos + 1
          break
        elseif first < obj.start then
          start_pos = start_pos + 1
        elseif last > obj.start + obj.len then
          -- Not this object
          break
        else
          start_pos = start_pos + 1
          end_pos = end_pos + 1
        end
      end
      if obj.stream then
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length',
            obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len)
      else
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
            obj.major, obj.minor, obj.start, obj.len)
      end
    end
  end
end
-- Processes PDF objects: extracts streams and object numbers, runs the outer grammar
-- and augments object types
local function postprocess_pdf_objects(task, input, pdf)
  pdf.objects = {} -- objects table
  pdf.ref = {} -- references table
  extract_outer_objects(task, input, pdf)

  -- Now we have objects and we need to attach streams that are in bounds
  attach_pdf_streams(task, input, pdf)
  -- Parse grammar for outer objects
  for i, obj in ipairs(pdf.objects) do
    if i > 0 and i % 100 == 0 then
      local now = rspamd_util.get_ticks()
      if now >= pdf.end_timestamp then
        pdf.timeout_processing = now - pdf.start_timestamp

        lua_util.debugm(N, task, 'pdf: timeout processing grammars after spending %s seconds, ' ..
            '%s elements processed',
            pdf.timeout_processing, i)
        break
      end
    end
    if obj.ref then
      parse_object_grammar(obj, task, pdf)

      -- Special early handling
      if obj.dict and obj.dict.Type and obj.dict.Type == 'XRef' then
        process_xref(task, pdf, obj)
      end
    end
  end

  if not pdf.timeout_processing then
    extract_pdf_compound_objects(task, pdf)
  else
    -- ENOTIME
    return
  end

  -- By now we should have all objects processed
  for i, obj in ipairs(pdf.objects) do
    if obj.dict then
      -- Types processing
      if i > 0 and i % 100 == 0 then
        local now = rspamd_util.get_ticks()
        if now >= pdf.end_timestamp then
          pdf.timeout_processing = now - pdf.start_timestamp

          lua_util.debugm(N, task, 'pdf: timeout processing dicts after spending %s seconds, ' ..
              '%s elements processed',
              pdf.timeout_processing, i)
          break
        end
      end
      process_dict(task, pdf, obj, obj.dict)
    end
  end
end
local function offsets_to_blocks(starts, ends, out)
  local start_pos, end_pos = 1, 1

  while start_pos <= #starts and end_pos <= #ends do
    local first = starts[start_pos]
    local last = ends[end_pos]

    if first < last then
      local len = last - first
      out[#out + 1] = {
        start = first,
        len = len,
      }
      start_pos = start_pos + 1
      end_pos = end_pos + 1
    elseif first > last then
      end_pos = end_pos + 1
    else
      -- Not ordered properly!
      break
    end
  end
end
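-- Illustrative note (not part of the module): offsets_to_blocks({ 10, 100 },
-- { 50, 160 }, out) appends { start = 10, len = 40 } and { start = 100, len = 60 }
-- to `out`; misordered offsets terminate the pairing early.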
local function search_text(task, pdf)
  for _, obj in ipairs(pdf.objects) do
    if obj.type == 'Page' and obj.contents then
      local text = {}

      for _, tobj in ipairs(obj.contents) do
        maybe_extract_object_stream(tobj, pdf, task)
        local matches = pdf_text_trie:match(tobj.uncompressed or '')
        if matches then
          local text_blocks = {}
          local starts = {}
          local ends = {}

          for npat, matched_positions in pairs(matches) do
            if npat == 1 then
              for _, pos in ipairs(matched_positions) do
                starts[#starts + 1] = pos
              end
            else
              for _, pos in ipairs(matched_positions) do
                ends[#ends + 1] = pos
              end
            end
          end

          offsets_to_blocks(starts, ends, text_blocks)
          for _, bl in ipairs(text_blocks) do
            if bl.len > 2 then
              -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
              bl.len = bl.len - 2
            end
            bl.data = tobj.uncompressed:span(bl.start, bl.len)
            --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
            --    tobj.major, tobj.minor, bl.data)

            if bl.len < config.max_processing_size then
              local ret, obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
                  bl.data)

              if ret then
                text[#text + 1] = obj_or_err
                lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
                    obj_or_err, tobj.major, tobj.minor, obj.major, obj.minor)
              else
                lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
                    obj.major, obj.minor, obj_or_err)
              end
            end
          end
        end
      end

      -- Join all text data together
      if #text > 0 then
        obj.text = rspamd_text.fromtable(text)
        lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
            obj.major, obj.minor, obj.text)
      end
    end
  end
end
-- This function searches objects for the `/URI` key and parses its content
local function search_urls(task, pdf, mpart)
  local function recursive_object_traverse(obj, dict, rec)
    if rec > 10 then
      lua_util.debugm(N, task, 'object %s:%s recurses too much',
          obj.major, obj.minor)
      return
    end

    for k, v in pairs(dict) do
      if type(v) == 'table' then
        recursive_object_traverse(obj, v, rec + 1)
      elseif k == 'URI' then
        v = maybe_dereference_object(v, pdf, task)
        if type(v) == 'string' then
          local url = rspamd_url.create(task:get_mempool(), v, { 'content' })

          if url then
            lua_util.debugm(N, task, 'found url %s in object %s:%s',
                v, obj.major, obj.minor)
            task:inject_url(url, mpart)
          end
        end
      end
    end
  end

  for _, obj in ipairs(pdf.objects) do
    if obj.dict and type(obj.dict) == 'table' then
      recursive_object_traverse(obj, obj.dict, 0)
    end
  end
end
local function process_pdf(input, mpart, task)
  if not config.enabled then
    -- Skip processing
    return {}
  end

  local matches = pdf_trie:match(input)

  if matches then
    local start_ts = rspamd_util.get_ticks()
    -- Temp object used to share data between pdf extraction methods
    local pdf_object = {
      tag = 'pdf',
      extract_text = extract_text_data,
      start_timestamp = start_ts,
      end_timestamp = start_ts + config.pdf_process_timeout,
    }
    -- Output object that excludes all internal stuff
    local pdf_output = lua_util.shallowcopy(pdf_object)
    local grouped_processors = {}
    for npat, matched_positions in pairs(matches) do
      local index = pdf_indexes[npat]

      local proc_key, loc_npat = index[1], index[4]

      if not grouped_processors[proc_key] then
        grouped_processors[proc_key] = {
          processor_func = processors[proc_key],
          offsets = {},
        }
      end
      local proc = grouped_processors[proc_key]
      -- Fill offsets
      for _, pos in ipairs(matched_positions) do
        proc.offsets[#proc.offsets + 1] = { pos, loc_npat }
      end
    end

    for name, processor in pairs(grouped_processors) do
      -- Sort by offset
      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
          name, #processor.offsets)
      table.sort(processor.offsets, function(e1, e2)
        return e1[1] < e2[1]
      end)
      processor.processor_func(input, task, processor.offsets, pdf_object, pdf_output)
    end

    pdf_output.flags = {}

    if pdf_object.start_objects and pdf_object.end_objects then
      if #pdf_object.start_objects > config.max_pdf_objects then
        pdf_output.many_objects = #pdf_object.start_objects
        -- Trim
      end

      -- Postprocess objects
      postprocess_pdf_objects(task, input, pdf_object)
      if config.text_extraction then
        search_text(task, pdf_object, pdf_output)
      end
      if config.url_extraction then
        search_urls(task, pdf_object, mpart, pdf_output)
      end

      if config.js_fuzzy and pdf_object.scripts then
        pdf_output.fuzzy_hashes = {}
        if config.openaction_fuzzy_only then
          -- OpenAction only
          if pdf_object.openaction and pdf_object.openaction.bin_hash then
            if config.min_js_fuzzy and #pdf_object.openaction.data >= config.min_js_fuzzy then
              lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s; size = %s; object: %s:%s",
                  pdf_object.openaction.hash,
                  #pdf_object.openaction.data,
                  pdf_object.openaction.object.major, pdf_object.openaction.object.minor)
              table.insert(pdf_output.fuzzy_hashes, pdf_object.openaction.bin_hash)
            else
              lua_util.debugm(N, task, "pdf: skip fuzzy hash from JavaScript: %s, too short: %s",
                  pdf_object.openaction.hash, #pdf_object.openaction.data)
            end
          end
        else
          -- All hashes
          for h, sc in pairs(pdf_object.scripts) do
            if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then
              lua_util.debugm(N, task, "pdf: add fuzzy hash from JavaScript: %s; size = %s; object: %s:%s",
                  sc.hash,
                  #sc.data,
                  sc.object.major, sc.object.minor)
              table.insert(pdf_output.fuzzy_hashes, h)
            else
              lua_util.debugm(N, task, "pdf: skip fuzzy hash from JavaScript: %s, too short: %s",
                  sc.hash, #sc.data)
            end
          end
        end
      end
    else
      pdf_output.flags.no_objects = true
    end

    -- Propagate from object to output
    if pdf_object.encrypted then
      pdf_output.encrypted = true
    end
    if pdf_object.scripts then
      pdf_output.scripts = true
    end

    return pdf_output
  end
end
-- Processes the PDF trailer
processors.trailer = function(input, task, positions, pdf_object, pdf_output)
  local last_pos = positions[#positions]

  lua_util.debugm(N, task, 'pdf: process trailer at position %s (%s total length)',
      last_pos, #input)

  if last_pos[1] > config.max_pdf_trailer then
    pdf_output.long_trailer = #input - last_pos[1]
    return
  end

  local last_span = input:span(last_pos[1])
  local lines_checked = 0
  for line in last_span:lines(true) do
    if line:find('/Encrypt ') then
      lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
          line)
      pdf_output.encrypted = true
      pdf_object.encrypted = true
      break
    end
    lines_checked = lines_checked + 1

    if lines_checked > config.max_pdf_trailer_lines then
      lua_util.debugm(N, task, "pdf: trailer has too many lines, stop checking")
      pdf_output.long_trailer = #input - last_pos[1]
      break
    end
  end
end
processors.suspicious = function(input, task, positions, pdf_object, pdf_output)
  local suspicious_factor = 0.0
  local nexec = 0
  local nencoded = 0
  local close_encoded = 0
  local last_encoded
  for _, match in ipairs(positions) do
    if match[2] == 1 then
      -- netsh
      suspicious_factor = suspicious_factor + 0.5
    elseif match[2] == 2 then
      nexec = nexec + 1
    elseif match[2] == 3 then
      local enc_data = input:sub(match[1] - 2, match[1] - 1)
      local legal_escape = false

      if enc_data then
        enc_data = enc_data:strtoul()

        if enc_data then
          -- Legit encode cases are non-printable characters (e.g. spaces)
          if enc_data < 0x21 or enc_data >= 0x7f then
            legal_escape = true
          end
        end
      end

      if not legal_escape then
        nencoded = nencoded + 1

        if last_encoded then
          if match[1] - last_encoded < 8 then
            -- Likely consecutive encoded chars, increase factor
            close_encoded = close_encoded + 1
          end
        end
        last_encoded = match[1]
      end
    end
  end

  if nencoded > 10 then
    suspicious_factor = suspicious_factor + nencoded / 10
  end
  if nexec > 1 then
    suspicious_factor = suspicious_factor + nexec / 2.0
  end
  if close_encoded > 4 and nencoded - close_encoded < 5 then
    -- Too many close encoded characters compared to the total number of encoded characters
    suspicious_factor = suspicious_factor + 0.5
  end

  lua_util.debugm(N, task, 'pdf: found suspicious patterns: %s exec, %s encoded (%s close), ' ..
      '%s final factor',
      nexec, nencoded, close_encoded, suspicious_factor)

  if suspicious_factor > 1.0 then
    suspicious_factor = 1.0
  end

  pdf_output.suspicious = suspicious_factor
end
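-- Illustrative note (not part of the module): with the weights above, e.g.
-- 2 `echo` matches and 15 suspicious hex escapes give a factor of
-- 15 / 10 + 2 / 2.0 = 2.5, which is then clamped to the final
-- pdf_output.suspicious value of 1.0.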
local function generic_table_inserter(positions, pdf_object, output_key)
  if not pdf_object[output_key] then
    pdf_object[output_key] = {}
  end
  local shift = #pdf_object[output_key]
  for i, pos in ipairs(positions) do
    pdf_object[output_key][i + shift] = pos[1]
  end
end

processors.start_object = function(_, task, positions, pdf_object)
  generic_table_inserter(positions, pdf_object, 'start_objects')
end

processors.end_object = function(_, task, positions, pdf_object)
  generic_table_inserter(positions, pdf_object, 'end_objects')
end

processors.start_stream = function(_, task, positions, pdf_object)
  generic_table_inserter(positions, pdf_object, 'start_streams')
end

processors.end_stream = function(_, task, positions, pdf_object)
  generic_table_inserter(positions, pdf_object, 'end_streams')
end

exports.process = process_pdf

return exports
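-- Illustrative usage sketch (not part of the module), assuming the module is
-- loadable via the @module path documented above; `part_content` and
-- `mime_part` are hypothetical placeholders for the raw attachment content
-- and its mime part obtained elsewhere from the task:
--[[
local pdf = require "lua_content/pdf"
local res = pdf.process(part_content, mime_part, task)
if res then
  -- inspect res.suspicious, res.encrypted, res.scripts, res.fuzzy_hashes, res.flags ...
end
--]]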