Du kannst nicht mehr als 25 Themen auswählen. Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325
  1. --[[
  2. Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_content/pdf
  15. -- This module contains some heuristics for PDF files
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local rspamd_text = require "rspamd_text"
  20. local rspamd_url = require "rspamd_url"
  21. local bit = require "bit"
  22. local N = "lua_content"
  23. local lua_util = require "lua_util"
  24. local rspamd_regexp = require "rspamd_regexp"
  25. local lpeg = require "lpeg"
  26. local pdf_patterns = {
  27. trailer = {
  28. patterns = {
  29. [[\ntrailer\r?\n]]
  30. }
  31. },
  32. suspicious = {
  33. patterns = {
  34. [[netsh\s]],
  35. [[echo\s]],
  36. [[\/[A-Za-z]*#\d\d(?:[#A-Za-z<>/\s])]], -- Hex encode obfuscation
  37. }
  38. },
  39. start_object = {
  40. patterns = {
  41. [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=]
  42. }
  43. },
  44. end_object = {
  45. patterns = {
  46. [=[endobj[\r\n]]=]
  47. }
  48. },
  49. start_stream = {
  50. patterns = {
  51. [=[>\s*stream[\r\n]]=],
  52. }
  53. },
  54. end_stream = {
  55. patterns = {
  56. [=[endstream[\r\n]]=]
  57. }
  58. }
  59. }
  60. local pdf_text_patterns = {
  61. start = {
  62. patterns = {
  63. [[\sBT\s]]
  64. }
  65. },
  66. stop = {
  67. patterns = {
  68. [[\sET\b]]
  69. }
  70. }
  71. }
  72. local pdf_cmap_patterns = {
  73. start = {
  74. patterns = {
  75. [[\d\s+beginbfchar\s]],
  76. [[\d\s+beginbfrange\s]]
  77. }
  78. },
  79. stop = {
  80. patterns = {
  81. [[\sendbfrange\b]],
  82. [[\sendbchar\b]]
  83. }
  84. }
  85. }
  86. -- index[n] ->
  87. -- t[1] - pattern,
  88. -- t[2] - key in patterns table,
  89. -- t[3] - value in patterns table
  90. -- t[4] - local pattern index
  91. local pdf_indexes = {}
  92. local pdf_text_indexes = {}
  93. local pdf_cmap_indexes = {}
  94. local pdf_trie
  95. local pdf_text_trie
  96. local pdf_cmap_trie
  97. local exports = {}
  98. local config = {
  99. max_extraction_size = 512 * 1024,
  100. max_processing_size = 32 * 1024,
  101. text_extraction = false, -- NYI feature
  102. url_extraction = true,
  103. enabled = true,
  104. js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
  105. min_js_fuzzy = 256, -- Minimum size of js to be considered as a fuzzy
  106. openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
  107. max_pdf_objects = 10000, -- Maximum number of objects to be considered
  108. max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
  109. max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
  110. pdf_process_timeout = 1.0, -- Timeout in seconds for processing
  111. }
  112. -- Used to process patterns found in PDF
  113. -- positions for functional processors should be a iter/table from trie matcher in form
  114. ---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
  115. ---- pat_idxn is pattern index and n1 ... nn are match positions
  116. local processors = {}
  117. -- PDF objects outer grammar in LPEG style (performing table captures)
  118. local pdf_outer_grammar
  119. local pdf_text_grammar
  120. -- Used to match objects
  121. local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=])
  122. local function config_module()
  123. local opts = rspamd_config:get_all_opt('lua_content')
  124. if opts and opts.pdf then
  125. config = lua_util.override_defaults(config, opts.pdf)
  126. end
  127. end
  128. local function compile_tries()
  129. local default_compile_flags = bit.bor(rspamd_trie.flags.re,
  130. rspamd_trie.flags.dot_all,
  131. rspamd_trie.flags.no_start)
  132. local function compile_pats(patterns, indexes, compile_flags)
  133. local strs = {}
  134. for what,data in pairs(patterns) do
  135. for i,pat in ipairs(data.patterns) do
  136. strs[#strs + 1] = pat
  137. indexes[#indexes + 1] = {what, data, pat, i}
  138. end
  139. end
  140. return rspamd_trie.create(strs, compile_flags or default_compile_flags)
  141. end
  142. if not pdf_trie then
  143. pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
  144. end
  145. if not pdf_text_trie then
  146. pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
  147. end
  148. if not pdf_cmap_trie then
  149. pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
  150. end
  151. end
  152. -- Returns a table with generic grammar elements for PDF
  153. local function generic_grammar_elts()
  154. local P = lpeg.P
  155. local R = lpeg.R
  156. local S = lpeg.S
  157. local V = lpeg.V
  158. local C = lpeg.C
  159. local D = R'09' -- Digits
  160. local grammar_elts = {}
  161. -- Helper functions
  162. local function pdf_hexstring_unescape(s)
  163. local function ue(cc)
  164. return string.char(tonumber(cc, 16))
  165. end
  166. if #s % 2 == 0 then
  167. -- Sane hex string
  168. return s:gsub('..', ue)
  169. end
  170. -- WTF hex string
  171. -- Append '0' to it and unescape...
  172. return s:sub(1, #s - 1):gsub('..' , ue) .. (s:sub(#s) .. '0'):gsub('..' , ue)
  173. end
  174. local function pdf_string_unescape(s)
  175. local function ue_single(cc)
  176. if cc == '\\r' then
  177. return '\r'
  178. elseif cc == '\\n' then
  179. return '\n'
  180. else
  181. return cc:gsub(2, 2)
  182. end
  183. end
  184. -- simple unescape \char
  185. s = s:gsub('\\[^%d]', ue_single)
  186. -- unescape octal
  187. local function ue_octal(cc)
  188. -- Replace unknown stuff with '?'
  189. return string.char(tonumber(cc:sub(2), 8) or 63)
  190. end
  191. s = s:gsub('\\%d%d?%d?', ue_octal)
  192. return s
  193. end
  194. local function pdf_id_unescape(s)
  195. return (s:gsub('#%d%d', function (cc)
  196. return string.char(tonumber(cc:sub(2), 16))
  197. end))
  198. end
  199. local delim = S'()<>[]{}/%'
  200. grammar_elts.ws = S'\0 \r\n\t\f'
  201. local hex = R'af' + R'AF' + D
  202. -- Comments.
  203. local eol = P'\r\n' + '\n'
  204. local line = (1 - S'\r\n\f')^0 * eol^-1
  205. grammar_elts.comment = P'%' * line
  206. -- Numbers.
  207. local sign = S'+-'^-1
  208. local decimal = D^1
  209. local float = D^1 * P'.' * D^0 + P'.' * D^1
  210. grammar_elts.number = C(sign * (float + decimal)) / tonumber
  211. -- String
  212. grammar_elts.str = P{ "(" * C(((1 - S"()\\") + (P '\\' * 1) + V(1))^0) / pdf_string_unescape * ")" }
  213. grammar_elts.hexstr = P{"<" * C(hex^0) / pdf_hexstring_unescape * ">"}
  214. -- Identifier
  215. grammar_elts.id = P{'/' * C((1-(delim + grammar_elts.ws))^1) / pdf_id_unescape}
  216. -- Booleans (who care about them?)
  217. grammar_elts.boolean = C(P("true") + P("false"))
  218. -- Stupid references
  219. grammar_elts.ref = lpeg.Ct{lpeg.Cc("%REF%") * C(D^1) * " " * C(D^1) * " " * "R"}
  220. return grammar_elts
  221. end
  222. -- Generates a grammar to parse outer elements (external objects in PDF notation)
  223. local function gen_outer_grammar()
  224. local V = lpeg.V
  225. local gen = generic_grammar_elts()
  226. return lpeg.P{
  227. "EXPR";
  228. EXPR = gen.ws^0 * V("ELT")^0 * gen.ws^0,
  229. ELT = V("ARRAY") + V("DICT") + V("ATOM"),
  230. ATOM = gen.ws^0 * (gen.comment + gen.boolean + gen.ref +
  231. gen.number + V("STRING") + gen.id) * gen.ws^0,
  232. DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
  233. KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ELT") * gen.ws^0),
  234. ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ELT")^0) * gen.ws^0 * "]",
  235. STRING = lpeg.P{gen.str + gen.hexstr},
  236. }
  237. end
  238. -- Graphic state in PDF
  239. local function gen_graphics_unary()
  240. local P = lpeg.P
  241. local S = lpeg.S
  242. return P("q") + P("Q") + P("h")
  243. + S("WSsFfBb") * P("*")^0 + P("n")
  244. end
  245. local function gen_graphics_binary()
  246. local P = lpeg.P
  247. local S = lpeg.S
  248. return S("gGwJjMi") +
  249. P("M") + P("ri") + P("gs") +
  250. P("CS") + P("cs") + P("sh")
  251. end
  252. local function gen_graphics_ternary()
  253. local P = lpeg.P
  254. local S = lpeg.S
  255. return P("d") + P("m") + S("lm")
  256. end
  257. local function gen_graphics_nary()
  258. local P = lpeg.P
  259. local S = lpeg.S
  260. return P("SC") + P("sc") + P("SCN") + P("scn") + P("k") + P("K") + P("re") + S("cvy") +
  261. P("RG") + P("rg")
  262. end
  263. -- Generates a grammar to parse text blocks (between BT and ET)
  264. local function gen_text_grammar()
  265. local V = lpeg.V
  266. local P = lpeg.P
  267. local C = lpeg.C
  268. local gen = generic_grammar_elts()
  269. local empty = ""
  270. local unary_ops = C("T*") / "\n" +
  271. C(gen_graphics_unary()) / empty
  272. local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
  273. gen_graphics_binary()
  274. local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
  275. local nary_op = P("Tm") + gen_graphics_nary()
  276. local text_binary_op = P("Tj") + P("TJ") + P("'")
  277. local text_quote_op = P('"')
  278. local font_op = P("Tf")
  279. return lpeg.P{
  280. "EXPR";
  281. EXPR = gen.ws^0 * lpeg.Ct(V("COMMAND")^0),
  282. COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
  283. V("FONT") + gen.comment) * gen.ws^0,
  284. UNARY = unary_ops,
  285. BINARY = V("ARG") / empty * gen.ws^1 * binary_ops,
  286. TERNARY = V("ARG") / empty * gen.ws^1 * V("ARG") / empty * gen.ws^1 * ternary_ops,
  287. NARY = (gen.number / 0 * gen.ws^1)^1 * (gen.id / empty * gen.ws^0)^-1 * nary_op,
  288. ARG = V("ARRAY") + V("DICT") + V("ATOM"),
  289. ATOM = (gen.comment + gen.boolean + gen.ref +
  290. gen.number + V("STRING") + gen.id),
  291. DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
  292. KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ARG") * gen.ws^0),
  293. ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ARG")^0) * gen.ws^0 * "]",
  294. STRING = lpeg.P{gen.str + gen.hexstr},
  295. TEXT = (V("TEXT_ARG") * gen.ws^1 * text_binary_op) +
  296. (V("ARG") / 0 * gen.ws^1 * V("ARG") / 0 * gen.ws^1 * V("TEXT_ARG") * gen.ws^1 * text_quote_op),
  297. FONT = (V("FONT_ARG") * gen.ws^1 * (gen.number / 0) * gen.ws^1 * font_op),
  298. FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
  299. TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
  300. TEXT_ARRAY = "[" *
  301. lpeg.Ct(((gen.ws^0 * (gen.ws^0 * (gen.number / 0)^0 * gen.ws^0 * (gen.str + gen.hexstr)))^1)) * gen.ws^0 * "]",
  302. }
  303. end
  304. -- Call immediately on require
  305. compile_tries()
  306. config_module()
  307. pdf_outer_grammar = gen_outer_grammar()
  308. pdf_text_grammar = gen_text_grammar()
  309. local function extract_text_data(specific)
  310. return nil -- NYI
  311. end
  312. -- Generates index for major/minor pair
  313. local function obj_ref(major, minor)
  314. return major * 10.0 + 1.0 / (minor + 1.0)
  315. end
  316. -- Return indirect object reference (if needed)
  317. local function maybe_dereference_object(elt, pdf, task)
  318. if type(elt) == 'table' and elt[1] == '%REF%' then
  319. local ref = obj_ref(elt[2], elt[3])
  320. if pdf.ref[ref] then
  321. -- No recursion!
  322. return pdf.ref[ref]
  323. else
  324. lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s, no object',
  325. elt[2], elt[3], obj_ref(elt[2], elt[3]))
  326. return nil
  327. end
  328. end
  329. return elt
  330. end
  331. -- Apply PDF stream filter
  332. local function apply_pdf_filter(input, filt)
  333. if filt == 'FlateDecode' then
  334. return rspamd_util.inflate(input, config.max_extraction_size)
  335. end
  336. return nil
  337. end
  338. -- Conditionally apply a pipeline of stream filters and return uncompressed data
  339. local function maybe_apply_filter(dict, data, pdf, task)
  340. local uncompressed = data
  341. if dict.Filter then
  342. local filt = dict.Filter
  343. if type(filt) == 'string' then
  344. filt = {filt}
  345. end
  346. if dict.DecodeParms then
  347. local decode_params = maybe_dereference_object(dict.DecodeParms, pdf, task)
  348. if type(decode_params) == 'table' then
  349. if decode_params.Predictor then
  350. return nil,'predictor exists'
  351. end
  352. end
  353. end
  354. for _,f in ipairs(filt) do
  355. uncompressed = apply_pdf_filter(uncompressed, f)
  356. if not uncompressed then break end
  357. end
  358. end
  359. return uncompressed,nil
  360. end
  361. -- Conditionally extract stream data from object and attach it as obj.uncompressed
  362. local function maybe_extract_object_stream(obj, pdf, task)
  363. if pdf.encrypted then
  364. -- TODO add decryption some day
  365. return nil
  366. end
  367. local dict = obj.dict
  368. if dict.Length and type(obj.stream) == 'table' then
  369. local len = math.min(obj.stream.len,
  370. tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
  371. local real_stream = obj.stream.data:span(1, len)
  372. local uncompressed,filter_err = maybe_apply_filter(dict, real_stream, pdf, task)
  373. if uncompressed then
  374. obj.uncompressed = uncompressed
  375. lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
  376. obj.major, obj.minor, len, uncompressed:len())
  377. return obj.uncompressed
  378. else
  379. lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s',
  380. obj.major, obj.minor, len, dict.Filter, filter_err)
  381. end
  382. end
  383. end
  384. local function parse_object_grammar(obj, task, pdf)
  385. -- Parse grammar
  386. local obj_dict_span
  387. if obj.stream then
  388. obj_dict_span = obj.data:span(1, obj.stream.start - obj.start)
  389. else
  390. obj_dict_span = obj.data
  391. end
  392. if obj_dict_span:len() < config.max_processing_size then
  393. local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
  394. if ret then
  395. if obj.stream then
  396. obj.dict = obj_or_err
  397. lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
  398. obj.major, obj.minor, obj_or_err)
  399. else
  400. -- Direct object
  401. if type(obj_or_err) == 'table' then
  402. obj.dict = obj_or_err
  403. obj.uncompressed = obj_or_err
  404. lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
  405. obj.major, obj.minor, obj_or_err)
  406. pdf.ref[obj_ref(obj.major, obj.minor)] = obj
  407. else
  408. lua_util.debugm(N, task, 'direct object %s:%s is parsed to raw data: %s',
  409. obj.major, obj.minor, obj_or_err)
  410. pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err
  411. obj.dict = {}
  412. obj.uncompressed = obj_or_err
  413. end
  414. end
  415. else
  416. lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
  417. obj.major, obj.minor, obj_or_err)
  418. end
  419. else
  420. lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
  421. obj.major, obj.minor, obj_dict_span:len())
  422. end
  423. end
  424. -- Extracts font data and process /ToUnicode mappings
  425. -- NYI in fact as cmap is ridiculously stupid and complicated
  426. --[[
  427. local function process_font(task, pdf, font, fname)
  428. local dict = font
  429. if font.dict then
  430. dict = font.dict
  431. end
  432. if type(dict) == 'table' and dict.ToUnicode then
  433. local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)
  434. if cmap and cmap.dict then
  435. maybe_extract_object_stream(cmap, pdf, task)
  436. lua_util.debugm(N, task, 'found cmap for font %s: %s',
  437. fname, cmap.uncompressed)
  438. end
  439. end
  440. end
  441. --]]
  442. -- Forward declaration
  443. local process_dict
  444. -- This function processes javascript string and returns JS hash and JS rspamd_text
  445. local function process_javascript(task, pdf, js)
  446. local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"
  447. if type(js) == 'string' then
  448. js = rspamd_text.fromstring(js):oneline()
  449. elseif type(js) == 'userdata' then
  450. js = js:oneline()
  451. else
  452. return nil
  453. end
  454. local hash = rspamd_cryptobox_hash.create(js)
  455. local bin_hash = hash:bin()
  456. if not pdf.scripts then
  457. pdf.scripts = {}
  458. end
  459. if pdf.scripts[bin_hash] then
  460. -- Duplicate
  461. return pdf.scripts[bin_hash]
  462. end
  463. local njs = {
  464. data = js,
  465. hash = rspamd_util.encode_base32(bin_hash),
  466. bin_hash = bin_hash,
  467. }
  468. pdf.scripts[bin_hash] = njs
  469. return njs
  470. end
  471. -- Extract interesting stuff from /Action, e.g. javascript
  472. local function process_action(task, pdf, obj)
  473. if not (obj.js or obj.launch) and (obj.dict and obj.dict.JS) then
  474. local js = maybe_dereference_object(obj.dict.JS, pdf, task)
  475. if js then
  476. if type(js) == 'table' then
  477. local extracted_js = maybe_extract_object_stream(js, pdf, task)
  478. if not extracted_js then
  479. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  480. obj.major, obj.minor, js)
  481. else
  482. js = extracted_js
  483. end
  484. end
  485. js = process_javascript(task, pdf, js)
  486. if js then
  487. obj.js = js
  488. lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
  489. obj.major, obj.minor, obj.js.data)
  490. else
  491. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  492. obj.major, obj.minor, js)
  493. end
  494. elseif obj.dict.F then
  495. local launch = maybe_dereference_object(obj.dict.F, pdf, task)
  496. if launch then
  497. if type(launch) == 'string' then
  498. obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
  499. lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
  500. obj.major, obj.minor, obj.launch)
  501. elseif type(launch) == 'userdata' then
  502. obj.launch = launch:exclude_chars('%n%c')
  503. lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
  504. obj.major, obj.minor, obj.launch)
  505. else
  506. lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
  507. obj.major, obj.minor, launch)
  508. end
  509. end
  510. else
  511. lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
  512. obj.major, obj.minor)
  513. end
  514. end
  515. end
  516. -- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction
  517. local function process_catalog(task, pdf, obj)
  518. if obj.dict then
  519. if obj.dict.OpenAction then
  520. local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task)
  521. if action and type(action) == 'table' then
  522. -- This also processes action js (if not already processed)
  523. process_dict(task, pdf, action, action.dict)
  524. if action.js then
  525. lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
  526. obj.major, obj.minor, action.js)
  527. pdf.openaction = action.js
  528. elseif action.launch then
  529. lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
  530. obj.major, obj.minor, action.launch)
  531. pdf.launch = action.launch
  532. else
  533. lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
  534. obj.major, obj.minor, action)
  535. end
  536. else
  537. lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
  538. obj.major, obj.minor, obj.dict.OpenAction, action)
  539. end
  540. else
  541. lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
  542. obj.major, obj.minor)
  543. end
  544. end
  545. end
  546. local function process_xref(task, pdf, obj)
  547. if obj.dict then
  548. if obj.dict.Encrypt then
  549. local encrypt = maybe_dereference_object(obj.dict.Encrypt, pdf, task)
  550. lua_util.debugm(N, task, 'found encrypt: %s in xref object %s:%s',
  551. encrypt, obj.major, obj.minor)
  552. pdf.encrypted = true
  553. end
  554. end
  555. end
  556. process_dict = function(task, pdf, obj, dict)
  557. if not obj.type and type(dict) == 'table' then
  558. if dict.Type and type(dict.Type) == 'string' then
  559. -- Common stuff
  560. obj.type = dict.Type
  561. end
  562. if not obj.type then
  563. if obj.dict.S and obj.dict.JS then
  564. obj.type = 'Javascript'
  565. lua_util.debugm(N, task, 'implicit type for Javascript object %s:%s',
  566. obj.major, obj.minor)
  567. else
  568. lua_util.debugm(N, task, 'no type for %s:%s',
  569. obj.major, obj.minor)
  570. return
  571. end
  572. end
  573. lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
  574. obj.major, obj.minor, obj.type)
  575. local contents = dict.Contents
  576. if contents and type(contents) == 'table' then
  577. if contents[1] == '%REF%' then
  578. -- Single reference
  579. contents = {contents}
  580. end
  581. obj.contents = {}
  582. for _,c in ipairs(contents) do
  583. local cobj = maybe_dereference_object(c, pdf, task)
  584. if cobj and type(cobj) == 'table' then
  585. obj.contents[#obj.contents + 1] = cobj
  586. cobj.parent = obj
  587. cobj.type = 'content'
  588. end
  589. end
  590. lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
  591. obj.major, obj.minor, #obj.contents)
  592. end
  593. local resources = dict.Resources
  594. if resources and type(resources) == 'table' then
  595. local res_ref = maybe_dereference_object(resources, pdf, task)
  596. if type(res_ref) ~= 'table' then
  597. lua_util.debugm(N, task, 'cannot parse resources from pdf: %s',
  598. resources)
  599. obj.resources = {}
  600. elseif res_ref.dict then
  601. obj.resources = res_ref.dict
  602. else
  603. obj.resources = {}
  604. end
  605. else
  606. -- Fucking pdf: we need to inherit from parent
  607. resources = {}
  608. if dict.Parent then
  609. local parent = maybe_dereference_object(dict.Parent, pdf, task)
  610. if parent and type(parent) == 'table' and parent.dict then
  611. if parent.resources then
  612. lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
  613. parent.major, parent.minor, obj.major, obj.minor)
  614. resources = parent.resources
  615. end
  616. end
  617. end
  618. obj.resources = resources
  619. end
  620. --[[Disabled fonts extraction
  621. local fonts = obj.resources.Font
  622. if fonts and type(fonts) == 'table' then
  623. obj.fonts = {}
  624. for k,v in pairs(fonts) do
  625. obj.fonts[k] = maybe_dereference_object(v, pdf, task)
  626. if obj.fonts[k] then
  627. local font = obj.fonts[k]
  628. if config.text_extraction then
  629. process_font(task, pdf, font, k)
  630. lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
  631. k, obj.major, obj.minor, font)
  632. end
  633. end
  634. end
  635. end
  636. ]]
  637. lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
  638. obj.major, obj.minor, obj.type, obj.resources)
  639. if obj.type == 'Action' then
  640. process_action(task, pdf, obj)
  641. elseif obj.type == 'Catalog' then
  642. process_catalog(task, pdf, obj)
  643. elseif obj.type == 'XRef' then
  644. -- XRef stream instead of trailer from PDF 1.5 (thanks Adobe)
  645. process_xref(task, pdf, obj)
  646. elseif obj.type == 'Javascript' then
  647. local js = maybe_dereference_object(obj.dict.JS, pdf, task)
  648. if js then
  649. if type(js) == 'table' then
  650. local extracted_js = maybe_extract_object_stream(js, pdf, task)
  651. if not extracted_js then
  652. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  653. obj.major, obj.minor, js)
  654. else
  655. js = extracted_js
  656. end
  657. end
  658. js = process_javascript(task, pdf, js)
  659. if js then
  660. obj.js = js
  661. lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
  662. obj.major, obj.minor, obj.js.data)
  663. else
  664. lua_util.debugm(N, task, 'invalid type for javascript from %s:%s: %s',
  665. obj.major, obj.minor, js)
  666. end
  667. end
  668. end
  669. end -- Already processed dict (obj.type is not empty)
  670. end
  671. -- This function is intended to unpack objects from ObjStm crappy structure
  672. local compound_obj_grammar
  673. local function compound_obj_grammar_gen()
  674. if not compound_obj_grammar then
  675. local gen = generic_grammar_elts()
  676. compound_obj_grammar = gen.ws^0 * (gen.comment * gen.ws^1)^0 *
  677. lpeg.Ct(lpeg.Ct(gen.number * gen.ws^1 * gen.number * gen.ws^0)^1)
  678. end
  679. return compound_obj_grammar
  680. end
  681. local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
  682. -- First, we need to parse data line by line likely to find a line
  683. -- that consists of pairs of numbers
  684. compound_obj_grammar_gen()
  685. local elts = compound_obj_grammar:match(uncompressed)
  686. if elts and #elts > 0 then
  687. lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
  688. #uncompressed, elts)
  689. for i,pair in ipairs(elts) do
  690. local obj_number,offset = pair[1], pair[2]
  691. offset = offset + first
  692. if offset < #uncompressed then
  693. local span_len
  694. if i == #elts then
  695. span_len = #uncompressed - offset
  696. else
  697. span_len = (elts[i + 1][2] + first) - offset
  698. end
  699. if span_len > 0 and offset + span_len < #uncompressed then
  700. local obj = {
  701. major = obj_number,
  702. minor = 0, -- Implicit
  703. data = uncompressed:span(offset + 1, span_len),
  704. ref = obj_ref(obj_number, 0)
  705. }
  706. parse_object_grammar(obj, task, pdf)
  707. if obj.dict then
  708. pdf.objects[#pdf.objects + 1] = obj
  709. end
  710. end
  711. end
  712. end
  713. end
  714. end
  715. -- PDF 1.5 ObjStmt
  716. local function extract_pdf_compound_objects(task, pdf)
  717. for i,obj in ipairs(pdf.objects or {}) do
  718. if i > 0 and i % 100 == 0 then
  719. local now = rspamd_util.get_ticks()
  720. if now >= pdf.end_timestamp then
  721. pdf.timeout_processing = now - pdf.start_timestamp
  722. lua_util.debugm(N, task, 'pdf: timeout processing compound objects after spending %s seconds, ' ..
  723. '%s elements processed',
  724. pdf.timeout_processing, i)
  725. break
  726. end
  727. end
  728. if obj.stream and obj.dict and type(obj.dict) == 'table' then
  729. local t = obj.dict.Type
  730. if t and t == 'ObjStm' then
  731. -- We are in troubles sir...
  732. local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
  733. local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))
  734. if nobjs and first then
  735. --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
  736. lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
  737. nobjs, first, obj.dict.Extends)
  738. local uncompressed = maybe_extract_object_stream(obj, pdf, task)
  739. if uncompressed then
  740. pdf_compound_object_unpack(obj, uncompressed, pdf, task, first)
  741. end
  742. else
  743. lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
  744. obj.major, obj.minor, obj.dict)
  745. end
  746. end
  747. end
  748. end
  749. end
  -- This function arranges starts and ends of all objects and processes them into
  -- the initial set of objects.
  --
  -- Start and end offsets are walked in parallel. For every plausible pair the
  -- bytes right before the start offset are matched against `object_re` to obtain
  -- the object numbers; matching objects are appended to `pdf.objects` and, when
  -- both numbers are present, registered in `pdf.ref`.
  --
  -- `pdf.objects` is kept a proper sequence (no nil holes): an object whose
  -- header does not match is skipped without consuming an index, so that later
  -- `ipairs(pdf.objects)` / `#pdf.objects` users see every extracted object.
  --
  -- @param task task object, used for debug logging
  -- @param input document text; must provide `input:span(start, len)`
  -- @param pdf pdf state table; reads `pdf.start_objects` and `pdf.end_objects`,
  --   fills `pdf.objects` and `pdf.ref` (both expected to be tables already)
  local function extract_outer_objects(task, input, pdf)
    local start_pos, end_pos = 1, 1
    local obj_count = 0

    -- Never look at more than `config.max_pdf_objects` offsets of each kind
    local max_start_pos = math.min(config.max_pdf_objects, #pdf.start_objects)
    local max_end_pos = math.min(config.max_pdf_objects, #pdf.end_objects)

    lua_util.debugm(N, task, "pdf: extract objects from %s start positions and %s end positions",
        max_start_pos, max_end_pos)

    while start_pos <= max_start_pos and end_pos <= max_end_pos do
      local first = pdf.start_objects[start_pos]
      local last = pdf.end_objects[end_pos]

      -- Keep a 6 byte margin before the end offset (presumably the `endobj`
      -- marker itself; TODO confirm the exact marker length)
      if first + 6 < last then
        local len = last - first - 6

        -- Also get the starting span and try to match it versus obj re to get numbers
        local obj_line_potential = first - 32
        if obj_line_potential < 1 then
          obj_line_potential = 1
        end

        -- Do not let the header window reach into the previous object
        local prev_obj_end = pdf.end_objects[end_pos - 1]
        if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
          obj_line_potential = prev_obj_end + 1
        end

        local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1)
        local matches = object_re:search(obj_line_span, true, true)

        if matches and matches[1] then
          local nobj = {
            start = first,
            len = len,
            data = input:span(first, len),
            major = tonumber(matches[1][2]),
            minor = tonumber(matches[1][3]),
          }

          -- Advance the index only for stored objects to avoid holes
          obj_count = obj_count + 1
          pdf.objects[obj_count] = nobj

          if nobj.major and nobj.minor then
            -- Add reference
            local ref = obj_ref(nobj.major, nobj.minor)
            nobj.ref = ref -- Our internal reference
            pdf.ref[ref] = nobj
          end
        end

        start_pos = start_pos + 1
        end_pos = end_pos + 1
      elseif first > last then
        -- Stray end marker before the current start: skip it
        end_pos = end_pos + 1
      else
        -- Start and end are too close to hold an object: drop both
        start_pos = start_pos + 1
        end_pos = end_pos + 1
      end
    end
  end
  -- This function attaches streams to objects.
  --
  -- Stream start/end offsets are walked in parallel with the objects; a stream
  -- whose bounds lie inside an object is stored as `obj.stream`
  -- (`start`, `len`, `data`). Leading CR/LF bytes of the stream are skipped.
  --
  -- The `endstream` marker is excluded from the stream for every candidate end
  -- offset, including the later candidates picked up when an object contains
  -- several `endstream` matches.
  --
  -- @param task task object, used for debug logging
  -- @param input document text; must provide `input:at(pos)` and `input:span(start, len)`
  -- @param pdf pdf state table; reads `pdf.start_streams`, `pdf.end_streams` and
  --   `pdf.objects`
  local function attach_pdf_streams(task, input, pdf)
    if not (pdf.start_streams and pdf.end_streams) then
      return
    end

    -- Number of bytes subtracted from an end offset to exclude the endstream\n pattern
    local endstream_len = 10

    local start_pos, end_pos = 1, 1
    local max_start_pos = math.min(config.max_pdf_objects, #pdf.start_streams)
    local max_end_pos = math.min(config.max_pdf_objects, #pdf.end_streams)

    for _, obj in ipairs(pdf.objects) do
      local obj_end = obj.start + obj.len

      while start_pos <= max_start_pos and end_pos <= max_end_pos do
        local first = pdf.start_streams[start_pos]
        local last = pdf.end_streams[end_pos] - endstream_len

        lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s",
            first, last, obj.start, obj_end)

        if first > obj.start and last < obj_end and last > first then
          -- In case if we have fake endstream :(
          -- Take the last end offset that is still inside the object
          while pdf.end_streams[end_pos + 1] and pdf.end_streams[end_pos + 1] < obj_end do
            end_pos = end_pos + 1
            last = pdf.end_streams[end_pos] - endstream_len
          end

          -- Strip the leading CR/LF bytes
          while first < last do
            local chr = input:at(first)
            if chr ~= 13 and chr ~= 10 then
              break
            end
            first = first + 1
          end

          local len = last - first
          obj.stream = {
            start = first,
            len = len,
            data = input:span(first, len)
          }
          start_pos = start_pos + 1
          end_pos = end_pos + 1
          break
        elseif first < obj.start then
          -- Stream starts before this object: skip the start offset
          start_pos = start_pos + 1
        elseif last > obj_end then
          -- Not this object
          break
        else
          start_pos = start_pos + 1
          end_pos = end_pos + 1
        end
      end

      if obj.stream then
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length',
            obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len)
      else
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
            obj.major, obj.minor, obj.start, obj.len)
      end
    end
  end
  -- Processes PDF objects: extracts the outer objects, attaches streams, parses the
  -- grammar of every referenced object, unpacks compound objects and finally
  -- processes the dictionaries of all objects.
  -- @param task task object, forwarded to helpers and debug logging
  -- @param input document text, forwarded to the extraction helpers
  -- @param pdf pdf state table; `pdf.objects` and `pdf.ref` are reset here and
  --   `pdf.timeout_processing` is set when the deadline is exceeded
  local function postprocess_pdf_objects(task, input, pdf)
    -- Deadline check performed on every 100th element only. Records the time
    -- spent, logs `fmt` and returns true once `pdf.end_timestamp` is reached.
    local function deadline_reached(idx, fmt)
      if idx % 100 ~= 0 then
        return false
      end

      local now = rspamd_util.get_ticks()
      if now < pdf.end_timestamp then
        return false
      end

      pdf.timeout_processing = now - pdf.start_timestamp
      lua_util.debugm(N, task, fmt, pdf.timeout_processing, idx)
      return true
    end

    pdf.objects = {} -- objects table
    pdf.ref = {} -- references table

    extract_outer_objects(task, input, pdf)
    -- Objects are known now, so attach the streams that are in bounds
    attach_pdf_streams(task, input, pdf)

    -- Parse grammar for outer objects
    for idx, obj in ipairs(pdf.objects) do
      if deadline_reached(idx, 'pdf: timeout processing grammars after spending %s seconds, ' ..
          '%s elements processed') then
        break
      end

      if obj.ref then
        parse_object_grammar(obj, task, pdf)
      end
    end

    if pdf.timeout_processing then
      -- ENOTIME
      return
    end

    extract_pdf_compound_objects(task, pdf)

    -- Now we might probably have all objects being processed
    for idx, obj in ipairs(pdf.objects) do
      if obj.dict then
        -- Types processing
        if deadline_reached(idx, 'pdf: timeout processing dicts after spending %s seconds, ' ..
            '%s elements processed') then
          break
        end

        process_dict(task, pdf, obj, obj.dict)
      end
    end
  end
  -- Pairs start offsets with end offsets and appends `{start = ..., len = ...}`
  -- blocks to `out`.
  -- An end offset that lies before the current start is skipped; a start that
  -- equals the current end stops the pairing altogether.
  -- @param starts list of start offsets
  -- @param ends list of end offsets
  -- @param out list that receives the resulting blocks
  local function offsets_to_blocks(starts, ends, out)
    local si, ei = 1, 1

    while si <= #starts and ei <= #ends do
      local block_start = starts[si]
      local block_end = ends[ei]

      if block_start == block_end then
        -- Not ordered properly!
        break
      end

      if block_start < block_end then
        out[#out + 1] = {
          start = block_start,
          len = block_end - block_start,
        }
        si = si + 1
      end

      -- The end offset is consumed both when paired and when skipped
      ei = ei + 1
    end
  end
  -- Extracts text for every object of type `Page` that has `contents`.
  -- Each content object is decoded, scanned with `pdf_text_trie` for text block
  -- boundaries, and every block is parsed with `pdf_text_grammar`. The parsed
  -- pieces are joined and stored in the page's `text` field.
  -- @param task task object, forwarded to helpers and debug logging
  -- @param pdf pdf state table; reads `pdf.objects`
  local function search_text(task, pdf)
    -- Splits trie hits into start offsets (pattern 1) and end offsets (any other pattern)
    local function split_offsets(matches)
      local starts, ends = {}, {}

      for npat, matched_positions in pairs(matches) do
        local dest = ends
        if npat == 1 then
          dest = starts
        end

        for _, pos in ipairs(matched_positions) do
          dest[#dest + 1] = pos
        end
      end

      return starts, ends
    end

    -- Cuts one block out of the decoded stream and, unless it is too large,
    -- parses it and appends the result to `pieces`
    local function parse_block(page, tobj, block, pieces)
      if block.len > 2 then
        -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
        block.len = block.len - 2
      end

      block.data = tobj.uncompressed:span(block.start, block.len)

      if block.len >= config.max_processing_size then
        return
      end

      local ok, parsed_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
          block.data)

      if ok then
        pieces[#pieces + 1] = parsed_or_err
        lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
            parsed_or_err, tobj.major, tobj.minor, page.major, page.minor)
      else
        lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
            page.major, page.minor, parsed_or_err)
      end
    end

    for _, page in ipairs(pdf.objects) do
      if page.type == 'Page' and page.contents then
        local pieces = {}

        for _, tobj in ipairs(page.contents) do
          maybe_extract_object_stream(tobj, pdf, task)
          local matches = pdf_text_trie:match(tobj.uncompressed or '')

          if matches then
            local text_blocks = {}
            local starts, ends = split_offsets(matches)

            offsets_to_blocks(starts, ends, text_blocks)

            for _, block in ipairs(text_blocks) do
              parse_block(page, tobj, block, pieces)
            end
          end
        end

        -- Join all text data together
        if #pieces > 0 then
          page.text = rspamd_text.fromtable(pieces)
          lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
              page.major, page.minor, page.text)
        end
      end
    end
  end
  -- This function searches object dictionaries for the `URI` key and parses its content.
  -- Every string value found this way is turned into a URL object and injected
  -- into the task for the given mime part.
  -- @param task task object; provides the memory pool and receives the urls
  -- @param pdf pdf state table; reads `pdf.objects`
  -- @param mpart mime part passed along to `task:inject_url`
  local function search_urls(task, pdf, mpart)
    -- Resolves a `URI` value and injects it when it turns out to be a string
    local function inject_uri(obj, value)
      local target = maybe_dereference_object(value, pdf, task)

      if type(target) ~= 'string' then
        return
      end

      local url = rspamd_url.create(task:get_mempool(), target, {'content'})

      if url then
        lua_util.debugm(N, task, 'found url %s in object %s:%s',
            target, obj.major, obj.minor)
        task:inject_url(url, mpart)
      end
    end

    -- Depth-limited walk over nested tables of a dictionary
    local function walk(obj, dict, depth)
      if depth > 10 then
        lua_util.debugm(N, task, 'object %s:%s recurses too much',
            obj.major, obj.minor)
        return
      end

      for key, value in pairs(dict) do
        if type(value) == 'table' then
          walk(obj, value, depth + 1)
        elseif key == 'URI' then
          inject_uri(obj, value)
        end
      end
    end

    for _, obj in ipairs(pdf.objects) do
      if obj.dict and type(obj.dict) == 'table' then
        walk(obj, obj.dict, 0)
      end
    end
  end
  -- Entry point of the PDF processing.
  -- Scans `input` with `pdf_trie`, dispatches the matched offsets to the
  -- registered processors, post-processes the objects and optionally extracts
  -- text, urls and Javascript fuzzy hashes.
  -- @param input document text
  -- @param mpart mime part, forwarded to the url extraction
  -- @param task task object
  -- @return an empty table when processing is disabled, nothing when the trie
  --   does not match, the pdf state table otherwise
  local function process_pdf(input, mpart, task)
    -- Groups the trie matches by processor and collects {offset, local pattern} pairs
    local function group_matches(matches)
      local groups = {}

      for npat, matched_positions in pairs(matches) do
        local index = pdf_indexes[npat]
        local proc_key, loc_npat = index[1], index[4]
        local group = groups[proc_key]

        if not group then
          group = {
            processor_func = processors[proc_key],
            offsets = {},
          }
          groups[proc_key] = group
        end

        -- Fill offsets
        for _, pos in ipairs(matched_positions) do
          group.offsets[#group.offsets + 1] = {pos, loc_npat}
        end
      end

      return groups
    end

    -- Tells whether a script is long enough to be hashed
    local function long_enough(data)
      return config.min_js_fuzzy and #data >= config.min_js_fuzzy
    end

    -- Fills `pdf_output.fuzzy_hashes` from the OpenAction script or from all scripts
    local function collect_fuzzy_hashes(pdf_output)
      pdf_output.fuzzy_hashes = {}

      if config.openaction_fuzzy_only then
        -- OpenAction only
        local oa = pdf_output.openaction

        if oa and oa.bin_hash then
          if long_enough(oa.data) then
            lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s",
                oa.hash)
            table.insert(pdf_output.fuzzy_hashes, oa.bin_hash)
          else
            lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
                oa.hash, #oa.data)
          end
        end

        return
      end

      -- All hashes
      for h, sc in pairs(pdf_output.scripts) do
        if long_enough(sc.data) then
          lua_util.debugm(N, task, "pdf: add fuzzy hash from Javascript: %s",
              sc.hash)
          table.insert(pdf_output.fuzzy_hashes, h)
        else
          lua_util.debugm(N, task, "pdf: skip fuzzy hash from Javascript: %s, too short: %s",
              sc.hash, #sc.data)
        end
      end
    end

    if not config.enabled then
      -- Skip processing
      return {}
    end

    local matches = pdf_trie:match(input)
    if not matches then
      return
    end

    local start_ts = rspamd_util.get_ticks()
    local pdf_output = {
      tag = 'pdf',
      extract_text = extract_text_data,
      start_timestamp = start_ts,
      end_timestamp = start_ts + config.pdf_process_timeout,
    }

    for name, processor in pairs(group_matches(matches)) do
      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
          name, #processor.offsets)
      -- Sort by offset
      table.sort(processor.offsets, function(e1, e2) return e1[1] < e2[1] end)
      processor.processor_func(input, task, processor.offsets, pdf_output)
    end

    pdf_output.flags = {}

    if not (pdf_output.start_objects and pdf_output.end_objects) then
      pdf_output.flags.no_objects = true
      return pdf_output
    end

    if #pdf_output.start_objects > config.max_pdf_objects then
      -- Only recorded here; the extraction helpers apply the limit themselves
      pdf_output.many_objects = #pdf_output.start_objects
    end

    -- Postprocess objects
    postprocess_pdf_objects(task, input, pdf_output)

    if config.text_extraction then
      search_text(task, pdf_output)
    end

    if config.url_extraction then
      search_urls(task, pdf_output, mpart)
    end

    if config.js_fuzzy and pdf_output.scripts then
      collect_fuzzy_hashes(pdf_output)
    end

    return pdf_output
  end
  -- Processes the PDF trailer.
  --
  -- Only the last trailer match is inspected. When the trailer (the bytes from
  -- that match up to the end of the input) is longer than
  -- `config.max_pdf_trailer`, or has more than `config.max_pdf_trailer_lines`
  -- lines, its length is stored in `output.long_trailer`. A line containing
  -- `/Encrypt ` sets `output.encrypted`.
  --
  -- The size limit is applied to the trailer length, i.e. to the same quantity
  -- that is reported in `output.long_trailer`, and not to the absolute offset of
  -- the trailer inside the document.
  --
  -- @param input document text; must provide `#input` and `input:span(start)`
  -- @param task task object, used for debug logging
  -- @param positions sorted list of `{offset, pattern}` pairs
  -- @param output pdf state table that receives the flags
  processors.trailer = function(input, task, positions, output)
    local last_pos = positions[#positions]

    if not last_pos then
      -- Nothing matched, nothing to inspect
      return
    end

    local trailer_start = last_pos[1]
    local trailer_len = #input - trailer_start

    lua_util.debugm(N, task, 'pdf: process trailer at position %s (%s total length)',
        trailer_start, #input)

    if trailer_len > config.max_pdf_trailer then
      output.long_trailer = trailer_len
      return
    end

    local last_span = input:span(trailer_start)
    local lines_checked = 0

    for line in last_span:lines(true) do
      if line:find('/Encrypt ') then
        lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
            line)
        output.encrypted = true
        break
      end

      lines_checked = lines_checked + 1

      if lines_checked > config.max_pdf_trailer_lines then
        lua_util.debugm(N, task, "pdf: trailer has too many lines, stop checking")
        output.long_trailer = trailer_len
        break
      end
    end
  end
  -- Processor for the `suspicious` pattern group: any match marks the document
  -- as suspicious. The input and the match offsets are not inspected.
  processors.suspicious = function(_input, task, _positions, output)
    output.suspicious = true
    lua_util.debugm(N, task, "pdf: found a suspicious pattern")
  end
  -- Appends the offset (first element) of every entry in `positions` to the list
  -- stored at `output[output_key]`, creating that list when it does not exist.
  -- @param positions list of `{offset, ...}` entries
  -- @param output table that holds the destination list
  -- @param output_key key of the destination list inside `output`
  local function generic_table_inserter(positions, output, output_key)
    local dest = output[output_key]

    if not dest then
      dest = {}
      output[output_key] = dest
    end

    -- New elements go right after the ones already present
    local base = #dest

    for idx, entry in ipairs(positions) do
      dest[base + idx] = entry[1]
    end
  end
  -- Processors that merely collect match offsets: each one appends the offsets
  -- of its matches to the list stored under the given key of the output table.
  do
    local offset_lists = {
      start_object = 'start_objects',
      end_object = 'end_objects',
      start_stream = 'start_streams',
      end_stream = 'end_streams',
    }

    for proc_name, output_key in pairs(offset_lists) do
      processors[proc_name] = function(_, _, positions, output)
        generic_table_inserter(positions, output, output_key)
      end
    end
  end

  -- Public interface of the module
  exports.process = process_pdf

  return exports