Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412
  1. --[[
  2. Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ]]--
  13. --[[[
  14. -- @module lua_content/pdf
  15. -- This module contains some heuristics for PDF files
  16. --]]
  17. local rspamd_trie = require "rspamd_trie"
  18. local rspamd_util = require "rspamd_util"
  19. local rspamd_text = require "rspamd_text"
  20. local rspamd_url = require "rspamd_url"
  21. local bit = require "bit"
  22. local N = "lua_content"
  23. local lua_util = require "lua_util"
  24. local rspamd_regexp = require "rspamd_regexp"
  25. local lpeg = require "lpeg"
  -- Regular expressions used to locate the structural landmarks of a PDF
  -- document (trailer, object and stream boundaries) plus a few byte
  -- sequences that are considered suspicious. Each key maps to a table with
  -- a `patterns` list; compile_tries() feeds them to `pdf_trie`.
  local pdf_patterns = {
    trailer = {
      patterns = { [[\ntrailer\r?\n]] },
    },
    suspicious = {
      patterns = {
        [[netsh\s]],
        [[echo\s]],
        [=[\/[A-Za-z]*#\d\d[#A-Za-z<>/\s]]=], -- Hex encode obfuscation
      },
    },
    start_object = {
      patterns = { [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=] },
    },
    end_object = {
      patterns = { [=[endobj[\r\n]]=] },
    },
    start_stream = {
      patterns = { [=[>\s*stream[\r\n]]=] },
    },
    end_stream = {
      patterns = { [=[endstream[\r\n]]=] },
    },
  }
  -- Markers that delimit a text block inside a content stream:
  -- `BT` opens the block and `ET` closes it.
  local pdf_text_patterns = {
    start = { patterns = { [[\sBT\s]] } },
    stop = { patterns = { [[\sET\b]] } },
  }
  -- Markers that delimit the mapping sections of a CMap stream.
  -- Sections are opened by `<n> beginbfchar` / `<n> beginbfrange` and closed by
  -- the matching `endbfchar` / `endbfrange` operator.
  local pdf_cmap_patterns = {
    start = {
      patterns = {
        [[\d\s+beginbfchar\s]],
        [[\d\s+beginbfrange\s]]
      }
    },
    stop = {
      patterns = {
        [[\sendbfrange\b]],
        -- The operator is `endbfchar`; the previous spelling `endbchar`
        -- could never match a real CMap.
        [[\sendbfchar\b]]
      }
    }
  }
  -- Lookup tables filled by compile_pats() inside compile_tries(): entry n
  -- describes the n-th pattern handed to the corresponding trie.
  -- index[n] ->
  -- t[1] - key in patterns table,
  -- t[2] - value in patterns table (the table that holds `patterns`),
  -- t[3] - pattern,
  -- t[4] - local pattern index (position inside `patterns`)
  local pdf_indexes = {}
  local pdf_text_indexes = {}
  local pdf_cmap_indexes = {}
  -- Tries are created lazily by compile_tries()
  local pdf_trie
  local pdf_text_trie
  local pdf_cmap_trie
  local exports = {}
  -- Default module settings; config_module() overrides them with the `pdf`
  -- section of the `lua_content` options when that section is present.
  local config = {
    -- Feature switches
    enabled = true,
    text_extraction = false, -- NYI feature
    url_extraction = true,
    js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
    openaction_fuzzy_only = false, -- Generate fuzzy from all scripts
    min_js_fuzzy = 256, -- Minimum size of js to be considered as a fuzzy
    -- Limits
    max_extraction_size = 512 * 1024,
    max_processing_size = 32 * 1024,
    max_pdf_objects = 10000, -- Maximum number of objects to be considered
    max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
    max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
    pdf_process_timeout = 1.0, -- Timeout in seconds for processing
  }
  -- Registry of processors for patterns found in PDF.
  -- Positions passed to functional processors should be an iter/table from
  -- the trie matcher in the form
  ---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
  ---- pat_idxn is the pattern index and n1 ... nn are match positions
  local processors = {}
  -- LPEG grammars: the outer grammar parses PDF objects (performing table
  -- captures), the text grammar parses text blocks. Both are assigned once
  -- during module initialisation.
  local pdf_outer_grammar, pdf_text_grammar
  -- Matches the `<major> <minor> obj` header of an object
  local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=])
  -- Merges the `pdf` section of the `lua_content` options (if any) into the
  -- module defaults held in `config`.
  local function config_module()
    local module_opts = rspamd_config:get_all_opt('lua_content')
    local pdf_opts = module_opts and module_opts.pdf
    if not pdf_opts then
      return
    end
    config = lua_util.override_defaults(config, pdf_opts)
  end
  -- Builds the three pattern tries (outer PDF structure, text blocks, cmap
  -- sections) unless they already exist, and fills the matching index tables.
  local function compile_tries()
    local trie_flags = rspamd_trie.flags
    local default_compile_flags = bit.bor(trie_flags.re,
        trie_flags.dot_all,
        trie_flags.no_start)

    -- Flattens `patterns` into a list of regexps for the trie and records,
    -- for every regexp, {key, value, pattern, local index} in `indexes`.
    local function compile_pats(patterns, indexes, compile_flags)
      local regexps = {}
      for key, entry in pairs(patterns) do
        for local_idx, re in ipairs(entry.patterns) do
          table.insert(regexps, re)
          table.insert(indexes, {key, entry, re, local_idx})
        end
      end
      return rspamd_trie.create(regexps, compile_flags or default_compile_flags)
    end

    pdf_trie = pdf_trie or compile_pats(pdf_patterns, pdf_indexes)
    pdf_text_trie = pdf_text_trie or compile_pats(pdf_text_patterns, pdf_text_indexes)
    pdf_cmap_trie = pdf_cmap_trie or compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
  end
  -- Returns a table with generic grammar elements for PDF.
  -- The returned table holds LPEG patterns under the keys `ws`, `comment`,
  -- `number`, `str`, `hexstr`, `id`, `boolean` and `ref`; strings, hex strings
  -- and identifiers are captured in their unescaped form.
  local function generic_grammar_elts()
    local P = lpeg.P
    local R = lpeg.R
    local S = lpeg.S
    local V = lpeg.V
    local C = lpeg.C
    local D = R'09' -- Digits

    local grammar_elts = {}

    -- Helper functions

    -- Decodes the payload of a `<...>` hex string
    local function pdf_hexstring_unescape(s)
      if #s % 2 == 0 then
        -- Sane hex string
        return lua_util.unhex(s)
      end

      -- Odd number of digits: the last digit is treated as if it were
      -- followed by '0'
      return lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0'))
    end

    -- Named escapes of PDF literal strings; any other escaped character
    -- (including '(', ')' and '\') stands for itself.
    local simple_escapes = {
      n = '\n',
      r = '\r',
      t = '\t',
      b = '\b',
      f = '\f',
    }

    -- Decodes backslash escapes of a `(...)` literal string.
    -- Everything is done in a single pass so that the output of one escape is
    -- never interpreted as the start of another one (e.g. `\\101` must yield
    -- `\101`, not `A`).
    local function pdf_string_unescape(s)
      return (s:gsub('\\(.)([0-7]?[0-7]?)', function(first, rest)
        if first:match('[0-7]') then
          -- Octal escape of one to three digits; overflow above one byte is
          -- dropped instead of raising an error in string.char
          return string.char(tonumber(first .. rest, 8) % 256)
        end

        -- The digits captured after a non-octal escape are ordinary text
        return (simple_escapes[first] or first) .. rest
      end))
    end

    -- Decodes `#xx` escapes of a name; `xx` is a pair of hexadecimal digits
    local function pdf_id_unescape(s)
      return (s:gsub('#(%x%x)', function(hex_pair)
        return string.char(tonumber(hex_pair, 16))
      end))
    end

    local delim = S'()<>[]{}/%'
    grammar_elts.ws = S'\0 \r\n\t\f'
    local hex = R'af' + R'AF' + D
    -- Comments.
    local eol = P'\r\n' + '\n'
    local line = (1 - S'\r\n\f')^0 * eol^-1
    grammar_elts.comment = P'%' * line
    -- Numbers.
    local sign = S'+-'^-1
    local decimal = D^1
    local float = D^1 * P'.' * D^0 + P'.' * D^1
    grammar_elts.number = C(sign * (float + decimal)) / tonumber
    -- String
    grammar_elts.str = P{ "(" * C(((1 - S"()\\") + (P '\\' * 1) + V(1))^0) / pdf_string_unescape * ")" }
    grammar_elts.hexstr = P{"<" * C(hex^0) / pdf_hexstring_unescape * ">"}
    -- Identifier
    grammar_elts.id = P{'/' * C((1-(delim + grammar_elts.ws))^1) / pdf_id_unescape}
    -- Booleans (who care about them?)
    grammar_elts.boolean = C(P("true") + P("false"))
    -- Indirect references are captured as {'%REF%', major, minor}
    grammar_elts.ref = lpeg.Ct{lpeg.Cc("%REF%") * C(D^1) * " " * C(D^1) * " " * "R"}

    return grammar_elts
  end
  -- Builds the LPEG grammar for outer elements (external objects in PDF
  -- notation): whitespace-separated atoms, arrays (captured as tables) and
  -- dictionaries (folded into tables with rawset).
  local function gen_outer_grammar()
    local V, Ct, Cf, Cg = lpeg.V, lpeg.Ct, lpeg.Cf, lpeg.Cg
    local elts = generic_grammar_elts()
    local ws0 = elts.ws^0

    local rules = {
      "EXPR",
      EXPR = ws0 * V("ELT")^0 * ws0,
      ELT = V("ARRAY") + V("DICT") + V("ATOM"),
      ATOM = ws0 * (elts.comment + elts.boolean + elts.ref +
          elts.number + V("STRING") + elts.id) * ws0,
      DICT = "<<" * ws0 * Cf(Ct("") * V("KV_PAIR")^0, rawset) * ws0 * ">>",
      KV_PAIR = Cg(elts.id * ws0 * V("ELT") * ws0),
      ARRAY = "[" * ws0 * Ct(V("ELT")^0) * ws0 * "]",
      STRING = lpeg.P{elts.str + elts.hexstr},
    }

    return lpeg.P(rules)
  end
  -- Graphic state in PDF
  -- Pattern for graphics operators that take no operands
  local function gen_graphics_unary()
    local P = lpeg.P
    local S = lpeg.S

    local save_restore_close = S("qQh")
    local paint_or_clip = S("WSsFfBb") * P("*")^0

    return save_restore_close + paint_or_clip + P("n")
  end
  -- Pattern for graphics operators that take a single operand.
  -- LPEG choice is ordered and never reconsidered once an alternative has
  -- matched, so the two-letter operators must be tried before the
  -- single-letter set: with the set first, `gs` was consumed as `g` and the
  -- trailing `s` was left behind.
  local function gen_graphics_binary()
    local P = lpeg.P
    local S = lpeg.S

    return P("ri") + P("gs") +
        P("CS") + P("cs") + P("sh") +
        S("gGwJjMi")
  end
  -- Pattern for graphics operators that take two operands (`d`, `m`, `l`)
  local function gen_graphics_ternary()
    -- A set of single characters matches exactly what the ordered choice of
    -- the same one-letter literals does
    return lpeg.S("dml")
  end
  -- Pattern for graphics operators that take a variable number of operands.
  -- LPEG choice is ordered, so `SCN`/`scn` have to be listed before their
  -- prefixes `SC`/`sc`; otherwise the shorter operator matches first and the
  -- trailing `N`/`n` breaks the parse of the rest of the block.
  local function gen_graphics_nary()
    local P = lpeg.P
    local S = lpeg.S

    return P("SCN") + P("scn") + P("SC") + P("sc") +
        P("k") + P("K") + P("re") + S("cvy") +
        P("RG") + P("rg")
  end
  -- Generates a grammar to parse text blocks (between BT and ET).
  -- The grammar produces one table of captures per block: text operands are
  -- captured as tables of strings, font selections as {'%font%', name} and
  -- `T*` as a newline.
  local function gen_text_grammar()
    local V, P, C, Ct, Cf, Cg = lpeg.V, lpeg.P, lpeg.C, lpeg.Ct, lpeg.Cf, lpeg.Cg
    local elts = generic_grammar_elts()
    local blank = ""
    local ws0 = elts.ws^0
    local ws1 = elts.ws^1

    -- Operators grouped by the number of operands they consume
    local op_unary = C("T*") / "\n" +
        C(gen_graphics_unary()) / blank
    local op_binary = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") +
        gen_graphics_binary()
    local op_ternary = P("TD") + P("Td") + gen_graphics_ternary()
    local op_nary = P("Tm") + gen_graphics_nary()
    local op_show_text = P("Tj") + P("TJ") + P("'")
    local op_show_text_quote = P('"')
    local op_font = P("Tf")

    local rules = {
      "EXPR",
      EXPR = ws0 * Ct(V("COMMAND")^0),
      COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
          V("FONT") + elts.comment) * ws0,
      UNARY = op_unary,
      BINARY = V("ARG") / blank * ws1 * op_binary,
      TERNARY = V("ARG") / blank * ws1 * V("ARG") / blank * ws1 * op_ternary,
      NARY = (elts.number / 0 * ws1)^1 * (elts.id / blank * ws0)^-1 * op_nary,
      ARG = V("ARRAY") + V("DICT") + V("ATOM"),
      ATOM = (elts.comment + elts.boolean + elts.ref +
          elts.number + V("STRING") + elts.id),
      DICT = "<<" * ws0 * Cf(Ct("") * V("KV_PAIR")^0, rawset) * ws0 * ">>",
      KV_PAIR = Cg(elts.id * ws0 * V("ARG") * ws0),
      ARRAY = "[" * ws0 * Ct(V("ARG")^0) * ws0 * "]",
      STRING = lpeg.P{elts.str + elts.hexstr},
      TEXT = (V("TEXT_ARG") * ws1 * op_show_text) +
          (V("ARG") / 0 * ws1 * V("ARG") / 0 * ws1 * V("TEXT_ARG") * ws1 * op_show_text_quote),
      FONT = (V("FONT_ARG") * ws1 * (elts.number / 0) * ws1 * op_font),
      FONT_ARG = Ct(lpeg.Cc("%font%") * elts.id),
      TEXT_ARG = Ct(V("STRING")) + V("TEXT_ARRAY"),
      TEXT_ARRAY = "[" *
          Ct(((ws0 * (ws0 * (elts.number / 0)^0 * ws0 * (elts.str + elts.hexstr)))^1)) * ws0 * "]",
    }

    return lpeg.P(rules)
  end
  -- Module initialisation, executed as soon as the file is required:
  -- compile the tries, apply user configuration and build both grammars.
  compile_tries()
  config_module()
  pdf_outer_grammar, pdf_text_grammar = gen_outer_grammar(), gen_text_grammar()
  -- Placeholder for text extraction: the feature is not yet implemented, so
  -- the argument is ignored and nil is always returned.
  local function extract_text_data(specific)
    local not_yet_implemented = nil -- NYI
    return not_yet_implemented
  end
  -- Generates the lookup key for a major/minor object pair: the major number
  -- occupies the integer part (times ten) and the minor number is folded
  -- into the remainder as 1 / (minor + 1).
  local function obj_ref(major, minor)
    local minor_part = 1.0 / (minor + 1.0)
    return major * 10.0 + minor_part
  end
  -- Resolves an indirect reference (a table of the form {'%REF%', major, minor})
  -- through `pdf.ref`. Any other value is returned as is; an unresolvable
  -- reference yields nil. Only a single level is resolved.
  local function maybe_dereference_object(elt, pdf, task)
    local is_reference = type(elt) == 'table' and elt[1] == '%REF%'
    if not is_reference then
      return elt
    end

    local key = obj_ref(elt[2], elt[3])
    local target = pdf.ref[key]
    if target then
      -- No recursion!
      return target
    end

    lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s, no object',
        elt[2], elt[3], obj_ref(elt[2], elt[3]))
    return nil
  end
  -- Applies a single PDF stream filter to `input`.
  -- Only 'FlateDecode' is supported (inflated with the configured size
  -- limit); every other filter name yields nil.
  local function apply_pdf_filter(input, filt)
    if filt ~= 'FlateDecode' then
      return nil
    end

    return rspamd_util.inflate(input, config.max_extraction_size)
  end
  -- Runs `data` through the filter pipeline named by `dict.Filter` and returns
  -- `result, err`.
  -- Without a filter the data is returned untouched. Streams whose decode
  -- parameters contain a Predictor are rejected with nil, 'predictor exists'.
  -- When any filter in the chain fails the result is nil (with a nil error).
  local function maybe_apply_filter(dict, data, pdf, task)
    local filters = dict.Filter
    if not filters then
      return data, nil
    end

    if type(filters) == 'string' then
      -- A single filter name is treated as a chain of one
      filters = {filters}
    end

    local params = dict.DecodeParms and
        maybe_dereference_object(dict.DecodeParms, pdf, task)
    if type(params) == 'table' and params.Predictor then
      return nil, 'predictor exists'
    end

    local result = data
    for _, name in ipairs(filters) do
      result = apply_pdf_filter(result, name)
      if not result then
        break
      end
    end

    return result, nil
  end
  -- Conditionally extracts stream data from an object and attaches it as
  -- obj.uncompressed.
  -- Returns the extracted data, or nil when the document is encrypted, the
  -- object has no dictionary, no /Length or no stream, or when filtering fails.
  local function maybe_extract_object_stream(obj, pdf, task)
    if pdf.encrypted then
      -- TODO add decryption some day
      return nil
    end

    local dict = obj.dict
    if type(dict) ~= 'table' then
      -- Callers may hand over any dereferenced table (e.g. an inline array
      -- found in /JS) that carries no `dict` field; treat it as "no stream"
      -- instead of failing on a nil index.
      return nil
    end

    if dict.Length and type(obj.stream) == 'table' then
      -- Never read past the stream boundaries that were actually found
      local len = math.min(obj.stream.len,
          tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
      local real_stream = obj.stream.data:span(1, len)

      local uncompressed,filter_err = maybe_apply_filter(dict, real_stream, pdf, task)

      if uncompressed then
        obj.uncompressed = uncompressed
        lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
            obj.major, obj.minor, len, uncompressed:len())
        return obj.uncompressed
      else
        lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s',
            obj.major, obj.minor, len, dict.Filter, filter_err)
      end
    end

    return nil
  end
  -- Parses the dictionary part of an object with the outer grammar.
  -- For stream objects only obj.dict is set (an empty table when the parse
  -- result is not a table). For direct objects the result is also stored in
  -- obj.uncompressed and registered in pdf.ref: the object itself when the
  -- result is a table, the raw parsed value otherwise.
  -- Spans of config.max_processing_size bytes or more are not parsed.
  local function parse_object_grammar(obj, task, pdf)
    local span = obj.data
    if obj.stream then
      -- Only the part that precedes the stream holds the dictionary
      span = obj.data:span(1, obj.stream.start - obj.start)
    end

    if span:len() >= config.max_processing_size then
      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s',
          obj.major, obj.minor, span:len())
      return
    end

    local ok, parsed = pcall(pdf_outer_grammar.match, pdf_outer_grammar, span)
    if not ok then
      lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
          obj.major, obj.minor, parsed)
      return
    end

    local got_table = type(parsed) == 'table'

    if obj.stream then
      obj.dict = got_table and parsed or {}
      lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s',
          obj.major, obj.minor, parsed)
      return
    end

    -- Direct object
    if got_table then
      obj.dict = parsed
      obj.uncompressed = parsed
      lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s',
          obj.major, obj.minor, parsed)
      pdf.ref[obj_ref(obj.major, obj.minor)] = obj
    else
      lua_util.debugm(N, task, 'direct object %s:%s is parsed to raw data: %s',
          obj.major, obj.minor, parsed)
      pdf.ref[obj_ref(obj.major, obj.minor)] = parsed
      obj.dict = {}
      obj.uncompressed = parsed
    end
  end
  425. -- Extracts font data and process /ToUnicode mappings
  426. -- NYI in fact as cmap is ridiculously stupid and complicated
  427. --[[
  428. local function process_font(task, pdf, font, fname)
  429. local dict = font
  430. if font.dict then
  431. dict = font.dict
  432. end
  433. if type(dict) == 'table' and dict.ToUnicode then
  434. local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task)
  435. if cmap and cmap.dict then
  436. maybe_extract_object_stream(cmap, pdf, task)
  437. lua_util.debugm(N, task, 'found cmap for font %s: %s',
  438. fname, cmap.uncompressed)
  439. end
  440. end
  441. end
  442. --]]
  -- Forward declaration
  local process_dict

  -- Normalises a javascript payload to a single line, hashes it and registers
  -- it in pdf.scripts (keyed by the binary hash).
  -- Returns the script record {data, hash, bin_hash, object}; for a duplicate
  -- the record registered earlier is returned. Payloads that are neither a
  -- string nor userdata yield nil.
  local function process_javascript(task, pdf, js, obj)
    local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash"

    local js_type = type(js)
    if js_type == 'string' then
      js = rspamd_text.fromstring(js):oneline()
    elseif js_type == 'userdata' then
      js = js:oneline()
    else
      return nil
    end

    local hasher = rspamd_cryptobox_hash.create(js)
    local digest = hasher:bin()

    if not pdf.scripts then
      pdf.scripts = {}
    end

    local known = pdf.scripts[digest]
    if known then
      -- Duplicate
      return known
    end

    local script = {
      data = js,
      hash = hasher:hex(),
      bin_hash = digest,
      object = obj,
    }
    pdf.scripts[digest] = script

    return script
  end
  -- Extract interesting stuff from /Action, e.g. javascript or a launch target.
  -- Sets obj.js (a record produced by process_javascript) when the action
  -- carries usable /JS, otherwise obj.launch when it carries /F.
  -- Objects that already have `js` or `launch`, or that have no dictionary,
  -- are left untouched.
  local function process_action(task, pdf, obj)
    if obj.js or obj.launch then
      -- Already processed
      return
    end

    local dict = obj.dict
    if not dict then
      return
    end

    local js = dict.JS and maybe_dereference_object(dict.JS, pdf, task)

    if js then
      if type(js) == 'table' then
        local extracted_js = maybe_extract_object_stream(js, pdf, task)

        if not extracted_js then
          lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
              obj.major, obj.minor, js)
        else
          js = extracted_js
        end
      end

      js = process_javascript(task, pdf, js, obj)

      if js then
        obj.js = js
        lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
            obj.major, obj.minor, obj.js.data)
      else
        lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
            obj.major, obj.minor, js)
      end
    elseif dict.F then
      -- Launch actions carry /F and normally no /JS at all, so this branch
      -- must not depend on /JS being present.
      local launch = maybe_dereference_object(dict.F, pdf, task)

      if launch then
        if type(launch) == 'string' then
          obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c')
          lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
              obj.major, obj.minor, obj.launch)
        elseif type(launch) == 'userdata' then
          obj.launch = launch:exclude_chars('%n%c')
          lua_util.debugm(N, task, 'extracted launch from %s:%s: %s',
              obj.major, obj.minor, obj.launch)
        else
          lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s',
              obj.major, obj.minor, launch)
        end
      end
    else
      lua_util.debugm(N, task, 'no JS attribute in action %s:%s',
          obj.major, obj.minor)
    end
  end
  -- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction.
  -- The action is processed through process_dict first; its javascript ends
  -- up in pdf.openaction (and is re-attributed to the catalog object), its
  -- launch target in pdf.launch.
  local function process_catalog(task, pdf, obj)
    local dict = obj.dict
    if not dict then
      return
    end

    local open_action = dict.OpenAction
    if not open_action then
      lua_util.debugm(N, task, 'no openaction in catalog %s:%s',
          obj.major, obj.minor)
      return
    end

    local action = maybe_dereference_object(open_action, pdf, task)
    if not (action and type(action) == 'table') then
      lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s',
          obj.major, obj.minor, open_action, action)
      return
    end

    -- This also processes action js (if not already processed)
    process_dict(task, pdf, action, action.dict)

    if action.js then
      lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s',
          obj.major, obj.minor, action.js)
      pdf.openaction = action.js
      action.js.object = obj
    elseif action.launch then
      lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s',
          obj.major, obj.minor, action.launch)
      pdf.launch = action.launch
    else
      lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s',
          obj.major, obj.minor, action)
    end
  end
  -- Inspects an XRef stream dictionary: the presence of /Encrypt marks the
  -- whole document as encrypted (pdf.encrypted = true).
  local function process_xref(task, pdf, obj)
    local dict = obj.dict
    if not (dict and dict.Encrypt) then
      return
    end

    local encrypt = maybe_dereference_object(dict.Encrypt, pdf, task)
    lua_util.debugm(N, task, 'found encrypt: %s in xref object %s:%s',
        encrypt, obj.major, obj.minor)
    pdf.encrypted = true
  end
  -- Processes the dictionary `dict` of object `obj`: detects the object type,
  -- links content objects, resolves (or inherits) resources and dispatches to
  -- the type specific handlers (Action, Catalog, XRef, Javascript).
  -- Does nothing when the object already has a type or `dict` is not a table.
  -- All lookups go through the `dict` argument (the one that has been
  -- validated) rather than through obj.dict.
  process_dict = function(task, pdf, obj, dict)
    if obj.type or type(dict) ~= 'table' then
      -- Already processed dict (obj.type is not empty) or nothing to process
      return
    end

    if dict.Type and type(dict.Type) == 'string' then
      -- Common stuff
      obj.type = dict.Type
    end

    if not obj.type then
      if dict.S and dict.JS then
        obj.type = 'Javascript'
        lua_util.debugm(N, task, 'implicit type for JavaScript object %s:%s',
            obj.major, obj.minor)
      else
        lua_util.debugm(N, task, 'no type for %s:%s',
            obj.major, obj.minor)
        return
      end
    end

    lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s',
        obj.major, obj.minor, obj.type)

    local contents = dict.Contents
    if contents and type(contents) == 'table' then
      if contents[1] == '%REF%' then
        -- Single reference
        contents = {contents}
      end
      obj.contents = {}

      for _,c in ipairs(contents) do
        local cobj = maybe_dereference_object(c, pdf, task)
        if cobj and type(cobj) == 'table' then
          obj.contents[#obj.contents + 1] = cobj
          cobj.parent = obj
          cobj.type = 'content'
        end
      end

      lua_util.debugm(N, task, 'found content objects for %s:%s -> %s',
          obj.major, obj.minor, #obj.contents)
    end

    local resources = dict.Resources
    if resources and type(resources) == 'table' then
      local res_ref = maybe_dereference_object(resources, pdf, task)

      if type(res_ref) ~= 'table' then
        lua_util.debugm(N, task, 'cannot parse resources from pdf: %s',
            resources)
        obj.resources = {}
      elseif res_ref.dict then
        obj.resources = res_ref.dict
      else
        obj.resources = {}
      end
    else
      -- No own resources: inherit them from the parent (if it has any)
      resources = {}
      if dict.Parent then
        local parent = maybe_dereference_object(dict.Parent, pdf, task)

        if parent and type(parent) == 'table' and parent.dict then
          if parent.resources then
            lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s',
                parent.major, parent.minor, obj.major, obj.minor)
            resources = parent.resources
          end
        end
      end

      obj.resources = resources
    end

    --[[Disabled fonts extraction
    local fonts = obj.resources.Font
    if fonts and type(fonts) == 'table' then
      obj.fonts = {}
      for k,v in pairs(fonts) do
        obj.fonts[k] = maybe_dereference_object(v, pdf, task)

        if obj.fonts[k] then
          local font = obj.fonts[k]

          if config.text_extraction then
            process_font(task, pdf, font, k)
            lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
                k, obj.major, obj.minor, font)
          end
        end
      end
    end
    ]]

    lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
        obj.major, obj.minor, obj.type, obj.resources)

    if obj.type == 'Action' then
      process_action(task, pdf, obj)
    elseif obj.type == 'Catalog' then
      process_catalog(task, pdf, obj)
    elseif obj.type == 'XRef' then
      -- XRef stream instead of trailer from PDF 1.5 (thanks Adobe)
      process_xref(task, pdf, obj)
    elseif obj.type == 'Javascript' then
      local js = maybe_dereference_object(dict.JS, pdf, task)

      if js then
        if type(js) == 'table' then
          local extracted_js = maybe_extract_object_stream(js, pdf, task)

          if not extracted_js then
            lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
                obj.major, obj.minor, js)
          else
            js = extracted_js
          end
        end

        js = process_javascript(task, pdf, js, obj)

        if js then
          obj.js = js
          lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s',
              obj.major, obj.minor, obj.js.data)
        else
          lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s',
              obj.major, obj.minor, js)
        end
      end
    end
  end
  -- Grammar used to unpack objects from an ObjStm structure: after optional
  -- whitespace and comments it captures a list of {number, number} pairs.
  -- It is built on first use and cached in `compound_obj_grammar`.
  local compound_obj_grammar
  local function compound_obj_grammar_gen()
    if compound_obj_grammar then
      return compound_obj_grammar
    end

    local elts = generic_grammar_elts()
    local ws = elts.ws
    local preamble = ws^0 * (elts.comment * ws^1)^0
    local number_pair = lpeg.Ct(elts.number * ws^1 * elts.number * ws^0)

    compound_obj_grammar = preamble * lpeg.Ct(number_pair^1)

    return compound_obj_grammar
  end
  -- Splits the uncompressed payload of an ObjStm into individual objects.
  -- The payload starts with pairs of numbers (object number, offset); offsets
  -- are shifted by `first`. Every object that lies inside the payload is
  -- parsed and, when a dictionary was obtained, appended to pdf.objects.
  local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
    compound_obj_grammar_gen()
    local pairs_list = compound_obj_grammar:match(uncompressed)

    if not (pairs_list and #pairs_list > 0) then
      return
    end

    local total_len = #uncompressed
    lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
        total_len, pairs_list)

    for idx, pair in ipairs(pairs_list) do
      local obj_number = pair[1]
      local offset = pair[2] + first

      if offset < total_len then
        -- An object extends up to the next object, the last one up to the
        -- end of the payload
        local span_len
        if idx == #pairs_list then
          span_len = total_len - offset
        else
          span_len = (pairs_list[idx + 1][2] + first) - offset
        end

        if span_len > 0 and offset + span_len <= total_len then
          local unpacked = {
            major = obj_number,
            minor = 0, -- Implicit
            data = uncompressed:span(offset + 1, span_len),
            ref = obj_ref(obj_number, 0)
          }
          parse_object_grammar(unpacked, task, pdf)

          if unpacked.dict then
            pdf.objects[#pdf.objects + 1] = unpacked
          end
        else
          lua_util.debugm(N, task, 'invalid span_len for compound object %s:%s; offset = %s, len = %s',
              pair[1], pair[2], offset + span_len, total_len)
        end
      end
    end
  end
  -- PDF 1.5 ObjStmt
  -- Walks pdf.objects and unpacks every stream object of type ObjStm.
  -- The processing deadline (pdf.end_timestamp) is checked on every 100th
  -- object; when it has passed, the time spent is stored in
  -- pdf.timeout_processing and the walk stops.
  local function extract_pdf_compound_objects(task, pdf)
    for idx, obj in ipairs(pdf.objects or {}) do
      if idx % 100 == 0 then
        local now = rspamd_util.get_ticks()

        if now >= pdf.end_timestamp then
          pdf.timeout_processing = now - pdf.start_timestamp

          lua_util.debugm(N, task, 'pdf: timeout processing compound objects after spending %s seconds, ' ..
              '%s elements processed',
              pdf.timeout_processing, idx)
          break
        end
      end

      local is_objstm = obj.stream and obj.dict and type(obj.dict) == 'table' and
          obj.dict.Type == 'ObjStm'

      if is_objstm then
        -- We are in troubles sir...
        local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task))
        local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task))

        if not (nobjs and first) then
          lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s',
              obj.major, obj.minor, obj.dict)
        else
          --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task)
          lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend',
              nobjs, first, obj.dict.Extends)

          local payload = maybe_extract_object_stream(obj, pdf, task)

          if payload then
            pdf_compound_object_unpack(obj, payload, pdf, task, first)
          end
        end
      end
    end
  end
  -- Pairs up object start and end offsets and builds the initial set of objects.
  --
  -- @param task task handle, used for debug logging only
  -- @param input PDF content; `span(pos, len)` is used to cut pieces out of it
  -- @param pdf shared state: `start_objects`/`end_objects` are read, `objects`
  --   and `ref` are filled in
  local function extract_outer_objects(task, input, pdf)
    local start_pos, end_pos = 1, 1
    -- Number of objects stored in `pdf.objects` so far
    local obj_count = 0
    -- Never look at more than `config.max_pdf_objects` offsets of either kind
    local max_start_pos = math.min(config.max_pdf_objects, #pdf.start_objects)
    local max_end_pos = math.min(config.max_pdf_objects, #pdf.end_objects)

    lua_util.debugm(N, task, "pdf: extract objects from %s start positions and %s end positions",
        max_start_pos, max_end_pos)

    while start_pos <= max_start_pos and end_pos <= max_end_pos do
      local first = pdf.start_objects[start_pos]
      local last = pdf.end_objects[end_pos]

      -- 6 bytes are cut from the end offset; presumably this is the `endobj` marker
      if first + 6 < last then
        local len = last - first - 6

        -- Also get the starting span and try to match it versus obj re to get numbers
        local obj_line_potential = math.max(first - 32, 1)
        local prev_obj_end = pdf.end_objects[end_pos - 1]
        if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
          -- Do not look into the previous object when it ends within the window
          obj_line_potential = prev_obj_end + 1
        end

        local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1)
        local matches = object_re:search(obj_line_span, true, true)

        if matches and matches[1] then
          local nobj = {
            start = first,
            len = len,
            data = input:span(first, len),
            major = tonumber(matches[1][2]),
            minor = tonumber(matches[1][3]),
          }
          -- Append without gaps: the slot index advances only for stored objects,
          -- otherwise `ipairs(pdf.objects)` would stop at the first hole left by
          -- an object whose header did not match
          obj_count = obj_count + 1
          pdf.objects[obj_count] = nobj

          if nobj.major and nobj.minor then
            -- Add reference
            local ref = obj_ref(nobj.major, nobj.minor)
            nobj.ref = ref -- Our internal reference
            pdf.ref[ref] = nobj
          end
        end

        start_pos = start_pos + 1
        end_pos = end_pos + 1
      elseif first > last then
        -- End offset precedes the current start: skip it
        end_pos = end_pos + 1
      else
        -- The pair is too close together to hold an object: drop both offsets
        start_pos = start_pos + 1
        end_pos = end_pos + 1
      end
    end
  end
  -- This function attaches streams to objects.
  -- Walks `pdf.objects` in order and pairs each object with the first
  -- `start_streams`/`end_streams` couple lying strictly inside the object span;
  -- the result is stored in `obj.stream` as `{start, len, data}`.
  -- Does nothing when either offsets list is missing.
  --
  -- @param task task handle, used for debug logging only
  -- @param input PDF content; `byte` and `span` are used on it
  -- @param pdf shared state holding `objects`, `start_streams` and `end_streams`
  local function attach_pdf_streams(task, input, pdf)
    if not (pdf.start_streams and pdf.end_streams) then
      return
    end

    -- Bytes cut from every end offset to exclude the `endstream\n` pattern
    local endstream_len = 10
    local start_pos, end_pos = 1, 1
    local max_start_pos = math.min(config.max_pdf_objects, #pdf.start_streams)
    local max_end_pos = math.min(config.max_pdf_objects, #pdf.end_streams)

    for _,obj in ipairs(pdf.objects) do
      local obj_end = obj.start + obj.len

      while start_pos <= max_start_pos and end_pos <= max_end_pos do
        local first = pdf.start_streams[start_pos]
        local last = pdf.end_streams[end_pos] - endstream_len
        lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s",
            first, last, obj.start, obj_end)

        if first > obj.start and last < obj_end and last > first then
          -- In case if we have fake endstream :(
          -- Take the last end marker that still belongs to this object and apply
          -- the same correction as above, so the marker itself never ends up in
          -- the stream data
          while pdf.end_streams[end_pos + 1] and pdf.end_streams[end_pos + 1] < obj_end do
            end_pos = end_pos + 1
            last = pdf.end_streams[end_pos] - endstream_len
          end

          -- Strip the leading CR/LF characters
          while first < last do
            local chr = input:byte(first)
            if chr ~= 13 and chr ~= 10 then break end
            first = first + 1
          end

          local len = last - first
          obj.stream = {
            start = first,
            len = len,
            data = input:span(first, len)
          }
          start_pos = start_pos + 1
          end_pos = end_pos + 1
          break
        elseif first < obj.start then
          -- Stream starts before this object: try the next start offset
          start_pos = start_pos + 1
        elseif last > obj_end then
          -- Not this object
          break
        else
          start_pos = start_pos + 1
          end_pos = end_pos + 1
        end
      end

      if obj.stream then
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length',
            obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len)
      else
        lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream',
            obj.major, obj.minor, obj.start, obj.len)
      end
    end
  end
  -- Builds and post-processes the object table of a PDF: extracts outer objects,
  -- attaches streams, parses object grammars (with early XRef handling), unpacks
  -- compound objects and finally processes the dictionaries.
  -- Stops early once the processing deadline is exceeded.
  local function postprocess_pdf_objects(task, input, pdf)
    -- Checks the deadline on every 100th element; on expiry records the time
    -- spent, logs `fmt` and returns true
    local function deadline_reached(idx, fmt)
      if idx % 100 == 0 then
        local now = rspamd_util.get_ticks()
        if now >= pdf.end_timestamp then
          pdf.timeout_processing = now - pdf.start_timestamp
          lua_util.debugm(N, task, fmt, pdf.timeout_processing, idx)
          return true
        end
      end
      return false
    end

    pdf.objects = {} -- objects table
    pdf.ref = {} -- references table

    extract_outer_objects(task, input, pdf)
    -- Now we have objects and we need to attach streams that are in bounds
    attach_pdf_streams(task, input, pdf)

    -- Parse grammar for outer objects
    for idx, obj in ipairs(pdf.objects) do
      if deadline_reached(idx,
          'pdf: timeout processing grammars after spending %s seconds, %s elements processed') then
        break
      end
      if obj.ref then
        parse_object_grammar(obj, task, pdf)
        -- Special early handling
        local dict = obj.dict
        if dict and dict.Type == 'XRef' then
          process_xref(task, pdf, obj)
        end
      end
    end

    if pdf.timeout_processing then
      -- ENOTIME
      return
    end
    extract_pdf_compound_objects(task, pdf)

    -- Now we might probably have all objects being processed
    for idx, obj in ipairs(pdf.objects) do
      if obj.dict then
        -- Types processing
        if deadline_reached(idx,
            'pdf: timeout processing dicts after spending %s seconds, %s elements processed') then
          break
        end
        process_dict(task, pdf, obj, obj.dict)
      end
    end
  end
  -- Pairs start offsets with end offsets and appends `{start, len}` blocks to `out`.
  -- End offsets lying before the current start are skipped; the scan stops as
  -- soon as a start equals the current end (input is not ordered properly).
  local function offsets_to_blocks(starts, ends, out)
    local si, ei = 1, 1

    while true do
      if si > #starts or ei > #ends then
        return
      end

      local from, to = starts[si], ends[ei]
      if from == to then
        -- Not ordered properly!
        return
      end

      if from < to then
        table.insert(out, { start = from, len = to - from })
        si = si + 1
      end
      -- The end offset is consumed both when a block is emitted and when it
      -- precedes the current start
      ei = ei + 1
    end
  end
  -- Extracts text for every `Page` object that has contents: text blocks are
  -- located in each content stream via `pdf_text_trie`, parsed with
  -- `pdf_text_grammar` and the joined result is stored in `obj.text`.
  local function search_text(task, pdf)
    -- Parses the text blocks of one content object and appends the results to `acc`
    local function collect_from_content(page, tobj, acc)
      maybe_extract_object_stream(tobj, pdf, task)
      local matches = pdf_text_trie:match(tobj.uncompressed or '')
      if not matches then
        return
      end

      -- Pattern 1 marks block starts, every other pattern marks block ends
      local starts, ends = {}, {}
      for npat, matched_positions in pairs(matches) do
        local target = ends
        if npat == 1 then
          target = starts
        end
        for _, pos in ipairs(matched_positions) do
          target[#target + 1] = pos
        end
      end

      local text_blocks = {}
      offsets_to_blocks(starts, ends, text_blocks)

      for _, bl in ipairs(text_blocks) do
        if bl.len > 2 then
          -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
          bl.len = bl.len - 2
        end
        bl.data = tobj.uncompressed:span(bl.start, bl.len)

        if bl.len < config.max_processing_size then
          local ok, parsed = pcall(pdf_text_grammar.match, pdf_text_grammar, bl.data)
          if ok then
            acc[#acc + 1] = parsed
            lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
                parsed, tobj.major, tobj.minor, page.major, page.minor)
          else
            lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s',
                page.major, page.minor, parsed)
          end
        end
      end
    end

    for _, page in ipairs(pdf.objects) do
      if page.type == 'Page' and page.contents then
        local pieces = {}
        for _, tobj in ipairs(page.contents) do
          collect_from_content(page, tobj, pieces)
        end

        -- Join all text data together
        if #pieces > 0 then
          page.text = rspamd_text.fromtable(pieces)
          lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
              page.major, page.minor, page.text)
        end
      end
    end
  end
  -- This function searches objects for the `URI` key and parses its content:
  -- every string value found is turned into an URL and injected into the task
  -- for the given mime part. Nested tables are followed up to 10 levels deep.
  local function search_urls(task, pdf, mpart)
    local function walk(owner, node, depth)
      if depth > 10 then
        lua_util.debugm(N, task, 'object %s:%s recurses too much',
            owner.major, owner.minor)
        return
      end

      for key, value in pairs(node) do
        if type(value) == 'table' then
          walk(owner, value, depth + 1)
        elseif key == 'URI' then
          local target = maybe_dereference_object(value, pdf, task)
          if type(target) == 'string' then
            local url = rspamd_url.create(task:get_mempool(), target, {'content'})
            if url then
              lua_util.debugm(N, task, 'found url %s in object %s:%s',
                  target, owner.major, owner.minor)
              task:inject_url(url, mpart)
            end
          end
        end
      end
    end

    for _, candidate in ipairs(pdf.objects) do
      local dict = candidate.dict
      if dict and type(dict) == 'table' then
        walk(candidate, dict, 0)
      end
    end
  end
  -- Module entry point: scans `input` with `pdf_trie`, runs the processor of
  -- every matched pattern group, post-processes the objects and returns the
  -- output table. Returns an empty table when processing is disabled and
  -- nothing when the trie finds no matches.
  local function process_pdf(input, mpart, task)
    if not config.enabled then
      -- Skip processing
      return {}
    end

    local matches = pdf_trie:match(input)
    if not matches then
      return
    end

    local started = rspamd_util.get_ticks()
    -- Temp object used to share data between pdf extraction methods
    local pdf_object = {
      tag = 'pdf',
      extract_text = extract_text_data,
      start_timestamp = started,
      end_timestamp = started + config.pdf_process_timeout,
    }
    -- Output object that excludes all internal stuff
    local pdf_output = lua_util.shallowcopy(pdf_object)

    -- Group the matched offsets by the processor responsible for them
    local groups = {}
    for npat, hits in pairs(matches) do
      local index = pdf_indexes[npat]
      local proc_key, loc_npat = index[1], index[4]

      local group = groups[proc_key]
      if not group then
        group = {
          processor_func = processors[proc_key],
          offsets = {},
        }
        groups[proc_key] = group
      end

      -- Fill offsets
      local offsets = group.offsets
      for _, pos in ipairs(hits) do
        offsets[#offsets + 1] = {pos, loc_npat}
      end
    end

    local function by_offset(lhs, rhs)
      return lhs[1] < rhs[1]
    end

    for name, group in pairs(groups) do
      -- Sort by offset
      lua_util.debugm(N, task, "pdf: process group %s with %s matches",
          name, #group.offsets)
      table.sort(group.offsets, by_offset)
      group.processor_func(input, task, group.offsets, pdf_object, pdf_output)
    end

    pdf_output.flags = {}

    if not (pdf_object.start_objects and pdf_object.end_objects) then
      pdf_output.flags.no_objects = true
    else
      if #pdf_object.start_objects > config.max_pdf_objects then
        pdf_output.many_objects = #pdf_object.start_objects
        -- Trim
      end

      -- Postprocess objects
      postprocess_pdf_objects(task, input, pdf_object)
      if config.text_extraction then
        search_text(task, pdf_object, pdf_output)
      end
      if config.url_extraction then
        search_urls(task, pdf_object, mpart, pdf_output)
      end

      if config.js_fuzzy and pdf_object.scripts then
        local hashes = {}
        pdf_output.fuzzy_hashes = hashes

        if config.openaction_fuzzy_only then
          -- OpenAction only
          local action = pdf_object.openaction
          if action and action.bin_hash then
            if config.min_js_fuzzy and #action.data >= config.min_js_fuzzy then
              lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s; size = %s; object: %s:%s",
                  action.hash, #action.data,
                  action.object.major, action.object.minor)
              table.insert(hashes, action.bin_hash)
            else
              lua_util.debugm(N, task, "pdf: skip fuzzy hash from JavaScript: %s, too short: %s",
                  action.hash, #action.data)
            end
          end
        else
          -- All hashes
          for bin_hash, script in pairs(pdf_object.scripts) do
            if config.min_js_fuzzy and #script.data >= config.min_js_fuzzy then
              lua_util.debugm(N, task, "pdf: add fuzzy hash from JavaScript: %s; size = %s; object: %s:%s",
                  script.hash, #script.data,
                  script.object.major, script.object.minor)
              table.insert(hashes, bin_hash)
            else
              lua_util.debugm(N, task, "pdf: skip fuzzy hash from JavaScript: %s, too short: %s",
                  script.hash, #script.data)
            end
          end
        end
      end
    end

    -- Propagate from object to output
    if pdf_object.encrypted then
      pdf_output.encrypted = true
    end
    if pdf_object.scripts then
      pdf_output.scripts = true
    end

    return pdf_output
  end
  -- Processes the PDF trailer: scans the lines following the last trailer match
  -- for an `/Encrypt ` entry and flags trailers that are too long.
  --
  -- @param input PDF content
  -- @param task task handle, used for debug logging only
  -- @param positions sorted list of `{offset, pattern}` pairs; only the last one is used
  -- @param pdf_object shared internal state, `encrypted` is set here
  -- @param pdf_output output table, `encrypted` and `long_trailer` are set here
  processors.trailer = function(input, task, positions, pdf_object, pdf_output)
    local last_pos = positions[#positions]
    if not last_pos then
      -- No trailer offsets at all, nothing to inspect
      return
    end

    local trailer_start = last_pos[1]
    -- Number of bytes between the trailer match and the end of input
    local trailer_len = #input - trailer_start

    lua_util.debugm(N, task, 'pdf: process trailer at position %s (%s total length)',
        trailer_start, #input)

    -- The limit applies to the trailer size (the value reported below), not to
    -- the absolute offset where the trailer happens to start
    if trailer_len > config.max_pdf_trailer then
      pdf_output.long_trailer = trailer_len
      return
    end

    local last_span = input:span(trailer_start)
    local lines_checked = 0
    for line in last_span:lines(true) do
      if line:find('/Encrypt ') then
        lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
            line)
        pdf_output.encrypted = true
        pdf_object.encrypted = true
        break
      end
      lines_checked = lines_checked + 1

      if lines_checked > config.max_pdf_trailer_lines then
        lua_util.debugm(N, task, "pdf: trailer has too many lines, stop checking")
        pdf_output.long_trailer = trailer_len
        break
      end
    end
  end
  -- Computes the `suspicious` factor (capped at 1.0) from the matched patterns:
  -- pattern 1 adds 0.5 per hit, pattern 2 hits are counted as exec, pattern 3
  -- hits are counted as encoded characters unless they look like a legal escape.
  processors.suspicious = function(input, task, positions, pdf_object, pdf_output)
    local factor = 0.0
    local exec_hits, encoded_hits, adjacent_hits = 0, 0, 0
    local prev_encoded_pos

    -- True when the number parsed by `strtoul` from the two characters preceding
    -- `pos` is below 0x21 or not below 0x7f
    local function is_legal_escape(pos)
      local enc_data = input:sub(pos - 2, pos - 1)
      if not enc_data then
        return false
      end
      enc_data = enc_data:strtoul()
      if not enc_data then
        return false
      end
      -- Legit encode cases are non printable characters (e.g. spaces)
      return enc_data < 0x21 or enc_data >= 0x7f
    end

    for _, match in ipairs(positions) do
      local pos, kind = match[1], match[2]

      if kind == 1 then
        -- netsh
        factor = factor + 0.5
      elseif kind == 2 then
        exec_hits = exec_hits + 1
      elseif kind == 3 then
        if not is_legal_escape(pos) then
          encoded_hits = encoded_hits + 1
          if prev_encoded_pos and pos - prev_encoded_pos < 8 then
            -- likely consecutive encoded chars, increase factor
            adjacent_hits = adjacent_hits + 1
          end
          prev_encoded_pos = pos
        end
      end
    end

    if encoded_hits > 10 then
      factor = factor + encoded_hits / 10
    end
    if exec_hits > 1 then
      factor = factor + exec_hits / 2.0
    end
    if adjacent_hits > 4 and encoded_hits - adjacent_hits < 5 then
      -- Too many close encoded comparing to the total number of encoded characters
      factor = factor + 0.5
    end

    -- The uncapped value is what gets logged
    lua_util.debugm(N, task,
        'pdf: found a suspicious patterns: %s exec, %s encoded (%s close), %s final factor',
        exec_hits, encoded_hits, adjacent_hits, factor)

    pdf_output.suspicious = math.min(factor, 1.0)
  end
  -- Appends the first element of every entry in `positions` to the list stored
  -- at `pdf_object[output_key]`, creating that list when it does not exist yet.
  local function generic_table_inserter(positions, pdf_object, output_key)
    local dest = pdf_object[output_key]
    if not dest then
      dest = {}
      pdf_object[output_key] = dest
    end

    -- New elements go right after the ones already present
    local base = #dest
    for idx, entry in ipairs(positions) do
      dest[base + idx] = entry[1]
    end
  end
  -- Offset collectors: each processor appends the matched offsets to the list
  -- stored under the corresponding key of the shared pdf object
  local offset_collectors = {
    start_object = 'start_objects',
    end_object = 'end_objects',
    start_stream = 'start_streams',
    end_stream = 'end_streams',
  }

  for proc_name, output_key in pairs(offset_collectors) do
    processors[proc_name] = function(_, task, positions, pdf_object)
      generic_table_inserter(positions, pdf_object, output_key)
    end
  end

  -- Public interface of the module
  exports.process = process_pdf

  return exports