From d9e1e67ad7ecd2dfa7ae65405126f8c22cb10f0f Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 13 Jan 2020 21:50:25 +0000 Subject: [Project] Lua_content: Add preliminary support of compound objects --- lualib/lua_content/pdf.lua | 123 +++++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 48 deletions(-) (limited to 'lualib/lua_content/pdf.lua') diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua index d3d4b9d85..cd0276e0c 100644 --- a/lualib/lua_content/pdf.lua +++ b/lualib/lua_content/pdf.lua @@ -479,6 +479,74 @@ local function process_dict(task, pdf, obj, dict) end end +local function apply_pdf_filter(input, filt) + if filt == 'FlateDecode' then + return rspamd_util.inflate(input, config.max_extraction_size) + end + + return nil +end + +local function maybe_apply_filter(dict, data) + local uncompressed = data + + if dict.Filter then + local filt = dict.Filter + if type(filt) == 'string' then + filt = {filt} + end + + for _,f in ipairs(filt) do + uncompressed = apply_pdf_filter(uncompressed, f) + + if not uncompressed then break end + end + end + + return uncompressed +end + +local function maybe_extract_object_stream(obj, pdf, task) + local dict = obj.dict + if dict.Filter and dict.Length then + local len = math.min(obj.stream.len, + tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0) + local real_stream = obj.stream.data:span(1, len) + + local uncompressed = maybe_apply_filter(dict, real_stream) + + if uncompressed then + obj.uncompressed = uncompressed + lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)', + obj.major, obj.minor, len, uncompressed:len()) + return obj.uncompressed + else + lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s', + obj.major, obj.minor, len, dict.Filter) + end + end +end + +-- This function is intended to unpack objects from ObjStm crappy structure +local compound_obj_grammar +local function compound_obj_grammar_gen() + if not compound_obj_grammar then + local gen = generic_grammar_elts() + compound_obj_grammar = gen.ws^0 * (gen.comment * gen.ws^1)^0 * + lpeg.Ct(lpeg.Ct(gen.number * gen.ws^1 * gen.number * gen.ws^0)^1) + end +end +local function pdf_compound_object_unpack(obj, uncompressed, pdf, task) + -- First, we need to parse data line by line likely to find a line + -- that consists of pairs of numbers + compound_obj_grammar_gen() + local elts = compound_obj_grammar:match(uncompressed) + if elts and #elts > 0 then + lua_util.debugm(N, task, 'compound elts: %s', + elts) + end +end + -- PDF 1.5 ObjStmt local function extract_pdf_compound_objects(task, pdf) for _,obj in ipairs(pdf.objects or {}) do @@ -492,7 +560,13 @@ local function extract_pdf_compound_objects(task, pdf) if nobjs and first then local extend = maybe_dereference_object(obj.dict.Extends, pdf, task) lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend', - nobjs, first, extend) + nobjs, first, obj.dict.Extends) + + local uncompressed = maybe_extract_object_stream(obj, pdf, task) + + if uncompressed then + pdf_compound_object_unpack(obj, uncompressed, pdf, task) + end else lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s', obj.major, obj.minor, obj.dict) @@ -672,53 +746,6 @@ local function postprocess_pdf_objects(task, input, pdf) end end -local function apply_pdf_filter(input, filt) - if filt == 'FlateDecode' then - return rspamd_util.inflate(input, config.max_extraction_size) - end - - return nil -end - -local function maybe_apply_filter(dict, data) - local uncompressed = data - - if dict.Filter then - local filt = dict.Filter - if type(filt) == 'string' then - filt = {filt} - end - - for _,f in ipairs(filt) do - uncompressed = apply_pdf_filter(uncompressed, f) - - if not uncompressed then break end - end - end - - return uncompressed -end - -local function maybe_extract_object_stream(obj, pdf, task) - local dict = obj.dict - if dict.Filter and dict.Length then - local len = math.min(obj.stream.len, - tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0) - local real_stream = obj.stream.data:span(1, len) - - local uncompressed = maybe_apply_filter(dict, real_stream) - - if uncompressed then - obj.uncompressed = uncompressed - lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)', - obj.major, obj.minor, len, uncompressed:len()) - else - lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s', - obj.major, obj.minor, len, dict.Filter) - end - end -end - local function offsets_to_blocks(starts, ends, out) local start_pos, end_pos = 1, 1 -- cgit v1.2.3