Diffstat (limited to 'lualib/lua_content/pdf.lua')
-rw-r--r--  lualib/lua_content/pdf.lua  197
1 file changed, 102 insertions(+), 95 deletions(-)
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 0677fab06..a8b6e984a 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -147,10 +147,10 @@ local function compile_tries()
rspamd_trie.flags.no_start)
local function compile_pats(patterns, indexes, compile_flags)
local strs = {}
- for what,data in pairs(patterns) do
- for i,pat in ipairs(data.patterns) do
+ for what, data in pairs(patterns) do
+ for i, pat in ipairs(data.patterns) do
strs[#strs + 1] = pat
- indexes[#indexes + 1] = {what, data, pat, i}
+ indexes[#indexes + 1] = { what, data, pat, i }
end
end
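
A minimal standalone sketch of the bookkeeping this hunk reformats: 'strs' feeds the trie in insertion order, while 'indexes' maps a match number back to its processor. The pattern names below are illustrative, not pdf.lua's real tables; rspamd_trie is Rspamd's own Lua binding (see the flags used above), so its call stays commented out.

local patterns = {
  javascript = { patterns = { '/JS', '/JavaScript' } }, -- illustrative only
  openaction = { patterns = { '/OpenAction' } },
}
local strs, indexes = {}, {}
for what, data in pairs(patterns) do
  for i, pat in ipairs(data.patterns) do
    strs[#strs + 1] = pat
    indexes[#indexes + 1] = { what, data, pat, i } -- {processor, data, pattern, local idx}
  end
end
-- local trie = rspamd_trie.create(strs, rspamd_trie.flags.re + rspamd_trie.flags.no_start)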
@@ -175,7 +175,7 @@ local function generic_grammar_elts()
local S = lpeg.S
local V = lpeg.V
local C = lpeg.C
- local D = R'09' -- Digits
+ local D = R '09' -- Digits
local grammar_elts = {}
@@ -214,37 +214,37 @@ local function generic_grammar_elts()
end
local function pdf_id_unescape(s)
- return (s:gsub('#%d%d', function (cc)
+ return (s:gsub('#%d%d', function(cc)
return string.char(tonumber(cc:sub(2), 16))
end))
end
- local delim = S'()<>[]{}/%'
- grammar_elts.ws = S'\0 \r\n\t\f'
- local hex = R'af' + R'AF' + D
+ local delim = S '()<>[]{}/%'
+ grammar_elts.ws = S '\0 \r\n\t\f'
+ local hex = R 'af' + R 'AF' + D
-- Comments.
- local eol = P'\r\n' + '\n'
- local line = (1 - S'\r\n\f')^0 * eol^-1
- grammar_elts.comment = P'%' * line
+ local eol = P '\r\n' + '\n'
+ local line = (1 - S '\r\n\f') ^ 0 * eol ^ -1
+ grammar_elts.comment = P '%' * line
-- Numbers.
- local sign = S'+-'^-1
- local decimal = D^1
- local float = D^1 * P'.' * D^0 + P'.' * D^1
+ local sign = S '+-' ^ -1
+ local decimal = D ^ 1
+ local float = D ^ 1 * P '.' * D ^ 0 + P '.' * D ^ 1
grammar_elts.number = C(sign * (float + decimal)) / tonumber
-- String
- grammar_elts.str = P{ "(" * C(((1 - S"()\\") + (P '\\' * 1) + V(1))^0) / pdf_string_unescape * ")" }
- grammar_elts.hexstr = P{"<" * C(hex^0) / pdf_hexstring_unescape * ">"}
+ grammar_elts.str = P { "(" * C(((1 - S "()\\") + (P '\\' * 1) + V(1)) ^ 0) / pdf_string_unescape * ")" }
+ grammar_elts.hexstr = P { "<" * C(hex ^ 0) / pdf_hexstring_unescape * ">" }
-- Identifier
- grammar_elts.id = P{'/' * C((1-(delim + grammar_elts.ws))^1) / pdf_id_unescape}
+ grammar_elts.id = P { '/' * C((1 - (delim + grammar_elts.ws)) ^ 1) / pdf_id_unescape }
-- Booleans (who care about them?)
grammar_elts.boolean = C(P("true") + P("false"))
-- Stupid references
- grammar_elts.ref = lpeg.Ct{lpeg.Cc("%REF%") * C(D^1) * " " * C(D^1) * " " * "R"}
+ grammar_elts.ref = lpeg.Ct { lpeg.Cc("%REF%") * C(D ^ 1) * " " * C(D ^ 1) * " " * "R" }
return grammar_elts
end
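
A note on what this entire commit changes: layout only. In Lua, R'09' is sugar for R('09'), so R 'af', S '+-' and pat ^ 0 versus pat^0 all compile to the very same LPeg patterns. A runnable check, assuming the lpeg rock is installed:

local lpeg = require 'lpeg'
local R, C = lpeg.R, lpeg.C
local D1 = R'09'   -- old spelling
local D2 = R '09'  -- reformatted spelling; same call, same pattern
assert(lpeg.match(C(D1^1), '42') == '42')
assert(lpeg.match(C(D2 ^ 1), '42') == '42')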
@@ -255,16 +255,16 @@ local function gen_outer_grammar()
local V = lpeg.V
local gen = generic_grammar_elts()
- return lpeg.P{
+ return lpeg.P {
"EXPR";
- EXPR = gen.ws^0 * V("ELT")^0 * gen.ws^0,
+ EXPR = gen.ws ^ 0 * V("ELT") ^ 0 * gen.ws ^ 0,
ELT = V("ARRAY") + V("DICT") + V("ATOM"),
- ATOM = gen.ws^0 * (gen.comment + gen.boolean + gen.ref +
- gen.number + V("STRING") + gen.id) * gen.ws^0,
- DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
- KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ELT") * gen.ws^0),
- ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ELT")^0) * gen.ws^0 * "]",
- STRING = lpeg.P{gen.str + gen.hexstr},
+ ATOM = gen.ws ^ 0 * (gen.comment + gen.boolean + gen.ref +
+ gen.number + V("STRING") + gen.id) * gen.ws ^ 0,
+ DICT = "<<" * gen.ws ^ 0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR") ^ 0, rawset) * gen.ws ^ 0 * ">>",
+ KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ELT") * gen.ws ^ 0),
+ ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ELT") ^ 0) * gen.ws ^ 0 * "]",
+ STRING = lpeg.P { gen.str + gen.hexstr },
}
end
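
The DICT rule above is the classic LPeg fold idiom: Ct('') opens an empty table, each Cg(...) yields one key/value pair, and Cf folds the pairs in with rawset. A self-contained toy (LPeg 1.0.x; lpeg.Cf was dropped in 1.1), not the module's actual grammar:

local lpeg = require 'lpeg'
local P, C, S = lpeg.P, lpeg.C, lpeg.S
local ws = S(' \t') ^ 0
local key = P '/' * C((1 - S ' /') ^ 1) * ws         -- /Name
local val = C((1 - S ' /') ^ 1) * ws                 -- bare token, toy only
local kv = lpeg.Cg(key * val)                        -- one key/value capture group
local dict = lpeg.Cf(lpeg.Ct('') * kv ^ 0, rawset)   -- fold pairs into a table
local t = dict:match('/Type Page /Count 2')
assert(t.Type == 'Page' and t.Count == '2')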
@@ -274,7 +274,7 @@ local function gen_graphics_unary()
local S = lpeg.S
return P("q") + P("Q") + P("h")
- + S("WSsFfBb") * P("*")^0 + P("n")
+ + S("WSsFfBb") * P("*") ^ 0 + P("n")
end
local function gen_graphics_binary()
@@ -317,29 +317,29 @@ local function gen_text_grammar()
local text_quote_op = P('"')
local font_op = P("Tf")
- return lpeg.P{
+ return lpeg.P {
"EXPR";
- EXPR = gen.ws^0 * lpeg.Ct(V("COMMAND")^0),
+ EXPR = gen.ws ^ 0 * lpeg.Ct(V("COMMAND") ^ 0),
COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") +
- V("FONT") + gen.comment) * gen.ws^0,
+ V("FONT") + gen.comment) * gen.ws ^ 0,
UNARY = unary_ops,
- BINARY = V("ARG") / empty * gen.ws^1 * binary_ops,
- TERNARY = V("ARG") / empty * gen.ws^1 * V("ARG") / empty * gen.ws^1 * ternary_ops,
- NARY = (gen.number / 0 * gen.ws^1)^1 * (gen.id / empty * gen.ws^0)^-1 * nary_op,
+ BINARY = V("ARG") / empty * gen.ws ^ 1 * binary_ops,
+ TERNARY = V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * ternary_ops,
+ NARY = (gen.number / 0 * gen.ws ^ 1) ^ 1 * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op,
ARG = V("ARRAY") + V("DICT") + V("ATOM"),
ATOM = (gen.comment + gen.boolean + gen.ref +
gen.number + V("STRING") + gen.id),
- DICT = "<<" * gen.ws^0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR")^0, rawset) * gen.ws^0 * ">>",
- KV_PAIR = lpeg.Cg(gen.id * gen.ws^0 * V("ARG") * gen.ws^0),
- ARRAY = "[" * gen.ws^0 * lpeg.Ct(V("ARG")^0) * gen.ws^0 * "]",
- STRING = lpeg.P{gen.str + gen.hexstr},
- TEXT = (V("TEXT_ARG") * gen.ws^1 * text_binary_op) +
- (V("ARG") / 0 * gen.ws^1 * V("ARG") / 0 * gen.ws^1 * V("TEXT_ARG") * gen.ws^1 * text_quote_op),
- FONT = (V("FONT_ARG") * gen.ws^1 * (gen.number / 0) * gen.ws^1 * font_op),
+ DICT = "<<" * gen.ws ^ 0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR") ^ 0, rawset) * gen.ws ^ 0 * ">>",
+ KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ARG") * gen.ws ^ 0),
+ ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ARG") ^ 0) * gen.ws ^ 0 * "]",
+ STRING = lpeg.P { gen.str + gen.hexstr },
+ TEXT = (V("TEXT_ARG") * gen.ws ^ 1 * text_binary_op) +
+ (V("ARG") / 0 * gen.ws ^ 1 * V("ARG") / 0 * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 1 * text_quote_op),
+ FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / 0) * gen.ws ^ 1 * font_op),
FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
TEXT_ARRAY = "[" *
- lpeg.Ct(((gen.ws^0 * (gen.ws^0 * (gen.number / 0)^0 * gen.ws^0 * (gen.str + gen.hexstr)))^1)) * gen.ws^0 * "]",
+ lpeg.Ct(((gen.ws ^ 0 * (gen.ws ^ 0 * (gen.number / 0) ^ 0 * gen.ws ^ 0 * (gen.str + gen.hexstr))) ^ 1)) * gen.ws ^ 0 * "]",
}
end
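
For orientation, the TEXT rule targets PDF text-showing operators such as Tj; a cut-down toy of that one case, not the module's grammar:

local lpeg = require 'lpeg'
local P, C, S = lpeg.P, lpeg.C, lpeg.S
local str = P '(' * C((1 - S '()') ^ 0) * P ')' -- naive (...) string, no escapes
local tj = str * S ' ' ^ 0 * P 'Tj'             -- string argument followed by Tj
assert(tj:match('(Hello world) Tj') == 'Hello world')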
@@ -393,7 +393,7 @@ local function maybe_apply_filter(dict, data, pdf, task)
if dict.Filter then
local filt = dict.Filter
if type(filt) == 'string' then
- filt = {filt}
+ filt = { filt }
end
if dict.DecodeParms then
@@ -401,19 +401,21 @@ local function maybe_apply_filter(dict, data, pdf, task)
if type(decode_params) == 'table' then
if decode_params.Predictor then
- return nil,'predictor exists'
+ return nil, 'predictor exists'
end
end
end
- for _,f in ipairs(filt) do
+ for _, f in ipairs(filt) do
uncompressed = apply_pdf_filter(uncompressed, f)
- if not uncompressed then break end
+ if not uncompressed then
+ break
+ end
end
end
- return uncompressed,nil
+ return uncompressed, nil
end
-- Conditionally extract stream data from object and attach it as obj.uncompressed
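
A hedged sketch of the chained /Filter application in the loop above; 'decoders' here is a hypothetical stand-in for apply_pdf_filter's dispatch, not Rspamd's real API:

local decoders = {
  FlateDecode = function(data) return data end, -- stand-in; the real path inflates zlib data
}
local function apply_all(filt, data)
  for _, f in ipairs(filt) do
    local dec = decoders[f]
    data = dec and dec(data) or nil
    if not data then
      break -- unknown filter or failed stage: give up, as above
    end
  end
  return data, nil
end
assert(apply_all({ 'FlateDecode' }, 'abc') == 'abc')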
@@ -428,7 +430,7 @@ local function maybe_extract_object_stream(obj, pdf, task)
tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
local real_stream = obj.stream.data:span(1, len)
- local uncompressed,filter_err = maybe_apply_filter(dict, real_stream, pdf, task)
+ local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task)
if uncompressed then
obj.uncompressed = uncompressed
@@ -442,7 +444,6 @@ local function maybe_extract_object_stream(obj, pdf, task)
end
end
-
local function parse_object_grammar(obj, task, pdf)
-- Parse grammar
local obj_dict_span
@@ -453,7 +454,7 @@ local function parse_object_grammar(obj, task, pdf)
end
if obj_dict_span:len() < config.max_processing_size then
- local ret,obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
+ local ret, obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span)
if ret then
if obj.stream then
@@ -669,11 +670,11 @@ process_dict = function(task, pdf, obj, dict)
if contents and type(contents) == 'table' then
if contents[1] == '%REF%' then
-- Single reference
- contents = {contents}
+ contents = { contents }
end
obj.contents = {}
- for _,c in ipairs(contents) do
+ for _, c in ipairs(contents) do
local cobj = maybe_dereference_object(c, pdf, task)
if cobj and type(cobj) == 'table' then
obj.contents[#obj.contents + 1] = cobj
@@ -719,25 +720,25 @@ process_dict = function(task, pdf, obj, dict)
---[[Disabled fonts extraction
- local fonts = obj.resources.Font
- if fonts and type(fonts) == 'table' then
- obj.fonts = {}
- for k,v in pairs(fonts) do
- obj.fonts[k] = maybe_dereference_object(v, pdf, task)
+ --[[Disabled fonts extraction
+ local fonts = obj.resources.Font
+ if fonts and type(fonts) == 'table' then
+ obj.fonts = {}
+ for k,v in pairs(fonts) do
+ obj.fonts[k] = maybe_dereference_object(v, pdf, task)
- if obj.fonts[k] then
- local font = obj.fonts[k]
+ if obj.fonts[k] then
+ local font = obj.fonts[k]
- if config.text_extraction then
- process_font(task, pdf, font, k)
- lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
- k, obj.major, obj.minor, font)
+ if config.text_extraction then
+ process_font(task, pdf, font, k)
+ lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s',
+ k, obj.major, obj.minor, font)
+ end
+ end
end
end
- end
- end
-]]
+ ]]
lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s',
obj.major, obj.minor, obj.type, obj.resources)
@@ -783,8 +784,8 @@ local compound_obj_grammar
local function compound_obj_grammar_gen()
if not compound_obj_grammar then
local gen = generic_grammar_elts()
- compound_obj_grammar = gen.ws^0 * (gen.comment * gen.ws^1)^0 *
- lpeg.Ct(lpeg.Ct(gen.number * gen.ws^1 * gen.number * gen.ws^0)^1)
+ compound_obj_grammar = gen.ws ^ 0 * (gen.comment * gen.ws ^ 1) ^ 0 *
+ lpeg.Ct(lpeg.Ct(gen.number * gen.ws ^ 1 * gen.number * gen.ws ^ 0) ^ 1)
end
return compound_obj_grammar
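
The header of a PDF 1.5 object stream that this grammar targets is a run of "object-number offset" pairs; a runnable reduction, assuming the lpeg rock:

local lpeg = require 'lpeg'
local S, R, C = lpeg.S, lpeg.R, lpeg.C
local ws = S ' \r\n\t'
local num = C(R '09' ^ 1) / tonumber
local pairs_g = lpeg.Ct(lpeg.Ct(num * ws ^ 1 * num * ws ^ 0) ^ 1)
local elts = pairs_g:match('12 0 14 25 ')
assert(elts[1][1] == 12 and elts[2][2] == 25) -- {obj_number, offset} pairs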
@@ -798,8 +799,8 @@ local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first)
lua_util.debugm(N, task, 'compound elts (chunk length %s): %s',
#uncompressed, elts)
- for i,pair in ipairs(elts) do
- local obj_number,offset = pair[1], pair[2]
+ for i, pair in ipairs(elts) do
+ local obj_number, offset = pair[1], pair[2]
offset = offset + first
if offset < #uncompressed then
@@ -833,7 +834,7 @@ end
-- PDF 1.5 ObjStmt
local function extract_pdf_compound_objects(task, pdf)
- for i,obj in ipairs(pdf.objects or {}) do
+ for i, obj in ipairs(pdf.objects or {}) do
if i > 0 and i % 100 == 0 then
local now = rspamd_util.get_ticks()
@@ -894,7 +895,9 @@ local function extract_outer_objects(task, input, pdf)
-- Also get the starting span and try to match it versus obj re to get numbers
local obj_line_potential = first - 32
- if obj_line_potential < 1 then obj_line_potential = 1 end
+ if obj_line_potential < 1 then
+ obj_line_potential = 1
+ end
local prev_obj_end = pdf.end_objects[end_pos - 1]
if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then
obj_line_potential = prev_obj_end + 1
@@ -941,7 +944,7 @@ local function attach_pdf_streams(task, input, pdf)
max_start_pos = math.min(config.max_pdf_objects, #pdf.start_streams)
max_end_pos = math.min(config.max_pdf_objects, #pdf.end_streams)
- for _,obj in ipairs(pdf.objects) do
+ for _, obj in ipairs(pdf.objects) do
while start_pos <= max_start_pos and end_pos <= max_end_pos do
local first = pdf.start_streams[start_pos]
local last = pdf.end_streams[end_pos]
@@ -957,7 +960,9 @@ local function attach_pdf_streams(task, input, pdf)
-- Strip the first \n
while first < last do
local chr = input:byte(first)
- if chr ~= 13 and chr ~= 10 then break end
+ if chr ~= 13 and chr ~= 10 then
+ break
+ end
first = first + 1
end
local len = last - first
@@ -1000,7 +1005,7 @@ local function postprocess_pdf_objects(task, input, pdf)
-- Now we have objects and we need to attach streams that are in bounds
attach_pdf_streams(task, input, pdf)
-- Parse grammar for outer objects
- for i,obj in ipairs(pdf.objects) do
+ for i, obj in ipairs(pdf.objects) do
if i > 0 and i % 100 == 0 then
local now = rspamd_util.get_ticks()
@@ -1031,7 +1036,7 @@ local function postprocess_pdf_objects(task, input, pdf)
end
-- Now we might probably have all objects being processed
- for i,obj in ipairs(pdf.objects) do
+ for i, obj in ipairs(pdf.objects) do
if obj.dict then
-- Types processing
if i > 0 and i % 100 == 0 then
@@ -1076,10 +1081,10 @@ local function offsets_to_blocks(starts, ends, out)
end
local function search_text(task, pdf)
- for _,obj in ipairs(pdf.objects) do
+ for _, obj in ipairs(pdf.objects) do
if obj.type == 'Page' and obj.contents then
local text = {}
- for _,tobj in ipairs(obj.contents) do
+ for _, tobj in ipairs(obj.contents) do
maybe_extract_object_stream(tobj, pdf, task)
local matches = pdf_text_trie:match(tobj.uncompressed or '')
if matches then
@@ -1087,20 +1092,20 @@ local function search_text(task, pdf)
local starts = {}
local ends = {}
- for npat,matched_positions in pairs(matches) do
+ for npat, matched_positions in pairs(matches) do
if npat == 1 then
- for _,pos in ipairs(matched_positions) do
+ for _, pos in ipairs(matched_positions) do
starts[#starts + 1] = pos
end
else
- for _,pos in ipairs(matched_positions) do
+ for _, pos in ipairs(matched_positions) do
ends[#ends + 1] = pos
end
end
end
offsets_to_blocks(starts, ends, text_blocks)
- for _,bl in ipairs(text_blocks) do
+ for _, bl in ipairs(text_blocks) do
if bl.len > 2 then
-- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
bl.len = bl.len - 2
@@ -1111,7 +1116,7 @@ local function search_text(task, pdf)
-- tobj.major, tobj.minor, bl.data)
if bl.len < config.max_processing_size then
- local ret,obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
+ local ret, obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
bl.data)
if ret then
@@ -1147,13 +1152,13 @@ local function search_urls(task, pdf, mpart)
return
end
- for k,v in pairs(dict) do
+ for k, v in pairs(dict) do
if type(v) == 'table' then
recursive_object_traverse(obj, v, rec + 1)
elseif k == 'URI' then
v = maybe_dereference_object(v, pdf, task)
if type(v) == 'string' then
- local url = rspamd_url.create(task:get_mempool(), v, {'content'})
+ local url = rspamd_url.create(task:get_mempool(), v, { 'content' })
if url then
lua_util.debugm(N, task, 'found url %s in object %s:%s',
@@ -1165,7 +1170,7 @@ local function search_urls(task, pdf, mpart)
end
end
- for _,obj in ipairs(pdf.objects) do
+ for _, obj in ipairs(pdf.objects) do
if obj.dict and type(obj.dict) == 'table' then
recursive_object_traverse(obj, obj.dict, 0)
end
@@ -1193,10 +1198,10 @@ local function process_pdf(input, mpart, task)
-- Output object that excludes all internal stuff
local pdf_output = lua_util.shallowcopy(pdf_object)
local grouped_processors = {}
- for npat,matched_positions in pairs(matches) do
+ for npat, matched_positions in pairs(matches) do
local index = pdf_indexes[npat]
- local proc_key,loc_npat = index[1], index[4]
+ local proc_key, loc_npat = index[1], index[4]
if not grouped_processors[proc_key] then
grouped_processors[proc_key] = {
@@ -1206,16 +1211,18 @@ local function process_pdf(input, mpart, task)
end
local proc = grouped_processors[proc_key]
-- Fill offsets
- for _,pos in ipairs(matched_positions) do
- proc.offsets[#proc.offsets + 1] = {pos, loc_npat}
+ for _, pos in ipairs(matched_positions) do
+ proc.offsets[#proc.offsets + 1] = { pos, loc_npat }
end
end
- for name,processor in pairs(grouped_processors) do
+ for name, processor in pairs(grouped_processors) do
-- Sort by offset
lua_util.debugm(N, task, "pdf: process group %s with %s matches",
name, #processor.offsets)
- table.sort(processor.offsets, function(e1, e2) return e1[1] < e2[1] end)
+ table.sort(processor.offsets, function(e1, e2)
+ return e1[1] < e2[1]
+ end)
processor.processor_func(input, task, processor.offsets, pdf_object, pdf_output)
end
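
The comparator above orders {offset, pattern-index} pairs by offset so processors see matches in document order; a standalone check:

local offsets = { { 30, 2 }, { 10, 1 }, { 20, 1 } }
table.sort(offsets, function(e1, e2)
  return e1[1] < e2[1]
end)
assert(offsets[1][1] == 10 and offsets[3][1] == 30)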
@@ -1254,7 +1261,7 @@ local function process_pdf(input, mpart, task)
end
else
-- All hashes
- for h,sc in pairs(pdf_object.scripts) do
+ for h, sc in pairs(pdf_object.scripts) do
if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then
lua_util.debugm(N, task, "pdf: add fuzzy hash from JavaScript: %s; size = %s; object: %s:%s",
sc.hash,
@@ -1323,7 +1330,7 @@ processors.suspicious = function(input, task, positions, pdf_object, pdf_output)
local nencoded = 0
local close_encoded = 0
local last_encoded
- for _,match in ipairs(positions) do
+ for _, match in ipairs(positions) do
if match[2] == 1 then
-- netsh
suspicious_factor = suspicious_factor + 0.5
@@ -1386,7 +1393,7 @@ local function generic_table_inserter(positions, pdf_object, output_key)
pdf_object[output_key] = {}
end
local shift = #pdf_object[output_key]
- for i,pos in ipairs(positions) do
+ for i, pos in ipairs(positions) do
pdf_object[output_key][i + shift] = pos[1]
end
end
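
And a toy run of generic_table_inserter's append-with-shift: entries already stored under output_key survive, and new match positions are appended in order:

local pdf_object = { urls = { 'existing' } }
local positions = { { 100, 1 }, { 200, 2 } } -- {offset, pattern index} pairs
local out = pdf_object.urls
local shift = #out
for i, pos in ipairs(positions) do
  out[i + shift] = pos[1]
end
assert(out[2] == 100 and out[3] == 200)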