diff options
Diffstat (limited to 'lualib/lua_content')
-rw-r--r-- | lualib/lua_content/init.lua | 6 | ||||
-rw-r--r-- | lualib/lua_content/pdf.lua | 126 |
2 files changed, 132 insertions, 0 deletions
diff --git a/lualib/lua_content/init.lua b/lualib/lua_content/init.lua
index 06d509e00..c23ca9d09 100644
--- a/lualib/lua_content/init.lua
+++ b/lualib/lua_content/init.lua
@@ -31,6 +31,12 @@ local content_modules = {
     extensions = {'ical'},
     output = "text"
   },
+  pdf = {
+    mime_type = "application/pdf",
+    module = require "lua_content/pdf",
+    extensions = {'pdf'},
+    output = "table"
+  },
 }
 
 local modules_by_mime_type
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
new file mode 100644
index 000000000..9ff3d0260
--- /dev/null
+++ b/lualib/lua_content/pdf.lua
@@ -0,0 +1,145 @@
+--[[
+Copyright (c) 2019, Vsevolod Stakhov <vsevolod@highsecure.ru>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+--[[[
+-- @module lua_content/pdf
+-- This module contains some heuristics for PDF files
+--]]
+
+local rspamd_trie = require "rspamd_trie"
+local bit = require "bit"
+local pdf_trie
+local N = "lua_content"
+local lua_util = require "lua_util"
+local pdf_patterns = {
+  trailer = {
+    patterns = {
+      [[\ntrailer\r?\n]]
+    }
+  }
+}
+
+-- index[n] ->
+-- t[1] - key in patterns table,
+-- t[2] - value in patterns table,
+-- t[3] - pattern,
+-- t[4] - local pattern index
+local pdf_indexes = {}
+local exports = {}
+
+-- Used to process patterns found in PDF
+-- positions for functional processors should be a iter/table from trie matcher in form
+---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where
+---- pat_idxn is pattern index and n1 ... nn are match positions
+local processors = {}
+
+-- Builds `pdf_trie` from `pdf_patterns` (once) and records, for every compiled
+-- pattern, its owning group in `pdf_indexes`
+local function compile_tries()
+  local default_compile_flags = bit.bor(rspamd_trie.flags.re,
+      rspamd_trie.flags.dot_all,
+      rspamd_trie.flags.single_match,
+      rspamd_trie.flags.no_start)
+  local function compile_pats(patterns, indexes, compile_flags)
+    local strs = {}
+    for what,data in pairs(patterns) do
+      for i,pat in ipairs(data.patterns) do
+        strs[#strs + 1] = pat
+        indexes[#indexes + 1] = {what, data, pat, i}
+      end
+    end
+
+    return rspamd_trie.create(strs, compile_flags or default_compile_flags)
+  end
+
+  if not pdf_trie then
+    pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
+  end
+end
+
+-- Call immediately on require
+compile_tries()
+
+-- Matches `input` against the PDF trie and runs the processor of every
+-- pattern group that matched
+-- @param input PDF content to scan
+-- @param task task object, used for debug logging
+-- @return table filled by processors, or nil when the trie reports no matches
+local function process_pdf(input, _, task)
+  local matches = pdf_trie:match(input)
+  local pdf_output = {}
+
+  if matches then
+    local grouped_processors = {}
+    for npat,matched_positions in pairs(matches) do
+      local index = pdf_indexes[npat]
+
+      local proc_key,loc_npat = index[1], index[4]
+
+      if not grouped_processors[proc_key] then
+        grouped_processors[proc_key] = {
+          processor_func = processors[proc_key],
+          offsets = {},
+        }
+      end
+      local proc = grouped_processors[proc_key]
+      -- Fill offsets
+      for _,pos in ipairs(matched_positions) do
+        proc.offsets[#proc.offsets + 1] = {pos, loc_npat}
+      end
+    end
+
+    for name,processor in pairs(grouped_processors) do
+      if processor.processor_func then
+        -- Sort by offset
+        lua_util.debugm(N, task, "pdf: process group %s with %s matches",
+            name, #processor.offsets)
+        table.sort(processor.offsets, function(e1, e2) return e1[1] < e2[1] end)
+        processor.processor_func(input, task, processor.offsets, pdf_output)
+      else
+        -- A pattern group without a processor must not abort the whole scan
+        lua_util.debugm(N, task, "pdf: no processor defined for group %s", name)
+      end
+    end
+
+    return pdf_output
+  end
+end
+
+-- Processes the PDF trailer
+-- Inspects the span of `input` taken at the last entry of `positions` and sets
+-- `output.encrypted` when one of its lines contains '/Encrypt '
+processors.trailer = function(input, task, positions, output)
+  local last_pos = positions[#positions]
+
+  -- Nothing to inspect without at least one match
+  if not last_pos then
+    return
+  end
+
+  local last_span = input:span(last_pos[1])
+  for line in last_span:lines(true) do
+    if line:find('/Encrypt ') then
+      lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s",
+          line)
+      output.encrypted = true
+    end
+  end
+end
+
+exports.process = process_pdf
+
+return exports
\ No newline at end of file |