summaryrefslogtreecommitdiffstats
path: root/lualib
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2020-01-06 10:27:53 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2020-01-06 10:27:53 +0000
commit1ca8b0c04281f1f56999a73b0981824b7bab40f4 (patch)
tree2aee07479ad19ac836132c0dee0c81d83078a0b4 /lualib
parentbfd423303fd18eea049cf9f20a2bc87a809f6990 (diff)
downloadrspamd-1ca8b0c04281f1f56999a73b0981824b7bab40f4.tar.gz
rspamd-1ca8b0c04281f1f56999a73b0981824b7bab40f4.zip
[Project] Lua_content: Add text blocks search logic
Diffstat (limited to 'lualib')
-rw-r--r--lualib/lua_content/pdf.lua87
1 files changed, 85 insertions, 2 deletions
diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index d4f995901..91f317dbe 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -22,7 +22,6 @@ limitations under the License.
local rspamd_trie = require "rspamd_trie"
local rspamd_util = require "rspamd_util"
local bit = require "bit"
-local pdf_trie
local N = "lua_content"
local lua_util = require "lua_util"
local rspamd_regexp = require "rspamd_regexp"
@@ -74,12 +73,30 @@ local pdf_patterns = {
}
}
+local pdf_text_patterns = {
+ start = {
+ patterns = {
+ [[\s+BT\s+]]
+ }
+ },
+ stop = {
+ patterns = {
+ [[\s+ET\b]]
+ }
+ }
+}
+
-- index[n] ->
-- t[1] - pattern,
-- t[2] - key in patterns table,
-- t[3] - value in patterns table
-- t[4] - local pattern index
local pdf_indexes = {}
+local pdf_text_indexes = {}
+
+local pdf_trie
+local pdf_text_trie
+
local exports = {}
-- Used to process patterns found in PDF
@@ -89,7 +106,7 @@ local exports = {}
local processors = {}
-- PDF objects outer grammar in LPEG style (performing table captures)
local pdf_outer_grammar
-local pdf_inner_grammar
+local pdf_text_grammar
local max_extraction_size = 512 * 1024 -- TODO: make it configurable
@@ -115,6 +132,9 @@ local function compile_tries()
if not pdf_trie then
pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
end
+ if not pdf_text_trie then
+ pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
+ end
end
-- Returns a table with generic grammar elements for PDF
@@ -426,6 +446,7 @@ local function extract_pdf_objects(task, pdf)
if uncompressed then
lua_util.debugm(N, task, 'extracted object %s:%s: %s (%s -> %s)',
obj.major, obj.minor, uncompressed, len, uncompressed:len())
+ obj.uncompressed = uncompressed
else
lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s',
obj.major, obj.minor, len, dict.Filter)
@@ -445,6 +466,67 @@ local function extract_pdf_objects(task, pdf)
end
end
+local function offsets_to_blocks(starts, ends, out)
+ local start_pos, end_pos = 1, 1
+
+ while start_pos <= #starts and end_pos <= #ends do
+ local first = starts[start_pos]
+ local last = ends[end_pos]
+
+ if first < last then
+ local len = last - first
+ out[#out + 1] = {
+ start = first,
+ len = len,
+ }
+ start_pos = start_pos + 1
+ end_pos = end_pos + 1
+ elseif start_pos > end_pos then
+ end_pos = end_pos + 1
+ else
+ -- Not ordered properly!
+ break
+ end
+ end
+end
+
+local function search_text(task, pdf)
+ for _,obj in ipairs(pdf.objects) do
+ if obj.uncompressed then
+ local matches = pdf_text_trie:match(obj.uncompressed)
+ if matches then
+ local text_blocks = {}
+ local starts = {}
+ local ends = {}
+
+ for npat,matched_positions in pairs(matches) do
+ if npat == 1 then
+ for _,pos in ipairs(matched_positions) do
+ starts[#starts + 1] = pos
+ end
+ else
+ for _,pos in ipairs(matched_positions) do
+ ends[#ends + 1] = pos
+ end
+ end
+ end
+
+ offsets_to_blocks(starts, ends, text_blocks)
+ for _,bl in ipairs(text_blocks) do
+ if bl.len > 2 then
+ -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter)
+ bl.len = bl.len - 2
+ end
+
+ bl.data = obj.uncompressed:span(bl.start, bl.len)
+ lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
+ obj.major, obj.minor, bl.data)
+ end
+ end
+ end
+ end
+end
+
local function process_pdf(input, _, task)
local matches = pdf_trie:match(input)
@@ -486,6 +568,7 @@ local function process_pdf(input, _, task)
-- Postprocess objects
postprocess_pdf_objects(task, input, pdf_output)
extract_pdf_objects(task, pdf_output)
+ search_text(task, pdf_output)
else
pdf_output.flags.no_objects = true
end