Преглед изворни кода

[Feature] Lua_magic: Add a sane CSV heuristic

tags/3.2
Vsevolod Stakhov пре 2 година
родитељ
комит
137ab1cb78
1 измењених фајлова са 65 додато и 1 уклоњено
  1. 65
    1
      lualib/lua_magic/heuristics.lua

+ 65
- 1
lualib/lua_magic/heuristics.lua Прегледај датотеку

@@ -320,6 +320,67 @@ local function detect_archive_flaw(part, arch, log_obj, _)
return arch_type:lower(),40
end

local csv_grammar
--- Returns a lazily built, cached LPeg grammar that matches a single
-- delimiter-separated line.
-- On success `grammar:match(line)` returns the number of FIELDS found in the
-- line (the fold below adds one per field capture, so the result is always
-- >= 2 because at least one separator is required); on failure it returns nil.
-- @return LPeg pattern
local function get_csv_grammar()
  if not csv_grammar then
    local lpeg = require 'lpeg'

    -- Quoted field: everything between double quotes, with `""` unescaped
    -- to a single `"`.
    local quoted = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"'
    -- A line ends either with an EOL character or at the end of the subject
    local eol = lpeg.S'\r\n' + -1

    -- Original flavour: unquoted fields stop at a comma, so a tab can only
    -- act as a separator right after a quoted field (an unquoted field
    -- consumes tabs greedily and LPeg repetition never gives them back).
    local comma_field = quoted + lpeg.C((1 - lpeg.S',\n"')^0)
    local comma_line = comma_field *
        ((lpeg.P(',') + lpeg.P('\t')) * comma_field)^1 * eol

    -- Tab flavour: unquoted fields stop at a tab. It is tried only when the
    -- flavour above has failed, hence every line that used to match still
    -- matches with exactly the same field count.
    local tab_field = quoted + lpeg.C((1 - lpeg.S'\t\n"')^0)
    local tab_line = tab_field * (lpeg.P('\t') * tab_field)^1 * eol

    -- Fold over captures: start from 0 and add one for each captured field
    csv_grammar = lpeg.Cf(lpeg.Cc(0) * (comma_line + tab_line),
        function(acc) return acc + 1 end)
  end

  return csv_grammar
end
--- Extra sanity check for content that was already guessed to be CSV.
-- Looks at the first lines of the leading chunk of `content` and requires
-- every one of them to match the CSV line grammar with the same field count.
-- @param part the mime part being inspected (not used by this function)
-- @param content object providing `:sub()` and `:lines()`
--   (presumably an rspamd text object - TODO confirm against the caller)
-- @param log_obj logging object passed through to `lua_util.debugm`
-- @return true if the inspected lines look like consistent CSV, false otherwise
local function validate_csv(part, content, log_obj)
  local max_chunk = 32768
  local max_matched_lines = 10
  -- NOTE(review): if content is longer than max_chunk the last line of the
  -- chunk may be cut in the middle; that only matters when the first
  -- max_matched_lines lines do not fit into the chunk.
  local chunk = content:sub(1, max_chunk)
  -- The grammar is cached, fetch it once instead of once per line
  local grammar = get_csv_grammar()

  -- Despite the historical name, this holds the per-line field count
  -- returned by the grammar
  local expected_commas
  local matched_lines = 0

  lua_util.debugm(N, log_obj, "check for csv pattern")

  for s in chunk:lines() do
    local ncommas = grammar:match(s)

    if not ncommas then
      lua_util.debugm(N, log_obj, "not a csv line at line number %s",
          matched_lines)
      return false
    end

    if expected_commas and ncommas ~= expected_commas then
      -- Mismatched commas
      lua_util.debugm(N, log_obj, "missmatched commas on line %s: %s != %s",
          matched_lines, ncommas, expected_commas)
      return false
    elseif not expected_commas then
      -- Defensive only: the grammar requires at least one separator, so a
      -- successful match is not expected to produce 0 here
      if ncommas == 0 then
        lua_util.debugm(N, log_obj, "no commas in the first line")
        return false
      end
      expected_commas = ncommas
    end

    matched_lines = matched_lines + 1

    -- Stop once exactly max_matched_lines lines have been validated
    if matched_lines >= max_matched_lines then
      break
    end
  end

  if matched_lines == 0 then
    -- Nothing has been validated at all, so there is no evidence of CSV
    lua_util.debugm(N, log_obj, "no lines to check for csv pattern")
    return false
  end

  lua_util.debugm(N, log_obj, "csv content is sane: %s fields; %s lines checked",
      expected_commas, matched_lines)

  return true
end

exports.mime_part_heuristic = function(part, log_obj, _)
if part:is_archive() then
local arch = part:get_archive()
@@ -452,7 +513,10 @@ exports.text_part_heuristic = function(part, log_obj, _)

if weight then
if weight >= 40 then
return ext, weight
-- Extra validation for csv extension
if ext ~= 'csv' or validate_csv(part, content, log_obj) then
return ext, weight
end
elseif fname and weight >= 20 then
return ext, weight
end

Loading…
Откажи
Сачувај