瀏覽代碼

[Rework] Lua_magic: Try to detect text parts with 8bit characters for non-utf8 encodings

tags/3.0
Vsevolod Stakhov 3 年之前
父節點
當前提交
401baa2b86
共有 1 個文件被更改,包括 31 次插入和 25 次删除
  1. 31
    25
      lualib/lua_magic/heuristics.lua

+ 31
- 25
lualib/lua_magic/heuristics.lua 查看文件

exports.text_part_heuristic = function(part, log_obj, _) exports.text_part_heuristic = function(part, log_obj, _)
-- We get some span of data and check it -- We get some span of data and check it
local function is_span_text(span) local function is_span_text(span)
local function rough_utf8_check(bytes, idx, remain)
-- We examine 8 bit content, and we assume it might be localized text
-- if it has more than 3 subsequent 8 bit characters
local function rough_8bit_check(bytes, idx, remain)
local b = bytes[idx] local b = bytes[idx]
if b >= 127 then
if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
bit.band(bytes[idx + 1], 0xc0) == 0x80 then
return true,1
elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
bit.band(bytes[idx + 1], 0xc0) == 0x80 and
bit.band(bytes[idx + 2], 0xc0) == 0x80 then
return true,2
elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
bit.band(bytes[idx + 1], 0xc0) == 0x80 and
bit.band(bytes[idx + 2], 0xc0) == 0x80 and
bit.band(bytes[idx + 3], 0xc0) == 0x80 then
return true,3
end
return false
else
return true,0
local n8bit = 0

while b >= 127 and n8bit < remain do
n8bit = n8bit + 1
idx = idx + 1
b = bytes[idx]
end end

if n8bit >= 3 then
return true,n8bit
end

return false,0
end end


-- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
non_printable = non_printable + 1 non_printable = non_printable + 1
elseif b >= 127 then elseif b >= 127 then
local c,nskip = rough_utf8_check(bytes, i, tlen - i)
local c,nskip = rough_8bit_check(bytes, i, tlen - i)


if not c then if not c then
non_printable = non_printable + 1 non_printable = non_printable + 1


if res.html and res.html >= 40 then if res.html and res.html >= 40 then
-- HTML has priority over something like js... -- HTML has priority over something like js...
return 'html',res.html
return 'html', res.html
end end


local ext,weight = process_top_detected(res)
local ext, weight = process_top_detected(res)


if weight and weight >= 40 then if weight and weight >= 40 then
return ext,weight
return ext, weight
end end
end end


-- Content type stuff -- Content type stuff
if (mtype == 'text' or mtype == 'application') and (msubtype == 'html' or msubtype == 'xhtml+xml') then
return 'html',21
if (mtype == 'text' or mtype == 'application') and
(msubtype == 'html' or msubtype == 'xhtml+xml') then
return 'html', 21
end end


-- Extension stuff -- Extension stuff
-- Tells whether `file` ends with a dot followed by `ext`.
-- The tail of `file` is lowercased before the comparison, while `ext` is
-- compared as given, so callers are expected to pass `ext` in lower case.
-- At least one character must precede the dot (a bare '.htm' does not match).
-- @param file string: file name to examine
-- @param ext string: extension without the leading dot
-- @return boolean: true when `file` carries the extension, false otherwise
local function has_extension(file, ext)
  local n = #ext

  -- Too short to hold "<something>.<ext>"
  if #file <= n + 1 then
    return false
  end

  -- Case-insensitive match of the trailing characters against the extension
  if string.lower(string.sub(file, -n)) ~= ext then
    return false
  end

  -- The character right before the extension must be the separator dot
  local dot_pos = -(n + 1)
  return string.sub(file, dot_pos, dot_pos) == '.'
end


local fname = part:get_filename() local fname = part:get_filename()
if fname and fname:match('html?$') then
if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then
return 'html',21 return 'html',21
end end



Loading…
取消
儲存