Browse Source

[Minor] Fix text parts heuristic

tags/2.0
Vsevolod Stakhov 4 years ago
parent
commit
12fdcf7bad
1 changed files with 32 additions and 8 deletions
  1. 32
    8
      lualib/lua_magic/heuristics.lua

+ 32
- 8
lualib/lua_magic/heuristics.lua View File

@@ -314,26 +314,50 @@ end
exports.text_part_heuristic = function(part, log_obj)
-- We get some span of data and check it
local function is_span_text(span)
local function rough_utf8_check(b)
local function rough_utf8_check(bytes, idx, remain)
local b = bytes[idx]
if b >= 127 then
if bit.band(b, 0xe0) == 0xc0 or bit.band(b, 0xf0) == 0xe0 or bit.band(b, 0xf8) == 0xf0 then
return true
if bit.band(b, 0xe0) == 0xc0 and remain > 1 and
bit.band(bytes[idx + 1], 0xc0) == 0x80 then
return true,1
elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and
bit.band(bytes[idx + 1], 0xc0) == 0x80 and
bit.band(bytes[idx + 2], 0xc0) == 0x80 then
return true,2
elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and
bit.band(bytes[idx + 1], 0xc0) == 0x80 and
bit.band(bytes[idx + 2], 0xc0) == 0x80 and
bit.band(bytes[idx + 3], 0xc0) == 0x80 then
return true,3
end
return false
else
return true
return true,0
end
end

-- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls
local tlen = #span
local non_printable = 0
for _,b in ipairs(span:bytes()) do
if ((b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09))
or (not rough_utf8_check(b)) then
local bytes = span:bytes()
local i = 1
repeat
local b = bytes[i]

if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then
non_printable = non_printable + 1
elseif b >= 127 then
local c,nskip = rough_utf8_check(bytes, i, tlen - i)

if not c then
non_printable = non_printable + 1
else
i = i + nskip
end
end
end
i = i + 1
until i > tlen

lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
tlen - non_printable, non_printable, tlen)
if non_printable / tlen > 0.0078125 then

Loading…
Cancel
Save