|
|
|
|
|
|
|
|
exports.text_part_heuristic = function(part, log_obj, _) |
|
|
exports.text_part_heuristic = function(part, log_obj, _) |
|
|
-- We get some span of data and check it |
|
|
-- We get some span of data and check it |
|
|
local function is_span_text(span) |
|
|
local function is_span_text(span) |
|
|
local function rough_utf8_check(bytes, idx, remain) |
|
|
|
|
|
|
|
|
-- We examine 8 bit content, and we assume it might be localized text |
|
|
|
|
|
-- if it has at least 3 consecutive 8 bit characters |
|
|
|
|
|
local function rough_8bit_check(bytes, idx, remain) |
|
|
local b = bytes[idx] |
|
|
local b = bytes[idx] |
|
|
if b >= 127 then |
|
|
|
|
|
if bit.band(b, 0xe0) == 0xc0 and remain > 1 and |
|
|
|
|
|
bit.band(bytes[idx + 1], 0xc0) == 0x80 then |
|
|
|
|
|
return true,1 |
|
|
|
|
|
elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and |
|
|
|
|
|
bit.band(bytes[idx + 1], 0xc0) == 0x80 and |
|
|
|
|
|
bit.band(bytes[idx + 2], 0xc0) == 0x80 then |
|
|
|
|
|
return true,2 |
|
|
|
|
|
elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and |
|
|
|
|
|
bit.band(bytes[idx + 1], 0xc0) == 0x80 and |
|
|
|
|
|
bit.band(bytes[idx + 2], 0xc0) == 0x80 and |
|
|
|
|
|
bit.band(bytes[idx + 3], 0xc0) == 0x80 then |
|
|
|
|
|
return true,3 |
|
|
|
|
|
end |
|
|
|
|
|
return false |
|
|
|
|
|
else |
|
|
|
|
|
return true,0 |
|
|
|
|
|
|
|
|
local n8bit = 0 |
|
|
|
|
|
|
|
|
|
|
|
while b >= 127 and n8bit < remain do |
|
|
|
|
|
n8bit = n8bit + 1 |
|
|
|
|
|
idx = idx + 1 |
|
|
|
|
|
b = bytes[idx] |
|
|
end |
|
|
end |
|
|
|
|
|
|
|
|
|
|
|
if n8bit >= 3 then |
|
|
|
|
|
return true,n8bit |
|
|
|
|
|
end |
|
|
|
|
|
|
|
|
|
|
|
return false,0 |
|
|
end |
|
|
end |
|
|
|
|
|
|
|
|
-- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls |
|
|
-- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls |
|
|
|
|
|
|
|
|
if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then |
|
|
if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then |
|
|
non_printable = non_printable + 1 |
|
|
non_printable = non_printable + 1 |
|
|
elseif b >= 127 then |
|
|
elseif b >= 127 then |
|
|
local c,nskip = rough_utf8_check(bytes, i, tlen - i) |
|
|
|
|
|
|
|
|
local c,nskip = rough_8bit_check(bytes, i, tlen - i) |
|
|
|
|
|
|
|
|
if not c then |
|
|
if not c then |
|
|
non_printable = non_printable + 1 |
|
|
non_printable = non_printable + 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if res.html and res.html >= 40 then |
|
|
if res.html and res.html >= 40 then |
|
|
-- HTML has priority over something like js... |
|
|
-- HTML has priority over something like js... |
|
|
return 'html',res.html |
|
|
|
|
|
|
|
|
return 'html', res.html |
|
|
end |
|
|
end |
|
|
|
|
|
|
|
|
local ext,weight = process_top_detected(res) |
|
|
|
|
|
|
|
|
local ext, weight = process_top_detected(res) |
|
|
|
|
|
|
|
|
if weight and weight >= 40 then |
|
|
if weight and weight >= 40 then |
|
|
return ext,weight |
|
|
|
|
|
|
|
|
return ext, weight |
|
|
end |
|
|
end |
|
|
end |
|
|
end |
|
|
|
|
|
|
|
|
-- Content type stuff |
|
|
-- Content type stuff |
|
|
if (mtype == 'text' or mtype == 'application') and (msubtype == 'html' or msubtype == 'xhtml+xml') then |
|
|
|
|
|
return 'html',21 |
|
|
|
|
|
|
|
|
if (mtype == 'text' or mtype == 'application') and |
|
|
|
|
|
(msubtype == 'html' or msubtype == 'xhtml+xml') then |
|
|
|
|
|
return 'html', 21 |
|
|
end |
|
|
end |
|
|
|
|
|
|
|
|
-- Extension stuff |
|
|
-- Extension stuff |
|
|
|
|
|
--- Tells whether a file name ends with a dot followed by the given extension.
-- The tail of `file` is lowercased before comparison, so `ext` itself is
-- expected in lower case. At least one character must precede the dot
-- (a bare ".htm" does not count as having the "htm" extension).
-- @param file file name to inspect
-- @param ext extension without the leading dot, lower case
-- @return true when `file` ends with `.<ext>`, false otherwise
local function has_extension(file, ext)
  local suffix_len = ext:len()

  -- Need room for: one leading character + '.' + the extension itself
  if not (file:len() > suffix_len + 1) then
    return false
  end

  -- Case-insensitive match of the trailing characters against the extension
  local tail = file:sub(-suffix_len):lower()
  if tail ~= ext then
    return false
  end

  -- The character right before the extension must be the separator dot
  local dot_pos = -suffix_len - 1
  return file:sub(dot_pos, dot_pos) == '.'
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
local fname = part:get_filename() |
|
|
local fname = part:get_filename() |
|
|
if fname and fname:match('html?$') then |
|
|
|
|
|
|
|
|
if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then |
|
|
return 'html',21 |
|
|
return 'html',21 |
|
|
end |
|
|
end |
|
|
|
|
|
|