diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-04-22 14:08:11 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-04-22 14:08:11 +0100 |
commit | 401baa2b86548fa6968020dc14aa83f9d713e97b (patch) | |
tree | e06004531b931a9424ce908b3652a4290e72233a /lualib | |
parent | 42228b92e57e27a5af566496bd585afde5cbb07c (diff) | |
download | rspamd-401baa2b86548fa6968020dc14aa83f9d713e97b.tar.gz rspamd-401baa2b86548fa6968020dc14aa83f9d713e97b.zip |
[Rework] Lua_magic: Try to detect text parts with 8bit characters for non-utf8 encodings
Diffstat (limited to 'lualib')
-rw-r--r-- | lualib/lua_magic/heuristics.lua | 56 |
1 file changed, 31 insertions, 25 deletions
diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua index 15d8527fd..042bfde3f 100644 --- a/lualib/lua_magic/heuristics.lua +++ b/lualib/lua_magic/heuristics.lua @@ -327,26 +327,23 @@ end exports.text_part_heuristic = function(part, log_obj, _) -- We get some span of data and check it local function is_span_text(span) - local function rough_utf8_check(bytes, idx, remain) + -- We examine 8 bit content, and we assume it might be localized text + -- if it has more than 3 subsequent 8 bit characters + local function rough_8bit_check(bytes, idx, remain) local b = bytes[idx] - if b >= 127 then - if bit.band(b, 0xe0) == 0xc0 and remain > 1 and - bit.band(bytes[idx + 1], 0xc0) == 0x80 then - return true,1 - elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and - bit.band(bytes[idx + 1], 0xc0) == 0x80 and - bit.band(bytes[idx + 2], 0xc0) == 0x80 then - return true,2 - elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and - bit.band(bytes[idx + 1], 0xc0) == 0x80 and - bit.band(bytes[idx + 2], 0xc0) == 0x80 and - bit.band(bytes[idx + 3], 0xc0) == 0x80 then - return true,3 - end - return false - else - return true,0 + local n8bit = 0 + + while b >= 127 and n8bit < remain do + n8bit = n8bit + 1 + idx = idx + 1 + b = bytes[idx] end + + if n8bit >= 3 then + return true,n8bit + end + + return false,0 end -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls @@ -360,7 +357,7 @@ exports.text_part_heuristic = function(part, log_obj, _) if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then non_printable = non_printable + 1 elseif b >= 127 then - local c,nskip = rough_utf8_check(bytes, i, tlen - i) + local c,nskip = rough_8bit_check(bytes, i, tlen - i) if not c then non_printable = non_printable + 1 @@ -425,24 +422,33 @@ exports.text_part_heuristic = function(part, log_obj, _) if res.html and res.html >= 40 then -- HTML has priority over something like js... 
- return 'html',res.html + return 'html', res.html end - local ext,weight = process_top_detected(res) + local ext, weight = process_top_detected(res) if weight and weight >= 40 then - return ext,weight + return ext, weight end end -- Content type stuff - if (mtype == 'text' or mtype == 'application') and (msubtype == 'html' or msubtype == 'xhtml+xml') then - return 'html',21 + if (mtype == 'text' or mtype == 'application') and + (msubtype == 'html' or msubtype == 'xhtml+xml') then + return 'html', 21 end -- Extension stuff + local function has_extension(file, ext) + local ext_len = ext:len() + return file:len() > ext_len + 1 + and file:sub(-ext_len):lower() == ext + and file:sub(-ext_len - 1, -ext_len - 1) == '.' + end + + local fname = part:get_filename() - if fname and fname:match('html?$') then + if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then return 'html',21 end |