From ce8352554afa648b81400e3267587e22705b08cf Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 13 Feb 2017 12:17:58 +0000
Subject: [PATCH] [Fix] Improve OMOGRAPH_URL rule

- Calculate omographs in each dot component separately
- Normalize omographs
- Count utf8 characters properly
---
Reviewer note: g_utf8_find_next_char() returns NULL once `end` is reached,
so in lua_util_count_non_ascii() the scan cursor is set to `end` in that
case. Assigning the NULL result to the cursor would keep `p < end` true and
dereference NULL when the last character of the string is multi-byte.

 rules/misc.lua     | 17 ++++++++++++++---
 src/lua/lua_util.c |  7 ++++++-
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/rules/misc.lua b/rules/misc.lua
index 56de79a6b..75f31f39c 100644
--- a/rules/misc.lua
+++ b/rules/misc.lua
@@ -767,10 +767,21 @@ rspamd_config.OMOGRAPH_URL = {
         local h = u:get_host()
 
         if h then
-          local non_latin,total = util.count_non_ascii(h)
+          local parts = rspamd_str_split(h, '.')
 
-          if non_latin ~= total and non_latin > 0 then
-            return true, 1.0, h
+          local bad_omographs = 0
+
+          for _,p in ipairs(parts) do
+            local cnlat,ctot = util.count_non_ascii(p)
+
+            if cnlat > 0 and cnlat ~= ctot then
+              bad_omographs = bad_omographs + 1.0 / cnlat
+            end
+          end
+
+          if bad_omographs > 0 then
+            if bad_omographs > 1 then bad_omographs = 1.0 end
+            return true, bad_omographs, h
           end
         end
       end
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 220467664..669ac5bd9 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1891,7 +1891,7 @@ lua_util_count_non_ascii (lua_State *L)
 {
 	gsize len;
 	const gchar *str = lua_tolstring (L, 1, &len);
-	const gchar *p, *end;
+	const gchar *p, *end, *np;
 	gint ret = 0, total = 0;
 
 	if (str != NULL) {
@@ -1900,8 +1900,13 @@ lua_util_count_non_ascii (lua_State *L)
 
 		while (p < end) {
 			if (*p & 0x80) {
+				np = g_utf8_find_next_char (p, end);
 				ret ++;
 				total ++;
+
+				p = (np != NULL) ? np : end; /* NULL: no next char before end, stop */
+
+				continue;
 			}
 			else if (g_ascii_isalpha (*p)) {
 				total ++;
-- 
2.39.5