From: Miecio Za Date: Tue, 26 Mar 2019 19:23:13 +0000 (+0100) Subject: [Minor] Rewrite is_utf_mixed_script to use libicu only X-Git-Tag: 1.9.1~38^2 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=89c5fe4c05012315e9229e033ae3ded8c31b1cd7;p=rspamd.git [Minor] Rewrite is_utf_mixed_script to use libicu only Rewrite to use U8_NEXT --- diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 881257ed3..af4673af8 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -2513,15 +2513,18 @@ lua_util_is_utf_mixed_script(lua_State *L) { LUA_TRACE_POINT; gsize len_of_string; - const gchar *end, *string_to_check = lua_tolstring (L, 1, &len_of_string); + const gchar *string_to_check = lua_tolstring (L, 1, &len_of_string); UScriptCode last_script_code = USCRIPT_INVALID_CODE; UErrorCode uc_err = U_ZERO_ERROR; - if (string_to_check && g_utf8_validate (string_to_check, len_of_string, &end)) { - len_of_string = g_utf8_strlen (string_to_check, len_of_string); - - for(; *string_to_check; string_to_check = g_utf8_next_char(string_to_check)){ - gunichar char_to_check = g_utf8_get_char(string_to_check); + if (string_to_check) { + uint index = 0; + UChar32 char_to_check = 0; + while(index < len_of_string) { + U8_NEXT(string_to_check, index, len_of_string, char_to_check); + if (char_to_check < 0 ) { + return luaL_error (L, "passed string is not valid utf"); + } UScriptCode current_script_code = uscript_getScript(char_to_check, &uc_err); if (uc_err != U_ZERO_ERROR){ msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err)); diff --git a/test/lua/unit/rspamd_util.lua b/test/lua/unit/rspamd_util.lua index 802b400d2..859316be7 100644 --- a/test/lua/unit/rspamd_util.lua +++ b/test/lua/unit/rspamd_util.lua @@ -5,36 +5,42 @@ context("Rspamd util for lua - check generic functions", function() { input = "test1", result = false, + mixed_script = false, range_start = 0x0000, range_end = 0x017f }, { input = "test test xxx", result = false, + mixed_script = false, range_start = 0x0000, range_end = 0x017f }, { input = "АбЫрвАлг", result = true, + mixed_script = false, range_start = 0x0000, range_end = 0x017f }, { input = "АбЫрвАлг example", result = true, + mixed_script = true, range_start = 0x0000, range_end = 0x017f }, { input = "example ąłśćżłóę", result = false, + mixed_script = false, range_start = 0x0000, range_end = 0x017f }, { input = "ąłśćżłóę АбЫрвАлг", result = true, + mixed_script = true, range_start = 0x0000, range_end = 0x017f }, @@ -64,4 +70,20 @@ context("Rspamd util for lua - check generic functions", function() assert_equal(res["letters"], 10) assert_equal(res["digits"], 2) end) + + for i,c in ipairs(cases) do + test("is_utf_mixed_script, test case #" .. i, function() + local actual = util.is_utf_mixed_script(c.input) + + assert_equal(c.mixed_script, actual) + end) + end + + test("is_utf_mixed_script, invalid utf str should return errror", function() + assert_error(util.is_utf_mixed_script,'\200\213\202') + end) + + test("is_utf_mixed_script, empty str should return errror", function() + assert_error(util.is_utf_mixed_script,'\200\213\202') + end) end)