]> source.dussan.org Git - rspamd.git/commitdiff
[Minor] Rewrite is_utf_mixed_script to use libicu only 2813/head
author Miecio Za <miecio@miecio.net>
Tue, 26 Mar 2019 19:23:13 +0000 (20:23 +0100)
committer Miecio Za <miecio@miecio.net>
Tue, 26 Mar 2019 19:23:13 +0000 (20:23 +0100)
Rewrite to use U8_NEXT

src/lua/lua_util.c
test/lua/unit/rspamd_util.lua

index 881257ed378a5c16834aeed65a45c6083cc05ecf..af4673af8309d29a7e2a7dc5faed83e20bc4d99d 100644 (file)
@@ -2513,15 +2513,18 @@ lua_util_is_utf_mixed_script(lua_State *L)
 {
        LUA_TRACE_POINT;
        gsize len_of_string;
-       const gchar *end, *string_to_check = lua_tolstring (L, 1, &len_of_string);
+       const gchar *string_to_check = lua_tolstring (L, 1, &len_of_string);
        UScriptCode last_script_code = USCRIPT_INVALID_CODE;
        UErrorCode uc_err = U_ZERO_ERROR;
 
-       if (string_to_check && g_utf8_validate (string_to_check, len_of_string, &end)) {
-               len_of_string = g_utf8_strlen (string_to_check, len_of_string);
-
-               for(; *string_to_check; string_to_check = g_utf8_next_char(string_to_check)){
-                       gunichar char_to_check = g_utf8_get_char(string_to_check);
+       if (string_to_check) {
+               uint index = 0;
+               UChar32 char_to_check = 0;
+               while(index < len_of_string) {
+                       U8_NEXT(string_to_check, index, len_of_string, char_to_check);
+                       if (char_to_check < 0 ) {
+                               return luaL_error (L, "passed string is not valid utf");
+                       }
                        UScriptCode current_script_code = uscript_getScript(char_to_check, &uc_err);
                        if (uc_err != U_ZERO_ERROR){
                                msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err));
index 802b400d27f484cd717675032bf08ce1ef1ae419..859316be7fb7bed429391d4f6a7652bf0261abea 100644 (file)
@@ -5,36 +5,42 @@ context("Rspamd util for lua - check generic functions", function()
         {
             input = "test1",
             result = false,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "test test xxx",
             result = false,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "АбЫрвАлг",
             result = true,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "АбЫрвАлг example",
             result = true,
+            mixed_script = true,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "example ąłśćżłóę",
             result = false,
+            mixed_script = false,
             range_start = 0x0000,
             range_end = 0x017f
         },
         {
             input = "ąłśćżłóę АбЫрвАлг",
             result = true,
+            mixed_script = true,
             range_start = 0x0000,
             range_end = 0x017f
         },
@@ -64,4 +70,20 @@ context("Rspamd util for lua - check generic functions", function()
         assert_equal(res["letters"], 10)
         assert_equal(res["digits"], 2)
     end)
+
+    -- Register one is_utf_mixed_script test per entry of `cases`; each test
+    -- compares the call result with that entry's `mixed_script` field.
+    for idx, case in ipairs(cases) do
+        local title = "is_utf_mixed_script, test case #" .. idx
+
+        test(title, function()
+          assert_equal(case.mixed_script, util.is_utf_mixed_script(case.input))
+        end)
+    end
+
+    -- The call is wrapped in a closure so the argument is certain to reach
+    -- is_utf_mixed_script; an assert_error that only pcall()s the function it
+    -- is given would otherwise invoke it with no arguments at all.
+    test("is_utf_mixed_script, invalid utf str should return error", function()
+        assert_error(function() util.is_utf_mixed_script('\200\213\202') end)
+    end)
+
+    -- A lead byte whose continuation byte is missing is malformed UTF-8 too.
+    -- NOTE(review): the empty-string case is not asserted here -- the decoding
+    -- loop is skipped for a zero-length input, so the outcome depends on the
+    -- code that follows the loop; confirm it and add a dedicated case.
+    test("is_utf_mixed_script, truncated utf sequence should return error", function()
+        assert_error(function() util.is_utf_mixed_script('\195') end)
+    end)
 end)