]> source.dussan.org Git - rspamd.git/commitdiff
[Minor] Add util.is_utf_mixed_script to lua
authorMiecio Za <miecio@miecio.net>
Mon, 18 Mar 2019 18:56:57 +0000 (19:56 +0100)
committerMiecio Za <miecio@miecio.net>
Thu, 21 Mar 2019 15:02:57 +0000 (16:02 +0100)
Add a new function which implements a PoC for checking
for mixed scripts in a UTF string. Behaviour is similar
to the single string spoof detection in libicu before
version 58

src/lua/lua_util.c

index 1a37eaef6a00eedf7237ef1414cf8cf02bd0c58b..881257ed378a5c16834aeed65a45c6083cc05ecf 100644 (file)
@@ -29,6 +29,7 @@
 #include <zlib.h>
 
 #include "unicode/uspoof.h"
+#include "unicode/uscript.h"
 
 /***
  * @module rspamd_util
@@ -393,6 +394,14 @@ LUA_FUNCTION_DEF (util, normalize_prob);
  */
 LUA_FUNCTION_DEF (util, is_utf_spoofed);
 
+/**
+* @function util.is_utf_mixed_script(str)
+* Returns true if a string mixes unicode scripts (Common and Inherited chars are ignored)
+* @param {string} str String to check
+* @return {boolean} true if a string contains chars with mixed unicode script
+*/
+LUA_FUNCTION_DEF (util, is_utf_mixed_script);
+
 /**
 * @function util.is_utf_outside_range(str, range_start, range_end)
 * Returns true if a string contains chars outside range
@@ -633,6 +642,7 @@ static const struct luaL_reg utillib_f[] = {
        LUA_INTERFACE_DEF (util, caseless_hash),
        LUA_INTERFACE_DEF (util, caseless_hash_fast),
        LUA_INTERFACE_DEF (util, is_utf_spoofed),
+       LUA_INTERFACE_DEF (util, is_utf_mixed_script),
        LUA_INTERFACE_DEF (util, is_utf_outside_range),
        LUA_INTERFACE_DEF (util, get_string_stats),
        LUA_INTERFACE_DEF (util, is_valid_utf8),
@@ -2498,6 +2508,47 @@ lua_util_is_utf_spoofed (lua_State *L)
        return nres;
 }
 
+/* Lua binding: true when the UTF-8 string at stack index 1 mixes scripts. */
+static gint
+lua_util_is_utf_mixed_script(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       gsize len_of_string;
+       const gchar *string_to_check = lua_tolstring (L, 1, &len_of_string);
+       UScriptCode last_script_code = USCRIPT_INVALID_CODE;
+       UErrorCode uc_err = U_ZERO_ERROR;
+
+       /* Validation also rejects embedded NULs, so the NUL-terminated walk is safe */
+       if (!string_to_check || !g_utf8_validate (string_to_check, len_of_string, NULL)) {
+               return luaL_error (L, "invalid arguments");
+       }
+
+       for (; *string_to_check; string_to_check = g_utf8_next_char (string_to_check)) {
+               gunichar char_to_check = g_utf8_get_char (string_to_check);
+               UScriptCode current_script_code = uscript_getScript (char_to_check, &uc_err);
+               /* U_FAILURE ignores ICU warning codes, which are not errors */
+               if (U_FAILURE (uc_err)) {
+                       msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err));
+                       lua_pushboolean (L, false);
+                       return 1;
+               }
+               /* Common/Inherited code points are script-neutral, so skip them */
+               if (current_script_code == USCRIPT_COMMON || current_script_code == USCRIPT_INHERITED) {
+                       continue;
+               }
+               if (last_script_code == USCRIPT_INVALID_CODE) {
+                       last_script_code = current_script_code; /* first concrete script seen */
+               }
+               else if (last_script_code != current_script_code) {
+                       lua_pushboolean (L, true);
+                       return 1;
+               }
+       }
+
+       lua_pushboolean (L, false);
+       return 1;
+}
+
 static gint
 lua_util_get_string_stats (lua_State *L)
 {