Browse Source

[Minor] Add util.is_utf_mixed_script to lua

Add a new function which implements a PoC for checking
for mixed scripts in a UTF string. The behaviour is similar
to the single-string spoof detection in libicu before
version 58
tags/1.9.1
Miecio Za 5 years ago
parent
commit
7824318dca
1 changed files with 51 additions and 0 deletions
  1. 51
    0
      src/lua/lua_util.c

+ 51
- 0
src/lua/lua_util.c View File

@@ -29,6 +29,7 @@
#include <zlib.h>

#include "unicode/uspoof.h"
#include "unicode/uscript.h"

/***
* @module rspamd_util
@@ -393,6 +394,14 @@ LUA_FUNCTION_DEF (util, normalize_prob);
*/
LUA_FUNCTION_DEF (util, is_utf_spoofed);

/**
* @function util.is_utf_mixed_script(str)
* Returns true if a string contains characters from more than one unicode script.
* Characters whose script is `Common` or `Inherited` are ignored when comparing.
* Raises an error if the argument is not a valid UTF-8 string.
* @param {string} str string to check
* @return {boolean} true if a string contains chars with mixed unicode script
*/
LUA_FUNCTION_DEF (util, is_utf_mixed_script);

/**
* @function util.is_utf_outside_range(str, range_start, range_end)
* Returns true if a string contains chars outside range
@@ -633,6 +642,7 @@ static const struct luaL_reg utillib_f[] = {
LUA_INTERFACE_DEF (util, caseless_hash),
LUA_INTERFACE_DEF (util, caseless_hash_fast),
LUA_INTERFACE_DEF (util, is_utf_spoofed),
LUA_INTERFACE_DEF (util, is_utf_mixed_script),
LUA_INTERFACE_DEF (util, is_utf_outside_range),
LUA_INTERFACE_DEF (util, get_string_stats),
LUA_INTERFACE_DEF (util, is_valid_utf8),
@@ -2498,6 +2508,47 @@ lua_util_is_utf_spoofed (lua_State *L)
return nres;
}

/*
 * Lua binding for util.is_utf_mixed_script(str).
 *
 * Walks the UTF-8 string at Lua stack index 1 and pushes `true` as soon as
 * two characters that belong to different unicode scripts are seen.
 * Characters whose script is USCRIPT_COMMON or USCRIPT_INHERITED are skipped,
 * so they never trigger a mismatch on their own.
 *
 * Pushes `false` when no mismatch is found, or when ICU reports an error for
 * a character (the error is logged). Raises a Lua error ("invalid arguments")
 * when the argument cannot be read as a string or is not valid UTF-8.
 *
 * Returns 1: a single boolean is left on the Lua stack.
 */
static gint
lua_util_is_utf_mixed_script (lua_State *L)
{
	LUA_TRACE_POINT;
	gsize len_of_string;
	const gchar *string_to_check = lua_tolstring (L, 1, &len_of_string);
	UScriptCode last_script_code = USCRIPT_INVALID_CODE;
	UErrorCode uc_err = U_ZERO_ERROR;

	/*
	 * The position of the first invalid byte is not needed, so no `end`
	 * out-parameter is requested. Validation with an explicit length also
	 * rejects embedded NUL bytes (see the GLib documentation), hence the
	 * NUL-terminated walk below visits the whole string.
	 */
	if (string_to_check == NULL ||
			!g_utf8_validate (string_to_check, len_of_string, NULL)) {
		return luaL_error (L, "invalid arguments");
	}

	for (; *string_to_check != '\0';
			string_to_check = g_utf8_next_char (string_to_check)) {
		gunichar char_to_check = g_utf8_get_char (string_to_check);
		UScriptCode current_script_code =
				uscript_getScript (char_to_check, &uc_err);

		if (uc_err != U_ZERO_ERROR) {
			msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err));
			lua_pushboolean (L, false);
			return 1;
		}

		/* Script-neutral characters do not take part in the comparison */
		if (current_script_code == USCRIPT_COMMON ||
				current_script_code == USCRIPT_INHERITED) {
			continue;
		}

		if (last_script_code == USCRIPT_INVALID_CODE) {
			/* First character with a concrete script: remember it */
			last_script_code = current_script_code;
		}
		else if (last_script_code != current_script_code) {
			/* A second, different script has been found */
			lua_pushboolean (L, true);
			return 1;
		}
	}

	lua_pushboolean (L, false);

	return 1;
}

static gint
lua_util_get_string_stats (lua_State *L)
{

Loading…
Cancel
Save