aboutsummaryrefslogtreecommitdiffstats
path: root/src/lua/lua_util.c
diff options
context:
space:
mode:
authorMiecio Za <miecio@miecio.net>2019-03-18 19:56:57 +0100
committerMiecio Za <miecio@miecio.net>2019-03-21 16:02:57 +0100
commit7824318dca65d223df397329b3eabde86c3fde02 (patch)
treedb460952d782db42c32dda1d182752e95c62b676 /src/lua/lua_util.c
parent198e242157ed81b871671f6a77e3d525a57350a5 (diff)
downloadrspamd-7824318dca65d223df397329b3eabde86c3fde02.tar.gz
rspamd-7824318dca65d223df397329b3eabde86c3fde02.zip
[Minor] Add util.is_utf_mixed_script to lua
Add a new function which implements a PoC for checking mixed scripts in a UTF string. Behaviour is similar to single-string spoof detection in libicu before version 58.
Diffstat (limited to 'src/lua/lua_util.c')
-rw-r--r--src/lua/lua_util.c51
1 files changed, 51 insertions, 0 deletions
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1a37eaef6..881257ed3 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -29,6 +29,7 @@
#include <zlib.h>
#include "unicode/uspoof.h"
+#include "unicode/uscript.h"
/***
* @module rspamd_util
@@ -394,6 +395,14 @@ LUA_FUNCTION_DEF (util, normalize_prob);
LUA_FUNCTION_DEF (util, is_utf_spoofed);
/**
+* @function util.is_utf_mixed_script(str)
+* Returns true if a string contains mixed unicode scripts
+* @param {string} String to check
+* @return {boolean} true if a string contains chars with mixed unicode script
+*/
+LUA_FUNCTION_DEF (util, is_utf_mixed_script);
+
+/**
* @function util.is_utf_outside_range(str, range_start, range_end)
* Returns true if a string contains chars outside range
* @param {string} String to check
@@ -633,6 +642,7 @@ static const struct luaL_reg utillib_f[] = {
LUA_INTERFACE_DEF (util, caseless_hash),
LUA_INTERFACE_DEF (util, caseless_hash_fast),
LUA_INTERFACE_DEF (util, is_utf_spoofed),
+ LUA_INTERFACE_DEF (util, is_utf_mixed_script),
LUA_INTERFACE_DEF (util, is_utf_outside_range),
LUA_INTERFACE_DEF (util, get_string_stats),
LUA_INTERFACE_DEF (util, is_valid_utf8),
@@ -2499,6 +2509,47 @@ lua_util_is_utf_spoofed (lua_State *L)
}
static gint
+lua_util_is_utf_mixed_script(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ gsize len_of_string;
+ const gchar *end, *string_to_check = lua_tolstring (L, 1, &len_of_string);
+ UScriptCode last_script_code = USCRIPT_INVALID_CODE;
+ UErrorCode uc_err = U_ZERO_ERROR;
+
+ if (string_to_check && g_utf8_validate (string_to_check, len_of_string, &end)) {
+ len_of_string = g_utf8_strlen (string_to_check, len_of_string);
+
+ for(; *string_to_check; string_to_check = g_utf8_next_char(string_to_check)){
+ gunichar char_to_check = g_utf8_get_char(string_to_check);
+ UScriptCode current_script_code = uscript_getScript(char_to_check, &uc_err);
+ if (uc_err != U_ZERO_ERROR){
+ msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err));
+ lua_pushboolean (L, false);
+ return 1;
+ }
+ if ( current_script_code != USCRIPT_COMMON && current_script_code != USCRIPT_INHERITED ){
+ if (last_script_code == USCRIPT_INVALID_CODE ){
+ last_script_code = current_script_code;
+ } else {
+ if ( last_script_code != current_script_code ){
+ lua_pushboolean (L, true);
+ return 1;
+ }
+ }
+ }
+ }
+ }
+ else {
+ return luaL_error (L, "invalid arguments");
+ }
+
+ lua_pushboolean (L, false);
+
+ return 1;
+}
+
+static gint
lua_util_get_string_stats (lua_State *L)
{
LUA_TRACE_POINT;