diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-05-05 17:35:13 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-05-05 17:35:13 +0100 |
commit | 7f7e0530304343e9ada79844892f57b0a0a7b422 (patch) | |
tree | e529a3a8656ae7162db3aa165c154af05278d452 /src/lua | |
parent | 9d0a7d7b982e59b1845ae3e65bee22be0c9bb7d3 (diff) | |
download | rspamd-7f7e0530304343e9ada79844892f57b0a0a7b422.tar.gz rspamd-7f7e0530304343e9ada79844892f57b0a0a7b422.zip |
[Minor] Lua_util: Add normalize_utf8 utility
Issue: #4475
Diffstat (limited to 'src/lua')
-rw-r--r-- | src/lua/lua_util.c | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 4a9b6ae5c..2ac985c25 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -240,6 +240,19 @@ LUA_FUNCTION_DEF (util, strlen_utf8);
 LUA_FUNCTION_DEF (util, lower_utf8);
 
 /***
+ * @function util.normalize_utf8(str)
+ * Gets a string in UTF8 and normalises it to NFKC_Casefold form
+ * @param {string} str utf8 encoded string
+ * @return {string,integer} lowercased utf8 string + result of the normalisation (use bit.band to check):
+ * RSPAMD_UNICODE_NORM_NORMAL = 0,
+ * RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
+ * RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
+ * RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
+ * RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
+ */
+LUA_FUNCTION_DEF (util, normalize_utf8);
+
+/***
  * @function util.strequal_caseless(str1, str2)
  * Compares two strings regardless of their case using ascii comparison.
  * Returns `true` if `str1` is equal to `str2`
@@ -672,6 +685,7 @@ static const struct luaL_reg utillib_f[] = {
 	LUA_INTERFACE_DEF (util, parse_mail_address),
 	LUA_INTERFACE_DEF (util, strlen_utf8),
 	LUA_INTERFACE_DEF (util, lower_utf8),
+	LUA_INTERFACE_DEF (util, normalize_utf8),
 	LUA_INTERFACE_DEF (util, strequal_caseless),
 	LUA_INTERFACE_DEF (util, strequal_caseless_utf8),
 	LUA_INTERFACE_DEF (util, get_ticks),
@@ -1606,6 +1620,39 @@ lua_util_lower_utf8 (lua_State *L)
 }
 
 static gint
+lua_util_normalize_utf8 (lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t;
+	bool is_text = lua_type (L, 1) == LUA_TUSERDATA;
+
+	t = lua_check_text_or_string (L, 1);
+
+	if (!t) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	char *cpy = g_malloc (t->len + 1);
+	memcpy (cpy, t->start, t->len);
+	cpy[t->len] = '\0';
+	gsize len = t->len;
+	enum rspamd_utf8_normalise_result res = rspamd_normalise_unicode_inplace(cpy, &len);
+
+	if (is_text) {
+		struct rspamd_lua_text *out = lua_new_text (L, cpy, len, FALSE);
+		out->flags |= RSPAMD_TEXT_FLAG_OWN;
+	}
+	else {
+		lua_pushlstring(L, cpy, len);
+		g_free(cpy);
+	}
+
+	lua_pushinteger(L, res);
+
+	return 2;
+}
+
+static gint
 lua_util_strequal_caseless (lua_State *L)
 {
 	LUA_TRACE_POINT;