From: Vsevolod Stakhov Date: Fri, 5 May 2023 16:35:13 +0000 (+0100) Subject: [Minor] Lua_util: Add normalize_utf8 utility X-Git-Tag: 3.6~137 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=7f7e0530304343e9ada79844892f57b0a0a7b422;p=rspamd.git [Minor] Lua_util: Add normalize_utf8 utility Issue: #4475 --- diff --git a/src/libserver/url.h b/src/libserver/url.h index 6918d96ad..0b326869b 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -359,7 +359,7 @@ int rspamd_url_cmp_qsort(const void *u1, const void *u2); */ #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \ do { \ - enum rspamd_normalise_result norm_res; \ + enum rspamd_utf8_normalise_result norm_res; \ norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \ if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \ url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \ diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx index dadd5fc6e..8d9fc31a9 100644 --- a/src/libutil/cxx/utf8_util.cxx +++ b/src/libutil/cxx/utf8_util.cxx @@ -75,7 +75,7 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len) return ret; } -enum rspamd_normalise_result +enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(char *start, size_t *len) { UErrorCode uc_err = U_ZERO_ERROR; @@ -156,7 +156,7 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len) *len = filter_zw_spaces_and_push_back(uc_string); } - return static_cast<enum rspamd_normalise_result>(ret); + return static_cast<enum rspamd_utf8_normalise_result>(ret); } struct rspamd_icu_collate_storage { diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h index a9476f78d..da4ebdb24 100644 --- a/src/libutil/cxx/utf8_util.h +++ b/src/libutil/cxx/utf8_util.h @@ -34,7 +34,7 @@ extern "C" { */ const char* rspamd_string_unicode_trim_inplace (const char *str, size_t *len); -enum rspamd_normalise_result { +enum rspamd_utf8_normalise_result { RSPAMD_UNICODE_NORM_NORMAL = 0, RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), @@ -49,7 +49,7 @@ enum rspamd_normalise_result { * @param len * @return TRUE if a string has been normalised */ -enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len); +enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len); /** * Compare two strings using libicu collator diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 4a9b6ae5c..2ac985c25 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -239,6 +239,19 @@ LUA_FUNCTION_DEF (util, strlen_utf8); */ LUA_FUNCTION_DEF (util, lower_utf8); +/*** + * @function util.normalize_utf8(str) + * Gets a string in UTF8 and normalises it to NFKC_Casefold form + * @param {string} str utf8 encoded string + * @return {string,integer} lowercased utf8 string + result of the normalisation (use bit.band to check): + * RSPAMD_UNICODE_NORM_NORMAL = 0, + * RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), + * RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), + * RSPAMD_UNICODE_NORM_ERROR = (1 << 2), + * RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) + */ +LUA_FUNCTION_DEF (util, normalize_utf8); + /*** * @function util.strequal_caseless(str1, str2) * Compares two strings regardless of their case using ascii comparison. 
@@ -672,6 +685,7 @@ static const struct luaL_reg utillib_f[] = { LUA_INTERFACE_DEF (util, parse_mail_address), LUA_INTERFACE_DEF (util, strlen_utf8), LUA_INTERFACE_DEF (util, lower_utf8), + LUA_INTERFACE_DEF (util, normalize_utf8), LUA_INTERFACE_DEF (util, strequal_caseless), LUA_INTERFACE_DEF (util, strequal_caseless_utf8), LUA_INTERFACE_DEF (util, get_ticks), @@ -1605,6 +1619,39 @@ lua_util_lower_utf8 (lua_State *L) return 1; } +static gint +lua_util_normalize_utf8 (lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_text *t; + bool is_text = lua_type (L, 1) == LUA_TUSERDATA; + + t = lua_check_text_or_string (L, 1); + + if (!t) { + return luaL_error(L, "invalid arguments"); + } + + char *cpy = g_malloc (t->len + 1); + memcpy (cpy, t->start, t->len); + cpy[t->len] = '\0'; + gsize len = t->len; + enum rspamd_utf8_normalise_result res = rspamd_normalise_unicode_inplace(cpy, &len); + + if (is_text) { + struct rspamd_lua_text *out = lua_new_text (L, cpy, len, FALSE); + out->flags |= RSPAMD_TEXT_FLAG_OWN; + } + else { + lua_pushlstring(L, cpy, len); + g_free(cpy); + } + + lua_pushinteger(L, res); + + return 2; +} + static gint lua_util_strequal_caseless (lua_State *L) {