diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-05-17 16:34:35 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-05-17 16:34:35 +0100 |
commit | 58d60ea07f1478fe5fcc1d152d54237169e3bd43 (patch) | |
tree | c1487c990636254bb22cc82cf54bae364fc8d1d4 /src | |
parent | d2ed10141ba44a5de8ae8d54d73ee95cbe1e07cf (diff) | |
download | rspamd-58d60ea07f1478fe5fcc1d152d54237169e3bd43.tar.gz rspamd-58d60ea07f1478fe5fcc1d152d54237169e3bd43.zip |
[Rework] Use C++ version for unicode normalisation
Diffstat (limited to 'src')
-rw-r--r-- | src/libserver/html.c | 2 | ||||
-rw-r--r-- | src/libserver/url.h | 3 | ||||
-rw-r--r-- | src/libutil/cxx/utf8_util.cxx | 100 | ||||
-rw-r--r-- | src/libutil/cxx/utf8_util.h | 17 | ||||
-rw-r--r-- | src/libutil/str_util.c | 129 | ||||
-rw-r--r-- | src/libutil/str_util.h | 16 |
6 files changed, 120 insertions, 147 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index 8d7b722a5..cfdd0acef 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -2667,7 +2667,7 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, } } - rspamd_normalise_unicode_inplace (pool, url->visible_part, &dlen); + rspamd_normalise_unicode_inplace (url->visible_part, &dlen); } static gboolean diff --git a/src/libserver/url.h b/src/libserver/url.h index 72fce5f9e..4ace18f1a 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -6,6 +6,7 @@ #include "mem_pool.h" #include "khash.h" #include "fstring.h" +#include "libutil/cxx/utf8_util.h" #ifdef __cplusplus extern "C" { @@ -356,7 +357,7 @@ int rspamd_url_cmp_qsort(const void *u1, const void *u2); #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \ do { \ enum rspamd_normalise_result norm_res; \ - norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out)); \ + norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \ if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \ url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \ } \ diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx index f44d02671..6bca4b18e 100644 --- a/src/libutil/cxx/utf8_util.cxx +++ b/src/libutil/cxx/utf8_util.cxx @@ -18,6 +18,8 @@ #include <unicode/utypes.h> #include <unicode/utf8.h> #include <unicode/uchar.h> +#include <unicode/normalizer2.h> +#include <unicode/schriter.h> #include <utility> #include <string> @@ -98,3 +100,101 @@ TEST_SUITE("utf8 utils") { } + +enum rspamd_normalise_result +rspamd_normalise_unicode_inplace(char *start, size_t *len) +{ + UErrorCode uc_err = U_ZERO_ERROR; + const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err); + static icu::UnicodeSet zw_spaces{}; + + if (!zw_spaces.isFrozen()) { + /* Fill the set of invisible code points on first use; it is frozen afterwards */ + zw_spaces.add(0x200B); /* ZERO WIDTH SPACE */ + zw_spaces.add(0x200C); /* ZERO WIDTH NON-JOINER */ + zw_spaces.add(0x200D); /* ZERO WIDTH JOINER */ + zw_spaces.add(0xFEFF); /* ZERO WIDTH NO-BREAK SPACE (BOM) */ + zw_spaces.add(0x00AD); /* SOFT HYPHEN */ + 
zw_spaces.freeze(); + } + + int ret = RSPAMD_UNICODE_NORM_NORMAL; + + g_assert (U_SUCCESS (uc_err)); + + auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len)); + auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err); + + if (!U_SUCCESS (uc_err)) { + return RSPAMD_UNICODE_NORM_ERROR; + } + + /* Re-encode input as UTF8 over the original buffer, skipping zero width characters; returns bytes written */ + const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t { + icu::StringCharacterIterator it{input}; + size_t i = 0; + + while(it.hasNext()) { + auto uc = it.next32PostInc(); + + if (zw_spaces.contains(uc)) { + ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES; + } + else { + UBool err = 0; + U8_APPEND(start, i, *len, uc, err); /* capacity is the original *len: it is only overwritten after this lambda returns */ + + if (err) { + ret = RSPAMD_UNICODE_NORM_ERROR; /* invalid code point or result does not fit: other flags are dropped */ + + return i; + } + } + } + + return i; + }; + + if (is_normal != UNORM_YES) { + /* Need to normalise */ + ret |= RSPAMD_UNICODE_NORM_UNNORMAL; + + auto normalised = nfkc_norm->normalize(uc_string, uc_err); + + if (!U_SUCCESS (uc_err)) { + return RSPAMD_UNICODE_NORM_ERROR; + } + + *len = filter_zw_spaces_and_push_back(normalised); + } + else { + *len = filter_zw_spaces_and_push_back(uc_string); + } + + return static_cast<enum rspamd_normalise_result>(ret); +} + +TEST_SUITE("utf8 utils") { + TEST_CASE("utf8 normalise") { + std::tuple<const char *, const char *, int> cases[] = { + {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL}, + {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL}, + /* Zero width spaces */ + {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Special case of diacritic */ + {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL}, + /* Same with zw spaces */ + {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ", + RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES}, + }; + + for (const auto &c : cases) { + std::string cpy{std::get<0>(c)}; + auto ns = cpy.size(); + auto res = 
rspamd_normalise_unicode_inplace(cpy.data(), &ns); + cpy.resize(ns); + CHECK(cpy == std::string(std::get<1>(c))); + CHECK(res == std::get<2>(c)); + } + } +}
\ No newline at end of file diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h index 40bb53bf0..21add9bae 100644 --- a/src/libutil/cxx/utf8_util.h +++ b/src/libutil/cxx/utf8_util.h @@ -34,6 +34,23 @@ extern "C" { */ char* rspamd_string_unicode_trim_inplace (char *str, size_t *len); +enum rspamd_normalise_result { + RSPAMD_UNICODE_NORM_NORMAL = 0, /* no flags: input was already normalised */ + RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), /* input was not in normalised form */ + RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), /* zero width characters were found and removed */ + RSPAMD_UNICODE_NORM_ERROR = (1 << 2), /* conversion or normalisation failed */ + RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) /* converted result does not fit into the original buffer */ +}; + +/** + * Gets a string in UTF8 and normalises it to NFKC form + * Zero width characters are removed; the result is written back over the input + * @param start UTF8 buffer to normalise (modified in place) + * @param len in: length of the input in bytes, out: length of the result in bytes + * @return bitwise OR of rspamd_normalise_result flags + */ +enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len); + #ifdef __cplusplus } #endif diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 00774d588..1e92c8e54 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -3020,135 +3020,6 @@ rspamd_get_unicode_normalizer (void) #endif } - -enum rspamd_normalise_result -rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, gchar *start, - gsize *len) -{ -#if U_ICU_VERSION_MAJOR_NUM >= 44 - UErrorCode uc_err = U_ZERO_ERROR; - UConverter *utf8_conv = rspamd_get_utf8_converter (); - const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); - gint32 nsym, end; - UChar *src = NULL, *dest = NULL; - enum rspamd_normalise_result ret = 0; - gboolean has_invisible = FALSE; - - /* We first need to convert data to UChars :( */ - src = g_malloc ((*len + 1) * sizeof (*src)); - nsym = ucnv_toUChars (utf8_conv, src, *len + 1, - start, *len, &uc_err); - - if (!U_SUCCESS (uc_err)) { - msg_warn_pool_check ("cannot normalise URL, cannot convert to unicode: %s", - u_errorName (uc_err)); - ret |= RSPAMD_UNICODE_NORM_ERROR; - goto out; - } - - /* We can now check if we need to decompose */ - end = unorm2_spanQuickCheckYes 
(norm, src, nsym, &uc_err); - - if (!U_SUCCESS (uc_err)) { - msg_warn_pool_check ("cannot normalise URL, cannot check normalisation: %s", - u_errorName (uc_err)); - ret |= RSPAMD_UNICODE_NORM_ERROR; - goto out; - } - - for (gint32 i = 0; i < nsym; i ++) { - if (IS_ZERO_WIDTH_SPACE (src[i])) { - has_invisible = TRUE; - break; - } - } - - uc_err = U_ZERO_ERROR; - - if (end != nsym) { - /* No normalisation needed, but we may still have invisible spaces */ - /* We copy sub(src, 0, end) to dest and normalise the rest */ - ret |= RSPAMD_UNICODE_NORM_UNNORMAL; - dest = g_malloc (nsym * sizeof (*dest)); - memcpy (dest, src, end * sizeof (*dest)); - nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym, - src + end, nsym - end, &uc_err); - - if (!U_SUCCESS (uc_err)) { - if (uc_err != U_BUFFER_OVERFLOW_ERROR) { - msg_warn_pool_check ("cannot normalise URL: %s", - u_errorName (uc_err)); - ret |= RSPAMD_UNICODE_NORM_ERROR; - } - - goto out; - } - } - else if (!has_invisible) { - goto out; - } - else { - dest = src; - src = NULL; - } - - if (has_invisible) { - /* Also filter zero width spaces */ - gint32 new_len = 0; - UChar *t = dest, *h = dest; - - ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES; - - for (gint32 i = 0; i < nsym; i ++) { - if (!IS_ZERO_WIDTH_SPACE (*h)) { - *t++ = *h++; - new_len ++; - } - else { - h ++; - } - } - - nsym = new_len; - } - - /* We now convert it back to utf */ - nsym = ucnv_fromUChars (utf8_conv, start, *len, dest, nsym, &uc_err); - - if (!U_SUCCESS (uc_err)) { - msg_warn_pool_check ("cannot normalise URL, cannot convert to UTF8: %s" - " input length: %d chars, unicode length: %d utf16 symbols", - u_errorName (uc_err), (gint)*len, (gint)nsym); - - if (uc_err == U_BUFFER_OVERFLOW_ERROR) { - ret |= RSPAMD_UNICODE_NORM_OVERFLOW; - } - else { - ret |= RSPAMD_UNICODE_NORM_ERROR; - } - - goto out; - } - - *len = nsym; - -out: - - if (src) { - g_free (src); - } - - if (dest) { - g_free (dest); - } - - return ret; -#else - /* Kill that with fire please 
*/ - return FALSE; -#endif -} - gchar * rspamd_str_regexp_escape (const gchar *pattern, gsize slen, gsize *dst_len, enum rspamd_regexp_escape_flags flags) diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 427d6b94e..cfa37848f 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -475,23 +475,7 @@ struct UNormalizer2; const struct UNormalizer2 *rspamd_get_unicode_normalizer (void); -enum rspamd_normalise_result { - RSPAMD_UNICODE_NORM_NORMAL = 0, - RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), - RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), - RSPAMD_UNICODE_NORM_ERROR = (1 << 2), - RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) -}; -/** - * Gets a string in UTF8 and normalises it to NFKC_Casefold form - * @param pool optional memory pool used for logging purposes - * @param start - * @param len - * @return TRUE if a string has been normalised - */ -enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool, - gchar *start, gsize *len); enum rspamd_regexp_escape_flags { RSPAMD_REGEXP_ESCAPE_ASCII = 0, |