From 06ba232b45946fc52c5d812551ac50c2343e3b99 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 2 Aug 2021 17:00:14 +0100 Subject: [PATCH] [Feature] Add rspamd_utf8_strcmp utility --- src/libutil/cxx/utf8_util.cxx | 173 ++++++++++++++++++++++++---------- src/libutil/cxx/utf8_util.h | 9 ++ 2 files changed, 130 insertions(+), 52 deletions(-) diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx index e42ef917f..8b99d1f35 100644 --- a/src/libutil/cxx/utf8_util.cxx +++ b/src/libutil/cxx/utf8_util.cxx @@ -20,9 +20,11 @@ #include #include #include +#include #include #include #include +#include #include "utf8_util.h" #include "str_util.h" @@ -73,35 +75,6 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len) return ret; } -TEST_SUITE("utf8 utils") { - TEST_CASE("utf8 trim") { - std::pair cases[] = { - {" \u200B""abc ", "abc"}, - {" ", ""}, - {" a", "a"}, - {"a ", "a"}, - {"a a", "a a"}, - {"abc", "abc"}, - {"a ", "a"}, - {" abc ", "abc"}, - {" abc ", "abc"}, - {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"}, - {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"}, - {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"}, - }; - - for (const auto &c : cases) { - std::string cpy{c.first}; - auto ns = cpy.size(); - auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns); - std::string res{nstart, ns}; - CHECK(res == std::string{c.second}); - } - } -} - - - enum rspamd_normalise_result rspamd_normalise_unicode_inplace(char *start, size_t *len) { @@ -184,30 +157,126 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len) return static_cast(ret); } +struct rspamd_icu_collate_storage { /* Holder for the ICU collator: created in the constructor, deleted in the destructor */ + icu::Collator* collator = nullptr; + rspamd_icu_collate_storage() { + UErrorCode success = U_ZERO_ERROR; + collator = icu::Collator::createInstance(icu::Locale::getEnglish(), success); /* NOTE(review): success is never inspected and collator is used unconditionally on the next line - confirm createInstance cannot fail or return null here */ + /* Primary strength: ignore all differences except functional ones (the strcmp test cases below expect case variants to compare equal) */ + collator->setStrength(icu::Collator::PRIMARY); + } + + ~rspamd_icu_collate_storage() { + if (collator) { + delete 
collator; + } + } +}; + +static rspamd_icu_collate_storage collate_storage; /* single file-level instance used by rspamd_utf8_strcmp below */ + +int +rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n) +{ /* Compares the first n bytes of s1 and s2 with the shared ICU collator and maps the result to 0, 1 or -1 */ + if (n >= std::numeric_limits::max()) { + /* + * It is hard to say what to do here... But libicu wants int, so we fall + * back to g_ascii_strncasecmp which can deal with size_t + */ + return g_ascii_strncasecmp(s1, s2, n); + } + + UErrorCode success = U_ZERO_ERROR; + auto res = collate_storage.collator->compareUTF8({s1, (int) n}, {s2, (int) n}, + success); /* NOTE(review): success is not checked after the call, and collator is dereferenced without a null check - confirm this is intended */ + + switch (res) { + case UCOL_EQUAL: + return 0; + case UCOL_GREATER: + return 1; + case UCOL_LESS: + default: /* any result other than equal or greater is reported as -1 */ + return -1; + } +} + TEST_SUITE("utf8 utils") { - TEST_CASE("utf8 normalise") { - std::tuple cases[] = { - {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL}, - {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL}, - /* Zero width spaces */ - {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES}, - /* Special case of diacritic */ - {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL}, - /* Same with zw spaces */ - {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ", - RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES}, - /* Buffer overflow case */ - {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������", - RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR}, - }; - - for (const auto &c : cases) { - std::string cpy{std::get<0>(c)}; - auto ns = cpy.size(); - auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns); - cpy.resize(ns); - CHECK(cpy == std::string(std::get<1>(c))); - CHECK(res == std::get<2>(c)); +TEST_CASE("utf8 normalise") { + std::tuple cases[] = { + {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL}, + {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL}, + /* Zero width spaces */ + {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Special case of diacritic */ + {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL}, + /* Same with zw 
spaces */ + {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ", + RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Buffer overflow case */ + {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������", + RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR}, + }; + + for (const auto &c : cases) { + std::string cpy{std::get<0>(c)}; + auto ns = cpy.size(); + auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns); + cpy.resize(ns); + CHECK(cpy == std::string(std::get<1>(c))); + CHECK(res == std::get<2>(c)); + } +} + +TEST_CASE("utf8 trim") { + std::pair cases[] = { + {" \u200B""abc ", "abc"}, + {" ", ""}, + {" a", "a"}, + {"a ", "a"}, + {"a a", "a a"}, + {"abc", "abc"}, + {"a ", "a"}, + {" abc ", "abc"}, + {" abc ", "abc"}, + {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"}, + {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"}, + {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"}, + }; + + for (const auto &c : cases) { + std::string cpy{c.first}; + auto ns = cpy.size(); + auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns); + std::string res{nstart, ns}; + CHECK(res == std::string{c.second}); + } +} + + +TEST_CASE("utf8 strcmp") { + std::tuple cases[] = { /* each case: s1, s2, n (-1 selects the shorter strlen), expected result */ + {"abc", "abc", -1, 0}, + {"", "", -1, 0}, + {"aBc", "AbC", -1, 0}, + {"abc", "ab", 2, 0}, + {"теСт", "ТесТ", -1, 0}, + {"теСт", "Тезт", 4, 0}, + {"теСт", "Тезт", -1, 1}, + {"abc", "ABD", -1, -1}, + {"\0a\0", "\0a\1", 2, 0}, + {"\0a\0", "\0b\1", 3, -1}, + }; + + for (const auto &c : cases) { + auto [s1, s2, n, expected] = c; + if (n == -1) { /* -1 is a placeholder: compare up to the length of the shorter string */ + n = MIN(strlen(s1), strlen(s2)); + } + SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str()) { + auto ret = rspamd_utf8_strcmp(s1, s2, n); + CHECK(ret == expected); } } +} } \ No newline at end of file diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h index 242e03f00..28bd6a144 100644 --- a/src/libutil/cxx/utf8_util.h +++ b/src/libutil/cxx/utf8_util.h @@ -51,6 +51,15 @@ enum rspamd_normalise_result { 
*/ enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len); +/** + * Compare two strings using the libicu collator (English locale, primary strength) + * @param s1 first string + * @param s2 second string + * @param n number of bytes of each string to compare + * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2. + */ +int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n); + #ifdef __cplusplus } #endif -- 2.39.5