diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-08-02 17:00:14 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-08-02 17:00:14 +0100 |
commit | 06ba232b45946fc52c5d812551ac50c2343e3b99 (patch) | |
tree | 1c68e4e462e21c731c3a33ebd934920094796728 /src/libutil/cxx/utf8_util.cxx | |
parent | 8e65fac07f092bc0450bcaa611fccde3dd890fa6 (diff) | |
download | rspamd-06ba232b45946fc52c5d812551ac50c2343e3b99.tar.gz rspamd-06ba232b45946fc52c5d812551ac50c2343e3b99.zip |
[Feature] Add rspamd_utf8_strcmp utility
Diffstat (limited to 'src/libutil/cxx/utf8_util.cxx')
-rw-r--r-- | src/libutil/cxx/utf8_util.cxx | 173 |
1 files changed, 121 insertions, 52 deletions
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx index e42ef917f..8b99d1f35 100644 --- a/src/libutil/cxx/utf8_util.cxx +++ b/src/libutil/cxx/utf8_util.cxx @@ -20,9 +20,11 @@ #include <unicode/uchar.h> #include <unicode/normalizer2.h> #include <unicode/schriter.h> +#include <unicode/coll.h> #include <utility> #include <tuple> #include <string> +#include <limits> #include "utf8_util.h" #include "str_util.h" @@ -73,35 +75,6 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len) return ret; } -TEST_SUITE("utf8 utils") { - TEST_CASE("utf8 trim") { - std::pair<const char *, const char *> cases[] = { - {" \u200B""abc ", "abc"}, - {" ", ""}, - {" a", "a"}, - {"a ", "a"}, - {"a a", "a a"}, - {"abc", "abc"}, - {"a ", "a"}, - {" abc ", "abc"}, - {" abc ", "abc"}, - {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"}, - {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"}, - {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"}, - }; - - for (const auto &c : cases) { - std::string cpy{c.first}; - auto ns = cpy.size(); - auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns); - std::string res{nstart, ns}; - CHECK(res == std::string{c.second}); - } - } -} - - - enum rspamd_normalise_result rspamd_normalise_unicode_inplace(char *start, size_t *len) { @@ -184,30 +157,126 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len) return static_cast<enum rspamd_normalise_result>(ret); } +struct rspamd_icu_collate_storage { + icu::Collator* collator = nullptr; + rspamd_icu_collate_storage() { + UErrorCode success = U_ZERO_ERROR; + collator = icu::Collator::createInstance(icu::Locale::getEnglish(), success); + /* Ignore all difference except functional */ + collator->setStrength(icu::Collator::PRIMARY); + } + + ~rspamd_icu_collate_storage() { + if (collator) { + delete collator; + } + } +}; + +static rspamd_icu_collate_storage collate_storage; + +int +rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n) +{ + if (n >= std::numeric_limits<int>::max()) { + /* + * It's hard to say what to do here... But libicu wants int, so we fall + * back to g_ascii_strcasecmp which can deal with size_t + */ + return g_ascii_strncasecmp(s1, s2, n); + } + + UErrorCode success = U_ZERO_ERROR; + auto res = collate_storage.collator->compareUTF8({s1, (int) n}, {s2, (int) n}, + success); + + switch (res) { + case UCOL_EQUAL: + return 0; + case UCOL_GREATER: + return 1; + case UCOL_LESS: + default: + return -1; + } +} + TEST_SUITE("utf8 utils") { - TEST_CASE("utf8 normalise") { - std::tuple<const char *, const char *, int> cases[] = { - {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL}, - {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL}, - /* Zero width spaces */ - {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES}, - /* Special case of diacritic */ - {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL}, - /* Same with zw spaces */ - {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ", - RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES}, - /* Buffer overflow case */ - {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������", - RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR}, - }; - - for (const auto &c : cases) { - std::string cpy{std::get<0>(c)}; - auto ns = cpy.size(); - auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns); - cpy.resize(ns); - CHECK(cpy == std::string(std::get<1>(c))); - CHECK(res == std::get<2>(c)); +TEST_CASE("utf8 normalise") { + std::tuple<const char *, const char *, int> cases[] = { + {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL}, + {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL}, + /* Zero width spaces */ + {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Special case of diacritic */ + {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL}, + /* Same with zw spaces */ + {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ", + RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Buffer overflow case */ + {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������", + RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR}, + }; + + for (const auto &c : cases) { + std::string cpy{std::get<0>(c)}; + auto ns = cpy.size(); + auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns); + cpy.resize(ns); + CHECK(cpy == std::string(std::get<1>(c))); + CHECK(res == std::get<2>(c)); + } +} + +TEST_CASE("utf8 trim") { + std::pair<const char *, const char *> cases[] = { + {" \u200B""abc ", "abc"}, + {" ", ""}, + {" a", "a"}, + {"a ", "a"}, + {"a a", "a a"}, + {"abc", "abc"}, + {"a ", "a"}, + {" abc ", "abc"}, + {" abc ", "abc"}, + {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"}, + {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"}, + {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"}, + }; + + for (const auto &c : cases) { + std::string cpy{c.first}; + auto ns = cpy.size(); + auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns); + std::string res{nstart, ns}; + CHECK(res == std::string{c.second}); + } +} + + +TEST_CASE("utf8 strcmp") { + std::tuple<const char *, const char *, int, int> cases[] = { + {"abc", "abc", -1, 0}, + {"", "", -1, 0}, + {"aBc", "AbC", -1, 0}, + {"abc", "ab", 2, 0}, + {"теСт", "ТесТ", -1, 0}, + {"теСт", "Тезт", 4, 0}, + {"теСт", "Тезт", -1, 1}, + {"abc", "ABD", -1, -1}, + {"\0a\0", "\0a\1", 2, 0}, + {"\0a\0", "\0b\1", 3, -1}, + }; + + for (const auto &c : cases) { + auto [s1, s2, n, expected] = c; + if (n == -1) { + n = MIN(strlen(s1), strlen(s2)); + } + SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str()) { + auto ret = rspamd_utf8_strcmp(s1, s2, n); + CHECK(ret == expected); } } +} }
\ No newline at end of file |