123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421 |
- /*-
- * Copyright 2021 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #define U_CHARSET_IS_UTF8 1
- #include <unicode/utypes.h>
- #include <unicode/utf8.h>
- #include <unicode/uchar.h>
- #include <unicode/normalizer2.h>
- #include <unicode/schriter.h>
- #include <unicode/coll.h>
- #include <unicode/translit.h>
- #include <utility>
- #include <tuple>
- #include <string>
- #include <limits>
- #include <memory>
-
- #include "utf8_util.h"
- #include "str_util.h"
-
- #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
- #include "doctest/doctest.h"
-
- const char *
- rspamd_string_unicode_trim_inplace(const char *str, size_t *len)
- {
- const auto *p = str, *end = str + *len;
- auto i = 0;
-
- while (i < *len) {
- UChar32 uc;
- auto prev_i = i;
-
- U8_NEXT(p, i, *len, uc);
-
- if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
- i = prev_i;
- break;
- }
- }
-
- p += i;
- (*len) -= i;
- i = end - p;
- auto *ret = p;
-
- if (i > 0) {
-
- while (i > 0) {
- UChar32 uc;
- auto prev_i = i;
-
- U8_PREV(p, 0, i, uc);
-
- if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
- i = prev_i;
- break;
- }
- }
-
- *len = i;
- }
-
- return ret;
- }
-
- enum rspamd_utf8_normalise_result
- rspamd_normalise_unicode_inplace(char *start, size_t *len)
- {
- UErrorCode uc_err = U_ZERO_ERROR;
- const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
- static icu::UnicodeSet zw_spaces{};
-
- if (!zw_spaces.isFrozen()) {
- /* Add zw spaces to the set */
- zw_spaces.add(0x200B);
- /* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */
- zw_spaces.add(0x200C);
- /* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */
- //zw_spaces.add(0x200D);
- zw_spaces.add(0xFEF);
- zw_spaces.add(0x00AD);
- zw_spaces.freeze();
- }
-
- int ret = RSPAMD_UNICODE_NORM_NORMAL;
-
- g_assert(U_SUCCESS(uc_err));
-
- auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
- auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
-
- if (!U_SUCCESS(uc_err)) {
- return RSPAMD_UNICODE_NORM_ERROR;
- }
-
- /* Filter zero width spaces and push resulting string back */
- const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
- icu::StringCharacterIterator it{input};
- size_t i = 0;
-
- while (it.hasNext()) {
- /* libicu is very 'special' if it comes to 'safe' macro */
- if (i >= *len) {
- ret |= RSPAMD_UNICODE_NORM_ERROR;
- break;
- }
-
- auto uc = it.next32PostInc();
-
- if (zw_spaces.contains(uc)) {
- ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
- }
- else {
- UBool err = 0;
-
- if (uc == 0xFFFD) {
- ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
- }
- U8_APPEND((uint8_t *) start, i, *len, uc, err);
-
- if (err) {
- ret |= RSPAMD_UNICODE_NORM_ERROR;
- break;
- }
- }
- }
-
- return i;
- };
-
- if (is_normal != UNORM_YES) {
- /* Need to normalise */
- ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
-
- auto normalised = nfkc_norm->normalize(uc_string, uc_err);
-
- if (!U_SUCCESS(uc_err)) {
- return RSPAMD_UNICODE_NORM_ERROR;
- }
-
- *len = filter_zw_spaces_and_push_back(normalised);
- }
- else {
- *len = filter_zw_spaces_and_push_back(uc_string);
- }
-
- return static_cast<enum rspamd_utf8_normalise_result>(ret);
- }
-
- char *
- rspamd_utf8_transliterate(const char *start, gsize len, gsize *target_len)
- {
- UErrorCode uc_err = U_ZERO_ERROR;
-
- static std::unique_ptr<icu::Transliterator> transliterator;
-
- if (!transliterator) {
- UParseError parse_err;
- static const auto rules = icu::UnicodeString{":: Any-Latin;"
- ":: [:Nonspacing Mark:] Remove;"
- ":: [:Punctuation:] Remove;"
- ":: [:Symbol:] Remove;"
- ":: [:Format:] Remove;"
- ":: Latin-ASCII;"
- ":: Lower();"
- ":: NULL;"
- "[:Space Separator:] > ' '"};
- transliterator = std::unique_ptr<icu::Transliterator>(
- icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err));
-
- if (U_FAILURE(uc_err) || !transliterator) {
- auto context = icu::UnicodeString(parse_err.postContext, sizeof(parse_err.preContext) / sizeof(UChar));
- g_error("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
- u_errorName(uc_err), parse_err.line, parse_err.offset);
- abort();
- }
- }
-
- auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
- transliterator->transliterate(uc_string);
-
- // We assume that all characters are now ascii
- auto dest_len = uc_string.length();
- char *dest = (char *) g_malloc(dest_len + 1);
- auto sink = icu::CheckedArrayByteSink(dest, dest_len);
- uc_string.toUTF8(sink);
-
- *target_len = sink.NumberOfBytesWritten();
- dest[*target_len] = '\0';
-
- return dest;
- }
-
- struct rspamd_icu_collate_storage {
- icu::Collator *collator = nullptr;
- rspamd_icu_collate_storage()
- {
- UErrorCode uc_err = U_ZERO_ERROR;
- collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);
-
- if (U_FAILURE(uc_err) || collator == nullptr) {
- g_error("fatal error: cannot init libicu collation engine: %s",
- u_errorName(uc_err));
- abort();
- }
- /* Ignore all difference except functional */
- collator->setStrength(icu::Collator::PRIMARY);
- }
-
- ~rspamd_icu_collate_storage()
- {
- if (collator) {
- delete collator;
- }
- }
- };
-
- static rspamd_icu_collate_storage collate_storage;
-
- int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2)
- {
- if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) {
- /*
- * It's hard to say what to do here... But libicu wants int, so we fall
- * back to g_ascii_strcasecmp which can deal with size_t
- */
- if (n1 == n2) {
- return g_ascii_strncasecmp(s1, s2, n1);
- }
- else {
- return n1 - n2;
- }
- }
-
- UErrorCode success = U_ZERO_ERROR;
- auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2},
- success);
-
- switch (res) {
- case UCOL_EQUAL:
- return 0;
- case UCOL_GREATER:
- return 1;
- case UCOL_LESS:
- default:
- return -1;
- }
- }
-
- int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
- {
- return rspamd_utf8_strcmp_sizes(s1, n, s2, n);
- }
-
- TEST_SUITE("utf8 utils")
- {
- TEST_CASE("utf8 normalise")
- {
- std::tuple<const char *, const char *, int> cases[] = {
- {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
- {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
- /* Zero width spaces */
- {"\xE2\x80\x8B"
- "те"
- "\xE2\x80\x8B"
- "ст",
- "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
- /* Special case of diacritic */
- {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
- // String containing a non-joiner character
- {"س\u200Cت", "ست", RSPAMD_UNICODE_NORM_ZERO_SPACES},
- // String containing a soft hyphen
- {"in\u00ADter\u00ADest\u00ADing", "interesting", RSPAMD_UNICODE_NORM_ZERO_SPACES},
- // String with ligature
- {"fish", "fish", RSPAMD_UNICODE_NORM_UNNORMAL},
- // String with accented characters and zero-width spaces
- {"café\u200Blatté\u200C", "cafélatté", RSPAMD_UNICODE_NORM_ZERO_SPACES},
- /* Same with zw spaces */
- {"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ",
- RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
- /* Buffer overflow case */
- {"u\xC2\xC2\xC2\xC2\xC2\xC2"
- "abcdef"
- "abcdef",
- "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
- RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ERROR},
- // String with a mix of special characters, ligatures, and zero-width spaces
- {"fish\u200Bcafé\u200C\u200Dlatté\u200D\u00AD", "fishcafé\u200Dlatté\u200D", RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
- // Empty string
- {"", "", RSPAMD_UNICODE_NORM_NORMAL},
- };
-
- for (const auto &c: cases) {
- std::string cpy{std::get<0>(c)};
- auto ns = cpy.size();
- auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
- cpy.resize(ns);
- CHECK(cpy == std::string(std::get<1>(c)));
- CHECK(res == std::get<2>(c));
- }
- }
-
- TEST_CASE("utf8 trim")
- {
- std::pair<const char *, const char *> cases[] = {
- {" \u200B"
- "abc ",
- "abc"},
- {" ", ""},
- {" a", "a"},
- {"a ", "a"},
- {"a a", "a a"},
- {"abc", "abc"},
- {"a ", "a"},
- {" abc ", "abc"},
- {" abc ", "abc"},
- {" \xE2\x80\x8B"
- "a\xE2\x80\x8B"
- "bc ",
- "a\xE2\x80\x8B"
- "bc"},
- {" \xE2\x80\x8B"
- "abc\xE2\x80\x8B ",
- "abc"},
- {" \xE2\x80\x8B"
- "abc \xE2\x80\x8B ",
- "abc"},
- };
-
- for (const auto &c: cases) {
- std::string cpy{c.first};
- auto ns = cpy.size();
- auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
- std::string res{nstart, ns};
- CHECK(res == std::string{c.second});
- }
- }
-
-
- TEST_CASE("utf8 strcmp")
- {
- std::tuple<const char *, const char *, int, int> cases[] = {
- {"abc", "abc", -1, 0},
- {"", "", -1, 0},
- {"aBc", "AbC", -1, 0},
- {"abc", "ab", 2, 0},
- {"теСт", "ТесТ", -1, 0},
- {"теСт", "Тезт", 4, 0},
- {"теСт", "Тезт", -1, 1},
- {"abc", "ABD", -1, -1},
- {"\0a\0", "\0a\1", 2, 0},
- {"\0a\0", "\0b\1", 3, -1},
- };
-
- for (const auto &c: cases) {
- auto [s1, s2, n, expected] = c;
- if (n == -1) {
- n = MIN(strlen(s1), strlen(s2));
- }
- SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str())
- {
- auto ret = rspamd_utf8_strcmp(s1, s2, n);
- CHECK(ret == expected);
- }
- }
- }
-
- TEST_CASE("transliterate")
- {
- using namespace std::literals;
- std::tuple<std::string_view, const char *> cases[] = {
- {"abc"sv, "abc"},
- {""sv, ""},
- {"тест"sv, "test"},
- // Diacritic to ascii
- {"Ύ"sv, "y"},
- // Chinese to pinyin
- {"你好"sv, "ni hao"},
- // Japanese to romaji
- {"こんにちは"sv, "konnichiha"},
- // Devanagari to latin
- {"नमस्ते"sv, "namaste"},
- // Arabic to latin
- {"مرحبا"sv, "mrhba"},
- // Remove of punctuation
- {"a.b.c"sv, "abc"},
- // Lowercase
- {"ABC"sv, "abc"},
- // Remove zero-width spaces
- {"\xE2\x80\x8B"
- "abc\xE2\x80\x8B"
- "def"sv,
- "abcdef"},
- };
-
- for (const auto &c: cases) {
- auto [s1, s2] = c;
- SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str())
- {
- gsize tlen;
- auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
- CHECK(tlen == strlen(s2));
- CHECK(strcmp(s2, ret) == 0);
- }
- }
- }
- }
|