From 4fef58d837dd1d8cd2457053b859c8f76c7251df Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Thu, 20 Jul 2023 21:43:08 +0100
Subject: [Feature] Add function to transliterate utf8 to ascii with some normalisation

---
 src/libutil/cxx/utf8_util.cxx | 94 +++++++++++++++++++++++++++++++++++++++++++
 src/libutil/cxx/utf8_util.h   |  9 +++++
 2 files changed, 103 insertions(+)

(limited to 'src/libutil')

diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
index 8d9fc31a9..4be7e9c58 100644
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -159,6 +160,63 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
 	return static_cast(ret);
 }
 
+/**
+ * Transliterates UTF-8 input to lowercase Latin text: converts any script to
+ * Latin, strips nonspacing marks, punctuation, symbols and format characters,
+ * folds Latin to ASCII and maps space separators to a plain space.
+ * @param start input buffer in UTF-8 (read by explicit length, not by NUL)
+ * @param len length of the input in bytes
+ * @param target_len output: number of bytes written, excluding the trailing NUL
+ * @return NUL terminated string allocated with g_malloc, release with g_free
+ */
+gchar*
+rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+
+	/* Created on the first call and never released afterwards */
+	static const icu::Transliterator *transliterator = nullptr;
+
+	if (transliterator == nullptr) {
+		UParseError parse_err;
+		static const auto rules = icu::UnicodeString{":: Any-Latin;"
+			":: [:Nonspacing Mark:] Remove;"
+			":: [:Punctuation:] Remove;"
+			":: [:Symbol:] Remove;"
+			":: [:Format:] Remove;"
+			":: Latin-ASCII;"
+			":: Lower();"
+			":: NULL;"
+			"[:Space Separator:] > ' '"
+		};
+		transliterator = icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err);
+
+		if (U_FAILURE(uc_err) || transliterator == nullptr) {
+			g_error ("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
+				u_errorName(uc_err), parse_err.line, parse_err.offset);
+			abort();
+		}
+	}
+
+	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
+	transliterator->transliterate(uc_string);
+
+	/*
+	 * The rules aim at ASCII output, but code points without a Latin mapping
+	 * can survive. One UTF-16 code unit expands to at most 3 UTF-8 bytes, so
+	 * size the buffer for that worst case instead of truncating the result.
+	 */
+	const auto dest_cap = static_cast<gsize>(uc_string.length()) * 3;
+	auto *dest = static_cast<gchar *>(g_malloc(dest_cap + 1));
+	auto sink = icu::CheckedArrayByteSink(dest, static_cast<int32_t>(dest_cap));
+	uc_string.toUTF8(sink);
+
+	*target_len = sink.NumberOfBytesWritten();
+	dest[*target_len] = '\0';
+
+	return dest;
+}
+
 struct rspamd_icu_collate_storage {
 	icu::Collator* collator = nullptr;
 	rspamd_icu_collate_storage() {
@@ -310,4 +368,40 @@ TEST_CASE("utf8 strcmp") {
 		}
 	}
 }
+
+TEST_CASE("transliterate") {
+	using namespace std::literals;
+	std::tuple<std::string_view, const char *> cases[] = {
+		{"abc"sv, "abc"},
+		{""sv, ""},
+		{"тест"sv, "test"},
+		// Diacritic to ascii
+		{"Ύ"sv, "y"},
+		// Chinese to pinyin
+		{"你好"sv, "ni hao"},
+		// Japanese to romaji
+		{"こんにちは"sv, "konnichiha"},
+		// Devanagari to latin
+		{"नमस्ते"sv, "namaste"},
+		// Arabic to latin
+		{"مرحبا"sv, "mrhba"},
+		// Remove of punctuation
+		{"a.b.c"sv, "abc"},
+		// Lowercase
+		{"ABC"sv, "abc"},
+		// Remove zero-width spaces
+		{"\xE2\x80\x8B""abc\xE2\x80\x8B""def"sv, "abcdef"},
+	};
+
+	for (const auto &c : cases) {
+		auto [s1, s2] = c;
+		SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str()) {
+			gsize tlen;
+			auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
+			CHECK(tlen == strlen(s2));
+			CHECK(strcmp(s2, ret) == 0);
+			g_free(ret);
+		}
+	}
+}
 }
\ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
index da4ebdb24..7f28ea45e 100644
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -51,6 +51,15 @@ enum rspamd_utf8_normalise_result {
  */
 enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
 
+/**
+ * Transliterate a string to ASCII
+ * @param start input string in UTF-8
+ * @param len length of the input in bytes
+ * @param target_len output: length of the returned string in bytes
+ * @return a new string that should be freed with g_free
+ */
+gchar* rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len);
+
 /**
  * Compare two strings using libicu collator
  * @param s1
-- 
cgit v1.2.3