summaryrefslogtreecommitdiffstats
path: root/src/libutil
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rspamd.com>2023-07-20 21:43:08 +0100
committerVsevolod Stakhov <vsevolod@rspamd.com>2023-07-20 21:43:08 +0100
commit4fef58d837dd1d8cd2457053b859c8f76c7251df (patch)
treee1bc9ed64d67db3d6514b44e28f50140c3bb7f6b /src/libutil
parent51c73b5ace58811b68ad89855160415fa7dbc07b (diff)
downloadrspamd-4fef58d837dd1d8cd2457053b859c8f76c7251df.tar.gz
rspamd-4fef58d837dd1d8cd2457053b859c8f76c7251df.zip
[Feature] Add function to transliterate utf8 to ascii with some normalisation
Diffstat (limited to 'src/libutil')
-rw-r--r--src/libutil/cxx/utf8_util.cxx80
-rw-r--r--src/libutil/cxx/utf8_util.h9
2 files changed, 89 insertions, 0 deletions
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
index 8d9fc31a9..4be7e9c58 100644
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -21,6 +21,7 @@
#include <unicode/normalizer2.h>
#include <unicode/schriter.h>
#include <unicode/coll.h>
+#include <unicode/translit.h>
#include <utility>
#include <tuple>
#include <string>
@@ -159,6 +160,50 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
return static_cast<enum rspamd_utf8_normalise_result>(ret);
}
+gchar*
+rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len)
+{
+ UErrorCode uc_err = U_ZERO_ERROR;
+
+ static const icu::Transliterator *transliterator = nullptr;
+
+ if (transliterator == nullptr) {
+ UParseError parse_err;
+ static const auto rules = icu::UnicodeString{":: Any-Latin;"
+ ":: [:Nonspacing Mark:] Remove;"
+ ":: [:Punctuation:] Remove;"
+ ":: [:Symbol:] Remove;"
+ ":: [:Format:] Remove;"
+ ":: Latin-ASCII;"
+ ":: Lower();"
+ ":: NULL;"
+ "[:Space Separator:] > ' '"
+ };
+ transliterator = icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err);
+
+ if (U_FAILURE(uc_err) || transliterator == nullptr) {
+ auto context = icu::UnicodeString(parse_err.postContext, sizeof(parse_err.preContext) / sizeof(UChar));
+ g_error ("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
+ u_errorName(uc_err), parse_err.line, parse_err.offset);
+ abort();
+ }
+ }
+
+ auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
+ transliterator->transliterate(uc_string);
+
+ // We assume that all characters are now ascii
+ auto dest_len = uc_string.length();
+ gchar *dest = (gchar *)g_malloc(dest_len + 1);
+ auto sink = icu::CheckedArrayByteSink(dest, dest_len);
+ uc_string.toUTF8(sink);
+
+ *target_len = sink.NumberOfBytesWritten();
+ dest[*target_len] = '\0';
+
+ return dest;
+}
+
struct rspamd_icu_collate_storage {
icu::Collator* collator = nullptr;
rspamd_icu_collate_storage() {
@@ -310,4 +355,39 @@ TEST_CASE("utf8 strcmp") {
}
}
}
+
+TEST_CASE("transliterate") {
+ using namespace std::literals;
+ std::tuple<std::string_view, const char *> cases[] = {
+ {"abc"sv, "abc"},
+ {""sv, ""},
+ {"тест"sv, "test"},
+ // Diacritic to ascii
+ {"Ύ"sv, "y"},
+ // Chinese to pinyin
+ {"你好"sv, "ni hao"},
+ // Japanese to romaji
+ {"こんにちは"sv, "konnichiha"},
+ // Devanagari to latin
+ {"नमस्ते"sv, "namaste"},
+ // Arabic to latin
+ {"مرحبا"sv, "mrhba"},
+ // Remove of punctuation
+ {"a.b.c"sv, "abc"},
+ // Lowercase
+ {"ABC"sv, "abc"},
+ // Remove zero-width spaces
+ {"\xE2\x80\x8B""abc\xE2\x80\x8B""def"sv, "abcdef"},
+ };
+
+ for (const auto &c : cases) {
+ auto [s1, s2] = c;
+ SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str()) {
+ gsize tlen;
+ auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
+ CHECK(tlen == strlen(s2));
+ CHECK(strcmp(s2, ret) == 0);
+ }
+ }
+}
} \ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
index da4ebdb24..7f28ea45e 100644
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -52,6 +52,15 @@ enum rspamd_utf8_normalise_result {
enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
/**
+ * Transliterate a string to ASCII
+ * @param start
+ * @param len
+ * @param target_len
+ * @return a new string that should be freed with g_free
+ */
+gchar* rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len);
+
+/**
* Compare two strings using libicu collator
* @param s1
* @param s2