[Feature] Add rspamd_utf8_strcmp utility

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 2 Aug 2021 16:00:14 +0000 (17:00 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 2 Aug 2021 16:00:14 +0000 (17:00 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 2 Aug 2021 16:00:14 +0000 (17:00 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 2 Aug 2021 16:00:14 +0000 (17:00 +0100)
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx

index e42ef917fa300338d0afe27fbf05bf4e0cae519e..8b99d1f35c88dce3b37ecad12dd8b2c2c7167cf4 100644 (file)
--- a/src/libutil/cxx/utf8_util.cxx
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -20,9 +20,11 @@
  #include <unicode/uchar.h>
  #include <unicode/normalizer2.h>
  #include <unicode/schriter.h>
+#include <unicode/coll.h>
  #include <utility>
  #include <tuple>
  #include <string>
+#include <limits>
  
  #include "utf8_util.h"
  #include "str_util.h"
@@ -73,35 +75,6 @@ rspamd_string_unicode_trim_inplace (const char *str, size_t *len)
         return ret;
  }
  
-TEST_SUITE("utf8 utils") {
-       TEST_CASE("utf8 trim") {
-               std::pair<const char *, const char *> cases[] = {
-                               {" \u200B""abc ", "abc"},
-                               {"   ",  ""},
-                               {"   a", "a"},
-                               {"a   ", "a"},
-                               {"a a",  "a a"},
-                               {"abc",  "abc"},
-                               {"a ", "a"},
-                               {"   abc      ", "abc"},
-                               {" abc ", "abc"},
-                               {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
-                               {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
-                               {" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
-               };
-
-               for (const auto &c : cases) {
-                       std::string cpy{c.first};
-                       auto ns = cpy.size();
-                       auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
-                       std::string res{nstart, ns};
-                       CHECK(res == std::string{c.second});
-               }
-       }
-}
-
-
-
  enum rspamd_normalise_result
  rspamd_normalise_unicode_inplace(char *start, size_t *len)
  {
@@ -184,30 +157,126 @@ rspamd_normalise_unicode_inplace(char *start, size_t *len)
         return static_cast<enum rspamd_normalise_result>(ret);
  }
  
+/**
+ * Owns the ICU collator used by rspamd_utf8_strcmp.
+ *
+ * The collator is created for the English locale and switched to PRIMARY
+ * strength. If ICU fails to create it, `collator` is left as nullptr.
+ */
+struct rspamd_icu_collate_storage {
+       icu::Collator* collator = nullptr;
+       rspamd_icu_collate_storage() {
+               UErrorCode success = U_ZERO_ERROR;
+               collator = icu::Collator::createInstance(icu::Locale::getEnglish(), success);
+
+               if (U_FAILURE(success)) {
+                       /* Do not keep (or dereference) anything from a failed creation */
+                       delete collator;
+                       collator = nullptr;
+               }
+               else if (collator != nullptr) {
+                       /* Ignore all differences except the primary ones */
+                       collator->setStrength(icu::Collator::PRIMARY);
+               }
+       }
+
+       /* The raw pointer is owned: forbid copies to avoid a double delete */
+       rspamd_icu_collate_storage(const rspamd_icu_collate_storage &) = delete;
+       rspamd_icu_collate_storage &operator=(const rspamd_icu_collate_storage &) = delete;
+
+       ~rspamd_icu_collate_storage() {
+               /* delete on nullptr is a no-op, no check is needed */
+               delete collator;
+       }
+};
+
+/* File-local collator holder, used by rspamd_utf8_strcmp below */
+static rspamd_icu_collate_storage collate_storage;
+
+/**
+ * Compare the first `n` bytes of two UTF-8 strings with the file-local ICU
+ * collator.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param n number of bytes of each string to compare
+ * @return 0 if the strings compare equal, a positive value if s1 is greater
+ * than s2 and a negative value otherwise (exactly 1 / -1 on the ICU path)
+ */
+int
+rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
+{
+       if (n >= static_cast<gsize>(std::numeric_limits<int>::max()) ||
+                       collate_storage.collator == nullptr) {
+               /*
+                * libicu wants an int length, and the collator is missing if ICU
+                * has failed to create it: in both cases fall back to
+                * g_ascii_strncasecmp which can deal with size_t
+                */
+               return g_ascii_strncasecmp(s1, s2, n);
+       }
+
+       UErrorCode success = U_ZERO_ERROR;
+       const auto len = static_cast<int>(n);
+       auto res = collate_storage.collator->compareUTF8({s1, len}, {s2, len},
+                       success);
+
+       if (U_FAILURE(success)) {
+               /* The collation result is meaningless on error, do not report it */
+               return g_ascii_strncasecmp(s1, s2, n);
+       }
+
+       switch (res) {
+       case UCOL_EQUAL:
+               return 0;
+       case UCOL_GREATER:
+               return 1;
+       case UCOL_LESS:
+       default:
+               return -1;
+       }
+}
+
  TEST_SUITE("utf8 utils") {
-       TEST_CASE("utf8 normalise") {
-               std::tuple<const char *, const char *, int> cases[] = {
-                               {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
-                               {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
-                               /* Zero width spaces */
-                               {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
-                               /* Special case of diacritic */
-                               {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
-                               /* Same with zw spaces */
-                               {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
-                                                               RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
-                               /* Buffer overflow case */
-                               {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������",
-                                                               RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
-               };
-
-               for (const auto &c : cases) {
-                       std::string cpy{std::get<0>(c)};
-                       auto ns = cpy.size();
-                       auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
-                       cpy.resize(ns);
-                       CHECK(cpy == std::string(std::get<1>(c)));
-                       CHECK(res == std::get<2>(c));
+/*
+ * Table-driven check of rspamd_normalise_unicode_inplace: each row is
+ * (input, expected buffer after the call, expected result flags).
+ */
+TEST_CASE("utf8 normalise") {
+       std::tuple<const char *, const char *, int> cases[] = {
+                       {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+                       {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+                       /* Zero width spaces */
+                       {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+                       /* Special case of diacritic */
+                       {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+                       /* Same with zw spaces */
+                       {"13\u200C_\u0020\u0308\u0301\u038e\u03ab\u200D", "13_ ̈́ΎΫ",
+                                       RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
+                       /* Buffer overflow case */
+                       {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u������",
+                                       RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
+       };
+
+       for (const auto &c : cases) {
+               /* The function works in place, so give it a private mutable copy */
+               std::string cpy{std::get<0>(c)};
+               auto ns = cpy.size();
+               auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
+               /* ns now holds the length written back by the call */
+               cpy.resize(ns);
+               CHECK(cpy == std::string(std::get<1>(c)));
+               CHECK(res == std::get<2>(c));
+       }
+}
+
+/*
+ * Table-driven check of rspamd_string_unicode_trim_inplace: each row is
+ * (input, expected string after trimming).
+ */
+TEST_CASE("utf8 trim") {
+       std::pair<const char *, const char *> cases[] = {
+                       {" \u200B""abc ", "abc"},
+                       {"   ",  ""},
+                       {"   a", "a"},
+                       {"a   ", "a"},
+                       {"a a",  "a a"},
+                       {"abc",  "abc"},
+                       {"a ", "a"},
+                       {"   abc      ", "abc"},
+                       {" abc ", "abc"},
+                       {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
+                       {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
+                       {" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
+       };
+
+       for (const auto &c : cases) {
+               /* Work on a private copy: the call takes a mutable buffer */
+               std::string cpy{c.first};
+               auto ns = cpy.size();
+               auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
+               /* The result is described by the returned pointer and the updated ns */
+               std::string res{nstart, ns};
+               CHECK(res == std::string{c.second});
+       }
+}
+
+
+/*
+ * Table-driven check of rspamd_utf8_strcmp: each row is
+ * (s1, s2, n, expected result); n == -1 means "use the shorter strlen".
+ */
+TEST_CASE("utf8 strcmp") {
+       std::tuple<const char *, const char *, int, int> cases[] = {
+                       {"abc", "abc", -1, 0},
+                       {"",  "", -1, 0},
+                       {"aBc", "AbC", -1, 0},
+                       {"abc", "ab", 2, 0},
+                       {"теСт", "ТесТ", -1, 0},
+                       {"теСт", "Тезт", 4, 0},
+                       {"теСт", "Тезт", -1, 1},
+                       {"abc", "ABD", -1, -1},
+                       {"\0a\0", "\0a\1", 2, 0},
+                       {"\0a\0", "\0b\1", 3, -1},
+       };
+
+       int case_idx = 0;
+
+       for (const auto &c : cases) {
+               auto [s1, s2, n, expected] = c;
+               if (n == -1) {
+                       n = MIN(strlen(s1), strlen(s2));
+               }
+               /*
+                * Subcase names must be unique: some rows share the same strings and
+                * some start with a NUL byte (so they print as empty), while doctest
+                * skips a subcase whose name it has already traversed at this line.
+                * The row index makes every name distinct.
+                */
+               const auto subcase_name = std::string("test case #") +
+                               std::to_string(case_idx++) + ": " + s1 + " <=> " + s2;
+               SUBCASE(subcase_name.c_str()) {
+                       auto ret = rspamd_utf8_strcmp(s1, s2, n);
+                       CHECK(ret == expected);
                 }
         }
+}
  }
 \ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h

index 242e03f00d6ee374f3a63541c0d1eeda203cfdd1..28bd6a144c43ff847546b503924ab2820b49dbcb 100644 (file)
--- a/src/libutil/cxx/utf8_util.h
+++ b/src/libutil/cxx/utf8_util.h
@@ -51,6 +51,15 @@ enum rspamd_normalise_result {
   */
  enum rspamd_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
  
+/**
+ * Compare two strings using libicu collator
+ *
+ * Only the first `n` bytes of each string take part in the comparison. The
+ * collator is set to primary strength; a length that does not fit into an int
+ * is compared with g_ascii_strncasecmp instead.
+ * @param s1 first UTF-8 string (up to `n` bytes are read)
+ * @param s2 second UTF-8 string (up to `n` bytes are read)
+ * @param n number of bytes to compare in both strings
+ * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2.
+ */
+int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n);
+
  #ifdef  __cplusplus
  }
  #endif
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 2 Aug 2021 16:00:14 +0000 (17:00 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 2 Aug 2021 16:00:14 +0000 (17:00 +0100)
src/libutil/cxx/utf8_util.cxx		patch \| blob \| history
src/libutil/cxx/utf8_util.h		patch \| blob \| history