You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

utf8_util.cxx 7.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #define U_CHARSET_IS_UTF8 1
  17. #include <unicode/utypes.h>
  18. #include <unicode/utf8.h>
  19. #include <unicode/uchar.h>
  20. #include <unicode/normalizer2.h>
  21. #include <unicode/schriter.h>
  22. #include <unicode/coll.h>
  23. #include <utility>
  24. #include <tuple>
  25. #include <string>
  26. #include <limits>
  27. #include "utf8_util.h"
  28. #include "str_util.h"
  29. #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
  30. #include "doctest/doctest.h"
  31. const char *
  32. rspamd_string_unicode_trim_inplace (const char *str, size_t *len)
  33. {
  34. const auto *p = str, *end = str + *len;
  35. auto i = 0;
  36. while (i < *len) {
  37. UChar32 uc;
  38. auto prev_i = i;
  39. U8_NEXT(p, i, *len, uc);
  40. if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
  41. i = prev_i;
  42. break;
  43. }
  44. }
  45. p += i;
  46. (*len) -= i;
  47. i = end - p;
  48. auto *ret = p;
  49. if (i > 0) {
  50. while (i > 0) {
  51. UChar32 uc;
  52. auto prev_i = i;
  53. U8_PREV(p, 0, i, uc);
  54. if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
  55. i = prev_i;
  56. break;
  57. }
  58. }
  59. *len = i;
  60. }
  61. return ret;
  62. }
  63. enum rspamd_normalise_result
  64. rspamd_normalise_unicode_inplace(char *start, size_t *len)
  65. {
  66. UErrorCode uc_err = U_ZERO_ERROR;
  67. const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
  68. static icu::UnicodeSet zw_spaces{};
  69. if (!zw_spaces.isFrozen()) {
  70. /* Add zw spaces to the set */
  71. zw_spaces.add(0x200B);
  72. /* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */
  73. zw_spaces.add(0x200C);
  74. /* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */
  75. //zw_spaces.add(0x200D);
  76. zw_spaces.add(0xFEF);
  77. zw_spaces.add(0x00AD);
  78. zw_spaces.freeze();
  79. }
  80. int ret = RSPAMD_UNICODE_NORM_NORMAL;
  81. g_assert (U_SUCCESS (uc_err));
  82. auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
  83. auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
  84. if (!U_SUCCESS (uc_err)) {
  85. return RSPAMD_UNICODE_NORM_ERROR;
  86. }
  87. /* Filter zero width spaces and push resulting string back */
  88. const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
  89. icu::StringCharacterIterator it{input};
  90. size_t i = 0;
  91. while(it.hasNext()) {
  92. /* libicu is very 'special' if it comes to 'safe' macro */
  93. if (i >= *len) {
  94. ret |= RSPAMD_UNICODE_NORM_ERROR;
  95. break;
  96. }
  97. auto uc = it.next32PostInc();
  98. if (zw_spaces.contains(uc)) {
  99. ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
  100. }
  101. else {
  102. UBool err = 0;
  103. if (uc == 0xFFFD) {
  104. ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
  105. }
  106. U8_APPEND((uint8_t*)start, i, *len, uc, err);
  107. if (err) {
  108. ret |= RSPAMD_UNICODE_NORM_ERROR;
  109. break;
  110. }
  111. }
  112. }
  113. return i;
  114. };
  115. if (is_normal != UNORM_YES) {
  116. /* Need to normalise */
  117. ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
  118. auto normalised = nfkc_norm->normalize(uc_string, uc_err);
  119. if (!U_SUCCESS (uc_err)) {
  120. return RSPAMD_UNICODE_NORM_ERROR;
  121. }
  122. *len = filter_zw_spaces_and_push_back(normalised);
  123. }
  124. else {
  125. *len = filter_zw_spaces_and_push_back(uc_string);
  126. }
  127. return static_cast<enum rspamd_normalise_result>(ret);
  128. }
  129. struct rspamd_icu_collate_storage {
  130. icu::Collator* collator = nullptr;
  131. rspamd_icu_collate_storage() {
  132. UErrorCode uc_err = U_ZERO_ERROR;
  133. collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);
  134. if (U_FAILURE(uc_err) || collator == nullptr) {
  135. g_error ("fatal error: cannot init libicu collation engine: %s",
  136. u_errorName(uc_err));
  137. abort();
  138. }
  139. /* Ignore all difference except functional */
  140. collator->setStrength(icu::Collator::PRIMARY);
  141. }
  142. ~rspamd_icu_collate_storage() {
  143. if (collator) {
  144. delete collator;
  145. }
  146. }
  147. };
  148. static rspamd_icu_collate_storage collate_storage;
  149. int
  150. rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2)
  151. {
  152. if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) {
  153. /*
  154. * It's hard to say what to do here... But libicu wants int, so we fall
  155. * back to g_ascii_strcasecmp which can deal with size_t
  156. */
  157. if (n1 == n2) {
  158. return g_ascii_strncasecmp(s1, s2, n1);
  159. }
  160. else {
  161. return n1 - n2;
  162. }
  163. }
  164. UErrorCode success = U_ZERO_ERROR;
  165. auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2},
  166. success);
  167. switch (res) {
  168. case UCOL_EQUAL:
  169. return 0;
  170. case UCOL_GREATER:
  171. return 1;
  172. case UCOL_LESS:
  173. default:
  174. return -1;
  175. }
  176. }
  177. int
  178. rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
  179. {
  180. return rspamd_utf8_strcmp_sizes(s1, n, s2, n);
  181. }
  182. TEST_SUITE("utf8 utils") {
  183. TEST_CASE("utf8 normalise") {
  184. std::tuple<const char *, const char *, int> cases[] = {
  185. {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
  186. {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
  187. /* Zero width spaces */
  188. {"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
  189. /* Special case of diacritic */
  190. {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
  191. /* Same with zw spaces */
  192. {"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ",
  193. RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
  194. /* Buffer overflow case */
  195. {"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
  196. RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
  197. };
  198. for (const auto &c : cases) {
  199. std::string cpy{std::get<0>(c)};
  200. auto ns = cpy.size();
  201. auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
  202. cpy.resize(ns);
  203. CHECK(cpy == std::string(std::get<1>(c)));
  204. CHECK(res == std::get<2>(c));
  205. }
  206. }
  207. TEST_CASE("utf8 trim") {
  208. std::pair<const char *, const char *> cases[] = {
  209. {" \u200B""abc ", "abc"},
  210. {" ", ""},
  211. {" a", "a"},
  212. {"a ", "a"},
  213. {"a a", "a a"},
  214. {"abc", "abc"},
  215. {"a ", "a"},
  216. {" abc ", "abc"},
  217. {" abc ", "abc"},
  218. {" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
  219. {" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
  220. {" \xE2\x80\x8B""abc \xE2\x80\x8B ", "abc"},
  221. };
  222. for (const auto &c : cases) {
  223. std::string cpy{c.first};
  224. auto ns = cpy.size();
  225. auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
  226. std::string res{nstart, ns};
  227. CHECK(res == std::string{c.second});
  228. }
  229. }
  230. TEST_CASE("utf8 strcmp") {
  231. std::tuple<const char *, const char *, int, int> cases[] = {
  232. {"abc", "abc", -1, 0},
  233. {"", "", -1, 0},
  234. {"aBc", "AbC", -1, 0},
  235. {"abc", "ab", 2, 0},
  236. {"теСт", "ТесТ", -1, 0},
  237. {"теСт", "Тезт", 4, 0},
  238. {"теСт", "Тезт", -1, 1},
  239. {"abc", "ABD", -1, -1},
  240. {"\0a\0", "\0a\1", 2, 0},
  241. {"\0a\0", "\0b\1", 3, -1},
  242. };
  243. for (const auto &c : cases) {
  244. auto [s1, s2, n, expected] = c;
  245. if (n == -1) {
  246. n = MIN(strlen(s1), strlen(s2));
  247. }
  248. SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str()) {
  249. auto ret = rspamd_utf8_strcmp(s1, s2, n);
  250. CHECK(ret == expected);
  251. }
  252. }
  253. }
  254. }