You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

utf8_util.cxx 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #define U_CHARSET_IS_UTF8 1
  17. #include <unicode/utypes.h>
  18. #include <unicode/utf8.h>
  19. #include <unicode/uchar.h>
  20. #include <unicode/normalizer2.h>
  21. #include <unicode/schriter.h>
  22. #include <unicode/coll.h>
  23. #include <unicode/translit.h>
  24. #include <utility>
  25. #include <tuple>
  26. #include <string>
  27. #include <limits>
  28. #include <memory>
  29. #include "utf8_util.h"
  30. #include "str_util.h"
  31. #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
  32. #include "doctest/doctest.h"
  33. const char *
  34. rspamd_string_unicode_trim_inplace(const char *str, size_t *len)
  35. {
  36. const auto *p = str, *end = str + *len;
  37. auto i = 0;
  38. while (i < *len) {
  39. UChar32 uc;
  40. auto prev_i = i;
  41. U8_NEXT(p, i, *len, uc);
  42. if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
  43. i = prev_i;
  44. break;
  45. }
  46. }
  47. p += i;
  48. (*len) -= i;
  49. i = end - p;
  50. auto *ret = p;
  51. if (i > 0) {
  52. while (i > 0) {
  53. UChar32 uc;
  54. auto prev_i = i;
  55. U8_PREV(p, 0, i, uc);
  56. if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
  57. i = prev_i;
  58. break;
  59. }
  60. }
  61. *len = i;
  62. }
  63. return ret;
  64. }
  65. enum rspamd_utf8_normalise_result
  66. rspamd_normalise_unicode_inplace(char *start, size_t *len)
  67. {
  68. UErrorCode uc_err = U_ZERO_ERROR;
  69. const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
  70. static icu::UnicodeSet zw_spaces{};
  71. if (!zw_spaces.isFrozen()) {
  72. /* Add zw spaces to the set */
  73. zw_spaces.add(0x200B);
  74. /* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */
  75. zw_spaces.add(0x200C);
  76. /* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */
  77. //zw_spaces.add(0x200D);
  78. zw_spaces.add(0xFEF);
  79. zw_spaces.add(0x00AD);
  80. zw_spaces.freeze();
  81. }
  82. int ret = RSPAMD_UNICODE_NORM_NORMAL;
  83. g_assert(U_SUCCESS(uc_err));
  84. auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
  85. auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
  86. if (!U_SUCCESS(uc_err)) {
  87. return RSPAMD_UNICODE_NORM_ERROR;
  88. }
  89. /* Filter zero width spaces and push resulting string back */
  90. const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
  91. icu::StringCharacterIterator it{input};
  92. size_t i = 0;
  93. while (it.hasNext()) {
  94. /* libicu is very 'special' if it comes to 'safe' macro */
  95. if (i >= *len) {
  96. ret |= RSPAMD_UNICODE_NORM_ERROR;
  97. break;
  98. }
  99. auto uc = it.next32PostInc();
  100. if (zw_spaces.contains(uc)) {
  101. ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
  102. }
  103. else {
  104. UBool err = 0;
  105. if (uc == 0xFFFD) {
  106. ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
  107. }
  108. U8_APPEND((uint8_t *) start, i, *len, uc, err);
  109. if (err) {
  110. ret |= RSPAMD_UNICODE_NORM_ERROR;
  111. break;
  112. }
  113. }
  114. }
  115. return i;
  116. };
  117. if (is_normal != UNORM_YES) {
  118. /* Need to normalise */
  119. ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
  120. auto normalised = nfkc_norm->normalize(uc_string, uc_err);
  121. if (!U_SUCCESS(uc_err)) {
  122. return RSPAMD_UNICODE_NORM_ERROR;
  123. }
  124. *len = filter_zw_spaces_and_push_back(normalised);
  125. }
  126. else {
  127. *len = filter_zw_spaces_and_push_back(uc_string);
  128. }
  129. return static_cast<enum rspamd_utf8_normalise_result>(ret);
  130. }
  131. char *
  132. rspamd_utf8_transliterate(const char *start, gsize len, gsize *target_len)
  133. {
  134. UErrorCode uc_err = U_ZERO_ERROR;
  135. static std::unique_ptr<icu::Transliterator> transliterator;
  136. if (!transliterator) {
  137. UParseError parse_err;
  138. static const auto rules = icu::UnicodeString{":: Any-Latin;"
  139. ":: [:Nonspacing Mark:] Remove;"
  140. ":: [:Punctuation:] Remove;"
  141. ":: [:Symbol:] Remove;"
  142. ":: [:Format:] Remove;"
  143. ":: Latin-ASCII;"
  144. ":: Lower();"
  145. ":: NULL;"
  146. "[:Space Separator:] > ' '"};
  147. transliterator = std::unique_ptr<icu::Transliterator>(
  148. icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err));
  149. if (U_FAILURE(uc_err) || !transliterator) {
  150. auto context = icu::UnicodeString(parse_err.postContext, sizeof(parse_err.preContext) / sizeof(UChar));
  151. g_error("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
  152. u_errorName(uc_err), parse_err.line, parse_err.offset);
  153. abort();
  154. }
  155. }
  156. auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
  157. transliterator->transliterate(uc_string);
  158. // We assume that all characters are now ascii
  159. auto dest_len = uc_string.length();
  160. char *dest = (char *) g_malloc(dest_len + 1);
  161. auto sink = icu::CheckedArrayByteSink(dest, dest_len);
  162. uc_string.toUTF8(sink);
  163. *target_len = sink.NumberOfBytesWritten();
  164. dest[*target_len] = '\0';
  165. return dest;
  166. }
  167. struct rspamd_icu_collate_storage {
  168. icu::Collator *collator = nullptr;
  169. rspamd_icu_collate_storage()
  170. {
  171. UErrorCode uc_err = U_ZERO_ERROR;
  172. collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);
  173. if (U_FAILURE(uc_err) || collator == nullptr) {
  174. g_error("fatal error: cannot init libicu collation engine: %s",
  175. u_errorName(uc_err));
  176. abort();
  177. }
  178. /* Ignore all difference except functional */
  179. collator->setStrength(icu::Collator::PRIMARY);
  180. }
  181. ~rspamd_icu_collate_storage()
  182. {
  183. if (collator) {
  184. delete collator;
  185. }
  186. }
  187. };
  188. static rspamd_icu_collate_storage collate_storage;
  189. int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2)
  190. {
  191. if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) {
  192. /*
  193. * It's hard to say what to do here... But libicu wants int, so we fall
  194. * back to g_ascii_strcasecmp which can deal with size_t
  195. */
  196. if (n1 == n2) {
  197. return g_ascii_strncasecmp(s1, s2, n1);
  198. }
  199. else {
  200. return n1 - n2;
  201. }
  202. }
  203. UErrorCode success = U_ZERO_ERROR;
  204. auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2},
  205. success);
  206. switch (res) {
  207. case UCOL_EQUAL:
  208. return 0;
  209. case UCOL_GREATER:
  210. return 1;
  211. case UCOL_LESS:
  212. default:
  213. return -1;
  214. }
  215. }
  216. int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
  217. {
  218. return rspamd_utf8_strcmp_sizes(s1, n, s2, n);
  219. }
  220. TEST_SUITE("utf8 utils")
  221. {
  222. TEST_CASE("utf8 normalise")
  223. {
  224. std::tuple<const char *, const char *, int> cases[] = {
  225. {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
  226. {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
  227. /* Zero width spaces */
  228. {"\xE2\x80\x8B"
  229. "те"
  230. "\xE2\x80\x8B"
  231. "ст",
  232. "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
  233. /* Special case of diacritic */
  234. {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
  235. // String containing a non-joiner character
  236. {"س\u200Cت", "ست", RSPAMD_UNICODE_NORM_ZERO_SPACES},
  237. // String containing a soft hyphen
  238. {"in\u00ADter\u00ADest\u00ADing", "interesting", RSPAMD_UNICODE_NORM_ZERO_SPACES},
  239. // String with ligature
  240. {"fish", "fish", RSPAMD_UNICODE_NORM_UNNORMAL},
  241. // String with accented characters and zero-width spaces
  242. {"café\u200Blatté\u200C", "cafélatté", RSPAMD_UNICODE_NORM_ZERO_SPACES},
  243. /* Same with zw spaces */
  244. {"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ",
  245. RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
  246. /* Buffer overflow case */
  247. {"u\xC2\xC2\xC2\xC2\xC2\xC2"
  248. "abcdef"
  249. "abcdef",
  250. "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
  251. RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ERROR},
  252. // String with a mix of special characters, ligatures, and zero-width spaces
  253. {"fish\u200Bcafé\u200C\u200Dlatté\u200D\u00AD", "fishcafé\u200Dlatté\u200D", RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
  254. // Empty string
  255. {"", "", RSPAMD_UNICODE_NORM_NORMAL},
  256. };
  257. for (const auto &c: cases) {
  258. std::string cpy{std::get<0>(c)};
  259. auto ns = cpy.size();
  260. auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
  261. cpy.resize(ns);
  262. CHECK(cpy == std::string(std::get<1>(c)));
  263. CHECK(res == std::get<2>(c));
  264. }
  265. }
  266. TEST_CASE("utf8 trim")
  267. {
  268. std::pair<const char *, const char *> cases[] = {
  269. {" \u200B"
  270. "abc ",
  271. "abc"},
  272. {" ", ""},
  273. {" a", "a"},
  274. {"a ", "a"},
  275. {"a a", "a a"},
  276. {"abc", "abc"},
  277. {"a ", "a"},
  278. {" abc ", "abc"},
  279. {" abc ", "abc"},
  280. {" \xE2\x80\x8B"
  281. "a\xE2\x80\x8B"
  282. "bc ",
  283. "a\xE2\x80\x8B"
  284. "bc"},
  285. {" \xE2\x80\x8B"
  286. "abc\xE2\x80\x8B ",
  287. "abc"},
  288. {" \xE2\x80\x8B"
  289. "abc \xE2\x80\x8B ",
  290. "abc"},
  291. };
  292. for (const auto &c: cases) {
  293. std::string cpy{c.first};
  294. auto ns = cpy.size();
  295. auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
  296. std::string res{nstart, ns};
  297. CHECK(res == std::string{c.second});
  298. }
  299. }
  300. TEST_CASE("utf8 strcmp")
  301. {
  302. std::tuple<const char *, const char *, int, int> cases[] = {
  303. {"abc", "abc", -1, 0},
  304. {"", "", -1, 0},
  305. {"aBc", "AbC", -1, 0},
  306. {"abc", "ab", 2, 0},
  307. {"теСт", "ТесТ", -1, 0},
  308. {"теСт", "Тезт", 4, 0},
  309. {"теСт", "Тезт", -1, 1},
  310. {"abc", "ABD", -1, -1},
  311. {"\0a\0", "\0a\1", 2, 0},
  312. {"\0a\0", "\0b\1", 3, -1},
  313. };
  314. for (const auto &c: cases) {
  315. auto [s1, s2, n, expected] = c;
  316. if (n == -1) {
  317. n = MIN(strlen(s1), strlen(s2));
  318. }
  319. SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str())
  320. {
  321. auto ret = rspamd_utf8_strcmp(s1, s2, n);
  322. CHECK(ret == expected);
  323. }
  324. }
  325. }
  326. TEST_CASE("transliterate")
  327. {
  328. using namespace std::literals;
  329. std::tuple<std::string_view, const char *> cases[] = {
  330. {"abc"sv, "abc"},
  331. {""sv, ""},
  332. {"тест"sv, "test"},
  333. // Diacritic to ascii
  334. {"Ύ"sv, "y"},
  335. // Chinese to pinyin
  336. {"你好"sv, "ni hao"},
  337. // Japanese to romaji
  338. {"こんにちは"sv, "konnichiha"},
  339. // Devanagari to latin
  340. {"नमस्ते"sv, "namaste"},
  341. // Arabic to latin
  342. {"مرحبا"sv, "mrhba"},
  343. // Remove of punctuation
  344. {"a.b.c"sv, "abc"},
  345. // Lowercase
  346. {"ABC"sv, "abc"},
  347. // Remove zero-width spaces
  348. {"\xE2\x80\x8B"
  349. "abc\xE2\x80\x8B"
  350. "def"sv,
  351. "abcdef"},
  352. };
  353. for (const auto &c: cases) {
  354. auto [s1, s2] = c;
  355. SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str())
  356. {
  357. gsize tlen;
  358. auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
  359. CHECK(tlen == strlen(s2));
  360. CHECK(strcmp(s2, ret) == 0);
  361. }
  362. }
  363. }
  364. }