mirrors
/
rspamd
의 미러 https://github.com/vstakhov/rspamd.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
							/*-
 * Copyright 2021 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define U_CHARSET_IS_UTF8 1
#include <unicode/utypes.h>
#include <unicode/utf8.h>
#include <unicode/uchar.h>
#include <unicode/normalizer2.h>
#include <unicode/schriter.h>
#include <unicode/coll.h>
#include <unicode/translit.h>
#include <utility>
#include <tuple>
#include <string>
#include <limits>
#include <memory>

#include "utf8_util.h"
#include "str_util.h"

#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
#include "doctest/doctest.h"

const char *
rspamd_string_unicode_trim_inplace(const char *str, size_t *len)
{
	const auto *p = str, *end = str + *len;
	auto i = 0;

	while (i < *len) {
		UChar32 uc;
		auto prev_i = i;

		U8_NEXT(p, i, *len, uc);

		if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
			i = prev_i;
			break;
		}
	}

	p += i;
	(*len) -= i;
	i = end - p;
	auto *ret = p;

	if (i > 0) {

		while (i > 0) {
			UChar32 uc;
			auto prev_i = i;

			U8_PREV(p, 0, i, uc);

			if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
				i = prev_i;
				break;
			}
		}

		*len = i;
	}

	return ret;
}

/**
 * Normalises a UTF-8 buffer to NFKC in place and removes zero-width
 * characters from it.
 *
 * The processed text is written back over `start` and is never allowed to
 * exceed the original `*len` bytes: if it does not fit, writing stops early
 * and RSPAMD_UNICODE_NORM_ERROR is added to the result.
 *
 * @param start mutable UTF-8 buffer that is overwritten with the result
 * @param len   in: byte length of the input; out: byte length of the result
 * @return bitwise OR of RSPAMD_UNICODE_NORM_* flags:
 *         - RSPAMD_UNICODE_NORM_UNNORMAL: the input was not already in NFKC
 *           form, or a U+FFFD replacement character was seen;
 *         - RSPAMD_UNICODE_NORM_ZERO_SPACES: at least one zero-width
 *           character was dropped;
 *         - RSPAMD_UNICODE_NORM_ERROR: the result did not fit or could not be
 *           encoded. If ICU itself reports a failure, this value is returned
 *           alone and neither the buffer nor `*len` is modified.
 */
enum rspamd_utf8_normalise_result
rspamd_normalise_unicode_inplace(char *start, size_t *len)
{
	UErrorCode uc_err = U_ZERO_ERROR;
	const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
	static icu::UnicodeSet zw_spaces{};

	/* The set is populated on the first call only; freezing marks it as ready */
	if (!zw_spaces.isFrozen()) {
		/* Add zw spaces to the set */
		zw_spaces.add(0x200B);
		/* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */
		zw_spaces.add(0x200C);
		/* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */
		//zw_spaces.add(0x200D);
		/*
		 * U+FEFF ZERO WIDTH NO-BREAK SPACE (also used as BOM).
		 * This used to be 0xFEF, i.e. U+0FEF, which is not a zero-width
		 * character at all, so BOMs were silently kept in the output.
		 */
		zw_spaces.add(0xFEFF);
		/* U+00AD SOFT HYPHEN */
		zw_spaces.add(0x00AD);
		zw_spaces.freeze();
	}

	int ret = RSPAMD_UNICODE_NORM_NORMAL;

	g_assert(U_SUCCESS(uc_err));

	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
	auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);

	if (!U_SUCCESS(uc_err)) {
		return RSPAMD_UNICODE_NORM_ERROR;
	}

	/*
	 * Filter zero width spaces and push resulting string back.
	 * Returns the number of bytes written to `start`; `*len` still holds the
	 * original capacity while this runs.
	 */
	const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
		icu::StringCharacterIterator it{input};
		size_t i = 0;

		while (it.hasNext()) {
			/* libicu is very 'special' if it comes to 'safe' macro */
			if (i >= *len) {
				/* Output is already full but there is more input to write */
				ret |= RSPAMD_UNICODE_NORM_ERROR;
				break;
			}

			auto uc = it.next32PostInc();

			if (zw_spaces.contains(uc)) {
				/* Dropped: only record that something was removed */
				ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
			}
			else {
				UBool err = 0;

				if (uc == 0xFFFD) {
					/* Replacement character: the input was not clean UTF-8 */
					ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
				}
				U8_APPEND((uint8_t *) start, i, *len, uc, err);

				if (err) {
					/* Code point does not fit into the remaining space */
					ret |= RSPAMD_UNICODE_NORM_ERROR;
					break;
				}
			}
		}

		return i;
	};

	if (is_normal != UNORM_YES) {
		/* Need to normalise */
		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;

		auto normalised = nfkc_norm->normalize(uc_string, uc_err);

		if (!U_SUCCESS(uc_err)) {
			return RSPAMD_UNICODE_NORM_ERROR;
		}

		*len = filter_zw_spaces_and_push_back(normalised);
	}
	else {
		*len = filter_zw_spaces_and_push_back(uc_string);
	}

	return static_cast<enum rspamd_utf8_normalise_result>(ret);
}

/**
 * Transliterates a UTF-8 string using the ICU rule set built below
 * (Any-Latin, removal of marks/punctuation/symbols/format characters,
 * Latin-ASCII, lower-casing, space separators mapped to ' ').
 *
 * The ICU transliterator is created lazily on the first call and kept in a
 * function-local static; no locking is performed around that initialisation.
 * Failure to build it is treated as fatal.
 *
 * @param start      UTF-8 input
 * @param len        byte length of `start`
 * @param target_len out: byte length of the returned string, excluding the
 *                   terminating NUL
 * @return newly allocated (g_malloc), NUL-terminated UTF-8 string
 */
char *
rspamd_utf8_transliterate(const char *start, gsize len, gsize *target_len)
{
	UErrorCode uc_err = U_ZERO_ERROR;

	static std::unique_ptr<icu::Transliterator> transliterator;

	if (!transliterator) {
		UParseError parse_err;
		static const auto rules = icu::UnicodeString{":: Any-Latin;"
													 ":: [:Nonspacing Mark:] Remove;"
													 ":: [:Punctuation:] Remove;"
													 ":: [:Symbol:] Remove;"
													 ":: [:Format:] Remove;"
													 ":: Latin-ASCII;"
													 ":: Lower();"
													 ":: NULL;"
													 "[:Space Separator:] > ' '"};
		transliterator = std::unique_ptr<icu::Transliterator>(
			icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err));

		if (U_FAILURE(uc_err) || !transliterator) {
			g_error("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
					u_errorName(uc_err), parse_err.line, parse_err.offset);
			abort();
		}
	}

	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
	transliterator->transliterate(uc_string);

	/*
	 * The result is expected to be ASCII, but that is not guaranteed for
	 * every input. Convert through std::string so that the allocation is
	 * sized from the real UTF-8 length: sizing it from the UTF-16 length
	 * would truncate any non-ASCII leftovers, possibly in the middle of a
	 * multi-byte sequence.
	 */
	std::string utf8;
	uc_string.toUTF8String(utf8);

	char *dest = (char *) g_malloc(utf8.size() + 1);
	utf8.copy(dest, utf8.size());
	dest[utf8.size()] = '\0';
	*target_len = utf8.size();

	return dest;
}

/**
 * Owner of the ICU collator used by the string comparison helpers in this
 * file. The collator is created for the English locale with PRIMARY strength
 * in the constructor and destroyed in the destructor; failure to create it
 * is treated as fatal.
 */
struct rspamd_icu_collate_storage {
	icu::Collator *collator = nullptr;
	rspamd_icu_collate_storage()
	{
		UErrorCode uc_err = U_ZERO_ERROR;
		collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);

		if (U_FAILURE(uc_err) || collator == nullptr) {
			g_error("fatal error: cannot init libicu collation engine: %s",
					u_errorName(uc_err));
			abort();
		}
		/* Ignore all difference except functional */
		collator->setStrength(icu::Collator::PRIMARY);
	}

	/*
	 * The struct owns `collator` through a raw pointer, so an implicit copy
	 * would make two objects delete the same collator. Forbid copying.
	 */
	rspamd_icu_collate_storage(const rspamd_icu_collate_storage &) = delete;
	rspamd_icu_collate_storage &operator=(const rspamd_icu_collate_storage &) = delete;

	~rspamd_icu_collate_storage()
	{
		/* delete on a null pointer is a no-op, no check required */
		delete collator;
	}
};

/* Single file-local instance, constructed during static initialisation */
static rspamd_icu_collate_storage collate_storage;

/**
 * Compares two UTF-8 strings of explicit byte lengths using the file-local
 * ICU collator.
 *
 * ICU takes lengths as `int`. If either length does not fit, the function
 * falls back to a comparison that does not involve ICU: equal lengths are
 * compared with g_ascii_strncasecmp(), different lengths are ordered by
 * length alone.
 *
 * @param s1 first string
 * @param n1 byte length of `s1`
 * @param s2 second string
 * @param n2 byte length of `s2`
 * @return 0 if the strings compare equal, a positive value if `s1` sorts
 *         after `s2`, a negative value otherwise
 */
int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2)
{
	if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) {
		/*
		 * It's hard to say what to do here... But libicu wants int, so we fall
		 * back to g_ascii_strcasecmp which can deal with size_t
		 */
		if (n1 == n2) {
			return g_ascii_strncasecmp(s1, s2, n1);
		}

		/*
		 * Do not return `n1 - n2`: the difference of two gsize values does
		 * not fit into int and may be truncated to zero or to the wrong sign.
		 */
		return n1 > n2 ? 1 : -1;
	}

	/* Required by the ICU signature; the status is not inspected here */
	UErrorCode success = U_ZERO_ERROR;
	auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2},
													 success);

	switch (res) {
	case UCOL_EQUAL:
		return 0;
	case UCOL_GREATER:
		return 1;
	case UCOL_LESS:
	default:
		return -1;
	}
}

int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
{
	return rspamd_utf8_strcmp_sizes(s1, n, s2, n);
}

TEST_SUITE("utf8 utils")
{
	/*
	 * Table-driven test for rspamd_normalise_unicode_inplace().
	 * Each entry is (input, expected text after processing, expected flags).
	 */
	TEST_CASE("utf8 normalise")
	{
		std::tuple<const char *, const char *, int> cases[] = {
			{"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
			{"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
			/* Zero width spaces */
			{"\xE2\x80\x8B"
			 "те"
			 "\xE2\x80\x8B"
			 "ст",
			 "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
			/* Special case of diacritic */
			{"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
			// String containing a non-joiner character
			{"س\u200Cت", "ست", RSPAMD_UNICODE_NORM_ZERO_SPACES},
			// String containing a soft hyphen
			{"in\u00ADter\u00ADest\u00ADing", "interesting", RSPAMD_UNICODE_NORM_ZERO_SPACES},
			// String with ligature
			{"ﬁsh", "fish", RSPAMD_UNICODE_NORM_UNNORMAL},
			// String with accented characters and zero-width spaces
			{"café\u200Blatté\u200C", "cafélatté", RSPAMD_UNICODE_NORM_ZERO_SPACES},
			/* Same with zw spaces */
			{"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ",
			 RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
			/* Buffer overflow case */
			{"u\xC2\xC2\xC2\xC2\xC2\xC2"
			 "abcdef"
			 "abcdef",
			 "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
			 RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ERROR},
			// String with a mix of special characters, ligatures, and zero-width spaces
			{"ﬁsh\u200Bcafé\u200C\u200Dlatté\u200D\u00AD", "fishcafé\u200Dlatté\u200D", RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
			// Empty string
			{"", "", RSPAMD_UNICODE_NORM_NORMAL},
		};

		for (const auto &c: cases) {
			/* The function works in place, so operate on a mutable copy of the input */
			std::string cpy{std::get<0>(c)};
			auto ns = cpy.size();
			auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
			/* `ns` now holds the output length: cut the copy down to it before comparing */
			cpy.resize(ns);
			CHECK(cpy == std::string(std::get<1>(c)));
			CHECK(res == std::get<2>(c));
		}
	}

	/*
	 * Table-driven test for rspamd_string_unicode_trim_inplace().
	 * Each entry is (input, expected text after trimming).
	 */
	TEST_CASE("utf8 trim")
	{
		std::pair<const char *, const char *> cases[] = {
			/* Leading zero-width space given as a universal character name */
			{" \u200B"
			 "abc ",
			 "abc"},
			/* Plain ASCII spaces only */
			{"   ", ""},
			{"   a", "a"},
			{"a   ", "a"},
			{"a a", "a a"},
			{"abc", "abc"},
			{"a ", "a"},
			{"   abc      ", "abc"},
			{" abc ", "abc"},
			/* Zero-width space in the middle of the text must survive */
			{" \xE2\x80\x8B"
			 "a\xE2\x80\x8B"
			 "bc ",
			 "a\xE2\x80\x8B"
			 "bc"},
			/* Zero-width spaces mixed with ordinary spaces at the end */
			{" \xE2\x80\x8B"
			 "abc\xE2\x80\x8B ",
			 "abc"},
			{" \xE2\x80\x8B"
			 "abc \xE2\x80\x8B  ",
			 "abc"},
		};

		for (const auto &c: cases) {
			/* Work on a private copy and let the function shrink `remaining` */
			std::string buf{c.first};
			size_t remaining = buf.size();
			const char *trimmed = rspamd_string_unicode_trim_inplace(buf.data(), &remaining);

			/* The result is a (pointer, length) view into `buf` */
			const std::string res(trimmed, remaining);
			CHECK(res == std::string{c.second});
		}
	}


	/*
	 * Table-driven test for rspamd_utf8_strcmp().
	 * Each entry is (first string, second string, byte count to compare,
	 * expected result); a byte count of -1 means "use the shorter of the two
	 * strlen() values".
	 */
	TEST_CASE("utf8 strcmp")
	{
		std::tuple<const char *, const char *, int, int> cases[] = {
			{"abc", "abc", -1, 0},
			{"", "", -1, 0},
			{"aBc", "AbC", -1, 0},
			{"abc", "ab", 2, 0},
			{"теСт", "ТесТ", -1, 0},
			{"теСт", "Тезт", 4, 0},
			{"теСт", "Тезт", -1, 1},
			{"abc", "ABD", -1, -1},
			{"\0a\0", "\0a\1", 2, 0},
			{"\0a\0", "\0b\1", 3, -1},
		};

		for (const auto &c: cases) {
			const char *s1 = std::get<0>(c);
			const char *s2 = std::get<1>(c);
			int n = std::get<2>(c);
			const int expected = std::get<3>(c);

			if (n == -1) {
				n = MIN(strlen(s1), strlen(s2));
			}

			/* One subcase per table row, named after the compared strings */
			const std::string label = std::string("test case: ") + s1 + " <=> " + s2;

			SUBCASE(label.c_str())
			{
				auto ret = rspamd_utf8_strcmp(s1, s2, n);
				CHECK(ret == expected);
			}
		}
	}

	/*
	 * Table-driven test for rspamd_utf8_transliterate().
	 * Each entry is (UTF-8 input, expected transliterated output).
	 */
	TEST_CASE("transliterate")
	{
		using namespace std::literals;
		std::tuple<std::string_view, const char *> cases[] = {
			{"abc"sv, "abc"},
			{""sv, ""},
			{"тест"sv, "test"},
			// Diacritic to ascii
			{"Ύ"sv, "y"},
			// Chinese to pinyin
			{"你好"sv, "ni hao"},
			// Japanese to romaji
			{"こんにちは"sv, "konnichiha"},
			// Devanagari to latin
			{"नमस्ते"sv, "namaste"},
			// Arabic to latin
			{"مرحبا"sv, "mrhba"},
			// Remove of punctuation
			{"a.b.c"sv, "abc"},
			// Lowercase
			{"ABC"sv, "abc"},
			// Remove zero-width spaces
			{"\xE2\x80\x8B"
			 "abc\xE2\x80\x8B"
			 "def"sv,
			 "abcdef"},
		};

		for (const auto &c: cases) {
			auto [s1, s2] = c;
			SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str())
			{
				gsize tlen;
				auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
				CHECK(tlen == strlen(s2));
				CHECK(strcmp(s2, ret) == 0);
				/* The result is allocated with g_malloc and owned by the caller */
				g_free(ret);
			}
		}
	}
}