mirrors
/
rspamd
mirror of https://github.com/vstakhov/rspamd.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
							/*-
 * Copyright 2021 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define U_CHARSET_IS_UTF8 1
#include <unicode/utypes.h>
#include <unicode/utf8.h>
#include <unicode/uchar.h>
#include <unicode/normalizer2.h>
#include <unicode/schriter.h>
#include <unicode/coll.h>
#include <utility>
#include <tuple>
#include <string>
#include <limits>

#include "utf8_util.h"
#include "str_util.h"

#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
#include "doctest/doctest.h"

const char *
rspamd_string_unicode_trim_inplace (const char *str, size_t *len)
{
	const auto *p = str, *end = str + *len;
	auto i = 0;

	while (i < *len) {
		UChar32 uc;
		auto prev_i = i;

		U8_NEXT(p, i, *len, uc);

		if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
			i = prev_i;
			break;
		}
	}

	p += i;
	(*len) -= i;
	i = end - p;
	auto *ret = p;

	if (i > 0) {

		while (i > 0) {
			UChar32 uc;
			auto prev_i = i;

			U8_PREV(p, 0, i, uc);

			if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) {
				i = prev_i;
				break;
			}
		}

		*len = i;
	}

	return ret;
}

enum rspamd_normalise_result
rspamd_normalise_unicode_inplace(char *start, size_t *len)
{
	UErrorCode uc_err = U_ZERO_ERROR;
	const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
	static icu::UnicodeSet zw_spaces{};

	if (!zw_spaces.isFrozen()) {
		/* Add zw spaces to the set */
		zw_spaces.add(0x200B);
		/* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */
		zw_spaces.add(0x200C);
		/* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */
		//zw_spaces.add(0x200D);
		zw_spaces.add(0xFEF);
		zw_spaces.add(0x00AD);
		zw_spaces.freeze();
	}

	int ret = RSPAMD_UNICODE_NORM_NORMAL;

	g_assert (U_SUCCESS (uc_err));

	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
	auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);

	if (!U_SUCCESS (uc_err)) {
		return RSPAMD_UNICODE_NORM_ERROR;
	}

	/* Filter zero width spaces and push resulting string back */
	const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
		icu::StringCharacterIterator it{input};
		size_t i = 0;

		while(it.hasNext()) {
			/* libicu is very 'special' if it comes to 'safe' macro */
			if (i >= *len) {
				ret |= RSPAMD_UNICODE_NORM_ERROR;
				break;
			}

			auto uc = it.next32PostInc();

			if (zw_spaces.contains(uc)) {
				ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
			}
			else {
				UBool err = 0;

				if (uc == 0xFFFD) {
					ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
				}
				U8_APPEND((uint8_t*)start, i, *len, uc, err);

				if (err) {
					ret |= RSPAMD_UNICODE_NORM_ERROR;
					break;
				}
			}
		}

		return i;
	};

	if (is_normal != UNORM_YES) {
		/* Need to normalise */
		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;

		auto normalised = nfkc_norm->normalize(uc_string, uc_err);

		if (!U_SUCCESS (uc_err)) {
			return RSPAMD_UNICODE_NORM_ERROR;
		}

		*len = filter_zw_spaces_and_push_back(normalised);
	}
	else {
		*len = filter_zw_spaces_and_push_back(uc_string);
	}

	return static_cast<enum rspamd_normalise_result>(ret);
}

struct rspamd_icu_collate_storage {
	icu::Collator* collator = nullptr;
	rspamd_icu_collate_storage() {
		UErrorCode uc_err = U_ZERO_ERROR;
		collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);

		if (U_FAILURE(uc_err) || collator == nullptr) {
			g_error ("fatal error: cannot init libicu collation engine: %s",
					u_errorName(uc_err));
			abort();
		}
		/* Ignore all difference except functional */
		collator->setStrength(icu::Collator::PRIMARY);
	}

	~rspamd_icu_collate_storage() {
		if (collator) {
			delete collator;
		}
	}
};

static rspamd_icu_collate_storage collate_storage;

int
rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2)
{
	if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) {
		/*
		 * It's hard to say what to do here... But libicu wants int, so we fall
		 * back to g_ascii_strcasecmp which can deal with size_t
		 */
		if (n1 == n2) {
			return g_ascii_strncasecmp(s1, s2, n1);
		}
		else {
			return n1 - n2;
		}
	}

	UErrorCode success = U_ZERO_ERROR;
	auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2},
			success);

	switch (res) {
	case UCOL_EQUAL:
		return 0;
	case UCOL_GREATER:
		return 1;
	case UCOL_LESS:
	default:
		return -1;
	}
}

int
rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
{
	return rspamd_utf8_strcmp_sizes(s1, n, s2, n);
}

TEST_SUITE("utf8 utils") {
TEST_CASE("utf8 normalise") {
	std::tuple<const char *, const char *, int> cases[] = {
			{"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
			{"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
			/* Zero width spaces */
			{"\xE2\x80\x8B""те""\xE2\x80\x8B""ст", "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
			/* Special case of diacritic */
			{"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
			/* Same with zw spaces */
			{"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ",
					RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ZERO_SPACES},
			/* Buffer overflow case */
			{"u\xC2\xC2\xC2\xC2\xC2\xC2""abcdef""abcdef", "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
					RSPAMD_UNICODE_NORM_UNNORMAL|RSPAMD_UNICODE_NORM_ERROR},
	};

	for (const auto &c : cases) {
		std::string cpy{std::get<0>(c)};
		auto ns = cpy.size();
		auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
		cpy.resize(ns);
		CHECK(cpy == std::string(std::get<1>(c)));
		CHECK(res == std::get<2>(c));
	}
}

TEST_CASE("utf8 trim") {
	std::pair<const char *, const char *> cases[] = {
			{" \u200B""abc ", "abc"},
			{"   ",  ""},
			{"   a", "a"},
			{"a   ", "a"},
			{"a a",  "a a"},
			{"abc",  "abc"},
			{"a ", "a"},
			{"   abc      ", "abc"},
			{" abc ", "abc"},
			{" \xE2\x80\x8B""a\xE2\x80\x8B""bc ", "a\xE2\x80\x8B""bc"},
			{" \xE2\x80\x8B""abc\xE2\x80\x8B ", "abc"},
			{" \xE2\x80\x8B""abc \xE2\x80\x8B  ", "abc"},
	};

	for (const auto &c : cases) {
		std::string cpy{c.first};
		auto ns = cpy.size();
		auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns);
		std::string res{nstart, ns};
		CHECK(res == std::string{c.second});
	}
}


TEST_CASE("utf8 strcmp") {
	std::tuple<const char *, const char *, int, int> cases[] = {
			{"abc", "abc", -1, 0},
			{"",  "", -1, 0},
			{"aBc", "AbC", -1, 0},
			{"abc", "ab", 2, 0},
			{"теСт", "ТесТ", -1, 0},
			{"теСт", "Тезт", 4, 0},
			{"теСт", "Тезт", -1, 1},
			{"abc", "ABD", -1, -1},
			{"\0a\0", "\0a\1", 2, 0},
			{"\0a\0", "\0b\1", 3, -1},
	};

	for (const auto &c : cases) {
		auto [s1, s2, n, expected] = c;
		if (n == -1) {
			n = MIN(strlen(s1), strlen(s2));
		}
		SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str()) {
			auto ret = rspamd_utf8_strcmp(s1, s2, n);
			CHECK(ret == expected);
		}
	}
}
}