From 94d6a693eac3401c50723ccc75aa11fc7017782d Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Sat, 18 Mar 2023 14:59:27 +0100 Subject: Don't allow surrogate code points in UTF-8 These are not valid outside of UTF-16 so seeing them in a UTF-8 sequence means that something is wrong with that sequence. Best to filter them out rather than letting them propagate and have unknown effects. --- tests/unit/unicode.cxx | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tests') diff --git a/tests/unit/unicode.cxx b/tests/unit/unicode.cxx index d4e567e9..cb15e7e1 100644 --- a/tests/unit/unicode.cxx +++ b/tests/unit/unicode.cxx @@ -53,6 +53,10 @@ struct _ucs4utf8 ucs4utf8[] = { { 0x1f638, "\xf0\x9f\x98\xb8" }, { 0x2d006, "\xf0\xad\x80\x86" }, { 0xfffd, "\xe5\xe4" }, + { 0xfffd, "\xed\xa2\x80" }, + { 0xfffd, "\xed\xbb\xbf" }, + { 0xd880, "\xef\xbf\xbd" }, + { 0xdeff, "\xef\xbf\xbd" }, { 0x110200, "\xef\xbf\xbd" }, }; @@ -93,6 +97,7 @@ const char *validutf8[] = { const char *invalidutf8[] = { "\xe5\xe4\xf6", "\xf8\xa1\xa1\xa1\xa1", + "\xed\xa2\x80", }; const wchar_t *validutf16[] = { -- cgit v1.2.3