UTF-16 surrogate code points (U+D800 through U+DFFF) are not valid outside of UTF-16, so seeing one in a UTF-8 sequence means that something is wrong with that sequence. It is best to filter them out (replacing them with U+FFFD) rather than letting them propagate and have unknown effects.
pull/1640/head
*dst++ = 0x80 | (src & 0x3f); | *dst++ = 0x80 | (src & 0x3f); | ||||
*dst++ = '\0'; | *dst++ = '\0'; | ||||
return 2; | return 2; | ||||
} else if ((src >= 0xd800) && (src < 0xe000)) { | |||||
return ucs4ToUTF8(0xfffd, dst); | |||||
} else if (src < 0x10000) { | } else if (src < 0x10000) { | ||||
*dst++ = 0xe0 | (src >> 12); | *dst++ = 0xe0 | (src >> 12); | ||||
*dst++ = 0x80 | ((src >> 6) & 0x3f); | *dst++ = 0x80 | ((src >> 6) & 0x3f); | ||||
max--; | max--; | ||||
} | } | ||||
// UTF-16 surrogate code point? | |||||
if ((*dst >= 0xd800) && (*dst < 0xe000)) | |||||
*dst = 0xfffd; | |||||
return consumed; | return consumed; | ||||
} | } | ||||
{ 0x1f638, "\xf0\x9f\x98\xb8" }, | { 0x1f638, "\xf0\x9f\x98\xb8" }, | ||||
{ 0x2d006, "\xf0\xad\x80\x86" }, | { 0x2d006, "\xf0\xad\x80\x86" }, | ||||
{ 0xfffd, "\xe5\xe4" }, | { 0xfffd, "\xe5\xe4" }, | ||||
{ 0xfffd, "\xed\xa2\x80" }, | |||||
{ 0xfffd, "\xed\xbb\xbf" }, | |||||
{ 0xd880, "\xef\xbf\xbd" }, | |||||
{ 0xdeff, "\xef\xbf\xbd" }, | |||||
{ 0x110200, "\xef\xbf\xbd" }, | { 0x110200, "\xef\xbf\xbd" }, | ||||
}; | }; | ||||
const char *invalidutf8[] = { | const char *invalidutf8[] = { | ||||
"\xe5\xe4\xf6", | "\xe5\xe4\xf6", | ||||
"\xf8\xa1\xa1\xa1\xa1", | "\xf8\xa1\xa1\xa1\xa1", | ||||
"\xed\xa2\x80", | |||||
}; | }; | ||||
const wchar_t *validutf16[] = { | const wchar_t *validutf16[] = { |