namespace simdutf { namespace SIMDUTF_IMPLEMENTATION { namespace { namespace utf8_validation { using namespace simd; simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) // Bit 2 = Overlong 3-byte // Bit 4 = Surrogate // Bit 5 = Overlong 2-byte // Bit 7 = Two Continuations constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ // 11______ 11______ constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ // 11110100 101_____ // 11110101 1001____ // 11110101 101_____ // 1111011_ 1001____ // 1111011_ 101_____ // 11111___ 1001____ // 11111___ 101_____ constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; // 11110101 1000____ // 1111011_ 1000____ // 11111___ 1000____ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, // 10______ ________ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, // 1100____ ________ TOO_SHORT | OVERLONG_2, // 1101____ ________ TOO_SHORT, // 1110____ ________ TOO_SHORT | OVERLONG_3 | SURROGATE, // 1111____ ________ TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . const simd8 byte_1_low = (prev1 & 0x0F) .lookup_16( // ____0000 ________ CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, // ____0001 ________ CARRY | OVERLONG_2, // ____001_ ________ CARRY, CARRY, // ____0100 ________ CARRY | TOO_LARGE, // ____0101 ________ CARRY | TOO_LARGE | TOO_LARGE_1000, // ____011_ ________ CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, // ____1___ ________ CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, // ____1101 ________ CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000); const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, // ________ 1000____ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, // ________ 1001____ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, // ________ 101_____ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, // ________ 11______ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); return (byte_1_high & byte_1_low & byte_2_high); } simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, const simd8 prev_input, const simd8 sc) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); simd8 must23_80 = must23 & uint8_t(0x80); return must23_80 ^ sc; } // // Return nonzero if there are incomplete multibyte characters at the end of the // block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. // simdutf_really_inline simd8 is_incomplete(const simd8 input) { // If the previous input's last 3 bytes match this, they're too short (they // ended at EOF): // ... 1111____ 111_____ 11______ static const uint8_t max_array[32] = {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1}; const simd8 max_value( &max_array[sizeof(max_array) - sizeof(simd8)]); return input.gt_bits(max_value); } struct utf8_checker { // If this is nonzero, there has been a UTF-8 error. simd8 error; // The last input we received simd8 prev_input_block; // Whether the last input we received was incomplete (used for ASCII fast // path) simd8 prev_incomplete; // // Check whether the current bytes are valid UTF-8. // simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ // lead bytes (2, 3, 4-byte leads become large positive numbers instead of // small negative numbers) simd8 prev1 = input.prev<1>(prev_input); simd8 sc = check_special_cases(input, prev1); this->error |= check_multibyte_lengths(input, prev_input, sc); } // The only problem that can happen at EOF is that a multibyte character is // too short or a byte value too large in the last bytes: check_special_cases // only checks for bytes too large in the first of two bytes. simdutf_really_inline void check_eof() { // If the previous block had incomplete UTF-8 characters at the end, an // ASCII block can't possibly finish them. this->error |= this->prev_incomplete; } simdutf_really_inline void check_next_input(const simd8x64 &input) { if (simdutf_likely(is_ascii(input))) { this->error |= this->prev_incomplete; } else { // you might think that a for-loop would work, but under Visual Studio, it // is not good enough. static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), "We support either two or four chunks per 64-byte block."); if (simd8x64::NUM_CHUNKS == 2) { this->check_utf8_bytes(input.chunks[0], this->prev_input_block); this->check_utf8_bytes(input.chunks[1], input.chunks[0]); } else if (simd8x64::NUM_CHUNKS == 4) { this->check_utf8_bytes(input.chunks[0], this->prev_input_block); this->check_utf8_bytes(input.chunks[1], input.chunks[0]); this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } } // do not forget to call check_eof! simdutf_really_inline bool errors() const { return this->error.any_bits_set_anywhere(); } }; // struct utf8_checker } // namespace utf8_validation using utf8_validation::utf8_checker; } // unnamed namespace } // namespace SIMDUTF_IMPLEMENTATION } // namespace simdutf