/* Returns a pair: the first unprocessed byte from buf and utf8_output A scalar routing should carry on the conversion of the tail. */ std::pair lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const char *end = latin1_input + len; __m128i zero = __lsx_vldi(0); // We always write 16 bytes, of which more than the first 8 bytes // are valid. A safety margin of 8 is more than sufficient. while (latin1_input + 16 <= end) { __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0); if (ascii == 0xffff) { // ASCII fast path!!!! __lsx_vst(in8, utf8_output, 0); utf8_output += 16; latin1_input += 16; continue; } // We just fallback on UTF-16 code. This could be optimized/simplified // further. __m128i in16 = __lsx_vilvl_b(zero, in8); // 1. prepare 2-byte values // input 8-bit word : [aabb|bbbb] x 8 // expected output : [1100|00aa|10bb|bbbb] x 8 // t0 = [0000|00aa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in16, 2); // t1 = [0000|00aa|0000|0000] __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785)); // t3 = [0000|00aa|00bb|bbbb] __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f)); // t4 = [1100|00aa|10bb|bbbb] __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080))); // merge ASCII and 2-byte codewords __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F)); __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask); const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0]; __m128i shuffle = __lsx_vld(row + 1, 0); __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); // store bytes __lsx_vst(utf8_packed, utf8_output, 0); // adjust pointers latin1_input += 8; utf8_output += row[0]; } // while return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); }