std::pair lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const char32_t *end = buf + len; __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ __m128i forbidden_bytemask = __lsx_vldi(0x0); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 while (buf + 16 + safety_margin < end) { __m128i in = __lsx_vld(reinterpret_cast(buf), 0); __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); // Check if no bits set above 16th if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) { // Pack UTF-32 to UTF-16 safely (without surrogate pairs) // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp) __m128i utf16_packed = __lsx_vpickev_h(nextin, in); if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), utf16_packed))) { // ASCII fast path!!!! // 1. pack the bytes // obviously suboptimal. __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed); // 2. store (8 bytes) __lsx_vst(utf8_packed, utf8_output, 0); // 3. adjust pointers buf += 8; utf8_output += 8; continue; // we are done for this round! } __m128i zero = __lsx_vldi(0); if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { // 1. prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 // t0 = [000a|aaaa|bbbb|bb00] const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); // t2 = [0000|0000|00bb|bbbb] const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] const __m128i t3 = __lsx_vor_v(t1, t2); // t4 = [110a|aaaa|10bb|bbbb] const __m128i t4 = __lsx_vor_v(t3, v_c080); // 2. merge ASCII and 2-byte codewords __m128i one_byte_bytemask = __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/)); __m128i utf8_unpacked = __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask); // 3. prepare bitmask for 8-bit lookup uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); // 4. pack the bytes const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes [lsx_1_2_utf8_bytes_mask[m2]][0]; __m128i shuffle = __lsx_vld(row, 1); __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); // 5. store bytes __lsx_vst(utf8_packed, utf8_output, 0); // 6. adjust pointers buf += 8; utf8_output += row[0]; continue; } else { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes forbidden_bytemask = __lsx_vor_v( __lsx_vand_v( __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 forbidden_bytemask); /* In this branch we handle three cases: 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. We precompute byte 1 for case #1 and the common byte for cases #2 & #3 in register t2. We precompute byte 1 for case #3 and -- **conditionally** -- precompute either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed); t0 = __lsx_vilvl_b(t0, t0); // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, __lsx_vldi(-2752 /*0x4000*/)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit __m128i out0 = __lsx_vilvl_h(s4, t2); __m128i out1 = __lsx_vilvh_h(s4, t2); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle __m128i one_byte_bytemask = __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F)); __m128i one_or_two_bytes_bytemask_u16_to_u32_low = __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); __m128i one_or_two_bytes_bytemask_u16_to_u32_high = __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); __m128i one_byte_bytemask_u16_to_u32_low = __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); __m128i one_byte_bytemask_u16_to_u32_high = __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); const uint32_t mask0 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( one_or_two_bytes_bytemask_u16_to_u32_low, one_byte_bytemask_u16_to_u32_low)), 0); const uint32_t mask1 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( one_or_two_bytes_bytemask_u16_to_u32_high, one_byte_bytemask_u16_to_u32_high)), 0); const uint8_t *row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; __m128i shuffle0 = __lsx_vld(row0, 1); __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; __m128i shuffle1 = __lsx_vld(row1, 1); __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); __lsx_vst(utf8_0, utf8_output, 0); utf8_output += row0[0]; __lsx_vst(utf8_1, utf8_output, 0); utf8_output += row1[0]; buf += 8; } // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> // will produce four UTF-8 bytes. } else { // Let us do a scalar fallback. // It may seem wasteful to use scalar code, but being efficient with SIMD // in the presence of surrogate pairs may require non-trivial tables. size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1); } for (; k < forward; k++) { uint32_t word = buf[k]; if ((word & 0xFFFFFF80) == 0) { *utf8_output++ = char(word); } else if ((word & 0xFFFFF800) == 0) { *utf8_output++ = char((word >> 6) | 0b11000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else if ((word & 0xFFFF0000) == 0) { if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 12) | 0b11100000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 18) | 0b11110000); *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } } buf += k; } } // while // check for invalid input if (__lsx_bnz_v(forbidden_bytemask)) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } return std::make_pair(buf, reinterpret_cast(utf8_output)); } std::pair lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const char32_t *start = buf; const char32_t *end = buf + len; __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ __m128i forbidden_bytemask = __lsx_vldi(0x0); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 while (buf + 16 + safety_margin < end) { __m128i in = __lsx_vld(reinterpret_cast(buf), 0); __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); // Check if no bits set above 16th if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) { // Pack UTF-32 to UTF-16 safely (without surrogate pairs) // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp) __m128i utf16_packed = __lsx_vpickev_h(nextin, in); if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), utf16_packed))) { // ASCII fast path!!!! // 1. pack the bytes // obviously suboptimal. __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed); // 2. store (8 bytes) __lsx_vst(utf8_packed, utf8_output, 0); // 3. adjust pointers buf += 8; utf8_output += 8; continue; // we are done for this round! } __m128i zero = __lsx_vldi(0); if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { // 1. prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 // t0 = [000a|aaaa|bbbb|bb00] const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); // t2 = [0000|0000|00bb|bbbb] const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] const __m128i t3 = __lsx_vor_v(t1, t2); // t4 = [110a|aaaa|10bb|bbbb] const __m128i t4 = __lsx_vor_v(t3, v_c080); // 2. merge ASCII and 2-byte codewords __m128i one_byte_bytemask = __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/)); __m128i utf8_unpacked = __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask); // 3. prepare bitmask for 8-bit lookup uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); // 4. pack the bytes const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes [lsx_1_2_utf8_bytes_mask[m2]][0]; __m128i shuffle = __lsx_vld(row, 1); __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); // 5. store bytes __lsx_vst(utf8_packed, utf8_output, 0); // 6. adjust pointers buf += 8; utf8_output += row[0]; continue; } else { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes forbidden_bytemask = __lsx_vor_v( __lsx_vand_v( __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 forbidden_bytemask); if (__lsx_bnz_v(forbidden_bytemask)) { return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf8_output)); } /* In this branch we handle three cases: 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes We expand the input word (16-bit) into two code units (32-bit), thus we have room for four bytes. However, we need five distinct bit layouts. Note that the last byte in cases #2 and #3 is the same. We precompute byte 1 for case #1 and the common byte for cases #2 & #3 in register t2. We precompute byte 1 for case #3 and -- **conditionally** -- precompute either byte 1 for case #2 or byte 2 for case #3. Note that they differ by exactly one bit. Finally from these two code units we build proper UTF-8 sequence, taking into account the case (i.e, the number of bytes to write). */ /** * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed); t0 = __lsx_vilvl_b(t0, t0); // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, __lsx_vldi(-2752 /*0x4000*/)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit __m128i out0 = __lsx_vilvl_h(s4, t2); __m128i out1 = __lsx_vilvh_h(s4, t2); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle __m128i one_byte_bytemask = __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F)); __m128i one_or_two_bytes_bytemask_u16_to_u32_low = __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); __m128i one_or_two_bytes_bytemask_u16_to_u32_high = __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); __m128i one_byte_bytemask_u16_to_u32_low = __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); __m128i one_byte_bytemask_u16_to_u32_high = __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); const uint32_t mask0 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( one_or_two_bytes_bytemask_u16_to_u32_low, one_byte_bytemask_u16_to_u32_low)), 0); const uint32_t mask1 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( one_or_two_bytes_bytemask_u16_to_u32_high, one_byte_bytemask_u16_to_u32_high)), 0); const uint8_t *row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; __m128i shuffle0 = __lsx_vld(row0, 1); __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; __m128i shuffle1 = __lsx_vld(row1, 1); __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); __lsx_vst(utf8_0, utf8_output, 0); utf8_output += row0[0]; __lsx_vst(utf8_1, utf8_output, 0); utf8_output += row1[0]; buf += 8; } // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> // will produce four UTF-8 bytes. } else { // Let us do a scalar fallback. // It may seem wasteful to use scalar code, but being efficient with SIMD // in the presence of surrogate pairs may require non-trivial tables. size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1); } for (; k < forward; k++) { uint32_t word = buf[k]; if ((word & 0xFFFFFF80) == 0) { *utf8_output++ = char(word); } else if ((word & 0xFFFFF800) == 0) { *utf8_output++ = char((word >> 6) | 0b11000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else if ((word & 0xFFFF0000) == 0) { if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair( result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 12) | 0b11100000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { if (word > 0x10FFFF) { return std::make_pair( result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 18) | 0b11110000); *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } } buf += k; } } // while return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); }