// Convert up to 16 bytes from UTF-8 to UTF-16, using a mask indicating the
// end of each code point. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
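// For example, for the input "a€" (bytes 0x61, 0xE2, 0x82, 0xAC), bits 0 and
// 3 of the mask would be set, since bytes 0 and 3 each end a code point.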
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
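  // (The utf8bigindex table below is indexed by the 12-bit mask, so it needs
  // 4096 entries; a full 16-bit mask would require 65536.)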
  //
  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // may be beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially,
  // also higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
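  // (A mask of 0xFFFF means that every byte ends a code point, which can only
  // happen when all 16 bytes are single-byte, i.e. ASCII, characters.)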
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
    // We process in chunks of 16 bytes
    // The routine in simd.h is reused.
    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
    temp.store_ascii_as_utf16<big_endian>(utf16_output);
    utf16_output += 16; // We wrote 16 16-bit characters.
    return 16;          // We consumed 16 bytes.
  }

  // 3 byte sequences are the next most common, as seen in CJK, which has long
  // sequences of these.
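  // 0x924 is 0b100100100100: a code point ends at bytes 2, 5, 8 and 11,
  // i.e., the 12 bytes hold exactly four 3-byte sequences.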
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
    // UTF-16 code units.
    uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
    }
    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
    utf16_output += 4; // We wrote 4 16-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
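  // 0xaaa is 0b101010101010: a code point ends at every odd byte, i.e., the
  // 12 bytes hold exactly six 2-byte sequences.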
  if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
    // UTF-16 code units.
    uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed =
          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
    }
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);

    utf16_output += 6; // We wrote 6 16-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  // We do not have a fast path available, or the fast path is unimportant, so
  // we fall back to the generic, table-driven path.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
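  // idx selects a shuffle from shufutf8, and its range encodes the layout
  // handled below: idx < 64 means six 1-2 byte code units, idx < 145 means
  // four 1-3 byte code units, and idx < 209 means three 1-4 byte code units.
  // consumed is the number of input bytes those code units span.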

  if (idx < 64) {
    // SIX (6) input code units
    // Convert to UTF-16
    uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed =
          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
    }
    // Store
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
    utf16_output += 6; // We wrote 6 16-bit characters.
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // XXX: depending on the system, scalar instructions might be faster.
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
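    // For reference, the scalar decode that the shifts below reproduce (a
    // sketch, using the bit names above):
    //   2 byte: cp = (bbbbb << 6) | cccccc
    //   3 byte: cp = (aaaa << 12) | (bbbbbb << 6) | cccccc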
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // 1 byte: 00000000 0ccccccc
    // 2 byte: xx0bbbbb x0cccccc
    // 3 byte: xxbbbbbb x0cccccc
    uint16x4_t lowperm = vmovn_u32(perm);
    // Partially mask with bic (doesn't require a temporary register unlike and)
    // The shift left insert below will clear the top bits.
    // 1 byte: 00000000 00000000
    // 2 byte: xx0bbbbb 00000000
    // 3 byte: xxbbbbbb 00000000
    uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
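    // (vbic computes lowperm & ~0x00FF, since uint16_t(~0xFF00) == 0x00FF, so
    // only the high byte survives.)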
    // ASCII
    // 1 byte: 00000000 0ccccccc
    // 2+byte: 00000000 00cccccc
    uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
    // Split into narrow vectors.
    // 2 byte: 00000000 00000000
    // 3 byte: 00000000 xxxxaaaa
    uint16x4_t highperm = vshrn_n_u32(perm, 16);
    // Shift right accumulate the middle byte
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 00xx0bbb bbcccccc
    // 3 byte: 00xxbbbb bbcccccc
    uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
    // Shift left and insert the top 4 bits, overwriting the garbage
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 00000bbb bbcccccc
    // 3 byte: aaaabbbb bbcccccc
    uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
    }
    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);

    utf16_output += 4; // We wrote 4 16-bit code units.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 sequences and turn them into 3 4-byte
      // UTF-16 surrogate pairs. Generating surrogate pairs is a little tricky,
      // but it is easier when we can assume they are all pairs. This version
      // does not use the LUT; since 4-byte sequences are less common, the
      // overhead of the extra memory access matters less here than the early
      // branch overhead would in the more common shorter sequences.
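      // For reference, the scalar encoding these vector steps reproduce (a
      // sketch, for a decoded code point cp >= 0x10000):
      //   uint32_t v = cp - 0x10000;
      //   char16_t lead  = char16_t(0xD800 | (v >> 10));
      //   char16_t trail = char16_t(0xDC00 | (v & 0x3FF));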

      // Swap byte pairs
      // 10dddddd 10cccccc|10bbbbbb 11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      uint8x16_t swap = vrev16q_u8(in);
      // Shift left 2 bits
      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
      uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
      // Create a magic number containing the low 2 bits of the trail surrogate
      // and all the corrections needed to create the pair.
      // UTF-8 4b prefix   = -0x0000|0xF000
      // surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
      // surrogate high    = +0x0000|0xD800
      // surrogate low     = +0xDC00|0x0000
      // -------------------------------
      //                   = +0xDC00|0xE7C0
      uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
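      // Compile-time sanity check: the three lead-half corrections above
      // collapse to 0xE7C0 modulo 2^16.
      static_assert(uint16_t(0xD800 - 0xF000 - 0x0040) == 0xE7C0,
                    "surrogate corrections must collapse to 0xE7C0");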
      // Generate unadjusted trail surrogate minus lowest 2 bits
      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
      uint32x4_t trail =
          vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
      // Insert low 2 bits of trail surrogate to magic number for later
      // 11011100 00000000 11100111 110000cc
      uint16x8_t magic_with_low_2 =
          vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
      // Generate lead surrogate
      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
      uint32x4_t lead = vreinterpretq_u32_u16(
          vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
      // Mask out lead
      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
      lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
      // Blend pairs
      // 000000cc ccdddddd|11110aaa bbbbbb00
      uint16x8_t blend = vreinterpretq_u16_u32(
          vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
      // Add magic number to finish the result
      // 110111CC CCDDDDDD|110110AA BBBBBBCC
      uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
      // Byte swap if necessary
      if (!match_system(big_endian)) {
        composed =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
      }
      uint16_t buffer[8];
      vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
      for (int k = 0; k < 6; k++) {
        utf16_output[k] = buffer[k];
      } // the loop might compile to a couple of instructions.
      utf16_output += 6; // We wrote 3 surrogate pairs (6 16-bit code units).
      return 12;         // We consumed 12 bytes.
    }
    // 3 1-4 byte sequences
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));

    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // Added to fix https://github.com/simdutf/simdutf/issues/514: we only
    // want to write two 16-bit code units when we actually have a surrogate
    // pair. Unfortunately, we cannot trust the input: a stray 0xff input byte
    // must not produce a surrogate pair, so we need to check for that below.
    uint32_t permbuffer[4];
    vst1q_u32(permbuffer, perm);
    // Mask the low and middle bytes
    // 00000000 00000000 00000000 0ddddddd
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
    // Because the surrogates need more work, the high surrogate is computed
    // first.
    uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
    // 00000000 00000000 00cccccc 00000000
    uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
    // Start assembling the sequence. Since the 4th byte is in the same
    // position as it would be in a surrogate and there is no dependency,
    // shift left instead of right.
    // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
    // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
    // The top 16 bits contain the high ten bits of the surrogate pair before
    // correction.
    // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
    // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
    uint32x4_t abc =
        vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
    // Combine the low 6 or 7 bits by a shift-right accumulate
    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o correction
    uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
    // After this is for surrogates
    // Blend the low and high surrogates
    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
    uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
    // yet, as 0x10000 was not subtracted from the codepoint yet.
    // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
    uint16x8_t masked_pair = vreinterpretq_u16_u32(
        vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
    // Correct the remaining UTF-8 prefix, surrogate offset, and add the
    // surrogate prefixes in one magic 16-bit addition: a similar magic number
    // to the one above, but without the continuation-byte adjustment and with
    // the halfwords swapped.
    // UTF-8 4b prefix   = -0xF000|0x0000
    // surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
    // surrogate high    = +0xD800|0x0000
    // surrogate low     = +0x0000|0xDC00
    // -----------------------------------
    //                   = +0xE7C0|0xDC00
    uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
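    // Sanity check: as with 0xDC00E7C0 above,
    // uint16_t(0xD800 - 0xF000 - 0x0040) == 0xE7C0, here in the high
    // halfword, with the low-surrogate prefix 0xDC00 in the low halfword.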
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
    uint32x4_t surrogates =
        vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
    uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));

    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
    uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      selected =
          vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
    }
    // Attempting to shuffle and store would be complex, just scalarize.
    uint32_t buffer[4];
    vst1q_u32(buffer, selected);
    // Test for the top bit of the surrogate mask. Removed due to issue 514:
    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
    // 0x00800000;
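    // Instead, test the permuted input directly: only a lane whose top byte
    // is a 4-byte lead byte (11110xxx) may emit a surrogate pair, hence the
    // (permbuffer[i] & 0xf8000000) == 0xf0000000 test below.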
    for (size_t i = 0; i < 3; i++) {
      // Surrogate pair?
      // Used to be `if (buffer[i] & SURROGATE_MASK)`; see the discussion of
      // https://github.com/simdutf/simdutf/issues/514 above.
      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
        utf16_output[0] = uint16_t(buffer[i] >> 16);
        utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
        utf16_output += 2;
      } else {
        utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
        utf16_output++;
      }
    }
    return consumed;
  } else {
    // Here we know that there is an error, but we do not handle errors.
    return 12;
  }
}
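
// A minimal caller sketch (an illustration, not the library's actual decode
// loop; end_of_code_point_mask_for() is a hypothetical helper standing in for
// the SIMD classification that computes the mask):
//
//   char16_t *out = utf16_buffer;
//   size_t pos = 0;
//   while (pos + 16 <= len) { // the routine may read up to 16 bytes
//     uint64_t mask = end_of_code_point_mask_for(input + pos);
//     pos += convert_masked_utf8_to_utf16<endianness::LITTLE>(input + pos,
//                                                             mask, out);
//   }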