author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-11-29 11:31:35 +0000
---|---|---
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2024-11-29 11:31:35 +0000
commit | b39a9f52ed3f33082f13f51678d053ee80a2e1f4 (patch) |
tree | 2144a18d85681df09f83e255f2e5c6d04e61e878 | /contrib/simdutf/src/haswell
parent | 6c0223b32b8fcb6621fa64197214abb400a09f52 (diff) |
download | rspamd-b39a9f52ed3f33082f13f51678d053ee80a2e1f4.tar.gz, rspamd-b39a9f52ed3f33082f13f51678d053ee80a2e1f4.zip |
[Rework] Replace fastutf with simdutf
Simdutf is faster and has much better support across architectures (especially
non-x86 ones). Hence, it is a good idea to use it instead of the unsupported
fastutf8 code.
Diffstat (limited to 'contrib/simdutf/src/haswell')
16 files changed, 4261 insertions, 0 deletions
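For context, a minimal sketch of the runtime-dispatched simdutf API that replaces the fastutf8-style helpers. The call site below is illustrative only and not part of this commit; see simdutf.h for the authoritative declarations.

```cpp
#include <simdutf.h>
#include <cstdio>

int main() {
  const char text[] = "rspamd now validates UTF-8 via simdutf";
  // simdutf dispatches at runtime: on AVX2-capable CPUs this runs the
  // haswell kernels added by this commit, otherwise a portable fallback.
  bool ok = simdutf::validate_utf8(text, sizeof(text) - 1);
  std::printf("valid UTF-8: %s\n", ok ? "yes" : "no");
  return 0;
}
```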
diff --git a/contrib/simdutf/src/haswell/avx2_base64.cpp b/contrib/simdutf/src/haswell/avx2_base64.cpp new file mode 100644 index 000000000..87302d181 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_base64.cpp @@ -0,0 +1,577 @@ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ + +template <bool base64_url> +simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { + // credit: Wojciech Muła + __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); + const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); + result = + _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); + __m256i shift_LUT; + if (base64_url) { + shift_LUT = _mm256_setr_epi8( + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, + + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); + } else { + shift_LUT = _mm256_setr_epi8( + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, + + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + } + + result = _mm256_shuffle_epi8(shift_LUT, result); + return _mm256_add_epi8(result, input); +} + +template <bool isbase64url> +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // credit: Wojciech Muła + const uint8_t *input = (const uint8_t *)src; + + uint8_t *out = (uint8_t *)dst; + const __m256i shuf = + _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + size_t i = 0; + for (; i + 100 <= srclen; i += 96) { + const __m128i lo0 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0)); + const __m128i hi0 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1)); + const __m128i lo1 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2)); + const __m128i hi1 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3)); + const __m128i lo2 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4)); + const __m128i hi2 = _mm_loadu_si128( + 
reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5)); + const __m128i lo3 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6)); + const __m128i hi3 = _mm_loadu_si128( + reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7)); + + __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); + __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); + __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); + __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); + + const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); + + const __m256i t1_0 = + _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); + const __m256i t1_1 = + _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); + const __m256i t1_2 = + _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); + const __m256i t1_3 = + _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); + + const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); + + const __m256i t3_0 = + _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); + const __m256i t3_1 = + _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); + const __m256i t3_2 = + _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); + const __m256i t3_3 = + _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); + + const __m256i input0 = _mm256_or_si256(t1_0, t3_0); + const __m256i input1 = _mm256_or_si256(t1_1, t3_1); + const __m256i input2 = _mm256_or_si256(t1_2, t3_2); + const __m256i input3 = _mm256_or_si256(t1_3, t3_3); + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved<isbase64url>(input0)); + out += 32; + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved<isbase64url>(input1)); + out += 32; + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved<isbase64url>(input2)); + out += 32; + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved<isbase64url>(input3)); + out += 32; + } + for (; i + 28 <= srclen; i += 24) { + // lo = [xxxx|DDDC|CCBB|BAAA] + // hi = [xxxx|HHHG|GGFF|FEEE] + const __m128i lo = + _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i)); + const __m128i hi = + _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3)); + + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] + __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); + + // this part is well commented in encode.sse.cpp + + const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); + const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); + const __m256i indices = _mm256_or_si256(t1, t3); + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved<isbase64url>(indices)); + out += 32; + } + return i / 3 * 4 + 
scalar::base64::tail_encode_base64((char *)out, src + i, + srclen - i, options); +} + +static inline void compress(__m128i data, uint16_t mask, char *output) { + if (mask == 0) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); + return; + } + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + + __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], + tables::base64::thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(data, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>( + tables::base64::pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); +} + +static inline void compress(__m256i data, uint32_t mask, char *output) { + if (mask == 0) { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data); + return; + } + compress(_mm256_castsi256_si128(data), uint16_t(mask), output); + compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), + output + _mm_popcnt_u32(~mask & 0xFFFF)); +} + +struct block64 { + __m256i chunks[2]; +}; + +template <bool base64_url> +static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { + const __m256i ascii_space_tbl = + _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, + 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); + // credit: aqrit + __m256i delta_asso; + if (base64_url) { + delta_asso = + _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); + } else { + delta_asso = _mm256_setr_epi8( + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + } + + __m256i delta_values; + if (base64_url) { + delta_values = _mm256_setr_epi8( + 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), + uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), + uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), + uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), + uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); + } else { + delta_values = _mm256_setr_epi8( + int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), + int8_t(0x10), 
int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), + int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), + int8_t(0xB9), int8_t(0xB9)); + } + __m256i check_asso; + + if (base64_url) { + check_asso = + _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, + 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); + } else { + + check_asso = _mm256_setr_epi8( + 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + } + __m256i check_values; + if (base64_url) { + check_values = _mm256_setr_epi8( + uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), + uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), + 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), + uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, + uint8_t(0x80), 0x0, uint8_t(0x80)); + } else { + check_values = _mm256_setr_epi8( + int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), + int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), + int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), + int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), + int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), + int8_t(0x91), int8_t(0x80)); + } + const __m256i shifted = _mm256_srli_epi32(*src, 3); + const __m256i delta_hash = + _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); + const __m256i check_hash = + _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); + const __m256i out = + _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); + const __m256i chk = + _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); + const int mask = _mm256_movemask_epi8(chk); + if (mask) { + __m256i ascii_space = + _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); + *error = (mask ^ _mm256_movemask_epi8(ascii_space)); + } + *src = out; + return (uint32_t)mask; +} + +template <bool base64_url> +static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { + uint32_t err0 = 0; + uint32_t err1 = 0; + uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], &err0); + uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], &err1); + *error = err0 | ((uint64_t)err1 << 32); + return m0 | (m1 << 32); +} + +static inline void copy_block(block64 *b, char *output) { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]); +} + +static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t nmask = ~mask; + compress(b->chunks[0], uint32_t(mask), output); + compress(b->chunks[1], uint32_t(mask >> 32), + output + _mm_popcnt_u64(nmask & 0xFFFFFFFF)); + return _mm_popcnt_u64(nmask); +} + +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. 
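As an aside, a scalar model of what the compress() overloads above compute (a sketch for exposition, not part of this file): bytes whose mask bit is set are dropped and the survivors are shifted down, which is what the thintable/pshufb sequence achieves without a branch per byte.

```cpp
#include <cstddef>
#include <cstdint>

// Scalar equivalent of the 16-byte compress() above: drop bytes whose
// mask bit is 1, keep the rest in order; returns the number of bytes kept.
static size_t compress_scalar(const uint8_t in[16], uint16_t mask,
                              uint8_t *out) {
  size_t n = 0;
  for (int i = 0; i < 16; i++) {
    if ((mask & (1u << i)) == 0) {
      out[n++] = in[i];
    }
  }
  return n;
}
```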
+static inline void load_block(block64 *b, const char *src) { + b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); + b->chunks[1] = + _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)); +} + +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char16_t *src) { + __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); + __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16)); + __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)); + __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48)); + __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20); + __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31); + __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20); + __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31); + b->chunks[0] = _mm256_packus_epi16(m1p, m2p); + b->chunks[1] = _mm256_packus_epi16(m3p, m4p); +} + +static inline void base64_decode(char *out, __m256i str) { + // credit: aqrit + const __m256i pack_shuffle = + _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); + const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140)); + const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000)); + const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle); + + // Store the output: + _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2)); + _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1)); +} +// decode 64 bytes and output 48 bytes +static inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, + _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src))); + base64_decode(out + 24, _mm256_loadu_si256( + reinterpret_cast<const __m256i *>(src + 32))); +} +static inline void base64_decode_block_safe(char *out, const char *src) { + base64_decode(out, + _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src))); + char buffer[32]; // We enforce safety with a buffer. + base64_decode( + buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32))); + std::memcpy(out + 24, buffer, 24); +} +static inline void base64_decode_block(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); + base64_decode(out + 24, b->chunks[1]); +} +static inline void base64_decode_block_safe(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); + char buffer[32]; // We enforce safety with a buffer. + base64_decode(buffer, b->chunks[1]); + std::memcpy(out + 24, buffer, 24); +} + +template <bool base64_url, typename chartype> +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = base64_url ? 
tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + size_t equallocation = + srclen; // location of the first padding character if any + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + size_t equalsigns = 0; + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 1; + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 2; + } + } + if (srclen == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + char *end_of_safe_64byte_zone = + (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst; + + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; + + constexpr size_t block_size = 6; + static_assert(block_size >= 2, "block_size must be at least two"); + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + uint64_t error = 0; + uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error); + if (error) { + src -= 64; + size_t error_offset = _tzcnt_u64(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + } + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. + bufferptr += compress_block(&b, badcharmask, bufferptr); + } else if (bufferptr != buffer) { + copy_block(&b, bufferptr); + bufferptr += 64; + } else { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, &b); + } else { + base64_decode_block(dst, &b); + } + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 2); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer + (block_size - 2) * 64); + } else { + base64_decode_block(dst, buffer + (block_size - 2) * 64); + } + dst += 48; + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. 
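The scalar tail of this function repacks four already-translated 6-bit base64 values into three output bytes. A standalone sketch of that arithmetic (not part of the diff; the in-tree code computes the same `triple`, byte-swaps it, and stores via memcpy):

```cpp
#include <cstdint>

// Pack four 6-bit base64 values into three output bytes, mirroring the
// `triple` computation in the scalar tail below.
static void pack4to3(const uint8_t v[4], uint8_t out[3]) {
  const uint32_t triple = (uint32_t(v[0]) << 18) | (uint32_t(v[1]) << 12) |
                          (uint32_t(v[2]) << 6) | uint32_t(v[3]);
  out[0] = uint8_t(triple >> 16);
  out[1] = uint8_t(triple >> 8);
  out[2] = uint8_t(triple);
}
```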
+ int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { + + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if (!scalar::base64::is_eight_byte(*src) || val > 64) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } + + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer_start); + } else { + base64_decode_block(dst, buffer_start); + } + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 4); + + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + src--; + leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r.input_count += size_t(src - srcinit); + if (r.error == error_code::INVALID_BASE64_CHARACTER || + r.error == error_code::BASE64_EXTRA_BITS) { + return r; + } else { + r.output_count += size_t(dst - dstinit); + } + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + r.error = error_code::INVALID_BASE64_CHARACTER; + r.input_count = equallocation; + } + } + return r; + } + if (equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp new file mode 100644 index 000000000..6484dcedf --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp @@ -0,0 +1,37 @@ +template <endianness big_endian> +std::pair<const char *, char16_t *> +avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 32 + + size_t i = 0; + for (; i < rounded_len; i += 16) { + // Load 16 bytes from the address (input + i) into a xmm register + __m128i xmm0 = + _mm_loadu_si128(reinterpret_cast<const __m128i *>(latin1_input + i)); + + // Zero extend each byte in xmm0 to word and put it in another xmm register + __m128i xmm1 = 
_mm_cvtepu8_epi16(xmm0); + + // Shift xmm0 to the right by 8 bytes + xmm0 = _mm_srli_si128(xmm0, 8); + + // Zero extend each byte in the shifted xmm0 to word in xmm0 + xmm0 = _mm_cvtepu8_epi16(xmm0); + + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + xmm0 = _mm_shuffle_epi8(xmm0, swap); + xmm1 = _mm_shuffle_epi8(xmm1, swap); + } + + // Store the contents of xmm1 into the address pointed by (output + i) + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1); + + // Store the contents of xmm0 into the address pointed by (output + i + 8) + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0); + } + + return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp new file mode 100644 index 000000000..f89550b95 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp @@ -0,0 +1,20 @@ +std::pair<const char *, char32_t *> +avx2_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 + + for (size_t i = 0; i < rounded_len; i += 8) { + // Load 8 Latin1 characters into a 64-bit register + __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]); + + // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using + // vpmovzxbd + __m256i out = _mm256_cvtepu8_epi32(in); + + // Store the results back to memory + _mm256_storeu_si256((__m256i *)&utf32_output[i], out); + } + + // return pointers pointing to where we left off + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp new file mode 100644 index 000000000..a637e1bb0 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp @@ -0,0 +1,83 @@ +std::pair<const char *, char *> +avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_output) { + const char *end = latin1_input + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + const size_t safety_margin = 12; + + while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input); + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_80 = _mm_set1_epi8((char)0x80); + if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!! + // 1. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, in8); + // 2. adjust pointers + latin1_input += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // We proceed only with the first 16 bytes. + const __m256i in = _mm256_cvtepu8_epi16((in8)); + + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0000|aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [0000|00aa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [0000|00aa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [1100|00aa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); + + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)] + [0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. 
adjust pointers + latin1_input += 16; + continue; + + } // while + return std::make_pair(latin1_input, utf8_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp new file mode 100644 index 000000000..8c46a23a8 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp @@ -0,0 +1,85 @@ +template <endianness big_endian> +std::pair<const char16_t *, char *> +avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 16) { + // Load 16 UTF-16 characters into 256-bit AVX2 register + __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf)); + + if (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { + // Pack 16-bit characters into 8-bit and store in latin1_output + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + latin1_packed_hi); + // Adjust pointers for next iteration + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} + +template <endianness big_endian> +std::pair<result, char *> +avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf)); + + if (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + latin1_packed_hi); + buf += 16; + latin1_output += 16; + } else { + // Fallback to scalar code for handling errors + for (int k = 0; k < 16; k++) { + uint16_t word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair( + result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, + latin1_output); + } + } + buf += 16; + } + } // while + return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)}, + latin1_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp new file mode 100644 index 000000000..d396893ca --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp @@ -0,0 +1,210 @@ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ +template <endianness big_endian> +std::pair<const char16_t *, char32_t *> +avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. 
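The surrogate check that follows is the vector form of a one-line scalar predicate (a sketch, not from the diff):

```cpp
#include <cstdint>

// Scalar form of the (in & 0xF800) == 0xD800 comparison applied to all
// sixteen code units at once below; true for any UTF-16 surrogate.
static bool is_surrogate(uint16_t w) { return (w & 0xF800) == 0xD800; }
```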
+ const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code + // units + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf32_output); +} + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template <endianness big_endian> +std::pair<result, char32_t *> +avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. 
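For reference, the pair-combination math used by the scalar fallback below, as a standalone sketch (not part of the diff):

```cpp
#include <cstdint>

// Combine a UTF-16 high/low surrogate pair into a code point; returns
// false for a malformed pair, matching the (diff | diff2) > 0x3FF check.
static bool combine_surrogates(uint16_t high, uint16_t low, uint32_t &cp) {
  const uint32_t diff = uint32_t(high) - 0xD800;  // top 10 bits
  const uint32_t diff2 = uint32_t(low) - 0xDC00;  // low 10 bits
  if ((diff | diff2) > 0x3FF) {
    return false;
  }
  cp = (diff << 10) + diff2 + 0x10000;
  return true;
}
```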
+ if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code + // units + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp new file mode 100644 index 000000000..2a26a0584 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp @@ -0,0 +1,602 @@ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. 
+*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template <endianness big_endian> +std::pair<const char16_t *, char *> +avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { + const char16_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. 
store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast<uint8_t>(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast<uint8_t>(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf8_output); +} + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template <endianness big_endian> +std::pair<result, char *> +avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. 
merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast<uint8_t>(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast<uint8_t>(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + 
_mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp new file mode 100644 index 000000000..d6a32d5df --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp @@ -0,0 +1,93 @@ +std::pair<const char32_t *, char *> +avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 + + __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + + __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + + for (size_t i = 0; i < rounded_len; i += 16) { + __m256i in1 = _mm256_loadu_si256((__m256i *)buf); + __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); + + __m256i check_combined = _mm256_or_si256(in1, in2); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + return std::make_pair(nullptr, latin1_output); + } + + // Turn UTF32 bytes into latin 1 bytes + __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); + __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); + + // move Latin1 bytes to their correct spot + __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); + __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); + __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); + __m256i reshuffled2 = 
_mm256_permutevar8x32_epi32(shuffled2, idx2); + + __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); + _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result)); + + latin1_output += 16; + buf += 16; + } + + return std::make_pair(buf, latin1_output); +} +std::pair<result, char *> +avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 + + __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + + const char32_t *start = buf; + + for (size_t i = 0; i < rounded_len; i += 16) { + __m256i in1 = _mm256_loadu_si256((__m256i *)buf); + __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); + + __m256i check_combined = _mm256_or_si256(in1, in2); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + // Fallback to scalar code for handling errors + for (int k = 0; k < 8; k++) { + char32_t codepoint = buf[k]; + if (codepoint <= 0xFF) { + *latin1_output++ = static_cast<char>(codepoint); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + buf += 8; + } else { + __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); + __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); + + __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); + __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); + __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); + __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); + + __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); + _mm_storeu_si128((__m128i *)latin1_output, + _mm256_castsi256_si128(result)); + + latin1_output += 16; + buf += 16; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp new file mode 100644 index 000000000..ffd6f1e47 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp @@ -0,0 +1,174 @@ +template <endianness big_endian> +std::pair<const char32_t *, char16_t *> +avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *end = buf + len; + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); + + __m128i utf16_packed = 
_mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf16_output); + } + *utf16_output++ = + big_endian + ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf16_output); + } + + return std::make_pair(buf, utf16_output); +} + +template <endianness big_endian> +std::pair<result, char16_t *> +avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf16_output); + } + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 
&& word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf16_output); + } + *utf16_output++ = + big_endian + ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp new file mode 100644 index 000000000..e1fe5c222 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp @@ -0,0 +1,569 @@ +std::pair<const char32_t *, char *> +avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { + const char32_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast<uint8_t>(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast<uint8_t>(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + 
_mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32( + _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf8_output); + } + + return std::make_pair(buf, utf8_output); +} + +std::pair<result, char *> +avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + // Check for too large input + const __m256i max_input = + _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if (static_cast<uint32_t>(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + utf8_output); + } + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // 
saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. 
adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + + // Check for illegal surrogate code units + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf8_output); + } + + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. 
+ /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast<uint8_t>(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast<uint8_t>(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? 
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp new file mode 100644 index 000000000..8e78ab551 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp @@ -0,0 +1,60 @@ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + const __m128i in = _mm_loadu_si128((__m128i *)input); + + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & + 0xfff; // we are only processing 12 bytes in case it is not all ASCII + + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 1 bytes. + } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. 
The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small lookup + // table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed, composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should + // investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp new file mode 100644 index 000000000..d99a8ed9d --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp @@ -0,0 +1,195 @@ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +template <endianness big_endian> +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + __m256i ascii = _mm256_cvtepu8_epi16(in); + if (big_endian) { + const __m256i swap256 = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + ascii = _mm256_shuffle_epi8(ascii, swap256); + } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); + utf16_output += 12; // We wrote 12 16-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. 
+ const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) + composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const __m128i sh = _mm_loadu_si128( + (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential + // overflow of 4 bytes. 
+ } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = _mm_loadu_si128( + (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) + composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; // Here we overflow by 8 bytes. + } else if (idx < 209) { + // TWO (2) input code-code units + ////////////// + // There might be garbage inputs where a leading byte mascarades as a + // four-byte leading byte (by being followed by 3 continuation byte), but is + // not greater than 0xf0. This could trigger a buffer overflow if we only + // counted leading bytes of the form 0xf0 as generating surrogate pairs, + // without further UTF-8 validation. Thus we must be careful to ensure that + // only leading bytes at least as large as 0xf0 generate surrogate pairs. We + // do as at the cost of an extra mask. + ///////////// + const __m128i sh = _mm_loadu_si128( + (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + // We deliberately carry the leading four bits in highbyte if they are + // present, we remove them later when computing hightenbits. + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + // When we need to generate a surrogate pair (leading byte > 0xF0), then + // the corresponding 32-bit value in 'composed' will be greater than + // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the + // location of the surrogate pairs. 
+ const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + // Notice the 0x3ff mask: + const __m128i hightenbits = + _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff)); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + uint32_t basic_buffer_swap[4]; + if (big_endian) { + _mm_storeu_si128((__m128i *)basic_buffer_swap, + _mm_shuffle_epi8(composed, swap)); + surrogates = _mm_shuffle_epi8(surrogates, swap); + } + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] > 0x3c00000) { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } else { + utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) + : uint16_t(basic_buffer[i]); + utf16_output++; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp new file mode 100644 index 000000000..c5cf74143 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp @@ -0,0 +1,135 @@ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu8_epi32(in)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8))); + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte + // UTF-32 code units. 
There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, + _mm256_cvtepu16_epi32(composed)); + utf32_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, + _mm256_cvtepu16_epi32(composed)); + utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential + // overflow of 32 - 24 = 8 bytes. 
+ } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += + 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes. + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} diff --git a/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp b/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp new file mode 100644 index 000000000..0c54062d4 --- /dev/null +++ b/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp @@ -0,0 +1,206 @@ +/* + In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. + + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 
+
+      0  1  2  3  4  5  6  7    <--- word index
+    [ V | L | H | L | H | V | V | L ]
+      1  0  0  0  0  1  1  0    - V = valid words
+      0  1  0  1  0  0  0  1    - L = low surrogate
+      0  0  1  0  1  0  0  0    - H = high surrogate
+
+      1  0  0  0  0  1  1  0    V = valid words
+      0  1  0  1  0  0  0  0    a = L & (H >> 1)
+      0  0  1  0  1  0  0  0    b = a << 1
+      1  1  1  1  1  1  1  0    c = V | a | b
+                            ^
+                            the last bit can be zero; we just consume 7
+                            code units and recheck this word in the next
+                            iteration
+*/
+
+/* Returns:
+   - pointer to the last unprocessed character (a scalar fallback should check
+     the rest);
+   - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
+  const char16_t *end = input + size;
+
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    // byte of each word, we compress the two vectors into one which
+    // consists of only the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+    if (big_endian) {
+      in0 = in0.swap_bytes();
+      in1 = in1.swap_bytes();
+    }
+
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
+
+    const auto in = simd16<uint16_t>::pack(t0, t1);
+
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+    if (surrogates_bitmask == 0x0) {
+      input += simd16<uint16_t>::ELEMENTS * 2;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      // Fact: a high surrogate has the 11th bit set (the 3rd bit in the
+      // higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint32_t V = ~surrogates_bitmask;
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint32_t H = vH.to_bitmask();
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint32_t L = ~H & surrogates_bitmask;
+
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by a high one.
+                        // (A low surrogate placed in the last word of the
+                        // register is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the opposite relation holds as well;
+                  // thanks to that we need only two masks for the valid case.
+      const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+      if (c == 0xffffffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += simd16<uint16_t>::ELEMENTS * 2;
+      } else if (c == 0x7fffffff) {
+        // The lower 31 code units of the input register contain valid UTF-16.
+        // The 31st code unit may be either a low or a high surrogate. In the
+        // next iteration we 1) check if the low surrogate is followed by a
+        // high one, 2) reject a sole high surrogate.
+        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  return input;
+}
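+
+// Editorial sketch, not part of the upstream simdutf sources: the V/L/H
+// combination used above can be modelled with plain scalar masks. This
+// assumes an 8-word chunk as in the diagram, one bit per word in each mask.
+//
+//   bool chunk_is_valid_8(uint32_t V, uint32_t L, uint32_t H) {
+//     uint32_t a = L & (H >> 1); // low surrogate followed by a high one
+//     uint32_t b = a << 1;       // the high surrogates matched by `a`
+//     uint32_t c = V | a | b;    // every word accounted for?
+//     return c == 0xff;          // 0xff = all 8 words valid
+//   }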
+
+template <endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t *input,
+                                             size_t size) {
+  if (simdutf_unlikely(size == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  const char16_t *start = input;
+  const char16_t *end = input + size;
+
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    // byte of each word, we compress the two vectors into one which
+    // consists of only the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+    if (big_endian) {
+      in0 = in0.swap_bytes();
+      in1 = in1.swap_bytes();
+    }
+
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
+
+    const auto in = simd16<uint16_t>::pack(t0, t1);
+
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+    if (surrogates_bitmask == 0x0) {
+      input += simd16<uint16_t>::ELEMENTS * 2;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      // Fact: a high surrogate has the 11th bit set (the 3rd bit in the
+      // higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint32_t V = ~surrogates_bitmask;
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint32_t H = vH.to_bitmask();
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint32_t L = ~H & surrogates_bitmask;
+
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by a high one.
+                        // (A low surrogate placed in the last word of the
+                        // register is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the opposite relation holds as well;
+                  // thanks to that we need only two masks for the valid case.
+      const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+      if (c == 0xffffffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += simd16<uint16_t>::ELEMENTS * 2;
+      } else if (c == 0x7fffffff) {
+        // The lower 31 code units of the input register contain valid UTF-16.
+        // The 31st code unit may be either a low or a high surrogate. In the
+        // next iteration we 1) check if the low surrogate is followed by a
+        // high one, 2) reject a sole high surrogate.
+        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+      } else {
+        return result(error_code::SURROGATE, input - start);
+      }
+    }
+  }
+
+  return result(error_code::SUCCESS, input - start);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp b/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp
new file mode 100644
index 000000000..8cb1d5f3b
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp
@@ -0,0 +1,70 @@
+/* Returns:
+   - pointer to the last unprocessed character (a scalar fallback should check
+     the rest);
+   - nullptr if an error was detected.
+*/
+const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) {
+  const char32_t *end = input + size;
+
+  // Any code point above 0x10FFFF is out of range. Adding 0xffff2000 maps
+  // the surrogate range 0xD800..0xDFFF onto 0xfffff800..0xffffffff, so a
+  // running maximum above 0xfffff7ff reveals a surrogate code point.
+  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
+  const __m256i offset = _mm256_set1_epi32(0xffff2000);
+  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
+  __m256i currentmax = _mm256_setzero_si256();
+  __m256i currentoffsetmax = _mm256_setzero_si256();
+
+  while (input + 8 < end) {
+    const __m256i in = _mm256_loadu_si256((__m256i *)input);
+    currentmax = _mm256_max_epu32(in, currentmax);
+    currentoffsetmax =
+        _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+    input += 8;
+  }
+  __m256i is_zero =
+      _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
+  if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    return nullptr;
+  }
+
+  is_zero =
+      _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax),
+                       standardoffsetmax);
+  if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    return nullptr;
+  }
+
+  return input;
+}
+
+const result avx2_validate_utf32le_with_errors(const char32_t *input,
+                                               size_t size) {
+  const char32_t *start = input;
+  const char32_t *end = input + size;
+
+  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
+  const __m256i offset = _mm256_set1_epi32(0xffff2000);
+  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
+  __m256i currentmax = _mm256_setzero_si256();
+  __m256i currentoffsetmax = _mm256_setzero_si256();
+
+  while (input + 8 < end) {
+    const __m256i in = _mm256_loadu_si256((__m256i *)input);
+    currentmax = _mm256_max_epu32(in, currentmax);
+    currentoffsetmax =
+        _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+
+    __m256i is_zero = _mm256_xor_si256(
+        _mm256_max_epu32(currentmax, standardmax), standardmax);
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+      return result(error_code::TOO_LARGE, input - start);
+    }
+
+    is_zero =
+        _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax),
+                         standardoffsetmax);
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+      return result(error_code::SURROGATE, input - start);
+    }
+    input += 8;
+  }
+
+  return result(error_code::SUCCESS, input - start);
+}
diff --git a/contrib/simdutf/src/haswell/implementation.cpp b/contrib/simdutf/src/haswell/implementation.cpp
new file mode 100644
index 000000000..0225f1f95
--- /dev/null
+++ b/contrib/simdutf/src/haswell/implementation.cpp
@@ -0,0 +1,1145 @@
+#include "tables/utf8_to_utf16_tables.h"
+#include "scalar/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+#include "scalar/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "scalar/utf8_to_utf32/utf8_to_utf32.h"
+#include "tables/utf16_to_utf8_tables.h"
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+#include "simdutf/haswell/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_HASWELL_H
+  #error "haswell.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  return input.reduce_or().is_ascii();
+}
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte =
+      prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte =
prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8<uint8_t> is_fourth_byte = + prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction + // will be <= 64, so signed comparison is fine. + return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > + int8_t(0); +} + +simdutf_really_inline simd8<bool> +must_be_2_3_continuation(const simd8<uint8_t> prev2, + const simd8<uint8_t> prev3) { + simd8<uint8_t> is_third_byte = + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80 + simd8<uint8_t> is_fourth_byte = + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80 + return simd8<bool>(is_third_byte | is_fourth_byte); +} + +#include "haswell/avx2_validate_utf16.cpp" +#include "haswell/avx2_validate_utf32le.cpp" + +#include "haswell/avx2_convert_latin1_to_utf8.cpp" +#include "haswell/avx2_convert_latin1_to_utf16.cpp" +#include "haswell/avx2_convert_latin1_to_utf32.cpp" + +#include "haswell/avx2_convert_utf8_to_utf16.cpp" +#include "haswell/avx2_convert_utf8_to_utf32.cpp" + +#include "haswell/avx2_convert_utf16_to_latin1.cpp" +#include "haswell/avx2_convert_utf16_to_utf8.cpp" +#include "haswell/avx2_convert_utf16_to_utf32.cpp" + +#include "haswell/avx2_convert_utf32_to_latin1.cpp" +#include "haswell/avx2_convert_utf32_to_utf8.cpp" +#include "haswell/avx2_convert_utf32_to_utf16.cpp" + +#include "haswell/avx2_convert_utf8_to_latin1.cpp" + +#include "haswell/avx2_base64.cpp" + +} // unnamed namespace +} // namespace SIMDUTF_IMPLEMENTATION +} // namespace simdutf + +#include "generic/buf_block_reader.h" +#include "generic/utf8_validation/utf8_lookup4_algorithm.h" +#include "generic/utf8_validation/utf8_validator.h" +// transcoding from UTF-8 to UTF-16 +#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h" +#include "generic/utf8_to_utf16/utf8_to_utf16.h" +// transcoding from UTF-8 to UTF-32 +#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h" +#include "generic/utf8_to_utf32/utf8_to_utf32.h" +// other functions +#include "generic/utf8.h" +#include "generic/utf16.h" + +// transcoding from UTF-8 to Latin 1 +#include "generic/utf8_to_latin1/utf8_to_latin1.h" +#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h" + +namespace simdutf { +namespace SIMDUTF_IMPLEMENTATION { + +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. 
+ auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast<const char16_t *>(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; +} + +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_ascii(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = avx2_validate_utf16<endianness::LITTLE>(buf, len); + if (tail) { + return scalar::utf16::validate<endianness::LITTLE>(tail, + len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = avx2_validate_utf16<endianness::BIG>(buf, len); + if (tail) { + return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. 
protect the implementation from + // handling nullptr + return true; + } + const char32_t *tail = avx2_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } + result res = avx2_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair<const char *, char *> ret = + avx2_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair<const char *, char16_t *> ret = + avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert<endianness::LITTLE>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair<const char *, char16_t *> ret = + avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert<endianness::BIG>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair<const char *, char32_t *> ret = + avx2_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} 
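+
+// Editorial usage sketch, not part of the upstream file: these transcoders
+// return 0 on invalid input, so a typical caller first sizes the output with
+// the matching *_length_from_* helper from simdutf's public API and then
+// checks the return value:
+//
+//   std::vector<char> out(simdutf::latin1_length_from_utf8(src, len));
+//   size_t n = simdutf::convert_utf8_to_latin1(src, len, out.data());
+//   if (n == 0) { /* invalid UTF-8 or code points beyond Latin1 */ }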
+ +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *input, size_t size, char *latin1_output) const noexcept { + return utf8_to_latin1::convert_valid(input, size, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert<endianness::LITTLE>(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert<endianness::BIG>(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors<endianness::LITTLE>(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair<const char16_t *, char *> ret = + haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, + latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert<endianness::LITTLE>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t 
implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair<const char16_t *, char *> ret = + haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len, + latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert<endianness::BIG>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair<result, char *> ret = + avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair<result, char *> ret = + avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16le_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair<const char16_t *, char *> ret = + haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, + utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert<endianness::LITTLE>( + ret.first, len - (ret.first - buf), ret.second); + if 
(scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair<const char16_t *, char *> ret = + haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, + utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert<endianness::BIG>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char *> ret = + haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char *> ret = + haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair<const char32_t *, char *> ret = + avx2_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if 
(ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair<const char32_t *, char *> ret = + avx2_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char *> ret = + avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return convert_utf32_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char *> ret = + haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair<const char16_t *, char32_t *> ret = + haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, + utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert<endianness::LITTLE>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair<const char16_t *, char32_t *> ret 
= + haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, + utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert<endianness::BIG>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char32_t *> ret = + haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>( + buf, len, utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 32-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char32_t *> ret = + haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>( + buf, len, utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 32-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair<const char32_t *, char16_t *> ret = + avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert<endianness::LITTLE>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { +
std::pair<const char32_t *, char16_t *> ret = + avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert<endianness::BIG>( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char16_t *> ret = + haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 16-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair<result, char16_t *> ret = + haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 16-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points<endianness::LITTLE>(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input,
size_t length) const noexcept { + return utf16::count_code_points<endianness::BIG>(input, length); +} + +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16<endianness::BIG>(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16<endianness::BIG>(input, length); +} + +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t len) const noexcept { + const uint8_t *data = reinterpret_cast<const uint8_t *>(input); + size_t answer = len / sizeof(__m256i) * sizeof(__m256i); + size_t i = 0; + if (answer >= 2048) { // long strings optimization + __m256i four_64bits = _mm256_setzero_si256(); + while (i + sizeof(__m256i) <= len) { + __m256i runner = _mm256_setzero_si256(); + // We can do up to 255 loops without overflow. 
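+      // (Rationale for the 255 bound: each 32-byte chunk adds at most 1 to
+      // every byte lane of `runner`, because _mm256_cmpgt_epi8 yields 0xFF
+      // exactly for the non-ASCII bytes and the subtraction turns that into
+      // +1, so 255 chunks keep every uint8_t lane at or below 255.)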
+ size_t iterations = (len - i) / sizeof(__m256i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); + for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) { + __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); + __m256i input2 = + _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); + __m256i input3 = _mm256_loadu_si256( + (const __m256i *)(data + i + 2 * sizeof(__m256i))); + __m256i input4 = _mm256_loadu_si256( + (const __m256i *)(data + i + 3 * sizeof(__m256i))); + __m256i input12 = + _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); + __m256i input23 = + _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); + __m256i input1234 = _mm256_add_epi8(input12, input23); + runner = _mm256_sub_epi8(runner, input1234); + } + for (; i <= max_i; i += sizeof(__m256i)) { + __m256i input_256_chunk = + _mm256_loadu_si256((const __m256i *)(data + i)); + runner = _mm256_sub_epi8( + runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); + } + four_64bits = _mm256_add_epi64( + four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); + } + answer += _mm256_extract_epi64(four_64bits, 0) + + _mm256_extract_epi64(four_64bits, 1) + + _mm256_extract_epi64(four_64bits, 2) + + _mm256_extract_epi64(four_64bits, 3); + } else if (answer > 0) { + for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) { + __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i)); + uint32_t non_ascii = _mm256_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + } + return answer + scalar::latin1::utf8_length_from_latin1( + reinterpret_cast<const char *>(data + i), len - i); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); + const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for (; pos + 8 <= length; pos += 8) { + __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); + const __m256i ascii_bytes_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); + const __m256i one_two_bytes_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); + const __m256i two_bytes_bytemask = + _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); + const __m256i one_two_three_bytes_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const __m256i three_bytes_bytemask = + _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); + const uint32_t ascii_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask)); + const uint32_t two_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask)); + const uint32_t three_bytes_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask)); + + size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; + size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; + size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; + count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; + } + return count + + 
scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for (; pos + 8 <= length; pos += 8) { + __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); + const __m256i surrogate_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t surrogate_bitmask = + static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask)); + size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4; + count += 8 + surrogate_count; + } + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64<true>(output, input, length, options, + last_chunk_options) + : compress_decode_base64<false>(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64<true>(output, input, length, options, + last_chunk_options) + : compress_decode_base64<false>(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64<true>(output, input, length, options, + last_chunk_options) + : compress_decode_base64<false>(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? 
compress_decode_base64<true>(output, input, length, options, + last_chunk_options) + : compress_decode_base64<false>(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64<true>(output, input, length, options); + } else { + return encode_base64<false>(output, input, length, options); + } +} +} // namespace SIMDUTF_IMPLEMENTATION +} // namespace simdutf + +#include "simdutf/haswell/end.h" |
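Usage note (editorial, not part of the commit): callers are expected to go
through simdutf's public API rather than these haswell kernels directly. A
minimal sketch of the base64 entry points wired up at the end of this file,
assuming the documented simdutf interface:

    #include "simdutf.h"
    #include <string>

    std::string to_base64(const char *data, size_t len) {
      // base64_length_from_binary() gives the exact encoded size (padded).
      std::string out(simdutf::base64_length_from_binary(len), '\0');
      size_t n = simdutf::binary_to_base64(data, len, out.data());
      out.resize(n);
      return out;
    }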