path: root/contrib/simdutf/src/haswell
author    Vsevolod Stakhov <vsevolod@rspamd.com>  2024-11-29 11:31:35 +0000
committer Vsevolod Stakhov <vsevolod@rspamd.com>  2024-11-29 11:31:35 +0000
commit    b39a9f52ed3f33082f13f51678d053ee80a2e1f4 (patch)
tree      2144a18d85681df09f83e255f2e5c6d04e61e878 /contrib/simdutf/src/haswell
parent    6c0223b32b8fcb6621fa64197214abb400a09f52 (diff)
[Rework] Replace fastutf with simdutf
Simdutf is faster and has much better architecture support (especially on non-x86 platforms). Hence, it is a better choice than the unmaintained fastutf8 code.
Diffstat (limited to 'contrib/simdutf/src/haswell')
-rw-r--r--  contrib/simdutf/src/haswell/avx2_base64.cpp                  |  577
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp |   37
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp |   20
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp  |   83
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp |   85
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp  |  210
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp   |  602
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp |   93
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp  |  174
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp   |  569
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp  |   60
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp   |  195
-rw-r--r--  contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp   |  135
-rw-r--r--  contrib/simdutf/src/haswell/avx2_validate_utf16.cpp          |  206
-rw-r--r--  contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp        |   70
-rw-r--r--  contrib/simdutf/src/haswell/implementation.cpp               | 1145
16 files changed, 4261 insertions, 0 deletions
diff --git a/contrib/simdutf/src/haswell/avx2_base64.cpp b/contrib/simdutf/src/haswell/avx2_base64.cpp
new file mode 100644
index 000000000..87302d181
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_base64.cpp
@@ -0,0 +1,577 @@
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+template <bool base64_url>
+simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
+ // credit: Wojciech Muła
+ __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
+ const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
+ result =
+ _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
+ __m256i shift_LUT;
+ if (base64_url) {
+ shift_LUT = _mm256_setr_epi8(
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
+
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+ } else {
+ shift_LUT = _mm256_setr_epi8(
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+ }
+
+ result = _mm256_shuffle_epi8(shift_LUT, result);
+ return _mm256_add_epi8(result, input);
+}
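
For reference, the shift_LUT lookup above adds, across all 32 lanes at once, the same range-dependent offsets a scalar base64 encoder would apply to each 6-bit index. A minimal scalar sketch of that mapping (the helper name is ours, not part of the patch):

    static char encode_sextet(unsigned v, bool base64_url) {
      if (v < 26)  return char('A' + v);           // offset 'A'
      if (v < 52)  return char('a' + v - 26);      // offset 'a' - 26
      if (v < 62)  return char('0' + v - 52);      // offset '0' - 52
      if (v == 62) return base64_url ? '-' : '+';  // offset ('-'|'+') - 62
      return base64_url ? '_' : '/';               // offset ('_'|'/') - 63
    }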
+
+template <bool isbase64url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+ base64_options options) {
+ // credit: Wojciech Muła
+ const uint8_t *input = (const uint8_t *)src;
+
+ uint8_t *out = (uint8_t *)dst;
+ const __m256i shuf =
+ _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
+
+ 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
+ size_t i = 0;
+ for (; i + 100 <= srclen; i += 96) {
+ const __m128i lo0 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+ const __m128i hi0 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+ const __m128i lo1 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+ const __m128i hi1 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
+ const __m128i lo2 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
+ const __m128i hi2 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
+ const __m128i lo3 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
+ const __m128i hi3 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
+
+ __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+ __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+ __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+ __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
+
+ const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
+
+ const __m256i t1_0 =
+ _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+ const __m256i t1_1 =
+ _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+ const __m256i t1_2 =
+ _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+ const __m256i t1_3 =
+ _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
+
+ const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
+
+ const __m256i t3_0 =
+ _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+ const __m256i t3_1 =
+ _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+ const __m256i t3_2 =
+ _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+ const __m256i t3_3 =
+ _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
+
+ const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
+ const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+ const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+ const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input0));
+ out += 32;
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input1));
+ out += 32;
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input2));
+ out += 32;
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input3));
+ out += 32;
+ }
+ for (; i + 28 <= srclen; i += 24) {
+ // lo = [xxxx|DDDC|CCBB|BAAA]
+ // hi = [xxxx|HHHG|GGFF|FEEE]
+ const __m128i lo =
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+ const __m128i hi =
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
+
+ // bytes from groups A, B and C are needed in separate 32-bit lanes
+  // in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
+ __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
+
+ // this part is well commented in encode.sse.cpp
+
+ const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+ const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+ const __m256i indices = _mm256_or_si256(t1, t3);
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(indices));
+ out += 32;
+ }
+ return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+ srclen - i, options);
+}
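
The and/mulhi/mullo sequence above splits each 24-bit input group into four 6-bit indices inside a 32-bit lane. A scalar sketch of the same split (names are ours, shown only to document the bit layout):

    #include <cstdint>

    static void split_triple(const unsigned char in[3], unsigned char out[4]) {
      const uint32_t triple =
          (uint32_t(in[0]) << 16) | (uint32_t(in[1]) << 8) | in[2];
      out[0] = (triple >> 18) & 0x3F;
      out[1] = (triple >> 12) & 0x3F;
      out[2] = (triple >> 6) & 0x3F;
      out[3] = triple & 0x3F;
    }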
+
+static inline void compress(__m128i data, uint16_t mask, char *output) {
+ if (mask == 0) {
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+ return;
+ }
+ // this particular implementation was inspired by work done by @animetosho
+ // we do it in two steps, first 8 bytes and then second 8 bytes
+ uint8_t mask1 = uint8_t(mask); // least significant 8 bits
+ uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+ // next line just loads the 64-bit values thintable_epi8[mask1] and
+ // thintable_epi8[mask2] into a 128-bit register, using only
+ // two instructions on most compilers.
+
+ __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
+ tables::base64::thintable_epi8[mask1]);
+ // we increment by 0x08 the second half of the mask
+ shufmask =
+ _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+ // this is the version "nearly pruned"
+ __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+ // we still need to put the two halves together.
+ // we compute the popcount of the first half:
+ int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+  // then load the corresponding mask, which writes only the first
+  // pop1 bytes from the first 8 bytes and then fills in with the
+  // bytes from the second 8 bytes plus some padding at the end.
+ __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
+ tables::base64::pshufb_combine_table + pop1 * 8));
+ __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
+}
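
In scalar terms, compress() keeps byte i exactly when bit i of the mask is clear and writes the survivors contiguously. A sketch under that reading (ours):

    #include <cstddef>
    #include <cstdint>

    static size_t compress_scalar(const char *data, uint16_t mask, char *output) {
      size_t kept = 0;
      for (size_t i = 0; i < 16; i++) {
        if (((mask >> i) & 1) == 0) {
          output[kept++] = data[i]; // keep bytes whose mask bit is clear
        }
      }
      return kept; // equals popcount(~mask & 0xFFFF), as used by the caller
    }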
+
+static inline void compress(__m256i data, uint32_t mask, char *output) {
+ if (mask == 0) {
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
+ return;
+ }
+ compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
+ compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
+ output + _mm_popcnt_u32(~mask & 0xFFFF));
+}
+
+struct block64 {
+ __m256i chunks[2];
+};
+
+template <bool base64_url>
+static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
+ const __m256i ascii_space_tbl =
+ _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
+ 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
+ // credit: aqrit
+ __m256i delta_asso;
+ if (base64_url) {
+ delta_asso =
+ _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+ } else {
+ delta_asso = _mm256_setr_epi8(
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+ }
+
+ __m256i delta_values;
+ if (base64_url) {
+ delta_values = _mm256_setr_epi8(
+ 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
+ uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
+ uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+ uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
+ uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+ } else {
+ delta_values = _mm256_setr_epi8(
+ int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+ int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+ int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+ int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+ int8_t(0xB9), int8_t(0xB9));
+ }
+ __m256i check_asso;
+
+ if (base64_url) {
+ check_asso =
+ _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
+ 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
+ } else {
+
+ check_asso = _mm256_setr_epi8(
+ 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+ 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+ }
+ __m256i check_values;
+ if (base64_url) {
+ check_values = _mm256_setr_epi8(
+ uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+ uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
+ uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
+ 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+ uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
+ uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
+ uint8_t(0x80), 0x0, uint8_t(0x80));
+ } else {
+ check_values = _mm256_setr_epi8(
+ int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+ int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+ int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+ int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+ int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+ int8_t(0x91), int8_t(0x80));
+ }
+ const __m256i shifted = _mm256_srli_epi32(*src, 3);
+ const __m256i delta_hash =
+ _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
+ const __m256i check_hash =
+ _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
+ const __m256i out =
+ _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
+ const __m256i chk =
+ _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
+ const int mask = _mm256_movemask_epi8(chk);
+ if (mask) {
+ __m256i ascii_space =
+ _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
+ *error = (mask ^ _mm256_movemask_epi8(ascii_space));
+ }
+ *src = out;
+ return (uint32_t)mask;
+}
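
A scalar model of what to_base64_mask computes per byte may help: each base64 character is translated to its 6-bit value in place, every non-base64 byte sets a mask bit, and an error bit is set unless that byte is ASCII whitespace (which the caller merely skips). This sketch is ours and covers only the standard alphabet, not the URL variant:

    #include <cstdint>
    #include <cstring>

    static uint32_t to_base64_mask_scalar(uint8_t *src, uint32_t *error) {
      static const char alphabet[] =
          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
      uint32_t mask = 0, err = 0;
      for (int i = 0; i < 32; i++) {
        const char *p = (const char *)memchr(alphabet, src[i], 64);
        if (p == nullptr) {
          mask |= uint32_t(1) << i; // not a base64 character
          const bool space = src[i] == ' ' || src[i] == '\t' ||
                             src[i] == '\n' || src[i] == '\f' || src[i] == '\r';
          if (!space) err |= uint32_t(1) << i; // and not skippable either
        } else {
          src[i] = uint8_t(p - alphabet); // translated in place
        }
      }
      *error = err;
      return mask;
    }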
+
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) {
+ uint32_t err0 = 0;
+ uint32_t err1 = 0;
+ uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], &err0);
+ uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], &err1);
+ *error = err0 | ((uint64_t)err1 << 32);
+ return m0 | (m1 << 32);
+}
+
+static inline void copy_block(block64 *b, char *output) {
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]);
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]);
+}
+
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+ uint64_t nmask = ~mask;
+ compress(b->chunks[0], uint32_t(mask), output);
+ compress(b->chunks[1], uint32_t(mask >> 32),
+ output + _mm_popcnt_u64(nmask & 0xFFFFFFFF));
+ return _mm_popcnt_u64(nmask);
+}
+
+// The caller of this function is responsible for ensuring that there are
+// 64 bytes available to read at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char *src) {
+ b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ b->chunks[1] =
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+}
+
+// The caller of this function is responsible for ensuring that there are
+// 128 bytes available to read at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char16_t *src) {
+ __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
+ __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+ __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
+ __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
+ __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
+ __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
+ __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
+ b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
+ b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
+}
+
+static inline void base64_decode(char *out, __m256i str) {
+ // credit: aqrit
+ const __m256i pack_shuffle =
+ _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+ const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
+ const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
+ const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
+
+ // Store the output:
+ _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
+ _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
+}
+// decode 64 bytes and output 48 bytes
+static inline void base64_decode_block(char *out, const char *src) {
+ base64_decode(out,
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
+ base64_decode(out + 24, _mm256_loadu_si256(
+ reinterpret_cast<const __m256i *>(src + 32)));
+}
+static inline void base64_decode_block_safe(char *out, const char *src) {
+ base64_decode(out,
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
+ char buffer[32]; // We enforce safety with a buffer.
+ base64_decode(
+ buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
+ std::memcpy(out + 24, buffer, 24);
+}
+static inline void base64_decode_block(char *out, block64 *b) {
+ base64_decode(out, b->chunks[0]);
+ base64_decode(out + 24, b->chunks[1]);
+}
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+ base64_decode(out, b->chunks[0]);
+ char buffer[32]; // We enforce safety with a buffer.
+ base64_decode(buffer, b->chunks[1]);
+ std::memcpy(out + 24, buffer, 24);
+}
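
The two multiply-add constants above implement the usual sextet packing: maddubs with 0x01400140 folds adjacent sextets into (a << 6) | b, and madd with 0x00011000 folds those 12-bit halves into 24-bit triples. The equivalent scalar step for one quad (sketch, ours):

    #include <cstdint>

    static void decode_quad(const uint8_t v[4], char out[3]) {
      const uint32_t triple = (uint32_t(v[0]) << 18) | (uint32_t(v[1]) << 12) |
                              (uint32_t(v[2]) << 6) | uint32_t(v[3]);
      out[0] = char(triple >> 16); // pack_shuffle then gathers these three
      out[1] = char(triple >> 8);  // bytes from each 32-bit lane and drops
      out[2] = char(triple);       // the unused fourth byte
    }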
+
+template <bool base64_url, typename chartype>
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+ : tables::base64::to_base64_value;
+ size_t equallocation =
+ srclen; // location of the first padding character if any
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ size_t equalsigns = 0;
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 1;
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 2;
+ }
+ }
+ if (srclen == 0) {
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, 0, 0};
+ }
+ char *end_of_safe_64byte_zone =
+ (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
+
+ const chartype *const srcinit = src;
+ const char *const dstinit = dst;
+ const chartype *const srcend = src + srclen;
+
+ constexpr size_t block_size = 6;
+ static_assert(block_size >= 2, "block_size must be at least two");
+ char buffer[block_size * 64];
+ char *bufferptr = buffer;
+ if (srclen >= 64) {
+ const chartype *const srcend64 = src + srclen - 64;
+ while (src <= srcend64) {
+ block64 b;
+ load_block(&b, src);
+ src += 64;
+ uint64_t error = 0;
+ uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+ if (error) {
+ src -= 64;
+ size_t error_offset = _tzcnt_u64(error);
+ return {error_code::INVALID_BASE64_CHARACTER,
+ size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
+ }
+ if (badcharmask != 0) {
+      // optimization opportunity: check for simple masks like those made of
+      // continuous 1s followed by continuous 0s, and masks containing a
+      // single bad character.
+ bufferptr += compress_block(&b, badcharmask, bufferptr);
+ } else if (bufferptr != buffer) {
+ copy_block(&b, bufferptr);
+ bufferptr += 64;
+ } else {
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, &b);
+ } else {
+ base64_decode_block(dst, &b);
+ }
+ dst += 48;
+ }
+ if (bufferptr >= (block_size - 1) * 64 + buffer) {
+ for (size_t i = 0; i < (block_size - 2); i++) {
+ base64_decode_block(dst, buffer + i * 64);
+ dst += 48;
+ }
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+ } else {
+ base64_decode_block(dst, buffer + (block_size - 2) * 64);
+ }
+ dst += 48;
+ std::memcpy(buffer, buffer + (block_size - 1) * 64,
+ 64); // 64 might be too much
+ bufferptr -= (block_size - 1) * 64;
+ }
+ }
+ }
+
+ char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time; otherwise, we should just decode directly.
+ int last_block = (int)((bufferptr - buffer_start) % 64);
+ if (last_block != 0 && srcend - src + last_block >= 64) {
+
+ while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+ uint8_t val = to_base64[uint8_t(*src)];
+ *bufferptr = char(val);
+ if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ bufferptr += (val <= 63);
+ src++;
+ }
+ }
+
+ for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer_start);
+ } else {
+ base64_decode_block(dst, buffer_start);
+ }
+ dst += 48;
+ }
+ if ((bufferptr - buffer_start) % 64 != 0) {
+ while (buffer_start + 4 < bufferptr) {
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 4);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ if (buffer_start + 4 <= bufferptr) {
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 3);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+ // backtrack
+ int leftover = int(bufferptr - buffer_start);
+ while (leftover > 0) {
+ while (to_base64[uint8_t(*(src - 1))] == 64) {
+ src--;
+ }
+ src--;
+ leftover--;
+ }
+ }
+ if (src < srcend + equalsigns) {
+ full_result r = scalar::base64::base64_tail_decode(
+ dst, src, srcend - src, equalsigns, options, last_chunk_options);
+ r.input_count += size_t(src - srcinit);
+ if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+ r.error == error_code::BASE64_EXTRA_BITS) {
+ return r;
+ } else {
+ r.output_count += size_t(dst - dstinit);
+ }
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ r.error = error_code::INVALID_BASE64_CHARACTER;
+ r.input_count = equallocation;
+ }
+ }
+ return r;
+ }
+ if (equalsigns > 0) {
+ if ((size_t(dst - dstinit) % 3 == 0) ||
+ ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+ }
+ }
+ return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
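
A hypothetical call site for the decoder above, assuming the simdutf option enums (base64_default, loose) are in scope as elsewhere in the library:

    char out[64];
    full_result r = compress_decode_base64<false>(out, "aGVsbG8=", 8,
                                                  base64_default, loose);
    if (r.error == error_code::SUCCESS) {
      // r.output_count == 5; out now holds "hello"
    }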
diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp
new file mode 100644
index 000000000..6484dcedf
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp
@@ -0,0 +1,37 @@
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+ char16_t *utf16_output) {
+  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
+
+ size_t i = 0;
+ for (; i < rounded_len; i += 16) {
+ // Load 16 bytes from the address (input + i) into a xmm register
+ __m128i xmm0 =
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(latin1_input + i));
+
+ // Zero extend each byte in xmm0 to word and put it in another xmm register
+ __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
+
+ // Shift xmm0 to the right by 8 bytes
+ xmm0 = _mm_srli_si128(xmm0, 8);
+
+ // Zero extend each byte in the shifted xmm0 to word in xmm0
+ xmm0 = _mm_cvtepu8_epi16(xmm0);
+
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ xmm0 = _mm_shuffle_epi8(xmm0, swap);
+ xmm1 = _mm_shuffle_epi8(xmm1, swap);
+ }
+
+ // Store the contents of xmm1 into the address pointed by (output + i)
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1);
+
+ // Store the contents of xmm0 into the address pointed by (output + i + 8)
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0);
+ }
+
+ return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
+}
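
In scalar terms, the loop above zero-extends each Latin1 byte to one UTF-16 code unit and byte-swaps it when big-endian output is requested. A minimal sketch (ours):

    #include <cstddef>
    #include <cstdint>

    static void latin1_to_utf16_scalar(const char *src, size_t n, char16_t *dst,
                                       bool to_big_endian) {
      for (size_t i = 0; i < n; i++) {
        const char16_t w = char16_t(uint8_t(src[i])); // zero-extend byte to word
        dst[i] = to_big_endian ? char16_t((w << 8) | (w >> 8)) : w;
      }
    }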
diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp
new file mode 100644
index 000000000..f89550b95
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp
@@ -0,0 +1,20 @@
+std::pair<const char *, char32_t *>
+avx2_convert_latin1_to_utf32(const char *buf, size_t len,
+ char32_t *utf32_output) {
+ size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8
+
+ for (size_t i = 0; i < rounded_len; i += 8) {
+ // Load 8 Latin1 characters into a 64-bit register
+ __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]);
+
+ // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using
+ // vpmovzxbd
+ __m256i out = _mm256_cvtepu8_epi32(in);
+
+ // Store the results back to memory
+ _mm256_storeu_si256((__m256i *)&utf32_output[i], out);
+ }
+
+ // return pointers pointing to where we left off
+ return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp
new file mode 100644
index 000000000..a637e1bb0
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp
@@ -0,0 +1,83 @@
+std::pair<const char *, char *>
+avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+ char *utf8_output) {
+ const char *end = latin1_input + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ const size_t safety_margin = 12;
+
+ while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
+ __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
+    // a single Latin1 byte can yield 1 or 2 UTF-8 bytes
+ const __m128i v_80 = _mm_set1_epi8((char)0x80);
+ if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
+ // 1. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, in8);
+ // 2. adjust pointers
+ latin1_input += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // We proceed only with the first 16 bytes.
+ const __m256i in = _mm256_cvtepu8_epi16((in8));
+
+ // 1. prepare 2-byte values
+    // input 16-bit word : [0000|0000|aabb|bbbb] x 16
+    // expected output : [1100|00aa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [0000|00aa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [0000|00aa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+    // t3 = [0000|00aa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [1100|00aa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
+ [0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ latin1_input += 16;
+ continue;
+
+ } // while
+ return std::make_pair(latin1_input, utf8_output);
+}
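
The 2-byte path above is the vector form of the standard Latin1-to-UTF-8 rule: bytes below 0x80 pass through unchanged, the rest become a two-byte sequence. Scalar sketch (ours):

    #include <cstddef>
    #include <cstdint>

    static size_t latin1_byte_to_utf8(uint8_t c, char *out) {
      if (c < 0x80) { // ASCII passes through unchanged
        out[0] = char(c);
        return 1;
      }
      out[0] = char(0xC0 | (c >> 6));   // [1100|00aa]
      out[1] = char(0x80 | (c & 0x3F)); // [10bb|bbbb]
      return 2;
    }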
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp
new file mode 100644
index 000000000..8c46a23a8
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp
@@ -0,0 +1,85 @@
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *end = buf + len;
+ while (end - buf >= 16) {
+ // Load 16 UTF-16 characters into 256-bit AVX2 register
+ __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+
+ if (!match_system(big_endian)) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+ __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
+ if (_mm256_testz_si256(in, high_byte_mask)) {
+ // Pack 16-bit characters into 8-bit and store in latin1_output
+ __m128i lo = _mm256_extractf128_si256(in, 0);
+ __m128i hi = _mm256_extractf128_si256(in, 1);
+ __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
+ __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+ latin1_packed_lo);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
+ latin1_packed_hi);
+ // Adjust pointers for next iteration
+ buf += 16;
+ latin1_output += 16;
+ } else {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+ }
+ } // while
+ return std::make_pair(buf, latin1_output);
+}
+
+template <endianness big_endian>
+std::pair<result, char *>
+avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+ while (end - buf >= 16) {
+ __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+
+ if (!match_system(big_endian)) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+ __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
+ if (_mm256_testz_si256(in, high_byte_mask)) {
+ __m128i lo = _mm256_extractf128_si256(in, 0);
+ __m128i hi = _mm256_extractf128_si256(in, 1);
+ __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
+ __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+ latin1_packed_lo);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
+ latin1_packed_hi);
+ buf += 16;
+ latin1_output += 16;
+ } else {
+ // Fallback to scalar code for handling errors
+ for (int k = 0; k < 16; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if (word <= 0xff) {
+ *latin1_output++ = char(word);
+ } else {
+ return std::make_pair(
+ result{error_code::TOO_LARGE, (size_t)(buf - start + k)},
+ latin1_output);
+ }
+ }
+ buf += 16;
+ }
+ } // while
+ return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)},
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp
new file mode 100644
index 000000000..d396893ca
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp
@@ -0,0 +1,210 @@
+/*
+  The vectorized algorithm works on a single SSE register, i.e., it
+  loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+  We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+  We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
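
The scalar fallbacks below decode a surrogate pair with the standard arithmetic; for reference (sketch, ours):

    #include <cstdint>

    static char32_t decode_surrogate_pair(uint16_t hi, uint16_t lo) {
      // hi in 0xD800..0xDBFF, lo in 0xDC00..0xDFFF, validated by the caller
      // via the (diff | diff2) > 0x3FF test below
      return char32_t((uint32_t(hi - 0xD800) << 10) + (lo - 0xDC00) + 0x10000);
      // e.g. hi = 0xD83D, lo = 0xDE00 gives U+1F600
    }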
+
+/*
+  Returns a pair: the first unprocessed byte from buf and utf32_output.
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *end = buf + len;
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+ while (end - buf >= 16) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation when there is a surrogate
+    //    word at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if the last word is a surrogate
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+ // units
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+ _mm256_storeu_si256(
+ reinterpret_cast<__m256i *>(utf32_output + 8),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ // No surrogate pair
+ *utf32_output++ = char32_t(word);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr, utf32_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(buf, utf32_output);
+}
+
+/*
+  Returns a pair: a result struct and utf32_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed byte in buf
+  (even if finished). A scalar routine should carry on the conversion of the
+  tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+ while (end - buf >= 16) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation when there is a surrogate
+    //    word at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if the last word is a surrogate
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+ // units
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+ _mm256_storeu_si256(
+ reinterpret_cast<__m256i *>(utf32_output + 8),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ // No surrogate pair
+ *utf32_output++ = char32_t(word);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1),
+ utf32_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp
new file mode 100644
index 000000000..2a26a0584
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp
@@ -0,0 +1,602 @@
+/*
+  The vectorized algorithm works on a single SSE register, i.e., it
+  loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+     in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+  We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+  We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
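
The row index fed into pack_1_2_3_utf8_bytes below encodes two bits per input word; four consecutive words contribute one 8-bit index. A scalar model of those two bits (ours):

    #include <cstdint>

    static unsigned utf8_case_bits(uint16_t w) {
      unsigned bits = 0;
      if ((w & 0xFF80) == 0) bits |= 1; // ASCII: one UTF-8 byte
      if ((w & 0xF800) == 0) bits |= 2; // fits in 11 bits: at most two bytes
      return bits;                      // neither bit set: three UTF-8 bytes
    }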
+
+/*
+  Returns a pair: the first unprocessed byte from buf and utf8_output.
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+ const char16_t *end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+ // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+      // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation when there is a surrogate
+    //    word at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if the last word is a surrogate
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+       single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+      Finally, from these two code units we build a proper UTF-8 sequence,
+      taking into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(buf, utf8_output);
+}
+
+/*
+ Returns a pair: a result struct and utf8_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed byte in buf
+  (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+ char *utf8_output) {
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
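+      // Editorial note (illustrative, not upstream commentary): each 16-bit
+      // unit contributes two identical movemask bits, so M0 keeps one flag
+      // bit per unit (at the even positions). Shifting by 7 and OR-ing folds
+      // the flags of units 4..7 into bits 1,3,5,7 next to those of units
+      // 0..3 at bits 0,2,4,6, and the 0x00ff00ff mask leaves one byte-sized
+      // lookup index per 128-bit half, consumed below as uint8_t(M2) and
+      // uint8_t(M2 >> 16).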
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We must also deal with the situation when a surrogate word comes
+    //    at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem that checking for surrogates_bitmask == 0xc0000000 could
+    // help; however, that case is likely too uncommon to be worth a branch.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+         single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+         Finally, from these two code units we build a proper UTF-8 sequence,
+         taking into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
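+      // Editorial worked example (illustrative): for U+00E9 (0x00e9, a
+      // two-byte case), dup_even gives t0 = 0xe9e9; masking yields
+      // t1 = 0x2969 and t2 = 0xa969, whose high byte 0xa9 is the final
+      // continuation byte of U+00E9 in UTF-8 (0xc3 0xa9). On the other side,
+      // s0 = 0x000e, s1 = 0x000c, s2 = 0x0300 and s3 = 0xc3e0, whose high
+      // byte 0xc3 is the leading byte; since U+00E9 is a one-or-two-byte
+      // unit, m0 is zero and s4 = s3. The shuffles below then pick the two
+      // useful bytes, in order, out of t2 and s4.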
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1),
+ utf8_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp
new file mode 100644
index 000000000..d6a32d5df
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp
@@ -0,0 +1,93 @@
+std::pair<const char32_t *, char *>
+avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const size_t rounded_len =
+ len & ~0x1F; // Round down to nearest multiple of 32
+
+ __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
+
+ __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+
+ for (size_t i = 0; i < rounded_len; i += 16) {
+ __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
+ __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
+
+ __m256i check_combined = _mm256_or_si256(in1, in2);
+
+ if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+ return std::make_pair(nullptr, latin1_output);
+ }
+
+    // Turn UTF-32 code units into Latin-1 bytes
+ __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
+ __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
+
+ // move Latin1 bytes to their correct spot
+ __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
+ __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
+ __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
+ __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
+
+ __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
+ _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result));
+
+ latin1_output += 16;
+ buf += 16;
+ }
+
+ return std::make_pair(buf, latin1_output);
+}
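+// Editorial usage sketch (hypothetical caller, not upstream code): on
+// invalid input the fast routine above returns {nullptr, latin1_output};
+// otherwise it returns the first unprocessed char32_t so a scalar routine
+// can finish the tail, e.g.:
+//
+//   auto r = avx2_convert_utf32_to_latin1(src, len, dst);
+//   if (r.first == nullptr) { /* some code point was above 0xFF */ }
+//   else { /* convert [r.first, src + len) with scalar code */ }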
+std::pair<result, char *>
+avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const size_t rounded_len =
+ len & ~0x1F; // Round down to nearest multiple of 32
+
+ __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
+ __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+
+ const char32_t *start = buf;
+
+ for (size_t i = 0; i < rounded_len; i += 16) {
+ __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
+ __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
+
+ __m256i check_combined = _mm256_or_si256(in1, in2);
+
+ if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+      // Fall back to scalar code to locate and report the error
+ for (int k = 0; k < 8; k++) {
+ char32_t codepoint = buf[k];
+ if (codepoint <= 0xFF) {
+ *latin1_output++ = static_cast<char>(codepoint);
+ } else {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+ latin1_output);
+ }
+ }
+ buf += 8;
+ } else {
+ __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
+ __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
+
+ __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
+ __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
+ __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
+ __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
+
+ __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
+ _mm_storeu_si128((__m128i *)latin1_output,
+ _mm256_castsi256_si128(result));
+
+ latin1_output += 16;
+ buf += 16;
+ }
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp
new file mode 100644
index 000000000..ffd6f1e47
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp
@@ -0,0 +1,174 @@
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *end = buf + len;
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+ while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+ // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+ const __m256i saturation_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+ if (saturation_bitmask == 0xffffffff) {
+ const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+ forbidden_bytemask = _mm256_or_si256(
+ forbidden_bytemask,
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+
+ __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+ _mm256_extractf128_si256(in, 1));
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 7;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
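+          // Editorial example (illustrative): for U+1F600, word - 0x10000 =
+          // 0xF600, so high = 0xD800 + 0x3D = 0xD83D and low = 0xDC00 +
+          // 0x200 = 0xDE00, the expected surrogate pair D83D DE00.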
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ // check for invalid input
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+
+ return std::make_pair(buf, utf16_output);
+}
+
+template <endianness big_endian>
+std::pair<result, char16_t *>
+avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+ // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+ const __m256i saturation_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+ if (saturation_bitmask == 0xffffffff) {
+ const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+ const __m256i forbidden_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+ 0x0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf16_output);
+ }
+
+ __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+ _mm256_extractf128_si256(in, 1));
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 7;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp
new file mode 100644
index 000000000..e1fe5c222
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp
@@ -0,0 +1,569 @@
+std::pair<const char32_t *, char *>
+avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
+ const char32_t *end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+ const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+ const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+ const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+ __m256i running_max = _mm256_setzero_si256();
+ __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+ running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+ _mm256_and_si256(nextin, v_7fffffff));
+ in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
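+    // Editorial note (illustrative): _mm256_packus_epi32 packs within each
+    // 128-bit lane, leaving the 16-bit units in order 0..3, 8..11, 4..7,
+    // 12..15; the permutation 0b11011000 (quadword order 0,2,1,3) restores
+    // the natural order.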
+
+ // Try to apply UTF-16 => UTF-8 routine on 256 bits
+ // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+ if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // Must check for overflow in packing
+ const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffffffff) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+ forbidden_bytemask = _mm256_or_si256(
+ forbidden_bytemask,
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+         single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+         Finally, from these two code units we build a proper UTF-8 sequence,
+         taking into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+      // large, non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4-byte
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // check for invalid input
+ const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+ _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ return std::make_pair(buf, utf8_output);
+}
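+// Editorial design note: the routine above defers validation (running_max
+// accumulates out-of-range values, forbidden_bytemask accumulates surrogate
+// sightings) and reports failure only through a nullptr, which keeps the hot
+// loop branch-light; the *_with_errors variant below instead validates inside
+// the loop so that it can report the exact error position in the result
+// struct.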
+
+std::pair<result, char *>
+avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_output) {
+ const char32_t *end = buf + len;
+ const char32_t *start = buf;
+
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+ const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+ const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+ const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+ const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+ // Check for too large input
+ const __m256i max_input =
+ _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(
+ _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ utf8_output);
+ }
+
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+ _mm256_and_si256(nextin, v_7fffffff));
+ in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+ // Try to apply UTF-16 => UTF-8 routine on 256 bits
+ // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+ if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // Must check for overflow in packing
+ const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffffffff) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+ // Check for illegal surrogate code units
+ const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+ const __m256i forbidden_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+ 0x0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf8_output);
+ }
+
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+         single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+         Finally, from these two code units we build a proper UTF-8 sequence,
+         taking into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+      // large, non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf8_output);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4-byte
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp
new file mode 100644
index 000000000..8e78ab551
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp
@@ -0,0 +1,60 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_latin1(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char *&latin1_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask &
+ 0xfff; // we are only processing 12 bytes in case it is not all ASCII
+
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // We process the data in chunks of 12 bytes.
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
+ latin1_output += 12; // We wrote 12 characters.
+    return 12;           // We consumed 12 bytes.
+ }
+  /// We do not have a fast path available, so we fall back.
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ // this indicates an invalid input:
+ if (idx >= 64) {
+ return consumed;
+ }
+  // Here we should have (idx < 64); if not, there is a bug in the validation
+  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
+  // input code units. The maximal length in bytes of six code units spanning
+  // between 1 and 2 bytes each is 12 bytes. On processors where pdep/pext is
+  // fast, we might be able to use a small lookup table.
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
+ // writing 8 bytes even though we only care about the first 6 bytes.
+  // performance note: it might be faster to use _mm_storeu_si128; we should
+  // investigate.
+ _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
+ latin1_output += 6; // We wrote 6 bytes.
+ return consumed;
+}
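+// Editorial usage sketch (hypothetical driver, not upstream code): a caller
+// computes a bitmask marking where code points end within a window of input,
+// then lets this routine convert roughly 12 bytes per call and advance by the
+// reported amount, e.g.:
+//
+//   char out[64]; char *p = out;
+//   size_t consumed = convert_masked_utf8_to_latin1(in, eoc_mask, p);
+//   // advance the input cursor by 'consumed' and realign the mask to match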
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp
new file mode 100644
index 000000000..d99a8ed9d
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp
@@ -0,0 +1,195 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char16_t *&utf16_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // We process the data in chunks of 12 bytes.
+ __m256i ascii = _mm256_cvtepu8_epi16(in);
+ if (big_endian) {
+ const __m256i swap256 = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ ascii = _mm256_shuffle_epi8(ascii, swap256);
+ }
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
+ utf16_output += 12; // We wrote 12 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
+ // UTF-16 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
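+    // Editorial worked example (illustrative): the two-byte sequence
+    // 0xc3 0xa9 (U+00E9) becomes the 16-bit lane 0xc3a9 after the shuffle;
+    // 0xc3a9 & 0x7f = 0x29 keeps the continuation payload, 0xc3a9 & 0x1f00 =
+    // 0x0300 keeps the leading-byte payload, and 0x29 | (0x0300 >> 2) =
+    // 0x00e9, the desired code unit.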
+ if (big_endian)
+ composed = _mm_shuffle_epi8(composed, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 8; // We wrote 16 bytes, 8 code points.
+ return 16;
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+ // UTF-16 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ if (big_endian)
+ composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4;
+ return 12;
+ }
+
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+    // SIX (6) input code units: this is a relatively easy scenario. The max
+    // length in bytes of six code units spanning between 1 and 2 bytes each
+    // is 12 bytes. On processors where pdep/pext is fast, we might be able
+    // to use a small lookup table.
+ const __m128i sh = _mm_loadu_si128(
+ (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ if (big_endian)
+ composed = _mm_shuffle_epi8(composed, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential
+ // overflow of 4 bytes.
+ } else if (idx < 145) {
+    // FOUR (4) input code units
+ const __m128i sh = _mm_loadu_si128(
+ (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ if (big_endian)
+ composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4; // Here we overflow by 8 bytes.
+ } else if (idx < 209) {
+    // TWO (2) input code units
+ //////////////
+    // There might be garbage inputs where a leading byte masquerades as a
+    // four-byte leading byte (by being followed by 3 continuation bytes)
+    // while being smaller than 0xf0. This could trigger a buffer overflow if
+    // we only counted leading bytes of the form 0xf0 as generating surrogate
+    // pairs, without further UTF-8 validation. Thus we must be careful to
+    // ensure that only leading bytes at least as large as 0xf0 generate
+    // surrogate pairs. We do so at the cost of an extra mask.
+ /////////////
+ const __m128i sh = _mm_loadu_si128(
+ (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+ const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+ // correct for spurious high bit
+ const __m128i correct =
+ _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+ middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+ const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    // We deliberately carry the leading four bits in highbyte if they are
+    // present; we remove them later when computing hightenbits.
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    // When we need to generate a surrogate pair (leading byte >= 0xF0), the
+    // corresponding 32-bit value in 'composed' will be greater than
+    // (0xf0000000 >> 6) = 0x3c00000. This can be used later to identify the
+    // location of the surrogate pairs.
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+ _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+ const __m128i composedminus =
+ _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+ const __m128i lowtenbits =
+ _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+    // Notice the 0x3ff mask: it discards the leading bits we deliberately
+    // carried in highbyte.
+ const __m128i hightenbits =
+ _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+ const __m128i lowtenbitsadd =
+ _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+ const __m128i hightenbitsadd =
+ _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+ const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+ __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+ uint32_t basic_buffer[4];
+ uint32_t basic_buffer_swap[4];
+ if (big_endian) {
+ _mm_storeu_si128((__m128i *)basic_buffer_swap,
+ _mm_shuffle_epi8(composed, swap));
+ surrogates = _mm_shuffle_epi8(surrogates, swap);
+ }
+ _mm_storeu_si128((__m128i *)basic_buffer, composed);
+ uint32_t surrogate_buffer[4];
+ _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+ for (size_t i = 0; i < 3; i++) {
+ if (basic_buffer[i] > 0x3c00000) {
+ utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+ utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+ utf16_output += 2;
+ } else {
+ utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
+ : uint16_t(basic_buffer[i]);
+ utf16_output++;
+ }
+ }
+ } else {
+    // here we know that there is an error, but we do not handle errors
+ }
+ return consumed;
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp
new file mode 100644
index 000000000..c5cf74143
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp
@@ -0,0 +1,135 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_utf32(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char32_t *&utf32_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // We process the data in chunks of 12 bytes.
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+ _mm256_cvtepu8_epi32(in));
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8),
+ _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+ utf32_output += 12; // We wrote 12 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
+ // UTF-32 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm256_storeu_si256((__m256i *)utf32_output,
+ _mm256_cvtepu16_epi32(composed));
+    utf32_output += 8; // We wrote 32 bytes, 8 code points.
+ return 16;
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
+ // UTF-32 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
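+    // Worked example (illustrative): U+20AC is 0xE2 0x82 0xAC in UTF-8. The
+    // shuffle yields the 32-bit lane 0x00E282AC; ascii = 0x2C, middlebyte
+    // >> 2 = 0x80, highbyte >> 4 = 0x2000, and their OR is 0x20AC.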
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output += 4;
+ return 12;
+ }
+  // We do not have a fast path available, so we fall back.
+
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+    // SIX (6) input code units
+    // this is a relatively easy scenario
+    // we process SIX (6) input code units. The max length in bytes of six
+    // code units spanning between 1 and 2 bytes each is 12 bytes. On
+    // processors where pdep/pext is fast, we might be able to use a small
+    // lookup table.
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm256_storeu_si256((__m256i *)utf32_output,
+ _mm256_cvtepu16_epi32(composed));
+ utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
+ // overflow of 32 - 24 = 8 bytes.
+ } else if (idx < 145) {
+    // FOUR (4) input code units
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output += 4;
+ } else if (idx < 209) {
+    // TWO (2) input code units
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+ const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+ // correct for spurious high bit
+ const __m128i correct =
+ _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+ middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+ const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+ _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output +=
+ 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+ } else {
+ // here we know that there is an error but we do not handle errors
+ }
+ return consumed;
+}
diff --git a/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp b/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp
new file mode 100644
index 000000000..0c54062d4
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp
@@ -0,0 +1,206 @@
+/*
+ In UTF-16, code units in the range 0xD800 to 0xDFFF have a special
+ meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ is 0xD (13), then we are sure that the UTF-16 chunk in the vector
+ register is valid.
+
+ Let us analyze what we need to check when the nibble is 0xD. The
+ value of the next nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+
+ (Here "low" and "high" refer to the numeric ranges; note that Unicode
+ itself calls 0xD800 .. 0xDBFF the high/leading surrogates and
+ 0xDC00 .. 0xDFFF the low/trailing surrogates.)
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be a sole low surrogate nor a sole high surrogate
+
+ We're going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+ 0 1 2 3 4 5 6 7 <--- word index
+ [ V | L | H | L | H | V | V | L ]
+      1  0  0  0  0  1  1  0   - V = valid words
+      0  1  0  1  0  0  0  1   - L = low surrogate
+      0  0  1  0  1  0  0  0   - H = high surrogate
+
+
+      1  0  0  0  0  1  1  0   V = valid words
+ 0 1 0 1 0 0 0 0 a = L & (H >> 1)
+ 0 0 1 0 1 0 0 0 b = a << 1
+ 1 1 1 1 1 1 1 0 c = V | a | b
+ ^
+ the last bit can be zero, we just consume 7
+ code units and recheck this word in the next iteration
+*/
+
+/* Returns:
+ - pointer to the last unprocessed character (a scalar fallback should check
+ the rest);
+ - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    // byte of each word, we compress the two vectors into one which
+    // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
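+    // After the shift each 16-bit lane holds the high byte of one code unit,
+    // so packing the two vectors yields 32 bytes, one per input code unit,
+    // ready for a single 32-bit movemask.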
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+ if (surrogates_bitmask == 0x0) {
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint32_t V = ~surrogates_bitmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint32_t H = vH.to_bitmask();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint32_t L = ~H & surrogates_bitmask;
+
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by a high one.
+                        // (A low surrogate placed in the last word of the
+                        // register is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the converse holds as well; thanks to
+                  // that we have only two masks for the valid case.
+ const uint32_t c = V | a | b; // Combine all the masks into the final one.
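+      // c has a one at every position that is accounted for: a non-surrogate
+      // code unit or a member of a correctly ordered low/high pair. Any zero,
+      // except possibly in the last position, indicates invalid input.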
+
+ if (c == 0xffffffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else if (c == 0x7fffffff) {
+        // The lower 31 code units of the input register contain valid UTF-16.
+        // The 31st code unit may be either a low or a high surrogate. In the
+        // next iteration we 1) check whether the low surrogate is followed by
+        // a high one, 2) reject a sole high surrogate.
+ input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+}
+
+template <endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t *input,
+ size_t size) {
+ if (simdutf_unlikely(size == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ const char16_t *start = input;
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    // byte of each word, we compress the two vectors into one which
+    // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+ if (surrogates_bitmask == 0x0) {
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint32_t V = ~surrogates_bitmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint32_t H = vH.to_bitmask();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint32_t L = ~H & surrogates_bitmask;
+
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by a high one.
+                        // (A low surrogate placed in the last word of the
+                        // register is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the converse holds as well; thanks to
+                  // that we have only two masks for the valid case.
+ const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+ if (c == 0xffffffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else if (c == 0x7fffffff) {
+        // The lower 31 code units of the input register contain valid UTF-16.
+        // The 31st code unit may be either a low or a high surrogate. In the
+        // next iteration we 1) check whether the low surrogate is followed by
+        // a high one, 2) reject a sole high surrogate.
+ input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+ } else {
+ return result(error_code::SURROGATE, input - start);
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, input - start);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp b/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp
new file mode 100644
index 000000000..8cb1d5f3b
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp
@@ -0,0 +1,70 @@
+/* Returns:
+ - pointer to the last unprocessed character (a scalar fallback should check
+ the rest);
+ - nullptr if an error was detected.
+*/
+const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) {
+ const char32_t *end = input + size;
+
+ const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
+ const __m256i offset = _mm256_set1_epi32(0xffff2000);
+ const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
+ __m256i currentmax = _mm256_setzero_si256();
+ __m256i currentoffsetmax = _mm256_setzero_si256();
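+  // Two running unsigned maxima cover both error conditions: currentmax
+  // catches code points above 0x10FFFF directly, while adding 0xffff2000
+  // maps exactly 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF (all other valid
+  // values wrap around or stay at or below 0xFFFFF7FF), so currentoffsetmax
+  // exceeding 0xfffff7ff flags a surrogate.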
+
+ while (input + 8 < end) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)input);
+ currentmax = _mm256_max_epu32(in, currentmax);
+ currentoffsetmax =
+ _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+ input += 8;
+ }
+ __m256i is_zero =
+ _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+ return nullptr;
+ }
+
+ is_zero = _mm256_xor_si256(
+ _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+ return nullptr;
+ }
+
+ return input;
+}
+
+const result avx2_validate_utf32le_with_errors(const char32_t *input,
+ size_t size) {
+ const char32_t *start = input;
+ const char32_t *end = input + size;
+
+ const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
+ const __m256i offset = _mm256_set1_epi32(0xffff2000);
+ const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
+ __m256i currentmax = _mm256_setzero_si256();
+ __m256i currentoffsetmax = _mm256_setzero_si256();
+
+ while (input + 8 < end) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)input);
+ currentmax = _mm256_max_epu32(in, currentmax);
+ currentoffsetmax =
+ _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+
+ __m256i is_zero = _mm256_xor_si256(
+ _mm256_max_epu32(currentmax, standardmax), standardmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+ return result(error_code::TOO_LARGE, input - start);
+ }
+
+ is_zero =
+ _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax),
+ standardoffsetmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+ return result(error_code::SURROGATE, input - start);
+ }
+ input += 8;
+ }
+
+ return result(error_code::SUCCESS, input - start);
+}
diff --git a/contrib/simdutf/src/haswell/implementation.cpp b/contrib/simdutf/src/haswell/implementation.cpp
new file mode 100644
index 000000000..0225f1f95
--- /dev/null
+++ b/contrib/simdutf/src/haswell/implementation.cpp
@@ -0,0 +1,1145 @@
+#include "tables/utf8_to_utf16_tables.h"
+#include "scalar/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+#include "scalar/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "scalar/utf8_to_utf32/utf8_to_utf32.h"
+#include "tables/utf16_to_utf8_tables.h"
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+#include "simdutf/haswell/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_HASWELL_H
+ #error "haswell.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+ return input.reduce_or().is_ascii();
+}
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+ simd8<uint8_t> is_second_byte =
+ prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+ simd8<uint8_t> is_third_byte =
+ prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+ simd8<uint8_t> is_fourth_byte =
+ prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+ // Caller requires a bool (all 1's). All values resulting from the subtraction
+ // will be <= 64, so signed comparison is fine.
+ return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
+ int8_t(0);
+}
+
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte =
+      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
+  simd8<uint8_t> is_fourth_byte =
+      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
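+  // For example (illustrative): prev2 = 0xE4, a three-byte lead, gives
+  // 0xE4 - 0x60 = 0x84 with the high bit set, while a continuation byte
+  // such as 0x85 gives 0x25 with the high bit clear.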
+ return simd8<bool>(is_third_byte | is_fourth_byte);
+}
+
+#include "haswell/avx2_validate_utf16.cpp"
+#include "haswell/avx2_validate_utf32le.cpp"
+
+#include "haswell/avx2_convert_latin1_to_utf8.cpp"
+#include "haswell/avx2_convert_latin1_to_utf16.cpp"
+#include "haswell/avx2_convert_latin1_to_utf32.cpp"
+
+#include "haswell/avx2_convert_utf8_to_utf16.cpp"
+#include "haswell/avx2_convert_utf8_to_utf32.cpp"
+
+#include "haswell/avx2_convert_utf16_to_latin1.cpp"
+#include "haswell/avx2_convert_utf16_to_utf8.cpp"
+#include "haswell/avx2_convert_utf16_to_utf32.cpp"
+
+#include "haswell/avx2_convert_utf32_to_latin1.cpp"
+#include "haswell/avx2_convert_utf32_to_utf8.cpp"
+#include "haswell/avx2_convert_utf32_to_utf16.cpp"
+
+#include "haswell/avx2_convert_utf8_to_latin1.cpp"
+
+#include "haswell/avx2_base64.cpp"
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+// other functions
+#include "generic/utf8.h"
+#include "generic/utf16.h"
+
+// transcoding from UTF-8 to Latin 1
+#include "generic/utf8_to_latin1/utf8_to_latin1.h"
+#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+ size_t length) const noexcept {
+ // If there is a BOM, then we trust it.
+ auto bom_encoding = simdutf::BOM::check_bom(input, length);
+ if (bom_encoding != encoding_type::unspecified) {
+ return bom_encoding;
+ }
+ int out = 0;
+ if (validate_utf8(input, length)) {
+ out |= encoding_type::UTF8;
+ }
+ if ((length % 2) == 0) {
+ if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
+ length / 2)) {
+ out |= encoding_type::UTF16_LE;
+ }
+ }
+ if ((length % 4) == 0) {
+ if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+ out |= encoding_type::UTF32_LE;
+ }
+ }
+ return out;
+}
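+// Illustrative use through the public simdutf front end (an assumption about
+// call sites, not part of this file): the returned int is a bitmask of
+// encoding_type values, e.g.
+//   int enc = simdutf::detect_encodings(data, size);
+//   bool maybe_utf8 = (enc & simdutf::encoding_type::UTF8) != 0;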
+
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+ return haswell::utf8_validation::generic_validate_utf8(buf, len);
+}
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+}
+
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+ return haswell::utf8_validation::generic_validate_ascii(buf, len);
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-16. protect the implementation from
+ // handling nullptr
+ return true;
+ }
+ const char16_t *tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+ if (tail) {
+ return scalar::utf16::validate<endianness::LITTLE>(tail,
+ len - (tail - buf));
+ } else {
+ return false;
+ }
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-16. protect the implementation from
+ // handling nullptr
+ return true;
+ }
+ const char16_t *tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+ if (tail) {
+ return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+ } else {
+ return false;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-32. protect the implementation from
+ // handling nullptr
+ return true;
+ }
+ const char32_t *tail = avx2_validate_utf32le(buf, len);
+ if (tail) {
+ return scalar::utf32::validate(tail, len - (tail - buf));
+ } else {
+ return false;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-32. protect the implementation from
+ // handling nullptr
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = avx2_validate_utf32le_with_errors(buf, len);
+ if (res.count != len) {
+ result scalar_res =
+ scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept {
+ std::pair<const char *, char *> ret =
+ avx2_convert_latin1_to_utf8(buf, len, utf8_output);
+ size_t converted_chars = ret.second - utf8_output;
+
+ if (ret.first != buf + len) {
+ const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
+ ret.first, len - (ret.first - buf), ret.second);
+ converted_chars += scalar_converted_chars;
+ }
+
+ return converted_chars;
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ std::pair<const char *, char16_t *> ret =
+ avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t converted_chars = ret.second - utf16_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_converted_chars =
+ scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_converted_chars == 0) {
+ return 0;
+ }
+ converted_chars += scalar_converted_chars;
+ }
+ return converted_chars;
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ std::pair<const char *, char16_t *> ret =
+ avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t converted_chars = ret.second - utf16_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_converted_chars =
+ scalar::latin1_to_utf16::convert<endianness::BIG>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_converted_chars == 0) {
+ return 0;
+ }
+ converted_chars += scalar_converted_chars;
+ }
+ return converted_chars;
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ std::pair<const char *, char32_t *> ret =
+ avx2_convert_latin1_to_utf32(buf, len, utf32_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t converted_chars = ret.second - utf32_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_converted_chars == 0) {
+ return 0;
+ }
+ converted_chars += scalar_converted_chars;
+ }
+ return converted_chars;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ utf8_to_latin1::validating_transcoder converter;
+ return converter.convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ utf8_to_latin1::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+ const char *input, size_t size, char *latin1_output) const noexcept {
+ return utf8_to_latin1::convert_valid(input, size, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+ const char *input, size_t size, char16_t *utf16_output) const noexcept {
+ return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+ utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+ const char *input, size_t size, char16_t *utf16_output) const noexcept {
+ return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
+ utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ utf8_to_utf32::validating_transcoder converter;
+ return converter.convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ utf8_to_utf32::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+ const char *input, size_t size, char32_t *utf32_output) const noexcept {
+ return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<const char16_t *, char *> ret =
+ haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
+ latin1_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - latin1_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<const char16_t *, char *> ret =
+ haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len,
+ latin1_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - latin1_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf16_to_latin1::convert<endianness::BIG>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+ latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ // optimization opportunity: implement a custom function
+ return convert_utf16be_to_latin1(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ // optimization opportunity: implement a custom function
+ return convert_utf16le_to_latin1(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ std::pair<const char16_t *, char *> ret =
+ haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len,
+ utf8_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf8_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ std::pair<const char16_t *, char *> ret =
+ haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len,
+ utf8_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf8_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf16_to_utf8::convert<endianness::BIG>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
+ buf, len, utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(
+ buf, len, utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return convert_utf16le_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return convert_utf16be_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ std::pair<const char32_t *, char *> ret =
+ avx2_convert_utf32_to_utf8(buf, len, utf8_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf8_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<const char32_t *, char *> ret =
+ avx2_convert_utf32_to_latin1(buf, len, latin1_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - latin1_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ return convert_utf32_to_latin1(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ std::pair<const char16_t *, char32_t *> ret =
+ haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+ utf32_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf32_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ std::pair<const char16_t *, char32_t *> ret =
+ haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len,
+ utf32_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf32_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf16_to_utf32::convert<endianness::BIG>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
+ buf, len, utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(
+ buf, len, utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ std::pair<const char32_t *, char16_t *> ret =
+ avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf16_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ std::pair<const char32_t *, char16_t *> ret =
+ avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+ if (ret.first == nullptr) {
+ return 0;
+ }
+ size_t saved_bytes = ret.second - utf16_output;
+ if (ret.first != buf + len) {
+ const size_t scalar_saved_bytes =
+ scalar::utf32_to_utf16::convert<endianness::BIG>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_saved_bytes == 0) {
+ return 0;
+ }
+ saved_bytes += scalar_saved_bytes;
+ }
+ return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return convert_utf16le_to_utf32(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return convert_utf16be_to_utf32(buf, len, utf32_output);
+}
+
+void implementation::change_endianness_utf16(const char16_t *input,
+ size_t length,
+ char16_t *output) const noexcept {
+ utf16::change_endianness_utf16(input, length, output);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::count_code_points<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+ const char *buf, size_t len) const noexcept {
+ return count_utf8(buf, len);
+}
+
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+ return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+ return scalar::utf32::latin1_length_from_utf32(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+ return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+ return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+ const char *input, size_t len) const noexcept {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+ size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
+ size_t i = 0;
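+  // A Latin-1 byte expands to one UTF-8 byte, plus one extra byte when it is
+  // 0x80 or above. `answer` starts at the number of bytes the vector loops
+  // cover; the loops below add one per non-ASCII byte, and the scalar call
+  // at the end handles the remaining tail.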
+ if (answer >= 2048) { // long strings optimization
+ __m256i four_64bits = _mm256_setzero_si256();
+ while (i + sizeof(__m256i) <= len) {
+ __m256i runner = _mm256_setzero_si256();
+ // We can do up to 255 loops without overflow.
+ size_t iterations = (len - i) / sizeof(__m256i);
+ if (iterations > 255) {
+ iterations = 255;
+ }
+ size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
+ for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) {
+ __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
+ __m256i input2 =
+ _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
+ __m256i input3 = _mm256_loadu_si256(
+ (const __m256i *)(data + i + 2 * sizeof(__m256i)));
+ __m256i input4 = _mm256_loadu_si256(
+ (const __m256i *)(data + i + 3 * sizeof(__m256i)));
+ __m256i input12 =
+ _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
+ _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
+        __m256i input34 =
+            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
+                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
+        __m256i input1234 = _mm256_add_epi8(input12, input34);
+ runner = _mm256_sub_epi8(runner, input1234);
+ }
+ for (; i <= max_i; i += sizeof(__m256i)) {
+ __m256i input_256_chunk =
+ _mm256_loadu_si256((const __m256i *)(data + i));
+ runner = _mm256_sub_epi8(
+ runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+ }
+ four_64bits = _mm256_add_epi64(
+ four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
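+      // _mm256_sad_epu8 against zero horizontally sums the 32 per-byte
+      // counters (each at most 255 thanks to the iteration cap) into four
+      // 64-bit lanes, so the accumulator cannot overflow.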
+ }
+ answer += _mm256_extract_epi64(four_64bits, 0) +
+ _mm256_extract_epi64(four_64bits, 1) +
+ _mm256_extract_epi64(four_64bits, 2) +
+ _mm256_extract_epi64(four_64bits, 3);
+ } else if (answer > 0) {
+ for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
+ __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
+ uint32_t non_ascii = _mm256_movemask_epi8(latin);
+ answer += count_ones(non_ascii);
+ }
+ }
+ return answer + scalar::latin1::utf8_length_from_latin1(
+ reinterpret_cast<const char *>(data + i), len - i);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
+ const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ size_t pos = 0;
+ size_t count = 0;
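+  // Each register holds 8 code points needing at most 4 UTF-8 bytes each
+  // (32 total); an ASCII code point saves 3 bytes, a two-byte one saves 2,
+  // and a three-byte one saves 1, hence the subtraction below. The movemask
+  // sets 4 bits per matching 32-bit lane, which the divisions by 4 undo.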
+ for (; pos + 8 <= length; pos += 8) {
+ __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
+ const __m256i ascii_bytes_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
+ const __m256i one_two_bytes_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
+ const __m256i two_bytes_bytemask =
+ _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
+ const __m256i one_two_three_bytes_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const __m256i three_bytes_bytemask =
+ _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+ const uint32_t ascii_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
+ const uint32_t two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
+ const uint32_t three_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+
+ size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
+ size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+ size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+ count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+ }
+ return count +
+ scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ size_t pos = 0;
+ size_t count = 0;
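+  // Despite its name, surrogate_bytemask below marks code points that fit
+  // in 16 bits. Each of the 8 code points needs one UTF-16 code unit, plus
+  // one extra for every code point above 0xFFFF, which requires a surrogate
+  // pair.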
+ for (; pos + 8 <= length; pos += 8) {
+ __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
+ const __m256i surrogate_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t surrogate_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
+ size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4;
+ count += 8 + surrogate_count;
+ }
+ return count +
+ scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options);
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+ char *output,
+ base64_options options) const noexcept {
+ if (options & base64_url) {
+ return encode_base64<true>(output, input, length, options);
+ } else {
+ return encode_base64<false>(output, input, length, options);
+ }
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/haswell/end.h"