aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp
blob: 4d4738d9cca2f003ade002db1c3920e624daa029 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// file included directly
template <endianness big_endian>
size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
                                       char16_t *utf16_output) {
  size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32

  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  for (size_t i = 0; i < rounded_len; i += 32) {
    // Load 32 Latin1 characters into a 256-bit register
    __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
    // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
    __m512i out = _mm512_cvtepu8_epi16(in);
    if (big_endian) {
      out = _mm512_shuffle_epi8(out, byteflip);
    }
    // Store the results back to memory
    _mm512_storeu_si512((__m512i *)&utf16_output[i], out);
  }
  if (rounded_len != len) {
    uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1;
    __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);

    // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
    __m512i out = _mm512_cvtepu8_epi16(in);
    if (big_endian) {
      out = _mm512_shuffle_epi8(out, byteflip);
    }
    // Store the results back to memory
    _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out);
  }

  return len;
}