aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp
blob: bff746a540742e36fa20798703188b5b3cf41ffd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// file included directly

// File contains conversion procedure from VALID UTF-8 strings.

/*
    valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.

    The `OUTPUT` template type decides what to do with UTF-32: store
    it directly or convert into UTF-16 (with AVX512).

    Input:
    - str           - valid UTF-8 string
    - len           - string length
    - out_buffer    - output buffer

    Result:
    - pair.first    - the first unprocessed input byte
    - pair.second   - the first unprocessed output word
*/
template <endianness big_endian, typename OUTPUT>
std::pair<const char *, OUTPUT *>
valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  const char *ptr = str;
  const char *end = ptr + len;

  OUTPUT *output = dwords;
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   * We check for ptr + 64 + 64 <= end because
   * we want to be do maskless writes without overruns.
   */
  while (end - ptr >= 64 + 4) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
    if (ascii == 0) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }

    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }

  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
    if (ascii == 0) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
  }
  return {ptr, output};
}

using utf8_to_utf16_result = std::pair<const char *, char16_t *>;