aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp
blob: d2d698294e34a2ebf85cb973377621f7708f9175 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// file included directly

/**
 * This function converts the input (inbuf, inlen), assumed to be valid
 * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units
 * written is written to 'outlen' and the function reports the number of input
 * word consumed.
 */
template <endianness big_endian>
size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
                             unsigned char *outbuf, size_t *outlen) {
  __m512i in;
  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  const char16_t *const inbuf_orig = inbuf;
  const unsigned char *const outbuf_orig = outbuf;
  int adjust = 0;
  int carry = 0;

  while (inlen >= 32) {
    in = _mm512_loadu_si512(inbuf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    inlen -= 31;
  lastiteration:
    inbuf += 31;

  failiteration:
    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
        inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);

    if (_ktestz_mask32_u8(inmask, is234byte)) {
      // fast path for ASCII only
      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
      outbuf += 31;
      carry = 0;

      if (inlen < 32) {
        goto tail;
      } else {
        continue;
      }
    }

    const __mmask32 is12byte =
        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);

    if (_ktestc_mask32_u8(is12byte, inmask)) {
      // fast path for 1 and 2 byte only

      const __m512i twobytes = _mm512_ternarylogic_epi32(
          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
                                 _mm512_set1_epi16(int16_t(0x80c0)));
      const __m512i cmpmask =
          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
                                  _mm512_set1_epi16(0x0800));
      const __mmask64 smoosh =
          _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
      _mm512_mask_storeu_epi8(outbuf,
                              _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh),
                                                       _cvtmask64_u64(smoosh))),
                              out);
      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
      carry = 0;

      if (inlen < 32) {
        goto tail;
      } else {
        continue;
      }
    }
    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));

    __m512i taglo = _mm512_set1_epi32(0x8080e000);
    __m512i taghi = taglo;

    const __m512i fc00masked =
        _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
    const __mmask32 losurr = _mm512_cmp_epu16_mask(
        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);

    int carryout = 0;
    if (!_kortestz_mask32_u8(hisurr, losurr)) {
      // handle surrogates

      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
      __m512i his = _mm512_alignr_epi32(lo, hi, 1);

      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
      taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr),
                                    _mm512_set1_epi32(0x808080f0));
      taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi),
                                    _mm512_set1_epi32(0x808080f0));

      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);

      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));

      const uint32_t h = _cvtmask32_u32(hisurr);
      const uint32_t l = _cvtmask32_u32(losurr);
      // check for mismatched surrogates
      if ((h + h + carry) ^ l) {
        const uint32_t lonohi = l & ~(h + h + carry);
        const uint32_t hinolo = h & ~(l >> 1);
        inlen = _tzcnt_u32(hinolo | lonohi);
        inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1));
        in = _mm512_maskz_mov_epi16(inmask, in);
        adjust = (int)inlen - 31;
        inlen = 0;
        goto failiteration;
      }
    }

    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
    carry = carryout;

    __m512i mslo =
        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);

    __m512i mshi =
        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);

    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);

    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);

    taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte),
                                  _mm512_set1_epi32(0x80c00000));
    taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi),
                                  _mm512_set1_epi32(0x80c00000));
    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
                                              _mm512_set1_epi32(0xffffffff),
                                              _mm512_set1_epi32(0x00010101));
    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
                                              _mm512_set1_epi32(0xffffffff),
                                              _mm512_set1_epi32(0x00010101));

    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
                                      _mm512_set1_epi32(0xffffffff),
                                      _mm512_set1_epi32(0x00010101));
    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
                                      _mm512_set1_epi32(0xffffffff),
                                      _mm512_set1_epi32(0x00010101));

    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
                                     0xea); // A&B|C
    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
                                     0xea);
    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);

    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);

    const __mmask64 wantlo =
        _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
    const __mmask64 wanthi =
        _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);

    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);

    _mm512_mask_storeu_epi8(
        outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
    _mm512_mask_storeu_epi8(
        outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)),
        outhi);
    outbuf += advlo + advhi;
  }
  outbuf += -adjust;

tail:
  if (inlen != 0) {
    // We must have inlen < 31.
    inmask = _cvtu32_mask32((1U << inlen) - 1);
    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
    if (big_endian) {
      in = _mm512_shuffle_epi8(in, byteflip);
    }
    adjust = (int)inlen - 31;
    inlen = 0;
    goto lastiteration;
  }
  *outlen = (outbuf - outbuf_orig) + adjust;
  return ((inbuf - inbuf_orig) + adjust);
}