aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/simdutf/src/icelake/icelake_utf8_validation.inl.cpp
blob: e00563dc4ca14ce442a8d162f553a2091a0f4f0b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// file included directly

simdutf_really_inline __m512i check_special_cases(__m512i input,
                                                  const __m512i prev1) {
  __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080);
  const __m512i v_0f = _mm512_set1_epi8(0x0f);
  __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);

  __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
  __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
  __m512i index2 = _mm512_and_si512(prev1, v_0f);

  __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
  __m512i mask3 =
      _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101,
                        0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6,
                        0x101010101010101, 0x1010101babaaee6);
  __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
  __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
  return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
}

simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
                                                      const __m512i prev_input,
                                                      const __m512i sc) {
  __m512i prev2 = prev<2>(input, prev_input);
  __m512i prev3 = prev<3>(input, prev_input);
  __m512i is_third_byte = _mm512_subs_epu8(
      prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
  __m512i is_fourth_byte = _mm512_subs_epu8(
      prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
  __m512i is_third_or_fourth_byte =
      _mm512_or_si512(is_third_byte, is_fourth_byte);
  const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
  is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
  // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
  const __m512i v_80 = _mm512_set1_epi8(char(0x80));
  return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc,
                                   0b1101010);
  //__m512i is_third_or_fourth_byte_mask =
  //_mm512_and_si512(is_third_or_fourth_byte, v_80); return
  // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
}
//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline __m512i is_incomplete(const __m512i input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xbfdfefffffffffff);
  return _mm512_subs_epu8(input, max_value);
}

struct avx512_utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  __m512i error{};

  // The last input we received
  __m512i prev_input_block{};
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  __m512i prev_incomplete{};

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const __m512i input,
                                              const __m512i prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    __m512i prev1 = prev<1>(input, prev_input);
    __m512i sc = check_special_cases(input, prev1);
    this->error = _mm512_or_si512(
        check_multibyte_lengths(input, prev_input, sc), this->error);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error = _mm512_or_si512(this->error, this->prev_incomplete);
  }

  // returns true if ASCII.
  simdutf_really_inline bool check_next_input(const __m512i input) {
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
    if (ascii == 0) {
      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
      return true;
    } else {
      this->check_utf8_bytes(input, this->prev_input_block);
      this->prev_incomplete = is_incomplete(input);
      this->prev_input_block = input;
      return false;
    }
  }
  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return _mm512_test_epi8_mask(this->error, this->error) != 0;
  }
}; // struct avx512_utf8_checker