contrib/simdutf/src/arm64/arm_validate_utf16.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

template <endianness big_endian>
const char16_t *arm_validate_utf16(const char16_t *input, size_t size) {
  const char16_t *end = input + size;
  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);
  while (end - input >= 16) {
    // 0. Load data: since the validation takes into account only higher
    //    byte of each word, we compress the two vectors into one which
    //    consists only the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
    if (!match_system(big_endian)) {
      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
    }
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();
    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
    if (surrogates_wordmask == 0) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint64_t V = ~surrogates_wordmask;

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = ((in & v_fc) == v_dc);
      const uint64_t H = vH.to_bitmask64();

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint64_t L = ~H & surrogates_wordmask;

      const uint64_t a =
          L & (H >> 4); // A low surrogate must be followed by high one.
                        // (A low surrogate placed in the 7th register's word
                        // is an exception we handle.)
      const uint64_t b =
          a << 4; // Just mark that the opposite fact is hold,
                  // thanks to that we have only two masks for valid case.
      const uint64_t c = V | a | b; // Combine all the masks into the final one.
      if (c == ~0ull) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0xfffffffffffffffull) {
        // The 15 lower code units of the input register contains valid UTF-16.
        // The 15th word may be either a low or high surrogate. It the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject sole high surrogate.
        input += 15;
      } else {
        return nullptr;
      }
    }
  }
  return input;
}

template <endianness big_endian>
const result arm_validate_utf16_with_errors(const char16_t *input,
                                            size_t size) {
  const char16_t *start = input;
  const char16_t *end = input + size;

  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
  const auto v_fc = simd8<uint8_t>::splat(0xfc);
  const auto v_dc = simd8<uint8_t>::splat(0xdc);
  while (input + 16 < end) {
    // 0. Load data: since the validation takes into account only higher
    //    byte of each word, we compress the two vectors into one which
    //    consists only the higher bytes.
    auto in0 = simd16<uint16_t>(input);
    auto in1 =
        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

    if (!match_system(big_endian)) {
      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
    }
    const auto t0 = in0.shr<8>();
    const auto t1 = in1.shr<8>();
    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
    if (surrogates_wordmask == 0) {
      input += 16;
    } else {
      // 2. We have some surrogates that have to be distinguished:
      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
      //
      //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)

      // V - non-surrogate code units
      //     V = not surrogates_wordmask
      const uint64_t V = ~surrogates_wordmask;

      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
      const auto vH = ((in & v_fc) == v_dc);
      const uint64_t H = vH.to_bitmask64();

      // L - word mask for low surrogates
      //     L = not H and surrogates_wordmask
      const uint64_t L = ~H & surrogates_wordmask;

      const uint64_t a =
          L & (H >> 4); // A low surrogate must be followed by high one.
                        // (A low surrogate placed in the 7th register's word
                        // is an exception we handle.)
      const uint64_t b =
          a << 4; // Just mark that the opposite fact is hold,
                  // thanks to that we have only two masks for valid case.
      const uint64_t c = V | a | b; // Combine all the masks into the final one.
      if (c == ~0ull) {
        // The whole input register contains valid UTF-16, i.e.,
        // either single code units or proper surrogate pairs.
        input += 16;
      } else if (c == 0xfffffffffffffffull) {
        // The 15 lower code units of the input register contains valid UTF-16.
        // The 15th word may be either a low or high surrogate. It the next
        // iteration we 1) check if the low surrogate is followed by a high
        // one, 2) reject sole high surrogate.
        input += 15;
      } else {
        return result(error_code::SURROGATE, input - start);
      }
    }
  }
  return result(error_code::SUCCESS, input - start);
}