aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/simdutf/src/scalar/utf16.h
blob: 838e95dc7cbd648080bc966fff3ca92d48471dab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#ifndef SIMDUTF_UTF16_H
#define SIMDUTF_UTF16_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf16 {

inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
  return uint16_t((word >> 8) | (word << 8));
}

template <endianness big_endian>
inline simdutf_warn_unused bool validate(const char16_t *buf,
                                         size_t len) noexcept {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  uint64_t pos = 0;
  while (pos < len) {
    uint16_t word =
        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) == 0xD800) {
      if (pos + 1 >= len) {
        return false;
      }
      uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return false;
      }
      uint16_t next_word =
          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return false;
      }
      pos += 2;
    } else {
      pos++;
    }
  }
  return true;
}

template <endianness big_endian>
inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
                                                       size_t len) noexcept {
  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
  size_t pos = 0;
  while (pos < len) {
    uint16_t word =
        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
    if ((word & 0xF800) == 0xD800) {
      if (pos + 1 >= len) {
        return result(error_code::SURROGATE, pos);
      }
      uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      uint16_t next_word =
          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
      uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) {
        return result(error_code::SURROGATE, pos);
      }
      pos += 2;
    } else {
      pos++;
    }
  }
  return result(error_code::SUCCESS, pos);
}

template <endianness big_endian>
inline size_t count_code_points(const char16_t *buf, size_t len) {
  // We are not BOM aware.
  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
    counter += ((word & 0xFC00) != 0xDC00);
  }
  return counter;
}

template <endianness big_endian>
inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
  // We are not BOM aware.
  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
    counter++; // ASCII
    counter += static_cast<size_t>(
        word >
        0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
    counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
                                   (word >= 0xE000)); // three-byte
  }
  return counter;
}

template <endianness big_endian>
inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) {
  // We are not BOM aware.
  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
    counter += ((word & 0xFC00) != 0xDC00);
  }
  return counter;
}

inline size_t latin1_length_from_utf16(size_t len) { return len; }

simdutf_really_inline void change_endianness_utf16(const char16_t *in,
                                                   size_t size, char16_t *out) {
  const uint16_t *input = reinterpret_cast<const uint16_t *>(in);
  uint16_t *output = reinterpret_cast<uint16_t *>(out);
  for (size_t i = 0; i < size; i++) {
    *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
  }
}

template <endianness big_endian>
simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input,
                                                     size_t length) {
  if (length <= 1) {
    return length;
  }
  uint16_t last_word = uint16_t(input[length - 1]);
  last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word;
  length -= ((last_word & 0xFC00) == 0xD800);
  return length;
}

} // namespace utf16
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif