aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/simdutf/src/scalar/utf16.h
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/simdutf/src/scalar/utf16.h')
-rw-r--r--contrib/simdutf/src/scalar/utf16.h142
1 files changed, 142 insertions, 0 deletions
diff --git a/contrib/simdutf/src/scalar/utf16.h b/contrib/simdutf/src/scalar/utf16.h
new file mode 100644
index 000000000..838e95dc7
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16.h
@@ -0,0 +1,142 @@
+#ifndef SIMDUTF_UTF16_H
+#define SIMDUTF_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16 {
+
+inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
+ return uint16_t((word >> 8) | (word << 8));
+}
+
+template <endianness big_endian>
+inline simdutf_warn_unused bool validate(const char16_t *buf,
+ size_t len) noexcept {
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+ uint64_t pos = 0;
+ while (pos < len) {
+ uint16_t word =
+ !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+ if ((word & 0xF800) == 0xD800) {
+ if (pos + 1 >= len) {
+ return false;
+ }
+ uint16_t diff = uint16_t(word - 0xD800);
+ if (diff > 0x3FF) {
+ return false;
+ }
+ uint16_t next_word =
+ !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if (diff2 > 0x3FF) {
+ return false;
+ }
+ pos += 2;
+ } else {
+ pos++;
+ }
+ }
+ return true;
+}
+
+template <endianness big_endian>
+inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
+ size_t len) noexcept {
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+ size_t pos = 0;
+ while (pos < len) {
+ uint16_t word =
+ !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+ if ((word & 0xF800) == 0xD800) {
+ if (pos + 1 >= len) {
+ return result(error_code::SURROGATE, pos);
+ }
+ uint16_t diff = uint16_t(word - 0xD800);
+ if (diff > 0x3FF) {
+ return result(error_code::SURROGATE, pos);
+ }
+ uint16_t next_word =
+ !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if (diff2 > 0x3FF) {
+ return result(error_code::SURROGATE, pos);
+ }
+ pos += 2;
+ } else {
+ pos++;
+ }
+ }
+ return result(error_code::SUCCESS, pos);
+}
+
+template <endianness big_endian>
+inline size_t count_code_points(const char16_t *buf, size_t len) {
+ // We are not BOM aware.
+ const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+ counter += ((word & 0xFC00) != 0xDC00);
+ }
+ return counter;
+}
+
+template <endianness big_endian>
+inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
+ // We are not BOM aware.
+ const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+ counter++; // ASCII
+ counter += static_cast<size_t>(
+ word >
+ 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
+ counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
+ (word >= 0xE000)); // three-byte
+ }
+ return counter;
+}
+
+template <endianness big_endian>
+inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) {
+ // We are not BOM aware.
+ const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+ counter += ((word & 0xFC00) != 0xDC00);
+ }
+ return counter;
+}
+
+inline size_t latin1_length_from_utf16(size_t len) { return len; }
+
+simdutf_really_inline void change_endianness_utf16(const char16_t *in,
+ size_t size, char16_t *out) {
+ const uint16_t *input = reinterpret_cast<const uint16_t *>(in);
+ uint16_t *output = reinterpret_cast<uint16_t *>(out);
+ for (size_t i = 0; i < size; i++) {
+ *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
+ }
+}
+
+template <endianness big_endian>
+simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input,
+ size_t length) {
+ if (length <= 1) {
+ return length;
+ }
+ uint16_t last_word = uint16_t(input[length - 1]);
+ last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word;
+ length -= ((last_word & 0xFC00) == 0xD800);
+ return length;
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif