#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
namespace simdutf {
namespace SIMDUTF_IMPLEMENTATION {
namespace {
namespace utf8_to_utf16 {
using namespace simd;
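
// Converts a valid UTF-8 string to UTF-16 with the requested byte order.
// The input is assumed to be valid UTF-8 (no validation is performed here);
// the return value is the number of char16_t code units written.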
template <endianness endian>
simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char16_t *utf16_output) noexcept {
  // The implementation is not specific to haswell and should be moved to the
  // generic directory.
  size_t pos = 0;
  char16_t *start{utf16_output};
  const size_t safety_margin = 16; // to avoid overruns!
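  // (The margin is presumably 16 because convert_masked_utf8_to_utf16 below
  // may load a full 16-byte vector starting anywhere inside the 64-byte
  // block, so we require 16 spare input bytes past the block.)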
  while (pos + 64 + safety_margin <= size) {
    // This loop could be unrolled further. For example, we could process
    // far more than 64 bytes at a time.
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
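    // ASCII fast path: when all 64 bytes are ASCII, every byte maps to
    // exactly one UTF-16 code unit, so the whole block is widened at once.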
    if (in.is_ascii()) {
      in.store_ascii_as_utf16<endian>(utf16_output);
      utf16_output += 64;
      pos += 64;
    } else {
      // Slow path. We hope that the compiler will recognize that this is a
      // slow path. Anything that is not a continuation byte is a 'leading
      // byte', that is, the start of a new code point.
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      // -65 is 0b10111111 in two's complement, so it is the largest possible
      // continuation byte.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      // The *start* of code points is not so useful; rather, we want the
      // *end* of code points.
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
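      // Worked example (illustrative): for the bytes 0x61 0xC3 0xA9 ('a'
      // followed by U+00E9), only byte 2 (0xA9) is a continuation byte, so
      // the continuation mask is 0b100 and the leading mask is its
      // complement. Shifting the leading mask right by one sets bits 0 and
      // 2: 'a' ends at byte 0 and U+00E9 ends at byte 2.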
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // The next loop is going to run at least five times when using solely
      // the slow/regular path, and at least four times if there are fast
      // paths.
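      // (Each slow-path call consumes at most 12 bytes and each fast-path
      // call at most 16, so covering the 52 bytes up to max_starting_point
      // takes at least ceil(52/12) = 5 or ceil(52/16) = 4 calls,
      // respectively.)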
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 and 12 input bytes, so we get
        // a speed limit between 1 cycle/byte and 0.5 cycles/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        //
        // Thus we may allow convert_masked_utf8_to_utf16 to process
        // more bytes at a time under a fast-path mode where 16 bytes
        // are consumed at once (e.g., when encountering ASCII).
        size_t consumed = convert_masked_utf8_to_utf16<endian>(
            input + pos, utf8_end_of_code_point_mask, utf16_output);
        pos += consumed;
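        // Each bit of the mask tracks one input byte; shift so that bit 0
        // again corresponds to input[pos].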
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block; these bytes will be processed again. So we have
      // roughly 80% efficiency in the worst case (52 of the 64 bytes are
      // consumed for good). In practice we expect 85% to 90% efficiency.
    }
  }
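  // Process the tail (fewer than 64 + safety_margin remaining bytes) with
  // the scalar fallback, starting where the vectorized loop stopped.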
  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
      input + pos, size - pos, utf16_output);
  return utf16_output - start;
}

} // namespace utf8_to_utf16
} // unnamed namespace
} // namespace SIMDUTF_IMPLEMENTATION
} // namespace simdutf