contrib/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

#include "scalar/utf8_to_latin1/utf8_to_latin1.h"

namespace simdutf {
namespace SIMDUTF_IMPLEMENTATION {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace SIMDUTF_IMPLEMENTATION
} // namespace simdutf
  // namespace simdutf