blob: 4110e69033d884bf89a98824feaad8e00294ebc1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
#define SIMDUTF_VALID_UTF8_TO_UTF32_H
namespace simdutf {
namespace scalar {
namespace {
namespace utf8_to_utf32 {
/**
 * Transcode UTF-8 bytes into UTF-32 code points without validating them.
 *
 * The input is expected to be well-formed UTF-8: continuation bytes are
 * masked and merged as-is, and no overlong/surrogate/range checks are made.
 * Nothing here checks the capacity of the output buffer either, so the
 * caller has to provide enough room (one char32_t per input byte is always
 * sufficient, since every sequence consumes at least one byte).
 *
 * @param buf          start of the UTF-8 input
 * @param len          number of bytes readable from buf
 * @param utf32_output destination for the decoded code points
 * @return number of char32_t values written. If the input ends in the middle
 *         of a multi-byte sequence, decoding stops there and the count so far
 *         is returned. If a byte that cannot start a sequence is met
 *         (continuation byte or 0xF8..0xFF), 0 is returned even though some
 *         output may already have been written.
 */
inline size_t convert_valid(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
  char32_t *out = utf32_output;
  size_t i = 0;

  while (i < len) {
    const size_t remaining = len - i; // always >= 1 inside the loop

    // Fast path: when a full 8-byte window is available and none of its
    // bytes has the high bit set, the window is pure ASCII and each byte
    // widens directly to one code point.
    if (remaining >= 8) {
      uint64_t window;
      ::memcpy(&window, bytes + i, sizeof(window));
      if ((window & 0x8080808080808080) == 0) {
        for (size_t k = 0; k < 8; ++k) {
          out[k] = char32_t(bytes[i + k]);
        }
        out += 8;
        i += 8;
        continue;
      }
    }

    // Slow path: work out the sequence length from the lead byte.
    const uint8_t lead = bytes[i];
    size_t seq_len;
    if (lead < 0x80) {
      seq_len = 1; // 0xxxxxxx
    } else if ((lead & 0xE0) == 0xC0) {
      seq_len = 2; // 110xxxxx
    } else if ((lead & 0xF0) == 0xE0) {
      seq_len = 3; // 1110xxxx
    } else if ((lead & 0xF8) == 0xF0) {
      seq_len = 4; // 11110xxx
    } else {
      // Stray continuation byte or 0xF8+; no error reporting beyond this.
      return 0;
    }

    // Minimal bounds check: stop quietly if the sequence is cut off.
    if (remaining < seq_len) {
      break;
    }

    uint32_t code_point;
    switch (seq_len) {
    case 1:
      code_point = lead;
      break;
    case 2:
      code_point = (uint32_t(lead & 0x1F) << 6) |
                   uint32_t(bytes[i + 1] & 0x3F);
      break;
    case 3:
      code_point = (uint32_t(lead & 0x0F) << 12) |
                   (uint32_t(bytes[i + 1] & 0x3F) << 6) |
                   uint32_t(bytes[i + 2] & 0x3F);
      break;
    default: // 4
      code_point = (uint32_t(lead & 0x07) << 18) |
                   (uint32_t(bytes[i + 1] & 0x3F) << 12) |
                   (uint32_t(bytes[i + 2] & 0x3F) << 6) |
                   uint32_t(bytes[i + 3] & 0x3F);
      break;
    }

    *out++ = char32_t(code_point);
    i += seq_len;
  }

  return out - utf32_output;
}
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace scalar
} // namespace simdutf
#endif
|