1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#ifndef SIMDUTF_LATIN1_TO_UTF8_H
#define SIMDUTF_LATIN1_TO_UTF8_H
namespace simdutf {
namespace scalar {
namespace {
namespace latin1_to_utf8 {
// Transcodes `len` Latin-1 bytes starting at `buf` into UTF-8, writing the
// result to `utf8_output`, and returns the number of bytes written.
//
// Bytes below 0x80 are copied unchanged; every other byte expands to a
// two-byte sequence. No bound is checked on the output, so the caller must
// supply room for up to 2 * len bytes.
inline size_t convert(const char *buf, size_t len, char *utf8_output) {
  const unsigned char *input = reinterpret_cast<const unsigned char *>(buf);
  // The most significant bit of each of the eight bytes packed in a 64-bit word.
  const uint64_t high_bits = 0x8080808080808080ULL;
  size_t in_idx = 0;
  size_t out_idx = 0;

  while (in_idx < len) {
    // Fast path: when 16 input bytes remain, test them all for ASCII at once.
    if (in_idx + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      ::memcpy(&lo, input + in_idx, sizeof(lo));
      ::memcpy(&hi, input + in_idx + sizeof(lo), sizeof(hi));
      // OR-ing both halves keeps any high bit that is set in either of them.
      if (((lo | hi) & high_bits) == 0) {
        // All 16 bytes are ASCII: copy them through verbatim.
        for (const size_t block_end = in_idx + 16; in_idx < block_end;
             ++in_idx) {
          utf8_output[out_idx++] = buf[in_idx];
        }
        continue;
      }
    }

    // Slow path: handle exactly one input byte.
    const unsigned char current = input[in_idx++];
    if (current < 0x80) {
      // ASCII maps to a single UTF-8 byte.
      utf8_output[out_idx++] = char(current);
    } else {
      // 0x80..0xFF maps to a lead byte (110xxxxx) followed by a continuation
      // byte (10xxxxxx).
      utf8_output[out_idx++] = char(0xC0 | (current >> 6));
      utf8_output[out_idx++] = char(0x80 | (current & 0x3F));
    }
  }
  return out_idx;
}
// Bounded variant of the Latin-1 to UTF-8 conversion: transcodes bytes from
// `buf` (at most `len` of them) into `utf8_output` without ever writing more
// than `utf8_len` bytes, and returns the number of bytes written.
//
// Conversion stops when the input is exhausted, when the output is full, or
// when the next input byte needs two output bytes but fewer than two remain,
// so a two-byte sequence is never emitted partially.
inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
                           size_t utf8_len) {
  const unsigned char *input = reinterpret_cast<const unsigned char *>(buf);
  // The most significant bit of each of the eight bytes packed in a 64-bit word.
  const uint64_t high_bits = 0x8080808080808080ULL;
  size_t in_idx = 0;
  size_t out_idx = 0;
  // Input index below which the 16-byte fast path is not attempted, because a
  // block starting earlier was already found to contain a non-ASCII byte.
  size_t bytewise_until = 0;

  while (in_idx < len && out_idx < utf8_len) {
    // The fast path needs 16 readable input bytes and 16 writable output
    // bytes.
    const bool try_block = in_idx >= bytewise_until && in_idx + 16 <= len &&
                           out_idx + 16 <= utf8_len;
    if (try_block) {
      uint64_t lo;
      uint64_t hi;
      ::memcpy(&lo, input + in_idx, sizeof(lo));
      ::memcpy(&hi, input + in_idx + sizeof(lo), sizeof(hi));
      // OR-ing both halves keeps any high bit that is set in either of them.
      if (((lo | hi) & high_bits) != 0) {
        // At least one of these 16 bytes is not ASCII: go byte by byte until
        // the end of this block before trying the fast path again.
        bytewise_until = in_idx + 16;
        continue;
      }
      // All 16 bytes are ASCII: copy them through in one go.
      ::memcpy(utf8_output + out_idx, buf + in_idx, 16);
      out_idx += 16;
      in_idx += 16;
      continue;
    }

    // Slow path: handle exactly one input byte.
    const unsigned char current = input[in_idx];
    if (current < 0x80) {
      // ASCII maps to a single UTF-8 byte; the loop condition guarantees room.
      utf8_output[out_idx++] = char(current);
    } else {
      if (out_idx + 2 > utf8_len) {
        // Not enough room left for the two-byte sequence.
        break;
      }
      // 0x80..0xFF maps to a lead byte (110xxxxx) followed by a continuation
      // byte (10xxxxxx).
      utf8_output[out_idx++] = char(0xC0 | (current >> 6));
      utf8_output[out_idx++] = char(0x80 | (current & 0x3F));
    }
    ++in_idx;
  }
  return out_idx;
}
} // namespace latin1_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf
#endif
|