// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
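// For illustration, each set bit in the mask marks the last byte of a code
// point: 16 ASCII bytes give 0xffff, and the two characters "a" (61) and
// "é" (C3 A9) set bits 0 and 2.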
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
uint64_t utf8_end_of_code_point_mask,
char16_t *&utf16_output) {
// we use an approach where we try to process up to 12 input bytes.
// Why 12 input bytes and not 16? Because we are concerned with the size of
// the lookup tables. Also 12 is nicely divisible by two and three.
//
uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
const uint16_t input_utf8_end_of_code_point_mask =
utf8_end_of_code_point_mask & 0xfff;
//
// Optimization note: our main path below is load-latency dependent. Thus it
// may be beneficial to have fast paths that depend on branch prediction but
// have less latency. This results in more instructions but, potentially, also
// higher speeds.
// We first try a few fast paths.
// The obvious first test is ASCII, which actually consumes the full 16.
if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
// We process in chunks of 16 bytes
// The routine in simd.h is reused.
simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
temp.store_ascii_as_utf16<big_endian>(utf16_output);
utf16_output += 16; // We wrote 16 16-bit characters.
return 16; // We consumed 16 bytes.
}
// 3 byte sequences are the next most common, as seen in CJK, which has long
// sequences of these.
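// For illustration, 0x924 is 0b1001'0010'0100: code points end at byte
// offsets 2, 5, 8 and 11, which is exactly four consecutive 3-byte sequences
// in the first 12 bytes.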
if (input_utf8_end_of_code_point_mask == 0x924) {
// We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
// UTF-16 code units.
uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
// Byte swap if necessary
if (!match_system(big_endian)) {
composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
}
vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
utf16_output += 4; // We wrote 4 16-bit characters.
return 12; // We consumed 12 bytes.
}
// 2 byte sequences occur in short bursts in languages like Greek and Russian.
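// For illustration, 0xaaa is 0b1010'1010'1010: code points end at byte
// offsets 1, 3, 5, 7, 9 and 11, that is, six consecutive 2-byte sequences in
// the first 12 bytes.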
if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
// We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
// UTF-16 code units.
uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
// Byte swap if necessary
if (!match_system(big_endian)) {
composed =
vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
}
vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
utf16_output += 6; // We wrote 6 16-bit characters.
return 12; // We consumed 12 bytes.
}
/// We do not have a fast path available, or the fast path is unimportant, so
/// we fall back.
const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
[input_utf8_end_of_code_point_mask][0];
const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
[input_utf8_end_of_code_point_mask][1];
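// (Summary, inferred from the branches below: idx < 64 selects the path with
// only 1-2 byte sequences, idx < 145 allows 3-byte sequences, idx < 209
// allows 4-byte sequences, and anything larger indicates invalid input.)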
if (idx < 64) {
// SIX (6) input code units
// Convert to UTF-16
uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
// Byte swap if necessary
if (!match_system(big_endian)) {
composed =
vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
}
// Store
vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
utf16_output += 6; // We wrote 6 16-bit characters.
return consumed;
} else if (idx < 145) {
// FOUR (4) input code units
// UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
simdutf::tables::utf8_to_utf16::shufutf8[idx]));
// XXX: depending on the system, scalar instructions might be faster.
// 1 byte: 00000000 00000000 0ccccccc
// 2 byte: 00000000 110bbbbb 10cccccc
// 3 byte: 1110aaaa 10bbbbbb 10cccccc
uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
// 1 byte: 00000000 0ccccccc
// 2 byte: xx0bbbbb x0cccccc
// 3 byte: xxbbbbbb x0cccccc
uint16x4_t lowperm = vmovn_u32(perm);
// Partially mask with bic (doesn't require a temporary register unlike and)
// The shift left insert below will clear the top bits.
// 1 byte: 00000000 00000000
// 2 byte: xx0bbbbb 00000000
// 3 byte: xxbbbbbb 00000000
uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
// ASCII
// 1 byte: 00000000 0ccccccc
// 2+byte: 00000000 00cccccc
uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
// Split into narrow vectors.
// 2 byte: 00000000 00000000
// 3 byte: 00000000 xxxxaaaa
uint16x4_t highperm = vshrn_n_u32(perm, 16);
// Shift right accumulate the middle byte
// 1 byte: 00000000 0ccccccc
// 2 byte: 00xx0bbb bbcccccc
// 3 byte: 00xxbbbb bbcccccc
uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
// Shift left and insert the top 4 bits, overwriting the garbage
// 1 byte: 00000000 0ccccccc
// 2 byte: 00000bbb bbcccccc
// 3 byte: aaaabbbb bbcccccc
uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
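// Worked example (illustrative): for U+20AC (the euro sign, UTF-8 E2 82 AC),
// the shuffled lane is 0x00E282AC, so lowperm = 0x82AC, middlebyte = 0x8200,
// ascii = 0x002C and highperm = 0x00E2. The shift-right-accumulate gives
// 0x002C + (0x8200 >> 2) = 0x20AC, and inserting the top 4 bits (0x2) leaves
// composed = 0x20AC, the expected UTF-16 code unit.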
// Byte swap if necessary
if (!match_system(big_endian)) {
composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
}
vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
utf16_output += 4; // We wrote 4 16-bit codepoints
return consumed;
} else if (idx < 209) {
// THREE (3) input code units
if (input_utf8_end_of_code_point_mask == 0x888) {
// We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
// UTF-16 surrogate pairs. Generating surrogate pairs is a little tricky,
// but it is easier when we can assume they are all pairs. This version does
// not use the LUT; 4-byte sequences are less common, and the overhead of the
// extra memory access is less important than the early branch overhead in
// shorter sequences.
// Swap byte pairs
// 10dddddd 10cccccc|10bbbbbb 11110aaa
// 10cccccc 10dddddd|11110aaa 10bbbbbb
uint8x16_t swap = vrev16q_u8(in);
// Shift left 2 bits
// cccccc00 dddddd00 xxxxxxxx bbbbbb00
uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
// Create a magic number containing the low 2 bits of the trail surrogate
// and all the corrections needed to create the pair.
// UTF-8 4b prefix   = -0x0000|0xF000
// surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
// surrogate high    = +0x0000|0xD800
// surrogate low     = +0xDC00|0x0000
// -------------------------------
//                   = +0xDC00|0xE7C0
uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
// Generate unadjusted trail surrogate minus lowest 2 bits
// xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
uint32x4_t trail =
vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
// Insert low 2 bits of trail surrogate to magic number for later
// 11011100 00000000 11100111 110000cc
uint16x8_t magic_with_low_2 =
vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
// Generate lead surrogate
// xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
uint32x4_t lead = vreinterpretq_u32_u16(
vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
// Mask out lead
// 000000cc ccdddddd|xxxxxxxx xxxxxxxx
lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
// Blend pairs
// 000000cc ccdddddd|11110aaa bbbbbb00
uint16x8_t blend = vreinterpretq_u16_u32(
vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
// Add magic number to finish the result
// 110111CC CCDDDDDD|110110AA BBBBBBCC
uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
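// Worked example (illustrative): for U+1F600 (F0 9F 98 80), blend holds
// 0x0200 (the low 10 bits of the code point) in its upper half and 0xF07C
// (11110aaa bbbbbb00) in its lower half, while magic_with_low_2 holds 0xDC00
// and 0xE7C1 (0xE7C0 plus the top two bits of cccccc). The addition yields
// 0xDE00 and 0xD83D, the correct surrogate pair D83D DE00 for U+1F600.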
// Byte swap if necessary
if (!match_system(big_endian)) {
composed =
vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
}
uint16_t buffer[8];
vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
for (int k = 0; k < 6; k++) {
utf16_output[k] = buffer[k];
} // the loop might compile to a couple of instructions.
utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
return 12; // We consumed 12 bytes.
}
// 3 1-4 byte sequences
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
simdutf::tables::utf8_to_utf16::shufutf8[idx]));
// 1 byte: 00000000 00000000 00000000 0ddddddd
// 2 byte: 00000000 00000000 110ccccc 10dddddd
// 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
// 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
// added to fix issue https://github.com/simdutf/simdutf/issues/514
// We only want to write 2 * 16-bit code units when that is actually what we
// have. Unfortunately, we cannot trust the input. So it is possible to get
// 0xff as an input byte and it should not result in a surrogate pair. We
// need to check for that.
uint32_t permbuffer[4];
vst1q_u32(permbuffer, perm);
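// For illustration, a stray 0xff byte shuffled into the lead position also
// has its top bit set, so the sign-bit test used for is_pair below cannot
// tell it apart from a real 4-byte lead; the scalar loop at the end therefore
// re-checks permbuffer and only emits a surrogate pair when the lead byte
// matches the 11110xxx pattern.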
// Mask the low and middle bytes
// 00000000 00000000 00000000 0ddddddd
uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
// Because the surrogates need more work, the high surrogate is computed
// first.
uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
// 00000000 00000000 00cccccc 00000000
uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
// Start assembling the sequence. Since the 4th byte is in the same position
// as it would be in a surrogate and there is no dependency, shift left
// instead of right.
// 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
// 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
// Top 16 bits contain the high ten bits of the surrogate pair before
// correction.
// 3 byte: 00000000 10bbbbcc|cccc0000 00000000
// 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o
// correction
uint32x4_t abc =
vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
// Combine the low 6 or 7 bits by a shift right accumulate
// 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
// 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
// correction
uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
// After this is for surrogates
// Blend the low and high surrogates
// 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
// Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
// yet as 0x10000 was not subtracted from the codepoint yet.
// 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
uint16x8_t masked_pair = vreinterpretq_u16_u32(
vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
// Correct the remaining UTF-8 prefix, surrogate offset, and add the
// surrogate prefixes in one magic 16-bit addition. This is a similar magic
// number, but without the continuation-byte adjustment and with the
// halfwords swapped.
// UTF-8 4b prefix   = -0xF000|0x0000
// surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
// surrogate high    = +0xD800|0x0000
// surrogate low     = +0x0000|0xDC00
// -----------------------------------
//                   = +0xE7C0|0xDC00
uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
// 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
uint32x4_t surrogates =
vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
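// Worked example (illustrative): for U+1F600 (F0 9F 98 80), masked_pair
// holds 0xF07D (11110aaa bbbbbbcc) in its upper half and 0x0200
// (000000cc ccdddddd) in its lower half; adding 0xE7C0 and 0xDC00 gives
// 0xD83D and 0xDE00, the surrogate pair for U+1F600.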
// If the high bit is 1 (s32 less than zero), this needs a surrogate pair
uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));
// Select either the 4 byte surrogate pair or the 2 byte solo codepoint
// 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
// 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
// Byte swap if necessary
if (!match_system(big_endian)) {
selected =
vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
}
// Attempting to shuffle and store would be complex, just scalarize.
uint32_t buffer[4];
vst1q_u32(buffer, selected);
// Test for the top bit of the surrogate mask. Removed due to issue 514.
// const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
// 0x00800000;
for (size_t i = 0; i < 3; i++) {
// Surrogate
// Used to be if (buffer[i] & SURROGATE_MASK) {
// See discussion above.
// patch for issue https://github.com/simdutf/simdutf/issues/514
if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
utf16_output[0] = uint16_t(buffer[i] >> 16);
utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
utf16_output += 2;
} else {
utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
utf16_output++;
}
}
return consumed;
} else {
// here we know that there is an error but we do not handle errors
return 12;
}
}