aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt5
-rw-r--r--contrib/DEPENDENCY_INFO.md2
-rw-r--r--contrib/backward-cpp/CMakeLists.txt2
-rw-r--r--contrib/doctest/CMakeLists.txt2
-rw-r--r--contrib/fastutf8/CMakeLists.txt11
-rw-r--r--contrib/fastutf8/LICENSE22
-rw-r--r--contrib/fastutf8/avx2.c314
-rw-r--r--contrib/fastutf8/fastutf8.c160
-rw-r--r--contrib/fastutf8/fastutf8.h65
-rw-r--r--contrib/fastutf8/sse41.c272
-rw-r--r--contrib/simdutf/CMakeLists.txt114
-rw-r--r--contrib/simdutf/LICENSE-APACHE201
-rw-r--r--contrib/simdutf/cmake/CPM.cmake1161
-rw-r--r--contrib/simdutf/cmake/JoinPaths.cmake23
-rw-r--r--contrib/simdutf/cmake/Toolchains/loongarch64-linux-gnu.cmake4
-rw-r--r--contrib/simdutf/cmake/Toolchains/riscv64-linux-gnu.cmake4
-rw-r--r--contrib/simdutf/cmake/add_cpp_test.cmake63
-rw-r--r--contrib/simdutf/cmake/simdutf-config.cmake.in2
-rw-r--r--contrib/simdutf/cmake/simdutf-flags.cmake25
-rw-r--r--contrib/simdutf/include/simdutf.h26
-rw-r--r--contrib/simdutf/include/simdutf/avx512.h79
-rw-r--r--contrib/simdutf/include/simdutf/common_defs.h151
-rw-r--r--contrib/simdutf/include/simdutf/compiler_check.h45
-rw-r--r--contrib/simdutf/include/simdutf/encoding_types.h43
-rw-r--r--contrib/simdutf/include/simdutf/error.h69
-rw-r--r--contrib/simdutf/include/simdutf/implementation.h3716
-rw-r--r--contrib/simdutf/include/simdutf/internal/isadetection.h324
-rw-r--r--contrib/simdutf/include/simdutf/portability.h262
-rw-r--r--contrib/simdutf/include/simdutf/simdutf_version.h26
-rw-r--r--contrib/simdutf/src/CMakeLists.txt46
-rw-r--r--contrib/simdutf/src/arm64/arm_base64.cpp501
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_latin1_to_utf16.cpp24
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_latin1_to_utf32.cpp24
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_latin1_to_utf8.cpp70
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf16_to_latin1.cpp63
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf16_to_utf32.cpp191
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf16_to_utf8.cpp587
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf32_to_latin1.cpp60
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf32_to_utf16.cpp151
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf32_to_utf8.cpp505
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf8_to_latin1.cpp69
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf8_to_utf16.cpp288
-rw-r--r--contrib/simdutf/src/arm64/arm_convert_utf8_to_utf32.cpp179
-rw-r--r--contrib/simdutf/src/arm64/arm_validate_utf16.cpp143
-rw-r--r--contrib/simdutf/src/arm64/arm_validate_utf32le.cpp65
-rw-r--r--contrib/simdutf/src/arm64/implementation.cpp1185
-rw-r--r--contrib/simdutf/src/encoding_types.cpp75
-rw-r--r--contrib/simdutf/src/error.cpp3
-rw-r--r--contrib/simdutf/src/fallback/implementation.cpp691
-rw-r--r--contrib/simdutf/src/generic/buf_block_reader.h109
-rw-r--r--contrib/simdutf/src/generic/utf16.h74
-rw-r--r--contrib/simdutf/src/generic/utf8.h40
-rw-r--r--contrib/simdutf/src/generic/utf8_to_latin1/utf8_to_latin1.h315
-rw-r--r--contrib/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h80
-rw-r--r--contrib/simdutf/src/generic/utf8_to_utf16/utf8_to_utf16.h334
-rw-r--r--contrib/simdutf/src/generic/utf8_to_utf16/valid_utf8_to_utf16.h76
-rw-r--r--contrib/simdutf/src/generic/utf8_to_utf32/utf8_to_utf32.h320
-rw-r--r--contrib/simdutf/src/generic/utf8_to_utf32/valid_utf8_to_utf32.h44
-rw-r--r--contrib/simdutf/src/generic/utf8_validation/utf8_lookup4_algorithm.h223
-rw-r--r--contrib/simdutf/src/generic/utf8_validation/utf8_validator.h138
-rw-r--r--contrib/simdutf/src/haswell/avx2_base64.cpp577
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp37
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp20
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp83
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp85
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp210
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp602
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp93
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp174
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp569
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp60
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp195
-rw-r--r--contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp135
-rw-r--r--contrib/simdutf/src/haswell/avx2_validate_utf16.cpp206
-rw-r--r--contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp70
-rw-r--r--contrib/simdutf/src/haswell/implementation.cpp1145
-rw-r--r--contrib/simdutf/src/icelake/icelake_ascii_validation.inl.cpp19
-rw-r--r--contrib/simdutf/src/icelake/icelake_base64.inl.cpp358
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp36
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf32.inl.cpp20
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf8.inl.cpp107
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf16_to_latin1.inl.cpp103
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf32.inl.cpp136
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp206
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf32_to_latin1.inl.cpp74
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf16.inl.cpp178
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf8.inl.cpp574
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_utf8_to_latin1.inl.cpp104
-rw-r--r--contrib/simdutf/src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp69
-rw-r--r--contrib/simdutf/src/icelake/icelake_from_utf8.inl.cpp338
-rw-r--r--contrib/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp136
-rw-r--r--contrib/simdutf/src/icelake/icelake_macros.inl.cpp143
-rw-r--r--contrib/simdutf/src/icelake/icelake_utf32_validation.inl.cpp35
-rw-r--r--contrib/simdutf/src/icelake/icelake_utf8_common.inl.cpp796
-rw-r--r--contrib/simdutf/src/icelake/icelake_utf8_validation.inl.cpp116
-rw-r--r--contrib/simdutf/src/icelake/implementation.cpp1650
-rw-r--r--contrib/simdutf/src/implementation.cpp1991
-rw-r--r--contrib/simdutf/src/lasx/implementation.cpp1298
-rw-r--r--contrib/simdutf/src/lasx/lasx_base64.cpp596
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf16.cpp76
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf32.cpp55
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf8.cpp65
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf16_to_latin1.cpp66
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf32.cpp195
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf8.cpp558
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf32_to_latin1.cpp73
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf16.cpp218
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf8.cpp589
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf8_to_latin1.cpp72
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf16.cpp293
-rw-r--r--contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf32.cpp193
-rw-r--r--contrib/simdutf/src/lasx/lasx_validate_utf16.cpp201
-rw-r--r--contrib/simdutf/src/lasx/lasx_validate_utf32le.cpp85
-rw-r--r--contrib/simdutf/src/lsx/implementation.cpp1178
-rw-r--r--contrib/simdutf/src/lsx/lsx_base64.cpp580
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf16.cpp39
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf32.cpp27
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf8.cpp56
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf16_to_latin1.cpp66
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf32.cpp139
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf8.cpp526
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf32_to_latin1.cpp66
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf16.cpp155
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf8.cpp459
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf8_to_latin1.cpp75
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf16.cpp288
-rw-r--r--contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf32.cpp182
-rw-r--r--contrib/simdutf/src/lsx/lsx_validate_utf16.cpp201
-rw-r--r--contrib/simdutf/src/lsx/lsx_validate_utf32le.cpp69
-rw-r--r--contrib/simdutf/src/ppc64/implementation.cpp510
-rw-r--r--contrib/simdutf/src/rvv/implementation.cpp280
-rw-r--r--contrib/simdutf/src/rvv/rvv_helpers.inl.cpp23
-rw-r--r--contrib/simdutf/src/rvv/rvv_latin1_to.inl.cpp66
-rw-r--r--contrib/simdutf/src/rvv/rvv_length_from.inl.cpp165
-rw-r--r--contrib/simdutf/src/rvv/rvv_utf16_to.inl.cpp393
-rw-r--r--contrib/simdutf/src/rvv/rvv_utf32_to.inl.cpp289
-rw-r--r--contrib/simdutf/src/rvv/rvv_utf8_to.inl.cpp430
-rw-r--r--contrib/simdutf/src/rvv/rvv_validate.inl.cpp228
-rw-r--r--contrib/simdutf/src/scalar/ascii.h67
-rw-r--r--contrib/simdutf/src/scalar/base64.h434
-rw-r--r--contrib/simdutf/src/scalar/latin1.h32
-rw-r--r--contrib/simdutf/src/scalar/latin1_to_utf16/latin1_to_utf16.h49
-rw-r--r--contrib/simdutf/src/scalar/latin1_to_utf32/latin1_to_utf32.h23
-rw-r--r--contrib/simdutf/src/scalar/latin1_to_utf8/latin1_to_utf8.h104
-rw-r--r--contrib/simdutf/src/scalar/utf16.h142
-rw-r--r--contrib/simdutf/src/scalar/utf16_to_latin1/utf16_to_latin1.h95
-rw-r--r--contrib/simdutf/src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h31
-rw-r--r--contrib/simdutf/src/scalar/utf16_to_utf32/utf16_to_utf32.h87
-rw-r--r--contrib/simdutf/src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h45
-rw-r--r--contrib/simdutf/src/scalar/utf16_to_utf8/utf16_to_utf8.h160
-rw-r--r--contrib/simdutf/src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h83
-rw-r--r--contrib/simdutf/src/scalar/utf32.h80
-rw-r--r--contrib/simdutf/src/scalar/utf32_to_latin1/utf32_to_latin1.h62
-rw-r--r--contrib/simdutf/src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h49
-rw-r--r--contrib/simdutf/src/scalar/utf32_to_utf16/utf32_to_utf16.h85
-rw-r--r--contrib/simdutf/src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h45
-rw-r--r--contrib/simdutf/src/scalar/utf32_to_utf8/utf32_to_utf8.h123
-rw-r--r--contrib/simdutf/src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h66
-rw-r--r--contrib/simdutf/src/scalar/utf8.h295
-rw-r--r--contrib/simdutf/src/scalar/utf8_to_latin1/utf8_to_latin1.h207
-rw-r--r--contrib/simdutf/src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h78
-rw-r--r--contrib/simdutf/src/scalar/utf8_to_utf16/utf8_to_utf16.h326
-rw-r--r--contrib/simdutf/src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h98
-rw-r--r--contrib/simdutf/src/scalar/utf8_to_utf32/utf8_to_utf32.h282
-rw-r--r--contrib/simdutf/src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h75
-rw-r--r--contrib/simdutf/src/simdutf.cpp82
-rw-r--r--contrib/simdutf/src/simdutf/arm64.h43
-rw-r--r--contrib/simdutf/src/simdutf/arm64/begin.h1
-rw-r--r--contrib/simdutf/src/simdutf/arm64/bitmanipulation.h31
-rw-r--r--contrib/simdutf/src/simdutf/arm64/end.h1
-rw-r--r--contrib/simdutf/src/simdutf/arm64/implementation.h221
-rw-r--r--contrib/simdutf/src/simdutf/arm64/intrinsics.h10
-rw-r--r--contrib/simdutf/src/simdutf/arm64/simd.h725
-rw-r--r--contrib/simdutf/src/simdutf/arm64/simd16-inl.h407
-rw-r--r--contrib/simdutf/src/simdutf/fallback.h42
-rw-r--r--contrib/simdutf/src/simdutf/fallback/begin.h1
-rw-r--r--contrib/simdutf/src/simdutf/fallback/bitmanipulation.h13
-rw-r--r--contrib/simdutf/src/simdutf/fallback/end.h1
-rw-r--r--contrib/simdutf/src/simdutf/fallback/implementation.h217
-rw-r--r--contrib/simdutf/src/simdutf/haswell.h63
-rw-r--r--contrib/simdutf/src/simdutf/haswell/begin.h14
-rw-r--r--contrib/simdutf/src/simdutf/haswell/bitmanipulation.h33
-rw-r--r--contrib/simdutf/src/simdutf/haswell/end.h12
-rw-r--r--contrib/simdutf/src/simdutf/haswell/implementation.h226
-rw-r--r--contrib/simdutf/src/simdutf/haswell/intrinsics.h62
-rw-r--r--contrib/simdutf/src/simdutf/haswell/simd.h502
-rw-r--r--contrib/simdutf/src/simdutf/haswell/simd16-inl.h355
-rw-r--r--contrib/simdutf/src/simdutf/icelake.h71
-rw-r--r--contrib/simdutf/src/simdutf/icelake/begin.h14
-rw-r--r--contrib/simdutf/src/simdutf/icelake/bitmanipulation.h33
-rw-r--r--contrib/simdutf/src/simdutf/icelake/end.h12
-rw-r--r--contrib/simdutf/src/simdutf/icelake/implementation.h229
-rw-r--r--contrib/simdutf/src/simdutf/icelake/intrinsics.h138
-rw-r--r--contrib/simdutf/src/simdutf/lasx.h44
-rw-r--r--contrib/simdutf/src/simdutf/lasx/begin.h1
-rw-r--r--contrib/simdutf/src/simdutf/lasx/bitmanipulation.h25
-rw-r--r--contrib/simdutf/src/simdutf/lasx/end.h1
-rw-r--r--contrib/simdutf/src/simdutf/lasx/implementation.h230
-rw-r--r--contrib/simdutf/src/simdutf/lasx/intrinsics.h101
-rw-r--r--contrib/simdutf/src/simdutf/lasx/simd.h707
-rw-r--r--contrib/simdutf/src/simdutf/lasx/simd16-inl.h348
-rw-r--r--contrib/simdutf/src/simdutf/lsx.h44
-rw-r--r--contrib/simdutf/src/simdutf/lsx/begin.h1
-rw-r--r--contrib/simdutf/src/simdutf/lsx/bitmanipulation.h25
-rw-r--r--contrib/simdutf/src/simdutf/lsx/end.h1
-rw-r--r--contrib/simdutf/src/simdutf/lsx/implementation.h229
-rw-r--r--contrib/simdutf/src/simdutf/lsx/intrinsics.h10
-rw-r--r--contrib/simdutf/src/simdutf/lsx/simd.h600
-rw-r--r--contrib/simdutf/src/simdutf/lsx/simd16-inl.h378
-rw-r--r--contrib/simdutf/src/simdutf/ppc64.h40
-rw-r--r--contrib/simdutf/src/simdutf/ppc64/begin.h1
-rw-r--r--contrib/simdutf/src/simdutf/ppc64/bitmanipulation.h23
-rw-r--r--contrib/simdutf/src/simdutf/ppc64/end.h1
-rw-r--r--contrib/simdutf/src/simdutf/ppc64/implementation.h168
-rw-r--r--contrib/simdutf/src/simdutf/ppc64/intrinsics.h19
-rw-r--r--contrib/simdutf/src/simdutf/ppc64/simd.h479
-rw-r--r--contrib/simdutf/src/simdutf/rvv.h41
-rw-r--r--contrib/simdutf/src/simdutf/rvv/begin.h7
-rw-r--r--contrib/simdutf/src/simdutf/rvv/end.h7
-rw-r--r--contrib/simdutf/src/simdutf/rvv/implementation.h234
-rw-r--r--contrib/simdutf/src/simdutf/rvv/intrinsics.h131
-rw-r--r--contrib/simdutf/src/simdutf/westmere.h59
-rw-r--r--contrib/simdutf/src/simdutf/westmere/begin.h7
-rw-r--r--contrib/simdutf/src/simdutf/westmere/bitmanipulation.h35
-rw-r--r--contrib/simdutf/src/simdutf/westmere/end.h7
-rw-r--r--contrib/simdutf/src/simdutf/westmere/implementation.h222
-rw-r--r--contrib/simdutf/src/simdutf/westmere/intrinsics.h38
-rw-r--r--contrib/simdutf/src/simdutf/westmere/simd.h593
-rw-r--r--contrib/simdutf/src/simdutf/westmere/simd16-inl.h358
-rw-r--r--contrib/simdutf/src/tables/base64_tables.h688
-rw-r--r--contrib/simdutf/src/tables/utf16_to_utf8_tables.h768
-rw-r--r--contrib/simdutf/src/tables/utf8_to_utf16_tables.h826
-rw-r--r--contrib/simdutf/src/westmere/implementation.cpp1142
-rw-r--r--contrib/simdutf/src/westmere/internal/loader.cpp7
-rw-r--r--contrib/simdutf/src/westmere/internal/write_v_u16_11bits_to_utf8.cpp66
-rw-r--r--contrib/simdutf/src/westmere/sse_base64.cpp591
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_latin1_to_utf16.cpp21
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_latin1_to_utf32.cpp31
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_latin1_to_utf8.cpp71
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf16_to_latin1.cpp72
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf16_to_utf32.cpp206
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf16_to_utf8.cpp504
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf32_to_latin1.cpp82
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf32_to_utf16.cpp170
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf32_to_utf8.cpp590
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf8_to_latin1.cpp58
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf8_to_utf16.cpp197
-rw-r--r--contrib/simdutf/src/westmere/sse_convert_utf8_to_utf32.cpp141
-rw-r--r--contrib/simdutf/src/westmere/sse_validate_utf16.cpp211
-rw-r--r--contrib/simdutf/src/westmere/sse_validate_utf32le.cpp69
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/libmime/mime_encoding.c2
-rw-r--r--src/libmime/mime_parser.c2
-rw-r--r--src/libmime/mime_string.hxx8
-rw-r--r--src/libmime/scan_result.c2
-rw-r--r--src/libserver/cfg_utils.cxx10
-rw-r--r--src/libserver/maps/map_helpers.c4
-rw-r--r--src/libserver/protocol.c2
-rw-r--r--src/libserver/re_cache.c2
-rw-r--r--src/libutil/CMakeLists.txt1
-rw-r--r--src/libutil/cxx/rspamd-simdutf.cxx41
-rw-r--r--src/libutil/fstring.c2
-rw-r--r--src/libutil/regexp.c2
-rw-r--r--src/libutil/rspamd_simdutf.h34
-rw-r--r--src/libutil/str_util.c2
-rw-r--r--src/lua/lua_text.c2
-rw-r--r--src/lua/lua_util.c2
267 files changed, 54192 insertions, 875 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21ad6241e..6b8a80e8e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
#
############################# INITIAL SECTION #############################################
-CMAKE_MINIMUM_REQUIRED(VERSION 3.12 FATAL_ERROR)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.15 FATAL_ERROR)
SET(RSPAMD_VERSION_MAJOR 3)
SET(RSPAMD_VERSION_MINOR 11)
@@ -621,6 +621,8 @@ ADD_SUBDIRECTORY(contrib/http-parser)
ADD_SUBDIRECTORY(contrib/fpconv)
ADD_SUBDIRECTORY(contrib/lc-btrie)
ADD_SUBDIRECTORY(contrib/libottery)
+ADD_SUBDIRECTORY(contrib/simdutf)
+INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/contrib/simdutf/include")
IF (SYSTEM_ZSTD MATCHES "OFF")
ADD_SUBDIRECTORY(contrib/zstd)
ELSE ()
@@ -639,7 +641,6 @@ ADD_SUBDIRECTORY(contrib/lua-lpeg)
ADD_SUBDIRECTORY(contrib/t1ha)
ADD_SUBDIRECTORY(contrib/libev)
ADD_SUBDIRECTORY(contrib/kann)
-ADD_SUBDIRECTORY(contrib/fastutf8)
ADD_SUBDIRECTORY(contrib/google-ced)
IF (ENABLE_BACKWARD MATCHES "ON")
ADD_SUBDIRECTORY(contrib/backward-cpp)
diff --git a/contrib/DEPENDENCY_INFO.md b/contrib/DEPENDENCY_INFO.md
index 86598129c..fc41c07a3 100644
--- a/contrib/DEPENDENCY_INFO.md
+++ b/contrib/DEPENDENCY_INFO.md
@@ -29,7 +29,7 @@
| google-ced | 37529e6 | Apache 2 | YES | build fixes |
| kann | ? | MIT | YES | blas/lapack changes |
| fpconv | ? | Boost | YES | many changes |
-| fastutf8 | ? | MIT | YES | many changes |
+| simdutf | ef7d39c | Apache 2 | NO | build system only |
| expected | v1.0 | Public Domain / CC0 | NO | |
| frozen | 1.0.1 | Apache 2 | NO | |
| fmt | 11.0.0 | MIT | NO | |
diff --git a/contrib/backward-cpp/CMakeLists.txt b/contrib/backward-cpp/CMakeLists.txt
index 038c50516..9aa7bb13d 100644
--- a/contrib/backward-cpp/CMakeLists.txt
+++ b/contrib/backward-cpp/CMakeLists.txt
@@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.15)
project(backward CXX)
# Introduce variables:
diff --git a/contrib/doctest/CMakeLists.txt b/contrib/doctest/CMakeLists.txt
index c6b3f48ee..f85d74728 100644
--- a/contrib/doctest/CMakeLists.txt
+++ b/contrib/doctest/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.15)
if(POLICY CMP0077)
cmake_policy(SET CMP0077 NEW)
diff --git a/contrib/fastutf8/CMakeLists.txt b/contrib/fastutf8/CMakeLists.txt
deleted file mode 100644
index 2a98ed815..000000000
--- a/contrib/fastutf8/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-SET(UTFSRC ${CMAKE_CURRENT_SOURCE_DIR}/fastutf8.c)
-IF(HAVE_AVX2 AND "${ARCH}" STREQUAL "x86_64")
- SET(UTFSRC ${UTFSRC} ${CMAKE_CURRENT_SOURCE_DIR}/avx2.c)
- MESSAGE(STATUS "UTF8: AVX2 support is added")
-ENDIF()
-IF(HAVE_SSE41 AND "${ARCH}" STREQUAL "x86_64")
- SET(UTFSRC ${UTFSRC} ${CMAKE_CURRENT_SOURCE_DIR}/sse41.c)
- MESSAGE(STATUS "UTF8: SSE41 support is added")
-ENDIF()
-
-ADD_LIBRARY(rspamd-fastutf8 STATIC ${UTFSRC}) \ No newline at end of file
diff --git a/contrib/fastutf8/LICENSE b/contrib/fastutf8/LICENSE
deleted file mode 100644
index 9b5471be2..000000000
--- a/contrib/fastutf8/LICENSE
+++ /dev/null
@@ -1,22 +0,0 @@
-MIT License
-
-Copyright (c) 2019 Yibo Cai
-Copyright (c) 2019 Vsevolod Stakhov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE. \ No newline at end of file
diff --git a/contrib/fastutf8/avx2.c b/contrib/fastutf8/avx2.c
deleted file mode 100644
index 765c62fdb..000000000
--- a/contrib/fastutf8/avx2.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2019 Yibo Cai
- * Copyright (c) 2019 Vsevolod Stakhov
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "config.h"
-#include "fastutf8.h"
-#include "platform_config.h"
-
-
-#ifndef __clang__
-#pragma GCC push_options
-#pragma GCC target("avx2")
-#endif
-
-#ifndef __SSE2__
-#define __SSE2__
-#endif
-#ifndef __SSE__
-#define __SSE__
-#endif
-#ifndef __SSE4_2__
-#define __SSE4_2__
-#endif
-#ifndef __SSE4_1__
-#define __SSE4_1__
-#endif
-#ifndef __SSEE3__
-#define __SSEE3__
-#endif
-#ifndef __AVX__
-#define __AVX__
-#endif
-#ifndef __AVX2__
-#define __AVX2__
-#endif
-
-#include <immintrin.h>
-
-/*
- * Map high nibble of "First Byte" to legal character length minus 1
- * 0x00 ~ 0xBF --> 0
- * 0xC0 ~ 0xDF --> 1
- * 0xE0 ~ 0xEF --> 2
- * 0xF0 ~ 0xFF --> 3
- */
-static const int8_t _first_len_tbl[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
-};
-
-/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
-static const int8_t _first_range_tbl[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
-};
-
-/*
- * Range table, map range index to min and max values
- * Index 0 : 00 ~ 7F (First Byte, ascii)
- * Index 1,2,3: 80 ~ BF (Second, Third, Fourth Byte)
- * Index 4 : A0 ~ BF (Second Byte after E0)
- * Index 5 : 80 ~ 9F (Second Byte after ED)
- * Index 6 : 90 ~ BF (Second Byte after F0)
- * Index 7 : 80 ~ 8F (Second Byte after F4)
- * Index 8 : C2 ~ F4 (First Byte, non ascii)
- * Index 9~15 : illegal: i >= 127 && i <= -128
- */
-static const int8_t _range_min_tbl[] = {
- 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
- 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
- 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
- 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
-};
-static const int8_t _range_max_tbl[] = {
- 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
- 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
- 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-};
-
-/*
- * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
- * which the Second Byte are not 80~BF. It contains "range index adjustment".
- * +------------+---------------+------------------+----------------+
- * | First Byte | original range| range adjustment | adjusted range |
- * +------------+---------------+------------------+----------------+
- * | E0 | 2 | 2 | 4 |
- * +------------+---------------+------------------+----------------+
- * | ED | 2 | 3 | 5 |
- * +------------+---------------+------------------+----------------+
- * | F0 | 3 | 3 | 6 |
- * +------------+---------------+------------------+----------------+
- * | F4 | 4 | 4 | 8 |
- * +------------+---------------+------------------+----------------+
- */
-/* index1 -> E0, index14 -> ED */
-static const int8_t _df_ee_tbl[] = {
- 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
- 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
-};
-/* index1 -> F0, index5 -> F4 */
-static const int8_t _ef_fe_tbl[] = {
- 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b)
- __attribute__((__target__("avx2")));
-static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b)
-{
- return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15);
-}
-
-static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b)
- __attribute__((__target__("avx2")));
-static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b)
-{
- return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14);
-}
-
-static inline __m256i push_last_3bytes_of_a_to_b(__m256i a, __m256i b)
- __attribute__((__target__("avx2")));
-static inline __m256i push_last_3bytes_of_a_to_b(__m256i a, __m256i b)
-{
- return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 13);
-}
-
-off_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len)
- __attribute__((__target__("avx2")));
-
-/* 5x faster than naive method */
-/* Return 0 - success, -1 - error, >0 - first error char(if RET_ERR_IDX = 1) */
-off_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len)
-{
- off_t err_pos = 1;
-
- if (len >= 32) {
- __m256i prev_input = _mm256_set1_epi8 (0);
- __m256i prev_first_len = _mm256_set1_epi8 (0);
-
- /* Cached tables */
- const __m256i first_len_tbl =
- _mm256_lddqu_si256 ((const __m256i *) _first_len_tbl);
- const __m256i first_range_tbl =
- _mm256_lddqu_si256 ((const __m256i *) _first_range_tbl);
- const __m256i range_min_tbl =
- _mm256_lddqu_si256 ((const __m256i *) _range_min_tbl);
- const __m256i range_max_tbl =
- _mm256_lddqu_si256 ((const __m256i *) _range_max_tbl);
- const __m256i df_ee_tbl =
- _mm256_lddqu_si256 ((const __m256i *) _df_ee_tbl);
- const __m256i ef_fe_tbl =
- _mm256_lddqu_si256 ((const __m256i *) _ef_fe_tbl);
-
- __m256i error = _mm256_set1_epi8 (0);
-
- while (len >= 32) {
- const __m256i input = _mm256_lddqu_si256 ((const __m256i *) data);
-
- /* high_nibbles = input >> 4 */
- const __m256i high_nibbles =
- _mm256_and_si256 (_mm256_srli_epi16 (input, 4), _mm256_set1_epi8 (0x0F));
-
- /* first_len = legal character length minus 1 */
- /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
- /* first_len = first_len_tbl[high_nibbles] */
- __m256i first_len = _mm256_shuffle_epi8 (first_len_tbl, high_nibbles);
-
- /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
- /* range = first_range_tbl[high_nibbles] */
- __m256i range = _mm256_shuffle_epi8 (first_range_tbl, high_nibbles);
-
- /* Second Byte: set range index to first_len */
- /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
- /* range |= (first_len, prev_first_len) << 1 byte */
- range = _mm256_or_si256 (
- range, push_last_byte_of_a_to_b (prev_first_len, first_len));
-
- /* Third Byte: set range index to saturate_sub(first_len, 1) */
- /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
- __m256i tmp1, tmp2;
-
- /* tmp1 = saturate_sub(first_len, 1) */
- tmp1 = _mm256_subs_epu8 (first_len, _mm256_set1_epi8 (1));
- /* tmp2 = saturate_sub(prev_first_len, 1) */
- tmp2 = _mm256_subs_epu8 (prev_first_len, _mm256_set1_epi8 (1));
-
- /* range |= (tmp1, tmp2) << 2 bytes */
- range = _mm256_or_si256 (range, push_last_2bytes_of_a_to_b (tmp2, tmp1));
-
- /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
- /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
- /* tmp1 = saturate_sub(first_len, 2) */
- tmp1 = _mm256_subs_epu8 (first_len, _mm256_set1_epi8 (2));
- /* tmp2 = saturate_sub(prev_first_len, 2) */
- tmp2 = _mm256_subs_epu8 (prev_first_len, _mm256_set1_epi8 (2));
- /* range |= (tmp1, tmp2) << 3 bytes */
- range = _mm256_or_si256 (range, push_last_3bytes_of_a_to_b (tmp2, tmp1));
-
- /*
- * Now we have below range indices caluclated
- * Correct cases:
- * - 8 for C0~FF
- * - 3 for 1st byte after F0~FF
- * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
- * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
- * 3rd byte after F0~FF
- * - 0 for others
- * Error cases:
- * 9,10,11 if non ascii First Byte overlaps
- * E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
- */
-
- /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
- /* Overlaps lead to index 9~15, which are illegal in range table */
- __m256i shift1, pos, range2;
- /* shift1 = (input, prev_input) << 1 byte */
- shift1 = push_last_byte_of_a_to_b (prev_input, input);
- pos = _mm256_sub_epi8 (shift1, _mm256_set1_epi8 (0xEF));
- /*
- * shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE |
- * pos: | 0 1 15 | 16 17 239| 240 241 255|
- * pos-240: | 0 0 0 | 0 0 0 | 0 1 15 |
- * pos+112: | 112 113 127| >= 128 | >= 128 |
- */
- tmp1 = _mm256_subs_epu8 (pos, _mm256_set1_epi8 ((char)240));
- range2 = _mm256_shuffle_epi8 (df_ee_tbl, tmp1);
- tmp2 = _mm256_adds_epu8 (pos, _mm256_set1_epi8 (112));
- range2 = _mm256_add_epi8 (range2, _mm256_shuffle_epi8 (ef_fe_tbl, tmp2));
-
- range = _mm256_add_epi8 (range, range2);
-
- /* Load min and max values per calculated range index */
- __m256i minv = _mm256_shuffle_epi8 (range_min_tbl, range);
- __m256i maxv = _mm256_shuffle_epi8 (range_max_tbl, range);
-
- /* Check value range */
- error = _mm256_cmpgt_epi8(minv, input);
- error = _mm256_or_si256(error, _mm256_cmpgt_epi8(input, maxv));
- /* 5% performance drop from this conditional branch */
- if (!_mm256_testz_si256(error, error)) {
- break;
- }
-
- prev_input = input;
- prev_first_len = first_len;
-
- data += 32;
- len -= 32;
- err_pos += 32;
- }
-
- /* Error in first 16 bytes */
- if (err_pos == 1) {
- goto do_naive;
- }
-
- /* Find previous token (not 80~BF) */
- int32_t token4 = _mm256_extract_epi32 (prev_input, 7);
- const int8_t *token = (const int8_t *) &token4;
- int lookahead = 0;
-
- if (token[3] > (int8_t) 0xBF) {
- lookahead = 1;
- }
- else if (token[2] > (int8_t) 0xBF) {
- lookahead = 2;
- }
- else if (token[1] > (int8_t) 0xBF) {
- lookahead = 3;
- }
-
- data -= lookahead;
- len += lookahead;
- err_pos -= lookahead;
- }
-
- /* Check remaining bytes with naive method */
-do_naive:
- if (len > 0) {
- off_t err_pos2 = rspamd_fast_utf8_validate_ref (data, len);
-
- if (err_pos2) {
- return err_pos + err_pos2 - 1;
- }
- }
-
- return 0;
-}
-
-#ifndef __clang__
-#pragma GCC pop_options
-#endif
-
diff --git a/contrib/fastutf8/fastutf8.c b/contrib/fastutf8/fastutf8.c
deleted file mode 100644
index 89becaf0a..000000000
--- a/contrib/fastutf8/fastutf8.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2019 Yibo Cai
- * Copyright (c) 2019 Vsevolod Stakhov
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "fastutf8.h"
-#include "libcryptobox/platform_config.h"
-
-
-/*
- * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
- *
- * Table 3-7. Well-Formed UTF-8 Byte Sequences
- *
- * +--------------------+------------+-------------+------------+-------------+
- * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+0000..U+007F | 00..7F | | | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+0080..U+07FF | C2..DF | 80..BF | | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
- * +--------------------+------------+-------------+------------+-------------+
- * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
- * +--------------------+------------+-------------+------------+-------------+
- */
-
-/* Return 0 - success, >0 - index (1 based) of first error char */
-off_t
-rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len)
-{
- off_t err_pos = 1;
-
- while (len) {
- int bytes;
- const unsigned char byte1 = data[0];
-
- /* 00..7F */
- if (byte1 <= 0x7F) {
- bytes = 1;
- /* C2..DF, 80..BF */
- }
- else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
- (signed char) data[1] <= (signed char) 0xBF) {
- bytes = 2;
- }
- else if (len >= 3) {
- const unsigned char byte2 = data[1];
-
- /* Is byte2, byte3 between 0x80 ~ 0xBF */
- const int byte2_ok = (signed char) byte2 <= (signed char) 0xBF;
- const int byte3_ok = (signed char) data[2] <= (signed char) 0xBF;
-
- if (byte2_ok && byte3_ok &&
- /* E0, A0..BF, 80..BF */
- ((byte1 == 0xE0 && byte2 >= 0xA0) ||
- /* E1..EC, 80..BF, 80..BF */
- (byte1 >= 0xE1 && byte1 <= 0xEC) ||
- /* ED, 80..9F, 80..BF */
- (byte1 == 0xED && byte2 <= 0x9F) ||
- /* EE..EF, 80..BF, 80..BF */
- (byte1 >= 0xEE && byte1 <= 0xEF))) {
- bytes = 3;
- }
- else if (len >= 4) {
- /* Is byte4 between 0x80 ~ 0xBF */
- const int byte4_ok = (signed char) data[3] <= (signed char) 0xBF;
-
- if (byte2_ok && byte3_ok && byte4_ok &&
- /* F0, 90..BF, 80..BF, 80..BF */
- ((byte1 == 0xF0 && byte2 >= 0x90) ||
- /* F1..F3, 80..BF, 80..BF, 80..BF */
- (byte1 >= 0xF1 && byte1 <= 0xF3) ||
- /* F4, 80..8F, 80..BF, 80..BF */
- (byte1 == 0xF4 && byte2 <= 0x8F))) {
- bytes = 4;
- }
- else {
- return err_pos;
- }
- }
- else {
- return err_pos;
- }
- }
- else {
- return err_pos;
- }
-
- len -= bytes;
- err_pos += bytes;
- data += bytes;
- }
-
- return 0;
-}
-
-/* Prototypes */
-#if defined(HAVE_SSE41) && defined(__x86_64__)
-extern off_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
-#endif
-#if defined(HAVE_AVX2) && defined(__x86_64__)
-extern off_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
-#endif
-
-static off_t (*validate_func) (const unsigned char *data, size_t len) =
- rspamd_fast_utf8_validate_ref;
-
-
-void
-rspamd_fast_utf8_library_init (unsigned flags)
-{
-#if defined(HAVE_SSE41) && defined(__x86_64__)
- if (flags & RSPAMD_FAST_UTF8_FLAG_SSE41) {
- validate_func = rspamd_fast_utf8_validate_sse41;
- }
-#endif
-#if defined(HAVE_AVX2) && defined(__x86_64__)
- if (flags & RSPAMD_FAST_UTF8_FLAG_AVX2) {
- validate_func = rspamd_fast_utf8_validate_avx2;
- }
-#endif
-}
-
-off_t
-rspamd_fast_utf8_validate (const unsigned char *data, size_t len)
-{
- return len >= 64 ?
- validate_func (data, len) :
- rspamd_fast_utf8_validate_ref (data, len);
-} \ No newline at end of file
diff --git a/contrib/fastutf8/fastutf8.h b/contrib/fastutf8/fastutf8.h
deleted file mode 100644
index a1e9cbf03..000000000
--- a/contrib/fastutf8/fastutf8.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2019 Yibo Cai
- * Copyright (c) 2019 Vsevolod Stakhov
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef RSPAMD_FASTUTF8_H
-#define RSPAMD_FASTUTF8_H
-
-#include <sys/types.h>
-#include <stdbool.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-enum rspamd_fast_utf8_cpu_flags {
- RSPAMD_FAST_UTF8_FLAG_SSE41 = 1u << 0u,
- RSPAMD_FAST_UTF8_FLAG_AVX2 = 1u << 1u,
-};
-
-/**
- * Called to init codecs
- * @param flags
- */
-void rspamd_fast_utf8_library_init(unsigned flags);
-
-/**
- * Called to validate input using fast codec
- * @param data
- * @param len
- * @return
- */
-off_t rspamd_fast_utf8_validate(const unsigned char *data, size_t len);
-
-/**
- * Use plain C implementation
- * @param data
- * @param len
- * @return
- */
-off_t rspamd_fast_utf8_validate_ref(const unsigned char *data, size_t len);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/contrib/fastutf8/sse41.c b/contrib/fastutf8/sse41.c
deleted file mode 100644
index df338cf27..000000000
--- a/contrib/fastutf8/sse41.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2019 Yibo Cai
- * Copyright (c) 2019 Vsevolod Stakhov
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "config.h"
-#include "fastutf8.h"
-#include "platform_config.h"
-
-#ifndef __clang__
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-#endif
-
-#ifndef __SSE2__
-#define __SSE2__
-#endif
-#ifndef __SSE__
-#define __SSE__
-#endif
-#ifndef __SSEE3__
-#define __SSEE3__
-#endif
-#ifndef __SSE4_1__
-#define __SSE4_1__
-#endif
-
-#include <smmintrin.h>
-
-/*
- * Map high nibble of "First Byte" to legal character length minus 1
- * 0x00 ~ 0xBF --> 0
- * 0xC0 ~ 0xDF --> 1
- * 0xE0 ~ 0xEF --> 2
- * 0xF0 ~ 0xFF --> 3
- */
-static const int8_t _first_len_tbl[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
-};
-
-/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
-static const int8_t _first_range_tbl[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
-};
-
-/*
- * Range table, map range index to min and max values
- * Index 0 : 00 ~ 7F (First Byte, ascii)
- * Index 1,2,3: 80 ~ BF (Second, Third, Fourth Byte)
- * Index 4 : A0 ~ BF (Second Byte after E0)
- * Index 5 : 80 ~ 9F (Second Byte after ED)
- * Index 6 : 90 ~ BF (Second Byte after F0)
- * Index 7 : 80 ~ 8F (Second Byte after F4)
- * Index 8 : C2 ~ F4 (First Byte, non ascii)
- * Index 9~15 : illegal: i >= 127 && i <= -128
- */
-static const int8_t _range_min_tbl[] = {
- 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
- 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
-};
-static const int8_t _range_max_tbl[] = {
- 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
- 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-};
-
-/*
- * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
- * which the Second Byte are not 80~BF. It contains "range index adjustment".
- * +------------+---------------+------------------+----------------+
- * | First Byte | original range| range adjustment | adjusted range |
- * +------------+---------------+------------------+----------------+
- * | E0 | 2 | 2 | 4 |
- * +------------+---------------+------------------+----------------+
- * | ED | 2 | 3 | 5 |
- * +------------+---------------+------------------+----------------+
- * | F0 | 3 | 3 | 6 |
- * +------------+---------------+------------------+----------------+
- * | F4 | 3 | 4 | 7 |
- * +------------+---------------+------------------+----------------+
- */
-/* index1 -> E0, index14 -> ED */
-static const int8_t _df_ee_tbl[] = {
- 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
-};
-/* index1 -> F0, index5 -> F4 */
-static const int8_t _ef_fe_tbl[] = {
- 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-off_t
-rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len)
- __attribute__((__target__("sse4.1")));
-
-/* Return 0 - success, >0 - index (1 based) of first error char */
-off_t
-rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len)
-{
- off_t err_pos = 1;
-
- if (len >= 16) {
- __m128i prev_input = _mm_set1_epi8 (0);
- __m128i prev_first_len = _mm_set1_epi8 (0);
-
- /* Cached tables */
- const __m128i first_len_tbl =
- _mm_lddqu_si128 ((const __m128i *) _first_len_tbl);
- const __m128i first_range_tbl =
- _mm_lddqu_si128 ((const __m128i *) _first_range_tbl);
- const __m128i range_min_tbl =
- _mm_lddqu_si128 ((const __m128i *) _range_min_tbl);
- const __m128i range_max_tbl =
- _mm_lddqu_si128 ((const __m128i *) _range_max_tbl);
- const __m128i df_ee_tbl =
- _mm_lddqu_si128 ((const __m128i *) _df_ee_tbl);
- const __m128i ef_fe_tbl =
- _mm_lddqu_si128 ((const __m128i *) _ef_fe_tbl);
-
- __m128i error = _mm_set1_epi8 (0);
-
- while (len >= 16) {
- const __m128i input = _mm_lddqu_si128 ((const __m128i *) data);
-
- /* high_nibbles = input >> 4 */
- const __m128i high_nibbles =
- _mm_and_si128 (_mm_srli_epi16 (input, 4), _mm_set1_epi8 (0x0F));
-
- /* first_len = legal character length minus 1 */
- /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
- /* first_len = first_len_tbl[high_nibbles] */
- __m128i first_len = _mm_shuffle_epi8 (first_len_tbl, high_nibbles);
-
- /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
- /* range = first_range_tbl[high_nibbles] */
- __m128i range = _mm_shuffle_epi8 (first_range_tbl, high_nibbles);
-
- /* Second Byte: set range index to first_len */
- /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
- /* range |= (first_len, prev_first_len) << 1 byte */
- range = _mm_or_si128 (
- range, _mm_alignr_epi8(first_len, prev_first_len, 15));
-
- /* Third Byte: set range index to saturate_sub(first_len, 1) */
- /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
- __m128i tmp1, tmp2;
- /* tmp1 = saturate_sub(first_len, 1) */
- tmp1 = _mm_subs_epu8 (first_len, _mm_set1_epi8 (1));
- /* tmp2 = saturate_sub(prev_first_len, 1) */
- tmp2 = _mm_subs_epu8 (prev_first_len, _mm_set1_epi8 (1));
- /* range |= (tmp1, tmp2) << 2 bytes */
- range = _mm_or_si128 (range, _mm_alignr_epi8(tmp1, tmp2, 14));
-
- /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
- /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
- /* tmp1 = saturate_sub(first_len, 2) */
- tmp1 = _mm_subs_epu8 (first_len, _mm_set1_epi8 (2));
- /* tmp2 = saturate_sub(prev_first_len, 2) */
- tmp2 = _mm_subs_epu8 (prev_first_len, _mm_set1_epi8 (2));
- /* range |= (tmp1, tmp2) << 3 bytes */
- range = _mm_or_si128 (range, _mm_alignr_epi8(tmp1, tmp2, 13));
-
- /*
- * Now we have below range indices calculated
- * Correct cases:
- * - 8 for C0~FF
- * - 3 for 1st byte after F0~FF
- * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
- * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
- * 3rd byte after F0~FF
- * - 0 for others
- * Error cases:
- * 9,10,11 if non ascii First Byte overlaps
- * E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
- */
-
- /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
- /* Overlaps lead to index 9~15, which are illegal in range table */
- __m128i shift1, pos, range2;
- /* shift1 = (input, prev_input) << 1 byte */
- shift1 = _mm_alignr_epi8(input, prev_input, 15);
- pos = _mm_sub_epi8 (shift1, _mm_set1_epi8 (0xEF));
- /*
- * shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE |
- * pos: | 0 1 15 | 16 17 239| 240 241 255|
- * pos-240: | 0 0 0 | 0 0 0 | 0 1 15 |
- * pos+112: | 112 113 127| >= 128 | >= 128 |
- */
- tmp1 = _mm_subs_epu8 (pos, _mm_set1_epi8 ((char)240));
- range2 = _mm_shuffle_epi8 (df_ee_tbl, tmp1);
- tmp2 = _mm_adds_epu8 (pos, _mm_set1_epi8 (112));
- range2 = _mm_add_epi8 (range2, _mm_shuffle_epi8 (ef_fe_tbl, tmp2));
-
- range = _mm_add_epi8 (range, range2);
-
- /* Load min and max values per calculated range index */
- __m128i minv = _mm_shuffle_epi8 (range_min_tbl, range);
- __m128i maxv = _mm_shuffle_epi8 (range_max_tbl, range);
-
- /* Check value range */
- error = _mm_cmplt_epi8(input, minv);
- error = _mm_or_si128(error, _mm_cmpgt_epi8(input, maxv));
- /* 5% performance drop from this conditional branch */
- if (!_mm_testz_si128(error, error)) {
- break;
- }
-
- prev_input = input;
- prev_first_len = first_len;
-
- data += 16;
- len -= 16;
- err_pos += 16;
- }
-
- /* Error in first 16 bytes */
- if (err_pos == 1) {
- goto do_naive;
- }
-
- /* Find previous token (not 80~BF) */
- int32_t token4 = _mm_extract_epi32 (prev_input, 3);
- const int8_t *token = (const int8_t *) &token4;
- int lookahead = 0;
-
- if (token[3] > (int8_t) 0xBF) {
- lookahead = 1;
- }
- else if (token[2] > (int8_t) 0xBF) {
- lookahead = 2;
- }
- else if (token[1] > (int8_t) 0xBF) {
- lookahead = 3;
- }
-
- data -= lookahead;
- len += lookahead;
- err_pos -= lookahead;
- }
-
- do_naive:
- if (len > 0) {
- off_t err_pos2 = rspamd_fast_utf8_validate_ref (data, len);
-
- if (err_pos2) {
- return err_pos + err_pos2 - 1;
- }
- }
-
- return 0;
-}
-
-#ifndef __clang__
-#pragma GCC pop_options
-#endif \ No newline at end of file
diff --git a/contrib/simdutf/CMakeLists.txt b/contrib/simdutf/CMakeLists.txt
new file mode 100644
index 000000000..f07a100d0
--- /dev/null
+++ b/contrib/simdutf/CMakeLists.txt
@@ -0,0 +1,114 @@
+cmake_minimum_required(VERSION 3.15)
+
+project(simdutf
+ DESCRIPTION "Fast Unicode validation, transcoding and processing"
+ LANGUAGES CXX
+ VERSION 5.6.3
+)
+
+include (TestBigEndian)
+TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
+if(IS_BIG_ENDIAN)
+ message(STATUS "Big-endian system detected.")
+endif()
+
+include(GNUInstallDirs)
+# The following requires CMake 3.21.
+# if(PROJECT_IS_TOP_LEVEL)
+# message(STATUS "Building simdutf as a top-level project.")
+# include(CTest)
+#else()
+# message(STATUS "Building simdutf as a subproject.")
+#endif(PROJECT_IS_TOP_LEVEL)
+include(CTest)
+include(cmake/simdutf-flags.cmake)
+
+set(SIMDUTF_LIB_VERSION "11.0.0" CACHE STRING "simdutf library version")
+set(SIMDUTF_LIB_SOVERSION "11" CACHE STRING "simdutf library soversion")
+option(SIMDUTF_TESTS "Whether the tests are included as part of the CMake Build." OFF)
+option(SIMDUTF_BENCHMARKS "Whether the benchmarks are included as part of the CMake Build." OFF)
+option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build. Requires C++17 or better." OFF)
+option(SIMDUTF_FUZZERS "Whether to build the fuzzers." OFF)
+
+
+set(SIMDUTF_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+add_subdirectory(src)
+
+message(STATUS "Compiling using the C++ standard:" ${CMAKE_CXX_STANDARD})
+# ---- Install rules ----
+add_library(rspamd-simdutf ALIAS simdutf)
+
+set_target_properties(
+ simdutf PROPERTIES
+ VERSION "${SIMDUTF_LIB_VERSION}"
+ SOVERSION "${SIMDUTF_LIB_SOVERSION}"
+ WINDOWS_EXPORT_ALL_SYMBOLS YES
+)
+
+include(CMakePackageConfigHelpers)
+include(GNUInstallDirs)
+
+configure_file(cmake/simdutf-config.cmake.in simdutf-config.cmake @ONLY)
+
+write_basic_package_version_file(
+ simdutf-config-version.cmake
+ COMPATIBILITY SameMinorVersion
+)
+
+set(
+ SIMDUTF_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/simdutf"
+ CACHE STRING "CMake package config location relative to the install prefix"
+)
+mark_as_advanced(SIMDUTF_INSTALL_CMAKEDIR)
+
+
+# pkg-config
+include(cmake/JoinPaths.cmake)
+join_paths(PKGCONFIG_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}")
+join_paths(PKGCONFIG_LIBDIR "\${prefix}" "${CMAKE_INSTALL_LIBDIR}")
+
+if(NOT SIMDUTF_SANITIZE)
+ find_program(GREP grep)
+ find_program(NM nm)
+ if((NOT GREP) OR (NOT NM))
+ message("grep and nm are unavailable on this system.")
+ else()
+ add_test(
+ NAME "avoid_abort"
+ # Under FreeBSD, the __cxa_guard_abort symbol may appear but it is fine.
+ # So we want to look for <space><possibly _>abort as a test.
+ COMMAND sh -c "${NM} $<TARGET_FILE_NAME:simdutf> | ${GREP} ' _*abort' || exit 0 && exit 1"
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ add_test(
+ NAME "avoid_cout"
+ COMMAND sh -c "${NM} $<TARGET_FILE_NAME:simdutf> | ${GREP} ' _*cout' || exit 0 && exit 1"
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ add_test(
+ NAME "avoid_cerr"
+ COMMAND sh -c "${NM} $<TARGET_FILE_NAME:simdutf> | ${GREP} ' _*cerr' || exit 0 && exit 1"
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ add_test(
+ NAME "avoid_printf"
+ COMMAND sh -c "${NM} $<TARGET_FILE_NAME:simdutf> | ${GREP} ' _*printf' || exit 0 && exit 1"
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ add_test(
+ NAME "avoid_stdout"
+ COMMAND sh -c "${NM} $<TARGET_FILE_NAME:simdutf> | ${GREP} stdout || exit 0 && exit 1"
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ add_test(
+ NAME "avoid_stderr"
+ COMMAND sh -c "${NM} $<TARGET_FILE_NAME:simdutf> | ${GREP} stderr || exit 0 && exit 1"
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ endif()
+endif()
+
+if(SIMDUTF_FUZZERS)
+ add_subdirectory(fuzz)
+endif()
diff --git a/contrib/simdutf/LICENSE-APACHE b/contrib/simdutf/LICENSE-APACHE
new file mode 100644
index 000000000..fd2496567
--- /dev/null
+++ b/contrib/simdutf/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020 The simdutf authors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/contrib/simdutf/cmake/CPM.cmake b/contrib/simdutf/cmake/CPM.cmake
new file mode 100644
index 000000000..c82a38653
--- /dev/null
+++ b/contrib/simdutf/cmake/CPM.cmake
@@ -0,0 +1,1161 @@
+# CPM.cmake - CMake's missing package manager
+# ===========================================
+# See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions.
+#
+# MIT License
+# -----------
+#[[
+ Copyright (c) 2019-2023 Lars Melchior and contributors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+]]
+
+# Require CMake 3.14 or newer.
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+# Initialize logging prefix
+if(NOT CPM_INDENT)
+ set(CPM_INDENT
+ "CPM:"
+ CACHE INTERNAL ""
+ )
+endif()
+
+# Default logger: forwards all arguments to message(). It is only defined when no command named
+# cpm_message exists yet.
+if(NOT COMMAND cpm_message)
+ function(cpm_message)
+ message(${ARGV})
+ endfunction()
+endif()
+
+# Version of this copy of the script
+set(CURRENT_CPM_VERSION 0.38.7)
+
+# Include guard. CPM_DIRECTORY / CPM_VERSION are cache entries written further down by whichever
+# copy of CPM.cmake was loaded first.
+get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+if(CPM_DIRECTORY)
+ # A copy living in a different directory has already been loaded: warn if this copy is newer than
+ # the loaded one, then stop processing this file.
+ if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY)
+ if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION)
+ message(
+ AUTHOR_WARNING
+ "${CPM_INDENT} \
+A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \
+It is recommended to upgrade CPM to the most recent version. \
+See https://github.com/cpm-cmake/CPM.cmake for more information."
+ )
+ endif()
+ if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
+ include(FetchContent)
+ endif()
+ return()
+ endif()
+
+ # Same directory: stop if the global CPM_INITIALIZED property has already been set in this run.
+ get_property(
+ CPM_INITIALIZED GLOBAL ""
+ PROPERTY CPM_INITIALIZED
+ SET
+ )
+ if(CPM_INITIALIZED)
+ return()
+ endif()
+endif()
+
+# Warn when the version string marks this copy as a development build
+if(CURRENT_CPM_VERSION MATCHES "development-version")
+ message(
+ WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \
+Please update to a recent release if possible. \
+See https://github.com/cpm-cmake/CPM.cmake for details."
+ )
+endif()
+
+# Mark this copy as initialized for the include guard above
+set_property(GLOBAL PROPERTY CPM_INITIALIZED true)
+
+# Switches the policies CPM depends on to NEW in the calling scope and records the same choice in
+# the matching CMAKE_POLICY_DEFAULT_<policy> variable. CMP0077 is set unconditionally; the other
+# policies are only touched when the running CMake knows them.
+macro(cpm_set_policies)
+  # CMP0077: lets us change options without caching
+  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+  cmake_policy(SET CMP0077 NEW)
+
+  # CMP0126: lets us change set(CACHE) without caching
+  if(POLICY CMP0126)
+    set(CMAKE_POLICY_DEFAULT_CMP0126 NEW)
+    cmake_policy(SET CMP0126 NEW)
+  endif()
+
+  # CMP0135: use the download time as timestamp instead of the timestamp stored in the archive, so
+  # that a changed project url triggers a proper rebuild
+  if(POLICY CMP0135)
+    set(CMAKE_POLICY_DEFAULT_CMP0135 NEW)
+    cmake_policy(SET CMP0135 NEW)
+  endif()
+
+  # CMP0150: relative git repository paths are relative to the parent project's remote
+  if(POLICY CMP0150)
+    set(CMAKE_POLICY_DEFAULT_CMP0150 NEW)
+    cmake_policy(SET CMP0150 NEW)
+  endif()
+endmacro()
+cpm_set_policies()
+
+# User-facing switches. Each option takes its initial value from the environment variable of the
+# same name.
+option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies" $ENV{CPM_USE_LOCAL_PACKAGES})
+option(CPM_LOCAL_PACKAGES_ONLY "Only use `find_package` to get dependencies" $ENV{CPM_LOCAL_PACKAGES_ONLY})
+option(CPM_DOWNLOAD_ALL "Always download dependencies from source" $ENV{CPM_DOWNLOAD_ALL})
+option(CPM_DONT_UPDATE_MODULE_PATH "Don't update the module path to allow using find_package" $ENV{CPM_DONT_UPDATE_MODULE_PATH})
+option(CPM_DONT_CREATE_PACKAGE_LOCK "Don't create a package lock file in the binary path" $ENV{CPM_DONT_CREATE_PACKAGE_LOCK})
+option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK "Add all packages added through CPM.cmake to the package lock" $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK})
+option(CPM_USE_NAMED_CACHE_DIRECTORIES "Use additional directory of package name in cache on the most nested level." $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES})
+
+# Internal bookkeeping kept in the cache: version and location of this script, the list of
+# registered packages and the dry-run switch.
+set(CPM_VERSION ${CURRENT_CPM_VERSION} CACHE INTERNAL "")
+set(CPM_DIRECTORY ${CPM_CURRENT_DIRECTORY} CACHE INTERNAL "")
+set(CPM_FILE ${CMAKE_CURRENT_LIST_FILE} CACHE INTERNAL "")
+set(CPM_PACKAGES "" CACHE INTERNAL "")
+set(CPM_DRY_RUN OFF CACHE INTERNAL "Don't download or configure dependencies (for testing)")
+
+# The source cache defaults to the CPM_SOURCE_CACHE environment variable when that is defined and
+# to OFF otherwise.
+set(CPM_SOURCE_CACHE_DEFAULT OFF)
+if(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE})
+endif()
+set(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE_DEFAULT} CACHE PATH "Directory to download CPM dependencies")
+
+if(NOT CPM_DONT_UPDATE_MODULE_PATH)
+  set(CPM_MODULE_PATH "${CMAKE_BINARY_DIR}/CPM_modules" CACHE INTERNAL "")
+  # start from an empty module directory on every run
+  file(REMOVE_RECURSE ${CPM_MODULE_PATH})
+  file(MAKE_DIRECTORY ${CPM_MODULE_PATH})
+  # modules written by CPM come first so they override global packages
+  set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}")
+endif()
+
+if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+  set(CPM_PACKAGE_LOCK_FILE "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake" CACHE INTERNAL "")
+  # (re)create the lock file containing only its header
+  file(WRITE ${CPM_PACKAGE_LOCK_FILE}
+       "# CPM Package Lock\n# This file should be committed to version control\n\n"
+  )
+endif()
+
+include(FetchContent)
+
+# Try to infer the package name from a git repository URI (local path or URL).
+#
+# URI    - repository location, e.g. `https://github.com/foo/bar.git`
+# RESULT - name of the variable that receives the inferred name in the caller's scope. It is unset
+#          when the URI does not end in `.git` (optionally followed by `/`).
+function(cpm_package_name_from_git_uri URI RESULT)
+  # The dot is escaped so that only a literal ".git" suffix is accepted. Left unescaped, `.` matches
+  # any character and a URI such as `host/legit` would yield the bogus name `l`.
+  if("${URI}" MATCHES "([^/:]+)/?\\.git/?$")
+    set(${RESULT}
+        ${CMAKE_MATCH_1}
+        PARENT_SCOPE
+    )
+  else()
+    unset(${RESULT} PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Infer a package name and/or version from the archive file name at the end of a download url.
+#
+# url     - NAME of a variable holding the url, or the url itself (it is used unquoted in if())
+# outName - variable set in the caller's scope to the inferred name, unset if none was found
+# outVer  - variable set in the caller's scope to the inferred version, unset if none was found
+function(cpm_package_name_and_ver_from_url url outName outVer)
+  # Start from "nothing inferred"; the branches below only fill in what they recognise.
+  unset(${outName} PARENT_SCOPE)
+  unset(${outVer} PARENT_SCOPE)
+
+  if(NOT url MATCHES "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)")
+    # Not an archive: no idea yet what to do with it
+    return()
+  endif()
+
+  # File name of the archive without its extension
+  set(archiveStem "${CMAKE_MATCH_1}")
+
+  if(archiveStem MATCHES "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)")
+    # <name>-<version>, e.g. foo-1.2.3
+    set(${outName} "${CMAKE_MATCH_1}" PARENT_SCOPE)
+    set(${outVer} "${CMAKE_MATCH_2}" PARENT_SCOPE)
+  elseif(archiveStem MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)")
+    # Only a version, e.g. `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. The real package
+    # name cannot be told apart from the other path components, and taking it from the file name
+    # would give a bogus result, so the name stays unset.
+    set(${outVer} "${CMAKE_MATCH_1}" PARENT_SCOPE)
+  else()
+    # No version at all: boldly assume the file name is the package name. Something like
+    # `irrelevant/ACTUAL_NAME/irrelevant/download.zip` defeats this, but should be rare.
+    set(${outName} "${archiveStem}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Look for an installed copy of a package with find_package() and register it with CPM when found.
+#
+# NAME    - package name handed to find_package()
+# VERSION - requested version (may be empty)
+# ARGN    - extra find_package() arguments; spaces inside them are turned into list separators
+#
+# Sets CPM_PACKAGE_FOUND to YES or NO in the caller's scope.
+function(cpm_find_package NAME VERSION)
+  string(REPLACE " " ";" EXTRA_ARGS "${ARGN}")
+  find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET)
+  # Use this function's own NAME parameter throughout, rather than relying on the caller having a
+  # variable called CPM_ARGS_NAME with the same value.
+  if(${NAME}_FOUND)
+    # prefer the version reported by the package itself over the requested one
+    if(DEFINED ${NAME}_VERSION)
+      set(VERSION ${${NAME}_VERSION})
+    endif()
+    cpm_message(STATUS "${CPM_INDENT} Using local package ${NAME}@${VERSION}")
+    CPMRegisterPackage(${NAME} "${VERSION}")
+    set(CPM_PACKAGE_FOUND
+        YES
+        PARENT_SCOPE
+    )
+  else()
+    set(CPM_PACKAGE_FOUND
+        NO
+        PARENT_SCOPE
+    )
+  endif()
+endfunction()
+
+# Write a custom Find<Name>.cmake module for a CPM package into CPM_MODULE_PATH. This prevents
+# `find_package(Name)` from finding the system library. The generated module includes this script,
+# runs the commands passed in ARGN and sets <Name>_FOUND. Nothing is written when
+# CPM_DONT_UPDATE_MODULE_PATH is set.
+function(cpm_create_module_file Name)
+  if(CPM_DONT_UPDATE_MODULE_PATH)
+    return()
+  endif()
+
+  # file(WRITE) replaces any module written earlier for the same name
+  file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake
+       "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)"
+  )
+endfunction()
+
+# Use a locally installed package when find_package() can locate it, and fall back to
+# CPMAddPackage otherwise. Takes the same arguments as CPMAddPackage; NAME, VERSION, GIT_TAG and
+# FIND_PACKAGE_ARGUMENTS are inspected here.
+function(CPMFindPackage)
+  set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS)
+
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN})
+
+  # without an explicit version, derive one from the git tag if there is one
+  if(NOT DEFINED CPM_ARGS_VERSION AND DEFINED CPM_ARGS_GIT_TAG)
+    cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION)
+  endif()
+
+  # a per-package CPM_DOWNLOAD_<name> setting (variable first, then environment) takes precedence
+  # over the global CPM_DOWNLOAD_ALL
+  if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME})
+    set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+    set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  else()
+    set(downloadPackage ${CPM_DOWNLOAD_ALL})
+  endif()
+
+  # downloading was requested: skip the local lookup entirely
+  if(downloadPackage)
+    CPMAddPackage(${ARGN})
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  # already known to CPM: just re-export its variables
+  cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+  if(CPM_PACKAGE_ALREADY_ADDED)
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS})
+  if(CPM_PACKAGE_FOUND)
+    return()
+  endif()
+
+  # nothing installed locally: add the package from source
+  CPMAddPackage(${ARGN})
+  cpm_export_variables(${CPM_ARGS_NAME})
+
+endfunction()
+
+# Reports through CPM_PACKAGE_ALREADY_ADDED (YES/NO, set in the caller's scope) whether the package
+# is already listed in CPM_PACKAGES. For a known package it also warns when the requested version
+# is newer than the registered one and re-exports the package's variables with <name>_ADDED = NO.
+function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION)
+  if(NOT "${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES)
+    set(CPM_PACKAGE_ALREADY_ADDED NO PARENT_SCOPE)
+    return()
+  endif()
+
+  CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION)
+  if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}")
+    message(
+      WARNING
+        "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})."
+    )
+  endif()
+
+  cpm_get_fetch_properties(${CPM_ARGS_NAME})
+  # the package was added by an earlier call, not by this one
+  set(${CPM_ARGS_NAME}_ADDED NO)
+  set(CPM_PACKAGE_ALREADY_ADDED YES PARENT_SCOPE)
+  cpm_export_variables(${CPM_ARGS_NAME})
+endfunction()
+
+# Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of
+# arguments which can then be parsed idiomatically. For example gh:foo/bar@1.2.3 will be converted
+# to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3
+#
+# arg     - the shorthand string
+# outArgs - name of the variable that receives the resulting argument list in the caller's scope
+#
+# Stops with a FATAL_ERROR when the package type cannot be determined.
+function(cpm_parse_add_package_single_arg arg outArgs)
+  # Look for a scheme
+  if("${arg}" MATCHES "^([a-zA-Z]+):(.+)$")
+    string(TOLOWER "${CMAKE_MATCH_1}" scheme)
+    set(uri "${CMAKE_MATCH_2}")
+
+    # Check for CPM-specific schemes
+    if(scheme STREQUAL "gh")
+      set(out "GITHUB_REPOSITORY;${uri}")
+      set(packageType "git")
+    elseif(scheme STREQUAL "gl")
+      set(out "GITLAB_REPOSITORY;${uri}")
+      set(packageType "git")
+    elseif(scheme STREQUAL "bb")
+      set(out "BITBUCKET_REPOSITORY;${uri}")
+      set(packageType "git")
+      # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine
+      # type. The dot of ".git" is escaped: left unescaped it matches any character, so a URL such
+      # as `https://host/digit#<hash>` would be mistaken for a git repository.
+    elseif(arg MATCHES "\\.git/?(@|#|$)")
+      set(out "GIT_REPOSITORY;${arg}")
+      set(packageType "git")
+    else()
+      # Fall back to a URL
+      set(out "URL;${arg}")
+      set(packageType "archive")
+
+      # We could also check for SVN since FetchContent supports it, but SVN is so rare these days.
+      # We just won't bother with the additional complexity it will induce in this function. SVN is
+      # done by multi-arg
+    endif()
+  else()
+    # No scheme at all: only a literal ".git" suffix identifies the package type
+    if(arg MATCHES "\\.git/?(@|#|$)")
+      set(out "GIT_REPOSITORY;${arg}")
+      set(packageType "git")
+    else()
+      # Give up
+      message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'")
+    endif()
+  endif()
+
+  # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs
+  # containing '@' can be used
+  string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}")
+
+  # Parse the rest according to package type
+  if(packageType STREQUAL "git")
+    # For git repos we interpret #... as a tag or branch or commit hash
+    string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}")
+  elseif(packageType STREQUAL "archive")
+    # For archives we interpret #... as a URL hash.
+    string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}")
+    # We don't try to parse the version if it's not provided explicitly.
+    # cpm_package_name_and_ver_from_url should do this at a later point
+  else()
+    # We should never get here. This is an assertion and hitting it means there's a bug in the code
+    # above. A packageType was set, but not handled by this if-else.
+    message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'")
+  endif()
+
+  set(${outArgs}
+      ${out}
+      PARENT_SCOPE
+  )
+endfunction()
+
+# Decide whether the git checkout at repoPath is unmodified relative to gitTag.
+#
+# repoPath - directory in which git is run
+# gitTag   - revision the checkout is compared against
+# isClean  - name of the variable set to TRUE or FALSE in the caller's scope
+#
+# The checkout is reported as clean when git cannot be used or `git status` fails.
+function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean)
+
+  find_package(Git REQUIRED)
+
+  if(NOT GIT_EXECUTABLE)
+    # no git executable to ask: assume the directory is clean
+    set(${isClean} TRUE PARENT_SCOPE)
+    return()
+  endif()
+
+  # Step 1: uncommitted changes show up as output of `git status --porcelain`
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} status --porcelain
+    WORKING_DIRECTORY ${repoPath}
+    RESULT_VARIABLE statusExitCode
+    OUTPUT_VARIABLE porcelainOutput
+    OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET
+  )
+  if(statusExitCode)
+    # not supposed to happen; warn and assume clean anyway
+    message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed")
+    set(${isClean} TRUE PARENT_SCOPE)
+    return()
+  endif()
+  if(NOT "${porcelainOutput}" STREQUAL "")
+    set(${isClean} FALSE PARENT_SCOPE)
+    return()
+  endif()
+
+  # Step 2: committed changes show up as a non-zero exit code of `git diff --exit-code <tag>`
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag}
+    WORKING_DIRECTORY ${repoPath}
+    RESULT_VARIABLE diffExitCode
+    OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET
+  )
+  if(${diffExitCode} EQUAL 0)
+    set(${isClean} TRUE PARENT_SCOPE)
+  else()
+    set(${isClean} FALSE PARENT_SCOPE)
+  endif()
+
+endfunction()
+
+# Overwrites internal FetchContent properties so that CPM.cmake can stand in for FetchContent
+# calls. These are internal CMake properties, so use this with care; it may need changes for future
+# CMake versions. Source:
+# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152
+#
+# contentName - package name; it is lower-cased to build the property names
+# SOURCE_DIR  - value stored in _FetchContent_<name>_sourceDir
+# BINARY_DIR  - value stored in _FetchContent_<name>_binaryDir
+#
+# _FetchContent_<name>_populated is always set to TRUE. Any other argument is a FATAL_ERROR.
+function(cpm_override_fetchcontent contentName)
+  cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "")
+  if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "")
+    message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}")
+  endif()
+
+  string(TOLOWER ${contentName} contentNameLower)
+  set(prefix "_FetchContent_${contentNameLower}")
+
+  # value to store for each property suffix
+  set(detail_sourceDir "${arg_SOURCE_DIR}")
+  set(detail_binaryDir "${arg_BINARY_DIR}")
+  set(detail_populated TRUE)
+
+  foreach(detail IN ITEMS sourceDir binaryDir populated)
+    set(propertyName "${prefix}_${detail}")
+    define_property(
+      GLOBAL
+      PROPERTY ${propertyName}
+      BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+      FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+    )
+    set_property(GLOBAL PROPERTY ${propertyName} "${detail_${detail}}")
+  endforeach()
+endfunction()
+
+# Download and add a package from source
+#
+# Accepts either one shorthand string (expanded by cpm_parse_add_package_single_arg) or the keyword
+# arguments listed in oneValueArgs / multiValueArgs below. Arguments not listed there are collected
+# in CPM_ARGS_UNPARSED_ARGUMENTS and handed on to cpm_declare_fetch. On return <NAME>_SOURCE_DIR,
+# <NAME>_BINARY_DIR, <NAME>_ADDED and CPM_LAST_PACKAGE_NAME are set in the caller's scope through
+# cpm_export_variables.
+function(CPMAddPackage)
+ cpm_set_policies()
+
+ list(LENGTH ARGN argnLength)
+ if(argnLength EQUAL 1)
+ cpm_parse_add_package_single_arg("${ARGN}" ARGN)
+
+ # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM
+ set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;")
+ endif()
+
+ set(oneValueArgs
+ NAME
+ FORCE
+ VERSION
+ GIT_TAG
+ DOWNLOAD_ONLY
+ GITHUB_REPOSITORY
+ GITLAB_REPOSITORY
+ BITBUCKET_REPOSITORY
+ GIT_REPOSITORY
+ SOURCE_DIR
+ FIND_PACKAGE_ARGUMENTS
+ NO_CACHE
+ SYSTEM
+ GIT_SHALLOW
+ EXCLUDE_FROM_ALL
+ SOURCE_SUBDIR
+ )
+
+ set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND)
+
+ cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
+
+ # Set default values for arguments
+
+ # No explicit VERSION: derive one from GIT_TAG when a tag was given
+ if(NOT DEFINED CPM_ARGS_VERSION)
+ if(DEFINED CPM_ARGS_GIT_TAG)
+ cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION)
+ endif()
+ endif()
+
+ if(CPM_ARGS_DOWNLOAD_ONLY)
+ set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY})
+ else()
+ set(DOWNLOAD_ONLY NO)
+ endif()
+
+ # Expand the hosting-service shorthands into a full GIT_REPOSITORY url
+ if(DEFINED CPM_ARGS_GITHUB_REPOSITORY)
+ set(CPM_ARGS_GIT_REPOSITORY "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git")
+ elseif(DEFINED CPM_ARGS_GITLAB_REPOSITORY)
+ set(CPM_ARGS_GIT_REPOSITORY "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git")
+ elseif(DEFINED CPM_ARGS_BITBUCKET_REPOSITORY)
+ set(CPM_ARGS_GIT_REPOSITORY "https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git")
+ endif()
+
+ if(DEFINED CPM_ARGS_GIT_REPOSITORY)
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_REPOSITORY ${CPM_ARGS_GIT_REPOSITORY})
+ # No explicit tag: fall back to the tag `v<VERSION>`
+ if(NOT DEFINED CPM_ARGS_GIT_TAG)
+ set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION})
+ endif()
+
+ # If a name wasn't provided, try to infer it from the git repo
+ if(NOT DEFINED CPM_ARGS_NAME)
+ cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME)
+ endif()
+ endif()
+
+ # Set to TRUE further down when an existing source-cache entry makes the FetchContent step
+ # unnecessary
+ set(CPM_SKIP_FETCH FALSE)
+
+ if(DEFINED CPM_ARGS_GIT_TAG)
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG})
+ # If GIT_SHALLOW is explicitly specified, honor the value.
+ if(DEFINED CPM_ARGS_GIT_SHALLOW)
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW ${CPM_ARGS_GIT_SHALLOW})
+ endif()
+ endif()
+
+ if(DEFINED CPM_ARGS_URL)
+ # If a name or version aren't provided, try to infer them from the URL
+ list(GET CPM_ARGS_URL 0 firstUrl)
+ cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl)
+ # If we fail to obtain name and version from the first URL, we could try other URLs if any.
+ # However multiple URLs are expected to be quite rare, so for now we won't bother.
+
+ # If the caller provided their own name and version, they trump the inferred ones.
+ if(NOT DEFINED CPM_ARGS_NAME)
+ set(CPM_ARGS_NAME ${nameFromUrl})
+ endif()
+ if(NOT DEFINED CPM_ARGS_VERSION)
+ set(CPM_ARGS_VERSION ${verFromUrl})
+ endif()
+
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}")
+ endif()
+
+ # Check for required arguments
+
+ if(NOT DEFINED CPM_ARGS_NAME)
+ message(
+ FATAL_ERROR
+ "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'"
+ )
+ endif()
+
+ # Check if package has been added before
+ cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+ if(CPM_PACKAGE_ALREADY_ADDED)
+ cpm_export_variables(${CPM_ARGS_NAME})
+ return()
+ endif()
+
+ # Check for manual overrides
+ # A non-empty CPM_<NAME>_SOURCE replaces the download by that directory. The variable is cleared
+ # (locally) and FORCE is passed so the nested call does not take this branch again.
+ if(NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "")
+ set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE})
+ set(CPM_${CPM_ARGS_NAME}_SOURCE "")
+ CPMAddPackage(
+ NAME "${CPM_ARGS_NAME}"
+ SOURCE_DIR "${PACKAGE_SOURCE}"
+ EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+ SYSTEM "${CPM_ARGS_SYSTEM}"
+ OPTIONS "${CPM_ARGS_OPTIONS}"
+ SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}"
+ DOWNLOAD_ONLY "${DOWNLOAD_ONLY}"
+ FORCE True
+ )
+ cpm_export_variables(${CPM_ARGS_NAME})
+ return()
+ endif()
+
+ # Check for available declaration
+ # A declaration stored by CPMDeclarePackage replaces the arguments of this call; it is cleared
+ # (locally) before the nested call.
+ if(NOT CPM_ARGS_FORCE AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL "")
+ set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}})
+ set(CPM_DECLARATION_${CPM_ARGS_NAME} "")
+ CPMAddPackage(${declaration})
+ cpm_export_variables(${CPM_ARGS_NAME})
+ # checking again to ensure version and option compatibility
+ cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+ return()
+ endif()
+
+ # Unless FORCE is set, try find_package() first when local packages are preferred or required
+ if(NOT CPM_ARGS_FORCE)
+ if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY)
+ cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS})
+
+ if(CPM_PACKAGE_FOUND)
+ cpm_export_variables(${CPM_ARGS_NAME})
+ return()
+ endif()
+
+ if(CPM_LOCAL_PACKAGES_ONLY)
+ message(
+ SEND_ERROR
+ "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})"
+ )
+ endif()
+ endif()
+ endif()
+
+ CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}")
+
+ # PACKAGE_INFO is the short description shown in the "Adding package" status message below
+ if(DEFINED CPM_ARGS_GIT_TAG)
+ set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}")
+ elseif(DEFINED CPM_ARGS_SOURCE_DIR)
+ set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}")
+ else()
+ set(PACKAGE_INFO "${CPM_ARGS_VERSION}")
+ endif()
+
+ if(DEFINED FETCHCONTENT_BASE_DIR)
+ # respect user's FETCHCONTENT_BASE_DIR if set
+ set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR})
+ else()
+ set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps)
+ endif()
+
+ # Decide where the sources come from: a custom download command, an explicit SOURCE_DIR, or the
+ # shared source cache
+ if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND)
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND})
+ elseif(DEFINED CPM_ARGS_SOURCE_DIR)
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR})
+ if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR})
+ # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work
+ # for relative paths.
+ get_filename_component(
+ source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}
+ )
+ else()
+ set(source_directory ${CPM_ARGS_SOURCE_DIR})
+ endif()
+ if(NOT EXISTS ${source_directory})
+ string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+ # remove timestamps so CMake will re-download the dependency
+ file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild")
+ endif()
+ elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE)
+ # The cache directory of a package is derived from the SHA1 of its sorted origin arguments
+ string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+ set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS})
+ list(SORT origin_parameters)
+ if(CPM_USE_NAMED_CACHE_DIRECTORIES)
+ string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG")
+ set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME})
+ else()
+ string(SHA1 origin_hash "${origin_parameters}")
+ set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash})
+ endif()
+ # Expand `download_directory` relative path. This is important because EXISTS doesn't work for
+ # relative paths.
+ get_filename_component(download_directory ${download_directory} ABSOLUTE)
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory})
+
+ # Take the lock file next to the download directory. It is released right below when the cache
+ # entry already exists, and after the fetch further down otherwise.
+ if(CPM_SOURCE_CACHE)
+ file(LOCK ${download_directory}/../cmake.lock)
+ endif()
+
+ if(EXISTS ${download_directory})
+ if(CPM_SOURCE_CACHE)
+ file(LOCK ${download_directory}/../cmake.lock RELEASE)
+ endif()
+
+ cpm_store_fetch_properties(
+ ${CPM_ARGS_NAME} "${download_directory}"
+ "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+ )
+ cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+
+ if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS))
+ # warn if cache has been changed since checkout
+ cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN)
+ if(NOT ${IS_CLEAN})
+ message(
+ WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty"
+ )
+ endif()
+ endif()
+
+ cpm_add_subdirectory(
+ "${CPM_ARGS_NAME}"
+ "${DOWNLOAD_ONLY}"
+ "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+ "${${CPM_ARGS_NAME}_BINARY_DIR}"
+ "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+ "${CPM_ARGS_SYSTEM}"
+ "${CPM_ARGS_OPTIONS}"
+ )
+ set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}")
+
+ # As the source dir is already cached/populated, we override the call to FetchContent.
+ set(CPM_SKIP_FETCH TRUE)
+ cpm_override_fetchcontent(
+ "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+ BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}"
+ )
+
+ else()
+ # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but
+ # it should guarantee no commit hash get mis-detected.
+ if(NOT DEFINED CPM_ARGS_GIT_SHALLOW)
+ cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH)
+ if(NOT ${IS_HASH})
+ list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE)
+ endif()
+ endif()
+
+ # remove timestamps so CMake will re-download the dependency
+ file(REMOVE_RECURSE ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild)
+ set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}")
+ endif()
+ endif()
+
+ # Write a Find<NAME>.cmake module that replays this call
+ cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")")
+
+ # Record the package in the lock file: as a real declaration when it has a version and no local
+ # SOURCE_DIR (or when all packages are requested), as a comment otherwise
+ if(CPM_PACKAGE_LOCK_ENABLED)
+ if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK)
+ cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}")
+ elseif(CPM_ARGS_SOURCE_DIR)
+ cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory")
+ else()
+ cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}")
+ endif()
+ endif()
+
+ cpm_message(
+ STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})"
+ )
+
+ # No usable cache entry: declare and fetch the package, then add it as a subdirectory if this
+ # call populated it
+ if(NOT CPM_SKIP_FETCH)
+ cpm_declare_fetch(
+ "${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}" "${PACKAGE_INFO}" "${CPM_ARGS_UNPARSED_ARGUMENTS}"
+ )
+ cpm_fetch_package("${CPM_ARGS_NAME}" populated)
+ if(CPM_SOURCE_CACHE AND download_directory)
+ file(LOCK ${download_directory}/../cmake.lock RELEASE)
+ endif()
+ if(${populated})
+ cpm_add_subdirectory(
+ "${CPM_ARGS_NAME}"
+ "${DOWNLOAD_ONLY}"
+ "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+ "${${CPM_ARGS_NAME}_BINARY_DIR}"
+ "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+ "${CPM_ARGS_SYSTEM}"
+ "${CPM_ARGS_OPTIONS}"
+ )
+ endif()
+ cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+ endif()
+
+ set(${CPM_ARGS_NAME}_ADDED YES)
+ cpm_export_variables("${CPM_ARGS_NAME}")
+endfunction()
+
+# Adds a package whose arguments were stored earlier in CPM_DECLARATION_<Name>. Emits a SEND_ERROR
+# when no declaration with that name is defined.
+macro(CPMGetPackage Name)
+  if(NOT DEFINED "CPM_DECLARATION_${Name}")
+    message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available")
+  else()
+    CPMAddPackage(NAME ${Name})
+  endif()
+endmacro()
+
+# Copies the per-package result variables (<name>_SOURCE_DIR, <name>_BINARY_DIR, <name>_ADDED) from
+# the current scope to the parent scope and records the package as CPM_LAST_PACKAGE_NAME there.
+# This is a macro, so "parent scope" means the parent of whichever function invokes it.
+macro(cpm_export_variables name)
+  set(CPM_LAST_PACKAGE_NAME "${name}" PARENT_SCOPE)
+  set(${name}_ADDED "${${name}_ADDED}" PARENT_SCOPE)
+  set(${name}_BINARY_DIR "${${name}_BINARY_DIR}" PARENT_SCOPE)
+  set(${name}_SOURCE_DIR "${${name}_SOURCE_DIR}" PARENT_SCOPE)
+endmacro()
+
+# Stores the given arguments in CPM_DECLARATION_<Name>, so that a later CPMAddPackage call for that
+# name uses them instead of its own. The first declaration wins: an existing one is left alone.
+macro(CPMDeclarePackage Name)
+  if(DEFINED "CPM_DECLARATION_${Name}")
+    # already declared, keep the earlier declaration
+  else()
+    set("CPM_DECLARATION_${Name}" "${ARGN}")
+  endif()
+endmacro()
+
+# Appends a CPMDeclarePackage() entry for the package to the lock file, built from the arguments
+# passed after Name. Does nothing when CPM_DONT_CREATE_PACKAGE_LOCK is set.
+function(cpm_add_to_package_lock Name)
+  if(CPM_DONT_CREATE_PACKAGE_LOCK)
+    return()
+  endif()
+
+  cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN})
+  file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n")
+endfunction()
+
+# Appends a commented-out CPMDeclarePackage() entry for the package to the lock file, marked
+# "(unversioned)". Does nothing when CPM_DONT_CREATE_PACKAGE_LOCK is set.
+function(cpm_add_comment_to_package_lock Name)
+  if(CPM_DONT_CREATE_PACKAGE_LOCK)
+    return()
+  endif()
+
+  cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN})
+  file(APPEND ${CPM_PACKAGE_LOCK_FILE}
+       "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n"
+  )
+endfunction()
+
+# includes the package lock file if it exists and creates a target `cpm-update-package-lock` to
+# update it
+#
+# file - path of the lock file; it is made absolute with get_filename_component(... ABSOLUTE)
+#
+# Does nothing when CPM_DONT_CREATE_PACKAGE_LOCK is set. Otherwise it also sets
+# CPM_PACKAGE_LOCK_ENABLED, which CPMAddPackage checks before writing lock entries. Being a macro,
+# every variable set here lands in the caller's scope.
+macro(CPMUsePackageLock file)
+ if(NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+ get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE)
+ if(EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH})
+ include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH})
+ endif()
+ # The target copies the lock file generated in the build tree (CPM_PACKAGE_LOCK_FILE) over the
+ # given lock file; it is created only once
+ if(NOT TARGET cpm-update-package-lock)
+ add_custom_target(
+ cpm-update-package-lock COMMAND ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE}
+ ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}
+ )
+ endif()
+ set(CPM_PACKAGE_LOCK_ENABLED true)
+ endif()
+endmacro()
+
+# Records a package as added: appends PACKAGE to the cached CPM_PACKAGES list and stores VERSION in
+# the cache entry CPM_PACKAGE_<PACKAGE>_VERSION.
+function(CPMRegisterPackage PACKAGE VERSION)
+  set("CPM_PACKAGE_${PACKAGE}_VERSION" ${VERSION} CACHE INTERNAL "")
+
+  list(APPEND CPM_PACKAGES ${PACKAGE})
+  # write the extended list back to the cache so it outlives this function's scope
+  set(CPM_PACKAGES ${CPM_PACKAGES} CACHE INTERNAL "")
+endfunction()
+
+# Looks up the version recorded for PACKAGE (CPM_PACKAGE_<PACKAGE>_VERSION) and stores it in the
+# variable named by OUTPUT in the caller's scope. The result is an empty string when nothing is
+# recorded.
+function(CPMGetPackageVersion PACKAGE OUTPUT)
+  set(${OUTPUT} "${CPM_PACKAGE_${PACKAGE}_VERSION}" PARENT_SCOPE)
+endfunction()
+
+# Hands a package to FetchContent_Declare, passing everything after the three named parameters
+# through unchanged. In a dry run (CPM_DRY_RUN) nothing is declared and only a status line is
+# printed. VERSION and INFO are accepted but not used here.
+function(cpm_declare_fetch PACKAGE VERSION INFO)
+  if(${CPM_DRY_RUN})
+    cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)")
+  else()
+    FetchContent_Declare(${PACKAGE} ${ARGN})
+  endif()
+endfunction()
+
+# Publishes the directories stored for a package (CPM_PACKAGE_<PACKAGE>_SOURCE_DIR / _BINARY_DIR)
+# as <PACKAGE>_SOURCE_DIR and <PACKAGE>_BINARY_DIR in the caller's scope. Does nothing in a dry run.
+function(cpm_get_fetch_properties PACKAGE)
+  if(${CPM_DRY_RUN})
+    return()
+  endif()
+
+  foreach(dirKind IN ITEMS SOURCE_DIR BINARY_DIR)
+    set(${PACKAGE}_${dirKind} "${CPM_PACKAGE_${PACKAGE}_${dirKind}}" PARENT_SCOPE)
+  endforeach()
+endfunction()
+
+# Remembers the source and binary directory of a package in the cache entries
+# CPM_PACKAGE_<PACKAGE>_SOURCE_DIR and CPM_PACKAGE_<PACKAGE>_BINARY_DIR, which
+# cpm_get_fetch_properties reads back. Does nothing in a dry run.
+function(cpm_store_fetch_properties PACKAGE source_dir binary_dir)
+  if(${CPM_DRY_RUN})
+    return()
+  endif()
+
+  set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR "${binary_dir}" CACHE INTERNAL "")
+  set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR "${source_dir}" CACHE INTERNAL "")
+endfunction()
+
+# Adds a package with add_subdirectory() when that makes sense: the package is not download-only
+# and SOURCE_DIR contains a CMakeLists.txt.
+#
+# PACKAGE       - package name, appended to the CPM_INDENT log prefix while the package is configured
+# DOWNLOAD_ONLY - when true the package is never added
+# SOURCE_DIR    - directory expected to contain CMakeLists.txt
+# BINARY_DIR    - build directory handed to add_subdirectory()
+# EXCLUDE       - when true, EXCLUDE_FROM_ALL is passed to add_subdirectory()
+# SYSTEM        - when true and CMake is at least 3.25, SYSTEM is passed to add_subdirectory()
+# OPTIONS       - list of "KEY VALUE" strings; each is set as a variable before the package is added
+function(
+  cpm_add_subdirectory
+  PACKAGE
+  DOWNLOAD_ONLY
+  SOURCE_DIR
+  BINARY_DIR
+  EXCLUDE
+  SYSTEM
+  OPTIONS
+)
+
+  if(DOWNLOAD_ONLY OR NOT EXISTS ${SOURCE_DIR}/CMakeLists.txt)
+    return()
+  endif()
+
+  # optional keywords for add_subdirectory()
+  set(addSubdirectoryExtraArgs "")
+  if(EXCLUDE)
+    list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL)
+  endif()
+  if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25")
+    # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM
+    list(APPEND addSubdirectoryExtraArgs SYSTEM)
+  endif()
+
+  # the options become plain variables in this scope, which the added subdirectory inherits
+  if(OPTIONS)
+    foreach(OPTION ${OPTIONS})
+      cpm_parse_option("${OPTION}")
+      set(${OPTION_KEY} "${OPTION_VALUE}")
+    endforeach()
+  endif()
+
+  # extend the log prefix with the package name while the package is being configured
+  set(CPM_OLD_INDENT "${CPM_INDENT}")
+  set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:")
+  add_subdirectory(${SOURCE_DIR} ${BINARY_DIR} ${addSubdirectoryExtraArgs})
+  set(CPM_INDENT "${CPM_OLD_INDENT}")
+endfunction()
+
+# downloads a previously declared package via FetchContent and exports the variables
+# `${PACKAGE}_SOURCE_DIR` and `${PACKAGE}_BINARY_DIR` to the parent scope
+#
+# PACKAGE   - name the package was declared under (see cpm_declare_fetch)
+# populated - name of the variable set in the caller's scope to TRUE when this call ran
+#             FetchContent_Populate, and to FALSE otherwise (including dry runs)
+function(cpm_fetch_package PACKAGE populated)
+  set(${populated}
+      FALSE
+      PARENT_SCOPE
+  )
+  if(${CPM_DRY_RUN})
+    cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)")
+    return()
+  endif()
+
+  FetchContent_GetProperties(${PACKAGE})
+
+  # FetchContent exposes its result variables under the lower-cased package name
+  string(TOLOWER "${PACKAGE}" lower_case_name)
+
+  if(NOT ${lower_case_name}_POPULATED)
+    FetchContent_Populate(${PACKAGE})
+    set(${populated}
+        TRUE
+        PARENT_SCOPE
+    )
+  endif()
+
+  # Store the directories under this function's own PACKAGE parameter rather than the caller's
+  # CPM_ARGS_NAME variable. The directories are quoted so that an empty or list-like value cannot
+  # change the number of arguments.
+  cpm_store_fetch_properties(
+    ${PACKAGE} "${${lower_case_name}_SOURCE_DIR}" "${${lower_case_name}_BINARY_DIR}"
+  )
+
+  set(${PACKAGE}_SOURCE_DIR
+      ${${lower_case_name}_SOURCE_DIR}
+      PARENT_SCOPE
+  )
+  set(${PACKAGE}_BINARY_DIR
+      ${${lower_case_name}_BINARY_DIR}
+      PARENT_SCOPE
+  )
+endfunction()
+
+# Splits a package option of the form "KEY VALUE" at the first space and returns the two parts in
+# OPTION_KEY and OPTION_VALUE (set in the caller's scope). An option that consists of a key only
+# gets the value "ON".
+function(cpm_parse_option OPTION)
+  # the key is the leading run of non-space characters
+  string(REGEX MATCH "^[^ ]+" key "${OPTION}")
+  string(LENGTH "${OPTION}" totalLength)
+  string(LENGTH "${key}" keyLength)
+
+  if(NOT keyLength STREQUAL totalLength)
+    # the value is everything after the character that follows the key
+    math(EXPR valueStart "${keyLength}+1")
+    string(SUBSTRING "${OPTION}" "${valueStart}" "-1" value)
+  else()
+    # no value for key provided, assume user wants to set option to "ON"
+    set(value "ON")
+  endif()
+
+  set(OPTION_KEY "${key}" PARENT_SCOPE)
+  set(OPTION_VALUE "${value}" PARENT_SCOPE)
+endfunction()
+
+# guesses the package version from a git tag
+#
+# GIT_TAG - tag, branch name or commit hash
+# RESULT  - name of the variable set in the caller's scope. It receives 0 for a 40-character tag
+#           (probably a hash); otherwise the leading run of digits and dots after an optional `v`.
+#           When that run is empty (e.g. for a branch name) the variable is unset.
+function(cpm_get_version_from_git_tag GIT_TAG RESULT)
+  # GIT_TAG is quoted in both string() calls: unquoted, an empty tag would vanish from the argument
+  # list and make the commands fail.
+  string(LENGTH "${GIT_TAG}" length)
+  if(length EQUAL 40)
+    # GIT_TAG is probably a git hash
+    set(${RESULT}
+        0
+        PARENT_SCOPE
+    )
+  else()
+    string(REGEX MATCH "v?([0123456789.]*).*" _ "${GIT_TAG}")
+    set(${RESULT}
+        ${CMAKE_MATCH_1}
+        PARENT_SCOPE
+    )
+  endif()
+endfunction()
+
+# guesses if the git tag is a commit hash or an actual tag or a branch name.
+#
+# GIT_TAG - the tag to classify
+# RESULT  - name of the variable set in the caller's scope: 1 when GIT_TAG is 7 to 40 characters
+#           long and consists of hexadecimal digits only, 0 otherwise
+function(cpm_is_git_tag_commit_hash GIT_TAG RESULT)
+  string(LENGTH "${GIT_TAG}" length)
+  # full hash has 40 characters, and short hash has at least 7 characters.
+  # GIT_TAG is quoted in the comparison: an unquoted value that happens to be the name of an
+  # existing variable would be replaced by that variable's content before matching.
+  if(length LESS 7
+     OR length GREATER 40
+     OR NOT "${GIT_TAG}" MATCHES "^[a-fA-F0-9]+$"
+  )
+    set(${RESULT}
+        0
+        PARENT_SCOPE
+    )
+  else()
+    set(${RESULT}
+        1
+        PARENT_SCOPE
+    )
+  endif()
+endfunction()
+
+# Formats CPMAddPackage arguments as text for the package lock file: one line per single-value
+# argument, one line per entry of a multi-value argument, and the unparsed arguments on one line.
+#
+# OUT_VAR       - name of the variable that receives the text in the caller's scope
+# IS_IN_COMMENT - when true every emitted line is prefixed with "#"
+# ARGN          - the package arguments to format
+#
+# An occurrence of CMAKE_SOURCE_DIR inside SOURCE_DIR is replaced by the literal text
+# `${CMAKE_SOURCE_DIR}`.
+function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT)
+  set(oneValueArgs
+      NAME
+      FORCE
+      VERSION
+      GIT_TAG
+      DOWNLOAD_ONLY
+      GITHUB_REPOSITORY
+      GITLAB_REPOSITORY
+      BITBUCKET_REPOSITORY
+      GIT_REPOSITORY
+      SOURCE_DIR
+      FIND_PACKAGE_ARGUMENTS
+      NO_CACHE
+      SYSTEM
+      GIT_SHALLOW
+      EXCLUDE_FROM_ALL
+      SOURCE_SUBDIR
+  )
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND)
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Start from an empty accumulator instead of whatever a caller's scope may hold under this name
+  set(PRETTY_OUT_VAR "")
+
+  foreach(oneArgName ${oneValueArgs})
+    if(DEFINED CPM_ARGS_${oneArgName})
+      if(${IS_IN_COMMENT})
+        string(APPEND PRETTY_OUT_VAR "#")
+      endif()
+      # The keyword is quoted: unquoted, if() would treat it as a variable name and compare that
+      # variable's value whenever a variable called e.g. SOURCE_DIR or DOWNLOAD_ONLY is in scope.
+      if("${oneArgName}" STREQUAL "SOURCE_DIR")
+        string(REPLACE ${CMAKE_SOURCE_DIR} "\${CMAKE_SOURCE_DIR}" CPM_ARGS_${oneArgName}
+                       "${CPM_ARGS_${oneArgName}}"
+        )
+      endif()
+      string(APPEND PRETTY_OUT_VAR " ${oneArgName} ${CPM_ARGS_${oneArgName}}\n")
+    endif()
+  endforeach()
+  foreach(multiArgName ${multiValueArgs})
+    if(DEFINED CPM_ARGS_${multiArgName})
+      if(${IS_IN_COMMENT})
+        string(APPEND PRETTY_OUT_VAR "#")
+      endif()
+      string(APPEND PRETTY_OUT_VAR " ${multiArgName}\n")
+      foreach(singleOption ${CPM_ARGS_${multiArgName}})
+        if(${IS_IN_COMMENT})
+          string(APPEND PRETTY_OUT_VAR "#")
+        endif()
+        string(APPEND PRETTY_OUT_VAR " \"${singleOption}\"\n")
+      endforeach()
+    endif()
+  endforeach()
+
+  # everything cmake_parse_arguments did not recognise goes on one line
+  if(NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "")
+    if(${IS_IN_COMMENT})
+      string(APPEND PRETTY_OUT_VAR "#")
+    endif()
+    string(APPEND PRETTY_OUT_VAR " ")
+    foreach(CPM_ARGS_UNPARSED_ARGUMENT ${CPM_ARGS_UNPARSED_ARGUMENTS})
+      string(APPEND PRETTY_OUT_VAR " ${CPM_ARGS_UNPARSED_ARGUMENT}")
+    endforeach()
+    string(APPEND PRETTY_OUT_VAR "\n")
+  endif()
+
+  set(${OUT_VAR}
+      ${PRETTY_OUT_VAR}
+      PARENT_SCOPE
+  )
+
+endfunction()
diff --git a/contrib/simdutf/cmake/JoinPaths.cmake b/contrib/simdutf/cmake/JoinPaths.cmake
new file mode 100644
index 000000000..07172d839
--- /dev/null
+++ b/contrib/simdutf/cmake/JoinPaths.cmake
@@ -0,0 +1,23 @@
+# This module provides a function for joining paths,
+ # as known from most languages
+ #
+ # SPDX-License-Identifier: (MIT OR CC0-1.0)
+ # Copyright 2020 Jan Tojnar
+ # https://github.com/jtojnar/cmake-snips
+ #
+ # Modelled after Python’s os.path.join
+ # https://docs.python.org/3.7/library/os.path.html#os.path.join
+ # Windows not supported
+ # Join first_path_segment with every further segment in ARGN and store the
+ # result in the caller's variable named by joined_path. Empty segments are
+ # ignored; an absolute segment discards everything joined before it.
+ function(join_paths joined_path first_path_segment)
+ set(result "${first_path_segment}")
+ foreach(segment IN LISTS ARGN)
+ if("${segment}" STREQUAL "")
+ # Empty segments contribute nothing.
+ elseif(IS_ABSOLUTE "${segment}")
+ set(result "${segment}")
+ else()
+ set(result "${result}/${segment}")
+ endif()
+ endforeach()
+ set(${joined_path} "${result}" PARENT_SCOPE)
+ endfunction() \ No newline at end of file
diff --git a/contrib/simdutf/cmake/Toolchains/loongarch64-linux-gnu.cmake b/contrib/simdutf/cmake/Toolchains/loongarch64-linux-gnu.cmake
new file mode 100644
index 000000000..cded3d305
--- /dev/null
+++ b/contrib/simdutf/cmake/Toolchains/loongarch64-linux-gnu.cmake
@@ -0,0 +1,4 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR loongarch64)
+
+set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-loongarch64")
diff --git a/contrib/simdutf/cmake/Toolchains/riscv64-linux-gnu.cmake b/contrib/simdutf/cmake/Toolchains/riscv64-linux-gnu.cmake
new file mode 100644
index 000000000..ed58a2dba
--- /dev/null
+++ b/contrib/simdutf/cmake/Toolchains/riscv64-linux-gnu.cmake
@@ -0,0 +1,4 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static")
diff --git a/contrib/simdutf/cmake/add_cpp_test.cmake b/contrib/simdutf/cmake/add_cpp_test.cmake
new file mode 100644
index 000000000..ad57f6a6f
--- /dev/null
+++ b/contrib/simdutf/cmake/add_cpp_test.cmake
@@ -0,0 +1,63 @@
+# Helper so we don't have to repeat ourselves so much
+# Usage: add_cpp_test(testname [COMPILE_ONLY] [LIBRARY] [WILL_FAIL] [SOURCES a.cpp b.cpp ...] [LABELS acceptance per_implementation ...] [DEPENDENCY_OF name ...])
+# SOURCES defaults to testname.cpp if not specified.
+# COMPILE_ONLY / LIBRARY tests only build the target; other tests run the executable.
+# Each DEPENDENCY_OF name (and, for runnable tests, each label) makes the test a
+# dependency of the <name>_tests target, which is created on demand and hooked
+# into the all_tests target.
+function(add_cpp_test TEST_NAME)
+ # Parse arguments
+ cmake_parse_arguments(PARSE_ARGV 1 ARGS "COMPILE_ONLY;LIBRARY;WILL_FAIL" "" "SOURCES;LABELS;DEPENDENCY_OF")
+ if (NOT ARGS_SOURCES)
+ list(APPEND ARGS_SOURCES ${TEST_NAME}.cpp)
+ endif()
+ if (ARGS_COMPILE_ONLY)
+ # Append to the ARGS_LABELS list itself. Writing ${ARGS_LABELS} here would
+ # expand to the label values and append to a variable of that name instead.
+ list(APPEND ARGS_LABELS compile_only)
+ endif()
+ if(SIMDUTF_SANITIZE)
+ add_compile_options(-fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all)
+ add_compile_definitions(ASAN_OPTIONS=detect_leaks=1)
+ endif()
+ # Add the compile target
+ if (ARGS_LIBRARY)
+ add_library(${TEST_NAME} STATIC ${ARGS_SOURCES})
+ else(ARGS_LIBRARY)
+ add_executable(${TEST_NAME} ${ARGS_SOURCES})
+ endif(ARGS_LIBRARY)
+
+ # Add test
+ if (ARGS_COMPILE_ONLY OR ARGS_LIBRARY)
+ # The "test" is the build itself, so keep the target out of the default build.
+ add_test(
+ NAME ${TEST_NAME}
+ COMMAND ${CMAKE_COMMAND} --build . --target ${TEST_NAME} --config $<CONFIGURATION>
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE)
+ else()
+ if (CMAKE_CROSSCOMPILING_EMULATOR)
+ add_test(${TEST_NAME} ${CMAKE_CROSSCOMPILING_EMULATOR} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME})
+ else()
+ add_test(${TEST_NAME} ${TEST_NAME})
+ endif()
+
+ # Add to <label>_tests make targets
+ foreach(label ${ARGS_LABELS})
+ list(APPEND ARGS_DEPENDENCY_OF ${label})
+ endforeach(label ${ARGS_LABELS})
+ endif()
+
+ # Add to test labels
+ if (ARGS_LABELS)
+ set_property(TEST ${TEST_NAME} APPEND PROPERTY LABELS ${ARGS_LABELS})
+ endif()
+
+ # Add as a dependency of given targets
+ foreach(dependency_of ${ARGS_DEPENDENCY_OF})
+ if (NOT TARGET ${dependency_of}_tests)
+ add_custom_target(${dependency_of}_tests)
+ add_dependencies(all_tests ${dependency_of}_tests)
+ endif(NOT TARGET ${dependency_of}_tests)
+ add_dependencies(${dependency_of}_tests ${TEST_NAME})
+ endforeach(dependency_of ${ARGS_DEPENDENCY_OF})
+
+ # If it will fail, mark the test as such
+ if (ARGS_WILL_FAIL)
+ set_property(TEST ${TEST_NAME} PROPERTY WILL_FAIL TRUE)
+ endif()
+endfunction()
diff --git a/contrib/simdutf/cmake/simdutf-config.cmake.in b/contrib/simdutf/cmake/simdutf-config.cmake.in
new file mode 100644
index 000000000..e7babd620
--- /dev/null
+++ b/contrib/simdutf/cmake/simdutf-config.cmake.in
@@ -0,0 +1,2 @@
+
+include("${CMAKE_CURRENT_LIST_DIR}/simdutfTargets.cmake") \ No newline at end of file
diff --git a/contrib/simdutf/cmake/simdutf-flags.cmake b/contrib/simdutf/cmake/simdutf-flags.cmake
new file mode 100644
index 000000000..7a96575b1
--- /dev/null
+++ b/contrib/simdutf/cmake/simdutf-flags.cmake
@@ -0,0 +1,25 @@
+
+# Build options for simdutf.
+option(SIMDUTF_SANITIZE "Sanitize addresses" OFF)
+option(SIMDUTF_SANITIZE_UNDEFINED "Sanitize undefined behavior" OFF)
+option(SIMDUTF_ALWAYS_INCLUDE_FALLBACK "Always include fallback" OFF)
+
+# Pick a default build type when none was given: Debug when a sanitizer is
+# enabled, Release otherwise. The status message names the type actually chosen.
+if (NOT CMAKE_BUILD_TYPE)
+ if(SIMDUTF_SANITIZE OR SIMDUTF_SANITIZE_UNDEFINED)
+ message(STATUS "No build type selected, default to Debug")
+ set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build." FORCE)
+ # SIMDUTF_SANITIZE only applies to gcc/clang:
+ message(STATUS "Setting debug optimization flag to -O1 -g.")
+ set(CMAKE_CXX_FLAGS_DEBUG "-O1 -g" CACHE STRING "" FORCE)
+ else()
+ message(STATUS "No build type selected, default to Release")
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+ endif()
+endif()
+
+# NOTE(review): this replaces any CMAKE_MODULE_PATH already set in this scope
+# rather than appending to it - confirm that is intended.
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tools/cmake")
+
+# We compile tools, tests, etc. with C++ 11. Override yourself if you need on a target.
+set(SIMDUTF_CXX_STANDARD 11 CACHE STRING "the C++ standard to use for simdutf")
+
+set(CMAKE_CXX_STANDARD ${SIMDUTF_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/contrib/simdutf/include/simdutf.h b/contrib/simdutf/include/simdutf.h
new file mode 100644
index 000000000..9ae9f4f1a
--- /dev/null
+++ b/contrib/simdutf/include/simdutf.h
@@ -0,0 +1,26 @@
+#ifndef SIMDUTF_H
+#define SIMDUTF_H
+#include <cstring>
+
+#include "simdutf/compiler_check.h"
+#include "simdutf/common_defs.h"
+#include "simdutf/encoding_types.h"
+#include "simdutf/error.h"
+
+SIMDUTF_PUSH_DISABLE_WARNINGS
+SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+
+// Public API
+#include "simdutf/simdutf_version.h"
+#include "simdutf/implementation.h"
+
+// Implementation-internal files (must be included before the implementations
+// themselves, to keep amalgamation working--otherwise, the first time a file is
+// included, it might be put inside the #ifdef
+// SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other
+// implementations can't compile unless that implementation is turned on).
+#include "simdutf/internal/isadetection.h"
+
+SIMDUTF_POP_DISABLE_WARNINGS
+
+#endif // SIMDUTF_H
diff --git a/contrib/simdutf/include/simdutf/avx512.h b/contrib/simdutf/include/simdutf/avx512.h
new file mode 100644
index 000000000..59f56c53c
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/avx512.h
@@ -0,0 +1,79 @@
+#ifndef SIMDUTF_AVX512_H_
+#define SIMDUTF_AVX512_H_
+
+/*
+ It's possible to override AVX512 settings with cmake -DCMAKE_CXX_FLAGS.
+
+ All preprocessor macros have the form `SIMDUTF_HAS_AVX512{feature}`,
+ where a feature is the code name of an extension.
+
+ Please see the listing below to find which are supported.
+*/
+
+#ifndef SIMDUTF_HAS_AVX512F
+ #if defined(__AVX512F__) && __AVX512F__ == 1
+ #define SIMDUTF_HAS_AVX512F 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512DQ
+ #if defined(__AVX512DQ__) && __AVX512DQ__ == 1
+ #define SIMDUTF_HAS_AVX512DQ 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512IFMA
+ #if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
+ #define SIMDUTF_HAS_AVX512IFMA 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512CD
+ #if defined(__AVX512CD__) && __AVX512CD__ == 1
+ #define SIMDUTF_HAS_AVX512CD 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512BW
+ #if defined(__AVX512BW__) && __AVX512BW__ == 1
+ #define SIMDUTF_HAS_AVX512BW 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512VL
+ #if defined(__AVX512VL__) && __AVX512VL__ == 1
+ #define SIMDUTF_HAS_AVX512VL 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512VBMI
+ #if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
+ #define SIMDUTF_HAS_AVX512VBMI 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512VBMI2
+ #if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
+ #define SIMDUTF_HAS_AVX512VBMI2 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512VNNI
+ #if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
+ #define SIMDUTF_HAS_AVX512VNNI 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512BITALG
+ #if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
+ #define SIMDUTF_HAS_AVX512BITALG 1
+ #endif
+#endif
+
+#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
+ #if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
+ #define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
+ #endif
+#endif
+
+#endif // SIMDUTF_AVX512_H_
diff --git a/contrib/simdutf/include/simdutf/common_defs.h b/contrib/simdutf/include/simdutf/common_defs.h
new file mode 100644
index 000000000..57dde25e6
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/common_defs.h
@@ -0,0 +1,151 @@
+#ifndef SIMDUTF_COMMON_DEFS_H
+#define SIMDUTF_COMMON_DEFS_H
+
+#include <cassert>
+#include "simdutf/portability.h"
+#include "simdutf/avx512.h"
+
+#if defined(__GNUC__)
+ // Marks a block with a name so that MCA analysis can see it.
+ #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) \
+ __asm volatile("# LLVM-MCA-BEGIN " #name);
+ #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+ // Runs `block` between the begin/end markers for `name`. The markers must be
+ // spelled with the SIMDUTF_ prefix: no unprefixed macros are defined here.
+ #define SIMDUTF_DEBUG_BLOCK(name, block) \
+ SIMDUTF_BEGIN_DEBUG_BLOCK(name); \
+ block; \
+ SIMDUTF_END_DEBUG_BLOCK(name);
+#else
+ #define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
+ #define SIMDUTF_END_DEBUG_BLOCK(name)
+ // No markers are available, but the wrapped code must still be executed.
+ #define SIMDUTF_DEBUG_BLOCK(name, block) block;
+#endif
+
+// Align to N-byte boundary
+#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n) - 1)) & ~((n) - 1))
+#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n) - 1))
+
+#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n) - 1)) == 0)
+
+#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
+ #define SIMDUTF_DEPRECATED __declspec(deprecated)
+
+ #define simdutf_really_inline __forceinline // really inline in release mode
+ #define simdutf_always_inline __forceinline // always inline, no matter what
+ #define simdutf_never_inline __declspec(noinline)
+
+ #define simdutf_unused
+ #define simdutf_warn_unused
+
+ #ifndef simdutf_likely
+ #define simdutf_likely(x) x
+ #endif
+ #ifndef simdutf_unlikely
+ #define simdutf_unlikely(x) x
+ #endif
+
+ #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning(push))
+ #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning(push, 0))
+ #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) \
+ __pragma(warning(disable : WARNING_NUMBER))
+ // Get rid of Intellisense-only warnings (Code Analysis)
+ // Though __has_include is C++17, it is supported in Visual Studio 2017 or
+ // better (_MSC_VER>=1910).
+ #ifdef __has_include
+ #if __has_include(<CppCoreCheck\Warnings.h>)
+ #include <CppCoreCheck\Warnings.h>
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \
+ SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
+ #endif
+ #endif
+
+ #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+ #endif
+
+ #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
+ #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
+ #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning(pop))
+
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+ #if defined(__OPTIMIZE__) || defined(NDEBUG)
+ #define simdutf_really_inline inline __attribute__((always_inline))
+ #else
+ #define simdutf_really_inline inline
+ #endif
+ #define simdutf_always_inline \
+ inline __attribute__((always_inline)) // always inline, no matter what
+ #define SIMDUTF_DEPRECATED __attribute__((deprecated))
+ #define simdutf_never_inline inline __attribute__((noinline))
+
+ #define simdutf_unused __attribute__((unused))
+ #define simdutf_warn_unused __attribute__((warn_unused_result))
+
+ #ifndef simdutf_likely
+ #define simdutf_likely(x) __builtin_expect(!!(x), 1)
+ #endif
+ #ifndef simdutf_unlikely
+ #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
+ #endif
+
+ // clang-format off
+ #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
+ // gcc doesn't seem to disable all warnings with all and extra, add warnings
+ // here as necessary
+ #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS \
+ SIMDUTF_PUSH_DISABLE_WARNINGS \
+ SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
+ #define SIMDUTF_PRAGMA(P) _Pragma(#P)
+ #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) \
+ SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
+ #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
+ #else
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+ #endif
+ #define SIMDUTF_DISABLE_DEPRECATED_WARNING \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
+ #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
+ #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
+ // clang-format on
+
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+
+#ifndef SIMDUTF_DLLIMPORTEXPORT
+ #if defined(SIMDUTF_VISUAL_STUDIO)
+ /**
+ * It does not matter here whether you are using
+ * the regular visual studio or clang under visual
+ * studio.
+ */
+ #if SIMDUTF_USING_LIBRARY
+ #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+ #else
+ #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+ #endif
+ #else
+ #define SIMDUTF_DLLIMPORTEXPORT
+ #endif
+#endif
+
+/// If EXPR is an error, returns it.
+#define SIMDUTF_TRY(EXPR) \
+ { \
+ auto _err = (EXPR); \
+ if (_err) { \
+ return _err; \
+ } \
+ }
+
+#endif // SIMDUTF_COMMON_DEFS_H
diff --git a/contrib/simdutf/include/simdutf/compiler_check.h b/contrib/simdutf/include/simdutf/compiler_check.h
new file mode 100644
index 000000000..a0426d5c4
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/compiler_check.h
@@ -0,0 +1,45 @@
+#ifndef SIMDUTF_COMPILER_CHECK_H
+#define SIMDUTF_COMPILER_CHECK_H
+
+#ifndef __cplusplus
+ #error simdutf requires a C++ compiler
+#endif
+
+#ifndef SIMDUTF_CPLUSPLUS
+ #if defined(_MSVC_LANG) && !defined(__clang__)
+ #define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
+ #else
+ #define SIMDUTF_CPLUSPLUS __cplusplus
+ #endif
+#endif
+
+// C++ 23
+#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L)
+ #define SIMDUTF_CPLUSPLUS23 1
+#endif
+
+// C++ 20
+#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L)
+ #define SIMDUTF_CPLUSPLUS20 1
+#endif
+
+// C++ 17
+#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
+ #define SIMDUTF_CPLUSPLUS17 1
+#endif
+
+// C++ 14
+#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
+ #define SIMDUTF_CPLUSPLUS14 1
+#endif
+
+// C++ 11
+#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
+ #define SIMDUTF_CPLUSPLUS11 1
+#endif
+
+#ifndef SIMDUTF_CPLUSPLUS11
+ #error simdutf requires a compiler compliant with the C++11 standard
+#endif
+
+#endif // SIMDUTF_COMPILER_CHECK_H
diff --git a/contrib/simdutf/include/simdutf/encoding_types.h b/contrib/simdutf/include/simdutf/encoding_types.h
new file mode 100644
index 000000000..64ed9a2b1
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/encoding_types.h
@@ -0,0 +1,43 @@
+#ifndef SIMDUTF_ENCODING_TYPES_H
+#define SIMDUTF_ENCODING_TYPES_H
+// size_t and uint8_t are used below, so pull in their headers directly instead
+// of relying on whatever the including file happened to include first.
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+namespace simdutf {
+
+// Each encoding has a distinct power-of-two value; `unspecified` is zero.
+enum encoding_type {
+ UTF8 = 1, // BOM 0xef 0xbb 0xbf
+ UTF16_LE = 2, // BOM 0xff 0xfe
+ UTF16_BE = 4, // BOM 0xfe 0xff
+ UTF32_LE = 8, // BOM 0xff 0xfe 0x00 0x00
+ UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff
+ Latin1 = 32,
+
+ unspecified = 0
+};
+
+enum endianness { LITTLE = 0, BIG = 1 };
+
+// NOTE(review): presumably reports whether `e` is the endianness of the host;
+// confirm against the definition.
+bool match_system(endianness e);
+
+// NOTE(review): presumably returns a printable name for the encoding; confirm
+// against the definition.
+std::string to_string(encoding_type bom);
+
+// Note that BOM for UTF8 is discouraged.
+namespace BOM {
+
+/**
+ * Checks for a BOM. If none is found, returns unspecified.
+ * @param byte the string to process
+ * @param length the length of the string in code units
+ * @return the corresponding encoding
+ */
+
+encoding_type check_bom(const uint8_t *byte, size_t length);
+encoding_type check_bom(const char *byte, size_t length);
+/**
+ * Returns the size, in bytes, of the BOM for a given encoding type.
+ * Note that UTF8 BOM are discouraged.
+ * @param bom the encoding type
+ * @return the size in bytes of the corresponding BOM
+ */
+size_t bom_byte_size(encoding_type bom);
+
+} // namespace BOM
+} // namespace simdutf
+
+#endif // SIMDUTF_ENCODING_TYPES_H
diff --git a/contrib/simdutf/include/simdutf/error.h b/contrib/simdutf/include/simdutf/error.h
new file mode 100644
index 000000000..cd8280299
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/error.h
@@ -0,0 +1,69 @@
+#ifndef SIMDUTF_ERROR_H
+#define SIMDUTF_ERROR_H
+namespace simdutf {
+
+// Error codes reported by the validation, transcoding and base64 functions.
+enum error_code {
+ SUCCESS = 0,
+ HEADER_BITS, // Any byte must have fewer than 5 header bits.
+ TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes,
+ // where N is the UTF-8 character length. This is also the error
+ // when the input is truncated.
+ TOO_LONG, // We either have too many consecutive continuation bytes or the
+ // string starts with a continuation byte.
+ OVERLONG, // The decoded character must be above U+7F for two-byte characters,
+ // U+7FF for three-byte characters, and U+FFFF for four-byte
+ // characters.
+ TOO_LARGE, // The decoded character must be less than or equal to
+ // U+10FFFF, less than or equal to U+7F for ASCII, OR less than
+ // or equal to U+FF for Latin1.
+ SURROGATE, // The decoded character must not be in U+D800...DFFF (UTF-8 or
+ // UTF-32) OR a high surrogate must be followed by a low surrogate
+ // and a low surrogate must be preceded by a high surrogate
+ // (UTF-16) OR there must be no surrogate at all (Latin1).
+ INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid
+ // base64 string. This may include a misplaced
+ // padding character ('=').
+ BASE64_INPUT_REMAINDER, // The base64 input terminates with a single
+ // character, excluding padding (=).
+ BASE64_EXTRA_BITS, // The base64 input terminates with non-zero
+ // padding bits.
+ OUTPUT_BUFFER_TOO_SMALL, // The provided buffer is too small.
+ OTHER // Not related to validation/transcoding.
+};
+
+// Outcome of an operation: an error code plus a single position/count.
+struct result {
+ error_code error;
+ // On error: the position of the error. On success: the number of code units
+ // validated/written.
+ size_t count;
+
+ // Default: success with a count of zero.
+ simdutf_really_inline result() : error(error_code::SUCCESS), count(0) {}
+
+ simdutf_really_inline result(error_code err, size_t pos)
+ : error(err), count(pos) {}
+};
+
+// Outcome of an operation that tracks input and output positions separately.
+struct full_result {
+ error_code error;
+ size_t input_count;
+ size_t output_count;
+
+ // Default: success with both counts at zero.
+ simdutf_really_inline full_result()
+ : error(error_code::SUCCESS), input_count(0), output_count(0) {}
+
+ simdutf_really_inline full_result(error_code err, size_t pos_in,
+ size_t pos_out)
+ : error(err), input_count(pos_in), output_count(pos_out) {}
+
+ // Collapse to a `result`: SUCCESS and BASE64_INPUT_REMAINDER carry the output
+ // count, every other error carries the input count.
+ simdutf_really_inline operator result() const noexcept {
+ const bool use_output = error == error_code::SUCCESS ||
+ error == error_code::BASE64_INPUT_REMAINDER;
+ return result{error, use_output ? output_count : input_count};
+ }
+};
+
+} // namespace simdutf
+#endif
diff --git a/contrib/simdutf/include/simdutf/implementation.h b/contrib/simdutf/include/simdutf/implementation.h
new file mode 100644
index 000000000..767e3a32c
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/implementation.h
@@ -0,0 +1,3716 @@
+#ifndef SIMDUTF_IMPLEMENTATION_H
+#define SIMDUTF_IMPLEMENTATION_H
+#include <string>
+#if !defined(SIMDUTF_NO_THREADS)
+ #include <atomic>
+#endif
+#include <tuple>
+#include <vector>
+#include "simdutf/common_defs.h"
+#include "simdutf/internal/isadetection.h"
+
+namespace simdutf {
+
+/**
+ * Autodetect the encoding of the input; a single encoding is returned.
+ * E.g., the function might return simdutf::encoding_type::UTF8,
+ * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
+ * simdutf::encoding_type::UTF32_LE.
+ *
+ * @param input the string to analyze.
+ * @param length the length of the string in bytes.
+ * @return the detected encoding type
+ */
+simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(const char *input, size_t length) noexcept;
+simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(const uint8_t *input, size_t length) noexcept {
+ return autodetect_encoding(reinterpret_cast<const char *>(input), length);
+}
+
+/**
+ * Autodetect the possible encodings of the input in one pass.
+ * E.g., if the input might be UTF-16LE or UTF-8, this function returns
+ * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
+ *
+ * Overridden by each implementation.
+ *
+ * @param input the string to analyze.
+ * @param length the length of the string in bytes.
+ * @return the possible encodings, as a bitwise OR of simdutf::encoding_type values
+ */
+simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) noexcept;
+simdutf_really_inline simdutf_warn_unused int
+detect_encodings(const uint8_t *input, size_t length) noexcept {
+ return detect_encodings(reinterpret_cast<const char *>(input), length);
+}
+
+/**
+ * Validate the UTF-8 string. This function may be best when you expect
+ * the input to be almost always valid. Otherwise, consider using
+ * validate_utf8_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the UTF-8 string to validate.
+ * @param len the length of the string in bytes.
+ * @return true if and only if the string is valid UTF-8.
+ */
+simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
+
+/**
+ * Validate the UTF-8 string and stop on error.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the UTF-8 string to validate.
+ * @param len the length of the string in bytes.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the ASCII string.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the ASCII string to validate.
+ * @param len the length of the string in bytes.
+ * @return true if and only if the string is valid ASCII.
+ */
+simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
+
+/**
+ * Validate the ASCII string and stop on error. It might be faster than
+ * validate_ascii when an error is expected to occur early.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the ASCII string to validate.
+ * @param len the length of the string in bytes.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
+ size_t len) noexcept;
+
+/**
+ * Using native endianness; Validate the UTF-16 string.
+ * This function may be best when you expect the input to be almost always
+ * valid. Otherwise, consider using validate_utf16_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16 string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return true if and only if the string is valid UTF-16.
+ */
+simdutf_warn_unused bool validate_utf16(const char16_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the UTF-16LE string. This function may be best when you expect
+ * the input to be almost always valid. Otherwise, consider using
+ * validate_utf16le_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16LE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return true if and only if the string is valid UTF-16LE.
+ */
+simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the UTF-16BE string. This function may be best when you expect
+ * the input to be almost always valid. Otherwise, consider using
+ * validate_utf16be_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16BE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return true if and only if the string is valid UTF-16BE.
+ */
+simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Using native endianness; Validate the UTF-16 string and stop on error.
+ * It might be faster than validate_utf16 when an error is expected to occur
+ * early.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16 string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the UTF-16LE string and stop on error. It might be faster than
+ * validate_utf16le when an error is expected to occur early.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16LE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the UTF-16BE string and stop on error. It might be faster than
+ * validate_utf16be when an error is expected to occur early.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16BE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the UTF-32 string. This function may be best when you expect
+ * the input to be almost always valid. Otherwise, consider using
+ * validate_utf32_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-32 string to validate.
+ * @param len the length of the string in number of 4-byte code units
+ * (char32_t).
+ * @return true if and only if the string is valid UTF-32.
+ */
+simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Validate the UTF-32 string and stop on error. It might be faster than
+ * validate_utf32 when an error is expected to occur early.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-32 string to validate.
+ * @param len the length of the string in number of 4-byte code units
+ * (char32_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
+ size_t len) noexcept;
+
+/**
+ * Convert Latin1 string into UTF8 string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf8_output the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
+ size_t length,
+ char *utf8_output) noexcept;
+
+/**
+ * Convert Latin1 string into UTF8 string with output limit.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf8_output the pointer to buffer that can hold conversion result
+ * @param utf8_len the maximum output length
+ * @return the number of written char; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t
+convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
+ size_t utf8_len) noexcept;
+
+/**
+ * Convert Latin1 string into UTF-16LE string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert Latin1 string into UTF-16BE string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert Latin1 string into UTF-32 string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param latin1_output the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if the input was not valid UTF-8 string
+ * or if it cannot be represented as Latin1
+ */
+simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
+ size_t length,
+ char *latin1_output) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
+ * string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if the input was not valid UTF-8
+ * string
+ */
+simdutf_warn_unused size_t convert_utf8_to_utf16(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Using native endianness, convert a Latin1 string into a UTF-16 string.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t.
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf16(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-16LE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if the input was not valid UTF-8
+ * string
+ */
+simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-16BE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if the input was not valid UTF-8
+ * string
+ */
+simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into latin1 string with errors.
+ * If the string cannot be represented as Latin1, an error
+ * code is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param latin1_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *input, size_t length, char *latin1_output) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-8 string into UTF-16
+ * string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *input, size_t length, char16_t *utf16_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-32 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_output the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t; 0 if the input was not valid UTF-8
+ * string
+ */
+simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *input, size_t length, char32_t *utf32_output) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *input, size_t length, char32_t *utf32_output) noexcept;
+
+/**
+ * Convert valid UTF-8 string into latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-8 and that it can be
+ * represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf8_to_latin1 instead. The function may be removed from the library
+ * in the future.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param latin1_output the pointer to buffer that can hold conversion result
+ * @return the number of written char; the value is implementation defined if
+ * the input violates the assumptions above (no validation is promised)
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *input, size_t length, char *latin1_output) noexcept;
+
+/**
+ * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
+ const char *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert valid UTF-8 string into UTF-16LE string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert valid UTF-8 string into UTF-16BE string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert valid UTF-8 string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Return the number of bytes that this Latin1 string would require in UTF-8
+ * format.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @return the number of bytes required to encode the Latin1 string as UTF-8
+ */
+simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of bytes that this UTF-8 string would require in Latin1
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-8 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @return the number of bytes required to encode the UTF-8 string as Latin1
+ */
+simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of 2-byte code units that this UTF-8 string would require
+ * in UTF-16 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-8 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return the number of char16_t code units required to encode the UTF-8 string
+ * as UTF-16
+ */
+simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of 4-byte code units that this UTF-8 string would require
+ * in UTF-32 format.
+ *
+ * This function is equivalent to count_utf8
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-8 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return the number of char32_t code units required to encode the UTF-8 string
+ * as UTF-32
+ */
+simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
+ size_t length) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into UTF-8
+ * string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16
+ * string
+ */
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
+ size_t length,
+ char *utf8_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into Latin1
+ * string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16 string
+ * or if it cannot be represented as Latin1
+ */
+simdutf_warn_unused size_t convert_utf16_to_latin1(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into Latin1 string.
+ * If the string cannot be represented as Latin1, 0 is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string or if it cannot be represented as Latin1
+ */
+simdutf_warn_unused size_t convert_utf16le_to_latin1(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16BE string into Latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string or if it cannot be represented as Latin1
+ */
+simdutf_warn_unused size_t convert_utf16be_to_latin1(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string
+ */
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
+ size_t length,
+ char *utf8_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16BE string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string
+ */
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
+ size_t length,
+ char *utf8_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into Latin1
+ * string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into Latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16BE string into Latin1 string.
+ * If the string cannot be represented as Latin1, an error
+ * is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into UTF-8
+ * string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
+ const char16_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Using native endianness, convert valid UTF-16 string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-16 (native
+ * endianness).
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
+ const char16_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Using native endianness, convert UTF-16 string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-16 and that it can
+ * be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf16_to_latin1 instead. The function may be removed from the library
+ * in the future.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert valid UTF-16LE string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-16LE and that it can
+ * be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf16le_to_latin1 instead. The function may be removed from the
+ * library in the future.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert valid UTF-16BE string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-16BE and that it can
+ * be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf16be_to_latin1 instead. The function may be removed from the
+ * library in the future.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
+ const char16_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert valid UTF-16LE string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-16LE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Convert valid UTF-16BE string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-16BE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into UTF-32
+ * string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16
+ * string
+ */
+simdutf_warn_unused size_t convert_utf16_to_utf32(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into UTF-32 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string
+ */
+simdutf_warn_unused size_t convert_utf16le_to_utf32(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16BE string into UTF-32 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string
+ */
+simdutf_warn_unused size_t convert_utf16be_to_utf32(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-16 string into
+ * UTF-32 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Using native endianness, convert valid UTF-16 string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-16 (native
+ * endianness).
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert valid UTF-16LE string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-16LE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Convert valid UTF-16BE string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-16BE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
+ const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
+
+/**
+ * Compute the number of bytes that this UTF-16LE/BE string would require in
+ * Latin1 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of bytes required to encode the UTF-16LE string as Latin1
+ */
+simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
+
+/**
+ * Using native endianness; Compute the number of bytes that this UTF-16
+ * string would require in UTF-8 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of bytes required to encode the UTF-16 string as UTF-8
+ */
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of bytes that this UTF-16LE string would require in UTF-8
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of bytes required to encode the UTF-16LE string as UTF-8
+ */
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of bytes that this UTF-16BE string would require in UTF-8
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of bytes required to encode the UTF-16BE string as UTF-8
+ */
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32 string
+ */
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
+ size_t length,
+ char *utf8_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Convert valid UTF-32 string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *input, size_t length, char *utf8_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
+ * string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32 string
+ */
+simdutf_warn_unused size_t convert_utf32_to_utf16(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into UTF-16LE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32 string
+ */
+simdutf_warn_unused size_t convert_utf32_to_utf16le(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into Latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32 string
+ * or if it cannot be represented as Latin1
+ */
+simdutf_warn_unused size_t convert_utf32_to_latin1(
+ const char32_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+ * If the string cannot be represented as Latin1, an error is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
+ const char32_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert valid UTF-32 string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-32 and that it can
+ * be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf32_to_latin1 instead. The function may be removed from the library
+ * in the future.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param latin1_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
+ const char32_t *input, size_t length, char *latin1_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into UTF-16BE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32 string
+ */
+simdutf_warn_unused size_t convert_utf32_to_utf16be(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Using native endianness, convert possibly broken UTF-32 string into UTF-16
+ * string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert valid UTF-32 string into UTF-16LE string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Convert valid UTF-32 string into UTF-16BE string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
+ const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
+
+/**
+ * Change the endianness of the input. Can be used to go from UTF-16LE to
+ * UTF-16BE or from UTF-16BE to UTF-16LE.
+ *
+ * This function does not validate the input.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @param output the pointer to buffer that can hold the conversion
+ * result
+ */
+void change_endianness_utf16(const char16_t *input, size_t length,
+ char16_t *output) noexcept;
+
+/**
+ * Compute the number of bytes that this UTF-32 string would require in UTF-8
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-32 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @return the number of bytes required to encode the UTF-32 string as UTF-8
+ */
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of two-byte code units that this UTF-32 string would
+ * require in UTF-16 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-32 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units (char32_t)
+ * @return the number of two-byte code units required to encode the UTF-32
+ * string as UTF-16
+ */
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
+ size_t length) noexcept;
+
+/**
+ * Using native endianness, compute the number of 4-byte code units (char32_t)
+ * that this UTF-16 string would require in UTF-32 format.
+ *
+ * This function is equivalent to count_utf16.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of 4-byte code units (char32_t) required to encode the
+ * UTF-16 string as UTF-32
+ */
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string
+ * would require in UTF-32 format.
+ *
+ * This function is equivalent to count_utf16le.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of 4-byte code units (char32_t) required to encode the
+ * UTF-16LE string as UTF-32
+ */
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string
+ * would require in UTF-32 format.
+ *
+ * This function is equivalent to count_utf16be.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the number of 4-byte code units (char32_t) required to encode the
+ * UTF-16BE string as UTF-32
+ */
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-16 (native
+ * endianness). It is acceptable to pass invalid UTF-16 strings but in such
+ * cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return number of code points
+ */
+simdutf_warn_unused size_t count_utf16(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-16LE.
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return number of code points
+ */
+simdutf_warn_unused size_t count_utf16le(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-16BE.
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return number of code points
+ */
+simdutf_warn_unused size_t count_utf16be(const char16_t *input,
+ size_t length) noexcept;
+
+/**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ * It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return number of code points
+ */
+simdutf_warn_unused size_t count_utf8(const char *input,
+ size_t length) noexcept;
+
+/**
+ * Given a valid UTF-8 string having a possibly truncated last character,
+ * this function checks the end of string. If the last character is truncated
+ * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
+ * that the short UTF-8 strings only contain complete characters. If there is no
+ * truncated character, the original length is returned.
+ *
+ * This function assumes that the input string is valid UTF-8, but possibly
+ * truncated.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
+ */
+simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
+
+/**
+ * Given a valid UTF-16BE string having a possibly truncated last character,
+ * this function checks the end of string. If the last character is truncated
+ * (or partial), then it returns a shorter length (shorter by 1 unit) so that
+ * the short UTF-16BE strings only contain complete characters. If there is no
+ * truncated character, the original length is returned.
+ *
+ * This function assumes that the input string is valid UTF-16BE, but possibly
+ * truncated.
+ *
+ * @param input the UTF-16BE string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the length of the string in code units, possibly shorter by 1 unit
+ */
+simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
+ size_t length);
+
+/**
+ * Given a valid UTF-16LE string having a possibly truncated last character,
+ * this function checks the end of string. If the last character is truncated
+ * (or partial), then it returns a shorter length (shorter by 1 unit) so that
+ * the short UTF-16LE strings only contain complete characters. If there is no
+ * truncated character, the original length is returned.
+ *
+ * This function assumes that the input string is valid UTF-16LE, but possibly
+ * truncated.
+ *
+ * @param input the UTF-16LE string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the length of the string in code units, possibly shorter by 1 unit
+ */
+simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
+ size_t length);
+
+/**
+ * Given a valid UTF-16 string having a possibly truncated last character,
+ * this function checks the end of string. If the last character is truncated
+ * (or partial), then it returns a shorter length (shorter by 1 unit) so that
+ * the short UTF-16 strings only contain complete characters. If there is no
+ * truncated character, the original length is returned.
+ *
+ * This function assumes that the input string is valid UTF-16, but possibly
+ * truncated. We use the native endianness.
+ *
+ * @param input the UTF-16 string to process
+ * @param length the length of the string in 2-byte code units (char16_t)
+ * @return the length of the string in code units, possibly shorter by 1 unit
+ */
+simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
+ size_t length);
+
+/**
+ * base64_options are used to specify the base64 encoding options.
+ *
+ * base64_default and base64_url select the variant; base64_reverse_padding is
+ * a modifier bit that is OR-ed with a variant to invert that variant's padding
+ * convention (see the two combined values below).
+ */
+enum base64_options : uint64_t {
+ base64_default = 0, /* standard base64 format (with padding) */
+ base64_url = 1, /* base64url format (no padding) */
+ base64_reverse_padding = 2, /* modifier: inverts the padding convention of
+ base64_default or base64_url */
+ base64_default_no_padding =
+ base64_default |
+ base64_reverse_padding, /* standard base64 format without padding */
+ base64_url_with_padding =
+ base64_url | base64_reverse_padding, /* base64url with padding */
+};
+
+// last_chunk_handling_options are used to specify how base64 decoding treats
+// a partial last chunk (2 or 3 characters). The options follow:
+// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
+enum last_chunk_handling_options : uint64_t {
+ loose = 0, /* standard base64 format, decode partial final chunk */
+ strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and
+ unpadded, or non-zero bit padding */
+ stop_before_partial =
+ 2, /* if the last chunk is partial (2 or 3 chars), ignore it (no error) */
+};
+
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input the base64 input to process
+ * @param length the length of the base64 input in bytes
+ * @return maximum number of binary bytes
+ */
+simdutf_warn_unused size_t
+maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
+
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input the base64 input to process, in ASCII stored as 16-bit
+ * units
+ * @param length the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it
+ * will ignore any ASCII spaces in the input. You may provide a padded input
+ * (with one or two equal signs at the end) or an unpadded input (without any
+ * equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are two possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single remainder
+ * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
+ * not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
+ * input where the invalid character was found. When the error is
+ * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
+ *
+ * The default option (simdutf::base64_default) expects the characters `+` and
+ * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
+ * characters `-` and `_` as part of its alphabet.
+ *
+ * The padding (`=`) is validated if present. There may be at most two padding
+ * characters at the end of the input. If there are any padding characters, the
+ * total number of characters (excluding spaces but including padding
+ * characters) must be divisible by four.
+ *
+ * You should call this function with a buffer that is at least
+ * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
+ * provide that much space, the function may cause a buffer overflow.
+ *
+ * Advanced users may want to tailor how the last chunk is handled. By default,
+ * we use a loose (forgiving) approach but we also support a strict approach
+ * as well as a stop_before_partial approach, as per the following proposal:
+ *
+ * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
+ *
+ * @param input the base64 string to process
+ * @param length the length of the string in bytes
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least maximal_binary_length_from_base64(input, length)
+ * bytes long).
+ * @param options the base64 options to use, usually base64_default or
+ * base64_url, and base64_default by default.
+ * @param last_chunk_options the last chunk handling options,
+ * last_chunk_handling_options::loose by default
+ * but can also be last_chunk_handling_options::strict or
+ * last_chunk_handling_options::stop_before_partial.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in bytes) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output,
+ base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options = loose) noexcept;
+
+/**
+ * Provide the base64 length in bytes given the length of a binary input.
+ *
+ * @param length the length of the input in bytes
+ * @param options the base64 options to use, is base64_default by default.
+ * @return number of base64 bytes
+ */
+simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options = base64_default) noexcept;
+
+/**
+ * Convert a binary input to a base64 output.
+ *
+ * The default option (simdutf::base64_default) uses the characters `+` and `/`
+ * as part of its alphabet. Further, it adds padding (`=`) at the end of the
+ * output to ensure that the output length is a multiple of four.
+ *
+ * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
+ * of its alphabet. No padding is added at the end of the output.
+ *
+ * This function always succeeds.
+ *
+ * @param input the binary to process
+ * @param length the length of the input in bytes
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least base64_length_from_binary(length, options) bytes
+ * long)
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return number of written bytes, will be equal to
+ * base64_length_from_binary(length, options)
+ */
+size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options = base64_default) noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it
+ * will ignore any ASCII spaces in the input. You may provide a padded input
+ * (with one or two equal signs at the end) or an unpadded input (without any
+ * equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are two possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single remainder
+ * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
+ * not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
+ * input where the invalid character was found. When the error is
+ * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
+ *
+ * The default option (simdutf::base64_default) expects the characters `+` and
+ * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
+ * characters `-` and `_` as part of its alphabet.
+ *
+ * The padding (`=`) is validated if present. There may be at most two padding
+ * characters at the end of the input. If there are any padding characters, the
+ * total number of characters (excluding spaces but including padding
+ * characters) must be divisible by four.
+ *
+ * You should call this function with a buffer that is at least
+ * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
+ * provide that much space, the function may cause a buffer overflow.
+ *
+ * Advanced users may want to tailor how the last chunk is handled. By default,
+ * we use a loose (forgiving) approach but we also support a strict approach
+ * as well as a stop_before_partial approach, as per the following proposal:
+ *
+ * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
+ *
+ * @param input the base64 string to process, in ASCII stored as 16-bit
+ * units
+ * @param length the length of the string in 16-bit units
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least maximal_binary_length_from_base64(input, length)
+ * bytes long).
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @param last_chunk_options the last chunk handling options,
+ * last_chunk_handling_options::loose by default
+ * but can also be last_chunk_handling_options::strict or
+ * last_chunk_handling_options::stop_before_partial.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and position of the
+ * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
+ * of bytes written if successful.
+ */
+simdutf_warn_unused result
+base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it
+ * will ignore any ASCII spaces in the input. You may provide a padded input
+ * (with one or two equal signs at the end) or an unpadded input (without any
+ * equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are three possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single remainder
+ * character (BASE64_INPUT_REMAINDER), the input contains a character that is
+ * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
+ * is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
+ * and the number of units processed, see description of the parameters and
+ * returned value.
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
+ * input where the invalid character was found. When the error is
+ * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
+ *
+ * The default option (simdutf::base64_default) expects the characters `+` and
+ * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
+ * characters `-` and `_` as part of its alphabet.
+ *
+ * The padding (`=`) is validated if present. There may be at most two padding
+ * characters at the end of the input. If there are any padding characters, the
+ * total number of characters (excluding spaces but including padding
+ * characters) must be divisible by four.
+ *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
+ * to discard the output.
+ *
+ * Advanced users may want to tailor how the last chunk is handled. By default,
+ * we use a loose (forgiving) approach but we also support a strict approach
+ * as well as a stop_before_partial approach, as per the following proposal:
+ *
+ * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
+ *
+ * @param input the base64 string to process, in ASCII stored as 8-bit
+ * or 16-bit units
+ * @param length the length of the string in 8-bit or 16-bit units.
+ * @param output the pointer to buffer that can hold the conversion
+ * result.
+ * @param outlen the number of bytes that can be written in the output
+ * buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @param last_chunk_options the last chunk handling options,
+ * last_chunk_handling_options::loose by default
+ * but can also be last_chunk_handling_options::strict or
+ * last_chunk_handling_options::stop_before_partial.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and position of the
+ * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
+ * of units processed if successful.
+ */
+simdutf_warn_unused result
+base64_to_binary_safe(const char *input, size_t length, char *output,
+ size_t &outlen, base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) noexcept;
+simdutf_warn_unused result
+base64_to_binary_safe(const char16_t *input, size_t length, char *output,
+ size_t &outlen, base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) noexcept;
+
+/**
+ * An implementation of simdutf for a particular CPU architecture.
+ *
+ * Also used to maintain the currently active implementation. The active
+ * implementation is automatically initialized on first use to the most advanced
+ * implementation supported by the host.
+ */
+class implementation {
+public:
+ /**
+ * The name of this implementation.
+ *
+ * const implementation *impl = simdutf::active_implementation;
+ * cout << "simdutf is optimized for " << impl->name() << "(" <<
+ * impl->description() << ")" << endl;
+ *
+ * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
+ */
+ virtual std::string name() const { return std::string(_name); }
+
+ /**
+ * The description of this implementation.
+ *
+ * const implementation *impl = simdutf::active_implementation;
+ * cout << "simdutf is optimized for " << impl->name() << "(" <<
+ * impl->description() << ")" << endl;
+ *
+ * @return the description of the implementation, returned as a new
+ * std::string built from the stored description
+ */
+ virtual std::string description() const { return std::string(_description); }
+
+ /**
+ * Check whether the instruction sets this implementation is compiled against
+ * are available on the current CPU. This function may poll the current
+ * CPU/system and should therefore not be called too often if performance is a
+ * concern.
+ *
+ * @return true if the implementation can be safely used on the current system
+ * (determined at runtime)
+ */
+ bool supported_by_runtime_system() const;
+
+ /**
+ * This function will try to detect the encoding
+ * @param input the string to identify
+ * @param length the length of the string in bytes.
+ * @return the encoding type detected
+ */
+ virtual encoding_type autodetect_encoding(const char *input,
+ size_t length) const noexcept;
+
+ /**
+ * This function will try to detect the possible encodings in one pass.
+ *
+ * Overridden by each implementation.
+ *
+ * @param input the string to identify
+ * @param length the length of the string in bytes.
+ * @return the possible encodings, reported as an int. NOTE(review):
+ * presumably a bitwise combination of encoding_type values, since more than
+ * one encoding may be possible -- confirm against the implementations.
+ */
+ virtual int detect_encodings(const char *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * @private For internal implementation use
+ *
+ * The instruction sets this implementation is compiled against.
+ *
+ * @return a mask of all required `internal::instruction_set::` values
+ */
+ virtual uint32_t required_instruction_sets() const {
+ return _required_instruction_sets;
+ }
+
+ /**
+ * Validate the UTF-8 string.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the UTF-8 string to validate.
+ * @param len the length of the string in bytes.
+ * @return true if and only if the string is valid UTF-8.
+ */
+ simdutf_warn_unused virtual bool validate_utf8(const char *buf,
+ size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-8 string and stop on errors.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the UTF-8 string to validate.
+ * @param len the length of the string in bytes.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
+
+ /**
+ * Validate the ASCII string.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the ASCII string to validate.
+ * @param len the length of the string in bytes.
+ * @return true if and only if the string is valid ASCII.
+ */
+ simdutf_warn_unused virtual bool
+ validate_ascii(const char *buf, size_t len) const noexcept = 0;
+
+ /**
+ * Validate the ASCII string and stop on error.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the ASCII string to validate.
+ * @param len the length of the string in bytes.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-16LE string. This function may be best when you expect
+ * the input to be almost always valid. Otherwise, consider using
+ * validate_utf16le_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16LE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return true if and only if the string is valid UTF-16LE.
+ */
+ simdutf_warn_unused virtual bool
+ validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-16BE string. This function may be best when you expect
+ * the input to be almost always valid. Otherwise, consider using
+ * validate_utf16be_with_errors.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16BE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return true if and only if the string is valid UTF-16BE.
+ */
+ simdutf_warn_unused virtual bool
+ validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-16LE string and stop on error. It might be faster than
+ * validate_utf16le when an error is expected to occur early.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16LE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result
+ validate_utf16le_with_errors(const char16_t *buf,
+ size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-16BE string and stop on error. It might be faster than
+ * validate_utf16be when an error is expected to occur early.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16BE string to validate.
+ * @param len the length of the string in number of 2-byte code units
+ * (char16_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result
+ validate_utf16be_with_errors(const char16_t *buf,
+ size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-32 string.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-32 string to validate.
+ * @param len the length of the string in number of 4-byte code units
+ * (char32_t).
+ * @return true if and only if the string is valid UTF-32.
+ */
+ simdutf_warn_unused virtual bool
+ validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
+
+ /**
+ * Validate the UTF-32 string and stop on error.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-32 string to validate.
+ * @param len the length of the string in number of 4-byte code units
+ * (char32_t).
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result
+ validate_utf32_with_errors(const char32_t *buf,
+ size_t len) const noexcept = 0;
+
+ /**
+ * Convert Latin1 string into UTF8 string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf8_output the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_latin1_to_utf8(const char *input, size_t length,
+ char *utf8_output) const noexcept = 0;
+
+ /**
+ * Convert Latin1 string into UTF-16LE string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_latin1_to_utf16le(const char *input, size_t length,
+ char16_t *utf16_output) const noexcept = 0;
+
+ /**
+ * Convert Latin1 string into UTF-16BE string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_latin1_to_utf16be(const char *input, size_t length,
+ char16_t *utf16_output) const noexcept = 0;
+
+ /**
+ * Convert Latin1 string into UTF-32 string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_latin1_to_utf32(const char *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param latin1_output the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if the input was not valid UTF-8
+ * string or if it cannot be represented as Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf8_to_latin1(const char *input, size_t length,
+ char *latin1_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into latin1 string with errors.
+ * If the string cannot be represented as Latin1, an error
+ * code is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param latin1_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf8_to_latin1_with_errors(const char *input, size_t length,
+ char *latin1_output) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-8 string into latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-8 and that it can
+ * be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf8_to_latin1 instead.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param latin1_output the pointer to buffer that can hold conversion result
+ * @return the number of written char; as stated above, the result is
+ * implementation defined if the input is not valid UTF-8 or cannot be
+ * represented as Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf8_to_latin1(const char *input, size_t length,
+ char *latin1_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into UTF-16LE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if the input was not valid UTF-8
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf8_to_utf16le(const char *input, size_t length,
+ char16_t *utf16_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into UTF-16BE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if the input was not valid UTF-8
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf8_to_utf16be(const char *input, size_t length,
+ char16_t *utf16_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
+ const char *input, size_t length,
+ char16_t *utf16_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of code units validated
+ * if successful.
+ */
+ simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
+ const char *input, size_t length,
+ char16_t *utf16_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into UTF-32 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_output the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t; 0 if the input was not valid UTF-8
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf8_to_utf32(const char *input, size_t length,
+ char32_t *utf32_output) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_output the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf8_to_utf32_with_errors(const char *input, size_t length,
+ char32_t *utf32_output) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-8 string into UTF-16LE string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf8_to_utf16le(const char *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-8 string into UTF-16BE string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf8_to_utf16be(const char *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-8 string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf8_to_utf32(const char *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Compute the number of 2-byte code units that this UTF-8 string would
+ * require in UTF-16LE format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-8 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return the number of char16_t code units required to encode the UTF-8
+ * string as UTF-16LE
+ */
+ simdutf_warn_unused virtual size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of 4-byte code units that this UTF-8 string would
+ * require in UTF-32 format.
+ *
+ * This function is equivalent to count_utf8. It is acceptable to pass invalid
+ * UTF-8 strings but in such cases the result is implementation defined.
+ *
+ * This function does not validate the input.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return the number of char32_t code units required to encode the UTF-8
+ * string as UTF-32
+ */
+ simdutf_warn_unused virtual size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16LE string into Latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string or if it cannot be represented as Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf16le_to_latin1(const char16_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16BE string into Latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string or if it cannot be represented as Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf16be_to_latin1(const char16_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16LE string into Latin1 string.
+ * If the string cannot be represented as Latin1, an error
+ * is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16BE string into Latin1 string.
+ * If the string cannot be represented as Latin1, an error
+ * is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-16LE string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-16LE and that it
+ * can be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf16le_to_latin1 instead.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-16BE string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-16BE and that it
+ * can be represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf16be_to_latin1 instead.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16LE string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf16le_to_utf8(const char16_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16BE string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf16be_to_utf8(const char16_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-16LE string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-16LE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-16BE string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-16BE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16LE string into UTF-32 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16LE
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf16le_to_utf32(const char16_t *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16BE string into UTF-32 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-16BE
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf16be_to_utf32(const char16_t *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
+ const char16_t *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char32_t written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
+ const char16_t *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-16LE string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-16LE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-16BE string into UTF-32 string.
+ *
+ * This function assumes that the input string is valid UTF-16BE.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param utf32_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
+ char32_t *utf32_buffer) const noexcept = 0;
+
+ /**
+ * Compute the number of bytes that this UTF-16LE string would require in
+ * UTF-8 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @return the number of bytes required to encode the UTF-16LE string as UTF-8
+ */
+ simdutf_warn_unused virtual size_t
+ utf8_length_from_utf16le(const char16_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of bytes that this UTF-16BE string would require in
+ * UTF-8 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @return the number of bytes required to encode the UTF-16BE string as UTF-8
+ */
+ simdutf_warn_unused virtual size_t
+ utf8_length_from_utf16be(const char16_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into Latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return number of written code units; 0 if input is not a valid UTF-32
+ * string
+ */
+
+ simdutf_warn_unused virtual size_t
+ convert_utf32_to_latin1(const char32_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+ * If the string cannot be represented as Latin1, an error is returned.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param latin1_buffer the pointer to buffer that can hold conversion
+ * result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-32 string into Latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-32 and can be
+ * represented as Latin1. If you violate this assumption, the result is
+ * implementation defined and may include system-dependent behavior such as
+ * crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use
+ * convert_utf32_to_latin1 instead.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param latin1_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
+ char *latin1_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf32_to_utf8(const char32_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf8_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-32 string into UTF-8 string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf8_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
+ char *utf8_buffer) const noexcept = 0;
+
+ /**
+ * Compute the number of 2-byte code units (char16_t) that a Latin1 string
+ * of the given length would require in UTF-16 format.
+ *
+ * Only the length is needed; the string content is not inspected.
+ *
+ * @param length the length of the Latin1 string in bytes
+ * @return the number of 2-byte code units (char16_t) required to encode the
+ * Latin1 string as UTF-16
+ */
+ simdutf_warn_unused virtual size_t
+ utf16_length_from_latin1(size_t length) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into UTF-16LE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf32_to_utf16le(const char32_t *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into UTF-16BE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return number of written code units; 0 if input is not a valid UTF-32
+ * string
+ */
+ simdutf_warn_unused virtual size_t
+ convert_utf32_to_utf16be(const char32_t *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
+ const char32_t *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
+ * error.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in code units) if any, or the number of char16_t written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
+ const char32_t *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-32 string into UTF-16LE string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Convert valid UTF-32 string into UTF-16BE string.
+ *
+ * This function assumes that the input string is valid UTF-32.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @param utf16_buffer the pointer to buffer that can hold the conversion
+ * result
+ * @return number of written code units; 0 if conversion is not possible
+ */
+ simdutf_warn_unused virtual size_t
+ convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
+ char16_t *utf16_buffer) const noexcept = 0;
+
+ /**
+ * Change the endianness of the input. Can be used to go from UTF-16LE to
+ * UTF-16BE or from UTF-16BE to UTF-16LE.
+ *
+ * This function does not validate the input.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16 string to process
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @param output the pointer to buffer that can hold the conversion
+ * result
+ */
+ virtual void change_endianness_utf16(const char16_t *input, size_t length,
+ char16_t *output) const noexcept = 0;
+
+ /**
+ * Return the number of bytes that this Latin1 string would require in UTF-8
+ * format.
+ *
+ * @param input the Latin1 string to convert
+ * @param length the length of the string in bytes
+ * @return the number of bytes required to encode the Latin1 string as UTF-8
+ */
+ simdutf_warn_unused virtual size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of bytes that this UTF-32 string would require in UTF-8
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-32 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @return the number of bytes required to encode the UTF-32 string as UTF-8
+ */
+ simdutf_warn_unused virtual size_t
+ utf8_length_from_utf32(const char32_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of bytes that this UTF-32 string would require in Latin1
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-32 strings but in such cases the result is implementation defined.
+ *
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @return the number of bytes required to encode the UTF-32 string as Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ latin1_length_from_utf32(size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of bytes that this UTF-8 string would require in Latin1
+ * format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-8 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @return the number of bytes required to encode the UTF-8 string as Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of bytes that this UTF-16LE/BE string would require in
+ * Latin1 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param length the length of the UTF-16LE/BE string in 2-byte code units
+ * (char16_t); this is the only argument, the string content is not
+ * inspected
+ * @return the number of bytes required to encode the UTF-16LE/BE string as
+ * Latin1
+ */
+ simdutf_warn_unused virtual size_t
+ latin1_length_from_utf16(size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of two-byte code units that this UTF-32 string would
+ * require in UTF-16 format.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-32 strings but in such cases the result is implementation defined.
+ *
+ * @param input the UTF-32 string to convert
+ * @param length the length of the string in 4-byte code units
+ * (char32_t)
+ * @return the number of 2-byte code units required to encode it as UTF-16
+ */
+ simdutf_warn_unused virtual size_t
+ utf16_length_from_utf32(const char32_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of 4-byte code units (char32_t) that a Latin1 string
+ * of the given length would require in UTF-32 format.
+ *
+ * @param length the length of the Latin1 string in bytes (only the length is
+ * needed; the string content is not inspected)
+ * @return the number of 4-byte code units (char32_t) required to encode the
+ * Latin1 string as UTF-32
+ */
+ simdutf_warn_unused virtual size_t
+ utf32_length_from_latin1(size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
+ * string would require in UTF-32 format.
+ *
+ * This function is equivalent to count_utf16le.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @return the number of 4-byte code units (char32_t) required to encode the
+ * UTF-16LE string as UTF-32
+ */
+ simdutf_warn_unused virtual size_t
+ utf32_length_from_utf16le(const char16_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
+ * string would require in UTF-32 format.
+ *
+ * This function is equivalent to count_utf16be.
+ *
+ * This function does not validate the input. It is acceptable to pass invalid
+ * UTF-16 strings but in such cases the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to convert
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @return the number of 4-byte code units (char32_t) required to encode the
+ * UTF-16BE string as UTF-32
+ */
+ simdutf_warn_unused virtual size_t
+ utf32_length_from_utf16be(const char16_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-16LE.
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16LE string to process
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @return number of code points
+ */
+ simdutf_warn_unused virtual size_t
+ count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
+
+ /**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-16BE.
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-16BE string to process
+ * @param length the length of the string in 2-byte code units
+ * (char16_t)
+ * @return number of code points
+ */
+ simdutf_warn_unused virtual size_t
+ count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
+
+ /**
+ * Count the number of code points (characters) in the string assuming that
+ * it is valid.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ * It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return number of code points
+ */
+ simdutf_warn_unused virtual size_t
+ count_utf8(const char *input, size_t length) const noexcept = 0;
+
+ /**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less
+ * than the maximum length. It is acceptable to pass invalid base64 strings
+ * but in such cases the result is implementation defined.
+ *
+ * @param input the base64 input to process
+ * @param length the length of the base64 input in bytes
+ * @return maximal number of binary bytes
+ */
+ simdutf_warn_unused virtual size_t
+ maximal_binary_length_from_base64(const char *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less
+ * than the maximum length. It is acceptable to pass invalid base64 strings
+ * but in such cases the result is implementation defined.
+ *
+ * @param input the base64 input to process, in ASCII stored as 16-bit
+ * units
+ * @param length the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+ simdutf_warn_unused virtual size_t
+ maximal_binary_length_from_base64(const char16_t *input,
+ size_t length) const noexcept = 0;
+
+ /**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that
+ * it will ignore any ASCII spaces in the input. You may provide a padded
+ * input (with one or two equal signs at the end) or an unpadded input
+ * (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are two possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single
+ * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
+ * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * You should call this function with a buffer that is at least
+ * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
+ * provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input the base64 string to process
+ * @param length the length of the string in bytes
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least maximal_binary_length_from_base64(input, length)
+ * bytes long).
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and either position of the error
+ * (in the input in bytes) if any, or the number of bytes written if
+ * successful.
+ */
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char *input, size_t length, char *output,
+ base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept = 0;
+
+ /**
+ * Convert a base64 input to a binary output while returning more details
+ * than base64_to_binary.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that
+ * it will ignore any ASCII spaces in the input. You may provide a padded
+ * input (with one or two equal signs at the end) or an unpadded input
+ * (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are two possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single
+ * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
+ * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * You should call this function with a buffer that is at least
+ * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
+ * provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input the base64 string to process
+ * @param length the length of the string in bytes
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least maximal_binary_length_from_base64(input, length)
+ * bytes long).
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return a full_result struct (of type simdutf::full_result containing the
+ * three fields error, input_count and output_count).
+ */
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char *input, size_t length, char *output,
+ base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept = 0;
+ /**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that
+ * it will ignore any ASCII spaces in the input. You may provide a padded
+ * input (with one or two equal signs at the end) or an unpadded input
+ * (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are two possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single
+ * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
+ * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * You should call this function with a buffer that is at least
+ * maximal_binary_length_from_base64(input, length) bytes long. If you
+ * fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input the base64 string to process, in ASCII stored as
+ * 16-bit units
+ * @param length the length of the string in 16-bit units
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least maximal_binary_length_from_base64(input, length)
+ * bytes long).
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) with an error code and position of the
+ * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
+ * number of bytes written if successful.
+ */
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept = 0;
+
+ /**
+ * Convert a base64 input to a binary output while returning more details
+ * than base64_to_binary.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that
+ * it will ignore any ASCII spaces in the input. You may provide a padded
+ * input (with one or two equal signs at the end) or an unpadded input
+ * (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. When last_chunk_options =
+ * loose, there are two possible reasons for failure: the input contains a
+ * number of base64 characters that when divided by 4, leaves a single
+ * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
+ * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * You should call this function with a buffer that is at least
+ * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
+ * provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input the base64 string to process, in ASCII stored as
+ * 16-bit units
+ * @param length the length of the string in 16-bit units
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least maximal_binary_length_from_base64(input, length)
+ * bytes long).
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @param last_chunk_options the last-chunk handling mode, is
+ * last_chunk_handling_options::loose by default.
+ * @return a full_result struct (of type simdutf::full_result containing the
+ * three fields error, input_count and output_count).
+ */
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options = base64_default,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept = 0;
+ /**
+ * Provide the base64 length in bytes given the length of a binary input.
+ *
+ * @param length the length of the input in bytes
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return number of base64 bytes
+ */
+ simdutf_warn_unused virtual size_t base64_length_from_binary(
+ size_t length,
+ base64_options options = base64_default) const noexcept = 0;
+
+ /**
+ * Convert a binary input to a base64 output.
+ *
+ * The default option (simdutf::base64_default) uses the characters `+` and
+ * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
+ * the output to ensure that the output length is a multiple of four.
+ *
+ * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
+ * part of its alphabet. No padding is added at the end of the output.
+ *
+ * This function always succeeds.
+ *
+ * @param input the binary to process
+ * @param length the length of the input in bytes
+ * @param output the pointer to buffer that can hold the conversion
+ * result (should be at least base64_length_from_binary(length, options)
+ * bytes long)
+ * @param options the base64 options to use, can be base64_default or
+ * base64_url, is base64_default by default.
+ * @return number of written bytes, will be equal to
+ * base64_length_from_binary(length, options)
+ */
+ virtual size_t
+ binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options = base64_default) const noexcept = 0;
+
+protected:
+ /** @private Construct an implementation with the given name and description.
+ * For subclasses.
+ *
+ * The name and description pointers are stored as given; the strings they
+ * point to are not copied by this constructor.
+ *
+ * @param name stored in _name
+ * @param description stored in _description
+ * @param required_instruction_sets stored in _required_instruction_sets
+ */
+ simdutf_really_inline implementation(const char *name,
+ const char *description,
+ uint32_t required_instruction_sets)
+ : _name(name), _description(description),
+ _required_instruction_sets(required_instruction_sets) {}
+
+protected:
+ // Protected and not declared virtual: code outside the class hierarchy
+ // cannot destroy an object through an implementation pointer.
+ ~implementation() = default;
+
+private:
+ /**
+ * The name of this implementation.
+ */
+ const char *_name;
+
+ /**
+ * The description of this implementation.
+ */
+ const char *_description;
+
+ /**
+ * Instruction sets required for this implementation.
+ */
+ const uint32_t _required_instruction_sets;
+};
+
+/** @private */
+namespace internal {
+
+/**
+ * Enumerates the implementations that were compiled into simdutf.
+ */
+class available_implementation_list {
+public:
+  /** Create a handle on the list of compiled-in implementations. */
+  simdutf_really_inline available_implementation_list() {}
+  /** Number of implementations in the list. */
+  size_t size() const noexcept;
+  /** STL-style const iterator to the first implementation pointer. */
+  const implementation *const *begin() const noexcept;
+  /** STL-style const iterator marking the end of the list. */
+  const implementation *const *end() const noexcept;
+
+  /**
+   * Look up an implementation by name. The comparison is case sensitive.
+   *
+   *   const implementation *impl =
+   *       simdutf::get_available_implementations()["westmere"];
+   *   if (!impl) { exit(1); }
+   *   if (!impl->supported_by_runtime_system()) { exit(1); }
+   *   simdutf::get_active_implementation() = impl;
+   *
+   * @param name the implementation to find, e.g. "westmere", "haswell",
+   * "arm64"
+   * @return the matching implementation, or nullptr when no implementation in
+   * the list has that name.
+   */
+  const implementation *operator[](const std::string &name) const noexcept {
+    const implementation *const *const last = end();
+    for (const implementation *const *it = begin(); it != last; ++it) {
+      const implementation *candidate = *it;
+      if (candidate->name() == name) {
+        return candidate;
+      }
+    }
+    return nullptr;
+  }
+
+  /**
+   * Detect the most advanced implementation supported by the current host.
+   *
+   * This is used to initialize the implementation on startup.
+   *
+   *   const implementation *impl =
+   *       simdutf::get_available_implementations().detect_best_supported();
+   *   simdutf::get_active_implementation() = impl;
+   *
+   * @return the most advanced supported implementation for the current host, or
+   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
+   * supported implementation. Will never return nullptr.
+   */
+  const implementation *detect_best_supported() const noexcept;
+};
+
+/**
+ * Pointer wrapper exposing pointer-like conversions, dereference and
+ * assignment. When SIMDUTF_NO_THREADS is defined the pointer is held as a
+ * plain T*; otherwise it is held in a std::atomic<T *> and every access goes
+ * through an atomic load or store.
+ */
+template <typename T> class atomic_ptr {
+public:
+  atomic_ptr(T *_ptr) : ptr{_ptr} {}
+
+  operator const T *() const { return fetch(); }
+  const T &operator*() const { return *fetch(); }
+  const T *operator->() const { return fetch(); }
+
+  operator T *() { return fetch(); }
+  T &operator*() { return *fetch(); }
+  T *operator->() { return fetch(); }
+  atomic_ptr &operator=(T *_ptr) {
+    assign(_ptr);
+    return *this;
+  }
+
+private:
+#if defined(SIMDUTF_NO_THREADS)
+  /** Read the stored pointer. */
+  T *fetch() const { return ptr; }
+  /** Replace the stored pointer. */
+  void assign(T *_ptr) { ptr = _ptr; }
+
+  T *ptr;
+#else
+  /** Atomically read the stored pointer. */
+  T *fetch() const { return ptr.load(); }
+  /** Atomically replace the stored pointer. */
+  void assign(T *_ptr) { ptr.store(_ptr); }
+
+  std::atomic<T *> ptr;
+#endif
+};
+
+class detect_best_supported_implementation_on_first_use;
+
+} // namespace internal
+
+/**
+ * The list of available implementations compiled into simdutf.
+ */
+extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
+get_available_implementations();
+
+/**
+ * The active implementation.
+ *
+ * Automatically initialized on first use to the most advanced implementation
+ * supported by this hardware.
+ */
+extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
+get_active_implementation();
+
+} // namespace simdutf
+
+#endif // SIMDUTF_IMPLEMENTATION_H
diff --git a/contrib/simdutf/include/simdutf/internal/isadetection.h b/contrib/simdutf/include/simdutf/internal/isadetection.h
new file mode 100644
index 000000000..ea656bd7a
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/internal/isadetection.h
@@ -0,0 +1,324 @@
+/* From
+https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
+Highly modified.
+
+Copyright (c) 2016- Facebook, Inc (Adam Paszke)
+Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
+Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
+(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
+Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
+America and IDIAP Research Institute nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SIMDutf_INTERNAL_ISADETECTION_H
+#define SIMDutf_INTERNAL_ISADETECTION_H
+
+#include <cstdint>
+#include <cstdlib>
+#if defined(_MSC_VER)
+ #include <intrin.h>
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+ #include <cpuid.h>
+#endif
+
+#include "simdutf/portability.h"
+
+// RISC-V ISA detection utilities
+#if SIMDUTF_IS_RISCV64 && defined(__linux__)
+ #include <unistd.h> // for syscall
+// We define these ourselves, for backwards compatibility
+// Key/value pair handed to the hwprobe syscall wrapper below: the caller
+// fills in `key` and reads the answer from `value`.
+struct simdutf_riscv_hwprobe {
+ int64_t key;
+ uint64_t value;
+};
+ // Function-like macro sharing the struct's name: it only expands when the
+ // name is followed by '(', so `simdutf_riscv_hwprobe probes[]` still names
+ // the struct. 258 is presumably the Linux riscv_hwprobe syscall number --
+ // TODO confirm against the kernel headers.
+ #define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__)
+ // Probe key and the bits tested in the returned value.
+ #define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4
+ #define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2)
+ #define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17)
+#endif // SIMDUTF_IS_RISCV64 && defined(__linux__)
+
+namespace simdutf {
+namespace internal {
+
+/**
+ * Bit flags naming the instruction-set extensions that
+ * detect_supported_architectures() can report. The flags are OR-ed together
+ * into a uint32_t mask.
+ */
+enum instruction_set {
+  DEFAULT = 0x0,
+  NEON = 0x1,
+  AVX2 = 0x4,
+  SSE42 = 0x8,
+  PCLMULQDQ = 0x10,
+  BMI1 = 0x20,
+  BMI2 = 0x40,
+  ALTIVEC = 0x80,
+  AVX512F = 0x100,
+  AVX512DQ = 0x200,
+  AVX512IFMA = 0x400,
+  AVX512PF = 0x800,
+  AVX512ER = 0x1000,
+  AVX512CD = 0x2000,
+  AVX512BW = 0x4000,
+  AVX512VL = 0x8000,
+  AVX512VBMI2 = 0x10000,
+  // Own bit: this flag previously shared 0x2000 with AVX512CD, which made the
+  // two x86 features indistinguishable in a mask.
+  AVX512VPOPCNTDQ = 0x20000,
+  // RVV and ZVBB reuse the values of AVX512BW and AVX512VL. In this header
+  // they are only ever set by the RISC-V detection branch, and the AVX-512
+  // flags only by the x86-64 branch.
+  RVV = 0x4000,
+  ZVBB = 0x8000,
+  LSX = 0x40000,
+  LASX = 0x80000,
+};
+
+#if defined(__PPC64__)
+
+// 64-bit POWER build: AltiVec is reported unconditionally, without any
+// runtime probe.
+static inline uint32_t detect_supported_architectures() {
+  const uint32_t host_isa = instruction_set::ALTIVEC;
+  return host_isa;
+}
+
+#elif SIMDUTF_IS_RISCV64
+
+/**
+ * RISC-V 64 build: start from the extensions enabled at compile time
+ * (SIMDUTF_IS_RVV / SIMDUTF_IS_ZVBB), then, on Linux, add whatever the
+ * hwprobe query reports.
+ *
+ * @return bitwise OR of the instruction_set values detected
+ */
+static inline uint32_t detect_supported_architectures() {
+  uint32_t detected = instruction_set::DEFAULT;
+  #if SIMDUTF_IS_RVV
+  detected |= instruction_set::RVV;
+  #endif
+  #if SIMDUTF_IS_ZVBB
+  detected |= instruction_set::ZVBB;
+  #endif
+  #if defined(__linux__)
+  simdutf_riscv_hwprobe query[] = {{SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0}};
+  const long status =
+      simdutf_riscv_hwprobe(&query, sizeof query / sizeof *query, 0, nullptr, 0);
+  // A non-zero status leaves the compile-time result untouched.
+  if (status == 0) {
+    const uint64_t reported = query[0].value;
+    if ((reported & SIMDUTF_RISCV_HWPROBE_IMA_V) != 0) {
+      detected |= instruction_set::RVV;
+    }
+    if ((reported & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB) != 0) {
+      detected |= instruction_set::ZVBB;
+    }
+  }
+  #endif
+  #if defined(RUN_IN_SPIKE_SIMULATOR)
+  // Proxy Kernel does not implement yet hwprobe syscall
+  detected |= instruction_set::RVV;
+  #endif
+  return detected;
+}
+
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+
+// 64-bit ARM build: NEON is reported unconditionally, without any runtime
+// probe.
+static inline uint32_t detect_supported_architectures() {
+  const uint32_t host_isa = instruction_set::NEON;
+  return host_isa;
+}
+
+#elif defined(__x86_64__) || defined(_M_AMD64) // x64
+
+namespace {
+// Bit masks used to decode the registers returned by CPUID and the value
+// returned by xgetbv().
+namespace cpuid_bit {
+// Can be found on Intel ISA Reference for CPUID
+
+// EAX = 0x01
+constexpr uint32_t pclmulqdq = uint32_t(1)
+ << 1; ///< @private bit 1 of ECX for EAX=0x1
+constexpr uint32_t sse42 = uint32_t(1)
+ << 20; ///< @private bit 20 of ECX for EAX=0x1
+// Two-bit mask: callers must compare (ecx & osxsave) against osxsave itself
+// to require both bits.
+constexpr uint32_t osxsave =
+ (uint32_t(1) << 26) |
+ (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
+
+// EAX = 0x7 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
+// See: "Table 3-8. Information Returned by CPUID Instruction"
+// Masks for the EBX register of that leaf.
+namespace ebx {
+constexpr uint32_t bmi1 = uint32_t(1) << 3;
+constexpr uint32_t avx2 = uint32_t(1) << 5;
+constexpr uint32_t bmi2 = uint32_t(1) << 8;
+constexpr uint32_t avx512f = uint32_t(1) << 16;
+constexpr uint32_t avx512dq = uint32_t(1) << 17;
+constexpr uint32_t avx512ifma = uint32_t(1) << 21;
+constexpr uint32_t avx512cd = uint32_t(1) << 28;
+constexpr uint32_t avx512bw = uint32_t(1) << 30;
+constexpr uint32_t avx512vl = uint32_t(1) << 31;
+} // namespace ebx
+
+// Masks for the ECX register of the same leaf.
+namespace ecx {
+constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
+constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
+constexpr uint32_t avx512vnni = uint32_t(1) << 11;
+constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
+constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
+} // namespace ecx
+// Mask for the EDX register of the same leaf.
+namespace edx {
+constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
+}
+// Masks applied to the 64-bit value returned by xgetbv().
+namespace xcr0_bit {
+constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
+constexpr uint64_t avx512_saved =
+ uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
+} // namespace xcr0_bit
+} // namespace cpuid_bit
+} // namespace
+
+/**
+ * Execute the CPUID instruction.
+ *
+ * @param eax in: the leaf to query; out: the returned EAX
+ * @param ebx out: the returned EBX
+ * @param ecx in: the sub-leaf to query; out: the returned ECX
+ * @param edx out: the returned EDX
+ *
+ * On the <cpuid.h> path, if the requested leaf is not available all four
+ * outputs are set to zero so that callers never decode stale register values
+ * left over from a previous query.
+ */
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+                         uint32_t *edx) {
+  #if defined(_MSC_VER)
+  int cpu_info[4];
+  __cpuidex(cpu_info, *eax, *ecx);
+  *eax = cpu_info[0];
+  *ebx = cpu_info[1];
+  *ecx = cpu_info[2];
+  *edx = cpu_info[3];
+  #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+  const uint32_t level = *eax;
+  const uint32_t subleaf = *ecx;
+  // __get_cpuid() takes no sub-leaf argument, so the value passed in *ecx was
+  // ignored; __get_cpuid_count() forwards it like the other two paths do.
+  if (!__get_cpuid_count(level, subleaf, eax, ebx, ecx, edx)) {
+    *eax = 0;
+    *ebx = 0;
+    *ecx = 0;
+    *edx = 0;
+  }
+  #else
+  uint32_t a = *eax, b, c = *ecx, d;
+  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
+  *eax = a;
+  *ebx = b;
+  *ecx = c;
+  *edx = d;
+  #endif
+}
+
+/**
+ * Read extended control register 0 with the XGETBV instruction.
+ *
+ * @return the 64-bit register value (EDX supplies the high 32 bits, EAX the
+ * low 32 bits)
+ */
+static inline uint64_t xgetbv() {
+  #if defined(_MSC_VER)
+  return _xgetbv(0);
+  #else
+  uint32_t low_half, high_half;
+  asm volatile("xgetbv\n\t" : "=a"(low_half), "=d"(high_half) : "c"(0));
+  const uint64_t upper = static_cast<uint64_t>(high_half) << 32;
+  return upper | low_half;
+  #endif
+}
+
+/**
+ * x86-64 build: query CPUID (and XCR0 through xgetbv) and translate the
+ * answers into a mask of instruction_set flags.
+ *
+ * SSE4.2 and PCLMULQDQ are taken from leaf 1 unconditionally. AVX2, BMI1 and
+ * BMI2 are only looked at when both osxsave bits are set and XCR0 has the AVX
+ * bit set. The AVX-512 flags additionally require all three AVX-512 state
+ * bits in XCR0.
+ *
+ * @return bitwise OR of the instruction_set values detected on the host
+ */
+static inline uint32_t detect_supported_architectures() {
+  uint32_t features = 0x0;
+  // Adds `flag` to the result when `reg` has the `mask` bit set.
+  auto note = [&features](uint32_t reg, uint32_t mask, uint32_t flag) {
+    if ((reg & mask) != 0) {
+      features |= flag;
+    }
+  };
+
+  // Leaf 1: the bits of interest are returned in ECX.
+  uint32_t eax = 0x1;
+  uint32_t ebx = 0;
+  uint32_t ecx = 0;
+  uint32_t edx = 0;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  note(ecx, cpuid_bit::sse42, instruction_set::SSE42);
+  note(ecx, cpuid_bit::pclmulqdq, instruction_set::PCLMULQDQ);
+
+  const bool xsave_usable = (ecx & cpuid_bit::osxsave) == cpuid_bit::osxsave;
+  if (!xsave_usable) {
+    return features;
+  }
+
+  // xgetbv for checking if the OS saves registers
+  const uint64_t xcr0 = xgetbv();
+  const bool avx_state_saved =
+      (xcr0 & cpuid_bit::xcr0_bit::avx256_saved) != 0;
+  if (!avx_state_saved) {
+    return features;
+  }
+
+  // Leaf 7, sub-leaf 0: the bits of interest are returned in EBX and ECX.
+  eax = 0x7;
+  ecx = 0x0;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  note(ebx, cpuid_bit::ebx::avx2, instruction_set::AVX2);
+  note(ebx, cpuid_bit::ebx::bmi1, instruction_set::BMI1);
+  note(ebx, cpuid_bit::ebx::bmi2, instruction_set::BMI2);
+
+  const bool avx512_state_saved =
+      (xcr0 & cpuid_bit::xcr0_bit::avx512_saved) ==
+      cpuid_bit::xcr0_bit::avx512_saved;
+  if (!avx512_state_saved) {
+    return features;
+  }
+
+  note(ebx, cpuid_bit::ebx::avx512f, instruction_set::AVX512F);
+  note(ebx, cpuid_bit::ebx::avx512bw, instruction_set::AVX512BW);
+  note(ebx, cpuid_bit::ebx::avx512cd, instruction_set::AVX512CD);
+  note(ebx, cpuid_bit::ebx::avx512dq, instruction_set::AVX512DQ);
+  note(ebx, cpuid_bit::ebx::avx512vl, instruction_set::AVX512VL);
+  note(ecx, cpuid_bit::ecx::avx512vbmi2, instruction_set::AVX512VBMI2);
+  note(ecx, cpuid_bit::ecx::avx512vpopcnt, instruction_set::AVX512VPOPCNTDQ);
+  return features;
+}
+#elif defined(__loongarch__)
+ #if defined(__linux__)
+ #include <sys/auxv.h>
+ // bits/hwcap.h
+ // #define HWCAP_LOONGARCH_LSX (1 << 4)
+ // #define HWCAP_LOONGARCH_LASX (1 << 5)
+ #endif
+
+/**
+ * LoongArch build: on Linux, read AT_HWCAP through getauxval() and report LSX
+ * and LASX when the corresponding HWCAP bits are set. On other systems only
+ * DEFAULT is returned.
+ *
+ * @return bitwise OR of the instruction_set values detected
+ */
+static inline uint32_t detect_supported_architectures() {
+  uint32_t detected = instruction_set::DEFAULT;
+  #if defined(__linux__)
+  const uint64_t capabilities = getauxval(AT_HWCAP);
+  if ((capabilities & HWCAP_LOONGARCH_LSX) != 0) {
+    detected |= instruction_set::LSX;
+  }
+  if ((capabilities & HWCAP_LOONGARCH_LASX) != 0) {
+    detected |= instruction_set::LASX;
+  }
+  #endif
+  return detected;
+}
+#else // fallback
+
+// Fallback for every other target (this includes 32-bit ARM): no extension is
+// reported.
+static inline uint32_t detect_supported_architectures() {
+  const uint32_t host_isa = instruction_set::DEFAULT;
+  return host_isa;
+}
+
+#endif // end SIMD extension detection code
+
+} // namespace internal
+} // namespace simdutf
+
+#endif // SIMDutf_INTERNAL_ISADETECTION_H
diff --git a/contrib/simdutf/include/simdutf/portability.h b/contrib/simdutf/include/simdutf/portability.h
new file mode 100644
index 000000000..b935c4d3b
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/portability.h
@@ -0,0 +1,262 @@
+#ifndef SIMDUTF_PORTABILITY_H
+#define SIMDUTF_PORTABILITY_H
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cfloat>
+#include <cassert>
+#ifndef _WIN32
+ // strcasecmp, strncasecmp
+ #include <strings.h>
+#endif
+
+/**
+ * We want to check that it is actually a little endian system at
+ * compile-time.
+ */
+
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+ #define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#elif defined(_WIN32)
+ #define SIMDUTF_IS_BIG_ENDIAN 0
+#else
+ #if defined(__APPLE__) || \
+ defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined
+ // __ORDER_BIG_ENDIAN__
+ #include <machine/endian.h>
+ #elif defined(sun) || \
+ defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__)
+ #include <sys/byteorder.h>
+ #else // defined(__APPLE__) || defined(__FreeBSD__)
+
+ #ifdef __has_include
+ #if __has_include(<endian.h>)
+ #include <endian.h>
+ #endif //__has_include(<endian.h>)
+ #endif //__has_include
+
+ #endif // defined(__APPLE__) || defined(__FreeBSD__)
+
+ // Reached only when the compiler did not predefine both __BYTE_ORDER__ and
+ // __ORDER_BIG_ENDIAN__ and the target is not Windows. A single conditional
+ // chain guarantees SIMDUTF_IS_BIG_ENDIAN is defined exactly once: little
+ // endian is assumed when the byte-order macros are still unavailable.
+ #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
+ #define SIMDUTF_IS_BIG_ENDIAN 0
+ #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ #define SIMDUTF_IS_BIG_ENDIAN 0
+ #else // __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+ #define SIMDUTF_IS_BIG_ENDIAN 1
+ #endif // byte-order fallback
+
+#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
+
+/**
+ * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
+ */
+
+#ifdef _MSC_VER
+ #define SIMDUTF_VISUAL_STUDIO 1
+ /**
+ * We want to differentiate carefully between
+ * clang under visual studio and regular visual
+ * studio.
+ *
+ * Under clang for Windows, we enable:
+ * * target pragmas so that part and only part of the
+ * code gets compiled for advanced instructions.
+ *
+ */
+ #ifdef __clang__
+ // clang under visual studio
+ #define SIMDUTF_CLANG_VISUAL_STUDIO 1
+ #else
+ // just regular visual studio (best guess)
+ #define SIMDUTF_REGULAR_VISUAL_STUDIO 1
+ #endif // __clang__
+#endif // _MSC_VER
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ // https://en.wikipedia.org/wiki/C_alternative_tokens
+ // This header should have no effect, except maybe
+ // under Visual Studio.
+ #include <iso646.h>
+#endif
+
+#if (defined(__x86_64__) || defined(_M_AMD64)) && !defined(_M_ARM64EC)
+ #define SIMDUTF_IS_X86_64 1
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+ #define SIMDUTF_IS_ARM64 1
+#elif defined(__PPC64__) || defined(_M_PPC64)
+// #define SIMDUTF_IS_PPC64 1
+// The simdutf library does not yet support SIMD acceleration under
+// POWER processors. Please see https://github.com/lemire/simdutf/issues/51
+#elif defined(__s390__)
+// s390 IBM system. Big endian.
+#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
+ // RISC-V 64-bit
+ #define SIMDUTF_IS_RISCV64 1
+
+ // #if __riscv_v_intrinsic >= 1000000
+ // #define SIMDUTF_HAS_RVV_INTRINSICS 1
+ // #define SIMDUTF_HAS_RVV_TARGET_REGION 1
+ // #elif ...
+ // Check for special compiler versions that implement pre v1.0 intrinsics
+ #if __riscv_v_intrinsic >= 11000
+ #define SIMDUTF_HAS_RVV_INTRINSICS 1
+ #endif
+
+ #define SIMDUTF_HAS_ZVBB_INTRINSICS \
+ 0 // there is currently no way to detect this
+
+ #if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector && \
+ __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64
+ // RISC-V V extension
+ #define SIMDUTF_IS_RVV 1
+ #if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000
+ // RISC-V Vector Basic Bit-manipulation
+ #define SIMDUTF_IS_ZVBB 1
+ #endif
+ #endif
+
+#elif defined(__loongarch_lp64)
+ #if defined(__loongarch_sx) && defined(__loongarch_asx)
+ #define SIMDUTF_IS_LSX 1
+ #define SIMDUTF_IS_LASX 1
+ #elif defined(__loongarch_sx)
+ #define SIMDUTF_IS_LSX 1
+ #endif
+#else
+ // The simdutf library is designed
+ // for 64-bit processors and it seems that you are not
+ // compiling for a known 64-bit platform. Please
+ // use a 64-bit target such as x64 or 64-bit ARM for best performance.
+ #define SIMDUTF_IS_32BITS 1
+
+ // We do not support 32-bit platforms, but it can be
+ // handy to identify them.
+ #if defined(_M_IX86) || defined(__i386__)
+ #define SIMDUTF_IS_X86_32BITS 1
+ #elif defined(__arm__) || defined(_M_ARM)
+ #define SIMDUTF_IS_ARM_32BITS 1
+ #elif defined(__PPC__) || defined(_M_PPC)
+ #define SIMDUTF_IS_PPC_32BITS 1
+ #endif
+
+#endif // defined(__x86_64__) || defined(_M_AMD64)
+
+#ifdef SIMDUTF_IS_32BITS
+ #ifndef SIMDUTF_NO_PORTABILITY_WARNING
+ // In the future, we may want to warn users of 32-bit systems that
+ // the simdutf does not support accelerated kernels for such systems.
+ #endif // SIMDUTF_NO_PORTABILITY_WARNING
+#endif // SIMDUTF_IS_32BITS
+
+// this is almost standard?
+#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
+#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)
+
+// Our fast kernels require 64-bit systems.
+//
+// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
+// Furthermore, the number of SIMD registers is reduced.
+//
+// On 32-bit ARM, we would have smaller registers.
+//
+// The simdutf users should still have the fallback kernel. It is
+// slower, but it should run everywhere.
+
+//
+// Enable valid runtime implementations, and select
+// SIMDUTF_BUILTIN_IMPLEMENTATION
+//
+
+// We are going to use runtime dispatch.
+#ifdef SIMDUTF_IS_X86_64
+ #ifdef __clang__
+ // clang does not have GCC push pop
+ // warning: clang attribute push can't be used within a namespace in clang
+ // up til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be
+ // *outside* of a namespace.
+ #define SIMDUTF_TARGET_REGION(T) \
+ _Pragma(SIMDUTF_STRINGIFY(clang attribute push( \
+ __attribute__((target(T))), apply_to = function)))
+ #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
+ #elif defined(__GNUC__)
+ // GCC is easier
+ #define SIMDUTF_TARGET_REGION(T) \
+ _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
+ #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
+ #endif // clang then gcc
+
+#endif // x86
+
+// Default target region macros don't do anything.
+#ifndef SIMDUTF_TARGET_REGION
+ #define SIMDUTF_TARGET_REGION(T)
+ #define SIMDUTF_UNTARGET_REGION
+#endif
+
+// Is threading enabled?
+#if defined(_REENTRANT) || defined(_MT)
+ #ifndef SIMDUTF_THREADS_ENABLED
+ #define SIMDUTF_THREADS_ENABLED
+ #endif
+#endif
+
+// workaround for large stack sizes under -O0.
+// https://github.com/simdutf/simdutf/issues/691
+#ifdef __APPLE__
+ #ifndef __OPTIMIZE__
+ // Apple systems have small stack sizes in secondary threads.
+ // Lack of compiler optimization may generate high stack usage.
+ // Users may want to disable threads for safety, but only when
+ // in debug mode which we detect by the fact that the __OPTIMIZE__
+ // macro is not defined.
+ #undef SIMDUTF_THREADS_ENABLED
+ #endif
+#endif
+
+#ifdef SIMDUTF_VISUAL_STUDIO
+ // This is one case where we do not distinguish between
+ // regular visual studio and clang under visual studio.
+ // clang under Windows has _stricmp (like visual studio) but not strcasecmp
+ // (as clang normally has)
+ #define simdutf_strcasecmp _stricmp
+ #define simdutf_strncasecmp _strnicmp
+#else
+ // The strcasecmp, strncasecmp, and strcasestr functions do not work with
+ // multibyte strings (e.g. UTF-8). So they are only useful for ASCII in our
+ // context.
+ // https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
+ #define simdutf_strcasecmp strcasecmp
+ #define simdutf_strncasecmp strncasecmp
+#endif
+
+#ifdef NDEBUG
+
+ #ifdef SIMDUTF_VISUAL_STUDIO
+ #define SIMDUTF_UNREACHABLE() __assume(0)
+ #define SIMDUTF_ASSUME(COND) __assume(COND)
+ #else
+ #define SIMDUTF_UNREACHABLE() __builtin_unreachable();
+ #define SIMDUTF_ASSUME(COND) \
+ do { \
+ if (!(COND)) \
+ __builtin_unreachable(); \
+ } while (0)
+ #endif
+
+#else // NDEBUG
+
+ #define SIMDUTF_UNREACHABLE() assert(0);
+ #define SIMDUTF_ASSUME(COND) assert(COND)
+
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+ #if __GNUC__ >= 11
+ #define SIMDUTF_GCC11ORMORE 1
+ #endif // __GNUC__ >= 11
+#endif // defined(__GNUC__) && !defined(__clang__)
+
+#endif // SIMDUTF_PORTABILITY_H
diff --git a/contrib/simdutf/include/simdutf/simdutf_version.h b/contrib/simdutf/include/simdutf/simdutf_version.h
new file mode 100644
index 000000000..a02d5d3d8
--- /dev/null
+++ b/contrib/simdutf/include/simdutf/simdutf_version.h
@@ -0,0 +1,26 @@
+// /include/simdutf/simdutf_version.h automatically generated by release.py,
+// do not change by hand
+#ifndef SIMDUTF_SIMDUTF_VERSION_H
+#define SIMDUTF_SIMDUTF_VERSION_H
+
+/** The version of simdutf being used (major.minor.revision) */
+#define SIMDUTF_VERSION "5.6.3"
+
+namespace simdutf {
+enum {
+ /**
+ * The major version (MAJOR.minor.revision) of simdutf being used.
+ */
+ SIMDUTF_VERSION_MAJOR = 5,
+ /**
+ * The minor version (major.MINOR.revision) of simdutf being used.
+ */
+ SIMDUTF_VERSION_MINOR = 6,
+ /**
+ * The revision (major.minor.REVISION) of simdutf being used.
+ */
+ SIMDUTF_VERSION_REVISION = 3
+};
+} // namespace simdutf
+
+#endif // SIMDUTF_SIMDUTF_VERSION_H
diff --git a/contrib/simdutf/src/CMakeLists.txt b/contrib/simdutf/src/CMakeLists.txt
new file mode 100644
index 000000000..7a4a5c93b
--- /dev/null
+++ b/contrib/simdutf/src/CMakeLists.txt
@@ -0,0 +1,46 @@
+add_library(simdutf-include-source INTERFACE)
+target_include_directories(simdutf-include-source INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+add_library(simdutf-source INTERFACE)
+target_sources(simdutf-source INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>/simdutf.cpp)
+target_link_libraries(simdutf-source INTERFACE simdutf-include-source)
+add_library(simdutf STATIC simdutf.cpp ../../../src/libutil/cxx/rspamd-simdutf.cxx)
+target_include_directories(simdutf PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> )
+target_include_directories(simdutf PUBLIC "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>")
+
+if(MSVC)
+ if("${MSVC_TOOLSET_VERSION}" STREQUAL "140")
+ target_compile_options(simdutf PRIVATE /W0 /sdl)
+ set(SIMDUTF_LEGACY_VISUAL_STUDIO TRUE)
+ else()
+ target_compile_options(simdutf PRIVATE /WX /W3 /sdl /w34714) # https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-4-c4714?view=vs-2019
+ endif()
+else(MSVC)
+ if(NOT WIN32)
+ target_compile_options(simdutf INTERFACE -fPIC)
+ endif()
+ target_compile_options(simdutf PRIVATE -Wall -Wextra -Weffc++)
+ target_compile_options(simdutf PRIVATE -Wfatal-errors -Wsign-compare -Wshadow -Wwrite-strings -Wpointer-arith -Winit-self -Wconversion -Wno-sign-conversion -Wunused-function)
+endif(MSVC)
+
+# workaround for GNU GCC poor AVX load/store code generation
+if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86(_64)?)$"))
+ target_compile_options(simdutf PRIVATE -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store)
+endif()
+
+if(SIMDUTF_ALWAYS_INCLUDE_FALLBACK)
+ message(STATUS "SIMDUTF_ALWAYS_INCLUDE_FALLBACK is set to ${SIMDUTF_ALWAYS_INCLUDE_FALLBACK}")
+ target_compile_definitions(simdutf PRIVATE SIMDUTF_IMPLEMENTATION_FALLBACK=1)
+endif()
+
+if(SIMDUTF_SANITIZE)
+ target_compile_options(simdutf PUBLIC -fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all)
+ target_compile_definitions(simdutf PUBLIC ASAN_OPTIONS=detect_leaks=1)
+ target_link_libraries(simdutf PUBLIC -fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all)
+endif()
+if(SIMDUTF_SANITIZE_UNDEFINED)
+ target_compile_options(simdutf PUBLIC -fsanitize=undefined -fno-sanitize-recover=all)
+ target_link_libraries(simdutf PUBLIC -fsanitize=undefined)
+endif()
+if(MSVC AND BUILD_SHARED_LIBS)
+ set(SIMDUTF_WINDOWS_DLL TRUE)
+endif()
diff --git a/contrib/simdutf/src/arm64/arm_base64.cpp b/contrib/simdutf/src/arm64/arm_base64.cpp
new file mode 100644
index 000000000..c2c1fd63a
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_base64.cpp
@@ -0,0 +1,501 @@
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+/**
+ * Base64-encodes `srclen` bytes starting at `src` and writes the resulting
+ * characters to `dst`.
+ *
+ * The NEON loop consumes 48 input bytes and emits 64 output characters per
+ * iteration; whatever is left over is handed to
+ * scalar::base64::tail_encode_base64. No bounds checking is performed on
+ * `dst` here.
+ *
+ * @param dst     output buffer
+ * @param src     input bytes
+ * @param srclen  number of input bytes
+ * @param options when `options & base64_url` is non-zero the alphabet ending
+ *                in '-' and '_' is used instead of '+' and '/'; the value is
+ *                also forwarded to the scalar tail encoder
+ * @return number of characters written to `dst`
+ */
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
+  // credit: Wojciech Muła
+  uint8_t *out = (uint8_t *)dst;
+  // Both alphabets are stored pre-interleaved with stride 4: the
+  // de-interleaving vld4q_u8 load below then places characters 0..15 in
+  // register 0, 16..31 in register 1, and so on, which is the layout
+  // vqtbl4q_u8 indexes with a 6-bit value.
+  constexpr static uint8_t source_table[64] = {
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
+  };
+  constexpr static uint8_t source_table_url[64] = {
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
+  };
+  const uint8x16_t v3f = vdupq_n_u8(0x3f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  // When trying to load a uint8_t array, Visual Studio might
+  // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)':
+  // cannot convert argument 1 from 'const uint8_t [64]' to 'const char *
+  // The cast therefore has to apply to the selected table pointer, not to
+  // the condition that selects it.
+  const uint8x16x4_t table = vld4q_u8(reinterpret_cast<const char *>(
+      (options & base64_url) ? source_table_url : source_table));
+#else
+  const uint8x16x4_t table =
+      vld4q_u8((options & base64_url) ? source_table_url : source_table);
+#endif
+  size_t i = 0;
+  for (; i + 16 * 3 <= srclen; i += 16 * 3) {
+    // De-interleave 48 bytes into three registers holding the 1st, 2nd and
+    // 3rd byte of each of the 16 input triples.
+    const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
+    uint8x16x4_t result;
+    // Split every 24-bit triple into four 6-bit indices.
+    result.val[0] = vshrq_n_u8(in.val[0], 2);
+    result.val[1] =
+        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f);
+    result.val[2] =
+        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f);
+    result.val[3] = vandq_u8(in.val[2], v3f);
+    // Translate the indices to alphabet characters.
+    result.val[0] = vqtbl4q_u8(table, result.val[0]);
+    result.val[1] = vqtbl4q_u8(table, result.val[1]);
+    result.val[2] = vqtbl4q_u8(table, result.val[2]);
+    result.val[3] = vqtbl4q_u8(table, result.val[3]);
+    // Interleaving store: writes the 64 characters in output order.
+    vst4q_u8(out, result);
+    out += 64;
+  }
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
+                                            options);
+
+  return size_t((char *)out - dst);
+}
+
+// Packs the bytes of `data` selected by `mask` towards the front and stores
+// a full 16-byte vector at `output` (16 bytes are always written).
+// `mask` carries one bit per byte of `data`; a zero mask stores `data`
+// unchanged. NOTE(review): set bits presumably mark bytes to remove (the
+// caller advances by the number of clear bits) -- confirm against the
+// contents of tables::base64::thintable_epi8.
+static inline void compress(uint8x16_t data, uint16_t mask, char *output) {
+  uint8_t *const dest = (uint8_t *)output;
+  if (mask == 0) {
+    vst1q_u8(dest, data);
+    return;
+  }
+  const uint8_t low_bits = uint8_t(mask);       // least significant 8 bits
+  const uint8_t high_bits = uint8_t(mask >> 8); // most significant 8 bits
+  // One 8-byte shuffle pattern per half of the vector.
+  uint64x2_t half_shuffles = {tables::base64::thintable_epi8[low_bits],
+                              tables::base64::thintable_epi8[high_bits]};
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t half_base =
+      simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
+#else
+  const uint8x16_t half_base = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
+#endif
+  // Add 8 to the indices of the upper half so that they address bytes 8..15.
+  const uint8x16_t shuffle =
+      vaddq_u8(vreinterpretq_u8_u64(half_shuffles), half_base);
+  const uint8x16_t pruned = vqtbl1q_u8(data, shuffle);
+
+  // Select the pattern that joins the two compacted halves: it keeps the
+  // leading bytes of the first half and follows them with the bytes of the
+  // second half (plus filler at the end).
+  const int combine_row = tables::base64::BitsSetTable256mul2[low_bits];
+  const uint8x16_t combine =
+      vld1q_u8(tables::base64::pshufb_combine_table + combine_row * 8);
+  vst1q_u8(dest, vqtbl1q_u8(pruned, combine));
+}
+
+// A 64-byte working block held as four 128-bit NEON vectors of 16 bytes each.
+struct block64 {
+  uint8x16_t chunks[4];
+};
+
+// The decoding code relies on a block being exactly 64 bytes wide.
+static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
+/**
+ * Classifies the 64 characters in `*b` with two nibble-indexed lookup tables
+ * and rewrites the block in place by adding a per-character offset taken
+ * from a third table.
+ *
+ * @tparam base64_url selects the tables for the alphabet using '-' and '_'
+ *                    (0x2d / 0x5f) instead of '+' and '/'
+ * @param b      block to classify; every chunk is overwritten with
+ *               `chunk + roll`
+ * @param error  set to true when the largest classification value in the
+ *               block exceeds 0x3, false otherwise
+ * @return a 64-bit mask with bit i set when character i of the block
+ *         produced a non-zero classification (zero when none did)
+ */
+template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
+  uint8x16_t v0f = vdupq_n_u8(0xf);
+
+  // In the URL variant '_' (0x5f) is singled out so that its table results
+  // can be cleared below.
+  uint8x16_t underscore0, underscore1, underscore2, underscore3;
+  if (base64_url) {
+    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
+    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
+    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
+    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
+  } else {
+    (void)underscore0;
+    (void)underscore1;
+    (void)underscore2;
+    (void)underscore3;
+  }
+
+  uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
+  uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
+  uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
+  uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);
+
+  // Needed by the decoding step.
+  uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4);
+  uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
+  uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
+  uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
+  uint8x16_t lut_lo;
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  if (base64_url) {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4);
+  } else {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4);
+  }
+#else
+  if (base64_url) {
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                        0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4};
+  } else {
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                        0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4};
+  }
+#endif
+  uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
+  uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
+  uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
+  uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
+  // The high-nibble table is the same for both alphabets, so it is built
+  // once instead of in two identical branches.
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t lut_hi =
+      simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+#else
+  const uint8x16_t lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+                             0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+#endif
+  uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
+  uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
+  uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
+  uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
+
+  if (base64_url) {
+    // Clear the high-table result wherever the input was '_'.
+    hi0 = vbicq_u8(hi0, underscore0);
+    hi1 = vbicq_u8(hi1, underscore1);
+    hi2 = vbicq_u8(hi2, underscore2);
+    hi3 = vbicq_u8(hi3, underscore3);
+  }
+
+  // A character is flagged when its low- and high-nibble table entries share
+  // a bit; `checks` is the largest such intersection over the whole block.
+  uint8_t checks =
+      vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
+                         vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t bit_mask =
+      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
+  uint64_t badcharmask = 0;
+  *error = checks > 0x3;
+  if (checks) {
+    // Add each of the elements next to each other, successively, to stuff each
+    // 8 byte mask into one.
+    uint8x16_t test0 = vtstq_u8(lo0, hi0);
+    uint8x16_t test1 = vtstq_u8(lo1, hi1);
+    uint8x16_t test2 = vtstq_u8(lo2, hi2);
+    uint8x16_t test3 = vtstq_u8(lo3, hi3);
+    uint8x16_t sum0 =
+        vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask));
+    uint8x16_t sum1 =
+        vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask));
+    sum0 = vpaddq_u8(sum0, sum1);
+    sum0 = vpaddq_u8(sum0, sum0);
+    badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+  }
+  // This is the transformation step that can be done while we are waiting for
+  // sum0
+  uint8x16_t roll_lut;
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  if (base64_url) {
+    roll_lut =
+        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  } else {
+    roll_lut =
+        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  }
+#else
+  if (base64_url) {
+    roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                          0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  } else {
+    roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                          0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  }
+#endif
+  // '-' (0x2d) in the URL alphabet, '/' (0x2f) otherwise: the comparison
+  // result (0xFF, i.e. -1) is added to the high nibble to select a distinct
+  // roll_lut entry for that character.
+  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f);
+  if (base64_url) {
+    // '_' is redirected to roll_lut[0] by zeroing its high nibble.
+    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
+    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
+    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
+    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
+  }
+  uint8x16_t roll0 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
+  uint8x16_t roll1 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
+  uint8x16_t roll2 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
+  uint8x16_t roll3 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
+  b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
+  b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
+  b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
+  b->chunks[3] = vaddq_u8(b->chunks[3], roll3);
+  return badcharmask;
+}
+
+// Stores the four 16-byte chunks of `*b` back to back at `output`
+// (64 bytes are written).
+void copy_block(block64 *b, char *output) {
+  uint8_t *const dest = (uint8_t *)output;
+  for (int chunk = 0; chunk < 4; chunk++) {
+    vst1q_u8(dest + 16 * chunk, b->chunks[chunk]);
+  }
+}
+
+// Compacts the 64 bytes of `*b` into `output`, one 16-byte chunk at a time,
+// using `mask` (one bit per byte) to tell compress() which bytes to drop.
+// Returns the total number of bytes kept, i.e. the number of clear bits in
+// `mask`. Each compress() call stores a full 16 bytes at its destination.
+uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  // Byte j of `kept` holds the number of clear bits in byte j of `mask`.
+  const uint8x8_t kept = vcnt_u8(vcreate_u8(~mask));
+  const uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(kept), 0);
+  // Multiplying by 0x0101...01 turns the per-byte counts into running sums:
+  // byte j of `offsets` is the number of bytes kept in mask bytes 0..j.
+  const uint64_t offsets = popcounts * 0x0101010101010101;
+  char *const second = &output[(offsets >> 8) & 0xFF];
+  char *const third = &output[(offsets >> 24) & 0xFF];
+  char *const fourth = &output[(offsets >> 40) & 0xFF];
+  // The calls must stay in this order: each store overwrites the filler
+  // bytes left behind by the previous one.
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16), second);
+  compress(b->chunks[2], uint16_t(mask >> 32), third);
+  compress(b->chunks[3], uint16_t(mask >> 48), fourth);
+  return offsets >> 56;
+}
+
+// Reads 64 bytes from `src` into the four chunks of `*b`.
+// The caller must guarantee that 64 bytes are readable at `src`.
+void load_block(block64 *b, const char *src) {
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(src);
+  for (int chunk = 0; chunk < 4; chunk++) {
+    b->chunks[chunk] = vld1q_u8(bytes + 16 * chunk);
+  }
+}
+
+// Loads sixteen 16-bit words from `data` and narrows them to sixteen bytes
+// with unsigned saturation, so any word above 0xFF becomes 0xFF.
+// The caller must guarantee that 32 bytes are readable at `data`.
+inline uint8x16_t load_satured(const uint16_t *data) {
+  const uint16x8_t first_eight = vld1q_u16(data);
+  const uint16x8_t last_eight = vld1q_u16(data + 8);
+  const uint8x8_t narrowed_first = vqmovn_u16(first_eight);
+  return vqmovn_high_u16(narrowed_first, last_eight);
+}
+
+// Reads 64 UTF-16 code units from `src` into `*b`, narrowing each one to a
+// byte with saturation (see load_satured).
+// The caller must guarantee that 128 bytes are readable at `src`.
+void load_block(block64 *b, const char16_t *src) {
+  const uint16_t *const words = reinterpret_cast<const uint16_t *>(src);
+  for (int chunk = 0; chunk < 4; chunk++) {
+    b->chunks[chunk] = load_satured(words + 16 * chunk);
+  }
+}
+
+// Packs 64 six-bit values read from `src` (one per byte) into 48 output
+// bytes written to `out`.
+void base64_decode_block(char *out, const char *src) {
+  // De-interleaving load: sextets.val[k] holds the k-th value of each of the
+  // 16 groups of four.
+  const uint8x16x4_t sextets = vld4q_u8((const uint8_t *)src);
+  const uint8x16_t byte0 =
+      vorrq_u8(vshlq_n_u8(sextets.val[0], 2), vshrq_n_u8(sextets.val[1], 4));
+  const uint8x16_t byte1 =
+      vorrq_u8(vshlq_n_u8(sextets.val[1], 4), vshrq_n_u8(sextets.val[2], 2));
+  const uint8x16_t byte2 =
+      vorrq_u8(vshlq_n_u8(sextets.val[2], 6), sextets.val[3]);
+  uint8x16x3_t packed;
+  packed.val[0] = byte0;
+  packed.val[1] = byte1;
+  packed.val[2] = byte2;
+  // Interleaving store: writes byte0, byte1, byte2 of each group in order.
+  vst3q_u8((uint8_t *)out, packed);
+}
+
+/**
+ * Decodes base64 text from `src` into `dst`.
+ *
+ * Outline:
+ *  1. Trailing ignorable characters and up to two '=' signs are trimmed from
+ *     the end of the input.
+ *  2. While at least 64 characters remain, a block is loaded and classified
+ *     with to_base64_mask; flagged characters are removed with
+ *     compress_block and the surviving 6-bit values are staged in a local
+ *     buffer. Once nine full blocks are staged they are decoded (64 -> 48
+ *     bytes each).
+ *  3. A partially filled staging block is topped up with scalar code when
+ *     enough input remains, full staged blocks are decoded, and remaining
+ *     groups of four values are decoded three bytes at a time.
+ *  4. Anything that is left is handed to scalar::base64::base64_tail_decode,
+ *     after which the amount of padding is validated.
+ *
+ * @tparam base64_url  selects the URL alphabet tables
+ * @tparam char_type   input character type (load_block is provided for char
+ *                     and char16_t)
+ * @param dst                output buffer
+ * @param src                input characters
+ * @param srclen             number of input characters
+ * @param options            forwarded to the scalar tail decoder
+ * @param last_chunk_options forwarded to the scalar tail decoder; the final
+ *                           padding check is skipped for stop_before_partial
+ * @return a full_result holding the error code, an input position and the
+ *         number of bytes written to `dst`
+ */
+template <bool base64_url, typename char_type>
+full_result
+compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                       base64_options options,
+                       last_chunk_handling_options last_chunk_options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
+  size_t equallocation =
+      srclen; // location of the first padding character if any
+  // skip trailing spaces
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+         to_base64[uint8_t(src[srclen - 1])] == 64) {
+    srclen--;
+  }
+  size_t equalsigns = 0;
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
+    srclen--;
+    equalsigns = 1;
+    // skip trailing spaces
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+           to_base64[uint8_t(src[srclen - 1])] == 64) {
+      srclen--;
+    }
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  if (srclen == 0) {
+    // Padding without any payload is an error.
+    if (equalsigns > 0) {
+      return {error_code::INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {error_code::SUCCESS, 0, 0};
+  }
+  const char_type *const srcinit = src;
+  const char *const dstinit = dst;
+  const char_type *const srcend = src + srclen;
+
+  // Staging area for 6-bit values that survived compression.
+  constexpr size_t block_size = 10;
+  char buffer[block_size * 64];
+  char *bufferptr = buffer;
+  if (srclen >= 64) {
+    const char_type *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      if (badcharmask != 0) {
+        if (error) {
+          // Rewind to the start of the block and scan forward with the
+          // scalar table to report the position of the offending character.
+          src -= 64;
+          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
+                 to_base64[uint8_t(*src)] <= 64) {
+            src++;
+          }
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                  size_t(dst - dstinit)};
+        }
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else {
+        // optimization opportunity: if bufferptr == buffer and mask == 0, we
+        // can avoid the call to compress_block and decode directly.
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) {
+        // Nine full blocks are staged: decode them and move the remainder to
+        // the front of the buffer.
+        for (size_t i = 0; i < (block_size - 1); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
+      }
+    }
+  }
+  char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64);
+  if (last_block != 0 && srcend - src + last_block >= 64) {
+    // Top up the partially filled staging block with scalar lookups.
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = to_base64[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      }
+      // Only values 0..63 are kept; a value of 64 is overwritten next time.
+      bufferptr += (val <= 63);
+      src++;
+    }
+  }
+
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+    base64_decode_block(dst, buffer_start);
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) {
+    // All groups but the last may write 4 bytes (the 4th is overwritten by
+    // the next group).
+    while (buffer_start + 4 < bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 4);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // The last complete group writes exactly 3 bytes.
+    if (buffer_start + 4 <= bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // backtrack
+    int leftover = int(bufferptr - buffer_start);
+    while (leftover > 0) {
+      while (to_base64[uint8_t(*(src - 1))] == 64) {
+        src--;
+      }
+      src--;
+      leftover--;
+    }
+  }
+  if (src < srcend + equalsigns) {
+    full_result r = scalar::base64::base64_tail_decode(
+        dst, src, srcend - src, equalsigns, options, last_chunk_options);
+    // The tail decoder counts from `src`; make the position absolute.
+    r.input_count += size_t(src - srcinit);
+    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+        r.error == error_code::BASE64_EXTRA_BITS) {
+      return r;
+    } else {
+      r.output_count += size_t(dst - dstinit);
+    }
+    if (last_chunk_options != stop_before_partial &&
+        r.error == error_code::SUCCESS && equalsigns > 0) {
+      // additional checks
+      if ((r.output_count % 3 == 0) ||
+          ((r.output_count % 3) + 1 + equalsigns != 4)) {
+        r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.input_count = equallocation;
+      }
+    }
+    return r;
+  }
+  if (equalsigns > 0) {
+    if ((size_t(dst - dstinit) % 3 == 0) ||
+        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+      return {error_code::INVALID_BASE64_CHARACTER, equallocation,
+              size_t(dst - dstinit)};
+    }
+  }
+  return {error_code::SUCCESS, srclen, size_t(dst - dstinit)};
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf16.cpp
new file mode 100644
index 000000000..543c4e6d0
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf16.cpp
@@ -0,0 +1,24 @@
+// Converts Latin-1 bytes to UTF-16 code units, 16 at a time. Stops when
+// fewer than 16 input bytes remain and returns the first unprocessed input
+// position together with the next output position.
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+arm_convert_latin1_to_utf16(const char *buf, size_t len,
+                            char16_t *utf16_output) {
+  const char *const stop = buf + len;
+
+  for (; stop - buf >= 16; buf += 16, utf16_output += 16) {
+    const uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
+    // Zero-extend each byte to 16 bits.
+    uint16x8_t first_half = vmovl_u8(vget_low_u8(bytes));
+    uint16x8_t second_half = vmovl_u8(vget_high_u8(bytes));
+    if (!match_system(big_endian)) {
+      // Requested endianness differs from the system's: swap the two bytes
+      // of every code unit.
+      first_half =
+          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(first_half)));
+      second_half =
+          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(second_half)));
+    }
+    uint16_t *const out = reinterpret_cast<uint16_t *>(utf16_output);
+    vst1q_u16(out, first_half);
+    vst1q_u16(out + 8, second_half);
+  }
+
+  return std::make_pair(buf, utf16_output);
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf32.cpp
new file mode 100644
index 000000000..553929a74
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf32.cpp
@@ -0,0 +1,24 @@
+// Converts Latin-1 bytes to UTF-32 code units, 16 at a time, by zero-extending
+// every byte to 32 bits. Stops when fewer than 16 input bytes remain and
+// returns the first unprocessed input position together with the next output
+// position.
+std::pair<const char *, char32_t *>
+arm_convert_latin1_to_utf32(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char *const stop = buf + len;
+
+  for (; stop - buf >= 16; buf += 16, utf32_output += 16) {
+    const uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
+    // 8 -> 16 bits
+    const uint16x8_t words_lo = vmovl_u8(vget_low_u8(bytes));
+    const uint16x8_t words_hi = vmovl_u8(vget_high_u8(bytes));
+    // 16 -> 32 bits, stored in input order
+    uint32_t *const out = reinterpret_cast<uint32_t *>(utf32_output);
+    vst1q_u32(out, vmovl_u16(vget_low_u16(words_lo)));
+    vst1q_u32(out + 4, vmovl_u16(vget_high_u16(words_lo)));
+    vst1q_u32(out + 8, vmovl_u16(vget_low_u16(words_hi)));
+    vst1q_u32(out + 12, vmovl_u16(vget_high_u16(words_hi)));
+  }
+
+  return std::make_pair(buf, utf32_output);
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf8.cpp
new file mode 100644
index 000000000..a7a53d327
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_latin1_to_utf8.cpp
@@ -0,0 +1,70 @@
+/*
+ Converts Latin-1 bytes to UTF-8.
+ Returns a pair: the first unprocessed byte of latin1_input and the next
+ write position in utf8_out.
+ A scalar routine should carry on the conversion of the tail.
+*/
+std::pair<const char *, char *>
+arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                           char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char *end = latin1_input + len;
+  // Loop-invariant constants.
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+  const uint16x8_t v_1f00 = vmovq_n_u16((uint16_t)0x1f00);
+  const uint16x8_t v_003f = vmovq_n_u16((uint16_t)0x003f);
+  const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+  // One distinct bit per lane, used to build the 8-bit lookup index.
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint16x8_t mask = simdutf_make_uint16x8_t(
+      0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+  const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+                           0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+  // We always write 16 bytes, of which more than the first 8 bytes
+  // are valid. A safety margin of 8 is more than sufficient.
+  while (end - latin1_input >= 16 + 8) {
+    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
+    if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
+      vst1q_u8(utf8_output, in8);
+      utf8_output += 16;
+      latin1_input += 16;
+      continue;
+    }
+
+    // We just fallback on UTF-16 code. This could be optimized/simplified
+    // further. Only the low 8 input bytes are converted on this path.
+    uint16x8_t in16 = vmovl_u8(vget_low_u8(in8));
+    // 1. prepare 2-byte values
+    // input 8-bit word : [aabb|bbbb] x 8
+    // expected output : [1100|00aa|10bb|bbbb] x 8
+
+    // t0 = [0000|00aa|bbbb|bb00]
+    const uint16x8_t t0 = vshlq_n_u16(in16, 2);
+    // t1 = [0000|00aa|0000|0000]
+    const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+    // t2 = [0000|0000|00bb|bbbb]
+    const uint16x8_t t2 = vandq_u16(in16, v_003f);
+    // t3 = [0000|00aa|00bb|bbbb]
+    const uint16x8_t t3 = vorrq_u16(t1, t2);
+    // t4 = [1100|00aa|10bb|bbbb]
+    const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+    // 2. merge ASCII and 2-byte codewords
+    const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f);
+    const uint8x16_t utf8_unpacked =
+        vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4));
+    // 3. prepare bitmask for 8-bit lookup
+    uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+    // 4. pack the bytes
+    // row[0] is the number of output bytes, row[1..16] the shuffle pattern.
+    const uint8_t *row =
+        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+    const uint8x16_t shuffle = vld1q_u8(row + 1);
+    const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+    // 5. store bytes
+    vst1q_u8(utf8_output, utf8_packed);
+    // 6. adjust pointers
+    latin1_input += 8;
+    utf8_output += row[0];
+
+  } // while
+
+  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/arm64/arm_convert_utf16_to_latin1.cpp
new file mode 100644
index 000000000..2ec54208d
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf16_to_latin1.cpp
@@ -0,0 +1,63 @@
+
+// Converts UTF-16 code units to Latin-1 bytes, 8 at a time. Stops when fewer
+// than 8 code units remain and returns the first unprocessed input position
+// together with the next output position. If a register contains a code unit
+// above 0xFF, the first element of the returned pair is nullptr.
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+arm_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) {
+  const char16_t *const stop = buf + len;
+  for (; stop - buf >= 8; buf += 8, latin1_output += 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      // Bring the code units into the system's byte order.
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+    if (vmaxvq_u16(in) > 0xff) {
+      // Not representable in Latin-1.
+      return std::make_pair(nullptr, latin1_output);
+    }
+    // Keep the low byte of every code unit and store the 8 bytes.
+    vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), vmovn_u16(in));
+  }
+  return std::make_pair(buf, latin1_output);
+}
+
+// Same conversion as arm_convert_utf16_to_latin1, but reports where it
+// failed: when a register contains a code unit above 0xFF, the code units
+// before it are still written and a TOO_LARGE result carrying the index of
+// the offending code unit is returned. Otherwise the result is SUCCESS with
+// the number of code units consumed (fewer than 8 may remain unprocessed).
+template <endianness big_endian>
+std::pair<result, char *>
+arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char16_t *const origin = buf;
+  const char16_t *const stop = buf + len;
+  while (stop - buf >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+    if (vmaxvq_u16(in) <= 0xff) {
+      // Fast path: narrow to bytes and store all 8.
+      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), vmovn_u16(in));
+      buf += 8;
+      latin1_output += 8;
+      continue;
+    }
+    // Scalar walk over the register to find the first offending code unit.
+    for (int k = 0; k < 8; k++) {
+      const uint16_t word = match_system(big_endian)
+                                ? buf[k]
+                                : scalar::utf16::swap_bytes(buf[k]);
+      if (word > 0xff) {
+        return std::make_pair(result(error_code::TOO_LARGE, buf - origin + k),
+                              latin1_output);
+      }
+      *latin1_output++ = char(word);
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - origin),
+                        latin1_output);
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/arm64/arm_convert_utf16_to_utf32.cpp
new file mode 100644
index 000000000..ce968a72e
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf16_to_utf32.cpp
@@ -0,0 +1,191 @@
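+/*
+ Note: the same description appears in arm_convert_utf16_to_utf8.cpp, which
+ is the conversion it describes. The UTF-16 to UTF-32 routines in this file
+ do not use the lookup tables mentioned below: registers without surrogates
+ are widened directly and anything else is handled by scalar code.
+
+ The vectorized algorithm works on a single 128-bit SIMD register, i.e., it
+ loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.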
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+ Converts UTF-16 code units to UTF-32.
+ Returns a pair: the first unprocessed code unit of buf and the next write
+ position in utf32_out. If an invalid surrogate pair is met, the first
+ element of the pair is nullptr.
+ A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+arm_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *const stop = buf + len;
+
+  // A code unit is a surrogate when (unit & 0xf800) == 0xd800.
+  const uint16x8_t surrogate_bits = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t surrogate_tag = vmovq_n_u16((uint16_t)0xd800);
+
+  while (stop - buf >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+
+    const uint16x8_t is_surrogate =
+        vceqq_u16(vandq_u16(in, surrogate_bits), surrogate_tag);
+    if (vmaxvq_u16(is_surrogate) == 0) {
+      // No surrogates in the register: zero-extend all eight code units.
+      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+      utf32_output += 8;
+      buf += 8;
+      continue;
+    }
+
+    // Surrogate pair(s) present: scalar fallback over at most 15 code units,
+    // always leaving one code unit of lookahead for the second half of a
+    // pair. Being efficient with SIMD here may require non-trivial tables.
+    const size_t remaining = size_t(stop - buf);
+    const size_t forward = remaining < 16 ? remaining - 1 : 15;
+    size_t k = 0;
+    while (k < forward) {
+      const uint16_t word = match_system(big_endian)
+                                ? buf[k]
+                                : scalar::utf16::swap_bytes(buf[k]);
+      if ((word & 0xF800) != 0xD800) {
+        *utf32_output++ = char32_t(word);
+        k++;
+        continue;
+      }
+      // must be a surrogate pair
+      const uint16_t diff = uint16_t(word - 0xD800);
+      const uint16_t next_word = match_system(big_endian)
+                                     ? buf[k + 1]
+                                     : scalar::utf16::swap_bytes(buf[k + 1]);
+      const uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if ((diff | diff2) > 0x3FF) {
+        return std::make_pair(nullptr,
+                              reinterpret_cast<char32_t *>(utf32_output));
+      }
+      const uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      k += 2;
+    }
+    buf += k;
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+}
+
+/*
+ Converts UTF-16 code units to UTF-32, reporting the position of errors.
+ Returns a pair: a result struct and the next write position in utf32_out.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed code unit in
+ buf (even if finished). A scalar routine should carry on the conversion of
+ the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                       char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *const origin = buf;
+  const char16_t *const stop = buf + len;
+
+  // A code unit is a surrogate when (unit & 0xf800) == 0xd800.
+  const uint16x8_t surrogate_bits = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t surrogate_tag = vmovq_n_u16((uint16_t)0xd800);
+
+  while ((stop - buf) >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+
+    const uint16x8_t is_surrogate =
+        vceqq_u16(vandq_u16(in, surrogate_bits), surrogate_tag);
+    if (vmaxvq_u16(is_surrogate) == 0) {
+      // No surrogates in the register: zero-extend all eight code units.
+      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+      utf32_output += 8;
+      buf += 8;
+      continue;
+    }
+
+    // Surrogate pair(s) present: scalar fallback over at most 15 code units,
+    // always leaving one code unit of lookahead for the second half of a
+    // pair. Being efficient with SIMD here may require non-trivial tables.
+    const size_t remaining = size_t(stop - buf);
+    const size_t forward = remaining < 16 ? remaining - 1 : 15;
+    size_t k = 0;
+    while (k < forward) {
+      const uint16_t word = match_system(big_endian)
+                                ? buf[k]
+                                : scalar::utf16::swap_bytes(buf[k]);
+      if ((word & 0xF800) != 0xD800) {
+        *utf32_output++ = char32_t(word);
+        k++;
+        continue;
+      }
+      // must be a surrogate pair
+      const uint16_t diff = uint16_t(word - 0xD800);
+      const uint16_t next_word = match_system(big_endian)
+                                     ? buf[k + 1]
+                                     : scalar::utf16::swap_bytes(buf[k + 1]);
+      const uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if ((diff | diff2) > 0x3FF) {
+        // The error is reported at the first code unit of the pair.
+        return std::make_pair(result(error_code::SURROGATE, buf - origin + k),
+                              reinterpret_cast<char32_t *>(utf32_output));
+      }
+      const uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      k += 2;
+    }
+    buf += k;
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - origin),
+                        reinterpret_cast<char32_t *>(utf32_output));
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/arm64/arm_convert_utf16_to_utf8.cpp
new file mode 100644
index 000000000..868663368
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf16_to_utf8.cpp
@@ -0,0 +1,587 @@
+/*
+ The vectorized algorithm works on a single 128-bit NEON register, i.e., it
+ loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we only rearrange a few bits to obtain these 2-byte
+ codes and finally compress the whole vector register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two vector registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+ Converts UTF-16 to UTF-8. Returns a pair: the first unprocessed code unit of
+ buf (nullptr if an invalid surrogate pair was met) and the end of the written
+ UTF-8 output. A scalar routine should carry on the conversion of the tail. */
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char16_t *end = buf + len;
+
+ const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); // keeps the top five bits
+ const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); // top five bits of a surrogate
+ const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); // 2-byte markers: 110xxxxx 10xxxxxx
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+ if (!match_system(big_endian)) {
+ in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+ }
+ if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+ // It is common enough that we have sequences of 16 consecutive ASCII
+ // characters, so the next eight code units are loaded eagerly.
+ uint16x8_t nextin =
+ vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+ if (!match_system(big_endian)) {
+ nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
+ }
+ if (vmaxvq_u16(nextin) > 0x7F) {
+ // Only the first eight code units are ASCII.
+ // 1. pack the bytes (obviously suboptimal).
+ uint8x8_t utf8_packed = vmovn_u16(in);
+ // 2. store (8 bytes)
+ vst1_u8(utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ in = nextin; // the second register is converted by the code below
+ } else {
+ // All sixteen code units are ASCII.
+ // 1. pack the bytes (obviously suboptimal).
+ uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+ // 2. store (16 bytes)
+ vst1q_u8(utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ }
+
+ if (vmaxvq_u16(in) <= 0x7FF) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+ const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const uint16x8_t t0 = vshlq_n_u16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const uint16x8_t t2 = vandq_u16(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const uint16x8_t t3 = vorrq_u16(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+ const uint8x16_t utf8_unpacked =
+ vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+ // 3. prepare bitmask for 8-bit lookup (one distinct bit per ASCII lane)
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t mask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+ const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+ uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+ // 4. pack the bytes: row[0] is the output length, row[1..16] the shuffle
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const uint8x16_t shuffle = vld1q_u8(row + 1);
+ const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+ // 5. store bytes (a full 16-byte store; only row[0] of them are kept)
+ vst1q_u8(utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ }
+ const uint16x8_t surrogates_bytemask =
+ vceqq_u16(vandq_u16(in, v_f800), v_d800);
+ // It might seem like checking for surrogates_bytemask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (vmaxvq_u16(surrogates_bytemask) == 0) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+ const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const uint16x8_t t0 = vreinterpretq_u16_u8(
+ vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ const uint16x8_t s0 = vshrq_n_u16(in, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+ // [00bb|bbbb|0000|aaaa]
+ const uint16x8_t s2 = vorrq_u16(s0, s1s);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+ const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+ const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+ const uint16x8_t m0 = // 0x4000 only in lanes that need three bytes
+ vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+ const uint16x8_t s4 = veorq_u16(s3, m0); // turns 11bb|bbbb into 10bb|bbbb there
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+ const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t onemask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+ const uint16x8_t twomask = simdutf_make_uint16x8_t(
+ 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+ const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0100, 0x0400, 0x1000, 0x4000};
+ const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+ 0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+ const uint16x8_t combined =
+ vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+ vandq_u16(one_or_two_bytes_bytemask, twomask));
+ const uint16_t mask = vaddvq_u16(combined); // two bits per input lane
+ // The following fast path may or may not be beneficial.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += 12;
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask); // low byte: lanes 0..3 (out0)
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // lanes 4..7 (out1)
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += row0[0]; // row[0] is the number of bytes to keep
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // a high surrogate at index k also reads buf[k + 1]
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k + 1])
+ : buf[k + 1];
+ k++; // the second half of the pair is consumed as well
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // not a high surrogate followed by a low one
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+/*
+ Converts UTF-16 to UTF-8. Returns a pair: a result struct and utf8_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed code unit in
+ buf (even if finished). A scalar routine should carry on the conversion of
+ the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+ char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+
+ const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); // keeps the top five bits
+ const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); // top five bits of a surrogate
+ const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); // 2-byte markers: 110xxxxx 10xxxxxx
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+ if (!match_system(big_endian)) {
+ in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+ }
+ if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+ // It is common enough that we have sequences of 16 consecutive ASCII
+ // characters, so the next eight code units are loaded eagerly.
+ uint16x8_t nextin =
+ vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+ if (!match_system(big_endian)) {
+ nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
+ }
+ if (vmaxvq_u16(nextin) > 0x7F) {
+ // Only the first eight code units are ASCII.
+ // 1. pack the bytes (obviously suboptimal).
+ uint8x8_t utf8_packed = vmovn_u16(in);
+ // 2. store (8 bytes)
+ vst1_u8(utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ in = nextin; // the second register is converted by the code below
+ } else {
+ // All sixteen code units are ASCII.
+ // 1. pack the bytes (obviously suboptimal).
+ uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+ // 2. store (16 bytes)
+ vst1q_u8(utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ }
+
+ if (vmaxvq_u16(in) <= 0x7FF) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+ const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const uint16x8_t t0 = vshlq_n_u16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const uint16x8_t t2 = vandq_u16(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const uint16x8_t t3 = vorrq_u16(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+ const uint8x16_t utf8_unpacked =
+ vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+ // 3. prepare bitmask for 8-bit lookup (one distinct bit per ASCII lane)
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t mask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+ const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+ uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+ // 4. pack the bytes: row[0] is the output length, row[1..16] the shuffle
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const uint8x16_t shuffle = vld1q_u8(row + 1);
+ const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+ // 5. store bytes (a full 16-byte store; only row[0] of them are kept)
+ vst1q_u8(utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ }
+ const uint16x8_t surrogates_bytemask =
+ vceqq_u16(vandq_u16(in, v_f800), v_d800);
+ // It might seem like checking for surrogates_bytemask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (vmaxvq_u16(surrogates_bytemask) == 0) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+ const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const uint16x8_t t0 = vreinterpretq_u16_u8(
+ vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ const uint16x8_t s0 = vshrq_n_u16(in, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+ // [00bb|bbbb|0000|aaaa]
+ const uint16x8_t s2 = vorrq_u16(s0, s1s);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+ const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+ const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+ const uint16x8_t m0 = // 0x4000 only in lanes that need three bytes
+ vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+ const uint16x8_t s4 = veorq_u16(s3, m0); // turns 11bb|bbbb into 10bb|bbbb there
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+ const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t onemask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+ const uint16x8_t twomask = simdutf_make_uint16x8_t(
+ 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+ const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0100, 0x0400, 0x1000, 0x4000};
+ const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+ 0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+ const uint16x8_t combined =
+ vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+ vandq_u16(one_or_two_bytes_bytemask, twomask));
+ const uint16_t mask = vaddvq_u16(combined); // two bits per input lane
+ // The following fast path may or may not be beneficial.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += 12;
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask); // low byte: lanes 0..3 (out0)
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // lanes 4..7 (out1)
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += row0[0]; // row[0] is the number of bytes to keep
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // a high surrogate at index k also reads buf[k + 1]
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k + 1])
+ : buf[k + 1];
+ k++; // the second half of the pair is consumed as well
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // not a high surrogate followed by a low one
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // index of the pair's first unit
+ reinterpret_cast<char *>(utf8_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/arm64/arm_convert_utf32_to_latin1.cpp
new file mode 100644
index 000000000..b4e09013c
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf32_to_latin1.cpp
@@ -0,0 +1,60 @@
+// Narrows UTF-32 to Latin-1 eight code units at a time. Returns the first
+// unconverted input position (nullptr if a value above 0xff was seen) and
+// the end of the written output; fewer than eight trailing units are left.
+std::pair<const char32_t *, char *>
+arm_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *const stop = buf + len;
+ for (; stop - buf >= 8; buf += 8, latin1_output += 8) {
+ const uint32_t *src = reinterpret_cast<const uint32_t *>(buf);
+ const uint32x4_t lower = vld1q_u32(src);
+ const uint32x4_t upper = vld1q_u32(src + 4);
+ // Saturating narrow keeps anything above 0xff detectable in 16 bits.
+ const uint16x8_t narrowed =
+ vcombine_u16(vqmovn_u32(lower), vqmovn_u32(upper));
+ if (vmaxvq_u16(narrowed) > 0xff) {
+ // At least one code unit does not fit in Latin-1.
+ return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+ }
+ // Drop the high byte of each lane and store the eight Latin-1 bytes.
+ vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), vmovn_u16(narrowed));
+ }
+ return std::make_pair(buf, latin1_output);
+}
+
+// Narrows UTF-32 to Latin-1 eight code units at a time. On a value above 0xff
+// returns TOO_LARGE with its index; otherwise SUCCESS with the number of code
+// units consumed (a tail shorter than eight is left unconverted).
+std::pair<result, char *>
+arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *const first = buf;
+ const char32_t *const stop = buf + len;
+
+ while (stop - buf >= 8) {
+ const uint32_t *src = reinterpret_cast<const uint32_t *>(buf);
+ const uint32x4_t lower = vld1q_u32(src);
+ const uint32x4_t upper = vld1q_u32(src + 4);
+ const uint16x8_t narrowed =
+ vcombine_u16(vqmovn_u32(lower), vqmovn_u32(upper));
+ if (vmaxvq_u16(narrowed) > 0xff) {
+ // Scalar pass over the same eight units: emit the valid prefix and
+ // report the index of the first unit that does not fit.
+ for (int idx = 0; idx < 8; idx++) {
+ const uint32_t unit = buf[idx];
+ if (unit > 0xff) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - first + idx), latin1_output);
+ }
+ *latin1_output++ = char(unit);
+ }
+ continue;
+ }
+ // All eight fit: truncate each lane to a byte and store.
+ vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), vmovn_u16(narrowed));
+ buf += 8;
+ latin1_output += 8;
+ }
+ return std::make_pair(result(error_code::SUCCESS, buf - first),
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/arm64/arm_convert_utf32_to_utf16.cpp
new file mode 100644
index 000000000..9453c2e29
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf32_to_utf16.cpp
@@ -0,0 +1,151 @@
+// Converts UTF-32 to UTF-16 four code units at a time. Returns the first
+// unprocessed input position (nullptr on a surrogate or a value above
+// 0x10FFFF) and the end of the written output.
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+arm_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_out) {
+ uint16_t *dst = reinterpret_cast<uint16_t *>(utf16_out);
+ const char32_t *const stop = buf + len;
+
+ // Sticky per-lane flag: set once a vector block contained a surrogate.
+ uint16x4_t surrogate_seen = vmov_n_u16(0x0);
+
+ while (stop - buf >= 4) {
+ const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+ if (vmaxvq_u32(in) > 0xFFFF) {
+ // Some unit needs a surrogate pair: convert up to three units one by one.
+ size_t limit = 3;
+ if (size_t(stop - buf) < limit + 1) {
+ limit = size_t(stop - buf - 1);
+ }
+ size_t idx = 0;
+ for (; idx < limit; idx++) {
+ const uint32_t cp = buf[idx];
+ if (cp > 0xFFFF) {
+ // Supplementary plane: split into a high/low surrogate pair.
+ if (cp > 0x10FFFF) {
+ return std::make_pair(nullptr, reinterpret_cast<char16_t *>(dst));
+ }
+ const uint32_t offset = cp - 0x10000;
+ uint16_t lead = uint16_t(0xD800 + (offset >> 10));
+ uint16_t trail = uint16_t(0xDC00 + (offset & 0x3FF));
+ if (!match_system(big_endian)) {
+ lead = uint16_t(lead >> 8 | lead << 8);
+ trail = uint16_t(trail << 8 | trail >> 8);
+ }
+ *dst++ = char16_t(lead);
+ *dst++ = char16_t(trail);
+ continue;
+ }
+ // Single UTF-16 unit; a surrogate code point here is invalid input.
+ if (cp >= 0xD800 && cp <= 0xDFFF) {
+ return std::make_pair(nullptr, reinterpret_cast<char16_t *>(dst));
+ }
+ if (match_system(big_endian)) {
+ *dst++ = char16_t(cp);
+ } else {
+ *dst++ = char16_t(cp >> 8 | cp << 8);
+ }
+ }
+ buf += idx;
+ continue;
+ }
+
+ // All four units fit in 16 bits: narrow them and remember surrogates.
+ uint16x4_t narrowed = vmovn_u32(in);
+ const uint16x4_t at_most_dfff = vcle_u16(narrowed, vmov_n_u16((uint16_t)0xdfff));
+ const uint16x4_t at_least_d800 = vcge_u16(narrowed, vmov_n_u16((uint16_t)0xd800));
+ surrogate_seen = vorr_u16(vand_u16(at_most_dfff, at_least_d800), surrogate_seen);
+
+ if (!match_system(big_endian)) {
+ narrowed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(narrowed)));
+ }
+ vst1_u16(dst, narrowed);
+ dst += 4;
+ buf += 4;
+ }
+
+ // check for invalid input
+ if (vmaxv_u16(surrogate_seen) != 0) {
+ return std::make_pair(nullptr, reinterpret_cast<char16_t *>(dst));
+ }
+
+ return std::make_pair(buf, reinterpret_cast<char16_t *>(dst));
+}
+
+// Converts UTF-32 to UTF-16 four code units at a time, reporting failures.
+// On error the result holds SURROGATE or TOO_LARGE and an input position;
+// otherwise SUCCESS and the number of input code units consumed.
+template <endianness big_endian>
+std::pair<result, char16_t *>
+arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_out) {
+ uint16_t *dst = reinterpret_cast<uint16_t *>(utf16_out);
+ const char32_t *const first = buf;
+ const char32_t *const stop = buf + len;
+
+ while (stop - buf >= 4) {
+ const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+ if (vmaxvq_u32(in) > 0xFFFF) {
+ // Some unit needs a surrogate pair: convert up to three units one by one.
+ size_t limit = 3;
+ if (size_t(stop - buf) < limit + 1) {
+ limit = size_t(stop - buf - 1);
+ }
+ size_t idx = 0;
+ for (; idx < limit; idx++) {
+ const uint32_t cp = buf[idx];
+ if (cp > 0xFFFF) {
+ // Supplementary plane: split into a high/low surrogate pair.
+ if (cp > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - first + idx),
+ reinterpret_cast<char16_t *>(dst));
+ }
+ const uint32_t offset = cp - 0x10000;
+ uint16_t lead = uint16_t(0xD800 + (offset >> 10));
+ uint16_t trail = uint16_t(0xDC00 + (offset & 0x3FF));
+ if (!match_system(big_endian)) {
+ lead = uint16_t(lead >> 8 | lead << 8);
+ trail = uint16_t(trail << 8 | trail >> 8);
+ }
+ *dst++ = char16_t(lead);
+ *dst++ = char16_t(trail);
+ continue;
+ }
+ // Single UTF-16 unit; a surrogate code point here is invalid input.
+ if (cp >= 0xD800 && cp <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - first + idx),
+ reinterpret_cast<char16_t *>(dst));
+ }
+ if (match_system(big_endian)) {
+ *dst++ = char16_t(cp);
+ } else {
+ *dst++ = char16_t(cp >> 8 | cp << 8);
+ }
+ }
+ buf += idx;
+ continue;
+ }
+ // All four units fit in 16 bits: narrow and reject surrogates at once.
+ uint16x4_t narrowed = vmovn_u32(in);
+ const uint16x4_t at_most_dfff = vcle_u16(narrowed, vmov_n_u16((uint16_t)0xdfff));
+ const uint16x4_t at_least_d800 = vcge_u16(narrowed, vmov_n_u16((uint16_t)0xd800));
+ if (vmaxv_u16(vand_u16(at_most_dfff, at_least_d800)) != 0) {
+ // Position is the start of the four-unit block, not the exact unit.
+ return std::make_pair(result(error_code::SURROGATE, buf - first),
+ reinterpret_cast<char16_t *>(dst));
+ }
+ if (!match_system(big_endian)) {
+ narrowed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(narrowed)));
+ }
+ vst1_u16(dst, narrowed);
+ dst += 4;
+ buf += 4;
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - first),
+ reinterpret_cast<char16_t *>(dst));
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/arm64/arm_convert_utf32_to_utf8.cpp
new file mode 100644
index 000000000..63870eedb
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf32_to_utf8.cpp
@@ -0,0 +1,505 @@
+ std::pair<const char32_t *, char *> // NEON UTF-32 -> UTF-8; returns {input position reached, end of written output}, or {nullptr, end of written output} on invalid input
+ arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char32_t *end = buf + len;
+
+ const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+ uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); // accumulates surrogate hits from the SIMD 3-byte path; tested once after the loop
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (buf + 16 + safety_margin < end) { // needs more than 28 code points left; the remaining tail is NOT converted here (the returned input pointer says where we stopped)
+ uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+ uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+
+ // Check that none of the 8 loaded code points has a bit set above the low 16
+ if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+ // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+ // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+ uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+ if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+ // 2. store (8 bytes)
+ vst1_u8(utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ continue; // we are done for this round!
+ }
+
+ if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+ const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const uint16x8_t t3 = vorrq_u16(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+ const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
+ vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+ // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t mask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+ const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+ uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+ // 4. pack the bytes
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const uint8x16_t shuffle = vld1q_u8(row + 1); // row[0] is the output byte count (used below), row[1..16] the shuffle indices
+ const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+ // 5. store bytes (a full 16-byte store; only the first row[0] bytes are kept)
+ vst1q_u8(utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ } else {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+ const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+ forbidden_bytemask =
+ vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff),
+ vcgeq_u16(utf16_packed, v_d800)),
+ forbidden_bytemask);
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+ const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const uint16x8_t t0 =
+ vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
+ vreinterpretq_u8_u16(dup_even)));
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ const uint16x8_t s1 =
+ vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+ // [00bb|bbbb|0000|aaaa]
+ const uint16x8_t s2 = vorrq_u16(s0, s1s);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+ const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+ const uint16x8_t one_or_two_bytes_bytemask =
+ vcleq_u16(utf16_packed, v_07ff);
+ const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
+ one_or_two_bytes_bytemask);
+ const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+ const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t onemask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+ const uint16x8_t twomask = simdutf_make_uint16x8_t(
+ 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+ const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0100, 0x0400, 0x1000, 0x4000};
+ const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+ 0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+ const uint16x8_t combined =
+ vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+ vandq_u16(one_or_two_bytes_bytemask, twomask));
+ const uint16_t mask = vaddvq_u16(combined);
+ // The following fast path may or may not be beneficial.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += 12;
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += row0[0];
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ }
+ // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes.
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // defensive clamp; cannot trigger while the loop condition guarantees end - buf > 28
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) { // surrogate values are rejected
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) { // values above 0x10FFFF are rejected
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // check for invalid input: surrogates seen by the SIMD 3-byte path above
+ if (vmaxvq_u16(forbidden_bytemask) != 0) {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+ }
+ return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+ std::pair<result, char *> // NEON UTF-32 -> UTF-8 with error reporting; returns {result(code, input offset), end of written output}
+ arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (buf + 16 + safety_margin < end) { // needs more than 28 code points left; the remaining tail is NOT converted here (the SUCCESS count says where we stopped)
+ uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+ uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+
+ // Check that none of the 8 loaded code points has a bit set above the low 16
+ if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+ // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+ // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+ uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+ if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+ // 2. store (8 bytes)
+ vst1_u8(utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ continue; // we are done for this round!
+ }
+
+ if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+ const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const uint16x8_t t3 = vorrq_u16(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+ const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
+ vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+ // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t mask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+ const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+ uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+ // 4. pack the bytes
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const uint8x16_t shuffle = vld1q_u8(row + 1); // row[0] is the output byte count (used below), row[1..16] the shuffle indices
+ const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+ // 5. store bytes (a full 16-byte store; only the first row[0] bytes are kept)
+ vst1q_u8(utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ } else {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+ // check for invalid input (surrogates). NOTE(review): the reported offset is the start of this 8-code-point chunk, not the offending element -- confirm callers expect that
+ const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+ const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+ const uint16x8_t forbidden_bytemask = vandq_u16(
+ vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
+ if (vmaxvq_u16(forbidden_bytemask) != 0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+ }
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+ const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const uint16x8_t t0 =
+ vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
+ vreinterpretq_u8_u16(dup_even)));
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ const uint16x8_t s1 =
+ vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+ // [00bb|bbbb|0000|aaaa]
+ const uint16x8_t s2 = vorrq_u16(s0, s1s);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+ const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+ const uint16x8_t one_or_two_bytes_bytemask =
+ vcleq_u16(utf16_packed, v_07ff);
+ const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
+ one_or_two_bytes_bytemask);
+ const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+ const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+ const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint16x8_t onemask = simdutf_make_uint16x8_t(
+ 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+ const uint16x8_t twomask = simdutf_make_uint16x8_t(
+ 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+ const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+ 0x0100, 0x0400, 0x1000, 0x4000};
+ const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+ 0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+ const uint16x8_t combined =
+ vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+ vandq_u16(one_or_two_bytes_bytemask, twomask));
+ const uint16_t mask = vaddvq_u16(combined);
+ // The following fast path may or may not be beneficial.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += 12;
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+ const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+ const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+ vst1q_u8(utf8_output, utf8_0);
+ utf8_output += row0[0];
+ vst1q_u8(utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ }
+ // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes.
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // defensive clamp; cannot trigger while the loop condition guarantees end - buf > 28
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) { // surrogate: report the exact offset of this element
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) { // above 0x10FFFF: report the exact offset of this element
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/arm64/arm_convert_utf8_to_latin1.cpp
new file mode 100644
index 000000000..b815279ee
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf8_to_latin1.cpp
@@ -0,0 +1,69 @@
+ // Convert up to 16 bytes from utf8 to latin1 using a mask indicating the
+ // end of the code points. Only the least significant 12 bits of the mask
+ // are used for the table lookup; the fast-path test reads the whole mask.
+ // It returns how many bytes were consumed (12 on the fast path, otherwise the table's count).
+size_t convert_masked_utf8_to_latin1(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char *&latin1_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+
+ // We first try a few fast paths.
+ // The obvious first test is ASCII: 12 one-byte code points, consuming 12 bytes. NOTE(review): this compares the unmasked 64-bit mask, so it only fires when every bit above the low 12 is zero -- confirm that is intended.
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // We process in chunks of 12 bytes
+ vst1q_u8(reinterpret_cast<uint8_t *>(latin1_output), in); // stores 16 bytes; only the first 12 are kept
+ latin1_output += 12; // We wrote 12 8-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ /// We do not have a fast path available, or the fast path is unimportant, so
+ /// we fallback.
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ // this indicates an invalid input (for Latin-1): nothing is written, only the table's consumed count is returned
+ if (idx >= 64) {
+ return consumed;
+ }
+ // Here we have idx < 64 (any other value returned above). This is the
+ // relatively easy scenario: we process SIX (6) input code points, each
+ // spanning 1 or 2 bytes, so at most 12 input bytes are involved. The
+ // shuffle below places each code point in its own 16-bit lane, the masks
+ // and the shift-right-accumulate rebuild its value, and the final narrowing
+ // keeps the low 8 bits of each lane: six 1-2 byte UTF-8 characters become
+ // six Latin-1 bytes.
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+ // Shuffle
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 110aaaaa 10bbbbbb
+ uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
+ // Mask
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000000 00bbbbbb
+ uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
+ // 1 byte: 00000000 00000000
+ // 2 byte: 000aaaaa 00000000
+ uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
+ // Combine with a shift right accumulate
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000aaa aabbbbbb
+ uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
+ // writing 8 bytes even though we only care about the first 6 bytes.
+ uint8x8_t latin1_packed = vmovn_u16(composed);
+ vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
+ latin1_output += 6; // We wrote 6 bytes.
+ return consumed;
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/arm64/arm_convert_utf8_to_utf16.cpp
new file mode 100644
index 000000000..6683e6263
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf8_to_utf16.cpp
@@ -0,0 +1,288 @@
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 16, usually 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char16_t *&utf16_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+
+ // We first try a few fast paths.
+ // The obvious first test is ASCII, which actually consumes the full 16.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
+ // We process in chunks of 16 bytes
+ // The routine in simd.h is reused.
+ simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
+ temp.store_ascii_as_utf16<big_endian>(utf16_output);
+ utf16_output += 16; // We wrote 16 16-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+
+ // 3 byte sequences are the next most common, as seen in CJK, which has long
+ // sequences of these.
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+ // UTF-16 code units.
+ uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
+ }
+ vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+ utf16_output += 4; // We wrote 4 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+
+ // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+ if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
+ // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
+ // UTF-16 code units.
+ uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed =
+ vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+ }
+ vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+
+ utf16_output += 6; // We wrote 6 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+
+ /// We do not have a fast path available, or the fast path is unimportant, so
+ /// we fallback.
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+
+ if (idx < 64) {
+ // SIX (6) input code-code units
+ // Convert to UTF-16
+ uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed =
+ vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+ }
+ // Store
+ vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+ utf16_output += 6; // We wrote 6 16-bit characters.
+ return consumed;
+ } else if (idx < 145) {
+ // FOUR (4) input code-code units
+ // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+ // XXX: depending on the system scalar instructions might be faster.
+ // 1 byte: 00000000 00000000 0ccccccc
+ // 2 byte: 00000000 110bbbbb 10cccccc
+ // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+ uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: xx0bbbbb x0cccccc
+ // 3 byte: xxbbbbbb x0cccccc
+ uint16x4_t lowperm = vmovn_u32(perm);
+ // Partially mask with bic (doesn't require a temporary register unlike and)
+ // The shift left insert below will clear the top bits.
+ // 1 byte: 00000000 00000000
+ // 2 byte: xx0bbbbb 00000000
+ // 3 byte: xxbbbbbb 00000000
+ uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
+ // ASCII
+ // 1 byte: 00000000 0ccccccc
+ // 2+byte: 00000000 00cccccc
+ uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
+ // Split into narrow vectors.
+ // 2 byte: 00000000 00000000
+ // 3 byte: 00000000 xxxxaaaa
+ uint16x4_t highperm = vshrn_n_u32(perm, 16);
+ // Shift right accumulate the middle byte
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: 00xx0bbb bbcccccc
+ // 3 byte: 00xxbbbb bbcccccc
+ uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
+ // Shift left and insert the top 4 bits, overwriting the garbage
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: 00000bbb bbcccccc
+ // 3 byte: aaaabbbb bbcccccc
+ uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
+ }
+ vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+
+ utf16_output += 4; // We wrote 4 16-bit codepoints
+ return consumed;
+ } else if (idx < 209) {
+ // THREE (3) input code-code units
+ if (input_utf8_end_of_code_point_mask == 0x888) {
+ // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+ // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
+ // it is easier when we can assume they are all pairs. This version does
+ // not use the LUT, but 4 byte sequences are less common and the overhead
+ // of the extra memory access is less important than the early branch
+ // overhead in shorter sequences.
+
+ // Swap byte pairs
+ // 10dddddd 10cccccc|10bbbbbb 11110aaa
+ // 10cccccc 10dddddd|11110aaa 10bbbbbb
+ uint8x16_t swap = vrev16q_u8(in);
+ // Shift left 2 bits
+ // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+ uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
+ // Create a magic number containing the low 2 bits of the trail surrogate
+ // and all the corrections needed to create the pair. UTF-8 4b prefix =
+ // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6)
+ // surrogate high = +0x0000|0xD800
+ // surrogate low = +0xDC00|0x0000
+ // -------------------------------
+ // = +0xDC00|0xE7C0
+ uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
+ // Generate unadjusted trail surrogate minus lowest 2 bits
+ // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+ uint32x4_t trail =
+ vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
+ // Insert low 2 bits of trail surrogate to magic number for later
+ // 11011100 00000000 11100111 110000cc
+ uint16x8_t magic_with_low_2 =
+ vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
+ // Generate lead surrogate
+ // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+ uint32x4_t lead = vreinterpretq_u32_u16(
+ vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
+ // Mask out lead
+ // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+ lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
+ // Blend pairs
+ // 000000cc ccdddddd|11110aaa bbbbbb00
+ uint16x8_t blend = vreinterpretq_u16_u32(
+ vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
+ // Add magic number to finish the result
+ // 110111CC CCDDDDDD|110110AA BBBBBBCC
+ uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed =
+ vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+ }
+ uint16_t buffer[8];
+ vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
+ for (int k = 0; k < 6; k++) {
+ utf16_output[k] = buffer[k];
+ } // the loop might compile to a couple of instructions.
+ utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+ return 12; // We consumed 12 bytes.
+ }
+ // 3 1-4 byte sequences
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+
+ // 1 byte: 00000000 00000000 00000000 0ddddddd
+ // 2 byte: 00000000 00000000 110ccccc 10dddddd
+ // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+ // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+ uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+ // added to fix issue https://github.com/simdutf/simdutf/issues/514
+ // We only want to write 2 * 16-bit code units when that is actually what we
+ // have. Unfortunately, we cannot trust the input. So it is possible to get
+ // 0xff as an input byte and it should not result in a surrogate pair. We
+ // need to check for that.
+ uint32_t permbuffer[4];
+ vst1q_u32(permbuffer, perm);
+ // Mask the low and middle bytes
+ // 00000000 00000000 00000000 0ddddddd
+ uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
+ // Because the surrogates need more work, the high surrogate is computed
+ // first.
+ uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
+ // 00000000 00000000 00cccccc 00000000
+ uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
+ // Start assembling the sequence. Since the 4th byte is in the same position
+ // as it would be in a surrogate and there is no dependency, shift left
+ // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte:
+ // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+ uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
+ // Top 16 bits contains the high ten bits of the surrogate pair before
+ // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa
+ // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
+ uint32x4_t abc =
+ vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
+ // Combine the low 6 or 7 bits by a shift right accumulate
+ // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+ // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
+ // correction
+ uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
+ // After this is for surrogates
+ // Blend the low and high surrogates
+ // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+ uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
+ // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
+ // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte:
+ // 11110aaa bbbbbbcc|000000cc ccdddddd
+ uint16x8_t masked_pair = vreinterpretq_u16_u32(
+ vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
+ // Correct the remaining UTF-8 prefix, surrogate offset, and add the
+ // surrogate prefixes in one magic 16-bit addition. similar magic number but
+ // without the continue byte adjust and halfword swapped UTF-8 4b prefix =
+ // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6)
+ // surrogate high = +0xD800|0x0000
+ // surrogate low = +0x0000|0xDC00
+ // -----------------------------------
+ // = +0xE7C0|0xDC00
+ uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
+ // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+ uint32x4_t surrogates =
+ vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
+ // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+ uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));
+
+ // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+ // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+ // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+ uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ selected =
+ vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
+ }
+ // Attempting to shuffle and store would be complex, just scalarize.
+ uint32_t buffer[4];
+ vst1q_u32(buffer, selected);
+ // Test for the top bit of the surrogate mask. Remove due to issue 514
+ // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
+ // 0x00800000;
+ for (size_t i = 0; i < 3; i++) {
+ // Surrogate
+ // Used to be if (buffer[i] & SURROGATE_MASK) {
+ // See discussion above.
+ // patch for issue https://github.com/simdutf/simdutf/issues/514
+ if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
+ utf16_output[0] = uint16_t(buffer[i] >> 16);
+ utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
+ utf16_output += 2;
+ } else {
+ utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
+ utf16_output++;
+ }
+ }
+ return consumed;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ return 12;
+ }
+}
diff --git a/contrib/simdutf/src/arm64/arm_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/arm64/arm_convert_utf8_to_utf32.cpp
new file mode 100644
index 000000000..1167b80bf
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_convert_utf8_to_utf32.cpp
@@ -0,0 +1,179 @@
+// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
+// end of the code points. Apart from the ASCII fast-path test, only the least
+// significant 12 bits of the mask are used. utf32_out is advanced past the
+// code points written. It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_utf32(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char32_t *&utf32_out) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ // utf32_output aliases utf32_out: advancing it advances the caller's pointer.
+ uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
+ uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xFFF;
+ // Note that the load above fills a full 16-byte vector, not just 12 bytes.
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ if (utf8_end_of_code_point_mask == 0xfff) { // NOTE(review): tests the full 64-bit mask, unlike the 12-bit tests below -- confirm intended
+ // We process in chunks of 12 bytes.
+ // use fast implementation in src/simdutf/arm64/simd.h
+ // Ideally the compiler can keep the tables in registers.
+ simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
+ temp.store_ascii_as_utf32_tbl(utf32_out);
+ utf32_output += 12; // We wrote 12 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 sequences and turn them into 4 4-byte
+ // UTF-32 code units. Convert to UTF-16
+ uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+ // Zero extend and store via ST2 with a zero.
+ uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}};
+ vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+ utf32_output += 4; // We wrote 4 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+
+ // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+ if (input_utf8_end_of_code_point_mask == 0xaaa) {
+ // We want to take 6 2-byte UTF-8 sequences and turn them into 6 4-byte
+ // UTF-32 code units. Convert to UTF-16
+ uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
+ // Zero extend and store via ST2 with a zero.
+ uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
+ vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+ utf32_output += 6; // We keep 6 32-bit characters (the ST2 above stored 8 lanes).
+ return 12; // We consumed 12 bytes.
+ }
+ /// Either no fast path or an unimportant fast path.
+
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+
+ if (idx < 64) {
+ // SIX (6) input code points, 1 or 2 bytes each
+ // Convert to UTF-16
+ uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+ // Zero extend and store with ST2 and zero
+ uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
+ vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+ utf32_output += 6; // We keep 6 32-bit characters (the ST2 above stored 8 lanes).
+ return consumed;
+ } else if (idx < 145) {
+ // FOUR (4) input code points, 1 to 3 bytes each
+ // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+ // Shuffle
+ // 1 byte: 00000000 00000000 0ccccccc
+ // 2 byte: 00000000 110bbbbb 10cccccc
+ // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+ uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+ // Split
+ // 00000000 00000000 0ccccccc
+ uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits
+ // Note: unmasked
+ // xxxxxxxx aaaaxxxx xxxxxxxx
+ uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits
+ // Use 16 bit bic instead of and.
+ // The top bits will be corrected later in the bsl
+ // 00000000 10bbbbbb 00000000
+ uint32x4_t middle = vreinterpretq_u32_u16(
+ vbicq_u16(vreinterpretq_u16_u32(perm),
+ vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
+ // Combine low and middle with shift right accumulate
+ // 00000000 00xxbbbb bbcccccc
+ uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
+ // Insert top 4 bits from high byte with bitwise select
+ // 00000000 aaaabbbb bbcccccc
+ uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
+ vst1q_u32(utf32_output, composed);
+ utf32_output += 4; // We wrote 4 32-bit characters.
+ return consumed;
+ } else if (idx < 209) {
+ // THREE (3) input code points, 1 to 4 bytes each
+ if (input_utf8_end_of_code_point_mask == 0x888) {
+ // We want to take 3 4-byte UTF-8 sequences and turn them into 3 4-byte
+ // UTF-32 code units. This uses the same method as the fixed 3 byte
+ // version, reversing and shift left insert. However, there is no need for
+ // a shuffle mask now, just rev16 and rev32.
+ //
+ // This version does not use the LUT, but 4 byte sequences are less common
+ // and the overhead of the extra memory access is less important than the
+ // early branch overhead in shorter sequences, so it comes last.
+
+ // Swap pairs of bytes
+ // 10dddddd|10cccccc|10bbbbbb|11110aaa
+ // 10cccccc 10dddddd|11110aaa 10bbbbbb
+ uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
+ // Shift left and insert
+ // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+ uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
+ // Swap 16-bit lanes
+ // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
+ // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
+ uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
+ // Shift insert again
+ // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+ uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
+ // Clear the garbage
+ // 00000000 000aaabb bbbbcccc ccdddddd
+ uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
+ // Store (all 4 lanes are written; only the first 3 are kept)
+ vst1q_u32(utf32_output, composed);
+
+ utf32_output += 3; // We wrote 3 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
+ // due to surrogates no longer being involved.
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+ // 1 byte: 00000000 00000000 00000000 0ddddddd
+ // 2 byte: 00000000 00000000 110ccccc 10dddddd
+ // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+ // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+ uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+ // Ascii
+ uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
+ uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
+ // When converting the way we do, the 3 byte prefix will be interpreted as
+ // the 18th bit being set, since the code would interpret the lead byte
+ // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can
+ // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since
+ // NEON has shift right accumulate, we use that.
+ // 4 byte 3 byte
+ // 10bbbbbb 1110bbbb
+ // 00000000 01000000 6th bit
+ // 00000000 00100000 shift right
+ // 10bbbbbb 0000bbbb add
+ // 00bbbbbb 0000bbbb mask
+ uint8x16_t correction =
+ vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
+ uint32x4_t corrected = vreinterpretq_u32_u8(
+ vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
+ // 00000000 00000000 0000cccc ccdddddd
+ uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
+ // Insert twice
+ // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
+ uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6),
+ vshrq_n_u32(corrected, 4));
+ // 00000000 000aaabb bbbbcccc ccdddddd
+ uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
+ // Store (all 4 lanes are written; only the first 3 are kept)
+ vst1q_u32(utf32_output, composed);
+ utf32_output += 3; // We wrote 3 32-bit characters.
+ return consumed;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ return 12;
+ }
+}
diff --git a/contrib/simdutf/src/arm64/arm_validate_utf16.cpp b/contrib/simdutf/src/arm64/arm_validate_utf16.cpp
new file mode 100644
index 000000000..64586a4e4
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_validate_utf16.cpp
@@ -0,0 +1,143 @@
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input
+const char16_t *arm_validate_utf16(const char16_t *input, size_t size) {
+ const char16_t *end = input + size;
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+ while (end - input >= 16) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors (16 code units) into
+ // one which consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+ if (!match_system(big_endian)) {
+ in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
+ in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
+ }
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+ const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
+ if (surrogates_wordmask == 0) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ // ("low"/"high" is by numeric value here; Unicode names them the other way)
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint64_t V = ~surrogates_wordmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = ((in & v_fc) == v_dc);
+ const uint64_t H = vH.to_bitmask64();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint64_t L = ~H & surrogates_wordmask;
+
+ const uint64_t a =
+ L & (H >> 4); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last (16th) word
+ // is an exception we handle.)
+ const uint64_t b =
+ a << 4; // Just mark that the opposite fact holds,
+ // thanks to that we have only two masks for valid case.
+ const uint64_t c = V | a | b; // Combine all the masks into the final one.
+ if (c == ~0ull) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0xfffffffffffffffull) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The last word may be either a low or high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject sole high surrogate.
+ input += 15;
+ } else {
+ return nullptr; // some surrogate in this block is not properly paired
+ }
+ }
+ }
+ return input; // first code unit not validated here (fewer than 16 remain)
+}
+
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input
+const result arm_validate_utf16_with_errors(const char16_t *input,
+ size_t size) {
+ const char16_t *start = input;
+ const char16_t *end = input + size;
+ // On failure the reported count is the offset of the failing 16-unit block.
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+ while (end - input > 16) { // same test as input + 16 < end, without forming a pointer past the end
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors (16 code units) into
+ // one which consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+ if (!match_system(big_endian)) {
+ in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
+ in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
+ }
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+ const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
+ if (surrogates_wordmask == 0) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ // ("low"/"high" is by numeric value here; Unicode names them the other way)
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint64_t V = ~surrogates_wordmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = ((in & v_fc) == v_dc);
+ const uint64_t H = vH.to_bitmask64();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint64_t L = ~H & surrogates_wordmask;
+
+ const uint64_t a =
+ L & (H >> 4); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last (16th) word
+ // is an exception we handle.)
+ const uint64_t b =
+ a << 4; // Just mark that the opposite fact holds,
+ // thanks to that we have only two masks for valid case.
+ const uint64_t c = V | a | b; // Combine all the masks into the final one.
+ if (c == ~0ull) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0xfffffffffffffffull) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The last word may be either a low or high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject sole high surrogate.
+ input += 15;
+ } else {
+ return result(error_code::SURROGATE, input - start);
+ }
+ }
+ }
+ return result(error_code::SUCCESS, input - start);
+}
diff --git a/contrib/simdutf/src/arm64/arm_validate_utf32le.cpp b/contrib/simdutf/src/arm64/arm_validate_utf32le.cpp
new file mode 100644
index 000000000..490f1fdb1
--- /dev/null
+++ b/contrib/simdutf/src/arm64/arm_validate_utf32le.cpp
@@ -0,0 +1,65 @@
+
+const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) {
+ // Validates every whole group of four code points in [input, input + size).
+ // Returns the first unprocessed position, or nullptr if a processed value is invalid.
+ const char32_t *stop = input + size;
+ // Adding this bias (mod 2^32) moves the surrogate range 0xD800..0xDFFF to
+ // 0xFFFFF800..0xFFFFFFFF, i.e. strictly above surrogate_limit.
+ const uint32x4_t surrogate_bias = vmovq_n_u32(0xffff2000);
+ const uint32_t surrogate_limit = 0xfffff7ff;
+ const uint32_t largest_code_point = 0x10ffff;
+ // Per-lane running maxima of the raw and of the biased inputs.
+ uint32x4_t raw_max = vmovq_n_u32(0x0);
+ uint32x4_t biased_max = vmovq_n_u32(0x0);
+
+ for (; stop - input >= 4; input += 4) {
+ const uint32x4_t chunk =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input));
+ raw_max = vmaxq_u32(chunk, raw_max);
+ biased_max = vmaxq_u32(vaddq_u32(chunk, surrogate_bias), biased_max);
+ }
+
+ // A lane above U+10FFFF means some input value was out of range.
+ if (vmaxvq_u32(raw_max) > largest_code_point) {
+ return nullptr;
+ }
+ // A biased lane above the limit means some input value was a surrogate.
+ if (vmaxvq_u32(biased_max) > surrogate_limit) {
+ return nullptr;
+ }
+ return input;
+}
+
+const result arm_validate_utf32le_with_errors(const char32_t *input,
+ size_t size) {
+ // Checks whole groups of four code points and stops at the first bad group.
+ // The reported count is the offset of that group (a multiple of 4), not of
+ // the offending code point itself.
+ const char32_t *first = input;
+ const char32_t *stop = input + size;
+
+ const uint32_t largest_code_point = 0x10ffff;
+ // Adding the bias (mod 2^32) lifts the surrogates 0xD800..0xDFFF above
+ // surrogate_limit.
+ const uint32x4_t surrogate_bias = vmovq_n_u32(0xffff2000);
+ const uint32_t surrogate_limit = 0xfffff7ff;
+
+ for (; stop - input >= 4; input += 4) {
+ const uint32x4_t chunk =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input));
+
+ // Every earlier group passed both tests, so testing this group alone is
+ // equivalent to testing a running maximum over all groups so far.
+ if (vmaxvq_u32(chunk) > largest_code_point) {
+ return result(error_code::TOO_LARGE, input - first);
+ }
+
+ if (vmaxvq_u32(vaddq_u32(chunk, surrogate_bias)) > surrogate_limit) {
+ return result(error_code::SURROGATE, input - first);
+ }
+ }
+
+ // All whole groups are valid; fewer than four code points may remain
+ // unchecked past the returned count.
+ return result(error_code::SUCCESS, input - first);
+}
diff --git a/contrib/simdutf/src/arm64/implementation.cpp b/contrib/simdutf/src/arm64/implementation.cpp
new file mode 100644
index 000000000..b055fe2ec
--- /dev/null
+++ b/contrib/simdutf/src/arm64/implementation.cpp
@@ -0,0 +1,1185 @@
+#include "simdutf/arm64/begin.h"
+#include "simdutf/implementation.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_ARM64_H
+ #error "arm64.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+ simd8<uint8_t> folded = input.reduce_or(); // OR-reduction of the 64-byte block
+ return folded.max_val() < 0x80u; // ASCII iff the largest folded byte is below 0x80
+}
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+ // Bytes >= 0xC0 / 0xE0 / 0xF0 are lead bytes of sequences that are at least
+ // 2 / 3 / 4 bytes long. prevN presumably holds the input delayed by N bytes
+ // (TODO confirm against the caller), so each test marks a continuation slot.
+ simd8<bool> follows_2plus = prev1 >= uint8_t(0xC0u);
+ simd8<bool> follows_3plus = prev2 >= uint8_t(0xE0u);
+ simd8<bool> follows_4 = prev3 >= uint8_t(0xF0u);
+ // XOR, not OR: the caller also combines with ^. Lanes claimed by two lead
+ // bytes cancel out, but overlapping sequences always leave some lead byte
+ // that belongs to a single sequence, and the error is detected there.
+ return follows_2plus ^ follows_3plus ^ follows_4;
+}
+
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+ simd8<bool> wants_third = prev2 >= uint8_t(0xE0u); // 3- or 4-byte lead in prev2
+ simd8<bool> wants_fourth = prev3 >= uint8_t(0xF0u); // 4-byte lead in prev3
+ return wants_third ^ wants_fourth; // lanes where exactly one condition holds
+}
+
+// common functions for utf8 conversions
+simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) {
+ // Converts four 3-byte UTF-8 sequences (bytes 0..11 of in) to four UTF-16 code
+ // units. After the shuffle: low half 10cccccc|1110aaaa, high half 10bbbbbb|10bbbbbb
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1,
+ 4, 4, 7, 7, 10, 10);
+#else
+ const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10};
+#endif
+ uint8x16_t perm = vqtbl1q_u8(in, sh);
+ // Split into half vectors.
+ // 10cccccc|1110aaaa (third byte | lead byte of each sequence)
+ uint8x8_t perm_low = vget_low_u8(perm); // no-op
+ // 10bbbbbb|10bbbbbb (middle byte, duplicated)
+ uint8x8_t perm_high = vget_high_u8(perm);
+ // xxxxxxxx 10bbbbbb
+ uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
+ // xxxxxxxx 1110aaaa
+ uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
+ // Assemble with shift left insert: keep the low 6 bits of mid, insert high above.
+ // xxxxxxaa aabbbbbb
+ uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
+ // (perm_low << 8) | (perm_low >> 8)
+ // xxxxxxxx 10cccccc
+ uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
+ // Shift left insert into the low bits: keep cccccc, insert aaaabbbbbb above.
+ // aaaabbbb bbcccccc
+ uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
+ return composed;
+}
+
+simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) {
+ // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters.
+ // Technically this calculates 8 (one per 16-bit lane), but 6 does better and
+ // happens more often (The languages which use these codepoints use ASCII
+ // spaces so 8 would need to be in the middle of a very long word).
+
+ // 10bbbbbb 110aaaaa (lead byte sits in the low byte of each lane)
+ uint16x8_t upper = vreinterpretq_u16_u8(in);
+ // (in << 8) | (in >> 8)
+ // 110aaaaa 10bbbbbb
+ uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
+ // 00000000 000aaaaa
+ uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
+ // Assemble with shift left insert: keep bbbbbb, insert aaaaa above it.
+ // 00000aaa aabbbbbb
+ uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6);
+ return composed;
+}
+
+simdutf_really_inline uint16x8_t
+convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) {
+ // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
+ // This is a relatively easy scenario
+ // we process SIX (6) input code points. The max length in bytes of six
+ // code points spanning between 1 and 2 bytes each is 12 bytes.
+ uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]));
+ // Shuffle: the table row selected by shufutf8_idx places one character per lane
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 110aaaaa 10bbbbbb
+ uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
+ // Mask
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000000 00bbbbbb
+ uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
+ // 1 byte: 00000000 00000000
+ // 2 byte: 000aaaaa 00000000
+ uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
+ // Combine with a shift right accumulate: ascii + (highbyte >> 2)
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000aaa aabbbbbb
+ uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
+ return composed;
+}
+
+#include "arm64/arm_validate_utf16.cpp"
+#include "arm64/arm_validate_utf32le.cpp"
+
+#include "arm64/arm_convert_latin1_to_utf16.cpp"
+#include "arm64/arm_convert_latin1_to_utf32.cpp"
+#include "arm64/arm_convert_latin1_to_utf8.cpp"
+
+#include "arm64/arm_convert_utf8_to_latin1.cpp"
+#include "arm64/arm_convert_utf8_to_utf16.cpp"
+#include "arm64/arm_convert_utf8_to_utf32.cpp"
+
+#include "arm64/arm_convert_utf16_to_latin1.cpp"
+#include "arm64/arm_convert_utf16_to_utf32.cpp"
+#include "arm64/arm_convert_utf16_to_utf8.cpp"
+
+#include "arm64/arm_base64.cpp"
+#include "arm64/arm_convert_utf32_to_latin1.cpp"
+#include "arm64/arm_convert_utf32_to_utf16.cpp"
+#include "arm64/arm_convert_utf32_to_utf8.cpp"
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+// other functions
+#include "generic/utf16.h"
+#include "generic/utf8.h"
+// transcoding from UTF-8 to Latin 1
+#include "generic/utf8_to_latin1/utf8_to_latin1.h"
+#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h"
+
+// placeholder scalars
+#include "scalar/latin1.h"
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+ size_t length) const noexcept {
+ // A byte-order mark, when present, settles the question on its own.
+ auto from_bom = simdutf::BOM::check_bom(input, length);
+ if (from_bom != encoding_type::unspecified) {
+ return from_bom;
+ }
+ // Otherwise try each candidate encoding and OR together the ones that accept.
+ // todo: reimplement as a one-pass algorithm.
+ int candidates = 0;
+ if (validate_utf8(input, length)) {
+ candidates |= encoding_type::UTF8;
+ }
+ // UTF-16 is only tried on an even byte count.
+ if ((length % 2) == 0 &&
+ validate_utf16le(reinterpret_cast<const char16_t *>(input), length / 2)) {
+ candidates |= encoding_type::UTF16_LE;
+ }
+ // UTF-32 is only tried on a multiple of four bytes.
+ if ((length % 4) == 0 &&
+ validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+ candidates |= encoding_type::UTF32_LE;
+ }
+ return candidates;
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+ return arm64::utf8_validation::generic_validate_utf8(buf, len); // plain forwarder; buf and len are passed through unchanged
+}
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len); // plain forwarder; the result is returned as-is
+}
+
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+ return arm64::utf8_validation::generic_validate_ascii(buf, len); // plain forwarder; buf and len are passed through unchanged
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len); // plain forwarder; the result is returned as-is
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept {
+ // Empty input is valid; returning here also keeps a possibly null buf away
+ // from the SIMD kernel.
+ if (simdutf_unlikely(len == 0)) {
+ return true;
+ }
+ const char16_t *resume = arm_validate_utf16<endianness::LITTLE>(buf, len);
+ if (resume == nullptr) {
+ return false; // the SIMD kernel rejected the input
+ }
+ // The scalar validator checks whatever the SIMD kernel left unprocessed.
+ return scalar::utf16::validate<endianness::LITTLE>(resume, len - (resume - buf));
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept {
+ // Empty input is valid; this also keeps a possibly null buf away from the kernel.
+ if (simdutf_unlikely(len == 0)) {
+ return true;
+ }
+ const char16_t *resume = arm_validate_utf16<endianness::BIG>(buf, len);
+ if (resume == nullptr) {
+ return false; // the SIMD kernel rejected the input
+ }
+ // The scalar validator checks whatever the SIMD kernel left unprocessed.
+ return scalar::utf16::validate<endianness::BIG>(resume, len - (resume - buf));
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0); // nothing to validate
+ }
+ result simd_part = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+ if (simd_part.count == len) {
+ return simd_part; // the SIMD kernel covered the whole input
+ }
+ // Rescan from where the SIMD kernel stopped; the scalar pass supplies the error code.
+ result scalar_part = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+ buf + simd_part.count, len - simd_part.count);
+ return result(scalar_part.error, simd_part.count + scalar_part.count);
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0); // nothing to validate
+ }
+ result simd_part = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
+ if (simd_part.count == len) {
+ return simd_part; // the SIMD kernel covered the whole input
+ }
+ // Rescan from where the SIMD kernel stopped; the scalar pass supplies the error code.
+ result scalar_part = scalar::utf16::validate_with_errors<endianness::BIG>(
+ buf + simd_part.count, len - simd_part.count);
+ return result(scalar_part.error, simd_part.count + scalar_part.count);
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+ // Empty input is valid; this also keeps a possibly null buf away from the kernel.
+ if (simdutf_unlikely(len == 0)) {
+ return true;
+ }
+ const char32_t *resume = arm_validate_utf32le(buf, len);
+ if (resume == nullptr) {
+ return false; // the SIMD kernel rejected the input
+ }
+ // The scalar validator checks whatever the SIMD kernel left unprocessed.
+ return scalar::utf32::validate(resume, len - (resume - buf));
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0); // nothing to validate
+ }
+ result simd_part = arm_validate_utf32le_with_errors(buf, len);
+ if (simd_part.count == len) {
+ return simd_part; // the SIMD kernel covered the whole input
+ }
+ // Rescan from where the SIMD kernel stopped; the scalar pass supplies the error code.
+ result scalar_part = scalar::utf32::validate_with_errors(
+ buf + simd_part.count, len - simd_part.count);
+ return result(scalar_part.error, simd_part.count + scalar_part.count);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept {
+ // simd_end = {first unconverted input byte, next free output slot}.
+ std::pair<const char *, char *> simd_end =
+ arm_convert_latin1_to_utf8(buf, len, utf8_output);
+ const size_t simd_written = simd_end.second - utf8_output;
+ if (simd_end.first == buf + len) {
+ return simd_written; // no leftover input
+ }
+ const size_t scalar_written = scalar::latin1_to_utf8::convert(
+ simd_end.first, len - (simd_end.first - buf), simd_end.second);
+ return simd_written + scalar_written; // SIMD output plus scalar tail output
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // simd_end = {first unconverted input byte, next free output slot}.
+ std::pair<const char *, char16_t *> simd_end =
+ arm_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+ const size_t simd_written = simd_end.second - utf16_output;
+ if (simd_end.first == buf + len) {
+ return simd_written; // no leftover input
+ }
+ const size_t scalar_written = scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+ simd_end.first, len - (simd_end.first - buf), simd_end.second);
+ return simd_written + scalar_written; // SIMD output plus scalar tail output
+}
+
+// Converts Latin-1 to big-endian UTF-16. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of char16_t units written to utf16_output.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const std::pair<const char *, char16_t *> simd =
+      arm_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  const char *next = simd.first;
+  char16_t *out = simd.second;
+  const size_t simd_written = out - utf16_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::latin1_to_utf16::convert<endianness::BIG>(
+          next, len - (next - buf), out);
+  return simd_written + tail_written;
+}
+
+// Converts Latin-1 to UTF-32. The ARM routine runs first and the scalar
+// converter finishes any input it left unprocessed. Returns the number of
+// char32_t units written to utf32_output.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::pair<const char *, char32_t *> simd =
+      arm_convert_latin1_to_utf32(buf, len, utf32_output);
+  const char *next = simd.first;
+  char32_t *out = simd.second;
+  const size_t simd_written = out - utf32_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::latin1_to_utf32::convert(next, len - (next - buf), out);
+  return simd_written + tail_written;
+}
+
+// Converts UTF-8 to Latin-1 through a fresh
+// utf8_to_latin1::validating_transcoder and returns its convert() result.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, latin1_output);
+  return written;
+}
+
+// Converts UTF-8 to Latin-1 through a fresh
+// utf8_to_latin1::validating_transcoder and returns its
+// convert_with_errors() result.
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder transcoder;
+  const result outcome =
+      transcoder.convert_with_errors(buf, len, latin1_output);
+  return outcome;
+}
+
+// Forwards to arm64::utf8_to_latin1::convert_valid and returns its result.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written =
+      arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+  return written;
+}
+
+// Converts UTF-8 to little-endian UTF-16 through a fresh
+// utf8_to_utf16::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
+  return written;
+}
+
+// Converts UTF-8 to big-endian UTF-16 through a fresh
+// utf8_to_utf16::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::BIG>(buf, len, utf16_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+
+// Forwards to utf8_to_utf16::convert_valid<endianness::LITTLE>.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+                                                       utf16_output);
+  return written;
+}
+
+// Forwards to utf8_to_utf16::convert_valid<endianness::BIG>.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+  return written;
+}
+
+// Converts UTF-8 to UTF-32 through a fresh
+// utf8_to_utf32::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, utf32_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ utf8_to_utf32::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, utf32_output);
+}
+
+// Forwards to utf8_to_utf32::convert_valid and returns its result.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  const size_t written =
+      utf8_to_utf32::convert_valid(input, size, utf32_output);
+  return written;
+}
+
+// Converts little-endian UTF-16 to Latin-1. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of bytes written, or 0 when either stage signals failure (a null input
+// pointer from the ARM routine, or a zero count from the scalar pass).
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const std::pair<const char16_t *, char *> simd =
+      arm_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+  const char16_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - latin1_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+          next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to Latin-1. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of bytes written, or 0 when either stage signals failure (a null input
+// pointer from the ARM routine, or a zero count from the scalar pass).
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const std::pair<const char16_t *, char *> simd =
+      arm_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  const char16_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - latin1_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::BIG>(
+          next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ arm_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ arm_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+ latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Currently reuses the validating convert_utf16be_to_latin1; a dedicated
+// routine for known-valid input remains an optimization opportunity.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16be_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// Currently reuses the validating convert_utf16le_to_latin1; a dedicated
+// routine for known-valid input remains an optimization opportunity.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16le_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// Converts little-endian UTF-16 to UTF-8. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of bytes written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const std::pair<const char16_t *, char *> simd =
+      arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  const char16_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf8_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+          next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to UTF-8. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of bytes written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const std::pair<const char16_t *, char *> simd =
+      arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  const char16_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf8_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(
+      next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
+ utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
+ utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Reuses the validating convert_utf16le_to_utf8 for known-valid input.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf16le_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Reuses the validating convert_utf16be_to_utf8 for known-valid input.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf16be_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Converts UTF-32 to UTF-8. Empty input yields 0 immediately. Otherwise the
+// ARM routine runs first and the scalar converter finishes any input it left
+// unprocessed. Returns the number of bytes written, or 0 when either stage
+// signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
+  const std::pair<const char32_t *, char *> simd =
+      arm_convert_utf32_to_utf8(buf, len, utf8_output);
+  const char32_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf8_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf8::convert(next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Converts little-endian UTF-16 to UTF-32. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of char32_t units written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::pair<const char16_t *, char32_t *> simd =
+      arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  const char16_t *next = simd.first;
+  char32_t *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf32_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to UTF-32. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of char32_t units written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::pair<const char16_t *, char32_t *> simd =
+      arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  const char16_t *next = simd.first;
+  char32_t *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf32_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
+      next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
+ utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
+ utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Converts UTF-32 to Latin-1. The ARM routine runs first and the scalar
+// converter finishes any input it left unprocessed. Returns the number of
+// bytes written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  const std::pair<const char32_t *, char *> simd =
+      arm_convert_utf32_to_latin1(buf, len, latin1_output);
+  const char32_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - latin1_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_latin1::convert(next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Converts UTF-32 to Latin-1 for input the caller declares valid. The ARM
+// routine runs first; scalar::utf32_to_latin1::convert_valid finishes any
+// remainder, and its result is added without a failure check. Returns 0 only
+// when the ARM routine hands back a null input pointer.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  const std::pair<const char32_t *, char *> simd =
+      arm_convert_utf32_to_latin1(buf, len, latin1_output);
+  const char32_t *next = simd.first;
+  char *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - latin1_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_latin1::convert_valid(next, len - (next - buf), out);
+  return simd_written + tail_written;
+}
+
+// Currently reuses the validating convert_utf32_to_utf8; a dedicated routine
+// for known-valid input remains an optimization opportunity.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf32_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Converts UTF-32 to little-endian UTF-16. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of char16_t units written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const std::pair<const char32_t *, char16_t *> simd =
+      arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  const char32_t *next = simd.first;
+  char16_t *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf16_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+          next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// Converts UTF-32 to big-endian UTF-16. The ARM routine runs first and the
+// scalar converter finishes any input it left unprocessed. Returns the number
+// of char16_t units written, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const std::pair<const char32_t *, char16_t *> simd =
+      arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  const char32_t *next = simd.first;
+  char16_t *out = simd.second;
+  if (next == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = out - utf16_output;
+  if (next == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(
+      next, len - (next - buf), out);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
+ utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+ utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Reuses the validating convert_utf32_to_utf16le for known-valid input.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t written = convert_utf32_to_utf16le(buf, len, utf16_output);
+  return written;
+}
+
+// Reuses the validating convert_utf32_to_utf16be for known-valid input.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t written = convert_utf32_to_utf16be(buf, len, utf16_output);
+  return written;
+}
+
+// Reuses the validating convert_utf16le_to_utf32 for known-valid input.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const size_t written = convert_utf16le_to_utf32(buf, len, utf32_output);
+  return written;
+}
+
+// Reuses the validating convert_utf16be_to_utf32 for known-valid input.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const size_t written = convert_utf16be_to_utf32(buf, len, utf32_output);
+  return written;
+}
+
+// Forwards to utf16::change_endianness_utf16, passing `input`, `length` and
+// `output` through unchanged.
+void implementation::change_endianness_utf16(const char16_t *input,
+ size_t length,
+ char16_t *output) const noexcept {
+ utf16::change_endianness_utf16(input, length, output);
+}
+
+// Returns utf16::count_code_points<endianness::LITTLE>(input, length).
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points =
+      utf16::count_code_points<endianness::LITTLE>(input, length);
+  return code_points;
+}
+
+// Returns utf16::count_code_points<endianness::BIG>(input, length).
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points =
+      utf16::count_code_points<endianness::BIG>(input, length);
+  return code_points;
+}
+
+// Returns utf8::count_code_points(input, length).
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  const size_t code_points = utf8::count_code_points(input, length);
+  return code_points;
+}
+
+// The Latin-1 length is taken to be the value count_utf8 returns for the
+// same buffer.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  const size_t latin1_len = count_utf8(buf, len);
+  return latin1_len;
+}
+
+// Forwards to scalar::utf16::latin1_length_from_utf16.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  const size_t latin1_len = scalar::utf16::latin1_length_from_utf16(length);
+  return latin1_len;
+}
+
+// Forwards to scalar::utf32::latin1_length_from_utf32.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  const size_t latin1_len = scalar::utf32::latin1_length_from_utf32(length);
+  return latin1_len;
+}
+
+// Computes the UTF-8 length of a Latin-1 buffer: one output byte per input
+// byte, plus one extra for every input byte >= 0x80. Full 16-byte blocks are
+// handled with NEON; the remaining `length % 16` bytes go to
+// scalar::latin1::utf8_length_from_latin1.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+ const char *input, size_t length) const noexcept {
+ // See
+ // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/
+ // credit to Pete Cawley
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+ // Accumulates the number of bytes >= 0x80 seen in the vectorized part.
+ uint64_t result = 0;
+ const int lanes = sizeof(uint8x16_t);
+ // Number of trailing bytes that do not fill a whole vector (always < lanes).
+ uint8_t rem = length % lanes;
+ const uint8_t *simd_end = data + (length / lanes) * lanes;
+ const uint8x16_t threshold = vdupq_n_u8(0x80);
+ for (; data < simd_end; data += lanes) {
+ // load 16 bytes
+ uint8x16_t input_vec = vld1q_u8(data);
+ // compare to threshold (0x80)
+ uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold);
+ // vertical addition: matching lanes are all-ones, i.e. -1 when viewed as
+ // signed bytes, so the lane sum is minus the match count and subtracting
+ // it adds that count to `result`.
+ result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit));
+ }
+ // extra bytes from the SIMD part + one byte per SIMD-processed input byte +
+ // the scalar length of the tail.
+ return result + (length / lanes) * lanes +
+ scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem);
+}
+
+// Returns utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length).
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf8_len =
+      utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+  return utf8_len;
+}
+
+// Returns utf16::utf8_length_from_utf16<endianness::BIG>(input, length).
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf8_len =
+      utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+  return utf8_len;
+}
+
+// Forwards to scalar::latin1::utf16_length_from_latin1.
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  const size_t utf16_len = scalar::latin1::utf16_length_from_latin1(length);
+  return utf16_len;
+}
+
+// Forwards to scalar::latin1::utf32_length_from_latin1.
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  const size_t utf32_len = scalar::latin1::utf32_length_from_latin1(length);
+  return utf32_len;
+}
+
+// Returns utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length).
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf32_len =
+      utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+  return utf32_len;
+}
+
+// Returns utf16::utf32_length_from_utf16<endianness::BIG>(input, length).
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf32_len =
+      utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+  return utf32_len;
+}
+
+// Returns utf8::utf16_length_from_utf8(input, length).
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t utf16_len = utf8::utf16_length_from_utf8(input, length);
+  return utf16_len;
+}
+
+// Computes the number of UTF-8 bytes needed for a UTF-32 buffer. Four code
+// points are classified per iteration with NEON; the leftover (fewer than
+// four) code points are handled by scalar::utf32::utf8_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ // Upper bounds of the 1-, 2- and 3-byte UTF-8 ranges.
+ const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
+ const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
+ const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+ const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 4 <= length; pos += 4) {
+ uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
+ // Per-lane masks: <= 0x7f, <= 0x7ff, then the exclusive 2-byte and 3-byte
+ // ranges obtained by XOR-ing neighbouring "<=" masks.
+ const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
+ const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
+ const uint32x4_t two_bytes_bytemask =
+ veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
+ const uint32x4_t three_bytes_bytemask =
+ veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+
+ // Reduce each mask lane to 0 or 1 and view the vector as eight 16-bit
+ // lanes (the flag sits in every other 16-bit lane).
+ const uint16x8_t reduced_ascii_bytes_bytemask =
+ vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
+ const uint16x8_t reduced_two_bytes_bytemask =
+ vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
+ const uint16x8_t reduced_three_bytes_bytemask =
+ vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+
+ // Pairwise add packs the four flags of the first operand into the low
+ // 64 bits and those of the second operand into the high 64 bits.
+ const uint16x8_t compressed_bytemask0 =
+ vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
+ const uint16x8_t compressed_bytemask1 =
+ vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+
+ // count_ones is defined elsewhere; presumably a population count, which
+ // here yields how many of the four lanes fell in each class.
+ size_t ascii_count = count_ones(
+ vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
+ size_t two_bytes_count = count_ones(
+ vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
+ size_t three_bytes_count = count_ones(
+ vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+
+ // Start from 4 bytes per code point (16 for the block) and subtract the
+ // savings: 3 per 1-byte, 2 per 2-byte, 1 per 3-byte code point.
+ count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+ }
+ return count +
+ scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+// Computes the number of UTF-16 code units needed for a UTF-32 buffer. Four
+// code points are processed per iteration with NEON; the leftover (fewer than
+// four) code points are handled by scalar::utf32::utf16_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+ const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 4 <= length; pos += 4) {
+ uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
+ // Lanes above 0xffff need two UTF-16 code units (a surrogate pair).
+ const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
+ // Reduce each mask lane to 0 or 1, then pairwise-add so the four flags
+ // land in the low 64 bits.
+ const uint16x8_t reduced_bytemask =
+ vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
+ const uint16x8_t compressed_bytemask =
+ vpaddq_u16(reduced_bytemask, reduced_bytemask);
+ // count_ones is defined elsewhere; presumably a population count.
+ size_t surrogate_count = count_ones(
+ vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
+ // One unit per code point plus one extra per surrogate pair.
+ count += 4 + surrogate_count;
+ }
+ return count +
+ scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+// The UTF-32 length is the value utf8::count_code_points returns for the
+// same buffer.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t code_points = utf8::count_code_points(input, length);
+  return code_points;
+}
+
+// Forwards to scalar::base64::maximal_binary_length_from_base64 (8-bit input).
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept {
+  const size_t max_len =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return max_len;
+}
+
+// Decodes 8-bit base64 input into `output`. The base64_url bit of `options`
+// selects which compress_decode_base64 instantiation is called.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes 8-bit base64 input into `output` and returns a full_result. The
+// base64_url bit of `options` selects which compress_decode_base64
+// instantiation is called.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Forwards to scalar::base64::maximal_binary_length_from_base64 (16-bit
+// input).
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t max_len =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return max_len;
+}
+
+// Decodes 16-bit base64 input into `output`. The base64_url bit of `options`
+// selects which compress_decode_base64 instantiation is called.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes 16-bit base64 input into `output` and returns a full_result. The
+// base64_url bit of `options` selects which compress_decode_base64
+// instantiation is called.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Forwards to scalar::base64::base64_length_from_binary.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  const size_t encoded_len =
+      scalar::base64::base64_length_from_binary(length, options);
+  return encoded_len;
+}
+
+// Encodes `length` bytes of `input` as base64 into `output` by calling
+// encode_base64, and returns that call's result.
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  const size_t written = encode_base64(output, input, length, options);
+  return written;
+}
+
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/arm64/end.h"
diff --git a/contrib/simdutf/src/encoding_types.cpp b/contrib/simdutf/src/encoding_types.cpp
new file mode 100644
index 000000000..3029cae32
--- /dev/null
+++ b/contrib/simdutf/src/encoding_types.cpp
@@ -0,0 +1,75 @@
+
+namespace simdutf {
+// Reports whether `e` equals the endianness chosen at compile time through
+// SIMDUTF_IS_BIG_ENDIAN (BIG when the macro is non-zero, LITTLE otherwise).
+bool match_system(endianness e) {
+#if SIMDUTF_IS_BIG_ENDIAN
+  const endianness native = endianness::BIG;
+#else
+  const endianness native = endianness::LITTLE;
+#endif
+  return e == native;
+}
+
+// Maps an encoding_type value to a human-readable label. Values that are not
+// one of the listed enumerators yield "error".
+std::string to_string(encoding_type bom) {
+  const char *label = "error";
+  switch (bom) {
+  case UTF16_LE:
+    label = "UTF16 little-endian";
+    break;
+  case UTF16_BE:
+    label = "UTF16 big-endian";
+    break;
+  case UTF32_LE:
+    label = "UTF32 little-endian";
+    break;
+  case UTF32_BE:
+    label = "UTF32 big-endian";
+    break;
+  case UTF8:
+    label = "UTF8";
+    break;
+  case unspecified:
+    label = "unknown";
+    break;
+  default:
+    break;
+  }
+  return label;
+}
+
+namespace BOM {
+// Note that BOM for UTF8 is discouraged.
+// Looks for a byte-order mark at the start of `byte` (at most the first four
+// bytes are read, and never more than `length`).
+//
+// Marks are tested in this order:
+//   FF FE 00 00 -> UTF32_LE (checked inside the FF FE branch, so it wins over
+//                            UTF16_LE when four bytes are available)
+//   FF FE       -> UTF16_LE
+//   FE FF       -> UTF16_BE
+//   00 00 FE FF -> UTF32_BE
+//   EF BB BF    -> UTF8
+// Returns encoding_type::unspecified when no mark matches or the buffer is
+// too short for any of them.
+encoding_type check_bom(const uint8_t *byte, size_t length) {
+  if (length >= 2 && byte[0] == 0xff && byte[1] == 0xfe) {
+    if (length >= 4 && byte[2] == 0x00 && byte[3] == 0x00) {
+      return encoding_type::UTF32_LE;
+    }
+    return encoding_type::UTF16_LE;
+  }
+  if (length >= 2 && byte[0] == 0xfe && byte[1] == 0xff) {
+    return encoding_type::UTF16_BE;
+  }
+  if (length >= 4 && byte[0] == 0x00 && byte[1] == 0x00 && byte[2] == 0xfe &&
+      byte[3] == 0xff) {
+    return encoding_type::UTF32_BE;
+  }
+  // The UTF-8 mark is three bytes long, so three bytes are enough; requiring
+  // four would miss a buffer that consists of the mark alone.
+  if (length >= 3 && byte[0] == 0xef && byte[1] == 0xbb && byte[2] == 0xbf) {
+    return encoding_type::UTF8;
+  }
+  return encoding_type::unspecified;
+}
+
+// char overload: reinterprets the buffer as uint8_t and forwards to the
+// uint8_t overload of check_bom.
+encoding_type check_bom(const char *byte, size_t length) {
+ return check_bom(reinterpret_cast<const uint8_t *>(byte), length);
+}
+
+// Returns the byte length associated with `bom`: 2 for either UTF-16
+// flavour, 4 for either UTF-32 flavour, 3 for UTF8, and 0 for `unspecified`
+// or any other value.
+size_t bom_byte_size(encoding_type bom) {
+  switch (bom) {
+  case UTF16_LE:
+  case UTF16_BE:
+    return 2;
+  case UTF32_LE:
+  case UTF32_BE:
+    return 4;
+  case UTF8:
+    return 3;
+  case unspecified:
+  default:
+    return 0;
+  }
+}
+
+} // namespace BOM
+} // namespace simdutf
diff --git a/contrib/simdutf/src/error.cpp b/contrib/simdutf/src/error.cpp
new file mode 100644
index 000000000..64c709968
--- /dev/null
+++ b/contrib/simdutf/src/error.cpp
@@ -0,0 +1,3 @@
+namespace simdutf {
+// deliberately empty
+}
diff --git a/contrib/simdutf/src/fallback/implementation.cpp b/contrib/simdutf/src/fallback/implementation.cpp
new file mode 100644
index 000000000..fc9a53677
--- /dev/null
+++ b/contrib/simdutf/src/fallback/implementation.cpp
@@ -0,0 +1,691 @@
+#include "simdutf/fallback/begin.h"
+
+#include "scalar/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+
+#include "scalar/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "scalar/utf8_to_utf32/utf8_to_utf32.h"
+
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+#include "scalar/utf16_to_utf8/valid_utf16_to_utf8.h"
+#include "scalar/utf16_to_utf8/utf16_to_utf8.h"
+
+#include "scalar/utf16_to_utf32/valid_utf16_to_utf32.h"
+#include "scalar/utf16_to_utf32/utf16_to_utf32.h"
+
+#include "scalar/utf32_to_utf8/valid_utf32_to_utf8.h"
+#include "scalar/utf32_to_utf8/utf32_to_utf8.h"
+
+#include "scalar/utf32_to_utf16/valid_utf32_to_utf16.h"
+#include "scalar/utf32_to_utf16/utf32_to_utf16.h"
+
+#include "scalar/ascii.h"
+#include "scalar/base64.h"
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+#include <cstdint>
+#include <cstring>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+// Fallback backend: returns the encodings `input` could be, as a bitwise OR
+// of encoding_type values.
+//
+// If simdutf::BOM::check_bom recognises a byte-order mark, that single value
+// is returned immediately. Otherwise the buffer is validated as UTF-8, as
+// UTF-16LE (only when `length` is even) and as UTF-32 (only when `length` is
+// a multiple of four); each successful validation sets the matching bit.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // A byte-order mark, when present, is trusted.
+  const auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom;
+  }
+  // todo: reimplement as a one-pass algorithm.
+  int candidates = 0;
+  if (validate_utf8(input, length)) {
+    candidates |= encoding_type::UTF8;
+  }
+  const bool whole_utf16_units = (length % 2) == 0;
+  if (whole_utf16_units &&
+      validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                       length / 2)) {
+    candidates |= encoding_type::UTF16_LE;
+  }
+  const bool whole_utf32_units = (length % 4) == 0;
+  if (whole_utf32_units &&
+      validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+    candidates |= encoding_type::UTF32_LE;
+  }
+  return candidates;
+}
+
+// Fallback backend: forwards to scalar::utf8::validate and returns its result.
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+ return scalar::utf8::validate(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf8::validate_with_errors and
+// returns its `result` unchanged.
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return scalar::utf8::validate_with_errors(buf, len);
+}
+
+// Fallback backend: forwards to scalar::ascii::validate and returns its result.
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+ return scalar::ascii::validate(buf, len);
+}
+
+// Fallback backend: forwards to scalar::ascii::validate_with_errors and
+// returns its `result` unchanged.
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return scalar::ascii::validate_with_errors(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf16::validate instantiated for
+// endianness::LITTLE.
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept {
+ return scalar::utf16::validate<endianness::LITTLE>(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf16::validate instantiated for
+// endianness::BIG.
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept {
+ return scalar::utf16::validate<endianness::BIG>(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf16::validate_with_errors
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf16::validate_with_errors
+// instantiated for endianness::BIG.
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf32::validate and returns its result.
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+ return scalar::utf32::validate(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf32::validate_with_errors and
+// returns its `result` unchanged.
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ return scalar::utf32::validate_with_errors(buf, len);
+}
+
+// Fallback backend: forwards to scalar::latin1_to_utf8::convert and returns
+// its size_t result.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::latin1_to_utf16::convert instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::latin1_to_utf16::convert instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::latin1_to_utf32::convert and returns
+// its size_t result.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_latin1::convert and returns
+// its size_t result.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_latin1::convert_with_errors
+// and returns its `result` unchanged.
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_latin1::convert_valid and
+// returns its size_t result.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf16::convert instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf16::convert instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf16::convert_with_errors
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf16::convert_with_errors
+// instantiated for endianness::BIG.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
+ buf, len, utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf16::convert_valid
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf16::convert_valid
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf32::convert and returns
+// its size_t result.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf32::convert_with_errors
+// and returns its `result` unchanged.
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf8_to_utf32::convert_valid and
+// returns its size_t result.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+ const char *input, size_t size, char32_t *utf32_output) const noexcept {
+ return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_latin1::convert instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len,
+ latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_latin1::convert instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len,
+ latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_latin1::convert_with_errors
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_latin1::convert_with_errors
+// instantiated for endianness::BIG.
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+ buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_latin1::convert_valid
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(
+ buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_latin1::convert_valid
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len,
+ latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf8::convert instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
+ utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf8::convert instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf8::convert_with_errors
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf8::convert_with_errors
+// instantiated for endianness::BIG.
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf8::convert_valid
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
+ utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf8::convert_valid
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
+ utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_latin1::convert and returns
+// its size_t result.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_latin1::convert_with_errors
+// and returns its `result` unchanged.
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_latin1::convert_valid and
+// returns its size_t result.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf8::convert and returns
+// its size_t result.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf8::convert_with_errors
+// and returns its `result` unchanged.
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf8::convert_valid and
+// returns its size_t result.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf16::convert instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf16::convert instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf16::convert_with_errors
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf16::convert_with_errors
+// instantiated for endianness::BIG.
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf, len, utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf16::convert_valid
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
+ buf, len, utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf32_to_utf16::convert_valid
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len,
+ utf16_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf32::convert instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len,
+ utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf32::convert instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len,
+ utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf32::convert_with_errors
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf32::convert_with_errors
+// instantiated for endianness::BIG.
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf, len, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf32::convert_valid
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
+ buf, len, utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16_to_utf32::convert_valid
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len,
+ utf32_output);
+}
+
+// Fallback backend: forwards to scalar::utf16::change_endianness_utf16 with
+// the same (input, length, output) arguments; nothing is returned.
+void implementation::change_endianness_utf16(const char16_t *input,
+ size_t length,
+ char16_t *output) const noexcept {
+ scalar::utf16::change_endianness_utf16(input, length, output);
+}
+
+// Fallback backend: forwards to scalar::utf16::count_code_points instantiated
+// for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::count_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+// Fallback backend: forwards to scalar::utf16::count_code_points instantiated
+// for endianness::BIG.
+simdutf_warn_unused size_t implementation::count_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+}
+
+// Fallback backend: forwards to scalar::utf8::count_code_points and returns
+// its result.
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+ return scalar::utf8::count_code_points(input, length);
+}
+
+// Fallback backend: the Latin-1 length is taken to be the value returned by
+// scalar::utf8::count_code_points for the same buffer.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+ const char *buf, size_t len) const noexcept {
+ return scalar::utf8::count_code_points(buf, len);
+}
+
+// Fallback backend: forwards to scalar::utf16::latin1_length_from_utf16; only
+// the length is consulted, not the data.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+ return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+// Fallback backend: returns `length` unchanged.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+ return length;
+}
+
+// Fallback backend: returns `length` plus the number of input bytes whose top
+// bit (0x80) is set.
+//
+// The buffer is walked in 32-byte strides (four 8-byte words each), then in
+// single 8-byte words, then byte by byte. Words are loaded with memcpy, so no
+// alignment is required of `input`.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  // Number of bytes in an 8-byte word that have bit 7 set: move each byte's
+  // top bit down to bit 0, keep only those bits, then let the multiplication
+  // sum the eight per-byte bits into the most significant byte.
+  auto high_bit_bytes = [](uint64_t word) {
+    const uint64_t one_per_byte = UINT64_C(0x0101010101010101);
+    const uint64_t top_bits = (word >> 7) & one_per_byte;
+    return (size_t)((top_bits * one_per_byte) >> 56);
+  };
+  auto load_word = [input](size_t offset) {
+    uint64_t word;
+    memcpy(&word, input + offset, sizeof(word));
+    return word;
+  };
+
+  size_t total = length;
+  size_t pos = 0;
+  for (; pos + 32 <= length; pos += 32) {
+    for (size_t lane = 0; lane < 32; lane += 8) {
+      total += high_bit_bytes(load_word(pos + lane));
+    }
+  }
+  for (; pos + 8 <= length; pos += 8) {
+    total += high_bit_bytes(load_word(pos));
+  }
+  for (; pos + 1 <= length; pos += 1) {
+    total += static_cast<uint8_t>(input[pos]) >> 7;
+  }
+  return total;
+}
+
+// Fallback backend: forwards to scalar::utf16::utf8_length_from_utf16
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input,
+ length);
+}
+
+// Fallback backend: forwards to scalar::utf16::utf8_length_from_utf16
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+// Fallback backend: forwards to scalar::utf16::utf32_length_from_utf16
+// instantiated for endianness::LITTLE.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input,
+ length);
+}
+
+// Fallback backend: forwards to scalar::utf16::utf32_length_from_utf16
+// instantiated for endianness::BIG.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+// Fallback backend: forwards to scalar::latin1::utf16_length_from_latin1;
+// only the length is consulted, not the data.
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+ return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+// Fallback backend: forwards to scalar::utf8::utf16_length_from_utf8 and
+// returns its result.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return scalar::utf8::utf16_length_from_utf8(input, length);
+}
+
+// Fallback backend: forwards to scalar::utf32::utf8_length_from_utf32 and
+// returns its result.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ return scalar::utf32::utf8_length_from_utf32(input, length);
+}
+
+// Fallback backend: forwards to scalar::utf32::utf16_length_from_utf32 and
+// returns its result.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ return scalar::utf32::utf16_length_from_utf32(input, length);
+}
+
+// Fallback backend: forwards to scalar::latin1::utf32_length_from_latin1;
+// only the length is consulted, not the data.
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+ return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+// Fallback backend: the UTF-32 length is the value returned by
+// scalar::utf8::count_code_points for the same buffer.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return scalar::utf8::count_code_points(input, length);
+}
+
+// Fallback backend, char input: forwards to
+// scalar::base64::maximal_binary_length_from_base64 and returns its result.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// Fallback backend, char input: strips trailing white space and padding from
+// `input`, hands the rest to scalar::base64::base64_tail_decode, then checks
+// that the padding found is consistent with the decoded byte count.
+//
+// Returns {INVALID_BASE64_CHARACTER, position of the leftmost '='} when the
+// input holds nothing but padding/white space or when the padding check
+// fails; {SUCCESS, 0} when nothing is left and no '=' was seen; otherwise
+// whatever base64_tail_decode returned.
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ // Drop trailing characters accepted by is_ascii_white_space.
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0;
+ // Accept at most two trailing '=' signs; white space between the two is
+ // skipped. `length` ends up excluding the padding.
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ // Nothing left to decode: padding on its own is an error, empty is fine.
+ if (length == 0) {
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ return {SUCCESS, 0};
+ }
+ result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options);
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ // Padding is only accepted when r.count % 3 is 2 with one '=' or 1 with
+ // two '='; a remainder of 0 never goes with padding.
+ if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ }
+ return r;
+}
+
+// Fallback backend, char input, full_result flavour: strips trailing white
+// space and padding from `input`, hands the rest to
+// scalar::base64::base64_tail_decode, then checks that the padding found is
+// consistent with the decoded byte count (r.output_count).
+//
+// Returns {INVALID_BASE64_CHARACTER, position of the leftmost '=', ...} when
+// the input holds nothing but padding/white space (third field 0) or when the
+// padding check fails (third field r.output_count); {SUCCESS, 0, 0} when
+// nothing is left and no '=' was seen; otherwise whatever base64_tail_decode
+// returned.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ // Drop trailing characters accepted by is_ascii_white_space.
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0;
+ // Accept at most two trailing '=' signs; white space between the two is
+ // skipped. `length` ends up excluding the padding.
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ // Nothing left to decode: padding on its own is an error, empty is fine.
+ if (length == 0) {
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, 0, 0};
+ }
+ full_result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options);
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ // Padding is only accepted when r.output_count % 3 is 2 with one '=' or 1
+ // with two '='; a remainder of 0 never goes with padding.
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+ }
+ }
+ return r;
+}
+
+// Fallback backend, char16_t input: forwards to
+// scalar::base64::maximal_binary_length_from_base64 and returns its result.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// Fallback backend, char16_t input: strips trailing white space and padding
+// from `input`, hands the rest to scalar::base64::base64_tail_decode, then
+// checks that the padding found is consistent with the decoded byte count.
+//
+// Returns {INVALID_BASE64_CHARACTER, position of the leftmost '='} when the
+// input holds nothing but padding/white space or when the padding check
+// fails; {SUCCESS, 0} when nothing is left and no '=' was seen; otherwise
+// whatever base64_tail_decode returned.
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ // Drop trailing code units accepted by is_ascii_white_space.
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0;
+ // Accept at most two trailing '=' signs; white space between the two is
+ // skipped. `length` ends up excluding the padding.
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ // Nothing left to decode: padding on its own is an error, empty is fine.
+ if (length == 0) {
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ return {SUCCESS, 0};
+ }
+ result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options);
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ // Padding is only accepted when r.count % 3 is 2 with one '=' or 1 with
+ // two '='; a remainder of 0 never goes with padding.
+ if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ }
+ return r;
+}
+
+// Fallback backend, char16_t input, full_result flavour: strips trailing
+// white space and padding from `input`, hands the rest to
+// scalar::base64::base64_tail_decode, then checks that the padding found is
+// consistent with the decoded byte count (r.output_count).
+//
+// Returns {INVALID_BASE64_CHARACTER, position of the leftmost '=', ...} when
+// the input holds nothing but padding/white space (third field 0) or when the
+// padding check fails (third field r.output_count); {SUCCESS, 0, 0} when
+// nothing is left and no '=' was seen; otherwise whatever base64_tail_decode
+// returned.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ // Drop trailing code units accepted by is_ascii_white_space.
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0;
+ // Accept at most two trailing '=' signs; white space between the two is
+ // skipped. `length` ends up excluding the padding.
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ // Nothing left to decode: padding on its own is an error, empty is fine.
+ if (length == 0) {
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, 0, 0};
+ }
+ full_result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options);
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ // Padding is only accepted when r.output_count % 3 is 2 with one '=' or 1
+ // with two '='; a remainder of 0 never goes with padding.
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+ }
+ }
+ return r;
+}
+
+// Fallback backend: forwards to scalar::base64::base64_length_from_binary and
+// returns its result.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options);
+}
+
+// Fallback backend: forwards to scalar::base64::tail_encode_base64 (note the
+// argument order: output first, then input) and returns its result.
+size_t implementation::binary_to_base64(const char *input, size_t length,
+ char *output,
+ base64_options options) const noexcept {
+ return scalar::base64::tail_encode_base64(output, input, length, options);
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/fallback/end.h"
diff --git a/contrib/simdutf/src/generic/buf_block_reader.h b/contrib/simdutf/src/generic/buf_block_reader.h
new file mode 100644
index 000000000..4c3afcc34
--- /dev/null
+++ b/contrib/simdutf/src/generic/buf_block_reader.h
@@ -0,0 +1,109 @@
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+// Walks through a buffer in STEP_SIZE-byte increments. The trailing part is
+// obtained through get_remainder(), which pads it with spaces.
+template <size_t STEP_SIZE> struct buf_block_reader {
+public:
+ // Starts at index 0 of the `_len`-byte buffer `_buf`.
+ simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+ // Current byte offset into the buffer.
+ simdutf_really_inline size_t block_index();
+ // True while the current offset is strictly below len - STEP_SIZE (0 when
+ // len < STEP_SIZE).
+ simdutf_really_inline bool has_full_block() const;
+ // Pointer to the block at the current offset.
+ simdutf_really_inline const uint8_t *full_block() const;
+ /**
+ * Get the last block, padded with spaces.
+ *
+ * When bytes remain, dst is first filled with STEP_SIZE spaces (0x20) and the
+ * remaining bytes are then copied to its start. When no bytes remain (the
+ * current offset equals len, which includes len == 0) dst is left untouched
+ * and 0 is returned.
+ *
+ * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1
+ * remainder block with STEP_SIZE bytes and no spaces for padding.
+ *
+ * @return the number of effective characters in the last block.
+ */
+ simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+ // Moves the current offset forward by STEP_SIZE bytes.
+ simdutf_really_inline void advance();
+
+private:
+ const uint8_t *buf;        // start of the buffer being walked
+ const size_t len;          // total number of bytes in buf
+ const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+ size_t idx;                // current byte offset
+};
+
+// Debugging aid for bitmask operations: renders the
+// sizeof(simd8x64<uint8_t>) bytes at `text` as a NUL-terminated string. Every
+// byte that compares below ' ' when viewed as int8_t (control characters and
+// all bytes >= 0x80) is shown as '_'.
+//
+// Returns a pointer to a function-local static buffer that is overwritten by
+// the next call; the caller must not free it. The buffer is a static array
+// rather than a malloc'ed block, so there is no allocation that can fail.
+simdutf_unused static char *format_input_text_64(const uint8_t *text) {
+  static char buf[sizeof(simd8x64<uint8_t>) + 1];
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
+}
+
+// Debugging aid for bitmask operations: stores `in` into a buffer and renders
+// it as a NUL-terminated string. Every byte that compares below ' ' when
+// viewed as int8_t (control characters and all bytes >= 0x80) is shown as
+// '_'; the explicit int8_t view keeps the outcome independent of whether
+// plain char is signed on the target.
+//
+// Returns a pointer to a function-local static buffer that is overwritten by
+// the next call; the caller must not free it. The buffer is a static array
+// rather than a malloc'ed block, so there is no allocation that can fail.
+simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
+  static char buf[sizeof(simd8x64<uint8_t>) + 1];
+  in.store(reinterpret_cast<uint8_t *>(buf));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    if (int8_t(buf[i]) < ' ') {
+      buf[i] = '_';
+    }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
+}
+
+// Debugging aid: renders the 64 bits of `mask` as a NUL-terminated string of
+// 64 characters, bit 0 first, using 'X' for a set bit and ' ' for a clear
+// one.
+//
+// Returns a pointer to a function-local static buffer that is overwritten by
+// the next call; the caller must not free it.
+simdutf_unused static char *format_mask(uint64_t mask) {
+  static char buf[64 + 1];
+  for (size_t i = 0; i < 64; i++) {
+    // The probe bit must be 64 bits wide: shifting a 32-bit size_t by 32 or
+    // more would be undefined.
+    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
+  }
+  buf[64] = '\0';
+  return buf;
+}
+
+// Records the buffer and its length, starts at offset 0, and precomputes
+// lenminusstep as len - STEP_SIZE, clamped to 0 when len < STEP_SIZE so the
+// unsigned subtraction cannot wrap.
+template <size_t STEP_SIZE>
+simdutf_really_inline
+buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
+ : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
+ idx{0} {}
+
+// Returns the current byte offset (idx).
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
+ return idx;
+}
+
+// True while idx is strictly below lenminusstep. The comparison is strict, so
+// the final STEP_SIZE bytes are never reported as a full block.
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+ return idx < lenminusstep;
+}
+
+// Returns the address of buf[idx]; no bounds check is performed here.
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *
+buf_block_reader<STEP_SIZE>::full_block() const {
+ return &buf[idx];
+}
+
+// Copies the bytes from the current offset to the end of the buffer into
+// `dst`, after filling the first STEP_SIZE bytes of `dst` with spaces (0x20).
+// Returns the number of bytes copied. When nothing remains, `dst` is left
+// untouched and 0 is returned.
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t
+buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  const size_t remaining = len - idx;
+  if (remaining == 0) {
+    // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    return 0;
+  }
+  // Fill all STEP_SIZE bytes because it is more efficient to write out 8 or
+  // 16 bytes at once.
+  std::memset(dst, 0x20, STEP_SIZE);
+  std::memcpy(dst, buf + idx, remaining);
+  return remaining;
+}
+
+// Moves the current offset forward by STEP_SIZE bytes.
+template <size_t STEP_SIZE>
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+ idx += STEP_SIZE;
+}
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf16.h b/contrib/simdutf/src/generic/utf16.h
new file mode 100644
index 000000000..b845de117
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf16.h
@@ -0,0 +1,74 @@
+#include "scalar/utf16.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf16 {
+
+// Counts the code points encoded by the `size` UTF-16 code units at `in`.
+// Whole blocks of 32 code units go through the SIMD path; whatever is left
+// is handed to the scalar implementation.
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t *in,
+                                               size_t size) {
+  // Number of code units covered by complete 32-unit blocks.
+  const size_t vectorized = size / 32 * 32;
+  size_t total = 0;
+  size_t offset = 0;
+  while (offset < vectorized) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    if (!match_system(big_endian)) {
+      block.swap_bytes();
+    }
+    // Every code unit outside 0xDC00..0xDFFF begins a code point.
+    const uint64_t starts = block.not_in_range(0xDC00, 0xDFFF);
+    // NOTE(review): the halving suggests the mask carries two bits per 16-bit
+    // unit -- confirm against simd16x32.
+    total += count_ones(starts) / 2;
+    offset += 32;
+  }
+  return total + scalar::utf16::count_code_points<big_endian>(in + offset,
+                                                              size - offset);
+}
+
+// Computes how many UTF-8 bytes are needed to encode the `size` UTF-16 code
+// units at `in`. Whole blocks of 32 code units are classified with SIMD
+// comparisons; the tail is delegated to the scalar implementation.
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
+                                                    size_t size) {
+  const size_t vectorized = size / 32 * 32;
+  size_t total = 0;
+  size_t offset = 0;
+  // This algorithm could no doubt be improved!
+  while (offset < vectorized) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    if (!match_system(big_endian)) {
+      block.swap_bytes();
+    }
+    const uint64_t upto_7f = block.lteq(0x7F);
+    const uint64_t upto_7ff = block.lteq(0x7FF);
+    const uint64_t non_surrogate = block.not_in_range(0xD800, 0xDFFF);
+
+    // Each count is halved exactly as the masks are produced (presumably two
+    // mask bits per 16-bit unit -- confirm against simd16x32).
+    const size_t one_byte = count_ones(upto_7f) / 2;
+    const size_t two_bytes = count_ones(upto_7ff & ~upto_7f) / 2;
+    const size_t three_bytes = count_ones(non_surrogate & ~upto_7ff) / 2;
+    // Surrogate code units: each half of a pair accounts for 2 of the 4
+    // bytes of the encoded code point.
+    const size_t surrogates = 32 - count_ones(non_surrogate) / 2;
+    total += 2 * surrogates + 3 * three_bytes + 2 * two_bytes + one_byte;
+    offset += 32;
+  }
+  return total + scalar::utf16::utf8_length_from_utf16<big_endian>(
+                     in + offset, size - offset);
+}
+
+// The UTF-32 length of a UTF-16 string is its number of code points, so this
+// simply forwards to count_code_points.
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
+                                                     size_t size) {
+  const size_t code_points = count_code_points<big_endian>(in, size);
+  return code_points;
+}
+
+// Writes to `output` the `size` UTF-16 code units of `in` with the two bytes
+// of every code unit swapped. Whole blocks of 32 code units use SIMD; the
+// remaining units are handled by the scalar routine.
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+  const size_t vectorized = size / 32 * 32;
+  size_t offset = 0;
+
+  for (; offset < vectorized; offset += 32, output += 32) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    block.swap_bytes();
+    block.store(reinterpret_cast<uint16_t *>(output));
+  }
+
+  scalar::utf16::change_endianness_utf16(in + offset, size - offset, output);
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8.h b/contrib/simdutf/src/generic/utf8.h
new file mode 100644
index 000000000..bd44a47e4
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8.h
@@ -0,0 +1,40 @@
+#include "scalar/utf8.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8 {
+
+using namespace simd;
+
+// Counts the code points in the `size` bytes of UTF-8 at `in` by counting
+// the bytes that are not continuation bytes. Complete 64-byte blocks use
+// SIMD; the tail goes to the scalar implementation.
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
+  size_t total = 0;
+  size_t offset = 0;
+  while (offset + 64 <= size) {
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
+    // As signed bytes, continuation bytes (10xxxxxx) are <= -65, so anything
+    // greater starts a code point.
+    const uint64_t leading_mask = block.gt(-65);
+    total += count_ones(leading_mask);
+    offset += 64;
+  }
+  return total + scalar::utf8::count_code_points(in + offset, size - offset);
+}
+
+// Computes how many UTF-16 code units are needed to encode the `size` bytes
+// of UTF-8 at `in`. Complete 64-byte blocks are processed with SIMD; the
+// tail is delegated to the scalar implementation.
+simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos + 64 <= size; pos += 64) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+    // We count one word for anything that is not a continuation (so
+    // leading bytes).
+    count += 64 - count_ones(utf8_continuation_mask);
+    // Bytes >= 240 (0xF0) lead 4-byte sequences, which need a second UTF-16
+    // word (surrogate pair). The mask is kept unsigned like every other mask
+    // here, so bit 63 never goes through a signed conversion.
+    uint64_t utf8_4byte = input.gteq_unsigned(240);
+    count += count_ones(utf8_4byte);
+  }
+  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
+}
+} // namespace utf8
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_to_latin1/utf8_to_latin1.h b/contrib/simdutf/src/generic/utf8_to_latin1/utf8_to_latin1.h
new file mode 100644
index 000000000..3af6b150d
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_to_latin1/utf8_to_latin1.h
@@ -0,0 +1,315 @@
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
+
+// Classifies each byte of `input` together with its predecessor (`prev1`) for
+// the UTF-8 to Latin-1 transcoder. Three 16-entry lookups -- on the high
+// nibble of the first byte, the low nibble of the first byte and the high
+// nibble of the second byte -- each yield a set of candidate error bits, and
+// the three sets are ANDed. A nonzero byte in the result marks an error.
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+ // For UTF-8 to Latin 1, we can allow any ASCII character, and any
+ // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
+ // 0b11000010 and nothing else.
+ //
+ // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+ // Bit 1 = Too Long (ASCII followed by continuation)
+ // Bit 2 = Overlong 3-byte
+ // Bit 3 = Too Large
+ // Bit 4 = Surrogate
+ // Bit 5 = Overlong 2-byte
+ // Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte (shared bit)
+ // Bit 7 = Two Continuations
+ constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+ // 11______ 11______
+ constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+ constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+ constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+ constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+ constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+ constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+ // 11110100 101_____
+ // 11110101 1001____
+ // 11110101 101_____
+ // 1111011_ 1001____
+ // 1111011_ 101_____
+ // 11111___ 1001____
+ // 11111___ 101_____
+ constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+ // 11110101 1000____
+ // 1111011_ 1000____
+ // 11111___ 1000____
+ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+ // All bits set: a FORBIDDEN entry never masks out an error bit contributed
+ // by the other two lookups.
+ constexpr const uint8_t FORBIDDEN = 0xff;
+
+ const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+ // 0_______ ________ <ASCII in byte 1>
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+ TOO_LONG,
+ // 10______ ________ <continuation in byte 1>
+ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+ // 1100____ ________ <two byte lead in byte 1>
+ TOO_SHORT | OVERLONG_2,
+ // 1101____ ________ <two byte lead in byte 1, not representable in Latin-1>
+ FORBIDDEN,
+ // 1110____ ________ <three byte lead in byte 1>
+ FORBIDDEN,
+ // 1111____ ________ <four+ byte lead in byte 1>
+ FORBIDDEN);
+ constexpr const uint8_t CARRY =
+ TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+ const simd8<uint8_t> byte_1_low =
+ (prev1 & 0x0F)
+ .lookup_16<uint8_t>(
+ // ____0000 ________
+ CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+ // ____0001 ________
+ CARRY | OVERLONG_2,
+ // ____001_ ________
+ CARRY, CARRY,
+
+ // ____0100 ________
+ FORBIDDEN,
+ // ____0101 ________
+ FORBIDDEN,
+ // ____011_ ________
+ FORBIDDEN, FORBIDDEN,
+
+ // ____1___ ________
+ FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
+ // ____1101 ________
+ FORBIDDEN, FORBIDDEN, FORBIDDEN);
+ const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+ // ________ 0_______ <ASCII in byte 2>
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+ TOO_SHORT, TOO_SHORT,
+
+ // ________ 1000____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+ OVERLONG_4,
+ // ________ 1001____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+ // ________ 101_____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+ // ________ 11______
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+ // An error bit survives only if all three lookups agree on it.
+ return (byte_1_high & byte_1_low & byte_2_high);
+}
+
+// Converts UTF-8 to Latin-1 while validating the input. Errors found by the
+// SIMD checks accumulate in `error`; the end of the input is handled by the
+// scalar converter.
+struct validating_transcoder {
+ // If this is nonzero, there has been a UTF-8 error.
+ simd8<uint8_t> error;
+
+ validating_transcoder() : error(uint8_t(0)) {}
+ //
+ // Check whether the current bytes are valid UTF-8.
+ //
+ simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+ const simd8<uint8_t> prev_input) {
+ // Only the immediately preceding byte is needed here: prev1 is handed to
+ // check_special_cases together with the input.
+ // NOTE(review): prev<1> presumably shifts the input back by one byte,
+ // taking the missing byte from prev_input -- confirm against simd8.
+ simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+ this->error |= check_special_cases(input, prev1);
+ }
+
+ // Converts `size` bytes of UTF-8 at `in` to Latin-1 in `latin1_output`.
+ // Returns the number of bytes written, or 0 if an error was detected.
+ simdutf_really_inline size_t convert(const char *in, size_t size,
+ char *latin1_output) {
+ size_t pos = 0;
+ char *start{latin1_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 16 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 16; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) >
+ -65); // twos complement of -65 is 1011 1111 ...
+ }
+ // If the input is long enough, the loop stops with in[margin] being the
+ // 16th leading (non-continuation) byte counted from the end.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ // Fast path: an all-ASCII block is copied through unchanged.
+ input.store((int8_t *)latin1_output);
+ latin1_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ // The first chunk of every block is checked against an all-zero
+ // previous chunk.
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask =
+ input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+ // this case, we also have ASCII to account for.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // Shifting the leading-byte mask right by one marks the byte that
+ // precedes each leading byte, i.e. the last byte of a code point.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 to 12 inputs bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_latin1(
+ in + pos, utf8_end_of_code_point_mask, latin1_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ // Errors accumulated by the SIMD checks are only inspected once, after the
+ // main loop.
+ if (errors()) {
+ return 0;
+ }
+ if (pos < size) {
+ // Whatever the SIMD loop left over is converted by the scalar routine;
+ // a return of 0 from it is treated as a failure.
+ size_t howmany =
+ scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
+ if (howmany == 0) {
+ return 0;
+ }
+ latin1_output += howmany;
+ }
+ return latin1_output - start;
+ }
+
+ // Same conversion as convert(), but reports failures through `result`: on
+ // error, `count` is the position of the error in the input; on success,
+ // `count` is the number of Latin-1 bytes written.
+ simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+ char *latin1_output) {
+ size_t pos = 0;
+ char *start{latin1_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) > -65);
+ }
+ // If the input is long enough, the loop stops with in[margin] being the
+ // eighth leading (non-continuation) byte counted from the end.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ // Fast path: an all-ASCII block is copied through unchanged.
+ input.store((int8_t *)latin1_output);
+ latin1_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ // Unlike convert(), errors are checked per block so that the scalar
+ // routine can locate the exact error position.
+ if (errors()) {
+ // rewind_and_convert_with_errors will seek a potential error from
+ // in+pos onward, with the ability to go back up to pos bytes, and
+ // read size-pos bytes forward.
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, latin1_output);
+ res.count += pos;
+ return res;
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 to 12 inputs bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_latin1(
+ in + pos, utf8_end_of_code_point_mask, latin1_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (errors()) {
+ // rewind_and_convert_with_errors will seek a potential error from in+pos
+ // onward, with the ability to go back up to pos bytes, and read size-pos
+ // bytes forward.
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, latin1_output);
+ res.count += pos;
+ return res;
+ }
+ if (pos < size) {
+ // rewind_and_convert_with_errors will seek a potential error from in+pos
+ // onward, with the ability to go back up to pos bytes, and read size-pos
+ // bytes forward.
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, latin1_output);
+ if (res.error) { // In case of error, we want the error position
+ res.count += pos;
+ return res;
+ } else { // In case of success, we want the number of word written
+ latin1_output += res.count;
+ }
+ }
+ return result(error_code::SUCCESS, latin1_output - start);
+ }
+
+ // True once any checked block has produced an error bit.
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+}; // struct validating_transcoder
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h b/contrib/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h
new file mode 100644
index 000000000..4ba34adb7
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h
@@ -0,0 +1,80 @@
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
+
+// Converts `size` bytes of UTF-8 at `in` to Latin-1 in `latin1_output`
+// without validating the input, and returns the number of bytes written.
+// The end of the input is handled by the scalar convert_valid routine.
+simdutf_really_inline size_t convert_valid(const char *in, size_t size,
+ char *latin1_output) {
+ size_t pos = 0;
+ char *start{latin1_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
+ // 16 bytes, and if the data is valid, then it is entirely safe because 16
+ // UTF-8 bytes generate much more than 8 bytes. Even so, this routine uses
+ // the same conservative margin as the validating converters: it goes back
+ // from the end counting 8 leading bytes.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) >
+ -65); // twos complement of -65 is 1011 1111 ...
+ }
+ // If the input is long enough, the loop stops with in[margin] being the
+ // eighth leading (non-continuation) byte counted from the end.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ // Fast path: an all-ASCII block is copied through unchanged.
+ input.store((int8_t *)latin1_output);
+ latin1_output += 64;
+ pos += 64;
+ } else {
+ // Slow path: locate the end of every code point in the block and let
+ // convert_masked_utf8_to_latin1 consume the block piece by piece.
+ uint64_t utf8_continuation_mask =
+ input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+ // this case, we also have ASCII to account for.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 to 12 inputs bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_latin1(
+ in + pos, utf8_end_of_code_point_mask, latin1_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (pos < size) {
+ // The remaining bytes are converted by the scalar routine.
+ size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
+ latin1_output);
+ latin1_output += howmany;
+ }
+ return latin1_output - start;
+}
+
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+ // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_to_utf16/utf8_to_utf16.h b/contrib/simdutf/src/generic/utf8_to_utf16/utf8_to_utf16.h
new file mode 100644
index 000000000..9cf3392e4
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_to_utf16/utf8_to_utf16.h
@@ -0,0 +1,334 @@
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_to_utf16 {
+using namespace simd;
+
+// Classifies each byte of `input` together with its predecessor (`prev1`).
+// Three 16-entry lookups -- on the high nibble of the first byte, the low
+// nibble of the first byte and the high nibble of the second byte -- each
+// yield a set of candidate error bits, and the three sets are ANDed, so a
+// bit survives only when all three lookups agree on it.
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+ // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+ // Bit 1 = Too Long (ASCII followed by continuation)
+ // Bit 2 = Overlong 3-byte
+ // Bit 3 = Too Large
+ // Bit 4 = Surrogate
+ // Bit 5 = Overlong 2-byte
+ // Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte (shared bit)
+ // Bit 7 = Two Continuations
+ constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+ // 11______ 11______
+ constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+ constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+ constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+ constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+ constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+ constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+ // 11110100 101_____
+ // 11110101 1001____
+ // 11110101 101_____
+ // 1111011_ 1001____
+ // 1111011_ 101_____
+ // 11111___ 1001____
+ // 11111___ 101_____
+ constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+ // 11110101 1000____
+ // 1111011_ 1000____
+ // 11111___ 1000____
+ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+ const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+ // 0_______ ________ <ASCII in byte 1>
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+ TOO_LONG,
+ // 10______ ________ <continuation in byte 1>
+ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+ // 1100____ ________ <two byte lead in byte 1>
+ TOO_SHORT | OVERLONG_2,
+ // 1101____ ________ <two byte lead in byte 1>
+ TOO_SHORT,
+ // 1110____ ________ <three byte lead in byte 1>
+ TOO_SHORT | OVERLONG_3 | SURROGATE,
+ // 1111____ ________ <four+ byte lead in byte 1>
+ TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+ constexpr const uint8_t CARRY =
+ TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+ const simd8<uint8_t> byte_1_low =
+ (prev1 & 0x0F)
+ .lookup_16<uint8_t>(
+ // ____0000 ________
+ CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+ // ____0001 ________
+ CARRY | OVERLONG_2,
+ // ____001_ ________
+ CARRY, CARRY,
+
+ // ____0100 ________
+ CARRY | TOO_LARGE,
+ // ____0101 ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ // ____011_ ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+ // ____1___ ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ // ____1101 ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000);
+ const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+ // ________ 0_______ <ASCII in byte 2>
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+ TOO_SHORT, TOO_SHORT,
+
+ // ________ 1000____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+ OVERLONG_4,
+ // ________ 1001____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+ // ________ 101_____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+ // ________ 11______
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+ // An error bit survives only if all three lookups agree on it.
+ return (byte_1_high & byte_1_low & byte_2_high);
+}
+// Combines the result of check_special_cases (`sc`) with a check on the
+// bytes two and three positions back: the top bit (0x80) of the
+// must_be_2_3_continuation result is XORed into `sc`.
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
+  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
+  const simd8<uint8_t> expected_continuation =
+      simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
+  // Keep only the top bit, which lines up with the TWO_CONTS bit (1 << 7) of
+  // check_special_cases.
+  const simd8<uint8_t> expected_top_bit = expected_continuation & uint8_t(0x80);
+  return expected_top_bit ^ sc;
+}
+
+// Converts UTF-8 to UTF-16 (endianness chosen by the template argument) while
+// validating the input. Errors found by the SIMD checks accumulate in
+// `error`; the end of the input is handled by the scalar converter.
+struct validating_transcoder {
+ // If this is nonzero, there has been a UTF-8 error.
+ simd8<uint8_t> error;
+
+ validating_transcoder() : error(uint8_t(0)) {}
+ //
+ // Check whether the current bytes are valid UTF-8.
+ //
+ simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+ const simd8<uint8_t> prev_input) {
+ // prev1 feeds the pairwise check in check_special_cases; the bytes two
+ // and three positions back are derived inside check_multibyte_lengths.
+ // NOTE(review): prev<1> presumably shifts the input back by one byte,
+ // taking the missing byte from prev_input -- confirm against simd8.
+ simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+ simd8<uint8_t> sc = check_special_cases(input, prev1);
+ this->error |= check_multibyte_lengths(input, prev_input, sc);
+ }
+
+ // Converts `size` bytes of UTF-8 at `in` to UTF-16 in `utf16_output`.
+ // Returns the number of char16_t written, or 0 if an error was detected.
+ template <endianness endian>
+ simdutf_really_inline size_t convert(const char *in, size_t size,
+ char16_t *utf16_output) {
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) > -65);
+ }
+ // If the input is long enough, the loop stops with in[margin] being the
+ // eighth leading (non-continuation) byte counted from the end.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ // Fast path: an all-ASCII block becomes 64 UTF-16 code units.
+ input.store_ascii_as_utf16<endian>(utf16_output);
+ utf16_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ // The first chunk of every block is checked against an all-zero
+ // previous chunk.
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ // NOTE(review): bit 0 presumably corresponds to the first byte of the
+ // block; a block must not begin with a continuation byte.
+ if (utf8_continuation_mask & 1) {
+ return 0; // error
+ }
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // Shifting the leading-byte mask right by one marks the byte that
+ // precedes each leading byte, i.e. the last byte of a code point.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 to 12 inputs bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_utf16<endian>(
+ in + pos, utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ // Errors accumulated by the SIMD checks are only inspected once, after the
+ // main loop.
+ if (errors()) {
+ return 0;
+ }
+ if (pos < size) {
+ // Whatever the SIMD loop left over is converted by the scalar routine;
+ // a return of 0 from it is treated as a failure.
+ size_t howmany = scalar::utf8_to_utf16::convert<endian>(
+ in + pos, size - pos, utf16_output);
+ if (howmany == 0) {
+ return 0;
+ }
+ utf16_output += howmany;
+ }
+ return utf16_output - start;
+ }
+
+ // Same conversion as convert(), but reports failures through `result`: on
+ // error, `count` is the position of the error in the input; on success,
+ // `count` is the number of char16_t written.
+ template <endianness endian>
+ simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+ char16_t *utf16_output) {
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) > -65);
+ }
+ // If the input is long enough, the loop stops with in[margin] being the
+ // eighth leading (non-continuation) byte counted from the end.
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ // Fast path: an all-ASCII block becomes 64 UTF-16 code units.
+ input.store_ascii_as_utf16<endian>(utf16_output);
+ utf16_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ // Unlike convert(), errors are checked per block so that the scalar
+ // routine can locate the exact error position.
+ if (errors() || (utf8_continuation_mask & 1)) {
+ // rewind_and_convert_with_errors will seek a potential error from
+ // in+pos onward, with the ability to go back up to pos bytes, and
+ // read size-pos bytes forward.
+ result res =
+ scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+ pos, in + pos, size - pos, utf16_output);
+ res.count += pos;
+ return res;
+ }
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 to 12 inputs bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_utf16<endian>(
+ in + pos, utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (errors()) {
+ // rewind_and_convert_with_errors will seek a potential error from in+pos
+ // onward, with the ability to go back up to pos bytes, and read size-pos
+ // bytes forward.
+ result res =
+ scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+ pos, in + pos, size - pos, utf16_output);
+ res.count += pos;
+ return res;
+ }
+ if (pos < size) {
+ // rewind_and_convert_with_errors will seek a potential error from in+pos
+ // onward, with the ability to go back up to pos bytes, and read size-pos
+ // bytes forward.
+ result res =
+ scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+ pos, in + pos, size - pos, utf16_output);
+ if (res.error) { // In case of error, we want the error position
+ res.count += pos;
+ return res;
+ } else { // In case of success, we want the number of word written
+ utf16_output += res.count;
+ }
+ }
+ return result(error_code::SUCCESS, utf16_output - start);
+ }
+
+ // True once any checked block has produced an error bit.
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+}; // struct validating_transcoder
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_to_utf16/valid_utf8_to_utf16.h b/contrib/simdutf/src/generic/utf8_to_utf16/valid_utf8_to_utf16.h
new file mode 100644
index 000000000..ceda631b1
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_to_utf16/valid_utf8_to_utf16.h
@@ -0,0 +1,76 @@
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_to_utf16 {
+
+using namespace simd;
+
+template <endianness endian>
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+ char16_t *utf16_output) noexcept {
+ // Transcodes `size` bytes of UTF-8 to UTF-16 and returns the number of
+ // char16_t units written. No validation is done here (valid input assumed).
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ const size_t safety_margin = 16; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ // this loop could be unrolled further. For example, we could process the
+ // mask far more than 64 bytes.
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ if (in.is_ascii()) {
+ in.store_ascii_as_utf16<endian>(utf16_output);
+ utf16_output += 64;
+ pos += 64;
+ } else {
+ // Slow path. We hope that the compiler will recognize that this is a slow
+ // path. Anything that is not a continuation byte is a 'leading byte',
+ // that is, the start of a new code point.
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, i.e. the largest possible
+ // continuation byte
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end*
+ // of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16<endian>(
+ input + pos, utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
+ input + pos, size - pos, utf16_output);
+ return utf16_output - start;
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_to_utf32/utf8_to_utf32.h b/contrib/simdutf/src/generic/utf8_to_utf32/utf8_to_utf32.h
new file mode 100644
index 000000000..376f13d32
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_to_utf32/utf8_to_utf32.h
@@ -0,0 +1,320 @@
+#include "scalar/utf8_to_utf32/utf8_to_utf32.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_to_utf32 {
+using namespace simd;
+
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+ // Classifies each (prev1, input) byte pair with three 16-entry nibble lookups;
+ // a bit stays set in the result only when all three tables flag the same case.
+ // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+ // Bit 1 = Too Long (ASCII followed by continuation), Bit 2 = Overlong 3-byte
+ // Bit 3 = Too Large, Bit 4 = Surrogate, Bit 5 = Overlong 2-byte
+ // Bit 6 = Too Large (1000) / Overlong 4-byte, Bit 7 = Two Continuations
+ constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+ // 11______ 11______
+ constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+ constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+ constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+ constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+ constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+ constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+ // 11110100 101_____
+ // 11110101 1001____
+ // 11110101 101_____
+ // 1111011_ 1001____
+ // 1111011_ 101_____
+ // 11111___ 1001____
+ // 11111___ 101_____
+ constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+ // 11110101 1000____
+ // 1111011_ 1000____
+ // 11111___ 1000____
+ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+ const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+ // 0_______ ________ <ASCII in byte 1>
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+ TOO_LONG,
+ // 10______ ________ <continuation in byte 1>
+ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+ // 1100____ ________ <two byte lead in byte 1>
+ TOO_SHORT | OVERLONG_2,
+ // 1101____ ________ <two byte lead in byte 1>
+ TOO_SHORT,
+ // 1110____ ________ <three byte lead in byte 1>
+ TOO_SHORT | OVERLONG_3 | SURROGATE,
+ // 1111____ ________ <four+ byte lead in byte 1>
+ TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+ constexpr const uint8_t CARRY =
+ TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+ const simd8<uint8_t> byte_1_low =
+ (prev1 & 0x0F)
+ .lookup_16<uint8_t>(
+ // ____0000 ________
+ CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+ // ____0001 ________
+ CARRY | OVERLONG_2,
+ // ____001_ ________
+ CARRY, CARRY,
+
+ // ____0100 ________
+ CARRY | TOO_LARGE,
+ // ____0101 ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ // ____011_ ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+ // ____1___ ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ // ____1101 ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000);
+ const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+ // ________ 0_______ <ASCII in byte 2>
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+ TOO_SHORT, TOO_SHORT,
+
+ // ________ 1000____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+ OVERLONG_4,
+ // ________ 1001____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+ // ________ 101_____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+ // ________ 11______
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+ return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+ const simd8<uint8_t> prev_input,
+ const simd8<uint8_t> sc) { // sc: result of check_special_cases
+ simd8<uint8_t> prev2 = input.prev<2>(prev_input); // presumably input shifted by 2 bytes
+ simd8<uint8_t> prev3 = input.prev<3>(prev_input); // presumably input shifted by 3 bytes
+ simd8<uint8_t> must23 =
+ simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+ simd8<uint8_t> must23_80 = must23 & uint8_t(0x80); // keep bit 7 only (the TWO_CONTS bit)
+ return must23_80 ^ sc; // toggles bit 7 of sc where must23 is set; other bits pass through
+}
+
+struct validating_transcoder {
+ // If this is nonzero, there has been a UTF-8 error.
+ simd8<uint8_t> error;
+
+ validating_transcoder() : error(uint8_t(0)) {}
+ //
+ // Check whether the current bytes are valid UTF-8.
+ //
+ simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+ const simd8<uint8_t> prev_input) {
+ // prev1 is input.prev<1>(prev_input) -- presumably `input` shifted by one
+ // byte with the gap filled from prev_input. Error bits from the special-case
+ // and length checks are OR-ed into this->error.
+ simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+ simd8<uint8_t> sc = check_special_cases(input, prev1);
+ this->error |= check_multibyte_lengths(input, prev_input, sc);
+ }
+
+ simdutf_really_inline size_t convert(const char *in, size_t size,
+ char32_t *utf32_output) { // returns the char32_t count, or 0 on error
+ size_t pos = 0;
+ char32_t *start{utf32_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) > -65);
+ }
+ // If the input is long enough, then in[margin] is the eighth-last leading
+ // byte (the loop above stops right after counting it).
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ input.store_ascii_as_utf32(utf32_output);
+ utf32_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ if (utf8_continuation_mask & 1) {
+ return 0; // error: the block starts with a continuation byte
+ }
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_utf32(
+ in + pos, utf8_end_of_code_point_mask, utf32_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (errors()) {
+ return 0;
+ }
+ if (pos < size) {
+ size_t howmany =
+ scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+ if (howmany == 0) {
+ return 0;
+ }
+ utf32_output += howmany;
+ }
+ return utf32_output - start;
+ }
+
+ simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+ char32_t *utf32_output) {
+ size_t pos = 0;
+ char32_t *start{utf32_output};
+ // In the worst case, we have the haswell kernel which can cause an overflow
+ // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
+ // last 16 bytes, and if the data is valid, then it is entirely safe because
+ // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+ // generally assume that you have valid UTF-8 input, so we are going to go
+ // back from the end counting 8 leading bytes, to give us a good margin.
+ size_t leading_byte = 0;
+ size_t margin = size;
+ for (; margin > 0 && leading_byte < 8; margin--) {
+ leading_byte += (int8_t(in[margin - 1]) > -65);
+ }
+ // If the input is long enough, then in[margin] is the eighth-last leading
+ // byte (the loop above stops right after counting it).
+ const size_t safety_margin = size - margin + 1; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+ if (input.is_ascii()) {
+ input.store_ascii_as_utf32(utf32_output);
+ utf32_output += 64;
+ pos += 64;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio,
+ // it is not good enough.
+ static_assert(
+ (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ if (errors() || (utf8_continuation_mask & 1)) {
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, utf32_output);
+ res.count += pos;
+ return res;
+ }
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times.
+ while (pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_utf32(
+ in + pos, utf8_end_of_code_point_mask, utf32_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if (errors()) {
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, utf32_output);
+ res.count += pos;
+ return res;
+ }
+ if (pos < size) {
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, in + pos, size - pos, utf32_output);
+ if (res.error) { // In case of error, we want the error position
+ res.count += pos;
+ return res;
+ } else { // In case of success, we want the number of words written
+ utf32_output += res.count;
+ }
+ }
+ return result(error_code::SUCCESS, utf32_output - start);
+ }
+
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+}; // struct validating_transcoder
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_to_utf32/valid_utf8_to_utf32.h b/contrib/simdutf/src/generic/utf8_to_utf32/valid_utf8_to_utf32.h
new file mode 100644
index 000000000..c2dc6342c
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_to_utf32/valid_utf8_to_utf32.h
@@ -0,0 +1,44 @@
+#include "scalar/utf8_to_utf32/valid_utf8_to_utf32.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_to_utf32 {
+
+using namespace simd;
+
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+ char32_t *utf32_output) noexcept {
+ size_t pos = 0; // number of input bytes consumed so far
+ char32_t *start{utf32_output}; // kept to compute the returned char32_t count
+ const size_t safety_margin = 16; // to avoid overruns!
+ while (pos + 64 + safety_margin <= size) {
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ if (in.is_ascii()) {
+ in.store_ascii_as_utf32(utf32_output);
+ utf32_output += 64;
+ pos += 64;
+ } else {
+ // -65 is 0b10111111 in two's complement, i.e. the largest possible
+ // continuation byte
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+ size_t max_starting_point = (pos + 64) - 12;
+ while (pos < max_starting_point) {
+ size_t consumed = convert_masked_utf8_to_utf32(
+ input + pos, utf8_end_of_code_point_mask, utf32_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ }
+ }
+ utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
+ utf32_output);
+ return utf32_output - start;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_validation/utf8_lookup4_algorithm.h b/contrib/simdutf/src/generic/utf8_validation/utf8_lookup4_algorithm.h
new file mode 100644
index 000000000..ff01e2329
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_validation/utf8_lookup4_algorithm.h
@@ -0,0 +1,223 @@
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_validation {
+
+using namespace simd;
+
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+ // Classifies each (prev1, input) byte pair with three 16-entry nibble lookups;
+ // a bit stays set in the result only when all three tables flag the same case.
+ // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+ // Bit 1 = Too Long (ASCII followed by continuation), Bit 2 = Overlong 3-byte
+ // Bit 3 = Too Large, Bit 4 = Surrogate, Bit 5 = Overlong 2-byte
+ // Bit 6 = Too Large (1000) / Overlong 4-byte, Bit 7 = Two Continuations
+ constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+ // 11______ 11______
+ constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+ constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+ constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+ constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+ constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+ constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+ // 11110100 101_____
+ // 11110101 1001____
+ // 11110101 101_____
+ // 1111011_ 1001____
+ // 1111011_ 101_____
+ // 11111___ 1001____
+ // 11111___ 101_____
+ constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+ // 11110101 1000____
+ // 1111011_ 1000____
+ // 11111___ 1000____
+ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+ const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+ // 0_______ ________ <ASCII in byte 1>
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+ TOO_LONG,
+ // 10______ ________ <continuation in byte 1>
+ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+ // 1100____ ________ <two byte lead in byte 1>
+ TOO_SHORT | OVERLONG_2,
+ // 1101____ ________ <two byte lead in byte 1>
+ TOO_SHORT,
+ // 1110____ ________ <three byte lead in byte 1>
+ TOO_SHORT | OVERLONG_3 | SURROGATE,
+ // 1111____ ________ <four+ byte lead in byte 1>
+ TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+ constexpr const uint8_t CARRY =
+ TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+ const simd8<uint8_t> byte_1_low =
+ (prev1 & 0x0F)
+ .lookup_16<uint8_t>(
+ // ____0000 ________
+ CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+ // ____0001 ________
+ CARRY | OVERLONG_2,
+ // ____001_ ________
+ CARRY, CARRY,
+
+ // ____0100 ________
+ CARRY | TOO_LARGE,
+ // ____0101 ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ // ____011_ ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+ // ____1___ ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ // ____1101 ________
+ CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+ CARRY | TOO_LARGE | TOO_LARGE_1000,
+ CARRY | TOO_LARGE | TOO_LARGE_1000);
+ const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+ // ________ 0_______ <ASCII in byte 2>
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+ TOO_SHORT, TOO_SHORT,
+
+ // ________ 1000____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+ OVERLONG_4,
+ // ________ 1001____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+ // ________ 101_____
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+ // ________ 11______
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+ return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+ const simd8<uint8_t> prev_input,
+ const simd8<uint8_t> sc) { // sc: result of check_special_cases
+ simd8<uint8_t> prev2 = input.prev<2>(prev_input); // presumably input shifted by 2 bytes
+ simd8<uint8_t> prev3 = input.prev<3>(prev_input); // presumably input shifted by 3 bytes
+ simd8<uint8_t> must23 =
+ simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+ simd8<uint8_t> must23_80 = must23 & uint8_t(0x80); // keep bit 7 only (the TWO_CONTS bit)
+ return must23_80 ^ sc; // toggles bit 7 of sc where must23 is set; other bits pass through
+}
+
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the
+// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+ // If the input's last 3 bytes match this pattern, the final character is
+ // too short (it is cut off by the end of the block):
+ // ... 1111____ 111_____ 11______
+ static const uint8_t max_array[32] = {255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 0b11110000u - 1,
+ 0b11100000u - 1,
+ 0b11000000u - 1};
+ const simd8<uint8_t> max_value( // loads the last sizeof(simd8<uint8_t>) entries of max_array
+ &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+ return input.gt_bits(max_value); // presumably nonzero where input > max_value
+}
+
+struct utf8_checker {
+ // If this is nonzero, there has been a UTF-8 error.
+ simd8<uint8_t> error;
+ // The last input we received
+ simd8<uint8_t> prev_input_block;
+ // Whether the last input we received was incomplete (used for ASCII fast
+ // path)
+ simd8<uint8_t> prev_incomplete;
+
+ //
+ // Check whether the current bytes are valid UTF-8.
+ //
+ simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+ const simd8<uint8_t> prev_input) {
+ // prev1 is input.prev<1>(prev_input) -- presumably `input` shifted by one
+ // byte with the gap filled from prev_input. Error bits from the special-case
+ // and length checks are OR-ed into this->error.
+ simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+ simd8<uint8_t> sc = check_special_cases(input, prev1);
+ this->error |= check_multibyte_lengths(input, prev_input, sc);
+ }
+
+ // The only problem that can happen at EOF is that a multibyte character is
+ // too short or a byte value too large in the last bytes: check_special_cases
+ // only checks for bytes too large in the first of two bytes.
+ simdutf_really_inline void check_eof() {
+ // If the last non-ASCII block ended with an incomplete UTF-8 character,
+ // that is an error: fold prev_incomplete into error.
+ this->error |= this->prev_incomplete;
+ }
+
+ simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
+ if (simdutf_likely(is_ascii(input))) {
+ this->error |= this->prev_incomplete;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio, it
+ // is not good enough.
+ static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+ (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ this->prev_incomplete =
+ is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+ this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+ }
+ }
+
+ // do not forget to call check_eof!
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+}; // struct utf8_checker
+} // namespace utf8_validation
+
+using utf8_validation::utf8_checker;
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/generic/utf8_validation/utf8_validator.h b/contrib/simdutf/src/generic/utf8_validation/utf8_validator.h
new file mode 100644
index 000000000..a8c92d95a
--- /dev/null
+++ b/contrib/simdutf/src/generic/utf8_validation/utf8_validator.h
@@ -0,0 +1,138 @@
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace utf8_validation {
+
+/**
+ * Returns true when input[0, length) is valid UTF-8 as judged by `checker`.
+ */
+template <class checker>
+bool generic_validate_utf8(const uint8_t *input, size_t length) {
+ checker c{};
+ buf_block_reader<64> reader(input, length);
+ while (reader.has_full_block()) { // feed every complete 64-byte block
+ simd::simd8x64<uint8_t> in(reader.full_block());
+ c.check_next_input(in);
+ reader.advance();
+ }
+ uint8_t block[64]{}; // zero-initialized scratch block for the tail
+ reader.get_remainder(block); // presumably copies the trailing partial block
+ simd::simd8x64<uint8_t> in(block);
+ c.check_next_input(in);
+ reader.advance();
+ c.check_eof(); // end-of-input check delegated to the checker
+ return !c.errors();
+}
+
+bool generic_validate_utf8(const char *input, size_t length) { // char* overload using utf8_checker
+ return generic_validate_utf8<utf8_checker>(
+ reinterpret_cast<const uint8_t *>(input), length);
+}
+
+/**
+ * Validates UTF-8; once an error is seen, the scalar validator reports it.
+ */
+template <class checker>
+result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
+ checker c{};
+ buf_block_reader<64> reader(input, length);
+ size_t count{0}; // bytes of full blocks already accepted
+ while (reader.has_full_block()) {
+ simd::simd8x64<uint8_t> in(reader.full_block());
+ c.check_next_input(in);
+ if (c.errors()) {
+ if (count != 0) {
+ count--;
+ } // Sometimes the error is only detected in the next chunk
+ result res = scalar::utf8::rewind_and_validate_with_errors(
+ reinterpret_cast<const char *>(input),
+ reinterpret_cast<const char *>(input + count), length - count);
+ res.count += count; // presumably makes the position relative to `input`
+ return res;
+ }
+ reader.advance();
+ count += 64;
+ }
+ uint8_t block[64]{}; // zero-initialized scratch block for the tail
+ reader.get_remainder(block);
+ simd::simd8x64<uint8_t> in(block);
+ c.check_next_input(in);
+ reader.advance();
+ c.check_eof();
+ if (c.errors()) {
+ if (count != 0) {
+ count--;
+ } // Sometimes the error is only detected in the next chunk
+ result res = scalar::utf8::rewind_and_validate_with_errors(
+ reinterpret_cast<const char *>(input),
+ reinterpret_cast<const char *>(input) + count, length - count);
+ res.count += count; // presumably makes the position relative to `input`
+ return res;
+ } else {
+ return result(error_code::SUCCESS, length);
+ }
+}
+
+result generic_validate_utf8_with_errors(const char *input, size_t length) { // char* overload using utf8_checker
+ return generic_validate_utf8_with_errors<utf8_checker>(
+ reinterpret_cast<const uint8_t *>(input), length);
+}
+
+template <class checker> // note: `checker` is not used in the body
+bool generic_validate_ascii(const uint8_t *input, size_t length) {
+ buf_block_reader<64> reader(input, length);
+ uint8_t blocks[64]{}; // all zeros: the initial value of the running OR
+ simd::simd8x64<uint8_t> running_or(blocks);
+ while (reader.has_full_block()) {
+ simd::simd8x64<uint8_t> in(reader.full_block());
+ running_or |= in; // OR every block together; checked once at the end
+ reader.advance();
+ }
+ uint8_t block[64]{}; // zero-initialized scratch block for the tail
+ reader.get_remainder(block);
+ simd::simd8x64<uint8_t> in(block);
+ running_or |= in;
+ return running_or.is_ascii();
+}
+
+bool generic_validate_ascii(const char *input, size_t length) { // char* overload of the template
+ return generic_validate_ascii<utf8_checker>(
+ reinterpret_cast<const uint8_t *>(input), length);
+}
+
+template <class checker> // note: `checker` is not used in the body
+result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
+ buf_block_reader<64> reader(input, length);
+ size_t count{0}; // bytes of full blocks already accepted as ASCII
+ while (reader.has_full_block()) {
+ simd::simd8x64<uint8_t> in(reader.full_block());
+ if (!in.is_ascii()) {
+ result res = scalar::ascii::validate_with_errors(
+ reinterpret_cast<const char *>(input + count), length - count);
+ return result(res.error, count + res.count); // offset by the bytes skipped
+ }
+ reader.advance();
+
+ count += 64;
+ }
+ uint8_t block[64]{}; // zero-initialized scratch block for the tail
+ reader.get_remainder(block);
+ simd::simd8x64<uint8_t> in(block);
+ if (!in.is_ascii()) {
+ result res = scalar::ascii::validate_with_errors(
+ reinterpret_cast<const char *>(input + count), length - count);
+ return result(res.error, count + res.count); // offset by the bytes skipped
+ } else {
+ return result(error_code::SUCCESS, length);
+ }
+}
+
+result generic_validate_ascii_with_errors(const char *input, size_t length) { // char* overload of the template
+ return generic_validate_ascii_with_errors<utf8_checker>(
+ reinterpret_cast<const uint8_t *>(input), length);
+}
+
+} // namespace utf8_validation
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
diff --git a/contrib/simdutf/src/haswell/avx2_base64.cpp b/contrib/simdutf/src/haswell/avx2_base64.cpp
new file mode 100644
index 000000000..87302d181
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_base64.cpp
@@ -0,0 +1,577 @@
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+template <bool base64_url>
+simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { // adds a looked-up per-byte offset to input
+ // credit: Wojciech Muła
+ __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); // saturating: inputs <= 51 become 0
+ const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); // bytes where input < 26
+ result =
+ _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); // inputs < 26 select LUT slot 13 ('A')
+ __m256i shift_LUT;
+ if (base64_url) { // URL alphabet: '-' and '_' stand for 62 and 63
+ shift_LUT = _mm256_setr_epi8(
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
+
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+ } else { // standard alphabet: '+' and '/' stand for 62 and 63
+ shift_LUT = _mm256_setr_epi8(
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+
+ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+ }
+
+ result = _mm256_shuffle_epi8(shift_LUT, result); // per-byte offset, looked up within each 128-bit lane
+ return _mm256_add_epi8(result, input);
+}
+
+template <bool isbase64url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+ base64_options options) { // returns the SIMD output length plus whatever the scalar tail encoder returns
+ // credit: Wojciech Muła
+ const uint8_t *input = (const uint8_t *)src;
+
+ uint8_t *out = (uint8_t *)dst;
+ const __m256i shuf =
+ _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
+
+ 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
+ size_t i = 0;
+ for (; i + 100 <= srclen; i += 96) { // 96 input bytes per pass; the last 16-byte load ends at i + 100
+ const __m128i lo0 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+ const __m128i hi0 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+ const __m128i lo1 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+ const __m128i hi1 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
+ const __m128i lo2 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
+ const __m128i hi2 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
+ const __m128i lo3 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
+ const __m128i hi3 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
+
+ __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+ __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+ __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+ __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
+
+ const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
+
+ const __m256i t1_0 =
+ _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+ const __m256i t1_1 =
+ _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+ const __m256i t1_2 =
+ _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+ const __m256i t1_3 =
+ _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
+
+ const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
+
+ const __m256i t3_0 =
+ _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+ const __m256i t3_1 =
+ _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+ const __m256i t3_2 =
+ _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+ const __m256i t3_3 =
+ _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
+
+ const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
+ const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+ const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+ const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input0));
+ out += 32;
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input1));
+ out += 32;
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input2));
+ out += 32;
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(input3));
+ out += 32;
+ }
+ for (; i + 28 <= srclen; i += 24) { // 24 input bytes per pass; the second 16-byte load ends at i + 28
+ // lo = [xxxx|DDDC|CCBB|BAAA]
+ // hi = [xxxx|HHHG|GGFF|FEEE]
+ const __m128i lo =
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+ const __m128i hi =
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
+
+ // bytes from groups A, B and C are needed in separate 32-bit lanes
+ // in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
+ __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
+
+ // this part is well commented in encode.sse.cpp
+
+ const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+ const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+ const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+ const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+ const __m256i indices = _mm256_or_si256(t1, t3);
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+ lookup_pshufb_improved<isbase64url>(indices));
+ out += 32;
+ }
+ return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, // i is a multiple of 24 here
+ srclen - i, options);
+}
+
+static inline void compress(__m128i data, uint16_t mask, char *output) { // always stores 16 bytes at output
+ if (mask == 0) { // nothing to remove: store the register as is
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+ return;
+ }
+ // this particular implementation was inspired by work done by @animetosho
+ // we do it in two steps, first 8 bytes and then second 8 bytes
+ uint8_t mask1 = uint8_t(mask); // least significant 8 bits
+ uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+ // the next line just loads the 64-bit values thintable_epi8[mask1] and
+ // thintable_epi8[mask2] into a 128-bit register, using only
+ // two instructions on most compilers.
+
+ __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
+ tables::base64::thintable_epi8[mask1]);
+ // we increment the second half of the mask by 0x08
+ shufmask =
+ _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+ // this is the "nearly pruned" version
+ __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+ // we still need to put the two halves together.
+ // we look up the popcount of the first half (pre-scaled, judging by the table name):
+ int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+ // then load the corresponding mask; what it does is write
+ // only the first pop1 bytes from the first 8 bytes, and then
+ // fill in with the bytes from the second 8 bytes + some filling
+ // at the end.
+ __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
+ tables::base64::pshufb_combine_table + pop1 * 8));
+ __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
+}
+
+static inline void compress(__m256i data, uint32_t mask, char *output) { // 32-byte variant built from two 16-byte calls
+ if (mask == 0) { // nothing to remove: store all 32 bytes as is
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
+ return;
+ }
+ compress(_mm256_castsi256_si128(data), uint16_t(mask), output); // low 128 bits, low 16 mask bits
+ compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
+ output + _mm_popcnt_u32(~mask & 0xFFFF)); // skip one byte per clear bit in the low 16 mask bits
+}
+
+struct block64 { // 64 bytes of data held as two 256-bit vectors
+ __m256i chunks[2]; // chunks[0] is loaded from the lower addresses (see load_block)
+};
+
+template <bool base64_url>
+static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { // overwrites *src; *error is written only if the returned mask != 0
+ const __m256i ascii_space_tbl = // 0x20, 0x9, 0xa, 0xc, 0xd: space, tab, LF, FF, CR
+ _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
+ 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
+ // credit: aqrit
+ __m256i delta_asso;
+ if (base64_url) {
+ delta_asso =
+ _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+ } else {
+ delta_asso = _mm256_setr_epi8(
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+ }
+
+ __m256i delta_values;
+ if (base64_url) {
+ delta_values = _mm256_setr_epi8(
+ 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
+ uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
+ uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+ uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
+ uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+ } else {
+ delta_values = _mm256_setr_epi8(
+ int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+ int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+ int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+ int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+ int8_t(0xB9), int8_t(0xB9));
+ }
+ __m256i check_asso;
+
+ if (base64_url) {
+ check_asso =
+ _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
+ 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
+ } else {
+
+ check_asso = _mm256_setr_epi8(
+ 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+ 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+ }
+ __m256i check_values;
+ if (base64_url) {
+ check_values = _mm256_setr_epi8(
+ uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+ uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
+ uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
+ 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+ uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
+ uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
+ uint8_t(0x80), 0x0, uint8_t(0x80));
+ } else {
+ check_values = _mm256_setr_epi8(
+ int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+ int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+ int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+ int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+ int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+ int8_t(0x91), int8_t(0x80));
+ }
+ const __m256i shifted = _mm256_srli_epi32(*src, 3);
+ const __m256i delta_hash =
+ _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
+ const __m256i check_hash =
+ _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
+ const __m256i out =
+ _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
+ const __m256i chk =
+ _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
+ const int mask = _mm256_movemask_epi8(chk); // one bit per byte of chk whose top bit is set
+ if (mask) {
+ __m256i ascii_space =
+ _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
+ *error = (mask ^ _mm256_movemask_epi8(ascii_space)); // bits where mask and the ascii_space_tbl match disagree
+ }
+ *src = out; // the caller receives the translated bytes in place
+ return (uint32_t)mask;
+}
+
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { // 64-byte variant: runs the 32-byte version on each chunk
+ uint32_t err0 = 0; // zeroed because the 32-byte version only writes its error on a nonzero mask
+ uint32_t err1 = 0;
+ uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], &err0);
+ uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], &err1);
+ *error = err0 | ((uint64_t)err1 << 32); // chunk 0 -> low 32 bits, chunk 1 -> high 32 bits
+ return m0 | (m1 << 32);
+}
+
+static inline void copy_block(block64 *b, char *output) { // stores the 64 bytes of b at output, unchanged
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]);
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]);
+}
+
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { // compacts b into output via compress()
+ uint64_t nmask = ~mask; // each clear bit of mask advances the output position by one byte
+ compress(b->chunks[0], uint32_t(mask), output);
+ compress(b->chunks[1], uint32_t(mask >> 32),
+ output + _mm_popcnt_u64(nmask & 0xFFFFFFFF)); // continue right after the bytes kept from chunk 0
+ return _mm_popcnt_u64(nmask); // number of clear bits in mask; the caller adds it to its write pointer
+}
+
+// The caller of this function is responsible for ensuring that 64 bytes are
+// readable at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char *src) {
+ b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ b->chunks[1] =
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+}
+
+// The caller of this function is responsible for ensuring that 128 bytes are
+// readable at src (64 char16_t units). The data is narrowed into a block64.
+static inline void load_block(block64 *b, const char16_t *src) {
+ __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+ __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
+ __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+ __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
+ __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20); // low 128-bit halves of m1 and m2
+ __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31); // high 128-bit halves of m1 and m2
+ __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
+ __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
+ b->chunks[0] = _mm256_packus_epi16(m1p, m2p); // packus works per 128-bit lane; the permutes keep source order
+ b->chunks[1] = _mm256_packus_epi16(m3p, m4p); // each 16-bit unit is narrowed to 8 bits with unsigned saturation
+}
+
+static inline void base64_decode(char *out, __m256i str) { // stores touch out[0..28); 24 of those bytes are payload
+ // credit: aqrit
+ const __m256i pack_shuffle = // keeps 12 bytes per 128-bit lane; the -1 entries produce zero bytes
+ _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+ const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
+ const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
+ const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
+
+ // Store the output (two 16-byte stores, the second one at offset 12):
+ _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
+ _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
+}
+// decode 64 bytes and output 48 bytes (the stores touch out[0..52))
+static inline void base64_decode_block(char *out, const char *src) {
+ base64_decode(out,
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
+ base64_decode(out + 24, _mm256_loadu_si256(
+ reinterpret_cast<const __m256i *>(src + 32)));
+}
+static inline void base64_decode_block_safe(char *out, const char *src) { // writes exactly out[0..48)
+ base64_decode(out,
+ _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
+ char buffer[32]; // base64_decode stores 28 bytes here; only 24 are copied out.
+ base64_decode(
+ buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
+ std::memcpy(out + 24, buffer, 24);
+}
+static inline void base64_decode_block(char *out, block64 *b) { // 48 output bytes; the stores touch out[0..52)
+ base64_decode(out, b->chunks[0]);
+ base64_decode(out + 24, b->chunks[1]);
+}
+static inline void base64_decode_block_safe(char *out, block64 *b) { // writes exactly out[0..48)
+ base64_decode(out, b->chunks[0]);
+ char buffer[32]; // base64_decode stores 28 bytes here; only 24 are copied out.
+ base64_decode(buffer, b->chunks[1]);
+ std::memcpy(out + 24, buffer, 24);
+}
+
+template <bool base64_url, typename chartype>
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+ : tables::base64::to_base64_value;
+ size_t equallocation =
+ srclen; // location of the first padding character if any
+ // skip trailing characters that the table maps to 64 (ignorable, e.g. spaces)
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ size_t equalsigns = 0; // number of trailing '=' found (0, 1 or 2)
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 1;
+ // skip trailing ignorable characters before a possible second '='
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 2;
+ }
+ }
+ if (srclen == 0) { // nothing but padding and/or ignorable characters
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, 0, 0};
+ }
+ char *end_of_safe_64byte_zone =
+ (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst; // at or past this, only the *_safe decoders run
+
+ const chartype *const srcinit = src;
+ const char *const dstinit = dst;
+ const chartype *const srcend = src + srclen;
+
+ constexpr size_t block_size = 6;
+ static_assert(block_size >= 2, "block_size must be at least two");
+ char buffer[block_size * 64]; // staging area for blocks that went through compress_block
+ char *bufferptr = buffer;
+ if (srclen >= 64) {
+ const chartype *const srcend64 = src + srclen - 64;
+ while (src <= srcend64) {
+ block64 b;
+ load_block(&b, src);
+ src += 64;
+ uint64_t error = 0;
+ uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+ if (error) {
+ src -= 64; // rewind so the offset below is relative to this block's start
+ size_t error_offset = _tzcnt_u64(error);
+ return {error_code::INVALID_BASE64_CHARACTER,
+ size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
+ }
+ if (badcharmask != 0) {
+ // optimization opportunity: check for simple masks like those made of
+ // continuous 1s followed by continuous 0s. And masks containing a
+ // single bad character.
+ bufferptr += compress_block(&b, badcharmask, bufferptr);
+ } else if (bufferptr != buffer) { // keep input order: queue behind the data already staged
+ copy_block(&b, bufferptr);
+ bufferptr += 64;
+ } else { // nothing staged and nothing flagged: decode straight into dst
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, &b);
+ } else {
+ base64_decode_block(dst, &b);
+ }
+ dst += 48;
+ }
+ if (bufferptr >= (block_size - 1) * 64 + buffer) { // block_size - 1 full blocks are staged: decode them
+ for (size_t i = 0; i < (block_size - 2); i++) {
+ base64_decode_block(dst, buffer + i * 64);
+ dst += 48;
+ }
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+ } else {
+ base64_decode_block(dst, buffer + (block_size - 2) * 64);
+ }
+ dst += 48;
+ std::memcpy(buffer, buffer + (block_size - 1) * 64,
+ 64); // 64 might be too much
+ bufferptr -= (block_size - 1) * 64;
+ }
+ }
+ }
+
+ char *buffer_start = buffer;
+ // Optimization note: if this is almost full, then it is worth our
+ // time, otherwise, we should just decode directly.
+ int last_block = (int)((bufferptr - buffer_start) % 64);
+ if (last_block != 0 && srcend - src + last_block >= 64) {
+
+ while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { // top up the partial block one character at a time
+ uint8_t val = to_base64[uint8_t(*src)];
+ *bufferptr = char(val);
+ if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ bufferptr += (val <= 63); // a value of 64 is skipped, not stored
+ src++;
+ }
+ }
+
+ for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every full staged block
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer_start);
+ } else {
+ base64_decode_block(dst, buffer_start);
+ }
+ dst += 48;
+ }
+ if ((bufferptr - buffer_start) % 64 != 0) {
+ while (buffer_start + 4 < bufferptr) { // 4-byte store is fine: a later group overwrites the extra byte
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 4);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ if (buffer_start + 4 <= bufferptr) { // last full group: store exactly 3 bytes
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 3);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // we may have 1, 2 or 3 staged values left and we need to decode them, so
+ // let us move src back over them (skipping characters that map to 64)
+ int leftover = int(bufferptr - buffer_start);
+ while (leftover > 0) {
+ while (to_base64[uint8_t(*(src - 1))] == 64) {
+ src--;
+ }
+ src--;
+ leftover--;
+ }
+ }
+ if (src < srcend + equalsigns) { // input remains: hand it to the scalar tail decoder
+ full_result r = scalar::base64::base64_tail_decode(
+ dst, src, srcend - src, equalsigns, options, last_chunk_options);
+ r.input_count += size_t(src - srcinit); // make the count relative to the original src
+ if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+ r.error == error_code::BASE64_EXTRA_BITS) {
+ return r;
+ } else {
+ r.output_count += size_t(dst - dstinit);
+ }
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: the padding must match the output length modulo 3
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ r.error = error_code::INVALID_BASE64_CHARACTER;
+ r.input_count = equallocation;
+ }
+ }
+ return r;
+ }
+ if (equalsigns > 0) { // same padding consistency check when no tail was left to decode
+ if ((size_t(dst - dstinit) % 3 == 0) ||
+ ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+ }
+ }
+ return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp
new file mode 100644
index 000000000..6484dcedf
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp
@@ -0,0 +1,37 @@
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+ char16_t *utf16_output) {
+ size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
+
+ size_t i = 0;
+ for (; i < rounded_len; i += 16) {
+ // Load 16 bytes from the address (input + i) into a xmm register
+ __m128i xmm0 =
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(latin1_input + i));
+
+ // Zero extend each of the low 8 bytes of xmm0 to a word in another xmm register
+ __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
+
+ // Shift xmm0 to the right by 8 bytes
+ xmm0 = _mm_srli_si128(xmm0, 8);
+
+ // Zero extend each byte in the shifted xmm0 to word in xmm0
+ xmm0 = _mm_cvtepu8_epi16(xmm0);
+
+ if (big_endian) { // swap the two bytes of every 16-bit word
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ xmm0 = _mm_shuffle_epi8(xmm0, swap);
+ xmm1 = _mm_shuffle_epi8(xmm1, swap);
+ }
+
+ // Store the contents of xmm1 into the address pointed by (output + i)
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1);
+
+ // Store the contents of xmm0 into the address pointed by (output + i + 8)
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0);
+ }
+
+ return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); // the len % 16 tail is left unconverted
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp
new file mode 100644
index 000000000..f89550b95
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp
@@ -0,0 +1,20 @@
+std::pair<const char *, char32_t *>
+avx2_convert_latin1_to_utf32(const char *buf, size_t len,
+ char32_t *utf32_output) {
+ size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8
+
+ for (size_t i = 0; i < rounded_len; i += 8) {
+ // Load 8 Latin1 characters into the low 64 bits of an xmm register
+ __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]);
+
+ // Zero extend each of the 8 Latin1 characters to a 32-bit integer using
+ // vpmovzxbd
+ __m256i out = _mm256_cvtepu8_epi32(in);
+
+ // Store the results back to memory
+ _mm256_storeu_si256((__m256i *)&utf32_output[i], out);
+ }
+
+ // return pointers pointing to where we left off (the len % 8 tail is not converted)
+ return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp
new file mode 100644
index 000000000..a637e1bb0
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp
@@ -0,0 +1,83 @@
+std::pair<const char *, char *>
+avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+ char *utf8_output) {
+ const char *end = latin1_input + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ const size_t safety_margin = 12;
+
+ while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
+ __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
+ // a single Latin1 byte yields 1 or 2 UTF-8 bytes
+ const __m128i v_80 = _mm_set1_epi8((char)0x80);
+ if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
+ // 1. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, in8);
+ // 2. adjust pointers
+ latin1_input += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // We proceed with these 16 bytes, zero-extended to sixteen 16-bit words.
+ const __m256i in = _mm256_cvtepu8_epi16((in8));
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0000|aabb|bbbb] x 16
+ // expected output : [1100|00aa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [0000|00aa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [0000|00aa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [0000|00aa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [1100|00aa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes (one table row per 128-bit half)
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
+ [0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes (each store writes 16 bytes; row[0] / row_2[0] advance the output)
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ latin1_input += 16;
+ continue;
+
+ } // while
+ return std::make_pair(latin1_input, utf8_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp
new file mode 100644
index 000000000..8c46a23a8
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp
@@ -0,0 +1,85 @@
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *end = buf + len;
+ while (end - buf >= 16) { // fewer than 16 trailing code units are left unconverted
+ // Load 16 UTF-16 characters into 256-bit AVX2 register
+ __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+
+ if (!match_system(big_endian)) { // swap the two bytes of every 16-bit unit
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+ __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
+ if (_mm256_testz_si256(in, high_byte_mask)) { // true when every unit is <= 0xFF
+ // Pack 16-bit characters into 8-bit and store in latin1_output
+ __m128i lo = _mm256_extractf128_si256(in, 0);
+ __m128i hi = _mm256_extractf128_si256(in, 1);
+ __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
+ __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+ latin1_packed_lo);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
+ latin1_packed_hi);
+ // Adjust pointers for next iteration
+ buf += 16;
+ latin1_output += 16;
+ } else {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output)); // nullptr: a unit above 0xFF was found
+ }
+ } // while
+ return std::make_pair(buf, latin1_output);
+}
+
+template <endianness big_endian>
+std::pair<result, char *>
+avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+ while (end - buf >= 16) { // fewer than 16 trailing code units are left unconverted
+ __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+
+ if (!match_system(big_endian)) { // swap the two bytes of every 16-bit unit
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+ __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
+ if (_mm256_testz_si256(in, high_byte_mask)) { // true when every unit is <= 0xFF
+ __m128i lo = _mm256_extractf128_si256(in, 0);
+ __m128i hi = _mm256_extractf128_si256(in, 1);
+ __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
+ __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+ latin1_packed_lo);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
+ latin1_packed_hi);
+ buf += 16;
+ latin1_output += 16;
+ } else {
+ // Fall back to scalar code to locate the first offending code unit
+ for (int k = 0; k < 16; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if (word <= 0xff) {
+ *latin1_output++ = char(word);
+ } else {
+ return std::make_pair(
+ result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, // index of the offending code unit
+ latin1_output);
+ }
+ }
+ buf += 16;
+ }
+ } // while
+ return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)},
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp
new file mode 100644
index 000000000..d396893ca
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp
@@ -0,0 +1,210 @@
+/*
+ The vectorized algorithm works on a single 256-bit AVX2 register, i.e., it
+ loads sixteen 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+ Converts UTF-16 to UTF-32 in blocks of 16 code units. Returns a pair: the first unprocessed code unit
+ of buf (nullptr if an invalid surrogate pair was found) and utf32_output. A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *end = buf + len;
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+ while (end - buf >= 16) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) { // swap the two bytes of every code unit
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x00000000 if there are no surrogates
+ // = 0xc0000000 if only the last word is a surrogate (two mask bits per 16-bit word)
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc0000000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+ // units (zero-extension, low eight first, then high eight)
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+ _mm256_storeu_si256(
+ reinterpret_cast<__m256i *>(utf32_output + 8),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // stop one short of 16 so that buf[k + 1] stays inside the current block
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 16
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ // Not a surrogate: the code unit is the code point
+ *utf32_output++ = char32_t(word);
+ } else {
+ // should be a surrogate pair (validated below)
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // consume the second code unit of the pair
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // word must be in D800..DBFF and next_word in DC00..DFFF
+ return std::make_pair(nullptr, utf32_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k; // k is 15 or 16 here (16 when a pair started at index 14)
+ }
+ } // while
+ return std::make_pair(buf, utf32_output);
+}
+
+/*
+ Converts UTF-16 to UTF-32 in blocks of 16 code units. Returns a pair: a result struct and utf32_output.
+ If there is an error, the count field of the result is the position (in code units from the start
+ of buf) of the first code unit of the invalid surrogate sequence. Otherwise, it is the position of the
+ first unprocessed code unit in buf (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+ while (end - buf >= 16) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) { // swap the two bytes of every code unit
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x00000000 if there are no surrogates
+ // = 0xc0000000 if only the last word is a surrogate (two mask bits per 16-bit word)
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc0000000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+ // units (zero-extension, low eight first, then high eight)
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+ _mm256_storeu_si256(
+ reinterpret_cast<__m256i *>(utf32_output + 8),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // stop one short of 16 so that buf[k + 1] stays inside the current block
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 16
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ // Not a surrogate: the code unit is the code point
+ *utf32_output++ = char32_t(word);
+ } else {
+ // should be a surrogate pair (validated below)
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // consume the second code unit of the pair
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // word must be in D800..DBFF and next_word in DC00..DFFF
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // k was already advanced: k - 1 is the index of word
+ utf32_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k; // k is 15 or 16 here (16 when a pair started at index 14)
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp
new file mode 100644
index 000000000..2a26a0584
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp
@@ -0,0 +1,602 @@
+/*
+ The vectorized algorithm works on a single 256-bit AVX2 register, i.e., it
+ loads sixteen 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+ Converts UTF-16 to UTF-8 in blocks of 16 code units. Returns a pair: the first unprocessed code unit
+ of buf (nullptr if an invalid surrogate pair was found) and utf8_output. A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+ const char16_t *end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // needs 28 remaining code units; at most 16 are consumed per iteration
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) { // swap the two bytes of every code unit
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup (one bit per code unit, one byte per 128-bit half)
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff; // byte 0: code units 0-7, byte 2: code units 8-15
+ // 4. pack the bytes
+
+ const uint8_t *row = // row[0]: number of output bytes, row[1..16]: shuffle pattern
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes (each store writes 16 bytes, but the output only advances by the row's byte count)
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x00000000 if there are no surrogates
+ // = 0xc0000000 if only the last word is a surrogate (two mask bits per 16-bit word)
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc0000000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, // set only for three-byte code units
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0); // [11bb|bbbb] => [10bb|bbbb] for three-byte code units
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- one shuffle per group of four code units
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask); // code units 0-3
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // code units 4-7
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16); // code units 8-11
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24); // code units 12-15
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0); // each store writes 16 bytes; the output advances by the row's byte count
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // stop one short of 16 so that buf[k + 1] stays inside the current block
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 28
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xFF80) == 0) { // one UTF-8 byte
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) { // two UTF-8 bytes
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) { // three UTF-8 bytes
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // should be a surrogate pair (validated below); yields four UTF-8 bytes
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // consume the second code unit of the pair
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // word must be in D800..DBFF and next_word in DC00..DFFF
+ return std::make_pair(nullptr, utf8_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k; // k is 15 or 16 here (16 when a pair started at index 14)
+ }
+ } // while
+ return std::make_pair(buf, utf8_output);
+}
+
+/*
+ Converts UTF-16 to UTF-8 in blocks of 16 code units. Returns a pair: a result struct and utf8_output.
+ If there is an error, the count field of the result is the position (in code units from the start
+ of buf) of the first code unit of the invalid surrogate sequence. Otherwise, it is the position of the
+ first unprocessed code unit in buf (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+ char *utf8_output) {
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // needs 28 remaining code units; at most 16 are consumed per iteration
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ if (big_endian) { // swap the two bytes of every code unit
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ in = _mm256_shuffle_epi8(in, swap);
+ }
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup (one bit per code unit, one byte per 128-bit half)
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff; // byte 0: code units 0-7, byte 2: code units 8-15
+ // 4. pack the bytes
+
+ const uint8_t *row = // row[0]: number of output bytes, row[1..16]: shuffle pattern
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes (each store writes 16 bytes, but the output only advances by the row's byte count)
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m256i surrogates_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x00000000 if there are no surrogates
+ // = 0xc0000000 if only the last word is a surrogate (two mask bits per 16-bit word)
+ const uint32_t surrogates_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc0000000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, // set only for three-byte code units
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0); // [11bb|bbbb] => [10bb|bbbb] for three-byte code units
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- one shuffle per group of four code units
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask); // code units 0-3
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // code units 4-7
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16); // code units 8-11
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24); // code units 12-15
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0); // each store writes 16 bytes; the output advances by the row's byte count
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // stop one short of 16 so that buf[k + 1] stays inside the current block
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 28
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xFF80) == 0) { // one UTF-8 byte
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) { // two UTF-8 bytes
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) { // three UTF-8 bytes
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // should be a surrogate pair (validated below); yields four UTF-8 bytes
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // consume the second code unit of the pair
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // word must be in D800..DBFF and next_word in DC00..DFFF
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // k was already advanced: k - 1 is the index of word
+ utf8_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k; // k is 15 or 16 here (16 when a pair started at index 14)
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp
new file mode 100644
index 000000000..d6a32d5df
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp
@@ -0,0 +1,93 @@
+std::pair<const char32_t *, char *>
+avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const size_t rounded_len =
+ len & ~0x1F; // Round down to a multiple of 32; note the loop below only consumes 16 code points per pass
+ // Overview: converts UTF-32 to Latin-1. Returns {first unread input, next output byte}, or {nullptr, latin1_output} as soon as a loaded chunk holds a value above 0xFF.
+ __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
+ // Per 128-bit lane: gather byte 0 of each 32-bit element into the lane's low 4 bytes; index -1 zeroes the destination byte.
+ __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+ // i only counts processed code points; all reads and writes go through buf / latin1_output.
+ for (size_t i = 0; i < rounded_len; i += 16) {
+ __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
+ __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
+ // OR both halves so a single test covers all 16 code points.
+ __m256i check_combined = _mm256_or_si256(in1, in2);
+ // Any bit above the low byte means some code point exceeds 0xFF; nothing is written for this chunk.
+ if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+ return std::make_pair(nullptr, latin1_output);
+ }
+
+ // Turn UTF32 bytes into latin 1 bytes
+ __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
+ __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
+
+ // move Latin1 bytes to their correct spot: dwords 0 and 4 of each shuffled vector go to output dwords 0-1 (in1) and 2-3 (in2); index -1 picks dword 7, which the shuffle left zero
+ __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
+ __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
+ __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
+ __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
+ // Only the low 128 bits (16 Latin-1 bytes) are stored.
+ __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
+ _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result));
+
+ latin1_output += 16;
+ buf += 16;
+ }
+ // Up to 31 trailing code points (len - rounded_len) are not consumed here.
+ return std::make_pair(buf, latin1_output);
+}
+std::pair<result, char *>
+avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const size_t rounded_len =
+ len & ~0x1F; // Round down to a multiple of 32; note the loop below consumes at most 16 code points per pass
+
+ __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
+ __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+ // Overview: UTF-32 to Latin-1 with error reporting. Returns {result(SUCCESS, code points consumed), next output byte} or {result(TOO_LARGE, index of the offending code point), next output byte}.
+ const char32_t *start = buf;
+ // i only bounds the number of passes; all reads and writes go through buf / latin1_output.
+ for (size_t i = 0; i < rounded_len; i += 16) {
+ __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
+ __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
+ // OR both halves so a single test covers all 16 code points.
+ __m256i check_combined = _mm256_or_si256(in1, in2);
+
+ if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+ // Fallback to scalar code for handling errors: only the first 8 code points (in1) are scanned here
+ for (int k = 0; k < 8; k++) {
+ char32_t codepoint = buf[k];
+ if (codepoint <= 0xFF) {
+ *latin1_output++ = static_cast<char>(codepoint);
+ } else {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+ latin1_output);
+ }
+ }
+ buf += 8; // the offender was in in2: advance by 8 so a later pass meets it in its first half (i still advances by 16)
+ } else {
+ __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
+ __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
+ // Dwords 0 and 4 of each shuffled vector go to output dwords 0-1 (in1) and 2-3 (in2); index -1 picks dword 7, which the shuffle left zero.
+ __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
+ __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
+ __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
+ __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
+ // Only the low 128 bits (16 Latin-1 bytes) are stored.
+ __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
+ _mm_storeu_si128((__m128i *)latin1_output,
+ _mm256_castsi256_si128(result));
+
+ latin1_output += 16;
+ buf += 16;
+ }
+ }
+ // SUCCESS here only covers the code points consumed so far; the count may be smaller than len.
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp
new file mode 100644
index 000000000..ffd6f1e47
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp
@@ -0,0 +1,174 @@
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *end = buf + len;
+ // Overview: converts UTF-32 to UTF-16 (code units byte-swapped when big_endian is set). Returns {first unread input, next output slot}, or {nullptr, utf16_output} on a surrogate code point or a value above 0x10FFFF.
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ __m256i forbidden_bytemask = _mm256_setzero_si256();
+ // Loop while at least 8 + safety_margin code points remain; whatever is left stays unconsumed.
+ while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+ // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+ const __m256i saturation_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ // SIMD path: all 8 code points fit in 16 bits. Otherwise fall back to scalar code below.
+ if (saturation_bitmask == 0xffffffff) {
+ const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+ forbidden_bytemask = _mm256_or_si256(
+ forbidden_bytemask,
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+ // Surrogate hits (0xD800..0xDFFF) are only accumulated above; they are reported once, after the loop.
+ __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+ _mm256_extractf128_si256(in, 1));
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 7; // the scalar path handles 7 code points, not 8
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // cannot be true here: the while condition guarantees at least 8 + safety_margin remain
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ // check for invalid input (surrogates accumulated by the SIMD path; output has already been written at this point)
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+
+ return std::make_pair(buf, utf16_output);
+}
+
+template <endianness big_endian>
+std::pair<result, char16_t *>
+avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+ // Overview: UTF-32 to UTF-16 with error reporting (code units byte-swapped when big_endian is set). Returns {result(code, input position), next output slot}; SUCCESS carries the number of code points consumed.
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ // Loop while at least 8 + safety_margin code points remain; whatever is left stays unconsumed.
+ while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+ // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+ const __m256i saturation_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ // SIMD path: all 8 code points fit in 16 bits. Otherwise fall back to scalar code below.
+ if (saturation_bitmask == 0xffffffff) {
+ const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+ const __m256i forbidden_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+ 0x0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf16_output);
+ }
+ // NOTE(review): the position reported above is the start of the 8-code-point chunk, not the exact offender; presumably the caller re-scans from there -- confirm.
+ __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+ _mm256_extractf128_si256(in, 1));
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 7; // the scalar path handles 7 code points, not 8
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // cannot be true here: the while condition guarantees at least 8 + safety_margin remain
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+ // SUCCESS here only covers the code points consumed so far; the count may be smaller than len.
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp
new file mode 100644
index 000000000..e1fe5c222
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp
@@ -0,0 +1,569 @@
+std::pair<const char32_t *, char *>
+avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
+ const char32_t *end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+ const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+ const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+ const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+ __m256i running_max = _mm256_setzero_si256();
+ __m256i forbidden_bytemask = _mm256_setzero_si256();
+ // Overview: converts UTF-32 to UTF-8, 16 code points per SIMD pass. Returns {first unread input, next output byte}, or {nullptr, utf8_output} on invalid input. The SIMD paths only accumulate violations (running_max, forbidden_bytemask); those are checked after the loop.
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ // Loop while at least 16 + safety_margin code points remain; whatever is left stays unconsumed.
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+ running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+ // running_max keeps the element-wise unsigned maximum seen so far; it is compared against 0x10FFFF after the loop.
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+ _mm256_and_si256(nextin, v_7fffffff));
+ in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+ // The pack works per 128-bit lane, so the permute above restores source order (in[0..7], then nextin[0..7]).
+ // Try to apply UTF-16 => UTF-8 routine on 256 bits
+ // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+ if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+ // M2 holds one bit per 16-bit word: bits 0-7 for the low 128-bit lane, bits 16-23 for the high lane.
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+ // As used here, each table row is: byte 0 = number of UTF-8 bytes produced, following 16 bytes = shuffle mask.
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes (each store writes 16 bytes; the pointer advances only by the valid count, so the next store overwrites the surplus)
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // Must check for overflow in packing
+ const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffffffff) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+ forbidden_bytemask = _mm256_or_si256(
+ forbidden_bytemask,
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+ // Surrogate hits (0xD800..0xDFFF) are only accumulated above; they are reported once, after the loop.
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 4 x 128-bit shuffle (one per half of out0 / out1)
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+ // Emit in source order (words 0-3, 4-7, 8-11, 12-15); each 16-byte store is partly overwritten by the next one.
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+ // large, non-trivial tables?
+ size_t forward = 15; // the scalar path handles 15 code points, not 16
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // cannot be true here: the while condition guarantees at least 16 + safety_margin remain
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4-byte
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // check for invalid input: max(running_max, 0x10FFFF) must equal 0x10FFFF in every lane, i.e. no loaded value exceeded 0x10FFFF
+ const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+ _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ // Surrogates accumulated by the 1-3 byte SIMD path are rejected here.
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ return std::make_pair(buf, utf8_output);
+}
+
+std::pair<result, char *>
+avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_output) {
+ const char32_t *end = buf + len;
+ const char32_t *start = buf;
+ // Overview: UTF-32 to UTF-8 with error reporting, 16 code points per SIMD pass. Returns {result(code, input position), next output byte}; SUCCESS carries the number of code points consumed.
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+ const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+ const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+ const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+ const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ // Loop while at least 16 + safety_margin code points remain; whatever is left stays unconsumed.
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+ // Check for too large input
+ const __m256i max_input =
+ _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(
+ _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ utf8_output);
+ }
+ // NOTE(review): the position reported above is the start of the 16-code-point chunk, not the exact offender; presumably the caller re-scans from there -- confirm.
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+ _mm256_and_si256(nextin, v_7fffffff));
+ in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+ // The pack works per 128-bit lane, so the permute above restores source order (in[0..7], then nextin[0..7]).
+ // Try to apply UTF-16 => UTF-8 routine on 256 bits
+ // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+ if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+ // M2 holds one bit per 16-bit word: bits 0-7 for the low 128-bit lane, bits 16-23 for the high lane.
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+ // As used here, each table row is: byte 0 = number of UTF-8 bytes produced, following 16 bytes = shuffle mask.
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes (each store writes 16 bytes; the pointer advances only by the valid count, so the next store overwrites the surplus)
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // Must check for overflow in packing
+ const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffffffff) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+ // Check for illegal surrogate code units (the reported position is the start of the chunk, not the exact offender)
+ const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+ const __m256i forbidden_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+ 0x0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf8_output);
+ }
+
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 4 x 128-bit shuffle (one per half of out0 / out1)
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+ // Emit in source order (words 0-3, 4-7, 8-11, 12-15); each 16-byte store is partly overwritten by the next one.
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+ // large, non-trivial tables?
+ size_t forward = 15; // the scalar path handles 15 code points, not 16
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // cannot be true here: the while condition guarantees at least 16 + safety_margin remain
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf8_output);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4-byte
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ // SUCCESS here only covers the code points consumed so far; the count may be smaller than len.
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp
new file mode 100644
index 000000000..8e78ab551
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp
@@ -0,0 +1,60 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from UTF-8 to Latin-1 using a mask whose set bits
+// mark the last byte of each code point. The table lookup uses only the
+// least significant 12 bits of the mask; latin1_output is advanced in place.
+// It returns how many input bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_latin1(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char *&latin1_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ const __m128i in = _mm_loadu_si128((__m128i *)input); // always reads 16 bytes
+
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask &
+ 0xfff; // we are only processing 12 bytes in case it is not all ASCII
+
+ if (utf8_end_of_code_point_mask == 0xfff) { // NOTE(review): tests the full 64-bit mask, not the 12-bit one — confirm
+ // Every one of the 12 bytes ends a code point: copy them through as-is.
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
+ latin1_output += 12; // 16 bytes were stored, but only 12 of them count.
+ return 12; // We consumed 12 bytes.
+ }
+ /// We do not have a fast path available, so we fall back to the tables.
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ // this indicates an invalid input (nothing is written in that case):
+ if (idx >= 64) {
+ return consumed;
+ }
+ // Here we should have (idx < 64); if not, there is a bug in the validation
+ // or elsewhere. This is a relatively easy scenario: we process SIX (6)
+ // input code points. The maximum length in bytes of six code points
+ // spanning between 1 and 2 bytes each is 12 bytes. On processors where
+ // pdep/pext is fast, we might be able to use a small lookup table.
+ // The shuffle puts each code point in a 16-bit lane (lead byte in the top).
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
+ // writing 8 bytes even though we only care about the first 6 bytes.
+ // performance note: it would be faster to use _mm_storeu_si128, we should
+ // investigate.
+ _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
+ latin1_output += 6; // 6 of the 8 stored bytes count.
+ return consumed;
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp
new file mode 100644
index 000000000..d99a8ed9d
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp
@@ -0,0 +1,195 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Most paths use only the least significant 12 bits
+// of the mask; the two-byte fast path inspects the low 16 bits.
+// It returns how many input bytes were consumed (16 on that fast path).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char16_t *&utf16_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ if (utf8_end_of_code_point_mask == 0xfff) { // NOTE(review): tests the full 64-bit mask, not the 12-bit one — confirm
+ // Every one of the 12 bytes ends a code point: widen each byte to 16 bits.
+ __m256i ascii = _mm256_cvtepu8_epi16(in);
+ if (big_endian) {
+ const __m256i swap256 = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ ascii = _mm256_shuffle_epi8(ascii, swap256);
+ }
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
+ utf16_output += 12; // 12 of the 16 stored 16-bit values count.
+ return 12; // We consumed 12 bytes.
+ }
+ if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 code points and turn them into 8 2-byte
+ // UTF-16 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ if (big_endian)
+ composed = _mm_shuffle_epi8(composed, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 8; // We wrote 16 bytes, 8 code points.
+ return 16;
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code points and turn them into 4 2-byte
+ // UTF-16 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ if (big_endian)
+ composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4; // 4 of the 8 stored 16-bit values count.
+ return 12;
+ }
+
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+ // SIX (6) input code points.
+ // This is a relatively easy scenario:
+ // we process SIX (6) input code points. The max length in bytes of six
+ // code points spanning between 1 and 2 bytes each is 12 bytes. On
+ // processors where pdep/pext is fast, we might be able to use a small
+ // lookup table.
+ const __m128i sh = _mm_loadu_si128(
+ (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ if (big_endian)
+ composed = _mm_shuffle_epi8(composed, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential
+ // overflow of 4 bytes.
+ } else if (idx < 145) {
+ // FOUR (4) input code points
+ const __m128i sh = _mm_loadu_si128(
+ (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ if (big_endian)
+ composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4; // Here we overflow by 8 bytes.
+ } else if (idx < 209) {
+ // THREE (3) input code points (the loop at the end emits three)
+ //////////////
+ // There might be garbage inputs where a leading byte masquerades as a
+ // four-byte leading byte (by being followed by 3 continuation bytes), but
+ // is not at least 0xf0. This could trigger a buffer overflow if we only
+ // counted leading bytes of the form 0xf0 as generating surrogate pairs,
+ // without further UTF-8 validation. Thus we must be careful to ensure that
+ // only leading bytes at least as large as 0xf0 generate surrogate pairs. We
+ // do so at the cost of an extra mask.
+ /////////////
+ const __m128i sh = _mm_loadu_si128(
+ (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+ const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+ // correct for spurious high bit
+ const __m128i correct =
+ _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+ middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+ const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+ // We deliberately carry the leading four bits in highbyte if they are
+ // present, we remove them later when computing hightenbits.
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+ // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+ // the corresponding 32-bit value in 'composed' will carry the bits of
+ // (0xf0000000 >> 6) = 0x3c00000. The loop below tests for > 0x3c00000 to
+ // identify the location of the surrogate pairs.
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+ _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+ const __m128i composedminus =
+ _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+ const __m128i lowtenbits =
+ _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+ // Notice the 0x3ff mask: it drops the leading bits carried in highbyte.
+ const __m128i hightenbits =
+ _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+ const __m128i lowtenbitsadd =
+ _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+ const __m128i hightenbitsadd =
+ _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+ const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+ __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+ uint32_t basic_buffer[4];
+ uint32_t basic_buffer_swap[4]; // only filled and read when big_endian
+ if (big_endian) {
+ _mm_storeu_si128((__m128i *)basic_buffer_swap,
+ _mm_shuffle_epi8(composed, swap));
+ surrogates = _mm_shuffle_epi8(surrogates, swap);
+ }
+ _mm_storeu_si128((__m128i *)basic_buffer, composed);
+ uint32_t surrogate_buffer[4];
+ _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+ for (size_t i = 0; i < 3; i++) {
+ if (basic_buffer[i] > 0x3c00000) {
+ utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+ utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+ utf16_output += 2;
+ } else {
+ utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
+ : uint16_t(basic_buffer[i]);
+ utf16_output++;
+ }
+ }
+ } else {
+ // here we know that there is an error but we do not handle errors
+ }
+ return consumed;
+}
diff --git a/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp
new file mode 100644
index 000000000..c5cf74143
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp
@@ -0,0 +1,135 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 16 bytes from utf8 to utf32 using a mask indicating the
+// end of the code points. Most paths use only the least significant 12 bits
+// of the mask; the two-byte fast path inspects the low 16 bits.
+// It returns how many input bytes were consumed (16 on that fast path).
+size_t convert_masked_utf8_to_utf32(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char32_t *&utf32_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ if (utf8_end_of_code_point_mask == 0xfff) { // NOTE(review): tests the full 64-bit mask, not the 12-bit one — confirm
+ // Every one of the 12 bytes ends a code point: widen each byte to 32 bits.
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+ _mm256_cvtepu8_epi32(in));
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8),
+ _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+ utf32_output += 12; // 12 of the 16 stored 32-bit values count.
+ return 12; // We consumed 12 bytes.
+ }
+ if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 code points and turn them into 8 4-byte
+ // UTF-32 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm256_storeu_si256((__m256i *)utf32_output,
+ _mm256_cvtepu16_epi32(composed));
+ utf32_output += 8; // We wrote 32 bytes, 8 code points.
+ return 16;
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code points and turn them into 4 4-byte
+ // UTF-32 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output += 4;
+ return 12;
+ }
+ /// We do not have a fast path available, so we fall back to the tables.
+
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+ // SIX (6) input code points.
+ // This is a relatively easy scenario:
+ // we process SIX (6) input code points. The max length in bytes of six
+ // code points spanning between 1 and 2 bytes each is 12 bytes. On
+ // processors where pdep/pext is fast, we might be able to use a small
+ // lookup table.
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm256_storeu_si256((__m256i *)utf32_output,
+ _mm256_cvtepu16_epi32(composed));
+ utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
+ // overflow of 32 - 24 = 8 bytes.
+ } else if (idx < 145) {
+ // FOUR (4) input code points
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output += 4;
+ } else if (idx < 209) {
+ // THREE (3) input code points (utf32_output advances by 3 below)
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+ const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+ // correct for spurious high bit
+ const __m128i correct =
+ _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+ middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+ const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+ _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output +=
+ 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+ } else {
+ // here we know that there is an error but we do not handle errors
+ }
+ return consumed;
+}
diff --git a/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp b/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp
new file mode 100644
index 000000000..0c54062d4
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_validate_utf16.cpp
@@ -0,0 +1,206 @@
+/*
+ In UTF-16, code units in the range 0xD800 to 0xDFFF have special meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ are 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the next lower nibble (bits 8..11) determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+ (Unicode names them the other way round: 0xd800 .. 0xdbff are "high" surrogates.)
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be a sole low surrogate nor a sole high surrogate
+
+ We're going to build three bitmasks based on that nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+   0   1   2   3   4   5   6   7    <--- word index (8 shown; the code checks 32)
+ [ V | L | H | L | H | V | V | L ]
+   1   0   0   0   0   1   1   0     - V = valid masks
+   0   1   0   1   0   0   0   1     - L = low surrogate
+   0   0   1   0   1   0   0   0     - H = high surrogate
+
+   1   0   0   0   0   1   1   0   V = valid masks
+   0   1   0   1   0   0   0   0   a = L & (H >> 1)
+   0   0   1   0   1   0   0   0   b = a << 1
+   1   1   1   1   1   1   1   0   c = V | a | b
+                               ^
+                               the last bit can be zero, we just consume 7
+                               code units and recheck this word in the next iteration
+*/
+
+/* Returns:
+ - pointer to the first unprocessed code unit (a scalar fallback should check
+ the rest);
+ - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+ if (surrogates_bitmask == 0x0) {
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ // (Unicode swaps these two names; L/H below follow this file's usage.)
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint32_t V = ~surrogates_bitmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint32_t H = vH.to_bitmask();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint32_t L = ~H & surrogates_bitmask;
+
+ const uint32_t a =
+ L & (H >> 1); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last word of the
+ // register, index 31, is an exception we handle.)
+ const uint32_t b =
+ a << 1; // Also mark the second half of each matched pair;
+ // thanks to that we have only two masks for valid case.
+ const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+ if (c == 0xffffffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else if (c == 0x7fffffff) {
+ // The 31 lower code units of the input register contain valid UTF-16.
+ // The last word (index 31) may be either a low or high surrogate. In
+ // the next iteration we 1) check if the low surrogate is followed by a
+ // high one, 2) reject sole high surrogate.
+ input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+}
+
+template <endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t *input,
+ size_t size) {
+ if (simdutf_unlikely(size == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ const char16_t *start = input;
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+ if (surrogates_bitmask == 0x0) {
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ // (Unicode swaps these two names; L/H below follow this file's usage.)
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint32_t V = ~surrogates_bitmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint32_t H = vH.to_bitmask();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint32_t L = ~H & surrogates_bitmask;
+
+ const uint32_t a =
+ L & (H >> 1); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last word of the
+ // register, index 31, is an exception we handle.)
+ const uint32_t b =
+ a << 1; // Also mark the second half of each matched pair;
+ // thanks to that we have only two masks for valid case.
+ const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+ if (c == 0xffffffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else if (c == 0x7fffffff) {
+ // The 31 lower code units of the input register contain valid UTF-16.
+ // The last word (index 31) may be either a low or high surrogate. In
+ // the next iteration we 1) check if the low surrogate is followed by a
+ // high one, 2) reject sole high surrogate.
+ input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+ } else {
+ return result(error_code::SURROGATE, input - start); // offset of this chunk's start, not of the bad unit
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, input - start); // units validated here; the tail is left unchecked
+}
diff --git a/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp b/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp
new file mode 100644
index 000000000..8cb1d5f3b
--- /dev/null
+++ b/contrib/simdutf/src/haswell/avx2_validate_utf32le.cpp
@@ -0,0 +1,70 @@
+/* Returns:
+ - pointer to the first unprocessed code unit (a scalar fallback should check
+ the rest);
+ - nullptr if an error was detected.
+*/
+const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) {
+ const char32_t *end = input + size;
+
+ const __m256i standardmax = _mm256_set1_epi32(0x10ffff); // largest value accepted
+ const __m256i offset = _mm256_set1_epi32(0xffff2000); // shifts 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF
+ const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); // offset values above this were surrogates
+ __m256i currentmax = _mm256_setzero_si256();
+ __m256i currentoffsetmax = _mm256_setzero_si256();
+
+ while (input + 8 < end) { // only running maxima are tracked; the checks happen after the loop
+ const __m256i in = _mm256_loadu_si256((__m256i *)input);
+ currentmax = _mm256_max_epu32(in, currentmax);
+ currentoffsetmax =
+ _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+ input += 8;
+ }
+ __m256i is_zero =
+ _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) { // some lane exceeded 0x10ffff
+ return nullptr;
+ }
+
+ is_zero = _mm256_xor_si256(
+ _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) { // some lane held a surrogate
+ return nullptr;
+ }
+
+ return input;
+}
+
+const result avx2_validate_utf32le_with_errors(const char32_t *input,
+ size_t size) { // validates 8 words per iteration and reports the first failing chunk
+ const char32_t *start = input;
+ const char32_t *end = input + size;
+
+ const __m256i standardmax = _mm256_set1_epi32(0x10ffff); // largest value accepted
+ const __m256i offset = _mm256_set1_epi32(0xffff2000); // shifts 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF
+ const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); // offset values above this were surrogates
+ __m256i currentmax = _mm256_setzero_si256();
+ __m256i currentoffsetmax = _mm256_setzero_si256();
+
+ while (input + 8 < end) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)input);
+ currentmax = _mm256_max_epu32(in, currentmax);
+ currentoffsetmax =
+ _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+
+ __m256i is_zero = _mm256_xor_si256(
+ _mm256_max_epu32(currentmax, standardmax), standardmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) { // some lane exceeded 0x10ffff
+ return result(error_code::TOO_LARGE, input - start); // offset of this 8-word chunk's start
+ }
+
+ is_zero =
+ _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax),
+ standardoffsetmax);
+ if (_mm256_testz_si256(is_zero, is_zero) == 0) { // some lane held a surrogate
+ return result(error_code::SURROGATE, input - start); // offset of this 8-word chunk's start
+ }
+ input += 8;
+ }
+
+ return result(error_code::SUCCESS, input - start); // words validated here; the tail is left unchecked
+}
diff --git a/contrib/simdutf/src/haswell/implementation.cpp b/contrib/simdutf/src/haswell/implementation.cpp
new file mode 100644
index 000000000..0225f1f95
--- /dev/null
+++ b/contrib/simdutf/src/haswell/implementation.cpp
@@ -0,0 +1,1145 @@
+#include "tables/utf8_to_utf16_tables.h"
+#include "scalar/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+#include "scalar/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "scalar/utf8_to_utf32/utf8_to_utf32.h"
+#include "tables/utf16_to_utf8_tables.h"
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+#include "simdutf/haswell/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_HASWELL_H
+ #error "haswell.h must be included"
+#endif
+using namespace simd;
+
+ // Reports whether the OR-reduction of the whole 64-byte block passes
+ // is_ascii().
+ simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+   auto combined = input.reduce_or();
+   return combined.is_ascii();
+ }
+
+ // Builds a mask of the lanes where one of the three preceding bytes is a
+ // lead byte long enough to require a continuation byte at this position.
+ //   prev1: input shifted by one byte   (lead of a 2+ byte sequence)
+ //   prev2: input shifted by two bytes  (lead of a 3+ byte sequence)
+ //   prev3: input shifted by three bytes (lead of a 4 byte sequence)
+ simdutf_unused simdutf_really_inline simd8<bool>
+ must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                      const simd8<uint8_t> prev3) {
+   // Saturating subtraction leaves a non-zero lane only when the byte is at
+   // least the threshold: 11______, 111_____ and 1111____ respectively.
+   simd8<uint8_t> lead_of_2 = prev1.saturating_sub(0b11000000u - 1);
+   simd8<uint8_t> lead_of_3 = prev2.saturating_sub(0b11100000u - 1);
+   simd8<uint8_t> lead_of_4 = prev3.saturating_sub(0b11110000u - 1);
+   simd8<uint8_t> any_lead = lead_of_2 | lead_of_3 | lead_of_4;
+   // The caller needs an all-ones bool per lane. Every lane is at most 64
+   // (0xff - 0xbf), so a signed "> 0" comparison is safe.
+   return simd8<int8_t>(any_lead) > int8_t(0);
+ }
+
+ // Variant of the continuation check that only looks two and three bytes
+ // back. The saturating subtractions leave a lane at 0x80 or above only when
+ // the source byte is >= 0xe0 (prev2) or >= 0xf0 (prev3); the OR of both is
+ // converted to simd8<bool>.
+ simdutf_really_inline simd8<bool>
+ must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                          const simd8<uint8_t> prev3) {
+   simd8<uint8_t> lead_of_3 = prev2.saturating_sub(0xe0u - 0x80);
+   simd8<uint8_t> lead_of_4 = prev3.saturating_sub(0xf0u - 0x80);
+   simd8<uint8_t> either = lead_of_3 | lead_of_4;
+   return simd8<bool>(either);
+ }
+
+#include "haswell/avx2_validate_utf16.cpp"
+#include "haswell/avx2_validate_utf32le.cpp"
+
+#include "haswell/avx2_convert_latin1_to_utf8.cpp"
+#include "haswell/avx2_convert_latin1_to_utf16.cpp"
+#include "haswell/avx2_convert_latin1_to_utf32.cpp"
+
+#include "haswell/avx2_convert_utf8_to_utf16.cpp"
+#include "haswell/avx2_convert_utf8_to_utf32.cpp"
+
+#include "haswell/avx2_convert_utf16_to_latin1.cpp"
+#include "haswell/avx2_convert_utf16_to_utf8.cpp"
+#include "haswell/avx2_convert_utf16_to_utf32.cpp"
+
+#include "haswell/avx2_convert_utf32_to_latin1.cpp"
+#include "haswell/avx2_convert_utf32_to_utf8.cpp"
+#include "haswell/avx2_convert_utf32_to_utf16.cpp"
+
+#include "haswell/avx2_convert_utf8_to_latin1.cpp"
+
+#include "haswell/avx2_base64.cpp"
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+// other functions
+#include "generic/utf8.h"
+#include "generic/utf16.h"
+
+// transcoding from UTF-8 to Latin 1
+#include "generic/utf8_to_latin1/utf8_to_latin1.h"
+#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h"
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+ // Returns a bit set of encoding_type flags the input could be.
+ //
+ // A byte-order mark, when present, decides the answer on its own. Otherwise
+ // the input is validated as UTF-8, as UTF-16LE (only if the length is a
+ // multiple of 2) and as UTF-32 (only if the length is a multiple of 4), and
+ // the flag of every encoding that validates is OR-ed into the result.
+ simdutf_warn_unused int
+ implementation::detect_encodings(const char *input,
+                                  size_t length) const noexcept {
+   auto bom_encoding = simdutf::BOM::check_bom(input, length);
+   if (bom_encoding != encoding_type::unspecified) {
+     return bom_encoding;
+   }
+
+   int detected = 0;
+   if (validate_utf8(input, length)) {
+     detected |= encoding_type::UTF8;
+   }
+   if ((length % 2) == 0 &&
+       validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                        length / 2)) {
+     detected |= encoding_type::UTF16_LE;
+   }
+   if ((length % 4) == 0 &&
+       validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+     detected |= encoding_type::UTF32_LE;
+   }
+   return detected;
+ }
+
+ // UTF-8 validation; forwards to the generic validator.
+ simdutf_warn_unused bool
+ implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+   const bool is_valid =
+       haswell::utf8_validation::generic_validate_utf8(buf, len);
+   return is_valid;
+ }
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+}
+
+ // ASCII validation; forwards to the generic validator.
+ simdutf_warn_unused bool
+ implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+   const bool is_valid =
+       haswell::utf8_validation::generic_validate_ascii(buf, len);
+   return is_valid;
+ }
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+}
+
+ // Validates little-endian UTF-16: the AVX2 kernel handles the bulk and
+ // returns where it stopped (or nullptr on failure); the scalar validator
+ // checks the remainder.
+ simdutf_warn_unused bool
+ implementation::validate_utf16le(const char16_t *buf,
+                                  size_t len) const noexcept {
+   // Empty input is valid UTF-16; returning here also keeps a possible
+   // nullptr away from the kernel.
+   if (simdutf_unlikely(len == 0)) {
+     return true;
+   }
+   const char16_t *tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+   if (tail == nullptr) {
+     return false;
+   }
+   const size_t remaining = len - (tail - buf);
+   return scalar::utf16::validate<endianness::LITTLE>(tail, remaining);
+ }
+
+ // Validates big-endian UTF-16: the AVX2 kernel handles the bulk and returns
+ // where it stopped (or nullptr on failure); the scalar validator checks the
+ // remainder.
+ simdutf_warn_unused bool
+ implementation::validate_utf16be(const char16_t *buf,
+                                  size_t len) const noexcept {
+   // Empty input is valid UTF-16; returning here also keeps a possible
+   // nullptr away from the kernel.
+   if (simdutf_unlikely(len == 0)) {
+     return true;
+   }
+   const char16_t *tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+   if (tail == nullptr) {
+     return false;
+   }
+   const size_t remaining = len - (tail - buf);
+   return scalar::utf16::validate<endianness::BIG>(tail, remaining);
+ }
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+ // Validates UTF-32: the AVX2 kernel handles the bulk and returns where it
+ // stopped (or nullptr on failure); the scalar validator checks the rest.
+ simdutf_warn_unused bool
+ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+   // Empty input is valid UTF-32; returning here also keeps a possible
+   // nullptr away from the kernel.
+   if (simdutf_unlikely(len == 0)) {
+     return true;
+   }
+   const char32_t *tail = avx2_validate_utf32le(buf, len);
+   if (tail == nullptr) {
+     return false;
+   }
+   const size_t remaining = len - (tail - buf);
+   return scalar::utf32::validate(tail, remaining);
+ }
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-32. protect the implementation from
+ // handling nullptr
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = avx2_validate_utf32le_with_errors(buf, len);
+ if (res.count != len) {
+ result scalar_res =
+ scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+ // Converts Latin-1 to UTF-8 and returns the number of bytes written.
+ // The AVX2 kernel converts the bulk; any unconsumed input is finished by the
+ // scalar converter.
+ simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+     const char *buf, size_t len, char *utf8_output) const noexcept {
+   std::pair<const char *, char *> simd_end =
+       avx2_convert_latin1_to_utf8(buf, len, utf8_output);
+   const char *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+
+   size_t written = out_pos - utf8_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   written +=
+       scalar::latin1_to_utf8::convert(in_pos, len - (in_pos - buf), out_pos);
+   return written;
+ }
+
+ // Converts Latin-1 to little-endian UTF-16 and returns the number of
+ // char16_t units written. Returns 0 when the kernel hands back a null input
+ // pointer or when the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+   std::pair<const char *, char16_t *> simd_end =
+       avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+   const char *in_pos = simd_end.first;
+   char16_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - utf16_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written =
+       scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+ // Converts Latin-1 to big-endian UTF-16 and returns the number of char16_t
+ // units written. Returns 0 when the kernel hands back a null input pointer
+ // or when the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+   std::pair<const char *, char16_t *> simd_end =
+       avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+   const char *in_pos = simd_end.first;
+   char16_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - utf16_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written =
+       scalar::latin1_to_utf16::convert<endianness::BIG>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+ // Converts Latin-1 to UTF-32 and returns the number of char32_t units
+ // written. Returns 0 when the kernel hands back a null input pointer or when
+ // the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+     const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+   std::pair<const char *, char32_t *> simd_end =
+       avx2_convert_latin1_to_utf32(buf, len, utf32_output);
+   const char *in_pos = simd_end.first;
+   char32_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - utf32_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written = scalar::latin1_to_utf32::convert(
+       in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+ // UTF-8 to Latin-1 through the validating transcoder.
+ simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+     const char *buf, size_t len, char *latin1_output) const noexcept {
+   utf8_to_latin1::validating_transcoder transcoder;
+   const size_t written = transcoder.convert(buf, len, latin1_output);
+   return written;
+ }
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ utf8_to_latin1::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, latin1_output);
+}
+
+ // UTF-8 to Latin-1 via the non-validating convert_valid() routine.
+ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+     const char *input, size_t size, char *latin1_output) const noexcept {
+   const size_t written =
+       utf8_to_latin1::convert_valid(input, size, latin1_output);
+   return written;
+ }
+
+ // UTF-8 to little-endian UTF-16 through the validating transcoder.
+ simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+   utf8_to_utf16::validating_transcoder transcoder;
+   const size_t written =
+       transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
+   return written;
+ }
+
+ // UTF-8 to big-endian UTF-16 through the validating transcoder.
+ simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+   utf8_to_utf16::validating_transcoder transcoder;
+   const size_t written =
+       transcoder.convert<endianness::BIG>(buf, len, utf16_output);
+   return written;
+ }
+
+ // UTF-8 to little-endian UTF-16 through the validating transcoder, with
+ // error reporting.
+ simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+   utf8_to_utf16::validating_transcoder transcoder;
+   result outcome = transcoder.convert_with_errors<endianness::LITTLE>(
+       buf, len, utf16_output);
+   return outcome;
+ }
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+
+ // UTF-8 to little-endian UTF-16 via the non-validating convert_valid().
+ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+     const char *input, size_t size, char16_t *utf16_output) const noexcept {
+   const size_t written = utf8_to_utf16::convert_valid<endianness::LITTLE>(
+       input, size, utf16_output);
+   return written;
+ }
+
+ // UTF-8 to big-endian UTF-16 via the non-validating convert_valid().
+ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+     const char *input, size_t size, char16_t *utf16_output) const noexcept {
+   const size_t written =
+       utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+   return written;
+ }
+
+ // UTF-8 to UTF-32 through the validating transcoder.
+ simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+     const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+   utf8_to_utf32::validating_transcoder transcoder;
+   const size_t written = transcoder.convert(buf, len, utf32_output);
+   return written;
+ }
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ utf8_to_utf32::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, utf32_output);
+}
+
+ // UTF-8 to UTF-32 via the non-validating convert_valid() routine.
+ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+     const char *input, size_t size, char32_t *utf32_output) const noexcept {
+   const size_t written =
+       utf8_to_utf32::convert_valid(input, size, utf32_output);
+   return written;
+ }
+
+ // Converts little-endian UTF-16 to Latin-1 and returns the number of bytes
+ // written. Returns 0 when the kernel hands back a null input pointer or when
+ // the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+   std::pair<const char16_t *, char *> simd_end =
+       haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(
+           buf, len, latin1_output);
+   const char16_t *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - latin1_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written =
+       scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+ // Converts big-endian UTF-16 to Latin-1 and returns the number of bytes
+ // written. Returns 0 when the kernel hands back a null input pointer or when
+ // the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+   std::pair<const char16_t *, char *> simd_end =
+       haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len,
+                                                              latin1_output);
+   const char16_t *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - latin1_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written =
+       scalar::utf16_to_latin1::convert<endianness::BIG>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+ latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+ // "Valid input" variant; currently just reuses the validating big-endian
+ // UTF-16 to Latin-1 conversion (a dedicated routine would be an
+ // optimization opportunity).
+ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+   const size_t written = convert_utf16be_to_latin1(buf, len, latin1_output);
+   return written;
+ }
+
+ // "Valid input" variant; currently just reuses the validating little-endian
+ // UTF-16 to Latin-1 conversion (a dedicated routine would be an
+ // optimization opportunity).
+ simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+   const size_t written = convert_utf16le_to_latin1(buf, len, latin1_output);
+   return written;
+ }
+
+ // Converts little-endian UTF-16 to UTF-8 and returns the number of bytes
+ // written. Returns 0 when the kernel hands back a null input pointer or when
+ // the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+   std::pair<const char16_t *, char *> simd_end =
+       haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len,
+                                                               utf8_output);
+   const char16_t *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - utf8_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written =
+       scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+ // Converts big-endian UTF-16 to UTF-8 and returns the number of bytes
+ // written. Returns 0 when the kernel hands back a null input pointer or when
+ // the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+   std::pair<const char16_t *, char *> simd_end =
+       haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len,
+                                                            utf8_output);
+   const char16_t *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - utf8_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(
+       in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
+ buf, len, utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(
+ buf, len, utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+ // "Valid input" variant; reuses the validating little-endian UTF-16 to
+ // UTF-8 conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+   const size_t written = convert_utf16le_to_utf8(buf, len, utf8_output);
+   return written;
+ }
+
+ // "Valid input" variant; reuses the validating big-endian UTF-16 to UTF-8
+ // conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+   const size_t written = convert_utf16be_to_utf8(buf, len, utf8_output);
+   return written;
+ }
+
+ // Converts UTF-32 to UTF-8 and returns the number of bytes written.
+ // Returns 0 when the kernel hands back a null input pointer or when the
+ // scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+     const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+   std::pair<const char32_t *, char *> simd_end =
+       avx2_convert_utf32_to_utf8(buf, len, utf8_output);
+   const char32_t *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - utf8_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written = scalar::utf32_to_utf8::convert(
+       in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+ // Converts UTF-32 to Latin-1 and returns the number of bytes written.
+ // Returns 0 when the kernel hands back a null input pointer or when the
+ // scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+     const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+   std::pair<const char32_t *, char *> simd_end =
+       avx2_convert_utf32_to_latin1(buf, len, latin1_output);
+   const char32_t *in_pos = simd_end.first;
+   char *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t written = out_pos - latin1_output;
+   if (in_pos == buf + len) {
+     return written;
+   }
+   const size_t tail_written = scalar::utf32_to_latin1::convert(
+       in_pos, len - (in_pos - buf), out_pos);
+   if (tail_written == 0) {
+     return 0;
+   }
+   return written + tail_written;
+ }
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+ // "Valid input" variant; reuses the validating UTF-32 to Latin-1 conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+     const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+   const size_t written = convert_utf32_to_latin1(buf, len, latin1_output);
+   return written;
+ }
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+ // Converts little-endian UTF-16 to UTF-32 and returns the number of
+ // char32_t units written. Returns 0 when the kernel hands back a null input
+ // pointer or when the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+   std::pair<const char16_t *, char32_t *> simd_end =
+       haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                                utf32_output);
+   const char16_t *in_pos = simd_end.first;
+   char32_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t units_written = out_pos - utf32_output;
+   if (in_pos == buf + len) {
+     return units_written;
+   }
+   const size_t tail_units =
+       scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_units == 0) {
+     return 0;
+   }
+   return units_written + tail_units;
+ }
+
+ // Converts big-endian UTF-16 to UTF-32 and returns the number of char32_t
+ // units written. Returns 0 when the kernel hands back a null input pointer
+ // or when the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+   std::pair<const char16_t *, char32_t *> simd_end =
+       haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len,
+                                                             utf32_output);
+   const char16_t *in_pos = simd_end.first;
+   char32_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t units_written = out_pos - utf32_output;
+   if (in_pos == buf + len) {
+     return units_written;
+   }
+   const size_t tail_units = scalar::utf16_to_utf32::convert<endianness::BIG>(
+       in_pos, len - (in_pos - buf), out_pos);
+   if (tail_units == 0) {
+     return 0;
+   }
+   return units_written + tail_units;
+ }
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
+ buf, len, utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(
+ buf, len, utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+ // "Valid input" variant; reuses the validating UTF-32 to UTF-8 conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+     const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+   const size_t written = convert_utf32_to_utf8(buf, len, utf8_output);
+   return written;
+ }
+
+ // Converts UTF-32 to little-endian UTF-16 and returns the number of
+ // char16_t units written. Returns 0 when the kernel hands back a null input
+ // pointer or when the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+   std::pair<const char32_t *, char16_t *> simd_end =
+       avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+   const char32_t *in_pos = simd_end.first;
+   char16_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t units_written = out_pos - utf16_output;
+   if (in_pos == buf + len) {
+     return units_written;
+   }
+   const size_t tail_units =
+       scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+           in_pos, len - (in_pos - buf), out_pos);
+   if (tail_units == 0) {
+     return 0;
+   }
+   return units_written + tail_units;
+ }
+
+ // Converts UTF-32 to big-endian UTF-16 and returns the number of char16_t
+ // units written. Returns 0 when the kernel hands back a null input pointer
+ // or when the scalar tail conversion returns 0.
+ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+   std::pair<const char32_t *, char16_t *> simd_end =
+       avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+   const char32_t *in_pos = simd_end.first;
+   char16_t *out_pos = simd_end.second;
+   if (in_pos == nullptr) {
+     return 0;
+   }
+   const size_t units_written = out_pos - utf16_output;
+   if (in_pos == buf + len) {
+     return units_written;
+   }
+   const size_t tail_units = scalar::utf32_to_utf16::convert<endianness::BIG>(
+       in_pos, len - (in_pos - buf), out_pos);
+   if (tail_units == 0) {
+     return 0;
+   }
+   return units_written + tail_units;
+ }
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+ // "Valid input" variant; reuses the validating UTF-32 to little-endian
+ // UTF-16 conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+   const size_t written = convert_utf32_to_utf16le(buf, len, utf16_output);
+   return written;
+ }
+
+ // "Valid input" variant; reuses the validating UTF-32 to big-endian UTF-16
+ // conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+   const size_t written = convert_utf32_to_utf16be(buf, len, utf16_output);
+   return written;
+ }
+
+ // "Valid input" variant; reuses the validating little-endian UTF-16 to
+ // UTF-32 conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+   const size_t written = convert_utf16le_to_utf32(buf, len, utf32_output);
+   return written;
+ }
+
+ // "Valid input" variant; reuses the validating big-endian UTF-16 to UTF-32
+ // conversion.
+ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+   const size_t written = convert_utf16be_to_utf32(buf, len, utf32_output);
+   return written;
+ }
+
+ // Forwards to utf16::change_endianness_utf16, passing `length` code units
+ // from `input` and `output` as the destination.
+ void implementation::change_endianness_utf16(const char16_t *input,
+                                              size_t length,
+                                              char16_t *output) const noexcept {
+   utf16::change_endianness_utf16(input, length, output);
+ }
+
+ // Code point count of little-endian UTF-16 input.
+ simdutf_warn_unused size_t implementation::count_utf16le(
+     const char16_t *input, size_t length) const noexcept {
+   const size_t code_points =
+       utf16::count_code_points<endianness::LITTLE>(input, length);
+   return code_points;
+ }
+
+ // Code point count of big-endian UTF-16 input.
+ simdutf_warn_unused size_t implementation::count_utf16be(
+     const char16_t *input, size_t length) const noexcept {
+   const size_t code_points =
+       utf16::count_code_points<endianness::BIG>(input, length);
+   return code_points;
+ }
+
+ // Code point count of UTF-8 input.
+ simdutf_warn_unused size_t
+ implementation::count_utf8(const char *input, size_t length) const noexcept {
+   const size_t code_points = utf8::count_code_points(input, length);
+   return code_points;
+ }
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+ const char *buf, size_t len) const noexcept {
+ return count_utf8(buf, len); // the reported Latin1 length is exactly the code point count of the UTF-8 input
+}
+
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+ return scalar::utf16::latin1_length_from_utf16(length); // computed from the length alone by the scalar helper; no input data is read
+}
+
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+ return scalar::utf32::latin1_length_from_utf32(length); // computed from the length alone by the scalar helper; no input data is read
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length); // little-endian instantiation of the shared utf16:: routine
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::BIG>(input, length); // big-endian instantiation of the shared utf16:: routine
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length); // little-endian instantiation of the shared utf16:: routine
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::BIG>(input, length); // big-endian instantiation of the shared utf16:: routine
+}
+
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+ return scalar::latin1::utf16_length_from_latin1(length); // computed from the length alone by the scalar helper; no input data is read
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::utf16_length_from_utf8(input, length); // delegates to the utf8:: routine of the same name
+}
+
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+ return scalar::latin1::utf32_length_from_latin1(length); // computed from the length alone by the scalar helper; no input data is read
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+ const char *input, size_t len) const noexcept {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+ size_t answer = len / sizeof(__m256i) * sizeof(__m256i); // bytes covered by whole 32-byte vectors; the SIMD paths below add one more per byte with the top bit set
+ size_t i = 0;
+ if (answer >= 2048) { // long strings optimization
+ __m256i four_64bits = _mm256_setzero_si256(); // four 64-bit partial sums, folded into answer after the loop
+ while (i + sizeof(__m256i) <= len) {
+ __m256i runner = _mm256_setzero_si256(); // 8-bit counters, one per byte position, for the current batch
+ // We can do up to 255 loops without overflow.
+ size_t iterations = (len - i) / sizeof(__m256i);
+ if (iterations > 255) {
+ iterations = 255;
+ }
+ size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); // offset of the last vector belonging to this batch
+ for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) { // unrolled: four vectors per pass
+ __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
+ __m256i input2 =
+ _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
+ __m256i input3 = _mm256_loadu_si256(
+ (const __m256i *)(data + i + 2 * sizeof(__m256i)));
+ __m256i input4 = _mm256_loadu_si256(
+ (const __m256i *)(data + i + 3 * sizeof(__m256i)));
+ __m256i input12 =
+ _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), // signed 0 > byte yields 0xFF (-1) where the byte's top bit is set
+ _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
+ __m256i input23 =
+ _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), // despite the name, this combines input3 and input4
+ _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
+ __m256i input1234 = _mm256_add_epi8(input12, input23);
+ runner = _mm256_sub_epi8(runner, input1234); // subtracting the negative sums increments the counters
+ }
+ for (; i <= max_i; i += sizeof(__m256i)) { // remaining vectors of the batch, one at a time
+ __m256i input_256_chunk =
+ _mm256_loadu_si256((const __m256i *)(data + i));
+ runner = _mm256_sub_epi8(
+ runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+ }
+ four_64bits = _mm256_add_epi64(
+ four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); // sad against zero sums the 8-bit counters into the 64-bit lanes
+ }
+ answer += _mm256_extract_epi64(four_64bits, 0) +
+ _mm256_extract_epi64(four_64bits, 1) +
+ _mm256_extract_epi64(four_64bits, 2) +
+ _mm256_extract_epi64(four_64bits, 3);
+ } else if (answer > 0) {
+ for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
+ __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
+ uint32_t non_ascii = _mm256_movemask_epi8(latin); // one bit per byte, set when that byte's top bit is set
+ answer += count_ones(non_ascii);
+ }
+ }
+ return answer + scalar::latin1::utf8_length_from_latin1( // bytes past the last whole vector go through the scalar helper
+ reinterpret_cast<const char *>(data + i), len - i);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); // bits that are all clear only for values below 0x80
+ const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); // bits that are all clear only for values below 0x800
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); // bits that are all clear only for values below 0x10000
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 8 <= length; pos += 8) { // eight 32-bit values per iteration
+ __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
+ const __m256i ascii_bytes_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
+ const __m256i one_two_bytes_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
+ const __m256i two_bytes_bytemask =
+ _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); // below 0x800 but not below 0x80
+ const __m256i one_two_three_bytes_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const __m256i three_bytes_bytemask =
+ _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); // below 0x10000 but not below 0x800
+ const uint32_t ascii_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
+ const uint32_t two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
+ const uint32_t three_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+
+ size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; // movemask yields four bits per 32-bit lane, hence the division
+ size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+ size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+ count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; // start at 4 bytes for each of the 8 values, then subtract what the shorter forms save
+ }
+ return count +
+ scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); // fewer than 8 values remain for the scalar helper
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const __m256i high_half = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i zero = _mm256_setzero_si256();
+ size_t units = 0; // UTF-16 code units counted so far
+ size_t idx = 0;
+ while (idx + 8 <= length) { // eight 32-bit values per 256-bit load
+ const __m256i block = _mm256_loadu_si256((__m256i *)(input + idx));
+ const __m256i bmp_lanes = // a lane is all-ones when its upper 16 bits are clear
+ _mm256_cmpeq_epi32(_mm256_and_si256(block, high_half), zero);
+ const uint32_t bmp_bits = static_cast<uint32_t>(_mm256_movemask_epi8(bmp_lanes));
+ const size_t extra = (32 - count_ones(bmp_bits)) / 4; // four mask bits per lane; each unset lane needs a second unit
+ units += 8 + extra;
+ idx += 8;
+ }
+ return units +
+ scalar::utf32::utf16_length_from_utf32(input + idx, length - idx);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length); // the UTF-32 length is reported as the number of code points in the UTF-8 input
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length); // 8-bit input overload; the scalar helper does the computation
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url) // the runtime base64_url flag selects the template instantiation
+ ? compress_decode_base64<true>(output, input, length, options, // note: the helper takes output before input
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url) // the runtime base64_url flag selects the template instantiation
+ ? compress_decode_base64<true>(output, input, length, options, // note: the helper takes output before input
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length); // 16-bit input overload; the scalar helper does the computation
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url) // the runtime base64_url flag selects the template instantiation
+ ? compress_decode_base64<true>(output, input, length, options, // note: the helper takes output before input
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url) // the runtime base64_url flag selects the template instantiation
+ ? compress_decode_base64<true>(output, input, length, options, // note: the helper takes output before input
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options); // computed from length and options by the scalar helper
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+ char *output,
+ base64_options options) const noexcept {
+ // The alphabet is a template argument, so the runtime flag picks the instantiation.
+ const bool url_alphabet = (options & base64_url) != 0;
+ return url_alphabet
+ ? encode_base64<true>(output, input, length, options)
+ : encode_base64<false>(output, input, length, options);
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/haswell/end.h"
diff --git a/contrib/simdutf/src/icelake/icelake_ascii_validation.inl.cpp b/contrib/simdutf/src/icelake/icelake_ascii_validation.inl.cpp
new file mode 100644
index 000000000..3c28276f3
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_ascii_validation.inl.cpp
@@ -0,0 +1,19 @@
+// file included directly
+
+bool validate_ascii(const char *buf, size_t len) {
+ const __m512i high_bit = _mm512_set1_epi8((uint8_t)0x80);
+ __m512i seen = _mm512_setzero_si512(); // collects the top bit of every byte read
+ size_t pos = 0;
+ while (len - pos >= 64) { // whole 64-byte vectors
+ const __m512i chunk = _mm512_loadu_si512((const __m512i *)(buf + pos));
+ // 0xf8 encodes seen | (chunk & high_bit)
+ seen = _mm512_ternarylogic_epi32(seen, chunk, high_bit, 0xf8);
+ pos += 64;
+ }
+ if (pos < len) { // 1..63 trailing bytes: masked load, lanes outside the mask read as zero
+ const __mmask64 keep = (uint64_t(1) << (len - pos)) - 1;
+ const __m512i chunk = _mm512_maskz_loadu_epi8(keep, (const __m512i *)(buf + pos));
+ seen = _mm512_ternarylogic_epi32(seen, chunk, high_bit, 0xf8);
+ }
+ return _mm512_test_epi8_mask(seen, seen) == 0;
+}
diff --git a/contrib/simdutf/src/icelake/icelake_base64.inl.cpp b/contrib/simdutf/src/icelake/icelake_base64.inl.cpp
new file mode 100644
index 000000000..fe4844264
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_base64.inl.cpp
@@ -0,0 +1,358 @@
+// file included directly
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+struct block64 { // unit of work for the decoder below: one 512-bit register, i.e. 64 bytes
+ __m512i chunks[1];
+};
+
+template <bool base64_url>
+size_t encode_base64(char *dst, const char *src, size_t srclen, // encodes srclen bytes from src into dst; returns the number of characters written
+ base64_options options) {
+ // credit: Wojciech Muła
+ const uint8_t *input = (const uint8_t *)src;
+
+ uint8_t *out = (uint8_t *)dst;
+ static const char *lookup_tbl = // 64-character alphabet; the URL variant ends in "-_" instead of "+/"
+ base64_url
+ ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+ : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+ const __m512i shuffle_input = _mm512_setr_epi32( // byte indices: each 32-bit lane gathers bytes 1,0,2,1 of one 3-byte input group
+ 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
+ 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
+ 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
+ const __m512i lookup =
+ _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
+ const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a)); // per-byte bit offsets consumed by the multishift below
+ size_t size = srclen;
+ __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
+ while (size >= 48) { // main loop: 48 input bytes become 64 output characters
+ const __m512i v = _mm512_maskz_loadu_epi8(
+ input_mask, reinterpret_cast<const __m512i *>(input));
+ const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
+ const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
+ const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
+ _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
+ out += 64;
+ input += 48;
+ size -= 48;
+ }
+ input_mask = ((__mmask64)1 << size) - 1; // tail: size is now below 48 (possibly 0), so only those bytes are loaded
+ const __m512i v = _mm512_maskz_loadu_epi8(
+ input_mask, reinterpret_cast<const __m512i *>(input));
+ const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
+ const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
+ bool padding_needed = // true when exactly one holds: base64_url bit clear, or base64_reverse_padding set
+ (((options & base64_url) == 0) ^
+ ((options & base64_reverse_padding) == base64_reverse_padding));
+ size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
+ size_t output_len = ((size + 2) / 3) * 4;
+ size_t non_padded_output_len = output_len - padding_amount;
+ if (!padding_needed) {
+ output_len = non_padded_output_len;
+ }
+ __mmask64 output_mask = output_len == 64 ? (__mmask64)UINT64_MAX // a 64-bit shift by 64 must be avoided, hence the special case
+ : ((__mmask64)1 << output_len) - 1;
+ __m512i result = _mm512_mask_permutexvar_epi8( // positions at or beyond non_padded_output_len keep the '=' fill
+ _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
+ indices, lookup);
+ _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
+ result);
+ return (size_t)(out - (uint8_t *)dst) + output_len; // characters from the main loop plus the tail
+}
+
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { // translates the block in place; returns a bit per byte that is not a base64 value
+ __m512i input = b->chunks[0];
+ const __m512i ascii_space_tbl = _mm512_set_epi8( // per 16-byte lane, indexed by low nibble: 32 at 0, 9 at 9, 10 at 10, 12 at 12, 13 at 13
+ 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
+ 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
+ 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
+ __m512i lookup0; // first of the two tables handed to the two-table permute below
+ if (base64_url) {
+ lookup0 = _mm512_set_epi8(
+ -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+ 52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
+ -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
+ -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
+ -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
+ } else {
+ lookup0 = _mm512_set_epi8(
+ -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+ 52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
+ -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
+ -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
+ -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
+ }
+ __m512i lookup1; // second table of the two-table permute
+ if (base64_url) {
+ lookup1 = _mm512_set_epi8(
+ -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+ 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+ 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+ } else {
+ lookup1 = _mm512_set_epi8(
+ -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+ 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+ -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+ }
+
+ const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); // each input byte indexes the two tables
+ const __m512i combined = _mm512_or_si512(translated, input); // top bit set if either the table entry or the input byte has it set
+ const __mmask64 mask = _mm512_movepi8_mask(combined);
+ if (mask) {
+ const __mmask64 spaces = _mm512_cmpeq_epi8_mask( // bytes equal to their ascii_space_tbl entry
+ _mm512_shuffle_epi8(ascii_space_tbl, input), input);
+ *error = (mask ^ spaces); // *error is only written when mask is non-zero
+ }
+ b->chunks[0] = translated;
+
+ return mask;
+}
+
+static inline void copy_block(block64 *b, char *output) { // stores all 64 bytes of the block at output (unaligned store)
+ _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]);
+}
+
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { // drops the bytes flagged in mask, packs the rest at output
+ uint64_t nmask = ~mask; // bytes to keep
+ __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); // kept bytes move to the front, the remainder is zeroed
+ _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); // always a full 64-byte store, regardless of how many bytes were kept
+ return _mm_popcnt_u64(nmask); // number of bytes kept
+}
+
+// The caller of this function is responsible for ensuring that 64 bytes are
+// available for reading at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char *src) {
+ b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)); // unaligned 64-byte load
+}
+
+// The caller of this function is responsible for ensuring that 128 bytes are
+// available for reading at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char16_t *src) {
+ __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)); // first 32 code units
+ __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32)); // next 32 code units
+ __m512i p = _mm512_packus_epi16(m1, m2); // narrows each 16-bit unit to one byte
+ b->chunks[0] =
+ _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); // reorders the 64-bit pieces of the packed vector
+}
+
+static inline void base64_decode(char *out, __m512i str) { // str: 64 translated bytes (as left by to_base64_mask); writes 48 bytes at out
+ const __m512i merge_ab_and_bc =
+ _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); // per byte pair: first * 0x40 + second, in a 16-bit lane
+ const __m512i merged =
+ _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); // per 16-bit pair: first * 0x1000 + second, in a 32-bit lane
+ const __m512i pack = _mm512_set_epi8( // byte selector: takes bytes 2,1,0 of each 32-bit lane; the top 16 entries are unused filler
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
+ 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
+ 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
+ 5, 6, 0, 1, 2);
+ const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
+ _mm512_mask_storeu_epi8(
+ (__m512i *)out, 0xffffffffffff,
+ shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
+}
+// decode 64 bytes read from src and output 48 bytes at out
+static inline void base64_decode_block(char *out, const char *src) {
+ base64_decode(out,
+ _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src))); // unaligned 64-byte load
+}
+static inline void base64_decode_block(char *out, block64 *b) { // same as the pointer overload, but the 64 bytes come from an already-loaded block
+ base64_decode(out, b->chunks[0]);
+}
+
+template <bool base64_url, typename chartype>
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen, // decodes srclen units of src into dst; result carries error, input position and output count
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value // scalar lookup table matching the selected alphabet
+ : tables::base64::to_base64_value;
+ size_t equallocation =
+ srclen; // location of the first padding character if any
+ size_t equalsigns = 0;
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) { // table value 64 marks a character that is skipped
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 1;
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') { // at most two '=' are stripped
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 2;
+ }
+ }
+ if (srclen == 0) { // nothing left after trimming
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding with no data in front of it
+ }
+ return {SUCCESS, 0, 0};
+ }
+ const chartype *const srcinit = src;
+ const char *const dstinit = dst;
+ const chartype *const srcend = src + srclen;
+
+ // figure out why block_size == 2 is sometimes best???
+ constexpr size_t block_size = 6;
+ char buffer[block_size * 64]; // staging area for translated bytes left over after skipped characters were removed
+ char *bufferptr = buffer;
+ if (srclen >= 64) {
+ const chartype *const srcend64 = src + srclen - 64;
+ while (src <= srcend64) { // one 64-unit block per iteration
+ block64 b;
+ load_block(&b, src);
+ src += 64;
+ uint64_t error = 0;
+ uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+ if (error) {
+ src -= 64; // rewind so the error offset is relative to the start of this block
+ size_t error_offset = _tzcnt_u64(error); // lowest set bit = first offending position
+ return {error_code::INVALID_BASE64_CHARACTER,
+ size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
+ }
+ if (badcharmask != 0) {
+ // optimization opportunity: check for simple masks like those made of
+ // continuous 1s followed by continuous 0s. And masks containing a
+ // single bad character.
+ bufferptr += compress_block(&b, badcharmask, bufferptr);
+ } else if (bufferptr != buffer) { // buffer already holds earlier data, so this block is appended behind it
+ copy_block(&b, bufferptr);
+ bufferptr += 64;
+ } else { // nothing pending: decode straight into dst
+ base64_decode_block(dst, &b);
+ dst += 48;
+ }
+ if (bufferptr >= (block_size - 1) * 64 + buffer) { // at least block_size - 1 full blocks are staged
+ for (size_t i = 0; i < (block_size - 1); i++) {
+ base64_decode_block(dst, buffer + i * 64);
+ dst += 48;
+ }
+ std::memcpy(buffer, buffer + (block_size - 1) * 64,
+ 64); // 64 might be too much
+ bufferptr -= (block_size - 1) * 64; // what remains now sits at the front of the buffer
+ }
+ }
+ }
+
+ char *buffer_start = buffer;
+ // Optimization note: if this is almost full, then it is worth our
+ // time, otherwise, we should just decode directly.
+ int last_block = (int)((bufferptr - buffer_start) % 64); // staged bytes beyond a multiple of 64
+ if (last_block != 0 && srcend - src + last_block >= 64) {
+
+ while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { // scalar top-up until the staged data is a multiple of 64
+ uint8_t val = to_base64[uint8_t(*src)];
+ *bufferptr = char(val);
+ if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ bufferptr += (val <= 63); // a value of 64 does not advance, so it is overwritten by the next character
+ src++;
+ }
+ }
+
+ for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every complete staged block
+ base64_decode_block(dst, buffer_start);
+ dst += 48;
+ }
+ if ((bufferptr - buffer_start) % 64 != 0) {
+ while (buffer_start + 4 < bufferptr) { // groups of four that are followed by more staged data
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 4); // copies 4 bytes but dst only advances by 3
+ dst += 3;
+ buffer_start += 4;
+ }
+ if (buffer_start + 4 <= bufferptr) { // final complete group: copy exactly 3 bytes
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 3);
+ dst += 3;
+ buffer_start += 4;
+ }
+ // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+ // backtrack
+ int leftover = int(bufferptr - buffer_start);
+ while (leftover > 0) { // move src back over one non-skipped character per leftover byte
+ while (to_base64[uint8_t(*(src - 1))] == 64) {
+ src--;
+ }
+ src--;
+ leftover--;
+ }
+ }
+ if (src < srcend + equalsigns) { // input remains (or padding was stripped): finish with the scalar tail decoder
+ full_result r = scalar::base64::base64_tail_decode(
+ dst, src, srcend - src, equalsigns, options, last_chunk_options);
+ r.input_count += size_t(src - srcinit); // add the offset of the tail within the whole input
+ if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+ r.error == error_code::BASE64_EXTRA_BITS) {
+ return r;
+ } else {
+ r.output_count += size_t(dst - dstinit); // add the bytes already written before the tail
+ }
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) { // the stripped '=' count must match the final partial group
+ r.error = error_code::INVALID_BASE64_CHARACTER;
+ r.input_count = equallocation;
+ }
+ }
+ return r;
+ }
+ if (equalsigns > 0) { // same padding consistency check when no tail decode was needed
+ if ((size_t(dst - dstinit) % 3 == 0) ||
+ ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+ }
+ }
+ return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp
new file mode 100644
index 000000000..4d4738d9c
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp
@@ -0,0 +1,36 @@
+// file included directly
+template <endianness big_endian>
+size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+ char16_t *utf16_output) { // writes len UTF-16 code units at utf16_output and returns len
+ size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32
+
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809); // shuffle control that swaps the two bytes of every 16-bit lane
+ for (size_t i = 0; i < rounded_len; i += 32) {
+ // Load 32 Latin1 characters into a 256-bit register
+ __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
+ // Zero extend the 32 Latin1 characters to 32 16-bit integers
+ __m512i out = _mm512_cvtepu8_epi16(in);
+ if (big_endian) {
+ out = _mm512_shuffle_epi8(out, byteflip);
+ }
+ // Store the results back to memory
+ _mm512_storeu_si512((__m512i *)&utf16_output[i], out);
+ }
+ if (rounded_len != len) {
+ // 1..31 characters remain; shift an unsigned 1 so a count of 31 never moves a bit into the sign of an int
+ uint32_t mask = (uint32_t(1) << (len - rounded_len)) - 1;
+ __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);
+ // Zero extend the loaded Latin1 characters to 16-bit integers (masked-off lanes are zero)
+ __m512i out = _mm512_cvtepu8_epi16(in);
+ if (big_endian) {
+ out = _mm512_shuffle_epi8(out, byteflip);
+ }
+ // Store only the code units selected by the mask
+ _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out);
+ }
+
+ return len;
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf32.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf32.inl.cpp
new file mode 100644
index 000000000..8a9b40703
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf32.inl.cpp
@@ -0,0 +1,20 @@
+std::pair<const char *, char32_t *>
+avx512_convert_latin1_to_utf32(const char *buf, size_t len,
+ char32_t *utf32_output) {
+ // Only whole groups of 16 input bytes are converted; the tail is left alone.
+ const size_t simd_len = len & ~0xF;
+ size_t done = 0;
+
+ while (done < simd_len) {
+ // 16 Latin1 bytes fit in one 128-bit register
+ const __m128i latin1 = _mm_loadu_si128((__m128i *)(buf + done));
+ // vpmovzxbd: zero-extend those 16 bytes to 16 32-bit integers
+ const __m512i widened = _mm512_cvtepu8_epi32(latin1);
+ _mm512_storeu_si512((__m512i *)(utf32_output + done), widened);
+ done += 16;
+ }
+
+ // Report where reading and writing stopped
+ const char *next_in = buf + simd_len;
+ return std::make_pair(next_in, utf32_output + simd_len);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf8.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf8.inl.cpp
new file mode 100644
index 000000000..4543731fe
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_latin1_to_utf8.inl.cpp
@@ -0,0 +1,107 @@
+// file included directly
+
+static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, // converts up to 64 Latin1 bytes held in input; returns the UTF-8 byte count
+ char *utf8_output,
+ int mask_output) { // non-zero: stores are masked to the exact output size; zero: full 64-byte stores
+ __mmask64 nonascii = _mm512_movepi8_mask(input);
+ size_t output_size = input_len + (size_t)count_ones(nonascii); // every non-ASCII byte takes one extra output byte
+
+ // Mask to denote whether the byte is a leading byte that is not ascii
+ __mmask64 sixth = _mm512_cmpge_epu8_mask(
+ input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000
+
+ const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
+ uint64_t ascii = ~nonascii;
+ // the bits in ascii are inverted and zeros are interspersed in between them
+ uint64_t maskA = ~_pdep_u64(ascii, alternate_bits); // covers input bytes 0..31: clears the even bit (the leading byte) for ASCII bytes
+ uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits); // same for input bytes 32..63
+
+ // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD)
+ __m512i input_interleaved = _mm512_permutexvar_epi8(
+ _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
+ 0x37173616, 0x35153414, 0x33133212, 0x31113010,
+ 0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
+ 0x27072606, 0x25052404, 0x23032202, 0x21012000),
+ input);
+
+ // double size of each byte, and insert the leading byte 1100 0010
+
+ /*
+ upscale the bytes to 16-bit value, adding the 0b11000010 leading byte in the
+ process. We adjust for the bytes that have their two most significant bits set.
+ This takes care of the first 32 bytes, assuming we interleaved the bytes. */
+ __m512i outputA =
+ _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
+ outputA = _mm512_mask_add_epi16(
+ outputA, (__mmask32)sixth, outputA,
+ _mm512_set1_epi16(1 - 0x4000)); // 1 - 0x4000 = 0xc001 = 1100 0000 0000 0001
+
+ // in the second 32-byte half, set first or second option based on whether
+ // original input is leading byte (second case) or not (first case)
+ __m512i leadingB =
+ _mm512_mask_blend_epi16((__mmask32)(sixth >> 32),
+ _mm512_set1_epi16(0x00c2), // 0000 0000 1100 0010
+ _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011
+ __m512i outputB = _mm512_ternarylogic_epi32(
+ input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00),
+ (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB
+
+ // prune redundant bytes
+ outputA = _mm512_maskz_compress_epi8(maskA, outputA);
+ outputB = _mm512_maskz_compress_epi8(maskB, outputB);
+
+ size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32; // output bytes for input bytes 0..31, assuming all 32 are in use
+
+ if (mask_output) {
+ if (input_len > 32) { // is the second half of the input vector used?
+ __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA);
+ _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
+ utf8_output += output_sizeA;
+ write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA));
+ _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB);
+ } else {
+ __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size);
+ _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
+ }
+ } else {
+ _mm512_storeu_si512(utf8_output, outputA); // unmasked: writes a full 64 bytes even though only output_sizeA are meaningful
+ utf8_output += output_sizeA;
+ _mm512_storeu_si512(utf8_output, outputB); // unmasked: again a full 64-byte store
+ }
+ return output_size;
+}
+
+static inline size_t latin1_to_utf8_avx512_branch(__m512i input,
+ char *utf8_output) {
+ const __mmask64 high_bits = _mm512_movepi8_mask(input);
+ if (high_bits == 0) { // no byte has its top bit set: the 64 bytes are stored unchanged
+ _mm512_storeu_si512(utf8_output, input);
+ return 64;
+ }
+ // at least one byte is >= 0x80: take the expanding path with unmasked stores
+ return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
+}
+
+size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, // converts len Latin1 bytes at buf; returns the number of UTF-8 bytes written
+ char *utf8_output) {
+ char *start = utf8_output;
+ size_t pos = 0;
+ // if there's at least 128 bytes remaining, we don't need to mask the output
+ for (; pos + 128 <= len; pos += 64) {
+ __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
+ utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output); // advance by the number of bytes this vector produced
+ }
+ // in the last 128 bytes, the first 64 may require masking the output
+ if (pos + 64 <= len) {
+ __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
+ utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1); // full 64-byte input, masked stores
+ pos += 64;
+ }
+ // with the last 64 bytes, the input also needs to be masked
+ if (pos < len) {
+ __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos)); // low (len - pos) bits set
+ __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
+ utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
+ }
+ return (size_t)(utf8_output - start);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf16_to_latin1.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf16_to_latin1.inl.cpp
new file mode 100644
index 000000000..f17cccf59
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf16_to_latin1.inl.cpp
@@ -0,0 +1,103 @@
+/*
+ * file included directly
+ *
+ * Narrows len UTF-16 code units from buf to one Latin-1 byte each, 32 code
+ * units per step plus a masked tail. When big_endian is set, the two bytes of
+ * every code unit are swapped right after loading.
+ * Returns len on success, or 0 as soon as a loaded block contains a code unit
+ * above 0xFF (bytes from earlier blocks have already been stored by then).
+ */
+template <endianness big_endian>
+size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *end = buf + len;
+ __m512i v_0xFF = _mm512_set1_epi16(0xff);
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ // Result byte i (i < 32) takes source byte 2*i, i.e. the low-order byte of
+ // 16-bit lane i; only the low 256 bits of the permutation are stored.
+ __m512i shufmask = _mm512_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
+ 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+ while (end - buf >= 32) {
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+ if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+ return 0;
+ }
+ _mm256_storeu_si256(
+ (__m256i *)latin1_output,
+ _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+ latin1_output += 32;
+ buf += 32;
+ }
+ if (buf < end) {
+ // 1..31 code units left: one mask bit per remaining unit drives both the
+ // zero-filling load and the byte-granular store.
+ uint32_t mask(uint32_t(1 << (end - buf)) - 1);
+ __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+ if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+ return 0;
+ }
+ _mm256_mask_storeu_epi8(
+ latin1_output, mask,
+ _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+ }
+ return len;
+}
+/*
+ * Error-reporting variant of the UTF-16 to Latin-1 conversion. Returns a
+ * (result, output pointer) pair.
+ * If a loaded block contains a code unit above 0xFF, the block is replayed
+ * with a scalar loop that copies code units up to the offending one, and the
+ * function returns error_code::TOO_LARGE with the offending index
+ * (buf - start) and the output pointer just past the last byte written.
+ * On success it returns error_code::SUCCESS with len.
+ * NOTE(review): the masked tail store does not advance latin1_output, so on
+ * success the returned pointer only reflects the full 32-unit blocks;
+ * presumably callers use the result rather than the pointer - confirm.
+ */
+template <endianness big_endian>
+std::pair<result, char *>
+icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *end = buf + len;
+ const char16_t *start = buf;
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ __m512i v_0xFF = _mm512_set1_epi16(0xff);
+ // Result byte i (i < 32) takes source byte 2*i, the low-order byte of lane i.
+ __m512i shufmask = _mm512_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
+ 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+ while (end - buf >= 32) {
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+ if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+ // The vector compare found an offender in this block, so the scalar
+ // loop below terminates on it before leaving the block.
+ uint16_t word;
+ while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
+ : uint16_t(*buf))) <= 0xff) {
+ *latin1_output++ = uint8_t(word);
+ buf++;
+ }
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ latin1_output);
+ }
+ _mm256_storeu_si256(
+ (__m256i *)latin1_output,
+ _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+ latin1_output += 32;
+ buf += 32;
+ }
+ if (buf < end) {
+ // 1..31 code units left: one mask bit per remaining unit.
+ uint32_t mask(uint32_t(1 << (end - buf)) - 1);
+ __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+ if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+ // Same scalar replay as in the main loop; masked-out lanes are zero.
+ uint16_t word;
+ while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
+ : uint16_t(*buf))) <= 0xff) {
+ *latin1_output++ = uint8_t(word);
+ buf++;
+ }
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ latin1_output);
+ }
+ _mm256_mask_storeu_epi8(
+ latin1_output, mask,
+ _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+ }
+ return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf32.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf32.inl.cpp
new file mode 100644
index 000000000..c5c0d2bcb
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf32.inl.cpp
@@ -0,0 +1,136 @@
+// file included directly
+
+/*
+ Converts UTF-16 to UTF-32 in blocks of 32 code units for as long as at
+ least 32 code units remain. When big_endian is set, each code unit is
+ byte-swapped after loading.
+ Returns a tuple: the first unprocessed code unit of buf, the advanced
+ utf32_output, and a flag that is false when a block with mismatched
+ surrogates was encountered (true otherwise).
+ A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::tuple<const char16_t *, char32_t *, bool>
+convert_utf16_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *end = buf + len;
+ const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
+ const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+ const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
+ __mmask32 carry{0}; // 1 when the previous block ended on a high surrogate
+ const __m512i byteflip = _mm512_setr_epi64(
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+ 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ while (std::distance(buf, end) >= 32) {
+ // Always safe because at least 32 code units (64 bytes) remain in buf:
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+
+ // H - bitmask for high surrogates
+ const __mmask32 H =
+ _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
+ // L - bitmask for low surrogates
+ const __mmask32 L =
+ _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
+
+ if ((H | L)) {
+ // surrogate pair(s) in a register
+ const __mmask32 V =
+ (L ^
+ (carry | (H << 1))); // A high surrogate must be followed by low one
+ // and a low one must be preceded by a high one.
+ // If valid, V should be equal to 0
+
+ if (V == 0) {
+ // valid case
+ /*
+ Input surrogate pair:
+ |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
+ low surrogate high surrogate
+ */
+ /* 1. Expand all code units to 32-bit code units
+ in
+ |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+ */
+ const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+ const __m512i second =
+ _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+ /* 2. Shift by one 16-bit word to align low surrogates with high
+ surrogates in
+ |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+ shifted
+ |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+ */
+ const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
+ const __m512i shifted_second =
+ _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
+
+ /* 3. Align all high surrogates in first and second by shifting to the
+ left by 10 bits
+ |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+ */
+ const __m512i aligned_first =
+ _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
+ const __m512i aligned_second =
+ _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);
+
+ /* 4. Remove surrogate prefixes and add offset 0x10000 by adding in,
+ shifted and constant in
+ |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+ shifted
+ |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+ constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
+ */
+ const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
+ const __m512i added_first = _mm512_mask_add_epi32(
+ aligned_first, (__mmask16)H, aligned_first, shifted_first);
+ const __m512i utf32_first = _mm512_mask_add_epi32(
+ added_first, (__mmask16)H, added_first, constant);
+
+ const __m512i added_second =
+ _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16),
+ aligned_second, shifted_second);
+ const __m512i utf32_second = _mm512_mask_add_epi32(
+ added_second, (__mmask16)(H >> 16), added_second, constant);
+
+ // 5. Store all valid UTF-32 code units (low surrogate positions and
+ // 32nd word are invalid)
+ const __mmask32 valid = ~L & 0x7fffffff;
+ // We deliberately do a _mm512_maskz_compress_epi32 followed by
+ // storeu_epi32 to ease performance portability to Zen 4.
+ const __m512i compressed_first =
+ _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
+ const size_t howmany1 = count_ones((uint16_t)(valid));
+ _mm512_storeu_si512((__m512i *)utf32_output, compressed_first);
+ utf32_output += howmany1;
+ const __m512i compressed_second =
+ _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
+ const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
+ // The following could be unsafe in some cases?
+ //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
+ _mm512_mask_storeu_epi32((__m512i *)utf32_output,
+ __mmask16((1 << howmany2) - 1),
+ compressed_second);
+ utf32_output += howmany2;
+ // Only process 31 code units, but keep track if the 31st word is a high
+ // surrogate as a carry
+ buf += 31;
+ carry = (H >> 30) & 0x1;
+ } else {
+ // invalid case
+ return std::make_tuple(buf + carry, utf32_output, false);
+ }
+ } else {
+ // no surrogates
+ // extend all thirty-two 16-bit code units to thirty-two 32-bit code units
+ _mm512_storeu_si512((__m512i *)(utf32_output),
+ _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
+ _mm512_storeu_si512(
+ (__m512i *)(utf32_output) + 1,
+ _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
+ utf32_output += 32;
+ buf += 32;
+ carry = 0;
+ }
+ } // while
+ // When carry is set, buf[0] is the low surrogate of a pair that was already
+ // written, so it is skipped in the reported position.
+ return std::make_tuple(buf + carry, utf32_output, true);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp
new file mode 100644
index 000000000..d2d698294
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp
@@ -0,0 +1,206 @@
+// file included directly
+
+/**
+ * This function converts the input (inbuf, inlen) from UTF-16 into UTF-8
+ * (to outbuf). The byte order is selected by the template parameter: when
+ * big_endian is set, every code unit is byte-swapped after loading.
+ * Each iteration consumes 31 code units; a mismatched surrogate truncates the
+ * current block just before it and ends the conversion.
+ * The number of bytes written is stored in 'outlen' and the function returns
+ * the number of input words consumed.
+ */
+template <endianness big_endian>
+size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
+ unsigned char *outbuf, size_t *outlen) {
+ __m512i in;
+ __mmask32 inmask = _cvtu32_mask32(0x7fffffff); // lanes 0..30 are active
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ const char16_t *const inbuf_orig = inbuf;
+ const unsigned char *const outbuf_orig = outbuf;
+ int adjust = 0; // (valid lanes - 31) for a partial block, 0 otherwise
+ int carry = 0; // 1 when lane 30 of the previous block was a high surrogate
+
+ while (inlen >= 32) {
+ in = _mm512_loadu_si512(inbuf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+ inlen -= 31;
+ lastiteration: // also entered from 'tail' with a masked partial block
+ inbuf += 31;
+
+ failiteration: // re-entered after a surrogate mismatch with inmask truncated
+ const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
+ inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
+
+ if (_ktestz_mask32_u8(inmask, is234byte)) {
+ // fast path for ASCII only
+ _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
+ outbuf += 31;
+ carry = 0;
+
+ if (inlen < 32) {
+ goto tail;
+ } else {
+ continue;
+ }
+ }
+
+ const __mmask32 is12byte =
+ _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
+
+ if (_ktestc_mask32_u8(is12byte, inmask)) {
+ // fast path for 1 and 2 byte only
+
+ const __m512i twobytes = _mm512_ternarylogic_epi32(
+ _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
+ _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
+ in = _mm512_mask_add_epi16(in, is234byte, twobytes,
+ _mm512_set1_epi16(int16_t(0x80c0)));
+ const __m512i cmpmask =
+ _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
+ _mm512_set1_epi16(0x0800));
+ const __mmask64 smoosh =
+ _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
+ const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
+ _mm512_mask_storeu_epi8(outbuf,
+ _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh),
+ _cvtmask64_u64(smoosh))),
+ out);
+ outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
+ carry = 0;
+
+ if (inlen < 32) {
+ goto tail;
+ } else {
+ continue;
+ }
+ }
+ __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+ __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+ __m512i taglo = _mm512_set1_epi32(0x8080e000);
+ __m512i taghi = taglo;
+
+ const __m512i fc00masked =
+ _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
+ const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
+ inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
+ const __mmask32 losurr = _mm512_cmp_epu16_mask(
+ fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
+
+ int carryout = 0;
+ if (!_kortestz_mask32_u8(hisurr, losurr)) {
+ // handle surrogates
+
+ __m512i los = _mm512_alignr_epi32(hi, lo, 1);
+ __m512i his = _mm512_alignr_epi32(lo, hi, 1);
+
+ const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
+ taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr),
+ _mm512_set1_epi32(0x808080f0));
+ taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi),
+ _mm512_set1_epi32(0x808080f0));
+
+ lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
+ hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
+ los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
+ his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
+ lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
+ hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
+
+ carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
+
+ const uint32_t h = _cvtmask32_u32(hisurr);
+ const uint32_t l = _cvtmask32_u32(losurr);
+ // check for mismatched surrogates
+ if ((h + h + carry) ^ l) {
+ const uint32_t lonohi = l & ~(h + h + carry);
+ const uint32_t hinolo = h & ~(l >> 1);
+ // Keep only the lanes before the first mismatch, zero the rest and
+ // redo this block; inlen = 0 makes it the final one.
+ inlen = _tzcnt_u32(hinolo | lonohi);
+ inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1));
+ in = _mm512_maskz_mov_epi16(inmask, in);
+ adjust = (int)inlen - 31;
+ inlen = 0;
+ goto failiteration;
+ }
+ }
+
+ hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
+ carry = carryout;
+
+ __m512i mslo =
+ _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
+
+ __m512i mshi =
+ _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
+
+ const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
+ const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
+
+ const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
+ const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
+ const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
+
+ taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte),
+ _mm512_set1_epi32(0x80c00000));
+ taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi),
+ _mm512_set1_epi32(0x80c00000));
+ __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
+ _mm512_set1_epi32(0xffffffff),
+ _mm512_set1_epi32(0x00010101));
+ __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
+ _mm512_set1_epi32(0xffffffff),
+ _mm512_set1_epi32(0x00010101));
+ // NOTE(review): the two assignments below repeat the blends above with
+ // identical operands; they look redundant.
+ magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
+ _mm512_set1_epi32(0xffffffff),
+ _mm512_set1_epi32(0x00010101));
+ magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
+ _mm512_set1_epi32(0xffffffff),
+ _mm512_set1_epi32(0x00010101));
+
+ mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
+ 0xea); // A&B|C
+ mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
+ 0xea);
+ mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
+
+ mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
+
+ const __mmask64 wantlo =
+ _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
+ const __mmask64 wanthi =
+ _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
+ const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
+ const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
+ const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
+ const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
+
+ uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
+ uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
+
+ _mm512_mask_storeu_epi8(
+ outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
+ _mm512_mask_storeu_epi8(
+ outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)),
+ outhi);
+ outbuf += advlo + advhi;
+ }
+ // The general path advanced outbuf by exact byte counts, so cancel the
+ // '+ adjust' that is applied to the output length below.
+ outbuf += -adjust;
+
+tail:
+ if (inlen != 0) {
+ // We must have inlen < 32 (every path that reaches here has checked it).
+ inmask = _cvtu32_mask32((1U << inlen) - 1);
+ in = _mm512_maskz_loadu_epi16(inmask, inbuf);
+ if (big_endian) {
+ in = _mm512_shuffle_epi8(in, byteflip);
+ }
+ adjust = (int)inlen - 31;
+ inlen = 0;
+ goto lastiteration;
+ }
+ // adjust (<= 0) removes the lanes of a partial block that the fixed
+ // 31-unit advances counted but that held no input.
+ *outlen = (outbuf - outbuf_orig) + adjust;
+ return ((inbuf - inbuf_orig) + adjust);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf32_to_latin1.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf32_to_latin1.inl.cpp
new file mode 100644
index 000000000..1e7e4296e
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf32_to_latin1.inl.cpp
@@ -0,0 +1,74 @@
+/*
+ * file included directly
+ *
+ * Narrows len UTF-32 code units from buf to one Latin-1 byte each, 16 code
+ * units per step plus a masked tail.
+ * Returns len on success, or 0 as soon as a loaded block contains a value
+ * above 0xFF (bytes from earlier blocks have already been stored by then).
+ */
+size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *end = buf + len;
+ __m512i v_0xFF = _mm512_set1_epi32(0xff);
+ // Result byte i (i < 16) takes source byte 4*i, i.e. the low-order byte of
+ // 32-bit lane i; only the low 128 bits of the permutation are stored.
+ __m512i shufmask = _mm512_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
+ 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
+ while (end - buf >= 16) {
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
+ return 0;
+ }
+ _mm_storeu_si128(
+ (__m128i *)latin1_output,
+ _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+ latin1_output += 16;
+ buf += 16;
+ }
+ if (buf < end) {
+ // 1..15 code units left: one mask bit per remaining unit drives both the
+ // zero-filling load and the byte-granular store.
+ uint16_t mask = uint16_t((1 << (end - buf)) - 1);
+ __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
+ if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
+ return 0;
+ }
+ _mm_mask_storeu_epi8(
+ latin1_output, mask,
+ _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+ }
+ return len;
+}
+/*
+ * Error-reporting variant of the UTF-32 to Latin-1 conversion. Returns a
+ * (result, output pointer) pair.
+ * If a loaded block contains a value above 0xFF, the block is replayed with a
+ * scalar loop that copies code units up to the offending one, and the
+ * function returns error_code::TOO_LARGE with the offending index
+ * (buf - start) and the output pointer just past the last byte written.
+ * On success it returns error_code::SUCCESS with len.
+ * NOTE(review): the masked tail store does not advance latin1_output, so on
+ * success the returned pointer only reflects the full 16-unit blocks;
+ * presumably callers use the result rather than the pointer - confirm.
+ */
+std::pair<result, char *>
+icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *end = buf + len;
+ const char32_t *start = buf;
+ __m512i v_0xFF = _mm512_set1_epi32(0xff);
+ // Result byte i (i < 16) takes source byte 4*i, the low-order byte of lane i.
+ __m512i shufmask = _mm512_set_epi8(
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
+ 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
+ while (end - buf >= 16) {
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
+ // The vector compare found an offender in this block, so the scalar
+ // loop terminates on it before leaving the block.
+ while (uint32_t(*buf) <= 0xff) {
+ *latin1_output++ = uint8_t(*buf++);
+ }
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ latin1_output);
+ }
+ _mm_storeu_si128(
+ (__m128i *)latin1_output,
+ _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+ latin1_output += 16;
+ buf += 16;
+ }
+ if (buf < end) {
+ // 1..15 code units left: one mask bit per remaining unit.
+ uint16_t mask = uint16_t((1 << (end - buf)) - 1);
+ __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
+ if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
+ while (uint32_t(*buf) <= 0xff) {
+ *latin1_output++ = uint8_t(*buf++);
+ }
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ latin1_output);
+ }
+ _mm_mask_storeu_epi8(
+ latin1_output, mask,
+ _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+ }
+ return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf16.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf16.inl.cpp
new file mode 100644
index 000000000..70df94dac
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf16.inl.cpp
@@ -0,0 +1,178 @@
+// file included directly
+
+/*
+ * Todo: currently, this is just the haswell code, optimize for icelake kernel.
+ *
+ * Converts UTF-32 to UTF-16 for as long as at least 8 + safety_margin (20)
+ * code units remain, and returns (first unprocessed input, output pointer) so
+ * that the caller can finish the tail. When big_endian is set, the bytes of
+ * every emitted UTF-16 code unit are swapped.
+ * On invalid input (a surrogate code point or a value above 0x10FFFF) the
+ * first member of the returned pair is nullptr. Surrogates seen on the vector
+ * path are only accumulated and checked after the loop, so output for those
+ * blocks has already been written at that point.
+ */
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *end = buf + len;
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+ __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+ while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+ // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+ const __m256i saturation_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+ if (saturation_bitmask == 0xffffffff) {
+ // Record (but do not yet act on) lanes in the range 0xD800..0xDFFF.
+ const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+ forbidden_bytemask = _mm256_or_si256(
+ forbidden_bytemask,
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+
+ __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+ _mm256_extractf128_si256(in, 1));
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ // Scalar fallback: convert the next 7 code units one by one.
+ size_t forward = 7;
+ size_t k = 0;
+ // Cannot trigger while the loop condition (>= 20 remaining) holds.
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ // check for invalid input
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+
+ return std::make_pair(buf, utf16_output);
+}
+
+/*
+ * Todo: currently, this is just the haswell code, optimize for icelake kernel.
+ *
+ * Error-reporting variant of the UTF-32 to UTF-16 conversion. Runs for as
+ * long as at least 8 + safety_margin (20) code units remain and returns a
+ * (result, output pointer) pair; on success the result carries the number of
+ * code units consumed (buf - start), leaving the tail to the caller.
+ * Errors are error_code::SURROGATE for a code point in 0xD800..0xDFFF and
+ * error_code::TOO_LARGE for a value above 0x10FFFF. On the vector path the
+ * reported position is the start of the 8-unit block (buf - start), not the
+ * index of the offending element; the scalar path reports buf - start + k.
+ */
+template <endianness big_endian>
+std::pair<result, char16_t *>
+avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+ const __m256i v_00000000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+ // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+ const __m256i saturation_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+ if (saturation_bitmask == 0xffffffff) {
+ // Unlike the non-reporting variant, surrogates are rejected right away,
+ // before anything from this block is stored.
+ const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+ const __m256i forbidden_bytemask =
+ _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+ 0x0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf16_output);
+ }
+
+ __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+ _mm256_extractf128_si256(in, 1));
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ // Scalar fallback: convert the next 7 code units one by one.
+ size_t forward = 7;
+ size_t k = 0;
+ // Cannot trigger while the loop condition (>= 20 remaining) holds.
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf8.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf8.inl.cpp
new file mode 100644
index 000000000..b5ce4d83a
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf32_to_utf8.inl.cpp
@@ -0,0 +1,574 @@
+// file included directly
+// Converts UTF-32 code units to UTF-8 using 256-bit vectors, 16 code units per loop iteration.
+// Todo: currently, this is just the haswell code, optimize for icelake kernel.
+std::pair<const char32_t *, char *> // (first unconverted input unit, output end); first is nullptr on invalid input
+avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len,
+ char *utf8_output) {
+ const char32_t *end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+ const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+ const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+ const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+ __m256i running_max = _mm256_setzero_si256(); // lane-wise max of all loaded units; compared with 0x10ffff after the loop
+ __m256i forbidden_bytemask = _mm256_setzero_si256(); // accumulates 16-bit lanes found in 0xd800..0xdfff (surrogates)
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // exits with up to 16 + safety_margin - 1 units unconverted; presumably finished by the caller
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); // second group of 8 units (the + 1 steps a whole __m256i)
+ running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+ _mm256_and_si256(nextin, v_7fffffff));
+ in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); // packus works per 128-bit lane; this restores source order
+
+ // Try to apply UTF-16 => UTF-8 routine on 256 bits
+ // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+ if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); // two mask bits per 16-bit lane
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) { // every packed 16-bit lane is below 0x800
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555; // keep one bit per 16-bit lane
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff; // bits 0-7: flags of the low 128-bit half, bits 16-23: of the high half
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); // row + 1 is used as the byte-shuffle control
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed)); // always a full 16-byte store
+ utf8_output += row[0]; // row[0] is used as the number of bytes to keep
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // Must check for overflow in packing
+ const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffffffff) { // no unit has a bit set above 0xffff
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+ forbidden_bytemask = _mm256_or_si256(
+ forbidden_bytemask,
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); // surrogates are still converted here; the error is reported after the loop
+
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000)); // nonzero only in lanes that need three bytes
+ const __m256i s4 = _mm256_xor_si256(s3, m0); // three-byte lanes: [11bb|bbbb] becomes [10bb|bbbb]
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa); // per 16-bit lane: even bit = fits one byte, odd bit = fits one or two bytes
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask); // each mask byte describes four code units
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0); // each store writes 16 bytes; only rowN[0] of them are kept
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+ // large, non-trivial tables?
+ size_t forward = 15; // only 15 units are converted here; the 16th is reloaded by the next iteration
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // NOTE(review): the loop condition guarantees 16 + safety_margin units, so this clamp looks unreachable
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf8_output); // surrogate code point: invalid input
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4-byte
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf8_output); // beyond the last code point: invalid input
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // check for invalid input
+ const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+ _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { // some loaded unit exceeded 0x10ffff
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { // a surrogate went through the 3-byte vector path
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ return std::make_pair(buf, utf8_output); // buf points at the first unit the vector loop did not convert
+}
+
+// Todo: currently, this is just the haswell code, optimize for icelake kernel.
+std::pair<result, char *> // result holds an error_code and an input offset counted from the initial buf; char * is the output end
+avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_output) {
+ const char32_t *end = buf + len;
+ const char32_t *start = buf;
+ // Mirrors avx512_convert_utf32_to_utf8, but validates each 16-unit group up front so an error code and offset can be returned.
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+ const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+ const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+ const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+ const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+ const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // exits with up to 16 + safety_margin - 1 units unconverted; presumably finished by the caller
+ __m256i in = _mm256_loadu_si256((__m256i *)buf);
+ __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); // second group of 8 units (the + 1 steps a whole __m256i)
+ // Check for too large input
+ const __m256i max_input =
+ _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(
+ _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start), // offset of the 16-unit group, not of the offending unit; presumably refined by the caller
+ utf8_output);
+ }
+
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+ _mm256_and_si256(nextin, v_7fffffff));
+ in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); // packus works per 128-bit lane; this restores source order
+
+ // Try to apply UTF-16 => UTF-8 routine on 256 bits
+ // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+ if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(
+ _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask)); // two mask bits per 16-bit lane
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) { // every packed 16-bit lane is below 0x800
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked =
+ _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555; // keep one bit per 16-bit lane
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff; // bits 0-7: flags of the low 128-bit half, bits 16-23: of the high half
+ // 4. pack the bytes
+
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t *row_2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+ 16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); // row + 1 is used as the byte-shuffle control
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(
+ utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_castsi256_si128(utf8_packed)); // always a full 16-byte store
+ utf8_output += row[0]; // row[0] is used as the number of bytes to keep
+ _mm_storeu_si128((__m128i *)utf8_output,
+ _mm256_extractf128_si256(utf8_packed, 1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // Must check for overflow in packing
+ const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffffffff) { // no unit has a bit set above 0xffff
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+ // Check for illegal surrogate code units
+ const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+ const __m256i forbidden_bytemask =
+ _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); // lanes in 0xd800..0xdfff
+ if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+ 0x0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start), // offset of the 16-unit group, not of the offending unit; presumably refined by the caller
+ utf8_output);
+ }
+
+ const __m256i dup_even = _mm256_setr_epi16(
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000)); // nonzero only in lanes that need three bytes
+ const __m256i s4 = _mm256_xor_si256(s3, m0); // three-byte lanes: [11bb|bbbb] becomes [10bb|bbbb]
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa); // per 16-bit lane: even bit = fits one byte, odd bit = fits one or two bytes
+ // Due to the wider registers, the following path is less likely to be
+ // useful.
+ /*if(mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m256i shuffle =
+ _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+ 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+ _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+ _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output,
+ _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask); // each mask byte describes four code units
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 =
+ _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+ const __m128i utf8_2 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+ const __m128i utf8_3 =
+ _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0); // each store writes 16 bytes; only rowN[0] of them are kept
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ } else {
+ // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+ // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD may require
+ // large, non-trivial tables?
+ size_t forward = 15; // only 15 units are converted here; the 16th is reloaded by the next iteration
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // NOTE(review): the loop condition guarantees 16 + safety_margin units, so this clamp looks unreachable
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf8_output); // exact offset of the offending unit
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4-byte
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf8_output); // exact offset of the offending unit
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); // count = units converted by the vector loop
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_utf8_to_latin1.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_utf8_to_latin1.inl.cpp
new file mode 100644
index 000000000..59f7ea7bc
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_utf8_to_latin1.inl.cpp
@@ -0,0 +1,104 @@
+// file included directly
+
+// File contains conversion procedure from possibly invalid UTF-8 strings.
+// Converts one block of UTF-8 to Latin-1 and returns the number of bytes written; a return of 0 signals an error.
+template <bool is_remaining>
+simdutf_really_inline size_t process_block_from_utf8_to_latin1(
+ const char *buf, size_t len, char *latin_output, __m512i minus64,
+ __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
+ __mmask64 load_mask =
+ is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; // partial block: only the low len bits are set
+ __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); // bytes outside load_mask are loaded as zero
+ __mmask64 nonascii = _mm512_movepi8_mask(input); // bit i is set when byte i has its top bit set
+ if (nonascii == 0) {
+ if (*next_leading_ptr) { // If we ended with a leading byte, it is an error.
+ return 0; // Indicates error
+ }
+ is_remaining
+ ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
+ : _mm512_storeu_si512((__m512i *)latin_output, input);
+ return len; // all-ASCII block copied through unchanged
+ }
+
+ const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); // bytes >= minus64 as unsigned (0xC0 with the set1(-64) passed by the driver in this file)
+
+ __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); // -62 is 0xC2: byte 0xC2 maps to 0 and 0xC3 maps to 1
+ __mmask64 invalid_leading_bytes =
+ _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); // leading bytes mapping above one, i.e. anything but 0xC2/0xC3 when one is set1(1)
+
+ if (invalid_leading_bytes) {
+ return 0; // Indicates error
+ }
+
+ __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; // positions right after a leading byte, including the carry from the previous block
+
+ if ((nonascii ^ leading) != leading_shift) { // non-ASCII, non-leading bytes must sit exactly at those positions
+ return 0; // Indicates error
+ }
+
+ const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); // bytes whose highbits equal one (byte 0xC3 when one is set1(1))
+ input =
+ _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); // bytes following a bit6 byte get minus64 subtracted (+64 mod 256 for -64)
+
+ __mmask64 retain = ~leading & load_mask; // keep every loaded byte that is not a leading byte
+ __m512i output = _mm512_maskz_compress_epi8(retain, input);
+ int64_t written_out = count_ones(retain); // count_ones is defined elsewhere; presumably a population count
+ if (written_out == 0) {
+ return 0; // Indicates error
+ }
+ *next_bit6_ptr = bit6 >> 63; // carry the state of byte 63 into the next block
+ *next_leading_ptr = leading >> 63;
+
+ __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); // low written_out bits set; written_out is nonzero here
+
+ _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
+
+ return written_out;
+}
+
+// Converts possibly invalid UTF-8 to Latin-1. On success returns the number of
+// bytes written, moves inbuf forward by len and inlatin_output past the output.
+// On error returns 0 with inbuf at the failing block (one byte earlier if the
+// previous block ended on a leading byte) and inlatin_output after the output so far.
+size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len,
+                             char *&inlatin_output) {
+  const char *const src = inbuf;
+  char *const dst_begin = inlatin_output;
+  char *dst = dst_begin;
+  const __m512i minus64 = _mm512_set1_epi8(-64); // 0xC0 in every byte
+  const __m512i one = _mm512_set1_epi8(1);
+  __mmask64 carry_leading = 0; // set by the block routine from bit 63 of its leading mask
+  __mmask64 carry_bit6 = 0;
+  size_t offset = 0;
+  // Full 64-byte blocks.
+  for (; offset + 64 <= len; offset += 64) {
+    const size_t produced = process_block_from_utf8_to_latin1<false>(
+        src + offset, 64, dst, minus64, one, &carry_leading, &carry_bit6);
+    if (produced == 0) { // error at offset or after, or just before it
+      inlatin_output = dst;
+      inbuf = src + offset - carry_leading;
+      return 0;
+    }
+    dst += produced;
+  }
+  // Trailing partial block, if any.
+  if (offset < len) {
+    const size_t produced = process_block_from_utf8_to_latin1<true>(
+        src + offset, len - offset, dst, minus64, one, &carry_leading,
+        &carry_bit6);
+    if (produced == 0) {
+      inbuf = src + offset - carry_leading;
+      inlatin_output = dst;
+      return 0;
+    }
+    dst += produced;
+  }
+  if (carry_leading) { // the input ends on a leading byte
+    inbuf = src + len - carry_leading;
+    inlatin_output = dst;
+    return 0;
+  }
+  inlatin_output = dst;
+  inbuf = src + len;
+  return size_t(dst - dst_begin);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp b/contrib/simdutf/src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp
new file mode 100644
index 000000000..819209787
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp
@@ -0,0 +1,69 @@
+// file included directly
+
+// File contains conversion procedure from valid UTF-8 strings.
+// Converts one block of UTF-8 to Latin-1 without validation and returns the number of bytes written.
+template <bool is_remaining>
+simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(
+ const char *buf, size_t len, char *latin_output, __m512i minus64,
+ __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
+ __mmask64 load_mask =
+ is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; // partial block: only the low len bits are set
+ __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); // bytes outside load_mask are loaded as zero
+ __mmask64 nonascii = _mm512_movepi8_mask(input); // bit i is set when byte i has its top bit set
+
+ if (nonascii == 0) {
+ is_remaining
+ ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
+ : _mm512_storeu_si512((__m512i *)latin_output, input);
+ return len; // all-ASCII block copied through unchanged
+ }
+
+ __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); // bytes >= minus64 as unsigned (0xC0 with the set1(-64) passed by the driver in this file)
+
+ __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); // -62 is 0xC2: byte 0xC3 maps to 1
+
+ *next_leading_ptr = leading >> 63; // recorded for the caller; never read inside this function
+
+ __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); // bytes whose highbits equal one (byte 0xC3 when one is set1(1))
+ input =
+ _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); // bytes following a bit6 byte get minus64 subtracted (+64 mod 256 for -64)
+ *next_bit6_ptr = bit6 >> 63; // updated before the early return below
+
+ __mmask64 retain = ~leading & load_mask; // keep every loaded byte that is not a leading byte
+ __m512i output = _mm512_maskz_compress_epi8(retain, input);
+ int64_t written_out = count_ones(retain); // count_ones is defined elsewhere; presumably a population count
+ if (written_out == 0) {
+ return 0; // Indicates error
+ }
+ __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); // low written_out bits set; written_out is nonzero here
+ // Optimization opportunity: sometimes, masked writes are not needed.
+ _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
+ return written_out;
+}
+
+// Converts UTF-8 to Latin-1 without validating it (the input is expected to be
+// valid) and returns the number of Latin-1 bytes written to latin_output.
+size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len,
+                                   char *latin_output) {
+  char *const out_begin = latin_output;
+  const __m512i minus64 = _mm512_set1_epi8(-64); // 0xC0 in every byte
+  const __m512i one = _mm512_set1_epi8(1);
+  __mmask64 carry_leading = 0;
+  __mmask64 carry_bit6 = 0;
+  size_t offset = 0;
+  // Full 64-byte blocks.
+  for (; offset + 64 <= len; offset += 64) {
+    const size_t produced = process_valid_block_from_utf8_to_latin1<false>(
+        buf + offset, 64, latin_output, minus64, one, &carry_leading,
+        &carry_bit6);
+    latin_output += produced;
+  }
+  // Trailing partial block, if any.
+  if (offset < len) {
+    const size_t produced = process_valid_block_from_utf8_to_latin1<true>(
+        buf + offset, len - offset, latin_output, minus64, one,
+        &carry_leading, &carry_bit6);
+    latin_output += produced;
+  }
+  return (size_t)(latin_output - out_begin);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_from_utf8.inl.cpp b/contrib/simdutf/src/icelake/icelake_from_utf8.inl.cpp
new file mode 100644
index 000000000..224fced1a
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_from_utf8.inl.cpp
@@ -0,0 +1,338 @@
+// file included directly
+
+// File contains conversion procedure from possibly invalid UTF-8 strings.
+
+/**
+ * Converts up to len bytes of UTF-8 from in to UTF-16 in out, one block of at
+ * most 64 input bytes at a time (full blocks first, then a masked tail).
+ * Returns the input and output positions reached when processing stopped.
+ * If a block is rejected, the returned output pointer is null.
+ */
+
+template <endianness big_endian>
+utf8_to_utf16_result
+fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
+ const char *const final_in = in + len;
+ bool result = true; // becomes false once a block fails
+ while (result) {
+ if (final_in - in >= 64) {
+ result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
+ in, out, final_in - in);
+ } else if (in < final_in) {
+ result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
+ in, out, final_in - in);
+ } else {
+ break; // all input consumed
+ }
+ }
+ if (!result) {
+ out = nullptr; // signals failure to the caller
+ }
+ return std::make_pair(in, out);
+}
+
+template <endianness big_endian> // UTF-8 to UTF-16 with error reporting
+simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in,
+ size_t len,
+ char16_t *out) {
+ const char *const init_in = in;
+ const char16_t *const init_out = out;
+ const char *const final_in = in + len;
+ bool result = true; // becomes false once a block fails
+ while (result) {
+ if (final_in - in >= 64) {
+ result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
+ in, out, final_in - in);
+ } else if (in < final_in) {
+ result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
+ in, out, final_in - in);
+ } else {
+ break; // all input consumed
+ }
+ }
+ if (!result) {
+ size_t pos = size_t(in - init_in); // input offset of the failing block
+ if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) {
+ // We must check whether we are the fourth continuation byte
+ bool c1 = (init_in[pos - 1] & 0xc0) == 0x80;
+ bool c2 = (init_in[pos - 2] & 0xc0) == 0x80;
+ bool c3 = (init_in[pos - 3] & 0xc0) == 0x80;
+ if (c1 && c2 && c3) {
+ return {simdutf::TOO_LONG, pos};
+ }
+ }
+ // rewind_and_convert_with_errors will seek a potential error from in
+ // onward, with the ability to go back up to in - init_in bytes, and read
+ // final_in - in bytes forward.
+ simdutf::result res =
+ scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(
+ in - init_in, in, final_in - in, out);
+ res.count += (in - init_in); // offset by the input bytes already consumed
+ return res;
+ } else {
+ return simdutf::result(error_code::SUCCESS, out - init_out); // units written
+ }
+}
+
+template <endianness big_endian, typename OUTPUT>
+// Validates UTF-8 while transcoding it to UTF-32 or UTF-16 (chosen by OUTPUT).
+// todo: replace with the utf-8 to utf-16 routine adapted to utf-32 (legacy).
+std::pair<const char *, OUTPUT *>
+validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
+ constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
+ constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
+ static_assert(
+ UTF32 or UTF16,
+ "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
+ static_assert(!(UTF32 and big_endian),
+ "we do not currently support big-endian UTF-32");
+
+ const char *ptr = str;
+ const char *end = ptr + len;
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809); // swaps bytes per 16-bit unit
+ OUTPUT *output = dwords;
+ avx512_utf8_checker checker{}; // its errors() are only queried at the end
+ /**
+ * In the main loop, we consume 64 bytes per iteration,
+ * but we access 64 + 4 bytes.
+ * We use masked writes to avoid overruns, see
+ * https://github.com/simdutf/simdutf/issues/471
+ */
+ while (end - ptr >= 64 + 4) {
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ if (checker.check_next_input(utf8)) { // taken as "block is all ASCII"
+ SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+ output += 64;
+ ptr += 64;
+ continue;
+ }
+ const __m512i lane0 = broadcast_epi128<0>(utf8);
+ const __m512i lane1 = broadcast_epi128<1>(utf8);
+ int valid_count0;
+ __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+ const __m512i lane2 = broadcast_epi128<2>(utf8);
+ int valid_count1;
+ __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+ if (valid_count0 + valid_count1 <= 16) { // both halves fit in one vector
+ vec0 = _mm512_mask_expand_epi32(
+ vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+ valid_count0 += valid_count1;
+ vec0 = expand_utf8_to_utf32(vec0);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ } else {
+ vec0 = expand_utf8_to_utf32(vec0);
+ vec1 = expand_utf8_to_utf32(vec1);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+ }
+ const __m512i lane3 = broadcast_epi128<3>(utf8);
+ int valid_count2;
+ __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
+ uint32_t tmp1;
+ ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); // the 4 bytes just past this block
+ const __m512i lane4 = _mm512_set1_epi32(tmp1);
+ int valid_count3;
+ __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
+ if (valid_count2 + valid_count3 <= 16) {
+ vec2 = _mm512_mask_expand_epi32(
+ vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+ valid_count2 += valid_count3;
+ vec2 = expand_utf8_to_utf32(vec2);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+ } else {
+ vec2 = expand_utf8_to_utf32(vec2);
+ vec3 = expand_utf8_to_utf32(vec3);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+ }
+ ptr += 4 * 16;
+ }
+ const char *validatedptr = ptr; // validated up to ptr
+
+ // For the final pass, we validate 64 bytes, but we only transcode
+ // 3*16 bytes, so we may end up double-validating 16 bytes.
+ if (end - ptr >= 64) {
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ if (checker.check_next_input(utf8)) { // taken as "block is all ASCII"
+ SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+ output += 64;
+ ptr += 64;
+ } else {
+ const __m512i lane0 = broadcast_epi128<0>(utf8);
+ const __m512i lane1 = broadcast_epi128<1>(utf8);
+ int valid_count0;
+ __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+ const __m512i lane2 = broadcast_epi128<2>(utf8);
+ int valid_count1;
+ __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+ if (valid_count0 + valid_count1 <= 16) {
+ vec0 = _mm512_mask_expand_epi32(
+ vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+ valid_count0 += valid_count1;
+ vec0 = expand_utf8_to_utf32(vec0);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ } else {
+ vec0 = expand_utf8_to_utf32(vec0);
+ vec1 = expand_utf8_to_utf32(vec1);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+ }
+
+ const __m512i lane3 = broadcast_epi128<3>(utf8);
+ SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+
+ ptr += 3 * 16; // the last 16 bytes of the block are not transcoded here
+ }
+ validatedptr += 4 * 16; // the checker has seen all 64 bytes either way
+ }
+ if (end != validatedptr) { // feed the remaining bytes to the checker only
+ const __m512i utf8 =
+ _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
+ (const __m512i *)validatedptr);
+ checker.check_next_input(utf8);
+ }
+ checker.check_eof();
+ if (checker.errors()) {
+ return {ptr, nullptr}; // We found an error.
+ }
+ return {ptr, output}; // ptr may be short of end; the tail is not transcoded
+}
+
+// Like validating_utf8_to_fixed_length, but returns as soon as an error is
+// identified. TODO: replace with the utf-8 to utf-16 routine adapted to utf-32.
+// This code is legacy.
+template <endianness big_endian, typename OUTPUT>
+std::tuple<const char *, OUTPUT *, bool>
+validating_utf8_to_fixed_length_with_constant_checks(const char *str,
+ size_t len,
+ OUTPUT *dwords) {
+ constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
+ constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
+ static_assert(
+ UTF32 or UTF16,
+ "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
+ static_assert(!(UTF32 and big_endian),
+ "we do not currently support big-endian UTF-32");
+
+ const char *ptr = str;
+ const char *end = ptr + len;
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809); // swaps bytes per 16-bit unit
+ OUTPUT *output = dwords;
+ avx512_utf8_checker checker{}; // queried after every block in this variant
+ /**
+ * In the main loop, we consume 64 bytes per iteration,
+ * but we access 64 + 4 bytes.
+ */
+ while (end - ptr >= 4 + 64) {
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ bool ascii = checker.check_next_input(utf8);
+ if (checker.errors()) {
+ return {ptr, output, false}; // We found an error.
+ }
+ if (ascii) {
+ SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+ output += 64;
+ ptr += 64;
+ continue;
+ }
+ const __m512i lane0 = broadcast_epi128<0>(utf8);
+ const __m512i lane1 = broadcast_epi128<1>(utf8);
+ int valid_count0;
+ __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+ const __m512i lane2 = broadcast_epi128<2>(utf8);
+ int valid_count1;
+ __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+ if (valid_count0 + valid_count1 <= 16) { // both halves fit in one vector
+ vec0 = _mm512_mask_expand_epi32(
+ vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+ valid_count0 += valid_count1;
+ vec0 = expand_utf8_to_utf32(vec0);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ } else {
+ vec0 = expand_utf8_to_utf32(vec0);
+ vec1 = expand_utf8_to_utf32(vec1);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+ }
+ const __m512i lane3 = broadcast_epi128<3>(utf8);
+ int valid_count2;
+ __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
+ uint32_t tmp1;
+ ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); // the 4 bytes just past this block
+ const __m512i lane4 = _mm512_set1_epi32(tmp1);
+ int valid_count3;
+ __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
+ if (valid_count2 + valid_count3 <= 16) {
+ vec2 = _mm512_mask_expand_epi32(
+ vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+ valid_count2 += valid_count3;
+ vec2 = expand_utf8_to_utf32(vec2);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+ } else {
+ vec2 = expand_utf8_to_utf32(vec2);
+ vec3 = expand_utf8_to_utf32(vec3);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+ }
+ ptr += 4 * 16;
+ }
+ const char *validatedptr = ptr; // validated up to ptr
+
+ // For the final pass, we validate 64 bytes, but we only transcode
+ // 3*16 bytes, so we may end up double-validating 16 bytes.
+ if (end - ptr >= 64) {
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ bool ascii = checker.check_next_input(utf8);
+ if (checker.errors()) {
+ return {ptr, output, false}; // We found an error.
+ }
+ if (ascii) {
+ SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+ output += 64;
+ ptr += 64;
+ } else {
+ const __m512i lane0 = broadcast_epi128<0>(utf8);
+ const __m512i lane1 = broadcast_epi128<1>(utf8);
+ int valid_count0;
+ __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+ const __m512i lane2 = broadcast_epi128<2>(utf8);
+ int valid_count1;
+ __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+ if (valid_count0 + valid_count1 <= 16) {
+ vec0 = _mm512_mask_expand_epi32(
+ vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+ valid_count0 += valid_count1;
+ vec0 = expand_utf8_to_utf32(vec0);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ } else {
+ vec0 = expand_utf8_to_utf32(vec0);
+ vec1 = expand_utf8_to_utf32(vec1);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+ }
+
+ const __m512i lane3 = broadcast_epi128<3>(utf8);
+ SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+
+ ptr += 3 * 16; // the last 16 bytes of the block are not transcoded here
+ }
+ validatedptr += 4 * 16; // the checker has seen all 64 bytes either way
+ }
+ if (end != validatedptr) { // feed the remaining bytes to the checker only
+ const __m512i utf8 =
+ _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
+ (const __m512i *)validatedptr);
+ checker.check_next_input(utf8);
+ }
+ checker.check_eof();
+ if (checker.errors()) {
+ return {ptr, output, false}; // We found an error.
+ }
+ return {ptr, output, true}; // ptr may be short of end; tail not transcoded
+}
diff --git a/contrib/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp b/contrib/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp
new file mode 100644
index 000000000..bff746a54
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp
@@ -0,0 +1,136 @@
+// file included directly
+
+// File contains conversion procedure from VALID UTF-8 strings.
+
+/*
+ valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.
+
+ The `OUTPUT` template type decides what to do with UTF-32: store
+ it directly or convert into UTF-16 (with AVX512).
+
+ Input:
+ - str - valid UTF-8 string
+ - len - string length in bytes
+ - dwords - output buffer
+
+ Result:
+ - pair.first - the first unprocessed input byte (a tail may remain)
+ - pair.second - the first unprocessed output word
+*/
+template <endianness big_endian, typename OUTPUT>
+std::pair<const char *, OUTPUT *>
+valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
+ constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
+ constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
+ static_assert(
+ UTF32 or UTF16,
+ "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
+ static_assert(!(UTF32 and big_endian),
+ "we do not currently support big-endian UTF-32");
+
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809); // swaps bytes per 16-bit unit
+ const char *ptr = str;
+ const char *end = ptr + len;
+
+ OUTPUT *output = dwords;
+ /**
+ * In the main loop, we consume 64 bytes per iteration,
+ * but we access 64 + 4 bytes.
+ * The loop condition end - ptr >= 64 + 4 keeps that read in bounds;
+ * stores use the masked variants (MASKED is true below).
+ */
+ while (end - ptr >= 64 + 4) {
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+ const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); // set bit = non-ASCII byte
+ if (ascii == 0) { // no byte has its high bit set: the block is all ASCII
+ SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+ output += 64;
+ ptr += 64;
+ continue;
+ }
+
+ const __m512i lane0 = broadcast_epi128<0>(utf8);
+ const __m512i lane1 = broadcast_epi128<1>(utf8);
+ int valid_count0;
+ __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+ const __m512i lane2 = broadcast_epi128<2>(utf8);
+ int valid_count1;
+ __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+ if (valid_count0 + valid_count1 <= 16) { // both halves fit in one vector
+ vec0 = _mm512_mask_expand_epi32(
+ vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+ valid_count0 += valid_count1;
+ vec0 = expand_utf8_to_utf32(vec0);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ } else {
+ vec0 = expand_utf8_to_utf32(vec0);
+ vec1 = expand_utf8_to_utf32(vec1);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+ }
+ const __m512i lane3 = broadcast_epi128<3>(utf8);
+ int valid_count2;
+ __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
+ uint32_t tmp1;
+ ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); // the 4 bytes just past this block
+ const __m512i lane4 = _mm512_set1_epi32(tmp1);
+ int valid_count3;
+ __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
+ if (valid_count2 + valid_count3 <= 16) {
+ vec2 = _mm512_mask_expand_epi32(
+ vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+ valid_count2 += valid_count3;
+ vec2 = expand_utf8_to_utf32(vec2);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+ } else {
+ vec2 = expand_utf8_to_utf32(vec2);
+ vec3 = expand_utf8_to_utf32(vec3);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+ }
+ ptr += 4 * 16;
+ }
+
+ if (end - ptr >= 64) { // one more block, without reading past its 64 bytes
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+ const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); // set bit = non-ASCII byte
+ if (ascii == 0) {
+ SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+ output += 64;
+ ptr += 64;
+ } else {
+ const __m512i lane0 = broadcast_epi128<0>(utf8);
+ const __m512i lane1 = broadcast_epi128<1>(utf8);
+ int valid_count0;
+ __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+ const __m512i lane2 = broadcast_epi128<2>(utf8);
+ int valid_count1;
+ __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+ if (valid_count0 + valid_count1 <= 16) {
+ vec0 = _mm512_mask_expand_epi32(
+ vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+ valid_count0 += valid_count1;
+ vec0 = expand_utf8_to_utf32(vec0);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ } else {
+ vec0 = expand_utf8_to_utf32(vec0);
+ vec1 = expand_utf8_to_utf32(vec1);
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+ SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+ }
+
+ const __m512i lane3 = broadcast_epi128<3>(utf8);
+ SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+
+ ptr += 3 * 16; // the last 16 bytes of the block are not transcoded here
+ }
+ }
+ return {ptr, output}; // any input left between ptr and end is not converted
+}
+
+using utf8_to_utf16_result = std::pair<const char *, char16_t *>; // (input pos, output pos)
diff --git a/contrib/simdutf/src/icelake/icelake_macros.inl.cpp b/contrib/simdutf/src/icelake/icelake_macros.inl.cpp
new file mode 100644
index 000000000..cc694e817
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_macros.inl.cpp
@@ -0,0 +1,143 @@
+
+/*
+ This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a
+ UTF-8 string) and loads every 4-byte substring that starts within the first
+ 16 bytes into one 32-bit lane of an AVX512 register.
+
+ For example if we have bytes abcdefgh... we create following 32-bit lanes
+
+ [abcd|bcde|cdef|defg|efgh|...]
+ ^ ^
+ byte 0 of reg byte 63 of reg
+*/
+/** pshufb
+ # lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15]
+ # lane3 has got bytes: [16, 17, 18, 19, 4, 5, 6, 7, 8, ..., 15]
+
+ expand_ver2 = [
+ # lane 0:
+ 0, 1, 2, 3,
+ 1, 2, 3, 4,
+ 2, 3, 4, 5,
+ 3, 4, 5, 6,
+
+ # lane 1:
+ 4, 5, 6, 7,
+ 5, 6, 7, 8,
+ 6, 7, 8, 9,
+ 7, 8, 9, 10,
+
+ # lane 2:
+ 8, 9, 10, 11,
+ 9, 10, 11, 12,
+ 10, 11, 12, 13,
+ 11, 12, 13, 14,
+
+ # lane 3 (logical bytes 12..15, 13..16, 14..17, 15..18; 16..18 sit at 0..2):
+ 12, 13, 14, 15, 13, 14, 15, 0, 14, 15, 0, 1, 15, 0, 1, 2,
+ ]
+*/
+
+#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \
+ { /* uses UTF32, big_endian, byteflip and output from the expansion site */ \
+ const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); /* 32-bit element 12 comes from LANE1 */ \
+ const __m512i expand_ver2 = _mm512_setr_epi64( \
+ 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, \
+ 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, \
+ 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); \
+ const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \
+ \
+ __mmask16 leading_bytes; /* lanes whose lowest byte is not 10xxxxxx */ \
+ const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \
+ const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \
+ const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \
+ leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \
+ \
+ __m512i char_class; \
+ char_class = _mm512_srli_epi32(input, 4); \
+ /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \
+ const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \
+ const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \
+ char_class = \
+ _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \
+ \
+ const int valid_count = static_cast<int>(count_ones(leading_bytes)); \
+ const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \
+ \
+ const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), \
+ leading_bytes, utf32); /* pack selected lanes to the front */ \
+ \
+ if (UTF32) { \
+ if (MASKED) { \
+ const __mmask16 valid = uint16_t((1 << valid_count) - 1); \
+ _mm512_mask_storeu_epi32((__m512i *)output, valid, out); \
+ } else { \
+ _mm512_storeu_si512((__m512i *)output, out); \
+ } \
+ output += valid_count; \
+ } else { /* UTF-16: advance by what the converter returns */ \
+ if (MASKED) { \
+ output += utf32_to_utf16_masked<big_endian>( \
+ byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
+ } else { \
+ output += utf32_to_utf16<big_endian>( \
+ byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
+ } \
+ } \
+ }
+
+#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \
+ { /* uses UTF32, big_endian, byteflip and output from the expansion site */ \
+ if (UTF32) { \
+ if (MASKED) { /* store only the first VALID_COUNT 32-bit elements */ \
+ const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1); \
+ _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT); \
+ } else { \
+ _mm512_storeu_si512((__m512i *)output, INPUT); \
+ } \
+ output += VALID_COUNT; \
+ } else { /* UTF-16: advance by what the converter returns */ \
+ if (MASKED) { \
+ output += utf32_to_utf16_masked<big_endian>( \
+ byteflip, INPUT, VALID_COUNT, \
+ reinterpret_cast<char16_t *>(output)); \
+ } else { \
+ output += \
+ utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, \
+ reinterpret_cast<char16_t *>(output)); \
+ } \
+ } \
+ }
+
+#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) /* widens and stores 64 bytes; output is not advanced */ \
+ if (UTF32) { \
+ const __m128i t0 = _mm512_castsi512_si128(utf8); \
+ const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \
+ const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2); \
+ const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \
+ _mm512_storeu_si512((__m512i *)(output + 0 * 16), \
+ _mm512_cvtepu8_epi32(t0)); \
+ _mm512_storeu_si512((__m512i *)(output + 1 * 16), \
+ _mm512_cvtepu8_epi32(t1)); \
+ _mm512_storeu_si512((__m512i *)(output + 2 * 16), \
+ _mm512_cvtepu8_epi32(t2)); \
+ _mm512_storeu_si512((__m512i *)(output + 3 * 16), \
+ _mm512_cvtepu8_epi32(t3)); \
+ } else { /* 16-bit output: two stores of 32 zero-extended units each */ \
+ const __m256i h0 = _mm512_castsi512_si256(utf8); \
+ const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \
+ if (big_endian) { /* big_endian and byteflip come from the expansion site */ \
+ _mm512_storeu_si512( \
+ (__m512i *)(output + 0 * 16), \
+ _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
+ _mm512_storeu_si512( \
+ (__m512i *)(output + 2 * 16), \
+ _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
+ } else { \
+ _mm512_storeu_si512((__m512i *)(output + 0 * 16), \
+ _mm512_cvtepu8_epi16(h0)); \
+ _mm512_storeu_si512((__m512i *)(output + 2 * 16), \
+ _mm512_cvtepu8_epi16(h1)); \
+ } \
+ }
diff --git a/contrib/simdutf/src/icelake/icelake_utf32_validation.inl.cpp b/contrib/simdutf/src/icelake/icelake_utf32_validation.inl.cpp
new file mode 100644
index 000000000..0e37d3f5e
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_utf32_validation.inl.cpp
@@ -0,0 +1,35 @@
+// file included directly
+
+const char32_t *validate_utf32(const char32_t *buf, size_t len) {
+ if (len < 16) {
+ return buf; // too short for one 16-unit vector; nothing is checked here
+ }
+ const char32_t *end = buf + len - 16; // last start of a full 16-unit load
+
+ const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); // 0xD800..0xDFFF + offset = 0xfffff800..0xffffffff
+ __m512i currentmax = _mm512_setzero_si512();
+ __m512i currentoffsetmax = _mm512_setzero_si512();
+
+ while (buf <= end) { // track running unsigned maxima; checks happen after
+ __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
+ buf += 16;
+ currentoffsetmax =
+ _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
+ currentmax = _mm512_max_epu32(utf32, currentmax);
+ }
+
+ const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
+ const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
+ __m512i is_zero =
+ _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
+ if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
+ return nullptr; // some value exceeds 0x10ffff
+ }
+ is_zero = _mm512_xor_si512(
+ _mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
+ if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
+ return nullptr; // some value lies in the surrogate range 0xD800..0xDFFF
+ }
+
+ return buf; // first element not covered by the vector loop
+}
diff --git a/contrib/simdutf/src/icelake/icelake_utf8_common.inl.cpp b/contrib/simdutf/src/icelake/icelake_utf8_common.inl.cpp
new file mode 100644
index 000000000..7eade34ad
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_utf8_common.inl.cpp
@@ -0,0 +1,796 @@
+// Common procedures for both validating and non-validating conversions from
+// UTF-8.
+enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL }; // full 64-byte block vs. partial tail
+
+using utf8_to_utf16_result = std::pair<const char *, char16_t *>; // (input pos, output pos)
+using utf8_to_utf32_result = std::pair<const char *, uint32_t *>;
+
+/*
+ process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
+ to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
+ might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
+ indicates how many input bytes are relevant.
+
+ Returns true when the result is correct, otherwise it returns false.
+
+ The provided in and out pointers are advanced according to how many input
+ bytes have been processed, upon success.
+*/
+template <block_processing_mode tail, endianness big_endian>
+simdutf_really_inline bool
+process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
+ // constants
+ __m512i mask_identity = _mm512_set_epi8(
+ 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46,
+ 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
+ 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
+ __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
+ __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
+ __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(
+ 0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
+ 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
+ 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
+ __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
+ __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
+ __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
+ __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
+ __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ // Note that 'tail' is a compile-time constant !
+ __mmask64 b =
+ (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
+ __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in)
+ : _mm512_maskz_loadu_epi8(b, in);
+ __mmask64 m1 = (tail == SIMDUTF_FULL)
+ ? _mm512_cmplt_epu8_mask(input, mask_80808080)
+ : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
+ if (_ktestc_mask64_u8(m1,
+ b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
+ // alternatively, we could do 'if (m1 == b) { '
+ if (tail == SIMDUTF_FULL) {
+ in += 64; // consumed 64 bytes
+ // we convert a full 64-byte block, writing 128 bytes.
+ __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+ if (big_endian) {
+ input1 = _mm512_shuffle_epi8(input1, byteflip);
+ }
+ _mm512_storeu_si512(out, input1);
+ out += 32;
+ __m512i input2 =
+ _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+ if (big_endian) {
+ input2 = _mm512_shuffle_epi8(input2, byteflip);
+ }
+ _mm512_storeu_si512(out, input2);
+ out += 32;
+ return true; // we are done
+ } else {
+ in += gap;
+ if (gap <= 32) {
+ __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+ if (big_endian) {
+ input1 = _mm512_shuffle_epi8(input1, byteflip);
+ }
+ _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1),
+ input1);
+ out += gap;
+ } else {
+ __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+ if (big_endian) {
+ input1 = _mm512_shuffle_epi8(input1, byteflip);
+ }
+ _mm512_storeu_si512(out, input1);
+ out += 32;
+ __m512i input2 =
+ _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+ if (big_endian) {
+ input2 = _mm512_shuffle_epi8(input2, byteflip);
+ }
+ _mm512_mask_storeu_epi16(
+ out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
+ out += gap - 32;
+ }
+ return true; // we are done
+ }
+ }
+ // classify characters further
+ __mmask64 m234 = _mm512_cmp_epu8_mask(
+ mask_c0c0c0c0, input,
+ _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
+ __mmask64 m34 =
+ _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
+ _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte
+
+ __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(
+ m234, input, mask_c2c2c2c2,
+ _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
+ // Overlong 2-byte sequence
+ if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
+ // Overlong 2-byte sequence
+ return false;
+ }
+ if (_ktestz_mask64_u8(m34, m34) == 0) {
+ // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a
+ // 4-byte sequence!
+ __mmask64 m4 = _mm512_cmp_epu8_mask(
+ input, mask_f0f0f0f0,
+ _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
+
+ __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL)
+ ? _knot_mask64(m1)
+ : _kand_mask64(_knot_mask64(m1), b);
+
+ __mmask64 mp1 = _kshiftli_mask64(m234, 1);
+ __mmask64 mp2 = _kshiftli_mask64(m34, 2);
+ // We could do it as follows...
+ // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit
+ // masks a and b and return 1 if all zeroes but GCC generates better code
+ // when we do:
+ if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and
+ // return 1 if all zeroes
+ // Fast path with 1,2,3 bytes
+ __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
+ __mmask64 m1234 = _kor_mask64(m1, m234);
+ // mismatched continuation bytes:
+ if (tail == SIMDUTF_FULL) {
+ __mmask64 xnormcm1234 = _kxnor_mask64(
+ mc,
+ m1234); // XNOR of mc and m1234 should be all zero if they differ
+ // the presence of a 1 bit indicates that they overlap.
+ // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return
+ // 1 if all zeroes.
+ if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+ return false;
+ }
+ } else {
+ __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+ if (mc != bxorm1234) {
+ return false;
+ }
+ }
+ // mend: identifying the last bytes of each sequence to be decoded
+ __mmask64 mend = _kshiftri_mask64(m1234, 1);
+ if (tail != SIMDUTF_FULL) {
+ mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
+ }
+
+ __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+ __m512i last_and_thirdu16 =
+ _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+ __m512i nonasciitags = _mm512_maskz_mov_epi8(
+ mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000
+ __m512i clearedbytes = _mm512_andnot_si512(
+ nonasciitags, input); // high two bits cleared where not ASCII
+ __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
+ 0x5555555555555555, last_and_thirdu16,
+ clearedbytes); // the last byte of each character
+
+ __mmask64 mask_before_non_ascii = _kshiftri_mask64(
+ mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+ __m512i indexofsecondlastbytes = _mm512_add_epi16(
+ mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+ __m512i beforeasciibytes =
+ _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+ __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
+ 0x5555555555555555, indexofsecondlastbytes,
+ beforeasciibytes); // the second last bytes (of two, three byte seq,
+ // surrogates)
+ secondlastbytes =
+ _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+
+ __m512i indexofthirdlastbytes = _mm512_add_epi16(
+ mask_ffffffff,
+ indexofsecondlastbytes); // indices of the second last bytes
+ __m512i thirdlastbyte =
+ _mm512_maskz_mov_epi8(m34,
+ clearedbytes); // only those that are the third
+ // last byte of a sequence
+ __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
+ 0x5555555555555555, indexofthirdlastbytes,
+ thirdlastbyte); // the third last bytes (of three byte sequences, hi
+ // surrogate)
+ thirdlastbytes =
+ _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+ __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes,
+ thirdlastbytes, 254);
+ // the elements of Wout excluding the last element if it happens to be a
+ // high surrogate:
+
+ __mmask64 mprocessed =
+ (tail == SIMDUTF_FULL)
+ ? _pdep_u64(0xFFFFFFFF, mend)
+ : _pdep_u64(
+ 0xFFFFFFFF,
+ _kand_mask64(
+ mend, b)); // we adjust mend at the end of the output.
+
+ // Encodings out of range...
+ {
+ // the location of 3-byte sequence start bytes in the input
+ __mmask64 m3 = m34 & (b ^ m4);
+ // code units in Wout corresponding to 3-byte sequences.
+ __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+ __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+ __mmask32 Msmall800 =
+ _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+ __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+ __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+ __mmask32 M3s =
+ _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+ if (_kor_mask32(Msmall800, M3s)) {
+ return false;
+ }
+ }
+ int64_t nout = _mm_popcnt_u64(mprocessed);
+ in += 64 - _lzcnt_u64(mprocessed);
+ if (big_endian) {
+ Wout = _mm512_shuffle_epi8(Wout, byteflip);
+ }
+ _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+ out += nout;
+ return true; // ok
+ }
+ //
+ // We have a 4-byte sequence, this is the general case.
+ // Slow!
+ __mmask64 mp3 = _kshiftli_mask64(m4, 3);
+ __mmask64 mc =
+ _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
+ __mmask64 m1234 = _kor_mask64(m1, m234);
+
+ // mend: identifying the last bytes of each sequence to be decoded
+ __mmask64 mend =
+ _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
+ if (tail != SIMDUTF_FULL) {
+ mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
+ }
+ __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+ __m512i last_and_thirdu16 =
+ _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+ __m512i nonasciitags = _mm512_maskz_mov_epi8(
+ mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000
+ __m512i clearedbytes = _mm512_andnot_si512(
+ nonasciitags, input); // high two bits cleared where not ASCII
+ __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
+ 0x5555555555555555, last_and_thirdu16,
+ clearedbytes); // the last byte of each character
+
+ __mmask64 mask_before_non_ascii = _kshiftri_mask64(
+ mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+ __m512i indexofsecondlastbytes = _mm512_add_epi16(
+ mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+ __m512i beforeasciibytes =
+ _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+ __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
+ 0x5555555555555555, indexofsecondlastbytes,
+ beforeasciibytes); // the second last bytes (of two, three byte seq,
+ // surrogates)
+ secondlastbytes =
+ _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+
+ __m512i indexofthirdlastbytes = _mm512_add_epi16(
+ mask_ffffffff,
+ indexofsecondlastbytes); // indices of the second last bytes
+ __m512i thirdlastbyte = _mm512_maskz_mov_epi8(
+ m34,
+ clearedbytes); // only those that are the third last byte of a sequence
+ __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
+ 0x5555555555555555, indexofthirdlastbytes,
+ thirdlastbyte); // the third last bytes (of three byte sequences, hi
+ // surrogate)
+ thirdlastbytes =
+ _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+ __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(
+ lastbytes, secondlastbytes, thirdlastbytes, 254);
+ uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
+ __mmask32 Mlo = __mmask32(Mlo_uint64);
+ __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
+ __m512i lo_surr_mask = _mm512_maskz_mov_epi16(
+ Mlo,
+ mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000
+ __m512i shifted4_thirdsecondandlastbytes =
+ _mm512_srli_epi16(thirdsecondandlastbytes,
+ 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1
+ __m512i tagged_lo_surrogates = _mm512_or_si512(
+ thirdsecondandlastbytes,
+ lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged
+ __m512i Wout = _mm512_mask_add_epi16(
+ tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
+ mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged
+ // the elements of Wout excluding the last element if it happens to be a
+ // high surrogate:
+ __mmask32 Mout = ~(Mhi & 0x80000000);
+ __mmask64 mprocessed =
+ (tail == SIMDUTF_FULL)
+ ? _pdep_u64(Mout, mend)
+ : _pdep_u64(
+ Mout,
+ _kand_mask64(mend,
+ b)); // we adjust mend at the end of the output.
+
+ // mismatched continuation bytes:
+ if (tail == SIMDUTF_FULL) {
+ __mmask64 xnormcm1234 = _kxnor_mask64(
+ mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+ // the presence of a 1 bit indicates that they overlap.
+ // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1
+ // if all zeroes.
+ if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+ return false;
+ }
+ } else {
+ __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+ if (mc != bxorm1234) {
+ return false;
+ }
+ }
+ // Encodings out of range...
+ {
+ // the location of 3-byte sequence start bytes in the input
+ __mmask64 m3 = m34 & (b ^ m4);
+ // code units in Wout corresponding to 3-byte sequences.
+ __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+ __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+ __mmask32 Msmall800 =
+ _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+ __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+ __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+ __mmask32 M3s =
+ _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+ __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
+ __mmask32 M4s =
+ _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
+ if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
+ return false;
+ }
+ }
+ in += 64 - _lzcnt_u64(mprocessed);
+ int64_t nout = _mm_popcnt_u64(mprocessed);
+ if (big_endian) {
+ Wout = _mm512_shuffle_epi8(Wout, byteflip);
+ }
+ _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+ out += nout;
+ return true; // ok
+ }
+ // Fast path 2: all ASCII or 2 byte
+ __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL)
+ ? _knot_mask64(m234)
+ : _kand_mask64(_knot_mask64(m234), b);
+ // on top of -0xc0 we subtract -2 which we get back later of the
+ // continuation byte tags
+ __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
+ __mmask64 leading = tail == (tail == SIMDUTF_FULL)
+ ? _kor_mask64(m1, m234)
+ : _kand_mask64(_kor_mask64(m1, m234),
+ b); // first bytes of each sequence
+ if (tail == SIMDUTF_FULL) {
+ __mmask64 xnor234leading =
+ _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
+ if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
+ return false;
+ }
+ } else {
+ __mmask64 bxorleading = _kxor_mask64(b, leading);
+ if (_kshiftli_mask64(m234, 1) != bxorleading) {
+ return false;
+ }
+ }
+ //
+ if (tail == SIMDUTF_FULL) {
+ // In the two-byte/ASCII scenario, we are easily latency bound, so we want
+ // to increment the input buffer as quickly as possible.
+ // We process 32 bytes unless the byte at index 32 is a continuation byte,
+ // in which case we include it as well for a total of 33 bytes.
+ // Note that if x is an ASCII byte, then the following is false:
+ // int8_t(x) <= int8_t(0xc0) under two's complement.
+ in += 32;
+ if (int8_t(*in) <= int8_t(0xc0))
+ in++;
+ // The alternative is to do
+ // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+ // but it requires loading the input, doing the mask computation, and
+ // converting back the mask to a general register. It just takes too long,
+ // leaving the processor likely to be idle.
+ } else {
+ in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+ }
+ __m512i lead = _mm512_maskz_compress_epi8(
+ leading, leading2byte); // will contain zero for ascii, and the data
+ lead = _mm512_cvtepu8_epi16(
+ _mm512_castsi512_si256(lead)); // ... zero extended into code units
+ __m512i follow = _mm512_maskz_compress_epi8(
+ continuation_or_ascii, input); // the last bytes of each sequence
+ follow = _mm512_cvtepu8_epi16(
+ _mm512_castsi512_si256(follow)); // ... zero extended into code units
+ lead = _mm512_slli_epi16(lead, 6); // shifted into position
+ __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
+
+ if (big_endian) {
+ final = _mm512_shuffle_epi8(final, byteflip);
+ }
+ if (tail == SIMDUTF_FULL) {
+ // Next part is UTF-16 specific and can be generalized to UTF-32.
+ int nout = _mm_popcnt_u32(uint32_t(leading));
+ _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+ out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+ } else {
+ int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
+ _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+ out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+ }
+
+ return true; // we are fine.
+}
+
+/*
+  utf32_to_utf16_masked converts the `count` lowest UTF-32 code units of
+  `utf32` into UTF-16 and stores the result at `output`. It differs from
+  utf32_to_utf16 in that every store is masked, so only the code units that
+  are actually produced are written.
+
+  Assumes count <= 16 (the validity mask below is 16 bits wide).
+
+  Returns how many 16-bit code units were stored.
+
+  byteflip is used for flipping 16-bit code units, and it should be
+        __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  We pass it to the (always inlined) function to encourage the compiler to
+  keep the value in a (constant) register.
+*/
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip,
+                                                   __m512i utf32,
+                                                   unsigned int count,
+                                                   char16_t *output) {
+
+  // one bit per input lane that carries data
+  const __mmask16 valid = uint16_t((1 << count) - 1);
+  // 1. check if we have any surrogate pairs (code points above 0xffff)
+  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
+  const __mmask16 sp_mask =
+      _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
+
+  if (sp_mask == 0) {
+    // Every code point fits in one code unit: narrow 32 -> 16 bits and store
+    // only the `count` valid code units.
+    if (big_endian) {
+      _mm256_mask_storeu_epi16(
+          (__m256i *)output, valid,
+          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
+                              _mm512_castsi512_si256(byteflip)));
+
+    } else {
+      _mm256_mask_storeu_epi16((__m256i *)output, valid,
+                               _mm512_cvtepi32_epi16(utf32));
+    }
+    return count;
+  }
+
+  // Each lane flagged in sp_mask produces two code units instead of one.
+  const unsigned int nout =
+      count + static_cast<unsigned int>(count_ones(sp_mask));
+
+  {
+    // build surrogate pair code units in 32-bit lanes
+
+    // t0 = per lane [000000000000aaaa|aaaaaabbbbbbbbbb]
+    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
+    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
+
+    // t1 = per lane [000000aaaaaaaaaa|bbbbbbbbbb000000]
+    const __m512i t1 = _mm512_slli_epi32(t0, 6);
+
+    // t2 = per lane [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from
+    //      t1 to t0
+    //      0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
+    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
+    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
+
+    // t3 = per lane [110110aaaaaaaaaa|110111bbbbbbbbbb] -- tag both halves
+    //      0xba = (t2 and not v_fc00_fc00) or v_d800_dc00
+    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
+    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
+    const __m512i t3 =
+        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
+    // surrogate pairs where needed, the untouched code point elsewhere
+    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
+    // swap the 16-bit halves so the high surrogate comes first in memory
+    __m512i t5 = _mm512_ror_epi32(t4, 16);
+    // Here we want to trim all of the upper 16-bit code units from the 2-byte
+    // characters represented as 4-byte values. We can compute it from
+    // sp_mask or the following... It can be more optimized!
+    const __mmask32 nonzero = _kor_mask32(
+        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+    const __mmask32 nonzero_masked =
+        _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
+    if (big_endian) {
+      t5 = _mm512_shuffle_epi8(t5, byteflip);
+    }
+    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
+    // (zen4)
+    __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
+    // The store mask is built in 64 bits: nout may reach 32, and shifting a
+    // 32-bit int by 31 or 32 positions is undefined behavior.
+    _mm512_mask_storeu_epi16(output, __mmask32((uint64_t(1) << nout) - 1),
+                             compressed);
+  }
+
+  return nout;
+}
+
+/*
+  utf32_to_utf16 converts `count` lower UTF-32 code units
+  from input `utf32` into UTF-16. It may overflow: when no surrogate pair is
+  needed, a full 256-bit store (16 code units) is issued regardless of
+  `count`. Surrogate detection looks at all 16 lanes of `utf32`.
+
+  Returns how many 16-bit code units were stored.
+
+  byteflip is used for flipping 16-bit code units, and it should be
+        __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  We pass it to the (always inlined) function to encourage the compiler to
+  keep the value in a (constant) register.
+*/
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip,
+                                            __m512i utf32, unsigned int count,
+                                            char16_t *output) {
+  // check if we have any surrogate pairs (code points above 0xffff)
+  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
+  const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+
+  if (sp_mask == 0) {
+    // technically, it should be _mm256_storeu_epi16
+    if (big_endian) {
+      _mm256_storeu_si256(
+          (__m256i *)output,
+          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
+                              _mm512_castsi512_si256(byteflip)));
+    } else {
+      _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32));
+    }
+    return count;
+  }
+
+  // Each lane flagged in sp_mask produces two code units instead of one.
+  const unsigned int nout =
+      count + static_cast<unsigned int>(count_ones(sp_mask));
+
+  {
+    // build surrogate pair code units in 32-bit lanes
+
+    // t0 = per lane [000000000000aaaa|aaaaaabbbbbbbbbb]
+    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
+    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
+
+    // t1 = per lane [000000aaaaaaaaaa|bbbbbbbbbb000000]
+    const __m512i t1 = _mm512_slli_epi32(t0, 6);
+
+    // t2 = per lane [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from
+    //      t1 to t0
+    //      0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
+    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
+    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
+
+    // t3 = per lane [110110aaaaaaaaaa|110111bbbbbbbbbb] -- tag both halves
+    //      0xba = (t2 and not v_fc00_fc00) or v_d800_dc00
+    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
+    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
+    const __m512i t3 =
+        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
+    // surrogate pairs where needed, the untouched code point elsewhere
+    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
+    // swap the 16-bit halves so the high surrogate comes first in memory
+    __m512i t5 = _mm512_ror_epi32(t4, 16);
+    // keep every odd 16-bit element, and the even ones only when nonzero
+    const __mmask32 nonzero = _kor_mask32(
+        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+    if (big_endian) {
+      t5 = _mm512_shuffle_epi8(t5, byteflip);
+    }
+    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
+    // (zen4)
+    __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
+    // The store mask is built in 64 bits: with 16 supplementary code points
+    // nout is 32, and `1 << 32` on a 32-bit int is undefined behavior (it
+    // typically yields an empty mask, i.e. nothing would be stored).
+    _mm512_mask_storeu_epi16(output, __mmask32((uint64_t(1) << nout) - 1),
+                             compressed);
+  }
+
+  return nout;
+}
+
+/**
+ * Returns the last N bytes of `previous` followed by the first 64-N bytes of
+ * `input`; in other words, byte i of the result is the byte located N
+ * positions before byte i of `input` in the stream `previous` ++ `input`.
+ *
+ * N must be in 0..16: the final step is a per-128-bit-lane byte alignment
+ * whose shift amount is 16 - N.
+ */
+template <int N> __m512i prev(__m512i input, __m512i previous) {
+  static_assert(N >= 0 && N <= 16, "N must be in range 0..16");
+  // 32-bit indices 28..31 select the last 128-bit lane of `previous`
+  // (second table), indices 0..11 select the first three lanes of `input`.
+  const __m512i movemask =
+      _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+  // rotated = [previous lane 3, input lane 0, input lane 1, input lane 2]
+  const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
+#if SIMDUTF_GCC8 || SIMDUTF_GCC9
+  constexpr int shift = 16 - N; // workaround for GCC8,9
+  return _mm512_alignr_epi8(input, rotated, shift);
+#else
+  return _mm512_alignr_epi8(input, rotated, 16 - N);
+#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
+}
+
+/**
+ * Rearranges the four 128-bit lanes of `v`: lane k of the result is lane
+ * idxk of `v`. Each index must be in 0..3.
+ */
+template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
+__m512i shuffle_epi128(__m512i v) {
+  // The indices are unsigned, so only the upper bound needs checking.
+  static_assert(idx0 <= 3, "idx0 must be in range 0..3");
+  static_assert(idx1 <= 3, "idx1 must be in range 0..3");
+  static_assert(idx2 <= 3, "idx2 must be in range 0..3");
+  static_assert(idx3 <= 3, "idx3 must be in range 0..3");
+
+  // two selector bits per destination lane, lowest lane first
+  constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6);
+  return _mm512_shuffle_i32x4(v, v, shuffle);
+}
+
+// Copies the 128-bit lane number `idx` of `v` into all four lanes of the
+// result by forwarding to shuffle_epi128 with the same index four times.
+// NOTE(review): declared constexpr although shuffle_epi128 is not; the call
+// is presumably never meant to be constant-evaluated -- confirm.
+template <unsigned idx> constexpr __m512i broadcast_epi128(__m512i v) {
+ return shuffle_epi128<idx, idx, idx, idx>(v);
+}
+
+/**
+ * Currently unused.
+ *
+ * Rotates the 64 bytes of `input` towards lower indices: each 128-bit lane is
+ * concatenated with its successor lane (wrapping around after the last one)
+ * and the combined value is shifted right by N bytes.
+ */
+template <int N> __m512i rotate_by_N_epi8(const __m512i input) {
+  // 0x39 == 0b00_11_10_01: result lanes are taken from source lanes 1, 2, 3, 0
+  return _mm512_alignr_epi8(_mm512_shuffle_i32x4(input, input, 0x39), input,
+                            N);
+}
+
+/*
+ expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
+ stored at separate 32-bit lanes into UTF-32 code points, one per lane.
+
+ For each lane we have also a character class (`char_class`), given in form
+ 0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets
+ corresponding bytes during pshufb.
+*/
+simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class,
+ __m512i utf8) {
+ /*
+ Input:
+ - utf8: bytes stored at separate 32-bit code units
+ - char_class: per-lane selector used below to look up the shift amounts
+
+ Bit layout of single word. We show 4 cases for each possible
+ UTF-8 character encoding. The `?` denotes bits we must not
+ assume their value.
+
+ |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
+ |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
+ |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
+ |????.????|????.????|????.????|0aaa.aaaa| ASCII char
+   byte 3    byte 2    byte 1    byte 0
+ */
+
+ /* 1. Reset control bits of continuation bytes and the MSB
+ of the leading byte; this makes all bytes unsigned (and
+ does not alter ASCII char).
+
+ |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
+ |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
+ |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
+ |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
+  ^^        ^^        ^^        ^
+ */
+ __m512i values;
+ const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
+ values = _mm512_and_si512(utf8, v_3f3f_3f7f);
+
+ /* 2. Swap and join fields A-B and C-D
+
+ |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
+ |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
+ |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
+ |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
+ // per 16-bit pair: low byte * 0x40 + high byte * 0x01
+ const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
+ values = _mm512_maddubs_epi16(values, v_0140_0140);
+
+ /* 3. Swap and join fields AB & CD
+
+ |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
+ |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
+ |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
+ |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
+ // per 32-bit lane: low word * 0x1000 + high word * 0x0001
+ const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
+ values = _mm512_madd_epi16(values, v_0001_1000);
+
+ /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
+ |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
+ |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
+ |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
+ |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
+ {
+ /** pshufb
+
+ continuation = 0
+ ascii = 7
+ _2_bytes = 9
+ _3_bytes = 10
+ _4_bytes = 11
+
+ shift_left_v3 = 4 * [
+ ascii, # 0000
+ ascii, # 0001
+ ascii, # 0010
+ ascii, # 0011
+ ascii, # 0100
+ ascii, # 0101
+ ascii, # 0110
+ ascii, # 0111
+ continuation, # 1000
+ continuation, # 1001
+ continuation, # 1010
+ continuation, # 1011
+ _2_bytes, # 1100
+ _2_bytes, # 1101
+ _3_bytes, # 1110
+ _4_bytes, # 1111
+ ] */
+ const __m512i shift_left_v3 = _mm512_setr_epi64(
+ 0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707,
+ 0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000,
+ 0x0707070707070707, 0x0b0a090900000000);
+
+ // char_class indexes the 16-entry table above, once per 32-bit lane
+ const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
+ values = _mm512_sllv_epi32(values, shift);
+ }
+
+ /* 5. Shift right the values by variable amounts to reset lowest bits
+ |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
+ |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
+ |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
+ |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
+ {
+ // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
+ const __m512i shift_right = _mm512_setr_epi64(
+ 0x1919191919191919, 0x0b10151500000000, 0x1919191919191919,
+ 0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000,
+ 0x1919191919191919, 0x0b10151500000000);
+
+ // same lookup as in step 4, with the right-shift table
+ const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
+ values = _mm512_srlv_epi32(values, shift);
+ }
+
+ return values;
+}
+
+/*
+  expand_and_identify spreads overlapping 4-byte windows of the input into
+  32-bit lanes, keeps only the windows whose first byte is not a UTF-8
+  continuation byte (10xx.xxxx), packs them to the front of the result and
+  zero-fills the rest.
+
+  `count` receives the number of windows kept.
+
+  NOTE(review): `lane0` and `lane1` presumably hold a 16-byte chunk and the
+  following one, each replicated in every 128-bit lane -- confirm at callers.
+*/
+simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1,
+                                                  int &count) {
+  const __m512i top_two_bits = _mm512_set1_epi32(0xc0);
+  const __m512i continuation_tag = _mm512_set1_epi32(0x80);
+  // Within each 128-bit lane, every 32-bit element gets four consecutive
+  // bytes of that lane, starting one byte later than the previous element
+  // (indices wrap around modulo 16).
+  const __m512i window_indices = _mm512_setr_epi64(
+      0x0403020103020100, 0x0605040305040302, 0x0807060507060504,
+      0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,
+      0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);
+
+  // 32-bit element 12 (first element of the last 128-bit lane) comes from
+  // lane1; everything else comes from lane0.
+  const __m512i spliced = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
+  const __m512i windows = _mm512_shuffle_epi8(spliced, window_indices);
+
+  // A window starts a character when the two top bits of its first byte are
+  // anything but `10`.
+  const __m512i first_byte_tag = _mm512_and_si512(windows, top_two_bits);
+  const __mmask16 starts_character =
+      _mm512_cmpneq_epu32_mask(first_byte_tag, continuation_tag);
+
+  count = static_cast<int>(count_ones(starts_character));
+  return _mm512_mask_compress_epi32(_mm512_setzero_si512(), starts_character,
+                                    windows);
+}
+
+/*
+  expand_utf8_to_utf32 derives the per-lane character class from the high
+  nibble of each lane's lowest byte and hands both to
+  expanded_utf8_to_utf32.
+*/
+simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
+  const __m512i low_nibble = _mm512_set1_epi32(0x0f);
+  const __m512i upper_bytes_tag = _mm512_set1_epi32(0x80808000);
+  const __m512i shifted = _mm512_srli_epi32(input, 4);
+  // 0xea = (shifted and low_nibble) or upper_bytes_tag,
+  // i.e. ((input >> 4) & 0x0f) | 0x80808000
+  const __m512i char_class =
+      _mm512_ternarylogic_epi32(shifted, low_nibble, upper_bytes_tag, 0xea);
+  return expanded_utf8_to_utf32(char_class, input);
+}
diff --git a/contrib/simdutf/src/icelake/icelake_utf8_validation.inl.cpp b/contrib/simdutf/src/icelake/icelake_utf8_validation.inl.cpp
new file mode 100644
index 000000000..e00563dc4
--- /dev/null
+++ b/contrib/simdutf/src/icelake/icelake_utf8_validation.inl.cpp
@@ -0,0 +1,116 @@
+// file included directly
+
+// Combines three 16-entry byte lookups and returns their bitwise AND:
+//  - the high nibble of each `prev1` byte,
+//  - the low nibble of each `prev1` byte,
+//  - the high nibble of each `input` byte.
+// Each table is repeated in all four 128-bit lanes because the byte shuffle
+// works per lane. A bit survives in the result only if all three lookups set
+// it.
+// NOTE(review): presumably every bit stands for one class of invalid
+// two-byte combination (the caller ORs the outcome into its error state) --
+// the meaning of the individual table bits is not visible here; confirm.
+simdutf_really_inline __m512i check_special_cases(__m512i input,
+ const __m512i prev1) {
+ __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080,
+ 0x0202020202020202, 0x4915012180808080,
+ 0x0202020202020202, 0x4915012180808080,
+ 0x0202020202020202, 0x4915012180808080);
+ const __m512i v_0f = _mm512_set1_epi8(0x0f);
+ // The shift works on 16-bit elements, so bits of the neighbouring byte leak
+ // in; masking with 0x0f leaves just the high nibble of each byte.
+ __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);
+
+ __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
+ __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
+ 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
+ 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
+ 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
+ // low nibble of each prev1 byte
+ __m512i index2 = _mm512_and_si512(prev1, v_0f);
+
+ __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
+ __m512i mask3 =
+ _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101,
+ 0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6,
+ 0x101010101010101, 0x1010101babaaee6);
+ // high nibble of each input byte
+ __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
+ __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
+ // truth table 128 (0x80): byte_1_high AND byte_1_low AND byte_2_high
+ return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
+}
+
+// Marks, with bit 0x80, every byte position whose byte two positions earlier
+// is >= 0xE0 or whose byte three positions earlier is >= 0xF0, and returns
+// that marker XORed with `sc`.
+// `prev_input` supplies the bytes that precede `input`.
+// NOTE(review): `sc` is presumably the result of check_special_cases, so a
+// nonzero byte in the return value signals an error -- confirm with caller.
+simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
+ const __m512i prev_input,
+ const __m512i sc) {
+ __m512i prev2 = prev<2>(input, prev_input);
+ __m512i prev3 = prev<3>(input, prev_input);
+ // Unsigned saturating subtraction: the result is nonzero only where the
+ // byte is greater than or equal to the lead-byte threshold.
+ __m512i is_third_byte = _mm512_subs_epu8(
+ prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
+ __m512i is_fourth_byte = _mm512_subs_epu8(
+ prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
+ __m512i is_third_or_fourth_byte =
+ _mm512_or_si512(is_third_byte, is_fourth_byte);
+ // Saturating add of 0x7f sets the top bit of every nonzero byte and leaves
+ // it clear for zero bytes.
+ const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
+ is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
+ // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
+ const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+ return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc,
+ 0b1101010);
+ // The single ternary-logic operation above replaces the two-step form:
+ //   __m512i is_third_or_fourth_byte_mask =
+ //       _mm512_and_si512(is_third_or_fourth_byte, v_80);
+ //   return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
+}
+//
+// Returns a nonzero vector when the block ends in the middle of a multibyte
+// character, e.g. a 4-byte character that starts 3 bytes before the end.
+//
+simdutf_really_inline __m512i is_incomplete(const __m512i input) {
+  // Largest value each byte may have without starting a character that
+  // cannot finish inside this block. Only the last three bytes are
+  // constrained:
+  //   ... 1111____ 111_____ 11______
+  // byte 61 must stay below 0xf0, byte 62 below 0xe0, byte 63 below 0xc0.
+  const __m512i highest_allowed = _mm512_setr_epi64(
+      0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+      0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+      0xffffffffffffffff, 0xbfdfefffffffffff);
+  // Unsigned saturating subtraction leaves a nonzero byte exactly where the
+  // input exceeds its limit.
+  return _mm512_subs_epu8(input, highest_allowed);
+}
+
+// Incremental UTF-8 validator working on 64-byte blocks. Feed blocks in order
+// with check_next_input(), call check_eof() once after the last block, then
+// query errors().
+struct avx512_utf8_checker {
+  // Accumulated error flags: any nonzero byte means invalid UTF-8 was seen.
+  __m512i error{};
+
+  // The most recent block that went through the full (non-ASCII) check.
+  __m512i prev_input_block{};
+  // Nonzero when that block ended in the middle of a multibyte character
+  // (consulted by the ASCII fast path and by check_eof).
+  __m512i prev_incomplete{};
+
+  //
+  // Validates `input`, using `prev_input` for the bytes that precede it, and
+  // folds any problem found into `error`.
+  //
+  simdutf_really_inline void check_utf8_bytes(const __m512i input,
+                                              const __m512i prev_input) {
+    // the input shifted by one byte, pulling in the last byte of prev_input
+    const __m512i shifted_by_one = prev<1>(input, prev_input);
+    const __m512i special = check_special_cases(input, shifted_by_one);
+    const __m512i length_errors =
+        check_multibyte_lengths(input, prev_input, special);
+    error = _mm512_or_si512(length_errors, error);
+  }
+
+  // At end of input the only thing left to verify is that the last checked
+  // block did not stop in the middle of a multibyte character.
+  simdutf_really_inline void check_eof() {
+    error = _mm512_or_si512(error, prev_incomplete);
+  }
+
+  // Processes the next 64-byte block; returns true if it is pure ASCII.
+  simdutf_really_inline bool check_next_input(const __m512i input) {
+    const __m512i high_bit = _mm512_set1_epi8(char(0x80));
+    // one mask bit per byte that has its top bit set
+    const __mmask64 non_ascii = _mm512_test_epi8_mask(input, high_bit);
+    if (non_ascii != 0) {
+      check_utf8_bytes(input, prev_input_block);
+      prev_incomplete = is_incomplete(input);
+      prev_input_block = input;
+      return false;
+    }
+    // An ASCII block cannot complete a character left unfinished earlier.
+    error = _mm512_or_si512(error, prev_incomplete);
+    return true;
+  }
+
+  // do not forget to call check_eof!
+  simdutf_really_inline bool errors() const {
+    return _mm512_test_epi8_mask(error, error) != 0;
+  }
+}; // struct avx512_utf8_checker
diff --git a/contrib/simdutf/src/icelake/implementation.cpp b/contrib/simdutf/src/icelake/implementation.cpp
new file mode 100644
index 000000000..0b9e31c68
--- /dev/null
+++ b/contrib/simdutf/src/icelake/implementation.cpp
@@ -0,0 +1,1650 @@
+#include "simdutf/icelake/intrinsics.h"
+
+#include "scalar/utf16_to_utf8/valid_utf16_to_utf8.h"
+#include "scalar/utf16_to_utf8/utf16_to_utf8.h"
+#include "scalar/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+
+#include "simdutf/icelake/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_ICELAKE_H
+ #error "icelake.h must be included"
+#endif
+#include "icelake/icelake_utf8_common.inl.cpp"
+#include "icelake/icelake_macros.inl.cpp"
+#include "icelake/icelake_from_valid_utf8.inl.cpp"
+#include "icelake/icelake_utf8_validation.inl.cpp"
+#include "icelake/icelake_from_utf8.inl.cpp"
+#include "icelake/icelake_convert_utf8_to_latin1.inl.cpp"
+#include "icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp"
+#include "icelake/icelake_convert_utf16_to_latin1.inl.cpp"
+#include "icelake/icelake_convert_utf16_to_utf8.inl.cpp"
+#include "icelake/icelake_convert_utf16_to_utf32.inl.cpp"
+#include "icelake/icelake_convert_utf32_to_latin1.inl.cpp"
+#include "icelake/icelake_convert_utf32_to_utf8.inl.cpp"
+#include "icelake/icelake_convert_utf32_to_utf16.inl.cpp"
+#include "icelake/icelake_ascii_validation.inl.cpp"
+#include "icelake/icelake_utf32_validation.inl.cpp"
+#include "icelake/icelake_convert_latin1_to_utf8.inl.cpp"
+#include "icelake/icelake_convert_latin1_to_utf16.inl.cpp"
+#include "icelake/icelake_convert_latin1_to_utf32.inl.cpp"
+#include "icelake/icelake_base64.inl.cpp"
+
+#include <cstdint>
+
+} // namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+// Returns a bitmask of encoding_type values the input could be. A byte-order
+// mark, when present, is trusted and returned directly; otherwise the input
+// is validated as UTF-8, UTF-16LE and UTF-32LE in turn.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // todo: convert to a one-pass algorithm
+  const auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom;
+  }
+
+  int candidates = 0;
+  if (validate_utf8(input, length)) {
+    candidates |= encoding_type::UTF8;
+  }
+  // The wider encodings are only possible when the byte length is a whole
+  // number of code units.
+  const bool whole_utf16_units = (length % 2) == 0;
+  if (whole_utf16_units &&
+      validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                       length / 2)) {
+    candidates |= encoding_type::UTF16_LE;
+  }
+  const bool whole_utf32_units = (length % 4) == 0;
+  if (whole_utf32_units &&
+      validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+    candidates |= encoding_type::UTF32_LE;
+  }
+  return candidates;
+}
+
+// Returns true when the `len` bytes at `buf` are valid UTF-8. Empty input is
+// valid.
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  avx512_utf8_checker checker{};
+  const char *cursor = buf;
+  const char *const stop = buf + len;
+  // Full 64-byte blocks.
+  while (stop - cursor >= 64) {
+    const __m512i block = _mm512_loadu_si512((const __m512i *)cursor);
+    checker.check_next_input(block);
+    cursor += 64;
+  }
+  // Trailing 1..63 bytes: masked load, lanes past the end read as zero.
+  if (cursor != stop) {
+    const __m512i block = _mm512_maskz_loadu_epi8(
+        ~UINT64_C(0) >> (64 - (stop - cursor)), (const __m512i *)cursor);
+    checker.check_next_input(block);
+  }
+  checker.check_eof();
+  const bool failed = checker.errors();
+  return !failed;
+}
+
+// Validates `len` bytes at `buf` as UTF-8.
+// Returns result(SUCCESS, len) when valid (including empty input). When the
+// vectorized checker flags an error, the exact error is obtained from
+// scalar::utf8::rewind_and_validate_with_errors, started at offset `count`,
+// and `count` is added to the count it reports.
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, len);
+ }
+ avx512_utf8_checker checker{};
+ const char *ptr = buf;
+ const char *end = ptr + len;
+ // Offset of the 64-byte block currently being checked.
+ size_t count{0};
+ for (; end - ptr >= 64; ptr += 64) {
+ const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+ checker.check_next_input(utf8);
+ if (checker.errors()) {
+ // Step back one byte (when not at the start) so the scalar pass begins
+ // inside the previous block.
+ if (count != 0) {
+ count--;
+ } // Sometimes the error is only detected in the next chunk
+ result res = scalar::utf8::rewind_and_validate_with_errors(
+ reinterpret_cast<const char *>(buf),
+ reinterpret_cast<const char *>(buf + count), len - count);
+ res.count += count;
+ return res;
+ }
+ count += 64;
+ }
+ // Trailing 1..63 bytes: masked load, lanes past the end read as zero.
+ if (end != ptr) {
+ const __m512i utf8 = _mm512_maskz_loadu_epi8(
+ ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
+ checker.check_next_input(utf8);
+ }
+ checker.check_eof();
+ // Covers errors in the tail block and a character truncated at end of input.
+ if (checker.errors()) {
+ if (count != 0) {
+ count--;
+ } // Sometimes the error is only detected in the next chunk
+ result res = scalar::utf8::rewind_and_validate_with_errors(
+ reinterpret_cast<const char *>(buf),
+ reinterpret_cast<const char *>(buf + count), len - count);
+ res.count += count;
+ return res;
+ }
+ return result(error_code::SUCCESS, len);
+}
+
+// Forwards to the icelake ASCII validation kernel and returns its verdict.
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  const bool verdict = icelake::validate_ascii(buf, len);
+  return verdict;
+}
+
+// Validates `len` bytes at `buf` as ASCII.
+// Returns result(TOO_LARGE, index) for the first byte >= 0x80, otherwise
+// result(SUCCESS, len).
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ const char *buf_orig = buf;
+ const char *end = buf + len;
+ const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
+ for (; end - buf >= 64; buf += 64) {
+ const __m512i input = _mm512_loadu_si512((const __m512i *)buf);
+ // Unsigned "not less than 0x80": one mask bit per non-ASCII byte.
+ __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+ if (notascii) {
+ // Lowest set bit is the first offending byte of this block.
+ return result(error_code::TOO_LARGE,
+ buf - buf_orig + _tzcnt_u64(notascii));
+ }
+ }
+ // Trailing 1..63 bytes: masked-out lanes load as zero, which is ASCII.
+ if (end != buf) {
+ const __m512i input = _mm512_maskz_loadu_epi8(
+ ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf);
+ __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+ if (notascii) {
+ return result(error_code::TOO_LARGE,
+ buf - buf_orig + _tzcnt_u64(notascii));
+ }
+ }
+ return result(error_code::SUCCESS, len);
+}
+
+// Returns true when the `len` UTF-16LE code units at `buf` contain only
+// correctly paired surrogates (every high surrogate immediately followed by a
+// low surrogate, and no low surrogate without a preceding high one).
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept {
+ const char16_t *end = buf + len;
+
+ // Full blocks of 32 code units.
+ for (; end - buf >= 32;) {
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ // After subtracting 0xD800, surrogates (0xD800..0xDFFF) are exactly the
+ // values below 0x0800 in an unsigned compare.
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ // High surrogates (0xD800..0xDBFF) are the values below 0x0400.
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ return false;
+ }
+ // A high surrogate in the last lane is shifted out of the check above;
+ // its partner belongs to the next block.
+ bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+ if (ends_with_high) {
+ buf += 31; // advance only by 31 code units so that we start with the
+ // high surrogate on the next round.
+ } else {
+ buf += 32;
+ }
+ } else {
+ buf += 32;
+ }
+ }
+ // Trailing 1..31 code units: masked load, remaining lanes read as zero, so
+ // a high surrogate in the final position has no low partner and fails.
+ if (buf < end) {
+ __m512i in =
+ _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf);
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+// Big-endian counterpart of the UTF-16 surrogate validation: each block is
+// byte-swapped per 16-bit lane and then checked for correctly paired
+// surrogates. Returns true when the input is valid.
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept {
+ const char16_t *end = buf + len;
+ // Shuffle control that swaps the two bytes of every 16-bit lane.
+ const __m512i byteflip = _mm512_setr_epi64(
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+ 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ // Full blocks of 32 code units.
+ for (; end - buf >= 32;) {
+ __m512i in =
+ _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
+ // After subtracting 0xD800, surrogates (0xD800..0xDFFF) are exactly the
+ // values below 0x0800 in an unsigned compare.
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ // High surrogates (0xD800..0xDBFF) are the values below 0x0400.
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ return false;
+ }
+ // A high surrogate in the last lane is shifted out of the check above;
+ // its partner belongs to the next block.
+ bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+ if (ends_with_high) {
+ buf += 31; // advance only by 31 code units so that we start with the
+ // high surrogate on the next round.
+ } else {
+ buf += 32;
+ }
+ } else {
+ buf += 32;
+ }
+ }
+ // Trailing 1..31 code units: masked load, remaining lanes read as zero.
+ if (buf < end) {
+ __m512i in = _mm512_shuffle_epi8(
+ _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf),
+ byteflip);
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+// Validates `len` UTF-16LE code units at `buf`.
+// Returns result(SURROGATE, index) for the first unpaired surrogate found,
+// otherwise result(SUCCESS, len).
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ const char16_t *start_buf = buf;
+ const char16_t *end = buf + len;
+ // Full blocks of 32 code units.
+ for (; end - buf >= 32;) {
+ __m512i in = _mm512_loadu_si512((__m512i *)buf);
+ // After subtracting 0xD800, surrogates (0xD800..0xDFFF) are exactly the
+ // values below 0x0800 in an unsigned compare.
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ // High surrogates (0xD800..0xDBFF) are the values below 0x0400.
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ // extra_low: first low surrogate with no high surrogate before it.
+ // extra_high: first high surrogate with no low surrogate after it.
+ // _tzcnt_u32 yields 32 for an empty mask, so the smaller one wins.
+ uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+ uint32_t extra_high =
+ _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+ return result(error_code::SURROGATE,
+ (buf - start_buf) +
+ (extra_low < extra_high ? extra_low : extra_high));
+ }
+ // A high surrogate in the last lane is shifted out of the check above;
+ // its partner belongs to the next block.
+ bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+ if (ends_with_high) {
+ buf += 31; // advance only by 31 code units so that we start with the
+ // high surrogate on the next round.
+ } else {
+ buf += 32;
+ }
+ } else {
+ buf += 32;
+ }
+ }
+ // Trailing 1..31 code units: masked load, remaining lanes read as zero.
+ if (buf < end) {
+ __m512i in =
+ _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf);
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+ uint32_t extra_high =
+ _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+ return result(error_code::SURROGATE,
+ (buf - start_buf) +
+ (extra_low < extra_high ? extra_low : extra_high));
+ }
+ }
+ }
+ return result(error_code::SUCCESS, len);
+}
+
+// Validates `len` UTF-16BE code units at `buf`; each block is byte-swapped
+// per 16-bit lane before the surrogate checks.
+// Returns result(SURROGATE, index) for the first unpaired surrogate found,
+// otherwise result(SUCCESS, len).
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ const char16_t *start_buf = buf;
+ const char16_t *end = buf + len;
+ // Shuffle control that swaps the two bytes of every 16-bit lane.
+ const __m512i byteflip = _mm512_setr_epi64(
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+ 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+ 0x0607040502030001, 0x0e0f0c0d0a0b0809);
+ // Full blocks of 32 code units.
+ for (; end - buf >= 32;) {
+ __m512i in =
+ _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
+ // After subtracting 0xD800, surrogates (0xD800..0xDFFF) are exactly the
+ // values below 0x0800 in an unsigned compare.
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ // High surrogates (0xD800..0xDBFF) are the values below 0x0400.
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ // extra_low: first low surrogate with no high surrogate before it.
+ // extra_high: first high surrogate with no low surrogate after it.
+ // _tzcnt_u32 yields 32 for an empty mask, so the smaller one wins.
+ uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+ uint32_t extra_high =
+ _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+ return result(error_code::SURROGATE,
+ (buf - start_buf) +
+ (extra_low < extra_high ? extra_low : extra_high));
+ }
+ // A high surrogate in the last lane is shifted out of the check above;
+ // its partner belongs to the next block.
+ bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+ if (ends_with_high) {
+ buf += 31; // advance only by 31 code units so that we start with the
+ // high surrogate on the next round.
+ } else {
+ buf += 32;
+ }
+ } else {
+ buf += 32;
+ }
+ }
+ // Trailing 1..31 code units: masked load, remaining lanes read as zero.
+ if (buf < end) {
+ __m512i in = _mm512_shuffle_epi8(
+ _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf),
+ byteflip);
+ __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+ __mmask32 surrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+ if (surrogates) {
+ __mmask32 highsurrogates =
+ _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+ __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+ // high must be followed by low
+ if ((highsurrogates << 1) != lowsurrogates) {
+ uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+ uint32_t extra_high =
+ _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+ return result(error_code::SURROGATE,
+ (buf - start_buf) +
+ (extra_low < extra_high ? extra_low : extra_high));
+ }
+ }
+ }
+ return result(error_code::SUCCESS, len);
+}
+
+// Returns true when the `len` code points at `buf` are valid UTF-32. The
+// vectorized kernel handles the bulk; whatever it leaves is checked by the
+// scalar validator.
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const char32_t *const remainder = icelake::validate_utf32(buf, len);
+  if (remainder == nullptr) {
+    // A null result means the kernel found an error, or buf itself was
+    // nullptr, which may happen for empty input.
+    return len == 0;
+  }
+  const size_t already_checked = remainder - buf;
+  return scalar::utf32::validate(remainder, len - already_checked);
+}
+
+// Validates `len` UTF-32 code points at `buf`.
+// Returns result(TOO_LARGE, index) for a value above 0x10FFFF,
+// result(SURROGATE, index) for a value in 0xD800..0xDFFF (whichever comes
+// first within the failing block), otherwise result(SUCCESS, len).
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ const char32_t *buf_orig = buf;
+ if (len >= 16) {
+ // Last position from which a full 16-lane load stays inside the input.
+ const char32_t *end = buf + len - 16;
+ while (buf <= end) {
+ __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
+ __mmask16 outside_range = _mm512_cmp_epu32_mask(
+ utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
+
+ // Adding 0xffff2000 maps 0xD800..0xDFFF onto 0xfffff800..0xffffffff, so
+ // surrogates are exactly the lanes above 0xfffff7ff (unsigned).
+ __m512i utf32_off =
+ _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+
+ __mmask16 surrogate_range = _mm512_cmp_epu32_mask(
+ utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
+ if ((outside_range | surrogate_range)) {
+ // _tzcnt_u32 yields 32 for an empty mask, so the comparison below
+ // picks whichever kind of error occurs first.
+ auto outside_idx = _tzcnt_u32(outside_range);
+ auto surrogate_idx = _tzcnt_u32(surrogate_range);
+
+ if (outside_idx < surrogate_idx) {
+ return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
+ }
+
+ return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
+ }
+
+ buf += 16;
+ }
+ }
+ // Trailing 0..15 code points: masked load, remaining lanes read as zero
+ // (the mask is empty when nothing is left).
+ if (len > 0) {
+ __m512i utf32 = _mm512_maskz_loadu_epi32(
+ __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf);
+ __mmask16 outside_range = _mm512_cmp_epu32_mask(
+ utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
+ __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+
+ __mmask16 surrogate_range = _mm512_cmp_epu32_mask(
+ utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
+ if ((outside_range | surrogate_range)) {
+ auto outside_idx = _tzcnt_u32(outside_range);
+ auto surrogate_idx = _tzcnt_u32(surrogate_range);
+
+ if (outside_idx < surrogate_idx) {
+ return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
+ }
+
+ return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
+ }
+ }
+
+ return result(error_code::SUCCESS, len);
+}
+
+// Latin-1 -> UTF-8: forwards to the AVX-512 kernel and returns its result
+// unchanged.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t kernel_result =
+      icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output);
+  return kernel_result;
+}
+
+// Latin-1 -> UTF-16LE: forwards to the little-endian instantiation of the
+// shared kernel and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t kernel_result =
+      icelake_convert_latin1_to_utf16<endianness::LITTLE>(buf, len,
+                                                          utf16_output);
+  return kernel_result;
+}
+
+// Latin-1 -> UTF-16BE: forwards to the big-endian instantiation of the shared
+// kernel and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t kernel_result =
+      icelake_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  return kernel_result;
+}
+
+// Latin-1 -> UTF-32. The AVX-512 kernel converts the bulk and reports how far
+// it got; the scalar converter finishes the remainder. Returns the total
+// number of char32_t written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::pair<const char *, char32_t *> simd =
+      avx512_convert_latin1_to_utf32(buf, len, utf32_output);
+  const char *const src = simd.first;
+  char32_t *const dst = simd.second;
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::latin1_to_utf32::convert(src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// UTF-8 -> Latin-1: forwards to the AVX-512 kernel and returns its result
+// unchanged.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t kernel_result =
+      icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
+  return kernel_result;
+}
+
+// UTF-8 -> Latin-1 with error reporting.
+// Runs the AVX-512 kernel first; when it consumed the whole input the result
+// is {SUCCESS, written}. Otherwise the scalar converter is restarted at the
+// position the kernel reached and that position is added to its count.
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ // First, try to convert as much as possible using the SIMD implementation.
+ // NOTE(review): obuf is compared against buf + len below, so the kernel
+ // presumably takes it by reference and advances it -- confirm against
+ // utf8_to_latin1_avx512.
+ const char *obuf = buf;
+ char *olatin1_output = latin1_output;
+ size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);
+
+ // If we have completely converted the string
+ if (obuf == buf + len) {
+ return {simdutf::SUCCESS, written};
+ }
+ // Input offset at which the SIMD kernel stopped.
+ size_t pos = obuf - buf;
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+ pos, buf + pos, len - pos, latin1_output);
+ res.count += pos;
+ return res;
+}
+
+// Valid UTF-8 -> Latin-1: forwards to the non-validating AVX-512 kernel and
+// returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t kernel_result =
+      icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
+  return kernel_result;
+}
+
+// UTF-8 -> UTF-16LE. Returns the number of char16_t written, or 0 when the
+// kernel signals failure through a null output pointer.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const utf8_to_utf16_result simd =
+      fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len,
+                                                            utf16_output);
+  if (simd.second != nullptr) {
+    return simd.second - utf16_output;
+  }
+  return 0;
+}
+
+// UTF-8 -> UTF-16BE. Returns the number of char16_t written, or 0 when the
+// kernel signals failure through a null output pointer.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const utf8_to_utf16_result simd =
+      fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len,
+                                                         utf16_output);
+  if (simd.second != nullptr) {
+    return simd.second - utf16_output;
+  }
+  return 0;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(
+ buf, len, utf16_output);
+}
+
+// Valid UTF-8 -> UTF-16LE. The AVX-512 kernel converts the bulk; the scalar
+// converter finishes whatever input is left. Returns the number of char16_t
+// written, or 0 when the scalar stage reports 0.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const utf8_to_utf16_result simd =
+      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(
+          buf, len, utf16_output);
+  auto src = simd.first;
+  const auto dst = simd.second;
+  const size_t simd_written = dst - utf16_output;
+  const char *const stop = buf + len;
+  if (src == stop) {
+    return simd_written;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
+  // converts multi-byte chars even if their continuation bytes lie outside
+  // the 16-byte window. It means we have to skip continuation bytes at the
+  // returned input position, as they were already consumed.
+  for (; src != stop && ((uint8_t(*src) & 0xc0) == 0x80); src += 1) {
+  }
+  if (src == stop) {
+    return simd_written;
+  }
+
+  const size_t tail_written =
+      scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Valid UTF-8 -> UTF-16BE. The AVX-512 kernel converts the bulk; the scalar
+// converter finishes whatever input is left. Returns the number of char16_t
+// written, or 0 when the scalar stage reports 0.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const utf8_to_utf16_result simd =
+      icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(
+          buf, len, utf16_output);
+  auto src = simd.first;
+  const auto dst = simd.second;
+  const size_t simd_written = dst - utf16_output;
+  const char *const stop = buf + len;
+  if (src == stop) {
+    return simd_written;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
+  // converts multi-byte chars even if their continuation bytes lie outside
+  // the 16-byte window. It means we have to skip continuation bytes at the
+  // returned input position, as they were already consumed.
+  for (; src != stop && ((uint8_t(*src) & 0xc0) == 0x80); src += 1) {
+  }
+  if (src == stop) {
+    return simd_written;
+  }
+
+  const size_t tail_written =
+      scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// UTF-8 -> UTF-32 with validation. The AVX-512 kernel converts the bulk; the
+// scalar converter finishes whatever input is left. Returns the number of
+// char32_t written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
+  uint32_t *const out_words = reinterpret_cast<uint32_t *>(utf32_out);
+  const utf8_to_utf32_result simd =
+      icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
+          buf, len, out_words);
+  // A null output pointer is the kernel's failure signal.
+  if (simd.second == nullptr) {
+    return 0;
+  }
+
+  auto src = simd.first;
+  const size_t simd_written = simd.second - out_words;
+  const char *const stop = buf + len;
+  if (src == stop) {
+    return simd_written;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
+  // converts multi-byte chars even if their continuation bytes lie outside
+  // the 16-byte window. It means we have to skip continuation bytes at the
+  // returned input position, as they were already consumed.
+  for (; src != stop && ((uint8_t(*src) & 0xc0) == 0x80); src += 1) {
+  }
+  if (src == stop) {
+    return simd_written;
+  }
+
+  const size_t tail_written = scalar::utf8_to_utf32::convert(
+      src, len - (src - buf), utf32_out + simd_written);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return {error_code::SUCCESS, 0};
+ }
+ uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32);
+ auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<
+ endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+
+ if (!std::get<2>(ret)) {
+ size_t pos = std::get<0>(ret) - buf;
+ // We might have an error that occurs right before pos.
+ // This is only a concern if buf[pos] is not a continuation byte.
+ if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
+ pos -= 1;
+ } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
+ // We must check whether we are the fourth continuation byte
+ bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
+ bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
+ bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
+ if (c1 && c2 && c3) {
+ return {simdutf::TOO_LONG, pos};
+ }
+ }
+ // todo: we reset the output to utf32 instead of using std::get<2.(ret) as
+ // you'd expect. that is because
+ // validating_utf8_to_fixed_length_with_constant_checks may have processed
+ // data beyond the error.
+ result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+ pos, buf + pos, len - pos, utf32);
+ res.count += pos;
+ return res;
+ }
+ size_t saved_bytes = std::get<1>(ret) - utf32_output;
+ const char *end = buf + len;
+ if (std::get<0>(ret) == end) {
+ return {simdutf::SUCCESS, saved_bytes};
+ }
+
+ // Note: the AVX512 procedure looks up 4 bytes forward, and
+ // correctly converts multi-byte chars even if their
+ // continuation bytes lie outside 16-byte window.
+ // It means, we have to skip continuation bytes from
+ // the beginning ret.first, as they were already consumed.
+ while (std::get<0>(ret) != end and
+ ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
+ std::get<0>(ret) += 1;
+ }
+
+ if (std::get<0>(ret) != end) {
+ auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
+ std::get<0>(ret), len - (std::get<0>(ret) - buf),
+ reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
+ if (scalar_result.error != simdutf::SUCCESS) {
+ scalar_result.count += (std::get<0>(ret) - buf);
+ } else {
+ scalar_result.count += saved_bytes;
+ }
+ return scalar_result;
+ }
+
+ return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
+}
+
+// Valid UTF-8 -> UTF-32. The AVX-512 kernel converts the bulk; the scalar
+// converter finishes whatever input is left. Returns the number of char32_t
+// written, or 0 when the scalar stage reports 0.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
+  uint32_t *const out_words = reinterpret_cast<uint32_t *>(utf32_out);
+  const utf8_to_utf32_result simd =
+      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
+          buf, len, out_words);
+  auto src = simd.first;
+  const size_t simd_written = simd.second - out_words;
+  const char *const stop = buf + len;
+  if (src == stop) {
+    return simd_written;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
+  // converts multi-byte chars even if their continuation bytes lie outside
+  // the 16-byte window. It means we have to skip continuation bytes at the
+  // returned input position, as they were already consumed.
+  for (; src != stop && ((uint8_t(*src) & 0xc0) == 0x80); src += 1) {
+  }
+  if (src == stop) {
+    return simd_written;
+  }
+
+  const size_t tail_written = scalar::utf8_to_utf32::convert_valid(
+      src, len - (src - buf), utf32_out + simd_written);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// UTF-16LE -> Latin-1: forwards to the little-endian instantiation of the
+// shared kernel and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t kernel_result =
+      icelake_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
+                                                          latin1_output);
+  return kernel_result;
+}
+
+// UTF-16BE -> Latin-1: forwards to the big-endian instantiation of the shared
+// kernel and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t kernel_result =
+      icelake_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  return kernel_result;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return icelake_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output)
+ .first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ return icelake_convert_utf16_to_latin1_with_errors<endianness::BIG>(
+ buf, len, latin1_output)
+ .first;
+}
+
+// Valid UTF-16BE -> Latin-1. Currently reuses the validating converter.
+// Optimization opportunity: implement a dedicated non-validating kernel.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16be_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// Valid UTF-16LE -> Latin-1. Currently reuses the validating converter.
+// Optimization opportunity: implement a dedicated non-validating kernel.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16le_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// UTF-16LE -> UTF-8. Returns the output length reported by the kernel, or 0
+// when the kernel did not consume the whole input.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  size_t produced;
+  const size_t consumed = utf16_to_utf8_avx512i<endianness::LITTLE>(
+      buf, len, (unsigned char *)utf8_output, &produced);
+  if (consumed == len) {
+    return produced;
+  }
+  return 0;
+}
+
+// UTF-16BE -> UTF-8. Returns the output length reported by the kernel, or 0
+// when the kernel did not consume the whole input.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  size_t produced;
+  const size_t consumed = utf16_to_utf8_avx512i<endianness::BIG>(
+      buf, len, (unsigned char *)utf8_output, &produced);
+  if (consumed == len) {
+    return produced;
+  }
+  return 0;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ size_t outlen;
+ size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
+ buf, len, (unsigned char *)utf8_output, &outlen);
+ if (inlen != len) {
+ result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf + inlen, len - inlen, utf8_output + outlen);
+ res.count += inlen;
+ return res;
+ }
+ return {simdutf::SUCCESS, outlen};
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ size_t outlen;
+ size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
+ buf, len, (unsigned char *)utf8_output, &outlen);
+ if (inlen != len) {
+ result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf + inlen, len - inlen, utf8_output + outlen);
+ res.count += inlen;
+ return res;
+ }
+ return {simdutf::SUCCESS, outlen};
+}
+
+// Valid UTF-16LE -> UTF-8: reuses the validating converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf16le_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Valid UTF-16BE -> UTF-8: reuses the validating converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf16be_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// UTF-32 -> Latin-1: forwards to the icelake kernel and returns its result
+// unchanged.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t kernel_result =
+      icelake_convert_utf32_to_latin1(buf, len, latin1_output);
+  return kernel_result;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output)
+ .first;
+}
+
+// Valid UTF-32 -> Latin-1: uses the same icelake kernel as the validating
+// entry point and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t kernel_result =
+      icelake_convert_utf32_to_latin1(buf, len, latin1_output);
+  return kernel_result;
+}
+
+// UTF-32 -> UTF-8. The AVX-512 kernel converts the bulk and reports how far
+// it got; the scalar converter finishes the remainder. Returns the total
+// number of bytes written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  const std::pair<const char32_t *, char *> simd =
+      avx512_convert_utf32_to_utf8(buf, len, utf8_output);
+  const char32_t *const src = simd.first;
+  char *const dst = simd.second;
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf8_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf8::convert(src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// UTF-32 -> UTF-8 with error reporting.
+// On error the returned count is an input position; on success it is the
+// number of bytes written to utf8_output.
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ // The kernel stopped early: let the scalar converter handle the rest,
+ // continuing at the kernel's input and output positions.
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ // Make the scalar count absolute by adding the input already consumed.
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ // Advance the output cursor past what the scalar pass reported.
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Valid UTF-32 -> UTF-8: reuses the validating converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf32_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// UTF-32 -> UTF-16LE. The AVX-512 kernel converts the bulk and reports how
+// far it got; the scalar converter finishes the remainder. Returns the total
+// number of char16_t written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const std::pair<const char32_t *, char16_t *> simd =
+      avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  const char32_t *const src = simd.first;
+  char16_t *const dst = simd.second;
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf16_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts UTF-32 to big-endian UTF-16. The AVX-512 kernel handles the front
+// of the buffer; the scalar converter finishes the remaining input.
+//
+// Returns the number of char16_t units the kernel stored plus the value
+// returned by the scalar converter for the tail. Returns 0 when the kernel
+// hands back a null input pointer or when the scalar converter returns 0
+// (presumably both signal invalid input).
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const std::pair<const char32_t *, char16_t *> simd =
+      avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  const char32_t *const src = simd.first; // where the kernel stopped reading
+  char16_t *const dst = simd.second;      // where the kernel stopped writing
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf16_output;
+  if (src == buf + len) {
+    return simd_written; // nothing left for the scalar pass
+  }
+  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(
+      src, len - (src - buf), dst);
+  // A zero from the scalar pass turns the whole call into a 0 result.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+ utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// The "valid input" variant has no dedicated kernel here: it simply calls
+// convert_utf32_to_utf16le on the same arguments and returns its result.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t ret = this->convert_utf32_to_utf16le(buf, len, utf16_output);
+  return ret;
+}
+
+// The "valid input" variant has no dedicated kernel here: it simply calls
+// convert_utf32_to_utf16be on the same arguments and returns its result.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t ret = this->convert_utf32_to_utf16be(buf, len, utf16_output);
+  return ret;
+}
+
+// Converts little-endian UTF-16 to UTF-32. The icelake kernel returns a
+// triple (input cursor, output cursor, ok flag); the scalar converter then
+// finishes any input the kernel left over.
+//
+// Returns the number of char32_t units the kernel stored plus the value
+// returned by the scalar converter for the tail. Returns 0 when the kernel's
+// flag is false or when the scalar converter returns 0.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::tuple<const char16_t *, char32_t *, bool> simd =
+      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                          utf32_output);
+  const char16_t *const src = std::get<0>(simd);
+  char32_t *const dst = std::get<1>(simd);
+  const bool simd_ok = std::get<2>(simd);
+  if (!simd_ok) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written; // nothing left for the scalar pass
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  // A zero from the scalar pass turns the whole call into a 0 result.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to UTF-32. The icelake kernel returns a triple
+// (input cursor, output cursor, ok flag); the scalar converter then finishes
+// any input the kernel left over.
+//
+// Returns the number of char32_t units the kernel stored plus the value
+// returned by the scalar converter for the tail. Returns 0 when the kernel's
+// flag is false or when the scalar converter returns 0.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::tuple<const char16_t *, char32_t *, bool> simd =
+      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  const char16_t *const src = std::get<0>(simd);
+  char32_t *const dst = std::get<1>(simd);
+  const bool simd_ok = std::get<2>(simd);
+  if (!simd_ok) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written; // nothing left for the scalar pass
+  }
+  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
+      src, len - (src - buf), dst);
+  // A zero from the scalar pass turns the whole call into a 0 result.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ std::tuple<const char16_t *, char32_t *, bool> ret =
+ icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+ utf32_output);
+ if (!std::get<2>(ret)) {
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+ scalar_res.count += (std::get<0>(ret) - buf);
+ return scalar_res;
+ }
+ size_t saved_bytes = std::get<1>(ret) - utf32_output;
+ if (std::get<0>(ret) != buf + len) {
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+ if (scalar_res.error) {
+ scalar_res.count += (std::get<0>(ret) - buf);
+ return scalar_res;
+ } else {
+ scalar_res.count += saved_bytes;
+ return scalar_res;
+ }
+ }
+ return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ std::tuple<const char16_t *, char32_t *, bool> ret =
+ icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+ if (!std::get<2>(ret)) {
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+ scalar_res.count += (std::get<0>(ret) - buf);
+ return scalar_res;
+ }
+ size_t saved_bytes = std::get<1>(ret) - utf32_output;
+ if (std::get<0>(ret) != buf + len) {
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+ if (scalar_res.error) {
+ scalar_res.count += (std::get<0>(ret) - buf);
+ return scalar_res;
+ } else {
+ scalar_res.count += saved_bytes;
+ return scalar_res;
+ }
+ }
+ return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
+
+// "Valid input" variant of the little-endian UTF-16 to UTF-32 conversion. It
+// runs the same sequence as the checked variant: icelake kernel first, then
+// the scalar converter on any leftover input.
+//
+// Returns the number of char32_t units the kernel stored plus the value
+// returned by the scalar converter for the tail. Returns 0 when the kernel's
+// flag is false or when the scalar converter returns 0.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::tuple<const char16_t *, char32_t *, bool> simd =
+      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                          utf32_output);
+  const char16_t *const src = std::get<0>(simd);
+  char32_t *const dst = std::get<1>(simd);
+  const bool simd_ok = std::get<2>(simd);
+  if (!simd_ok) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written; // nothing left for the scalar pass
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  // A zero from the scalar pass turns the whole call into a 0 result.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// "Valid input" variant of the big-endian UTF-16 to UTF-32 conversion. It
+// runs the same sequence as the checked variant: icelake kernel first, then
+// the scalar converter on any leftover input.
+//
+// Returns the number of char32_t units the kernel stored plus the value
+// returned by the scalar converter for the tail. Returns 0 when the kernel's
+// flag is false or when the scalar converter returns 0.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const std::tuple<const char16_t *, char32_t *, bool> simd =
+      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  const char16_t *const src = std::get<0>(simd);
+  char32_t *const dst = std::get<1>(simd);
+  const bool simd_ok = std::get<2>(simd);
+  if (!simd_ok) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written; // nothing left for the scalar pass
+  }
+  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
+      src, len - (src - buf), dst);
+  // A zero from the scalar pass turns the whole call into a 0 result.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// Copies `length` UTF-16 code units from input to output with the two bytes
+// of every unit swapped. Full 32-unit (512-bit) blocks are handled with plain
+// loads/stores; a final partial block uses a lane mask so that only the
+// remaining units are read and written.
+void implementation::change_endianness_utf16(const char16_t *input,
+                                             size_t length,
+                                             char16_t *output) const noexcept {
+  // Byte-shuffle control that exchanges the bytes of each 16-bit lane; the
+  // same 16-byte pattern is repeated for all four 128-bit lanes.
+  const __m512i swap_bytes = _mm512_setr_epi64(
+      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+      0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  size_t pos = 0;
+  for (; pos + 32 <= length; pos += 32) {
+    const __m512i block = _mm512_loadu_si512((const __m512i *)(input + pos));
+    _mm512_storeu_si512(output + pos, _mm512_shuffle_epi8(block, swap_bytes));
+  }
+  if (pos < length) {
+    // Between 1 and 31 units remain: set one mask bit per remaining unit.
+    const __mmask32 tail_mask((1U << (length - pos)) - 1);
+    const __m512i block =
+        _mm512_maskz_loadu_epi16(tail_mask, (const __m512i *)(input + pos));
+    _mm512_mask_storeu_epi16(output + pos, tail_mask,
+                             _mm512_shuffle_epi8(block, swap_bytes));
+  }
+}
+
+// Counts code points in little-endian UTF-16 by counting every 16-bit unit
+// that lies outside 0xDC00..0xDFFF (the low/trailing surrogate range), so a
+// surrogate pair contributes one. Blocks of 32 units are processed with
+// AVX-512; the remainder goes to scalar::utf16::count_code_points.
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  size_t count{0};
+  const char16_t *cursor = input;
+
+  if (length >= 32) {
+    // Last position from which a full 32-unit block can still be loaded.
+    const char16_t *const last_block = input + length - 32;
+
+    const __m512i range_lo = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i range_hi = _mm512_set1_epi16((uint16_t)0xdfff);
+
+    for (; cursor <= last_block; cursor += 32) {
+      const __m512i block = _mm512_loadu_si512((const __m512i *)cursor);
+      const __mmask32 above = _mm512_cmpgt_epu16_mask(block, range_hi);
+      const __mmask32 below = _mm512_cmplt_epu16_mask(block, range_lo);
+      const uint64_t not_low_surrogate = static_cast<uint64_t>(above | below);
+      count += count_ones(not_low_surrogate);
+    }
+  }
+
+  return count + scalar::utf16::count_code_points<endianness::LITTLE>(
+                     cursor, length - (cursor - input));
+}
+
+// Counts code points in big-endian UTF-16. Each 32-unit block is byte-swapped
+// first, then every unit outside 0xDC00..0xDFFF (the low/trailing surrogate
+// range) is counted, so a surrogate pair contributes one. The remainder goes
+// to scalar::utf16::count_code_points.
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  size_t count{0};
+  const char16_t *cursor = input;
+
+  if (length >= 32) {
+    // Last position from which a full 32-unit block can still be loaded.
+    const char16_t *const last_block = input + length - 32;
+
+    const __m512i range_lo = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i range_hi = _mm512_set1_epi16((uint16_t)0xdfff);
+
+    // Byte-shuffle control that exchanges the bytes of each 16-bit lane.
+    const __m512i swap_bytes = _mm512_setr_epi64(
+        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+        0x0607040502030001, 0x0e0f0c0d0a0b0809);
+
+    for (; cursor <= last_block; cursor += 32) {
+      const __m512i raw = _mm512_loadu_si512((const __m512i *)cursor);
+      const __m512i block = _mm512_shuffle_epi8(raw, swap_bytes);
+      const __mmask32 above = _mm512_cmpgt_epu16_mask(block, range_hi);
+      const __mmask32 below = _mm512_cmplt_epu16_mask(block, range_lo);
+      const uint64_t not_low_surrogate = static_cast<uint64_t>(above | below);
+      count += count_ones(not_low_surrogate);
+    }
+  }
+
+  return count + scalar::utf16::count_code_points<endianness::BIG>(
+                     cursor, length - (cursor - input));
+}
+
+// Counts UTF-8 code points as "bytes that are not continuation bytes".
+//
+// `answer` starts as the number of bytes covered by whole 512-bit vectors and
+// one is subtracted for every continuation byte found in them. Eight vectors
+// at a time are handled by packing their eight 64-bit compare masks into one
+// register and accumulating per-lane popcounts; leftover whole vectors are
+// handled one by one. Bytes past the last whole vector are counted by
+// scalar::utf8::count_code_points.
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  const size_t vec_bytes = sizeof(__m512i);
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = length / vec_bytes * vec_bytes;
+  size_t i = 0;
+  __m512i lane_popcounts = _mm512_setzero_si512();
+
+  // Signed compare: a byte is <= 0b10111111 exactly when it is 0x80..0xBF.
+  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
+
+  while (i + vec_bytes <= length) {
+    const size_t whole_vectors = (length - i) / vec_bytes;
+    // Offset of the last whole vector still to be processed.
+    const size_t last_start = i + whole_vectors * vec_bytes - vec_bytes;
+
+    while (i + 8 * vec_bytes <= last_start) {
+      const __mmask64 k1 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i)), continuation);
+      const __mmask64 k2 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + vec_bytes)),
+          continuation);
+      const __mmask64 k3 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + 2 * vec_bytes)),
+          continuation);
+      const __mmask64 k4 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + 3 * vec_bytes)),
+          continuation);
+      const __mmask64 k5 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + 4 * vec_bytes)),
+          continuation);
+      const __mmask64 k6 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + 5 * vec_bytes)),
+          continuation);
+      const __mmask64 k7 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + 6 * vec_bytes)),
+          continuation);
+      const __mmask64 k8 = _mm512_cmple_epi8_mask(
+          _mm512_loadu_si512((const __m512i *)(bytes + i + 7 * vec_bytes)),
+          continuation);
+
+      // One mask per 64-bit lane, then a vector popcount of all eight.
+      const __m512i packed_masks =
+          _mm512_set_epi64(k8, k7, k6, k5, k4, k3, k2, k1);
+      lane_popcounts =
+          _mm512_add_epi64(lane_popcounts, _mm512_popcnt_epi64(packed_masks));
+      i += 8 * vec_bytes;
+    }
+
+    while (i <= last_start) {
+      const __m512i chunk = _mm512_loadu_si512((const __m512i *)(bytes + i));
+      const uint64_t continuation_bits =
+          static_cast<uint64_t>(_mm512_cmple_epi8_mask(chunk, continuation));
+      answer -= count_ones(continuation_bits);
+      i += vec_bytes;
+    }
+  }
+
+  // Fold the eight per-lane continuation counts into the answer.
+  uint64_t lanes[8];
+  _mm512_storeu_si512(lanes, lane_popcounts);
+  size_t unrolled_continuations = 0;
+  for (size_t lane = 0; lane < 8; lane++) {
+    unrolled_continuations += (size_t)lanes[lane];
+  }
+  answer -= unrolled_continuations;
+
+  return answer + scalar::utf8::count_code_points(input + i, length - i);
+}
+
+// The Latin-1 length of a UTF-8 buffer is computed by calling count_utf8 on
+// it (one Latin-1 byte per UTF-8 code point).
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  const size_t code_points = this->count_utf8(buf, len);
+  return code_points;
+}
+
+// Length-only query: forwards to scalar::utf16::latin1_length_from_utf16 and
+// returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  const size_t ret = scalar::utf16::latin1_length_from_utf16(length);
+  return ret;
+}
+
+// Length-only query: forwards to scalar::utf32::latin1_length_from_utf32 and
+// returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  const size_t ret = scalar::utf32::latin1_length_from_utf32(length);
+  return ret;
+}
+
+// Computes how many UTF-8 bytes little-endian UTF-16 input would occupy.
+//
+// In each 32-unit block every code unit is classified and weighted:
+//   <= 0x007F          -> 1 byte
+//   <= 0x07FF          -> 2 bytes
+//   0xD800..0xDFFF     -> 2 bytes per surrogate unit (4 for a pair)
+//   everything else    -> 3 bytes
+// The remainder is handled by scalar::utf16::utf8_length_from_utf16.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  size_t count{0};
+  const char16_t *cursor = input;
+  if (length >= 32) {
+    // Last position from which a full 32-unit block can still be loaded.
+    const char16_t *const last_block = input + length - 32;
+
+    const __m512i max_one_byte = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i max_two_bytes = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i surrogate_hi = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i surrogate_lo = _mm512_set1_epi16((uint16_t)0xd800);
+
+    for (; cursor <= last_block; cursor += 32) {
+      const __m512i block = _mm512_loadu_si512((const __m512i *)cursor);
+      const __mmask32 is_ascii = _mm512_cmple_epu16_mask(block, max_one_byte);
+      const __mmask32 is_two_bytes =
+          _mm512_mask_cmple_epu16_mask(~is_ascii, block, max_two_bytes);
+      const __mmask32 wider = ~(is_ascii | is_two_bytes);
+      const __mmask32 is_surrogate =
+          _mm512_mask_cmple_epu16_mask(wider, block, surrogate_hi) &
+          _mm512_mask_cmpge_epu16_mask(wider, block, surrogate_lo);
+
+      const size_t n_ascii = count_ones(is_ascii);
+      const size_t n_two = count_ones(is_two_bytes);
+      const size_t n_surrogate = count_ones(is_surrogate);
+      // Whatever is left in the block needs three bytes.
+      const size_t n_three = 32 - n_ascii - n_two - n_surrogate;
+
+      count += n_ascii + 2 * n_two + 3 * n_three + 2 * n_surrogate;
+    }
+  }
+
+  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
+                     cursor, length - (cursor - input));
+}
+
+// Computes how many UTF-8 bytes big-endian UTF-16 input would occupy.
+//
+// Each 32-unit block is byte-swapped first, then every code unit is
+// classified and weighted:
+//   <= 0x007F          -> 1 byte
+//   <= 0x07FF          -> 2 bytes
+//   0xD800..0xDFFF     -> 2 bytes per surrogate unit (4 for a pair)
+//   everything else    -> 3 bytes
+// The remainder is handled by scalar::utf16::utf8_length_from_utf16.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  size_t count{0};
+  const char16_t *cursor = input;
+
+  if (length >= 32) {
+    // Last position from which a full 32-unit block can still be loaded.
+    const char16_t *const last_block = input + length - 32;
+
+    const __m512i max_one_byte = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i max_two_bytes = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i surrogate_hi = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i surrogate_lo = _mm512_set1_epi16((uint16_t)0xd800);
+
+    // Byte-shuffle control that exchanges the bytes of each 16-bit lane.
+    const __m512i swap_bytes = _mm512_setr_epi64(
+        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+        0x0607040502030001, 0x0e0f0c0d0a0b0809);
+
+    for (; cursor <= last_block; cursor += 32) {
+      const __m512i raw = _mm512_loadu_si512((const __m512i *)cursor);
+      const __m512i block = _mm512_shuffle_epi8(raw, swap_bytes);
+      const __mmask32 is_ascii = _mm512_cmple_epu16_mask(block, max_one_byte);
+      const __mmask32 is_two_bytes =
+          _mm512_mask_cmple_epu16_mask(~is_ascii, block, max_two_bytes);
+      const __mmask32 wider = ~(is_ascii | is_two_bytes);
+      const __mmask32 is_surrogate =
+          _mm512_mask_cmple_epu16_mask(wider, block, surrogate_hi) &
+          _mm512_mask_cmpge_epu16_mask(wider, block, surrogate_lo);
+
+      const size_t n_ascii = count_ones(is_ascii);
+      const size_t n_two = count_ones(is_two_bytes);
+      const size_t n_surrogate = count_ones(is_surrogate);
+      // Whatever is left in the block needs three bytes.
+      const size_t n_three = 32 - n_ascii - n_two - n_surrogate;
+
+      count += n_ascii + 2 * n_two + 3 * n_three + 2 * n_surrogate;
+    }
+  }
+
+  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
+                     cursor, length - (cursor - input));
+}
+
+// The UTF-32 length of little-endian UTF-16 input is obtained from
+// count_utf16le. The call is class-qualified, so it binds statically to this
+// class's count_utf16le.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points = this->implementation::count_utf16le(input, length);
+  return code_points;
+}
+
+// The UTF-32 length of big-endian UTF-16 input is obtained from
+// count_utf16be. The call is class-qualified, so it binds statically to this
+// class's count_utf16be.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points = this->implementation::count_utf16be(input, length);
+  return code_points;
+}
+
+// Length-only query: forwards to scalar::latin1::utf16_length_from_latin1 and
+// returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  const size_t ret = scalar::latin1::utf16_length_from_latin1(length);
+  return ret;
+}
+
+// Length-only query: forwards to scalar::latin1::utf32_length_from_latin1 and
+// returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  const size_t ret = scalar::latin1::utf32_length_from_latin1(length);
+  return ret;
+}
+
+// Computes how many UTF-8 bytes Latin-1 input would occupy: one per input
+// byte plus one extra for every byte with its top bit set.
+//
+// `answer` starts as the number of bytes covered by whole 512-bit vectors and
+// the count of top-bit-set bytes among them is added on top.
+//   * Long inputs (>= 2048 vectorised bytes): per-byte-lane 8-bit counters
+//     are bumped for each non-ASCII byte. At most 255 vectors are accumulated
+//     per round so the 8-bit lanes cannot wrap; each round is then widened
+//     into 64-bit lanes with a sum-of-absolute-differences against zero.
+//   * Shorter inputs: one sign-bit mask + popcount per vector.
+// Bytes past the last whole vector are handled by
+// scalar::latin1::utf8_length_from_latin1.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  const size_t vec_bytes = sizeof(__m512i);
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = length / vec_bytes * vec_bytes;
+  size_t i = 0;
+  if (answer >= 2048) { // long strings optimization
+    const __m512i zero = _mm512_setzero_si512();
+    unsigned char all_ones = 0xff;
+    __m512i wide_totals = zero;
+    while (i + vec_bytes <= length) {
+      __m512i byte_counters = zero;
+      size_t rounds = (length - i) / vec_bytes;
+      rounds = rounds > 255 ? 255 : rounds;
+      // Offset of the last whole vector handled in this round.
+      const size_t last_start = i + rounds * vec_bytes - vec_bytes;
+
+      while (i + 4 * vec_bytes <= last_start) {
+        const __m512i in1 = _mm512_loadu_si512((const __m512i *)(bytes + i));
+        const __m512i in2 =
+            _mm512_loadu_si512((const __m512i *)(bytes + i + vec_bytes));
+        const __m512i in3 =
+            _mm512_loadu_si512((const __m512i *)(bytes + i + 2 * vec_bytes));
+        const __m512i in4 =
+            _mm512_loadu_si512((const __m512i *)(bytes + i + 3 * vec_bytes));
+
+        // 0 > byte (signed) is true exactly for bytes with the top bit set.
+        const __mmask64 hi1 = _mm512_cmpgt_epi8_mask(zero, in1);
+        const __mmask64 hi2 = _mm512_cmpgt_epi8_mask(zero, in2);
+        const __mmask64 hi3 = _mm512_cmpgt_epi8_mask(zero, in3);
+        const __mmask64 hi4 = _mm512_cmpgt_epi8_mask(zero, in4);
+
+        // Subtracting 0xFF (i.e. -1) from a lane increments that counter.
+        byte_counters = _mm512_sub_epi8(
+            byte_counters, _mm512_mask_set1_epi8(zero, hi1, all_ones));
+        byte_counters = _mm512_sub_epi8(
+            byte_counters, _mm512_mask_set1_epi8(zero, hi2, all_ones));
+        byte_counters = _mm512_sub_epi8(
+            byte_counters, _mm512_mask_set1_epi8(zero, hi3, all_ones));
+        byte_counters = _mm512_sub_epi8(
+            byte_counters, _mm512_mask_set1_epi8(zero, hi4, all_ones));
+        i += 4 * vec_bytes;
+      }
+
+      while (i <= last_start) {
+        const __m512i chunk = _mm512_loadu_si512((const __m512i *)(bytes + i));
+        const __mmask64 hi = _mm512_cmpgt_epi8_mask(zero, chunk);
+        byte_counters = _mm512_sub_epi8(
+            byte_counters, _mm512_mask_set1_epi8(zero, hi, all_ones));
+        i += vec_bytes;
+      }
+
+      wide_totals =
+          _mm512_add_epi64(wide_totals, _mm512_sad_epu8(byte_counters, zero));
+    }
+
+    // Fold the eight 64-bit partial sums into the answer.
+    uint64_t lanes[8];
+    _mm512_storeu_si512(lanes, wide_totals);
+    size_t non_ascii_total = 0;
+    for (size_t lane = 0; lane < 8; lane++) {
+      non_ascii_total += (size_t)lanes[lane];
+    }
+    answer += non_ascii_total;
+  } else if (answer > 0) {
+    while (i + vec_bytes <= length) {
+      const __m512i chunk = _mm512_loadu_si512((const __m512i *)(bytes + i));
+      const uint64_t non_ascii = _mm512_movepi8_mask(chunk);
+      answer += count_ones(non_ascii);
+      i += vec_bytes;
+    }
+  }
+  return answer +
+         scalar::latin1::utf8_length_from_latin1(input + i, length - i);
+}
+
+// Computes how many UTF-16 code units UTF-8 input would occupy.
+//
+// For each 64-byte block: every byte that is not a continuation byte starts a
+// code point and contributes one unit; every byte >= 0xF0 (lead byte of a
+// 4-byte sequence) contributes one more, since it becomes a surrogate pair.
+// The remainder is handled by scalar::utf8::utf16_length_from_utf8.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  // Signed compare bound: bytes below -64 are the continuation bytes.
+  const __m512i continuation_bound = _mm512_set1_epi8(-65 + 1);
+  const __m512i four_byte_lead = _mm512_set1_epi8(int8_t(240));
+  size_t count = 0;
+  size_t pos = 0;
+  // This algorithm could no doubt be improved!
+  while (pos + 64 <= length) {
+    const __m512i chunk = _mm512_loadu_si512((const __m512i *)(input + pos));
+    const uint64_t continuation_mask =
+        _mm512_cmplt_epi8_mask(chunk, continuation_bound);
+    const uint64_t four_byte_mask =
+        _mm512_cmpge_epu8_mask(chunk, four_byte_lead);
+    count += 64 - count_ones(continuation_mask);
+    count += count_ones(four_byte_mask);
+    pos += 64;
+  }
+  return count +
+         scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
+}
+
+// Computes how many UTF-8 bytes UTF-32 input would occupy.
+//
+// In each 16-unit block every code point is classified and weighted:
+//   <= 0x7F    -> 1 byte
+//   <= 0x7FF   -> 2 bytes
+//   <= 0xFFFF  -> 3 bytes
+//   otherwise  -> 4 bytes
+// The remainder is handled by scalar::utf32::utf8_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  size_t count{0};
+  const char32_t *cursor = input;
+
+  if (length >= 16) {
+    // Last position from which a full 16-unit block can still be loaded.
+    const char32_t *const last_block = input + length - 16;
+
+    const __m512i max_one_byte = _mm512_set1_epi32((uint32_t)0x7f);
+    const __m512i max_two_bytes = _mm512_set1_epi32((uint32_t)0x7ff);
+    const __m512i max_three_bytes = _mm512_set1_epi32((uint32_t)0x0000ffff);
+
+    for (; cursor <= last_block; cursor += 16) {
+      const __m512i block = _mm512_loadu_si512((const __m512i *)cursor);
+      const __mmask16 is_ascii = _mm512_cmple_epu32_mask(block, max_one_byte);
+      const __mmask16 is_two_bytes = _mm512_mask_cmple_epu32_mask(
+          _knot_mask16(is_ascii), block, max_two_bytes);
+      const __mmask16 is_three_bytes = _mm512_mask_cmple_epu32_mask(
+          _knot_mask16(_mm512_kor(is_ascii, is_two_bytes)), block,
+          max_three_bytes);
+
+      const size_t n_ascii = count_ones(is_ascii);
+      const size_t n_two = count_ones(is_two_bytes);
+      const size_t n_three = count_ones(is_three_bytes);
+      // Whatever is left in the block needs four bytes.
+      const size_t n_four = 16 - n_ascii - n_two - n_three;
+
+      count += n_ascii + 2 * n_two + 3 * n_three + 4 * n_four;
+    }
+  }
+
+  return count + scalar::utf32::utf8_length_from_utf32(
+                     cursor, length - (cursor - input));
+}
+
+// Computes how many UTF-16 code units UTF-32 input would occupy: one per code
+// point, plus one more for every code point above 0xFFFF (those need a
+// surrogate pair). Blocks of 16 code points are processed with AVX-512; the
+// remainder goes to scalar::utf32::utf16_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  size_t count{0};
+  const char32_t *cursor = input;
+
+  if (length >= 16) {
+    // Last position from which a full 16-unit block can still be loaded.
+    const char32_t *const last_block = input + length - 16;
+
+    const __m512i max_single_unit = _mm512_set1_epi32((uint32_t)0x0000ffff);
+
+    for (; cursor <= last_block; cursor += 16) {
+      const __m512i block = _mm512_loadu_si512((const __m512i *)cursor);
+      const __mmask16 needs_pair =
+          _mm512_cmpgt_epu32_mask(block, max_single_unit);
+
+      count += 16 + count_ones(needs_pair);
+    }
+  }
+
+  return count + scalar::utf32::utf16_length_from_utf32(
+                     cursor, length - (cursor - input));
+}
+
+// The UTF-32 length of UTF-8 input is obtained from count_utf8. The call is
+// class-qualified, so it binds statically to this class's count_utf8.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t code_points = this->implementation::count_utf8(input, length);
+  return code_points;
+}
+
+// Upper bound on the decoded size of base64 text held in a char buffer;
+// forwards to scalar::base64::maximal_binary_length_from_base64.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept {
+  const size_t ret =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return ret;
+}
+
+// Decodes base64 text from a char buffer into output. The base64_url bit of
+// `options` selects which compress_decode_base64 instantiation runs (<true>
+// when set, <false> otherwise); all arguments are passed through as given.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes base64 text from a char buffer into output and returns the
+// full_result produced by the decoder. The base64_url bit of `options`
+// selects which compress_decode_base64 instantiation runs (<true> when set,
+// <false> otherwise); all arguments are passed through as given.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Upper bound on the decoded size of base64 text held in a char16_t buffer;
+// forwards to scalar::base64::maximal_binary_length_from_base64.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t ret =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return ret;
+}
+
+// Decodes base64 text from a char16_t buffer into output. The base64_url bit
+// of `options` selects which compress_decode_base64 instantiation runs
+// (<true> when set, <false> otherwise); all arguments are passed through as
+// given.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes base64 text from a char16_t buffer into output and returns the
+// full_result produced by the decoder. The base64_url bit of `options`
+// selects which compress_decode_base64 instantiation runs (<true> when set,
+// <false> otherwise); all arguments are passed through as given.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Length-only query: forwards to scalar::base64::base64_length_from_binary
+// with the same length and options and returns its result unchanged.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  const size_t ret = scalar::base64::base64_length_from_binary(length, options);
+  return ret;
+}
+
+// Encodes `length` bytes from input as base64 into output and returns the
+// encoder's result. The base64_url bit of `options` selects which
+// encode_base64 instantiation runs (<true> when set, <false> otherwise).
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  return (options & base64_url)
+             ? encode_base64<true>(output, input, length, options)
+             : encode_base64<false>(output, input, length, options);
+}
+
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/icelake/end.h"
diff --git a/contrib/simdutf/src/implementation.cpp b/contrib/simdutf/src/implementation.cpp
new file mode 100644
index 000000000..0397bcdd6
--- /dev/null
+++ b/contrib/simdutf/src/implementation.cpp
@@ -0,0 +1,1991 @@
+#include "simdutf.h"
+#include <initializer_list>
+#include <climits>
+#include <type_traits>
+
+// Useful for debugging purposes
+namespace simdutf {
+namespace {
+
+/**
+ * Debug helper: renders the bits of an integral value as '0'/'1' characters,
+ * most significant bit first. The result has sizeof(T) * CHAR_BIT characters.
+ *
+ * Bits are read by shifting the value instead of walking a mask of type T.
+ * With a signed T the top-bit mask is negative, so a `mask > 0` loop never
+ * executes and an empty string would be returned.
+ */
+template <typename T> std::string toBinaryString(T b) {
+  const size_t bit_count = sizeof(T) * CHAR_BIT;
+  std::string binary;
+  binary.reserve(bit_count);
+  // Walk from the most significant bit down to bit 0.
+  for (size_t bit = bit_count; bit-- > 0;) {
+    binary += (((b >> bit) & 1) == 0) ? '0' : '1';
+  }
+  return binary;
+}
+} // namespace
+} // namespace simdutf
+
+// Implementations
+// The best choice should always come first!
+#include "simdutf/arm64.h"
+#include "simdutf/icelake.h"
+#include "simdutf/haswell.h"
+#include "simdutf/westmere.h"
+#include "simdutf/ppc64.h"
+#include "simdutf/rvv.h"
+#include "simdutf/lsx.h"
+#include "simdutf/lasx.h"
+#include "simdutf/fallback.h" // have it always last.
+
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/utf32.h"
+#include "scalar/base64.h"
+#include "scalar/latin1_to_utf8/latin1_to_utf8.h"
+
+namespace simdutf {
+// Returns true when every instruction-set bit reported by
+// required_instruction_sets() is also present in the mask returned by
+// internal::detect_supported_architectures().
+bool implementation::supported_by_runtime_system() const {
+  const uint32_t needed = this->required_instruction_sets();
+  const uint32_t available = internal::detect_supported_architectures();
+  const uint32_t missing = needed & ~available;
+  return missing == 0;
+}
+
+// Guesses the encoding of `input`. A byte-order mark, when present, decides
+// the answer. Otherwise the candidates are tried in the order UTF-8,
+// UTF-16LE, UTF-32LE and the first one that validates is returned;
+// encoding_type::unspecified means none of them did.
+simdutf_warn_unused encoding_type implementation::autodetect_encoding(
+    const char *input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  const auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom;
+  }
+  // UTF-8 is common, it includes ASCII, and it is usually stored without a
+  // BOM, so if the input validates as UTF-8 we go with that. This is only a
+  // guess: if someone has UTF-16 data without a BOM, it could pass as UTF-8.
+  //
+  // An interesting twist might be to check for UTF-16 ASCII first (every
+  // other byte is zero).
+  if (validate_utf8(input, length)) {
+    return encoding_type::UTF8;
+  }
+  // The next most likely BOM-less encoding is probably UTF-16LE. The
+  // validator takes a count of 16-bit units, hence the division by two.
+  const bool whole_utf16_units = (length % 2) == 0;
+  if (whole_utf16_units &&
+      validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                       length / 2)) {
+    return encoding_type::UTF16_LE;
+  }
+  // Finally UTF-32, counted in 32-bit units.
+  const bool whole_utf32_units = (length % 4) == 0;
+  if (whole_utf32_units &&
+      validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+    return encoding_type::UTF32_LE;
+  }
+  return encoding_type::unspecified;
+}
+
+namespace internal {
+// When there is a single implementation, we should not pay a price
+// for dispatching to the best implementation. We should just use the
+// one we have. This is a compile-time check.
+//
+// Every implementation that has a singleton getter below is counted here,
+// RVV included. If one were left out, a build enabling it together with
+// exactly one other implementation would be treated as single-implementation
+// and the omitted one could never be selected.
+#define SIMDUTF_SINGLE_IMPLEMENTATION \
+  (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL + \
+   SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 + \
+   SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_RVV + \
+   SIMDUTF_IMPLEMENTATION_LSX + SIMDUTF_IMPLEMENTATION_LASX + \
+   SIMDUTF_IMPLEMENTATION_FALLBACK == \
+   1)
+
+// Singleton accessors for the known implementations. Each one owns a
+// function-local static instance and hands out a pointer to it.
+
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+static const icelake::implementation *get_icelake_singleton() {
+  static const icelake::implementation icelake_singleton{};
+  return &icelake_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+static const haswell::implementation *get_haswell_singleton() {
+  static const haswell::implementation haswell_singleton{};
+  return &haswell_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+static const westmere::implementation *get_westmere_singleton() {
+  static const westmere::implementation westmere_singleton{};
+  return &westmere_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_ARM64
+static const arm64::implementation *get_arm64_singleton() {
+  static const arm64::implementation arm64_singleton{};
+  return &arm64_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+static const ppc64::implementation *get_ppc64_singleton() {
+  static const ppc64::implementation ppc64_singleton{};
+  return &ppc64_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+static const rvv::implementation *get_rvv_singleton() {
+  static const rvv::implementation rvv_singleton{};
+  return &rvv_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX
+static const lsx::implementation *get_lsx_singleton() {
+  static const lsx::implementation lsx_singleton{};
+  return &lsx_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_LASX
+static const lasx::implementation *get_lasx_singleton() {
+  static const lasx::implementation lasx_singleton{};
+  return &lasx_singleton;
+}
+#endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+static const fallback::implementation *get_fallback_singleton() {
+  static const fallback::implementation fallback_singleton{};
+  return &fallback_singleton;
+}
+#endif
+
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+// Returns the only implementation compiled in. SIMDUTF_SINGLE_IMPLEMENTATION
+// guarantees that exactly one of the branches below survives preprocessing,
+// and that branch completes the `return` statement. The list of branches must
+// match the terms summed in SIMDUTF_SINGLE_IMPLEMENTATION.
+static const implementation *get_single_implementation() {
+  return
+  #if SIMDUTF_IMPLEMENTATION_ICELAKE
+      get_icelake_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_HASWELL
+      get_haswell_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_WESTMERE
+      get_westmere_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_ARM64
+      get_arm64_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_PPC64
+      get_ppc64_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_RVV
+      get_rvv_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_LSX
+      get_lsx_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_LASX
+      get_lasx_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_FALLBACK
+      get_fallback_singleton();
+  #endif
+}
+#endif
+
+/**
+ * @private Detects best supported implementation on first use, and sets it
+ */
+class detect_best_supported_implementation_on_first_use final
+ : public implementation {
+public:
+ std::string name() const noexcept final { return set_best()->name(); }
+ std::string description() const noexcept final {
+ return set_best()->description();
+ }
+ uint32_t required_instruction_sets() const noexcept final {
+ return set_best()->required_instruction_sets();
+ }
+
+ simdutf_warn_unused int
+ detect_encodings(const char *input, size_t length) const noexcept override {
+ return set_best()->detect_encodings(input, length);
+ }
+
+ simdutf_warn_unused bool
+ validate_utf8(const char *buf, size_t len) const noexcept final override {
+ return set_best()->validate_utf8(buf, len);
+ }
+
+ simdutf_warn_unused result validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept final override {
+ return set_best()->validate_utf8_with_errors(buf, len);
+ }
+
+ simdutf_warn_unused bool
+ validate_ascii(const char *buf, size_t len) const noexcept final override {
+ return set_best()->validate_ascii(buf, len);
+ }
+
+ simdutf_warn_unused result validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept final override {
+ return set_best()->validate_ascii_with_errors(buf, len);
+ }
+
+ simdutf_warn_unused bool
+ validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final override {
+ return set_best()->validate_utf16le(buf, len);
+ }
+
+ simdutf_warn_unused bool
+ validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final override {
+ return set_best()->validate_utf16be(buf, len);
+ }
+
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final override {
+ return set_best()->validate_utf16le_with_errors(buf, len);
+ }
+
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final override {
+ return set_best()->validate_utf16be_with_errors(buf, len);
+ }
+
+ simdutf_warn_unused bool
+ validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final override {
+ return set_best()->validate_utf32(buf, len);
+ }
+
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final override {
+ return set_best()->validate_utf32_with_errors(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ convert_latin1_to_utf8(const char *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len,
+ char32_t *latin1_output) const noexcept final override {
+ return set_best()->convert_latin1_to_utf32(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf8_to_latin1(const char *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf8_to_latin1_with_errors(buf, len,
+ latin1_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf8_to_utf16le_with_errors(buf, len,
+ utf16_output);
+ }
+
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf8_to_utf16be_with_errors(buf, len,
+ utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf8_to_utf32(const char *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
+ }
+
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_utf8_to_utf32_with_errors(buf, len,
+ utf32_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf16le_to_latin1_with_errors(buf, len,
+ latin1_output);
+ }
+
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf16be_to_latin1_with_errors(buf, len,
+ latin1_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
+ const char16_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
+ const char16_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf8(const char16_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf8(const char16_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_utf16le_to_utf8_with_errors(buf, len,
+ utf8_output);
+ }
+
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_utf16be_to_utf8_with_errors(buf, len,
+ utf8_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
+ }
+
+ simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final override {
+ return set_best()->convert_utf32_to_latin1_with_errors(buf, len,
+ latin1_output);
+ }
+
+  // Forwards to the best implementation's convert_valid_utf32_to_latin1.
+  // The "valid" entry point must dispatch to the matching "valid" routine,
+  // like every other forwarder in this class, not to convert_utf32_to_latin1.
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
+      const char32_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_latin1(buf, len, latin1_output);
+  }
+
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf8(const char32_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf8(const char32_t *buf, size_t len,
+ char *utf8_output) const noexcept final override {
+ return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
+ }
+
+ simdutf_warn_unused size_t convert_utf32_to_utf16le(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_utf32_to_utf16be(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf32_to_utf16le_with_errors(buf, len,
+ utf16_output);
+ }
+
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_utf32_to_utf16be_with_errors(buf, len,
+ utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_output) const noexcept final override {
+ return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
+ }
+
+ simdutf_warn_unused size_t convert_utf16le_to_utf32(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
+ }
+
+ simdutf_warn_unused size_t convert_utf16be_to_utf32(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
+ }
+
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_utf16le_to_utf32_with_errors(buf, len,
+ utf32_output);
+ }
+
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_utf16be_to_utf32_with_errors(buf, len,
+ utf32_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_output) const noexcept final override {
+ return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
+ }
+
+ void change_endianness_utf16(const char16_t *buf, size_t len,
+ char16_t *output) const noexcept final override {
+ set_best()->change_endianness_utf16(buf, len, output);
+ }
+
+ simdutf_warn_unused size_t
+ count_utf16le(const char16_t *buf, size_t len) const noexcept final override {
+ return set_best()->count_utf16le(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ count_utf16be(const char16_t *buf, size_t len) const noexcept final override {
+ return set_best()->count_utf16be(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ count_utf8(const char *buf, size_t len) const noexcept final override {
+ return set_best()->count_utf8(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *buf, size_t len) const noexcept override {
+ return set_best()->latin1_length_from_utf8(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t len) const noexcept override {
+ return set_best()->latin1_length_from_utf16(len);
+ }
+
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t len) const noexcept override {
+ return set_best()->latin1_length_from_utf32(len);
+ }
+
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *buf, size_t len) const noexcept override {
+ return set_best()->utf8_length_from_latin1(buf, len);
+ }
+
+ simdutf_warn_unused size_t utf8_length_from_utf16le(
+ const char16_t *buf, size_t len) const noexcept override {
+ return set_best()->utf8_length_from_utf16le(buf, len);
+ }
+
+ simdutf_warn_unused size_t utf8_length_from_utf16be(
+ const char16_t *buf, size_t len) const noexcept override {
+ return set_best()->utf8_length_from_utf16be(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t len) const noexcept override {
+ return set_best()->utf16_length_from_latin1(len);
+ }
+
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t len) const noexcept override {
+ return set_best()->utf32_length_from_latin1(len);
+ }
+
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *buf, size_t len) const noexcept override {
+ return set_best()->utf32_length_from_utf16le(buf, len);
+ }
+
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *buf, size_t len) const noexcept override {
+ return set_best()->utf32_length_from_utf16be(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *buf, size_t len) const noexcept override {
+ return set_best()->utf16_length_from_utf8(buf, len);
+ }
+
+ simdutf_warn_unused size_t utf8_length_from_utf32(
+ const char32_t *buf, size_t len) const noexcept override {
+ return set_best()->utf8_length_from_utf32(buf, len);
+ }
+
+ simdutf_warn_unused size_t utf16_length_from_utf32(
+ const char32_t *buf, size_t len) const noexcept override {
+ return set_best()->utf16_length_from_utf32(buf, len);
+ }
+
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *buf, size_t len) const noexcept override {
+ return set_best()->utf32_length_from_utf8(buf, len);
+ }
+
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept override {
+ return set_best()->maximal_binary_length_from_base64(input, length);
+ }
+
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_handling_options =
+ last_chunk_handling_options::loose) const noexcept override {
+ return set_best()->base64_to_binary(input, length, output, options,
+ last_chunk_handling_options);
+ }
+
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_handling_options =
+ last_chunk_handling_options::loose) const noexcept override {
+ return set_best()->base64_to_binary_details(input, length, output, options,
+ last_chunk_handling_options);
+ }
+
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept override {
+ return set_best()->maximal_binary_length_from_base64(input, length);
+ }
+
+ simdutf_warn_unused result base64_to_binary(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_handling_options =
+ last_chunk_handling_options::loose) const noexcept override {
+ return set_best()->base64_to_binary(input, length, output, options,
+ last_chunk_handling_options);
+ }
+
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_handling_options =
+ last_chunk_handling_options::loose) const noexcept override {
+ return set_best()->base64_to_binary_details(input, length, output, options,
+ last_chunk_handling_options);
+ }
+
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept override {
+ return set_best()->base64_length_from_binary(length, options);
+ }
+
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept override {
+ return set_best()->binary_to_base64(input, length, output, options);
+ }
+
+ simdutf_really_inline
+ detect_best_supported_implementation_on_first_use() noexcept
+ : implementation("best_supported_detector",
+ "Detects the best supported implementation and sets it",
+ 0) {}
+
+private:
+ const implementation *set_best() const noexcept;
+};
+
+static_assert(std::is_trivially_destructible<
+ detect_best_supported_implementation_on_first_use>::value,
+ "detect_best_supported_implementation_on_first_use should be "
+ "trivially destructible");
+
+// Returns a reference to a function-local static list holding one pointer per
+// implementation enabled at compile time. Each entry comes from the matching
+// get_*_singleton() accessor, and the entries appear in the order written
+// below (icelake first, fallback last).
+static const std::initializer_list<const implementation *> &
+get_available_implementation_pointers() {
+ static const std::initializer_list<const implementation *>
+ available_implementation_pointers{
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+ get_icelake_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+ get_haswell_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+ get_westmere_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_ARM64
+ get_arm64_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+ get_ppc64_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+ get_rvv_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX
+ get_lsx_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_LASX
+ get_lasx_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+ get_fallback_singleton(),
+#endif
+ }; // available_implementation_pointers
+ return available_implementation_pointers;
+}
+
+// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no
+// support
+class unsupported_implementation final : public implementation {
+public:
+ simdutf_warn_unused int detect_encodings(const char *,
+ size_t) const noexcept override {
+ return encoding_type::unspecified;
+ }
+
+ simdutf_warn_unused bool validate_utf8(const char *,
+ size_t) const noexcept final override {
+ return false; // Just refuse to validate. Given that we have a fallback
+ // implementation
+ // it seems unlikely that unsupported_implementation will ever be used. If
+ // it is used, then it will flag all strings as invalid. The alternative is
+ // to return an error_code from which the user has to figure out whether the
+ // string is valid UTF-8... which seems like a lot of work just to handle
+ // the very unlikely case that we have an unsupported implementation. And,
+ // when it does happen (that we have an unsupported implementation), what
+ // are the chances that the programmer has a fallback? Given that *we*
+ // provide the fallback, it implies that the programmer would need a
+ // fallback for our fallback.
+ }
+
+ simdutf_warn_unused result validate_utf8_with_errors(
+ const char *, size_t) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused bool
+ validate_ascii(const char *, size_t) const noexcept final override {
+ return false;
+ }
+
+ simdutf_warn_unused result validate_ascii_with_errors(
+ const char *, size_t) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused bool
+ validate_utf16le(const char16_t *, size_t) const noexcept final override {
+ return false;
+ }
+
+ simdutf_warn_unused bool
+ validate_utf16be(const char16_t *, size_t) const noexcept final override {
+ return false;
+ }
+
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *, size_t) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *, size_t) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused bool
+ validate_utf32(const char32_t *, size_t) const noexcept final override {
+ return false;
+ }
+
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *, size_t) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *, size_t, char32_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *, size_t, char16_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *, size_t, char32_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *, size_t, char32_t *) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *, size_t, char32_t *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf16le_to_latin1(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf16be_to_latin1(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0);
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0;
+ }
+
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf32_to_latin1(
+ const char32_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
+ const char32_t *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
+ const char32_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *, size_t, char *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *, size_t, char *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf32_to_utf16le(
+ const char32_t *, size_t, char16_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf32_to_utf16be(
+ const char32_t *, size_t, char16_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *, size_t, char16_t *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *, size_t, char16_t *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
+ const char32_t *, size_t, char16_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
+ const char32_t *, size_t, char16_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf16le_to_utf32(
+ const char16_t *, size_t, char32_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_utf16be_to_utf32(
+ const char16_t *, size_t, char32_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *, size_t, char32_t *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *, size_t, char32_t *) const noexcept final override {
+ return result(error_code::OTHER, 0); // stub: every call reports error_code::OTHER
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
+ const char16_t *, size_t, char32_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
+ const char16_t *, size_t, char32_t *) const noexcept final override {
+ return 0; // stub: arguments are ignored, always 0
+ }
+
+ void change_endianness_utf16(const char16_t *, size_t,
+ char16_t *) const noexcept final override {} // stub: empty body, the output buffer is never written
+
+ simdutf_warn_unused size_t
+ count_utf16le(const char16_t *, size_t) const noexcept final override {
+ return 0; // stub: input is ignored, the count is always 0
+ }
+
+ simdutf_warn_unused size_t
+ count_utf16be(const char16_t *, size_t) const noexcept final override {
+ return 0; // stub: input is ignored, the count is always 0
+ }
+
+ simdutf_warn_unused size_t count_utf8(const char *,
+ size_t) const noexcept final override {
+ return 0; // stub: input is ignored, the count is always 0
+ }
+
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *, size_t) const noexcept override {
+ return 0; // stub: input is ignored, the reported length is always 0
+ }
+
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *, size_t) const noexcept override {
+ return 0; // stub (char input): always 0
+ }
+
+ simdutf_warn_unused result
+ base64_to_binary(const char *, size_t, char *, base64_options,
+ last_chunk_handling_options) const noexcept override {
+ return result(error_code::OTHER, 0); // stub: nothing is decoded, error_code::OTHER is reported
+ }
+
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *, size_t, char *, base64_options,
+ last_chunk_handling_options) const noexcept override {
+ return full_result(error_code::OTHER, 0, 0); // stub: error_code::OTHER with both counters set to 0
+ }
+
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *, size_t) const noexcept override {
+ return 0; // stub (char16_t input): always 0
+ }
+
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *, size_t, char *, base64_options,
+ last_chunk_handling_options) const noexcept override {
+ return result(error_code::OTHER, 0); // stub: nothing is decoded, error_code::OTHER is reported
+ }
+
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *, size_t, char *, base64_options,
+ last_chunk_handling_options) const noexcept override {
+ return full_result(error_code::OTHER, 0, 0); // stub: error_code::OTHER with both counters set to 0
+ }
+
+ simdutf_warn_unused size_t
+ base64_length_from_binary(size_t, base64_options) const noexcept override {
+ return 0; // stub: always 0, regardless of input length or options
+ }
+
+ size_t binary_to_base64(const char *, size_t, char *,
+ base64_options) const noexcept override {
+ return 0; // stub: nothing is encoded, always 0
+ }
+
+ unsupported_implementation()
+ : implementation("unsupported", // name passed to the base class
+ "Unsupported CPU (no detected SIMD instructions)", 0) {} // description, then 0 (presumably the required-instruction-set mask; confirm against the base constructor)
+};
+
+const unsupported_implementation *get_unsupported_singleton() { // returns the address of the single shared fallback object
+ static const unsupported_implementation unsupported_singleton{}; // function-local static: one instance, created on first call
+ return &unsupported_singleton;
+}
+static_assert(std::is_trivially_destructible<unsupported_implementation>::value, // compile-time check on the singleton's type
+ "unsupported_singleton should be trivially destructible");
+
+size_t available_implementation_list::size() const noexcept {
+ return internal::get_available_implementation_pointers().size(); // element count of the compiled-in implementation list
+}
+const implementation *const *
+available_implementation_list::begin() const noexcept {
+ return internal::get_available_implementation_pointers().begin(); // first entry of the same list
+}
+const implementation *const *
+available_implementation_list::end() const noexcept {
+ return internal::get_available_implementation_pointers().end(); // one past the last entry of the same list
+}
+const implementation *
+available_implementation_list::detect_best_supported() const noexcept {
+ // The candidate list is already in priority order, so the first candidate
+ // whose requirements are fully covered by the host wins.
+ const uint32_t host_features = internal::detect_supported_architectures();
+ for (const implementation *candidate :
+ internal::get_available_implementation_pointers()) {
+ const uint32_t needed = candidate->required_instruction_sets();
+ const uint32_t missing = needed & ~host_features;
+ if (missing != 0) {
+ continue; // at least one required instruction set is not available
+ }
+ return candidate;
+ }
+ // Nothing in the list is usable; this is not expected to happen.
+ return get_unsupported_singleton();
+}
+
+const implementation *
+detect_best_supported_implementation_on_first_use::set_best() const noexcept { // chooses an implementation, stores it as the active one and returns it
+ SIMDUTF_PUSH_DISABLE_WARNINGS
+ SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC:
+ // manually verified this is safe
+ char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); // optional override taken from the environment
+ SIMDUTF_POP_DISABLE_WARNINGS
+
+ if (force_implementation_name) { // an override was requested
+ auto force_implementation =
+ get_available_implementations()[force_implementation_name]; // look the requested name up among the compiled-in implementations
+ if (force_implementation) {
+ return get_active_implementation() = force_implementation; // install the requested implementation
+ } else {
+ // Note: abort() and stderr usage within the library is forbidden.
+ return get_active_implementation() = get_unsupported_singleton(); // lookup failed: install the "unsupported" fallback instead of aborting
+ }
+ }
+ return get_active_implementation() =
+ get_available_implementations().detect_best_supported(); // no override: install whatever detect_best_supported() selects
+}
+
+} // namespace internal
+
+/**
+ * The list of available implementations compiled into simdutf.
+ */
+SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
+get_available_implementations() {
+ static const internal::available_implementation_list
+ available_implementations{}; // function-local static: a single list object shared by all callers
+ return available_implementations;
+}
+
+/**
+ * The active implementation.
+ */
+SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
+get_active_implementation() { // returns a reference to the function-local pointer holding the active implementation
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+ // skip runtime detection
+ static internal::atomic_ptr<const implementation> active_implementation{
+ internal::get_single_implementation()}; // initialised directly with the single compiled-in implementation
+ return active_implementation;
+#else
+ static const internal::detect_best_supported_implementation_on_first_use
+ detect_best_supported_implementation_on_first_use_singleton; // placeholder object used as the initial active implementation
+ static internal::atomic_ptr<const implementation> active_implementation{
+ &detect_best_supported_implementation_on_first_use_singleton}; // presumably replaced via set_best() on first use; confirm in the placeholder class
+ return active_implementation;
+#endif
+}
+
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+const implementation *get_default_implementation() { // single-implementation build: plain pointer, no indirection
+ return internal::get_single_implementation();
+}
+#else
+internal::atomic_ptr<const implementation> &get_default_implementation() { // multi-implementation build: same object as get_active_implementation()
+ return get_active_implementation();
+}
+#endif
+#define SIMDUTF_GET_CURRENT_IMPLEMENTION // NOTE(review): empty marker macro; the spelling "IMPLEMENTION" may be tested elsewhere, so confirm before renaming
+
+simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
+ return get_default_implementation()->validate_utf8(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf8_with_errors(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
+ return get_default_implementation()->validate_ascii(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_ascii_with_errors(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf8_to_utf16(
+ const char *input, size_t length, char16_t *utf16_output) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf8_to_utf16be(input, length, utf16_output); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf8_to_utf16le(input, length, utf16_output); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len,
+ char *utf8_output) noexcept {
+ return get_default_implementation()->convert_latin1_to_utf8(buf, len,
+ utf8_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) noexcept {
+ return get_default_implementation()->convert_latin1_to_utf16le(buf, len,
+ utf16_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) noexcept {
+ return get_default_implementation()->convert_latin1_to_utf16be(buf, len,
+ utf16_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) noexcept { // buf/len: Latin-1 input; utf32_output: destination (was misnamed latin1_output)
+ return get_default_implementation()->convert_latin1_to_utf32(buf, len,
+ utf32_output); // forwarded unchanged; the implementation's return value is passed through
+}
+simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_latin1(buf, len,
+ latin1_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_latin1_with_errors(
+ buf, len, latin1_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) noexcept {
+ return get_default_implementation()->convert_valid_utf8_to_latin1(
+ buf, len, latin1_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *input, size_t length, char16_t *utf16_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_utf16le(input, length,
+ utf16_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *input, size_t length, char16_t *utf16_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_utf16be(input, length,
+ utf16_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
+ const char *input, size_t length, char16_t *utf16_output) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf8_to_utf16be_with_errors(input, length, utf16_output); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf8_to_utf16le_with_errors(input, length, utf16_output); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *input, size_t length, char16_t *utf16_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_utf16le_with_errors(
+ input, length, utf16_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *input, size_t length, char16_t *utf16_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_utf16be_with_errors(
+ input, length, utf16_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *input, size_t length, char32_t *utf32_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_utf32(input, length,
+ utf32_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *input, size_t length, char32_t *utf32_output) noexcept {
+ return get_default_implementation()->convert_utf8_to_utf32_with_errors(
+ input, length, utf32_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused bool validate_utf16(const char16_t *buf,
+ size_t len) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return validate_utf16be(buf, len); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return validate_utf16le(buf, len); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf16le(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf16be(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
+ size_t len) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return validate_utf16be_with_errors(buf, len); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return validate_utf16le_with_errors(buf, len); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf16le_with_errors(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf16be_with_errors(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf32(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->validate_utf32_with_errors(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
+ const char *input, size_t length, char16_t *utf16_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *input, size_t length, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf8_to_utf16le(
+ input, length, utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *input, size_t length, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf8_to_utf16be(
+ input, length, utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *input, size_t length, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf8_to_utf32(
+ input, length, utf32_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf,
+ size_t len,
+ char *utf8_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf16be_to_utf8(buf, len, utf8_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf16le_to_utf8(buf, len, utf8_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_utf16_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf16be_to_latin1(buf, len, latin1_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf16le_to_latin1(buf, len, latin1_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_latin1_to_utf16(
+ const char *buf, size_t len, char16_t *utf16_output) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_latin1_to_utf16be(buf, len, utf16_output); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_latin1_to_utf16le(buf, len, utf16_output); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_utf16be_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+ return get_default_implementation()->convert_utf16be_to_latin1(buf, len,
+ latin1_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf16le_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+ return get_default_implementation()->convert_utf16le_to_latin1(buf, len,
+ latin1_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf16be_to_latin1(
+ buf, len, latin1_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf16le_to_latin1(
+ buf, len, latin1_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+ return get_default_implementation()->convert_utf16le_to_latin1_with_errors(
+ buf, len, latin1_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+ return get_default_implementation()->convert_utf16be_to_latin1_with_errors(
+ buf, len, latin1_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf,
+ size_t len,
+ char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_utf16le_to_utf8(buf, len,
+ utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf,
+ size_t len,
+ char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_utf16be_to_utf8(buf, len,
+ utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_utf16le_to_utf8_with_errors(
+ buf, len, utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_utf16be_to_utf8_with_errors(
+ buf, len, utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
+ const char16_t *buf, size_t len, char *latin1_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf16le_to_utf8(
+ buf, len, utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf16be_to_utf8(
+ buf, len, utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf,
+ size_t len,
+ char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_utf32_to_utf8(buf, len,
+ utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_utf32_to_utf8_with_errors(
+ buf, len, utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len,
+ utf8_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf32_to_utf16(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf32_to_utf16be(buf, len, utf16_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf32_to_utf16le(buf, len, utf16_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_utf32_to_latin1(
+ const char32_t *input, size_t length, char *latin1_output) noexcept {
+ return get_default_implementation()->convert_utf32_to_latin1(input, length,
+ latin1_output); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_utf32_to_utf16le(buf, len,
+ utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_utf32_to_utf16be(buf, len,
+ utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_utf32_to_utf16le_with_errors(
+ buf, len, utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_utf32_to_utf16be_with_errors(
+ buf, len, utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf32_to_utf16le(
+ buf, len, utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf32_to_utf16be(
+ buf, len, utf16_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf16_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf16be_to_utf32(buf, len, utf32_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf16le_to_utf32(buf, len, utf32_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_utf16le_to_utf32(buf, len,
+ utf32_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_utf16be_to_utf32(buf, len,
+ utf32_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_utf16le_to_utf32_with_errors(
+ buf, len, utf32_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_utf16be_to_utf32_with_errors(
+ buf, len, utf32_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf16le_to_utf32(
+ buf, len, utf32_buffer); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+ return get_default_implementation()->convert_valid_utf16be_to_utf32(
+ buf, len, utf32_buffer); // forwarded unchanged to the default implementation
+}
+void change_endianness_utf16(const char16_t *input, size_t length,
+ char16_t *output) noexcept {
+ get_default_implementation()->change_endianness_utf16(input, length, output); // forwarded unchanged; nothing is returned
+}
+simdutf_warn_unused size_t count_utf16(const char16_t *input,
+ size_t length) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return count_utf16be(input, length); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return count_utf16le(input, length); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t count_utf16le(const char16_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->count_utf16le(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t count_utf16be(const char16_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->count_utf16be(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t count_utf8(const char *input,
+ size_t length) noexcept {
+ return get_default_implementation()->count_utf8(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->latin1_length_from_utf8(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept {
+ return get_default_implementation()->latin1_length_from_utf16(len); // takes only a length, no buffer; forwarded unchanged
+}
+simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept {
+ return get_default_implementation()->latin1_length_from_utf32(len); // takes only a length, no buffer; forwarded unchanged
+}
+simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf,
+ size_t len) noexcept {
+ return get_default_implementation()->utf8_length_from_latin1(buf, len); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
+ size_t length) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return utf8_length_from_utf16be(input, length); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return utf8_length_from_utf16le(input, length); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf8_length_from_utf16le(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf8_length_from_utf16be(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
+ size_t length) noexcept { // endianness-agnostic entry point, resolved at compile time
+#if SIMDUTF_IS_BIG_ENDIAN
+ return utf32_length_from_utf16be(input, length); // BE variant when SIMDUTF_IS_BIG_ENDIAN is set
+#else
+ return utf32_length_from_utf16le(input, length); // LE variant otherwise
+#endif
+}
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf32_length_from_utf16le(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf32_length_from_utf16be(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf16_length_from_utf8(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept {
+ return get_default_implementation()->utf16_length_from_latin1(length); // takes only a length, no buffer; forwarded unchanged
+}
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf8_length_from_utf32(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf16_length_from_utf32(input, length); // forwarded unchanged to the default implementation
+}
+simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
+ size_t length) noexcept {
+ return get_default_implementation()->utf32_length_from_utf8(input, length); // forwarded unchanged to the default implementation
+}
+
+simdutf_warn_unused size_t
+maximal_binary_length_from_base64(const char *input, size_t length) noexcept { // char-input overload
+ return get_default_implementation()->maximal_binary_length_from_base64(
+ input, length); // forwarded unchanged to the default implementation
+}
+
+simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_handling_options) noexcept { // char-input overload
+ return get_default_implementation()->base64_to_binary(
+ input, length, output, options, last_chunk_handling_options); // all five arguments forwarded unchanged
+}
+
+simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) noexcept { // char16_t-input overload
+ return get_default_implementation()->maximal_binary_length_from_base64(
+ input, length); // forwarded unchanged to the default implementation
+}
+
+simdutf_warn_unused result base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_handling_options) noexcept { // char16_t-input overload
+ return get_default_implementation()->base64_to_binary(
+ input, length, output, options, last_chunk_handling_options); // all five arguments forwarded unchanged
+}
+
+template <typename chartype> /*
+ * Decodes `length` base64 characters from `input` into `output`, where
+ * `outlen` carries the capacity of `output` on entry. `outlen` is updated on
+ * every path except the two early `return r;` error exits. If the buffer is
+ * at least maximal_binary_length_from_base64() bytes, the whole input is
+ * decoded in one call. Otherwise a prefix sized to fit the buffer is decoded
+ * first and the remainder goes through
+ * scalar::base64::base64_tail_decode_safe.
+ */
+simdutf_warn_unused result base64_to_binary_safe_impl(
+ const chartype *input, size_t length, char *output, size_t &outlen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_handling_options) noexcept {
+ static_assert(std::is_same<chartype, char>::value ||
+ std::is_same<chartype, char16_t>::value,
+ "Only char and char16_t are supported.");
+ // The implementation could be nicer, but we expect that most times, the user
+ // will provide us with a buffer that is large enough.
+ size_t max_length = maximal_binary_length_from_base64(input, length);
+ if (outlen >= max_length) {
+ // fast path
+ full_result r = get_default_implementation()->base64_to_binary_details(
+ input, length, output, options, last_chunk_handling_options);
+ if (r.error != error_code::INVALID_BASE64_CHARACTER &&
+ r.error != error_code::BASE64_EXTRA_BITS) {
+ outlen = r.output_count; // report the bytes actually written
+ if (last_chunk_handling_options == stop_before_partial) {
+ if ((r.output_count % 3) != 0) {
+ bool empty_trail = true;
+ for (size_t i = r.input_count; i < length; i++) { // scan the unread tail
+ if (!scalar::base64::is_ascii_white_space_or_padding(input[i])) {
+ empty_trail = false;
+ break;
+ }
+ }
+ if (empty_trail) {
+ r.input_count = length; // only white space/padding left: count it as read
+ }
+ }
+ return {r.error, r.input_count};
+ }
+ return {r.error, length};
+ }
+ return r; // invalid character or extra bits: outlen is left untouched
+ }
+ // The output buffer is maybe too small. We will decode a truncated version of
+ // the input.
+ size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
+ size_t safe_input = base64_length_from_binary(outlen3, options);
+ full_result r = get_default_implementation()->base64_to_binary_details(
+ input, safe_input, output, options, loose); // first pass always uses `loose`
+ if (r.error == error_code::INVALID_BASE64_CHARACTER) {
+ return r; // outlen is left untouched
+ }
+ size_t offset =
+ (r.error == error_code::BASE64_INPUT_REMAINDER)
+ ? 1
+ : ((r.output_count % 3) == 0 ? 0 : (r.output_count % 3) + 1);
+ size_t output_index = r.output_count - (r.output_count % 3); // keep whole 3-byte groups only
+ size_t input_index = safe_input;
+ // offset is a value that is no larger than 3. We backtrack
+ // by up to offset characters + an undetermined number of
+ // white space characters. It is expected that the next loop
+ // runs at most 3 times + the number of white space characters
+ // in between them, so we are not worried about performance.
+ while (offset > 0 && input_index > 0) {
+ chartype c = input[--input_index];
+ if (scalar::base64::is_ascii_white_space(c)) {
+ // skipping
+ } else {
+ offset--;
+ }
+ }
+ size_t remaining_out = outlen - output_index; // capacity left for the tail decode
+ const chartype *tail_input = input + input_index;
+ size_t tail_length = length - input_index;
+ while (tail_length > 0 &&
+ scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
+ tail_length--;
+ }
+ size_t padding_characts = 0; // count of trailing '=' stripped below (0, 1 or 2)
+ if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
+ tail_length--;
+ padding_characts++;
+ while (tail_length > 0 &&
+ scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
+ tail_length--;
+ }
+ if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
+ tail_length--;
+ padding_characts++;
+ }
+ }
+ // this will advance tail_input and tail_length
+ result rr = scalar::base64::base64_tail_decode_safe(
+ output + output_index, remaining_out, tail_input, tail_length,
+ padding_characts, options, last_chunk_handling_options);
+ outlen = output_index + remaining_out; // NOTE(review): assumes the tail decoder rewrote remaining_out to bytes written - confirm
+ if (last_chunk_handling_options != stop_before_partial &&
+ rr.error == error_code::SUCCESS && padding_characts > 0) {
+ // additional checks
+ if ((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) {
+ rr.error = error_code::INVALID_BASE64_CHARACTER;
+ }
+ }
+ if (rr.error == error_code::SUCCESS &&
+ last_chunk_handling_options == stop_before_partial) {
+ if (tail_input > input + input_index) {
+ rr.count = tail_input - input;
+ } else if (r.input_count > 0) {
+ rr.count = r.input_count + rr.count;
+ }
+ return rr;
+ }
+ rr.count += input_index; // presumably rebases the tail-relative count onto `input`
+ return rr;
+}
+
+// Latin-1 -> UTF-8 conversion bounded by an output capacity of `utf8_len`
+// bytes. Returns the number of bytes written to `utf8_output`.
+simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
+    const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept {
+  char *const out_begin = utf8_output;
+
+  // convert_latin1_to_utf8 will never write more than input length * 2, so a
+  // chunk of at most half the remaining capacity is always safe. Chunks of 16
+  // bytes or fewer are left to the bounded scalar routine below.
+  for (size_t chunk = std::min(len, utf8_len >> 1); chunk > 16;
+       chunk = std::min(len, utf8_len >> 1)) {
+    const size_t written =
+        simdutf::convert_latin1_to_utf8(buf, chunk, utf8_output);
+
+    buf += chunk;
+    len -= chunk;
+    utf8_output += written;
+    utf8_len -= written;
+  }
+
+  const auto tail_written =
+      scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len);
+  utf8_output += tail_written;
+
+  return utf8_output - out_begin;
+}
+
+// char entry point: instantiates the shared template for `char`.
+simdutf_warn_unused result base64_to_binary_safe(
+    const char *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept {
+  result decoded = base64_to_binary_safe_impl<char>(
+      input, length, output, outlen, options, last_chunk_handling_options);
+  return decoded;
+}
+// char16_t entry point: instantiates the shared template for `char16_t`.
+simdutf_warn_unused result base64_to_binary_safe(
+    const char16_t *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept {
+  result decoded = base64_to_binary_safe_impl<char16_t>(
+      input, length, output, outlen, options, last_chunk_handling_options);
+  return decoded;
+}
+
+// Delegates to base64_length_from_binary of the current default
+// implementation.
+simdutf_warn_unused size_t
+base64_length_from_binary(size_t length, base64_options options) noexcept {
+  const size_t encoded_length =
+      get_default_implementation()->base64_length_from_binary(length, options);
+  return encoded_length;
+}
+
+// Delegates to binary_to_base64 of the current default implementation.
+size_t binary_to_base64(const char *input, size_t length, char *output,
+                        base64_options options) noexcept {
+  const size_t written = get_default_implementation()->binary_to_base64(
+      input, length, output, options);
+  return written;
+}
+
+// Delegates to autodetect_encoding of the current default implementation.
+simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(const char *buf, size_t length) noexcept {
+  const simdutf::encoding_type detected =
+      get_default_implementation()->autodetect_encoding(buf, length);
+  return detected;
+}
+// Delegates to detect_encodings of the current default implementation.
+simdutf_warn_unused int detect_encodings(const char *buf,
+                                         size_t length) noexcept {
+  const int encodings =
+      get_default_implementation()->detect_encodings(buf, length);
+  return encodings;
+}
+// Returns the implementation registered under the name
+// SIMDUTF_BUILTIN_IMPLEMENTATION. The lookup runs once; later calls reuse the
+// cached pointer.
+const implementation *builtin_implementation() {
+  static const implementation *const cached =
+      get_available_implementations()[SIMDUTF_STRINGIFY(
+          SIMDUTF_BUILTIN_IMPLEMENTATION)];
+  return cached;
+}
+
+// Thin public wrapper over scalar::utf8::trim_partial_utf8.
+simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
+  const size_t trimmed = scalar::utf8::trim_partial_utf8(input, length);
+  return trimmed;
+}
+
+// Thin public wrapper over scalar::utf16::trim_partial_utf16<BIG>.
+simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
+                                                size_t length) {
+  const size_t trimmed = scalar::utf16::trim_partial_utf16<BIG>(input, length);
+  return trimmed;
+}
+
+// Thin public wrapper over scalar::utf16::trim_partial_utf16<LITTLE>.
+simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
+                                                size_t length) {
+  const size_t trimmed =
+      scalar::utf16::trim_partial_utf16<LITTLE>(input, length);
+  return trimmed;
+}
+
+// Native-endian variant: picks the BE or LE routine at compile time from
+// SIMDUTF_IS_BIG_ENDIAN.
+simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
+                                              size_t length) {
+#if !SIMDUTF_IS_BIG_ENDIAN
+  return trim_partial_utf16le(input, length);
+#else
+  return trim_partial_utf16be(input, length);
+#endif
+}
+
+} // namespace simdutf
diff --git a/contrib/simdutf/src/lasx/implementation.cpp b/contrib/simdutf/src/lasx/implementation.cpp
new file mode 100644
index 000000000..1bffc20ab
--- /dev/null
+++ b/contrib/simdutf/src/lasx/implementation.cpp
@@ -0,0 +1,1298 @@
+#include "simdutf/lasx/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_LASX_H
+ #error "lasx.h must be included"
+#endif
+using namespace simd;
+
+// Converts an 8-bit mask produced by vmskltz/vmskgez/vmsknz into an index for
+// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes.
+const uint8_t lasx_1_2_utf8_bytes_mask[] = { // 256 entries, one per mask value
+ 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84,
+ 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83,
+ 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88,
+ 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79,
+ 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100,
+ 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99,
+ 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104,
+ 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63,
+ 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
+ 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
+ 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
+ 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
+ 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
+ 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
+ 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
+ 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+ 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
+ 255}; // entry i interleaves the nibbles of i: bits 0..3 -> bits 0,2,4,6 and bits 4..7 -> bits 1,3,5,7
+
+// Byte shuffle of a 128-bit vector with selector 0xB1 (binary 10 11 00 01).
+simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
+  const __m128i swapped = __lsx_vshuf4i_b(vec, 0xB1);
+  return swapped;
+}
+// Byte shuffle of a 256-bit vector with selector 0xB1 (binary 10 11 00 01).
+simdutf_really_inline __m256i lasx_swap_bytes(__m256i vec) {
+  const __m256i swapped = __lasx_xvshuf4i_b(vec, 0xB1);
+  return swapped;
+}
+
+// Free-function form of simd8x64<uint8_t>::is_ascii().
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  const bool all_ascii = input.is_ascii();
+  return all_ascii;
+}
+
+// Flags the positions that must hold a continuation byte, given the input
+// shifted back by one, two and three bytes.
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  simd8<bool> after_lead_of_2 = prev1 >= uint8_t(0xC0u); // 0b11000000
+  simd8<bool> after_lead_of_3 = prev2 >= uint8_t(0xE0u); // 0b11100000
+  simd8<bool> after_lead_of_4 = prev3 >= uint8_t(0xF0u); // 0b11110000
+  // XOR rather than OR: it is commutative and the caller combines with ^ too.
+  // That is sufficient because errors only need reporting when 0 or 1 lead
+  // bytes are involved. Several lead bytes mean two overlapping multibyte
+  // characters, and then at least one lead byte belongs to exactly one other
+  // multibyte character, which is where the error gets detected.
+  return after_lead_of_2 ^ after_lead_of_3 ^ after_lead_of_4;
+}
+
+// Same test as must_be_continuation, restricted to the second and third
+// byte after a lead byte.
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                         const simd8<uint8_t> prev3) {
+  simd8<bool> after_lead_of_3 = prev2 >= uint8_t(0xE0u); // 0b11100000
+  simd8<bool> after_lead_of_4 = prev3 >= uint8_t(0xF0u); // 0b11110000
+  return after_lead_of_3 ^ after_lead_of_4;
+}
+
+// common functions for utf8 conversions
+simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) { /*
+ * Turns four consecutive 3-byte UTF-8 sequences (input bytes 0..11) into
+ * 16-bit code units held in the four low 16-bit lanes of the result.
+ * NOTE(review): the four high lanes carry leftovers from the lead-byte half
+ * and are presumably ignored by callers - confirm.
+ */
+ // Low half contains 10bbbbbb|10cccccc
+ // High half contains 1110aaaa|1110aaaa
+ const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9}; // low 8: (third, second) byte of each sequence; high 8: each lead byte twice
+ const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};
+
+ __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
+ // 1110aaaa => aaaa0000
+ __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4); // bring the lead bytes down to the low half, then shift each byte left by 4
+ // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
+ __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
+ perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
+ // 0010bbbb bbcccccc => aaaabbbb bbcccccc
+ composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff); // low 12 bits from composed, top 4 bits from perm_high
+
+ return composed;
+}
+
+simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) { // each 16-bit lane holds one 2-byte sequence; bit patterns below are written high byte first
+ // 10bbbbbb 110aaaaa => 00bbbbbb 000aaaaa
+ __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
+ // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
+ composed = __lsx_vbitsel_v(
+ __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
+ __lsx_vsrli_h(composed, 8), /* composed >> 8 => bbbbbb */
+ __lsx_vrepli_h(0x3f)); /* 0x003f */
+ return composed;
+}
+
+simdutf_really_inline __m128i
+convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { // shufutf8_idx selects a row of tables::utf8_to_utf16::shufutf8
+ // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
+ // This is a relatively easy scenario
+ // we process SIX (6) input code units. The max length in bytes of six
+ // code units spanning between 1 and 2 bytes each is 12 bytes.
+ __m128i sh =
+ __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
+ 0);
+ // Shuffle
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 110aaaaa 10bbbbbb
+ __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000000 00bbbbbb
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
+ // 1 byte: 00000000 00000000
+ // 2 byte: 00000aaa aa000000
+ __m128i v1f00 = __lsx_vldi(-2785); // -2785 as a 13-bit immediate is 0x151f; going by the variable name it yields 0x1f00 per 16-bit lane
+ __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
+ // Combine the two parts with a 16-bit add
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000aaa aabbbbbb
+ composed = __lsx_vadd_h(ascii, composed);
+ return composed;
+}
+
+#include "lasx/lasx_validate_utf16.cpp"
+#include "lasx/lasx_validate_utf32le.cpp"
+
+#include "lasx/lasx_convert_latin1_to_utf8.cpp"
+#include "lasx/lasx_convert_latin1_to_utf16.cpp"
+#include "lasx/lasx_convert_latin1_to_utf32.cpp"
+
+#include "lasx/lasx_convert_utf8_to_utf16.cpp"
+#include "lasx/lasx_convert_utf8_to_utf32.cpp"
+#include "lasx/lasx_convert_utf8_to_latin1.cpp"
+
+#include "lasx/lasx_convert_utf16_to_latin1.cpp"
+#include "lasx/lasx_convert_utf16_to_utf8.cpp"
+#include "lasx/lasx_convert_utf16_to_utf32.cpp"
+
+#include "lasx/lasx_convert_utf32_to_latin1.cpp"
+#include "lasx/lasx_convert_utf32_to_utf8.cpp"
+#include "lasx/lasx_convert_utf32_to_utf16.cpp"
+#include "lasx/lasx_base64.cpp"
+
+} // namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+
+// transcoding from UTF-8 to Latin 1
+#include "generic/utf8_to_latin1/utf8_to_latin1.h"
+#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+
+#include "scalar/utf32_to_utf16/valid_utf32_to_utf16.h"
+#include "scalar/utf32_to_utf16/utf32_to_utf16.h"
+
+// other functions
+#include "generic/utf8.h"
+#include "generic/utf16.h"
+#include "scalar/latin1.h"
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+// Returns a bit set of encoding_type values that `input` validates as. When a
+// BOM is present it is trusted and its encoding is returned on its own.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // todo: reimplement as a one-pass algorithm.
+  auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom;
+  }
+
+  int candidates = 0;
+  if (validate_utf8(input, length)) {
+    candidates |= encoding_type::UTF8;
+  }
+  // UTF-16 is only considered for inputs made of whole 2-byte units.
+  if ((length % 2) == 0 &&
+      validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                       length / 2)) {
+    candidates |= encoding_type::UTF16_LE;
+  }
+  // UTF-32 is only considered for inputs made of whole 4-byte units.
+  if ((length % 4) == 0 &&
+      validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+    candidates |= encoding_type::UTF32_LE;
+  }
+  return candidates;
+}
+
+// UTF-8 validation via the generic lookup-based validator.
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  const bool valid = lasx::utf8_validation::generic_validate_utf8(buf, len);
+  return valid;
+}
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return lasx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+}
+
+// ASCII validation via the generic validator.
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  const bool valid = lasx::utf8_validation::generic_validate_ascii(buf, len);
+  return valid;
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return lasx::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+}
+
+// Validates UTF-16LE: the vector routine handles the bulk and returns where
+// it stopped (or nullptr on failure); the scalar validator checks the rest.
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  // Empty input is valid; this also keeps nullptr away from the vector code.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char16_t *rest = lasx_validate_utf16<endianness::LITTLE>(buf, len);
+  if (!rest) {
+    return false;
+  }
+  return scalar::utf16::validate<endianness::LITTLE>(rest,
+                                                     len - (rest - buf));
+}
+
+// Validates UTF-16BE: the vector routine handles the bulk and returns where
+// it stopped (or nullptr on failure); the scalar validator checks the rest.
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  // Empty input is valid; this also keeps nullptr away from the vector code.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char16_t *rest = lasx_validate_utf16<endianness::BIG>(buf, len);
+  if (!rest) {
+    return false;
+  }
+  return scalar::utf16::validate<endianness::BIG>(rest, len - (rest - buf));
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = lasx_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = lasx_validate_utf16_with_errors<endianness::BIG>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+// Validates UTF-32: vector routine first, scalar validator for what is left.
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  // Empty input is valid; this also keeps nullptr away from the vector code.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char32_t *rest = lasx_validate_utf32le(buf, len);
+  if (!rest) {
+    return false;
+  }
+  return scalar::utf32::validate(rest, len - (rest - buf));
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = lasx_validate_utf32le_with_errors(buf, len);
+ if (res.count != len) {
+ result scalar_res =
+ scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+// Latin-1 -> UTF-8. Returns the number of bytes written: vector stage plus
+// the scalar tail, if any input was left over.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char *, char *> simd_end =
+      lasx_convert_latin1_to_utf8(buf, len, utf8_output);
+  const char *in_pos = simd_end.first;
+  char *out_pos = simd_end.second;
+
+  size_t written = out_pos - utf8_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  written +=
+      scalar::latin1_to_utf8::convert(in_pos, len - (in_pos - buf), out_pos);
+  return written;
+}
+
+// Latin-1 -> UTF-16LE. Returns the number of 16-bit units written: vector
+// stage plus the scalar tail, if any input was left over.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> simd_end =
+      lasx_convert_latin1_to_utf16le(buf, len, utf16_output);
+  const char *in_pos = simd_end.first;
+  char16_t *out_pos = simd_end.second;
+
+  size_t written = out_pos - utf16_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  written += scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return written;
+}
+
+// Latin-1 -> UTF-16BE. Returns the number of 16-bit units written: vector
+// stage plus the scalar tail, if any input was left over.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> simd_end =
+      lasx_convert_latin1_to_utf16be(buf, len, utf16_output);
+  const char *in_pos = simd_end.first;
+  char16_t *out_pos = simd_end.second;
+
+  size_t written = out_pos - utf16_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  written += scalar::latin1_to_utf16::convert<endianness::BIG>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return written;
+}
+
+// Latin-1 -> UTF-32. Returns the number of 32-bit units written: vector stage
+// plus the scalar tail, if any input was left over.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> simd_end =
+      lasx_convert_latin1_to_utf32(buf, len, utf32_output);
+  const char *in_pos = simd_end.first;
+  char32_t *out_pos = simd_end.second;
+
+  size_t written = out_pos - utf32_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  written +=
+      scalar::latin1_to_utf32::convert(in_pos, len - (in_pos - buf), out_pos);
+  return written;
+}
+
+// Converts UTF-8 to Latin-1 while validating the input.
+//
+// buf/len        UTF-8 input.
+// latin1_output  destination buffer.
+// Returns the number of Latin-1 bytes written. 0 is returned as soon as the
+// scalar prologue meets a truncated, malformed, overlong or non-Latin-1
+// sequence, and also when the vectorized transcoder returns 0 for the
+// remainder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  size_t pos = 0;
+  char *output_start{latin1_output};
+  // Performance degradation when memory address is not 32-byte aligned, so
+  // characters are converted one at a time until the output pointer is
+  // aligned. uintptr_t is the portable integer type for inspecting a pointer.
+  while ((reinterpret_cast<uintptr_t>(latin1_output) & 0x1F) && pos < len) {
+    if (buf[pos] & 0x80) {
+      // Non-ASCII: only a complete 2-byte sequence can map to Latin-1.
+      if (pos + 1 >= len)
+        return 0;
+      if ((buf[pos] & 0b11100000) == 0b11000000) {
+        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+          return 0;
+        uint32_t code_point =
+            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
+        // Below 0x80 is an overlong encoding; above 0xFF does not fit.
+        if (code_point < 0x80 || 0xFF < code_point) {
+          return 0;
+        }
+        *latin1_output++ = char(code_point);
+        pos += 2;
+      } else {
+        return 0;
+      }
+    } else {
+      *latin1_output++ = char(buf[pos]);
+      pos++;
+    }
+  }
+  size_t convert_size = latin1_output - output_start;
+  if (pos == len)
+    return convert_size;
+  // Hand the rest of the input to the vectorized validating transcoder.
+  utf8_to_latin1::validating_transcoder converter;
+  size_t convert_result =
+      converter.convert(buf + pos, len - pos, latin1_output);
+  return convert_result ? convert_size + convert_result : 0;
+}
+
+// Converts UTF-8 to Latin-1 and reports what went wrong on failure.
+//
+// buf/len        UTF-8 input.
+// latin1_output  destination buffer.
+// On success the result carries SUCCESS and the number of Latin-1 bytes
+// written. On failure it carries the error code and a position in `buf`.
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  size_t pos = 0;
+  char *output_start{latin1_output};
+  // Performance degradation when memory address is not 32-byte aligned, so
+  // characters are converted one at a time until the output pointer is
+  // aligned. uintptr_t is the portable integer type for inspecting a pointer.
+  while ((reinterpret_cast<uintptr_t>(latin1_output) & 0x1F) && pos < len) {
+    if (buf[pos] & 0x80) {
+      if ((buf[pos] & 0b11100000) == 0b11000000) {
+        // 2-byte lead: the only multibyte form that can reach Latin-1.
+        if (pos + 1 >= len)
+          return result(error_code::TOO_SHORT, pos);
+        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+          return result(error_code::TOO_SHORT, pos);
+        uint32_t code_point =
+            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
+        if (code_point < 0x80)
+          return result(error_code::OVERLONG, pos);
+        if (0xFF < code_point)
+          return result(error_code::TOO_LARGE, pos);
+        *latin1_output++ = char(code_point);
+        pos += 2;
+      } else if ((buf[pos] & 0b11110000) == 0b11100000) {
+        // 3-byte lead: reported as too large for Latin-1.
+        return result(error_code::TOO_LARGE, pos);
+      } else if ((buf[pos] & 0b11111000) == 0b11110000) {
+        // 4-byte lead: reported as too large for Latin-1.
+        return result(error_code::TOO_LARGE, pos);
+      } else {
+        // Stray continuation byte, otherwise an invalid header byte.
+        if ((buf[pos] & 0b11000000) == 0b10000000) {
+          return result(error_code::TOO_LONG, pos);
+        }
+        return result(error_code::HEADER_BITS, pos);
+      }
+    } else {
+      *latin1_output++ = char(buf[pos]);
+      pos++;
+    }
+  }
+  size_t convert_size = latin1_output - output_start;
+  if (pos == len)
+    return result(error_code::SUCCESS, convert_size);
+
+  // Hand the rest of the input to the vectorized validating transcoder.
+  utf8_to_latin1::validating_transcoder converter;
+  result res =
+      converter.convert_with_errors(buf + pos, len - pos, latin1_output);
+  // Error: rebase the count by the input consumed in the prologue.
+  // Success: add the bytes the prologue already wrote.
+  return res.error ? result(res.error, res.count + pos)
+                   : result(res.error, res.count + convert_size);
+}
+
+// Converts UTF-8 to Latin-1 without range-checking the code points.
+//
+// buf/len        UTF-8 input.
+// latin1_output  destination buffer.
+// Returns the number of Latin-1 bytes written. 0 is returned when the scalar
+// prologue meets a lead byte that is not a 2-byte lead or a missing
+// continuation byte, and also when the vectorized routine returns 0 for the
+// remainder.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  size_t pos = 0;
+  char *output_start{latin1_output};
+  // Performance degradation when memory address is not 32-byte aligned, so
+  // characters are converted one at a time until the output pointer is
+  // aligned. uintptr_t is the portable integer type for inspecting a pointer.
+  while ((reinterpret_cast<uintptr_t>(latin1_output) & 0x1F) && pos < len) {
+    if (buf[pos] & 0x80) {
+      // A non-ASCII byte in last position is left to the routine below.
+      if (pos + 1 >= len)
+        break;
+      if ((buf[pos] & 0b11100000) == 0b11000000) {
+        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+          return 0;
+        uint32_t code_point =
+            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
+        *latin1_output++ = char(code_point);
+        pos += 2;
+      } else {
+        return 0;
+      }
+    } else {
+      *latin1_output++ = char(buf[pos]);
+      pos++;
+    }
+  }
+  size_t convert_size = latin1_output - output_start;
+  if (pos == len)
+    return convert_size;
+
+  size_t convert_result =
+      lasx::utf8_to_latin1::convert_valid(buf + pos, len - pos, latin1_output);
+  return convert_result ? convert_size + convert_result : 0;
+}
+
+// UTF-8 -> UTF-16LE through the generic validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
+  return written;
+}
+
+// UTF-8 -> UTF-16BE through the generic validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::BIG>(buf, len, utf16_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+
+// UTF-8 -> UTF-16LE through the generic non-validating converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written = utf8_to_utf16::convert_valid<endianness::LITTLE>(
+      input, size, utf16_output);
+  return written;
+}
+
+// UTF-8 -> UTF-16BE through the generic non-validating converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+  return written;
+}
+
+// UTF-8 -> UTF-32 through the generic validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, utf32_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ utf8_to_utf32::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, utf32_output);
+}
+
+// UTF-8 -> UTF-32 through the generic non-validating converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  const size_t written =
+      utf8_to_utf32::convert_valid(input, size, utf32_output);
+  return written;
+}
+
+// UTF-16LE -> Latin-1. Returns the number of bytes written, or 0 when either
+// the vector stage (nullptr input position) or the scalar tail fails.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd_end =
+      lasx_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+  const char16_t *in_pos = simd_end.first;
+  char *out_pos = simd_end.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+
+  size_t written = out_pos - latin1_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+          in_pos, len - (in_pos - buf), out_pos);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return written + tail_written;
+}
+
+// UTF-16BE -> Latin-1. Returns the number of bytes written, or 0 when either
+// the vector stage (nullptr input position) or the scalar tail fails.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd_end =
+      lasx_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  const char16_t *in_pos = simd_end.first;
+  char *out_pos = simd_end.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+
+  size_t written = out_pos - latin1_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::BIG>(
+          in_pos, len - (in_pos - buf), out_pos);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return written + tail_written;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ lasx_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ lasx_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+ latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Currently reuses the validating UTF-16BE -> Latin-1 path.
+// Optimization opportunity: implement a custom function.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16be_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// Currently reuses the validating UTF-16LE -> Latin-1 path.
+// Optimization opportunity: implement a custom function.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16le_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// UTF-16LE -> UTF-8. Returns the number of bytes written, or 0 when either
+// the vector stage (nullptr input position) or the scalar tail fails.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> simd_end =
+      lasx_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  const char16_t *in_pos = simd_end.first;
+  char *out_pos = simd_end.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+
+  size_t written = out_pos - utf8_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+          in_pos, len - (in_pos - buf), out_pos);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return written + tail_written;
+}
+
+// UTF-16BE -> UTF-8. Returns the number of bytes written, or 0 when either
+// the vector stage (nullptr input position) or the scalar tail fails.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> simd_end =
+      lasx_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  const char16_t *in_pos = simd_end.first;
+  char *out_pos = simd_end.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+
+  size_t written = out_pos - utf8_output;
+  if (in_pos == buf + len) {
+    return written;
+  }
+  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(
+      in_pos, len - (in_pos - buf), out_pos);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ lasx_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
+ utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ lasx_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
+ utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Valid-input UTF-16LE -> UTF-8 entry point. This backend has no dedicated
+// fast path for it and simply forwards to convert_utf16le_to_utf8.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return convert_utf16le_to_utf8(buf, len, utf8_output);
+}
+
+// Valid-input UTF-16BE -> UTF-8 entry point. This backend has no dedicated
+// fast path for it and simply forwards to convert_utf16be_to_utf8.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return convert_utf16be_to_utf8(buf, len, utf8_output);
+}
+
+// Transcodes UTF-32 to UTF-8. Empty input short-circuits to 0. Otherwise the
+// LASX kernel converts as much as it can and the scalar routine finishes the
+// rest. Returns the number of UTF-8 bytes written, or 0 when either stage
+// reports failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
+  // first: next unread input unit; second: next free output byte.
+  const std::pair<const char32_t *, char *> cursor =
+      lasx_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - utf8_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  const size_t tail_written = scalar::utf32_to_utf8::convert(
+      cursor.first, len - (cursor.first - buf), cursor.second);
+  return tail_written == 0 ? 0 : vector_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ lasx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Transcodes UTF-16LE to UTF-32. The LASX kernel converts as much as it can
+// and the scalar routine finishes whatever input it left unread. Returns the
+// number of char32_t units written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // first: next unread input unit; second: next free output unit.
+  const std::pair<const char16_t *, char32_t *> cursor =
+      lasx_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - utf32_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          cursor.first, len - (cursor.first - buf), cursor.second);
+  return tail_written == 0 ? 0 : vector_written + tail_written;
+}
+
+// Transcodes UTF-16BE to UTF-32. The LASX kernel converts as much as it can
+// and the scalar routine finishes whatever input it left unread. Returns the
+// number of char32_t units written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // first: next unread input unit; second: next free output unit.
+  const std::pair<const char16_t *, char32_t *> cursor =
+      lasx_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - utf32_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
+      cursor.first, len - (cursor.first - buf), cursor.second);
+  return tail_written == 0 ? 0 : vector_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ lasx_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
+ utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ lasx_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
+ utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Transcodes UTF-32 to Latin-1. The LASX kernel converts as much as it can and
+// the scalar routine finishes whatever input it left unread. Returns the number
+// of Latin-1 bytes written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // first: next unread input unit; second: next free output byte.
+  const std::pair<const char32_t *, char *> cursor =
+      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - latin1_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  const size_t tail_written = scalar::utf32_to_latin1::convert(
+      cursor.first, len - (cursor.first - buf), cursor.second);
+  return tail_written == 0 ? 0 : vector_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ lasx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Valid-input UTF-32 -> Latin-1. Runs the same LASX kernel as the checked
+// variant, then finishes with the scalar convert_valid routine, whose result
+// is added unconditionally (a zero tail count is not treated as failure).
+// Returns 0 only when the kernel hands back a null input cursor.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // first: next unread input unit; second: next free output byte.
+  const std::pair<const char32_t *, char *> cursor =
+      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - latin1_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  return vector_written +
+         scalar::utf32_to_latin1::convert_valid(
+             cursor.first, len - (cursor.first - buf), cursor.second);
+}
+
+// Valid-input UTF-32 -> UTF-8 entry point; currently just forwards to
+// convert_utf32_to_utf8.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ // optimization opportunity: implement a custom function.
+ return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+// Transcodes UTF-32 to UTF-16LE. The LASX kernel converts as much as it can
+// and the scalar routine finishes whatever input it left unread. Returns the
+// number of char16_t units written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // first: next unread input unit; second: next free output unit.
+  const std::pair<const char32_t *, char16_t *> cursor =
+      lasx_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - utf16_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+          cursor.first, len - (cursor.first - buf), cursor.second);
+  return tail_written == 0 ? 0 : vector_written + tail_written;
+}
+
+// Transcodes UTF-32 to UTF-16BE. The LASX kernel converts as much as it can
+// and the scalar routine finishes whatever input it left unread. Returns the
+// number of char16_t units written, or 0 when either stage reports failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // first: next unread input unit; second: next free output unit.
+  const std::pair<const char32_t *, char16_t *> cursor =
+      lasx_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (cursor.first == nullptr) {
+    return 0;
+  }
+  const size_t vector_written = cursor.second - utf16_output;
+  if (cursor.first == buf + len) {
+    return vector_written; // the kernel consumed the whole input
+  }
+  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(
+      cursor.first, len - (cursor.first - buf), cursor.second);
+  return tail_written == 0 ? 0 : vector_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ lasx_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
+ utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ lasx_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+ utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Valid-input UTF-32 -> UTF-16LE entry point; forwards to
+// convert_utf32_to_utf16le.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
+
+// Valid-input UTF-32 -> UTF-16BE entry point; forwards to
+// convert_utf32_to_utf16be.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
+
+// Valid-input UTF-16LE -> UTF-32 entry point; forwards to
+// convert_utf16le_to_utf32.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return convert_utf16le_to_utf32(buf, len, utf32_output);
+}
+
+// Valid-input UTF-16BE -> UTF-32 entry point; forwards to
+// convert_utf16be_to_utf32.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return convert_utf16be_to_utf32(buf, len, utf32_output);
+}
+
+// Forwards to utf16::change_endianness_utf16 with `length` code units read
+// from `input` and the result written to `output`.
+void implementation::change_endianness_utf16(const char16_t *input,
+ size_t length,
+ char16_t *output) const noexcept {
+ utf16::change_endianness_utf16(input, length, output);
+}
+
+// Returns the code point count of a UTF-16LE buffer, as computed by
+// utf16::count_code_points<endianness::LITTLE>.
+simdutf_warn_unused size_t implementation::count_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+// Returns the code point count of a UTF-16BE buffer, as computed by
+// utf16::count_code_points<endianness::BIG>.
+simdutf_warn_unused size_t implementation::count_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::count_code_points<endianness::BIG>(input, length);
+}
+
+// Counts the bytes of `input` that are not UTF-8 continuation bytes, i.e. the
+// bytes whose signed value is greater than -65 (0xBF). Runs in three stages:
+// a scalar prologue up to the first 32-byte aligned address, a 32-bytes-per-
+// iteration LASX loop, and a scalar epilogue for the remaining bytes.
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  size_t pos = 0;
+  size_t count = 0;
+  // Performance degradation when memory address is not 32-byte aligned
+  while ((((uint64_t)input + pos) & 0x1F && pos < length)) {
+    // Compare as a signed byte explicitly: with an unsigned plain char the
+    // promoted value is always >= 0, which would count continuation bytes
+    // (0x80..0xBF) as code points.
+    if (int8_t(input[pos++]) > -65) {
+      count++;
+    }
+  }
+  __m256i v_bf = __lasx_xvldi(0xBF); // 0b10111111
+  for (; pos + 32 <= length; pos += 32) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const int8_t *>(input + pos), 0);
+    // Signed per-byte test 0xBF < in, gathered into a per-128-bit-lane bit
+    // mask and popcounted; the two lane totals sit in words 0 and 4.
+    __m256i utf8_count =
+        __lasx_xvpcnt_h(__lasx_xvmskltz_b(__lasx_xvslt_b(v_bf, in)));
+    count = count + __lasx_xvpickve2gr_wu(utf8_count, 0) +
+            __lasx_xvpickve2gr_wu(utf8_count, 4);
+  }
+  return count + scalar::utf8::count_code_points(input + pos, length - pos);
+}
+
+// Latin-1 length of a UTF-8 buffer: one output byte per code point, so this
+// is simply count_utf8(buf, len).
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+ const char *buf, size_t len) const noexcept {
+ return count_utf8(buf, len);
+}
+
+// Latin-1 length for a UTF-16 input of `length` code units: returned unchanged
+// (one Latin-1 byte per UTF-16 unit).
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+ return length;
+}
+
+// Latin-1 length for a UTF-32 input of `length` code units: returned unchanged
+// (one Latin-1 byte per UTF-32 unit).
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+ return length;
+}
+
+// Computes how many UTF-8 bytes are needed to encode a Latin-1 buffer: one
+// byte per input byte plus one extra for every byte with the high bit set.
+// Full 16-byte chunks are handled with LSX; the remainder is delegated to
+// scalar::latin1::utf8_length_from_latin1.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  const uint8_t *data_end = data + length;
+  uint64_t utf8_bytes = 0;
+  // Test the remaining distance rather than forming `data + 16`: that pointer
+  // would lie beyond one-past-the-end whenever fewer than 16 bytes remain,
+  // which is undefined behaviour. A chunk of exactly 16 bytes is fully in
+  // bounds, so it is processed here as well.
+  while (data_end - data >= 16) {
+    __m128i input_vec = __lsx_vld(data, 0);
+    // Sign-bit mask of the 16 bytes, popcounted: the number of bytes >= 0x80.
+    const uint64_t two_bytes =
+        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
+    utf8_bytes += 16 + two_bytes;
+    data += 16;
+  }
+  return utf8_bytes + scalar::latin1::utf8_length_from_latin1(
+                          (const char *)data, data_end - data);
+}
+
+// UTF-8 length of a UTF-16LE buffer, as computed by
+// utf16::utf8_length_from_utf16<endianness::LITTLE>.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+// UTF-8 length of a UTF-16BE buffer, as computed by
+// utf16::utf8_length_from_utf16<endianness::BIG>.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+// UTF-16 length for a Latin-1 input of `length` bytes: returned unchanged
+// (one UTF-16 unit per Latin-1 byte).
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+ return length;
+}
+
+// UTF-32 length for a Latin-1 input of `length` bytes: returned unchanged
+// (one UTF-32 unit per Latin-1 byte).
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+ return length;
+}
+
+// UTF-32 length of a UTF-16LE buffer, as computed by
+// utf16::utf32_length_from_utf16<endianness::LITTLE>.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+// UTF-32 length of a UTF-16BE buffer, as computed by
+// utf16::utf32_length_from_utf16<endianness::BIG>.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+// UTF-16 length of a UTF-8 buffer, as computed by
+// utf8::utf16_length_from_utf8.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::utf16_length_from_utf8(input, length);
+}
+
+// Computes the number of UTF-8 bytes needed to encode a UTF-32 buffer.
+// Eight code points are processed per iteration: the block starts from the
+// maximum of 4 bytes each (32) and subtracts 3 for every value below 0x80,
+// 2 for every value in [0x80, 0x800) and 1 for every value in
+// [0x800, 0x10000). The remaining (< 8) code points go through
+// scalar::utf32::utf8_length_from_utf32.
+// NOTE(review): __lasx_xvslt_w looks like a signed compare, so inputs with
+// the top bit set would be classed as one-byte here; that can only happen
+// for invalid UTF-32 - confirm whether it matters to callers.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ __m256i v_80 = __lasx_xvrepli_w(0x80); /*0x00000080*/
+ __m256i v_800 = __lasx_xvldi(-3832); /*0x00000800*/
+ __m256i v_10000 = __lasx_xvldi(-3583); /*0x00010000*/
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 8 <= length; pos += 8) {
+ __m256i in =
+ __lasx_xvld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+ // Cumulative "less than" masks; XOR-ing neighbours isolates each length
+ // class.
+ __m256i ascii_bytes_bytemask = __lasx_xvslt_w(in, v_80);
+ __m256i one_two_bytes_bytemask = __lasx_xvslt_w(in, v_800);
+ __m256i two_bytes_bytemask =
+ __lasx_xvxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask);
+ __m256i three_bytes_bytemask =
+ __lasx_xvxor_v(__lasx_xvslt_w(in, v_10000), one_two_bytes_bytemask);
+
+ // For each class: collapse the mask to bits, popcount, and add the totals
+ // read from words 0 and 4 (one per 128-bit half).
+ __m256i ascii_bytes =
+ __lasx_xvpcnt_w(__lasx_xvmskltz_w(ascii_bytes_bytemask));
+ const uint32_t ascii_bytes_count = __lasx_xvpickve2gr_wu(ascii_bytes, 0) +
+ __lasx_xvpickve2gr_wu(ascii_bytes, 4);
+ __m256i two_bytes = __lasx_xvpcnt_w(__lasx_xvmskltz_w(two_bytes_bytemask));
+ const uint32_t two_bytes_count = __lasx_xvpickve2gr_wu(two_bytes, 0) +
+ __lasx_xvpickve2gr_wu(two_bytes, 4);
+ __m256i three_bytes =
+ __lasx_xvpcnt_w(__lasx_xvmskltz_w(three_bytes_bytemask));
+ const uint32_t three_bytes_count = __lasx_xvpickve2gr_wu(three_bytes, 0) +
+ __lasx_xvpickve2gr_wu(three_bytes, 4);
+
+ // 32 = 8 code points * 4 bytes, minus the savings of the shorter classes.
+ count +=
+ 32 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count;
+ }
+ return count +
+ scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+// Computes the number of UTF-16 code units needed to encode a UTF-32 buffer.
+// Four code points are processed per iteration: each contributes one unit,
+// plus one more for every value above 0xFFFF (unsigned compare). The
+// remaining (< 4) code points go through
+// scalar::utf32::utf16_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 4 <= length; pos += 4) {
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+ __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in);
+ // Collapse the per-word mask to bits and popcount it.
+ size_t surrogate_count = __lsx_vpickve2gr_bu(
+ __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0);
+ count += 4 + surrogate_count;
+ }
+ return count +
+ scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+// UTF-32 length of a UTF-8 buffer: the code point count returned by
+// utf8::count_code_points.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length);
+}
+
+// 8-bit input overload: forwards to
+// scalar::base64::maximal_binary_length_from_base64.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// Decodes 8-bit base64 text into `output`. The base64_url bit of `options`
+// picks which compress_decode_base64 instantiation runs; `options` and
+// `last_chunk_options` are passed through untouched.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes 8-bit base64 text into `output`, returning the full_result. The
+// base64_url bit of `options` picks which compress_decode_base64
+// instantiation runs; `options` and `last_chunk_options` are passed through
+// untouched.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// 16-bit input overload: forwards to
+// scalar::base64::maximal_binary_length_from_base64.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// Decodes 16-bit base64 text into `output`. The base64_url bit of `options`
+// picks which compress_decode_base64 instantiation runs; `options` and
+// `last_chunk_options` are passed through untouched.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes 16-bit base64 text into `output`, returning the full_result. The
+// base64_url bit of `options` picks which compress_decode_base64
+// instantiation runs; `options` and `last_chunk_options` are passed through
+// untouched.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Forwards to scalar::base64::base64_length_from_binary with the same
+// `length` and `options`.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options);
+}
+
+// Encodes `length` bytes from `input` as base64 into `output` and returns the
+// value produced by encode_base64. The base64_url bit of `options` selects
+// the template instantiation.
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  return (options & base64_url)
+             ? encode_base64<true>(output, input, length, options)
+             : encode_base64<false>(output, input, length, options);
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/lasx/end.h"
diff --git a/contrib/simdutf/src/lasx/lasx_base64.cpp b/contrib/simdutf/src/lasx/lasx_base64.cpp
new file mode 100644
index 000000000..33515f2f8
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_base64.cpp
@@ -0,0 +1,596 @@
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+// Encodes `srclen` bytes from `src` as base64 text into `dst` and returns the
+// number of characters written. `isbase64url` selects the alphabet ("-_" vs
+// "+/" for the last two symbols). Input is consumed by a 96-byte unrolled
+// vector loop, then a 24-byte vector loop; whatever is left is handed to
+// scalar::base64::tail_encode_base64 together with `options`.
+template <bool isbase64url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+ base64_options options) {
+ // credit: Wojciech Muła
+ // SSE (lookup: pshufb improved unrolled)
+ const uint8_t *input = (const uint8_t *)src;
+ static const char *lookup_tbl =
+ isbase64url
+ ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+ : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+ uint8_t *out = (uint8_t *)dst;
+
+ v32u8 shuf;
+ __m256i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
+ base64_tbl2, base64_tbl3;
+ // The constants are only set up when at least one vector iteration can run
+ // (28 is the threshold of the smaller loop below); neither loop executes
+ // otherwise, so they are never read uninitialized.
+ if (srclen >= 28) {
+ shuf = v32u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
+ 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};
+
+ v_fc0fc00 = __lasx_xvreplgr2vr_w(uint32_t(0x0fc0fc00));
+ v_3f03f0 = __lasx_xvreplgr2vr_w(uint32_t(0x003f03f0));
+ shift_r = __lasx_xvreplgr2vr_w(uint32_t(0x0006000a));
+ shift_l = __lasx_xvreplgr2vr_w(uint32_t(0x00080004));
+ // The 64-character alphabet split into four 16-byte table registers.
+ base64_tbl0 = ____m256i(__lsx_vld(lookup_tbl, 0));
+ base64_tbl1 = ____m256i(__lsx_vld(lookup_tbl, 16));
+ base64_tbl2 = ____m256i(__lsx_vld(lookup_tbl, 32));
+ base64_tbl3 = ____m256i(__lsx_vld(lookup_tbl, 48));
+ }
+ size_t i = 0;
+ // Unrolled loop: 96 input bytes -> 128 output characters per iteration.
+ // Loads are 16 bytes wide at 12-byte strides, so the last one (offset 84)
+ // touches input[i + 99]; hence the `i + 100 <= srclen` bound.
+ for (; i + 100 <= srclen; i += 96) {
+ __m128i in0_lo =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
+ __m128i in0_hi =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
+ __m128i in1_lo =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
+ __m128i in1_hi =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);
+ __m128i in2_lo =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 4);
+ __m128i in2_hi =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 5);
+ __m128i in3_lo =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 6);
+ __m128i in3_hi =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 7);
+
+ __m256i in0 = lasx_set_q(in0_hi, in0_lo);
+ __m256i in1 = lasx_set_q(in1_hi, in1_lo);
+ __m256i in2 = lasx_set_q(in2_hi, in2_lo);
+ __m256i in3 = lasx_set_q(in3_hi, in3_lo);
+
+ in0 = __lasx_xvshuf_b(in0, in0, (__m256i)shuf);
+ in1 = __lasx_xvshuf_b(in1, in1, (__m256i)shuf);
+ in2 = __lasx_xvshuf_b(in2, in2, (__m256i)shuf);
+ in3 = __lasx_xvshuf_b(in3, in3, (__m256i)shuf);
+
+ __m256i t0_0 = __lasx_xvand_v(in0, v_fc0fc00);
+ __m256i t0_1 = __lasx_xvand_v(in1, v_fc0fc00);
+ __m256i t0_2 = __lasx_xvand_v(in2, v_fc0fc00);
+ __m256i t0_3 = __lasx_xvand_v(in3, v_fc0fc00);
+
+ __m256i t1_0 = __lasx_xvsrl_h(t0_0, shift_r);
+ __m256i t1_1 = __lasx_xvsrl_h(t0_1, shift_r);
+ __m256i t1_2 = __lasx_xvsrl_h(t0_2, shift_r);
+ __m256i t1_3 = __lasx_xvsrl_h(t0_3, shift_r);
+
+ __m256i t2_0 = __lasx_xvand_v(in0, v_3f03f0);
+ __m256i t2_1 = __lasx_xvand_v(in1, v_3f03f0);
+ __m256i t2_2 = __lasx_xvand_v(in2, v_3f03f0);
+ __m256i t2_3 = __lasx_xvand_v(in3, v_3f03f0);
+
+ __m256i t3_0 = __lasx_xvsll_h(t2_0, shift_l);
+ __m256i t3_1 = __lasx_xvsll_h(t2_1, shift_l);
+ __m256i t3_2 = __lasx_xvsll_h(t2_2, shift_l);
+ __m256i t3_3 = __lasx_xvsll_h(t2_3, shift_l);
+
+ // Each of the four lookups below follows the same pattern: indices 0..31
+ // are resolved against tables 0/1, indices 32..63 (after subtracting 32)
+ // against tables 2/3, and the `<= 31` mask selects between the two.
+ __m256i input0 = __lasx_xvor_v(t1_0, t3_0);
+ __m256i input0_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input0);
+ __m256i input0_shuf1 = __lasx_xvshuf_b(
+ base64_tbl3, base64_tbl2, __lasx_xvsub_b(input0, __lasx_xvldi(32)));
+ __m256i input0_mask = __lasx_xvslei_bu(input0, 31);
+ __m256i input0_result =
+ __lasx_xvbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
+ __lasx_xvst(input0_result, reinterpret_cast<__m256i *>(out), 0);
+ out += 32;
+
+ __m256i input1 = __lasx_xvor_v(t1_1, t3_1);
+ __m256i input1_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input1);
+ __m256i input1_shuf1 = __lasx_xvshuf_b(
+ base64_tbl3, base64_tbl2, __lasx_xvsub_b(input1, __lasx_xvldi(32)));
+ __m256i input1_mask = __lasx_xvslei_bu(input1, 31);
+ __m256i input1_result =
+ __lasx_xvbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
+ __lasx_xvst(input1_result, reinterpret_cast<__m256i *>(out), 0);
+ out += 32;
+
+ __m256i input2 = __lasx_xvor_v(t1_2, t3_2);
+ __m256i input2_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input2);
+ __m256i input2_shuf1 = __lasx_xvshuf_b(
+ base64_tbl3, base64_tbl2, __lasx_xvsub_b(input2, __lasx_xvldi(32)));
+ __m256i input2_mask = __lasx_xvslei_bu(input2, 31);
+ __m256i input2_result =
+ __lasx_xvbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
+ __lasx_xvst(input2_result, reinterpret_cast<__m256i *>(out), 0);
+ out += 32;
+
+ __m256i input3 = __lasx_xvor_v(t1_3, t3_3);
+ __m256i input3_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input3);
+ __m256i input3_shuf1 = __lasx_xvshuf_b(
+ base64_tbl3, base64_tbl2, __lasx_xvsub_b(input3, __lasx_xvldi(32)));
+ __m256i input3_mask = __lasx_xvslei_bu(input3, 31);
+ __m256i input3_result =
+ __lasx_xvbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
+ __lasx_xvst(input3_result, reinterpret_cast<__m256i *>(out), 0);
+ out += 32;
+ }
+ // Single-vector loop: 24 input bytes -> 32 output characters. The second
+ // 16-byte load starts at offset 12 and touches input[i + 27]; hence the
+ // `i + 28 <= srclen` bound.
+ for (; i + 28 <= srclen; i += 24) {
+
+ __m128i in_lo = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);
+ __m128i in_hi =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
+
+ __m256i in = lasx_set_q(in_hi, in_lo);
+
+ // bytes from groups A, B and C are needed in separate 32-bit lanes
+ // in = [DDDD|CCCC|BBBB|AAAA]
+ //
+ // an input triplet has layout
+ // [????????|ccdddddd|bbbbcccc|aaaaaabb]
+ // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next
+ // triplet
+ //
+ // shuffling changes the order of bytes: 1, 0, 2, 1
+ // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+ // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+ // processed bits
+ in = __lasx_xvshuf_b(in, in, (__m256i)shuf);
+
+ // unpacking
+ // t0 = [0000cccc|cc000000|aaaaaa00|00000000]
+ __m256i t0 = __lasx_xvand_v(in, v_fc0fc00);
+ // t1 = [00000000|00cccccc|00000000|00aaaaaa]
+ // ((c >> 6), (a >> 10))
+ __m256i t1 = __lasx_xvsrl_h(t0, shift_r);
+
+ // t2 = [00000000|00dddddd|000000bb|bbbb0000]
+ __m256i t2 = __lasx_xvand_v(in, v_3f03f0);
+ // t3 = [00dddddd|00000000|00bbbbbb|00000000]
+ // ((d << 8), (b << 4))
+ __m256i t3 = __lasx_xvsll_h(t2, shift_l);
+
+ // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+ __m256i indices = __lasx_xvor_v(t1, t3);
+
+ __m256i indices_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, indices);
+ __m256i indices_shuf1 = __lasx_xvshuf_b(
+ base64_tbl3, base64_tbl2, __lasx_xvsub_b(indices, __lasx_xvldi(32)));
+ __m256i indices_mask = __lasx_xvslei_bu(indices, 31);
+ __m256i indices_result =
+ __lasx_xvbitsel_v(indices_shuf1, indices_shuf0, indices_mask);
+ __lasx_xvst(indices_result, reinterpret_cast<__m256i *>(out), 0);
+ out += 32;
+ }
+
+ // `i` is a multiple of 3 here, so i / 3 * 4 is the number of characters the
+ // vector loops emitted; the scalar tail adds its own count.
+ return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+ srclen - i, options);
+}
+
+// Compacts the 16 bytes of `data` according to the 16-bit `mask` and stores a
+// full 16-byte vector at `output` (on both paths, so the destination must
+// have 16 writable bytes). A zero mask stores `data` unchanged.
+// NOTE(review): presumably a set bit in `mask` marks a byte to be dropped -
+// confirm against the thintable_epi8 table and the callers.
+static inline void compress(__m128i data, uint16_t mask, char *output) {
+ if (mask == 0) {
+ __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
+ return;
+ }
+ // this particular implementation was inspired by work done by @animetosho
+ // we do it in two steps, first 8 bytes and then second 8 bytes
+ uint8_t mask1 = uint8_t(mask); // least significant 8 bits
+ uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+ // next line just loads the 64-bit values thintable_epi8[mask1] and
+ // thintable_epi8[mask2] into a 128-bit register, using only
+ // two instructions on most compilers.
+
+ v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
+ tables::base64::thintable_epi8[mask2]};
+
+ // we increment by 0x08 the second half of the mask
+ const v4u32 hi = {0, 0, 0x08080808, 0x08080808};
+ __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);
+
+ // this is the version "nearly pruned"
+ __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
+ // we still need to put the two halves together.
+ // we compute the popcount of the first half:
+ int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+ // then load the corresponding mask, what it does is to write
+ // only the first pop1 bytes from the first 8 bytes, and then
+ // it fills in with the bytes from the second 8 bytes + some filling
+ // at the end.
+ __m128i compactmask =
+ __lsx_vld(reinterpret_cast<const __m128i *>(
+ tables::base64::pshufb_combine_table + pop1 * 8),
+ 0);
+ __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
+
+ __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
+}
+
+// A 64-byte block held as two 256-bit LASX vectors.
+struct block64 {
+ __m256i chunks[2];
+};
+
+template <bool base64_url> // base64_url picks the URL-variant lookup tables below instead of the standard ones
+static inline uint32_t to_base64_mask(__m256i *src, bool *error) { // rewrites the 32 bytes of *src in place, returns one mask bit per byte flagged by the check below, and ORs into *error
+ __m256i ascii_space_tbl = // indexed by the low nibble of a byte; a byte matches when it equals its entry (0x20, 0x09, 0x0a, 0x0c, 0x0d)
+ ____m256i((__m128i)v16u8{0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0});
+ // credit: aqrit
+ __m256i delta_asso = // low-nibble table that feeds delta_hash
+ ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0xF, 0x0, 0xF});
+ __m256i delta_values; // looked up with delta_hash; the selected entry is added to the input byte to form the output byte
+ if (base64_url) {
+ delta_values = ____m256i(
+ (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
+ int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)});
+ } else {
+ delta_values = ____m256i(
+ (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+ int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)});
+ }
+
+ __m256i check_asso; // low-nibble table that feeds check_hash
+ if (base64_url) {
+ check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+ 0x0B, 0x06, 0x0B, 0x12});
+ } else {
+ check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+ 0x0B, 0x0B, 0x0B, 0x0F});
+ }
+
+ __m256i check_values; // looked up with check_hash; a byte is flagged when entry + byte (saturating) is negative
+ if (base64_url) {
+ check_values = ____m256i(
+ (__m128i)v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
+ int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
+ int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)});
+ } else {
+ check_values = ____m256i(
+ (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+ int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+ int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)});
+ }
+
+ __m256i shifted = __lasx_xvsrli_b(*src, 3); // every input byte shifted right by 3
+ __m256i asso_index = __lasx_xvand_v(*src, __lasx_xvldi(0xF)); // low nibble of every input byte
+ __m256i delta_hash = __lasx_xvavgr_bu( // unsigned rounding average of the delta_asso entry and the shifted byte
+ __lasx_xvshuf_b(delta_asso, delta_asso, asso_index), shifted);
+ __m256i check_hash = __lasx_xvavgr_bu( // same construction with check_asso
+ __lasx_xvshuf_b(check_asso, check_asso, asso_index), shifted);
+
+ __m256i out = __lasx_xvsadd_b( // translated bytes: input + delta_values[delta_hash], saturating
+ __lasx_xvshuf_b(delta_values, delta_values, delta_hash), *src);
+ __m256i chk = __lasx_xvsadd_b( // validity probe: input + check_values[check_hash], saturating
+ __lasx_xvshuf_b(check_values, check_values, check_hash), *src);
+ __m256i chk_ltz = __lasx_xvmskltz_b(chk); // bit mask of the bytes of chk that are negative, one mask per 128-bit half
+ unsigned int mask = __lasx_xvpickve2gr_wu(chk_ltz, 0); // mask bits for the low 128-bit half
+ mask = mask | (__lsx_vpickve2gr_hu(lasx_extracti128_hi(chk_ltz), 0) << 16); // mask bits for the high half go to bits 16..31
+ if (mask) { // something was flagged: check whether it is only the bytes listed in ascii_space_tbl
+ __m256i ascii_space = __lasx_xvseq_b(
+ __lasx_xvshuf_b(ascii_space_tbl, ascii_space_tbl, asso_index), *src);
+ __m256i ascii_space_ltz = __lasx_xvmskltz_b(ascii_space);
+ unsigned int ascii_space_mask = __lasx_xvpickve2gr_wu(ascii_space_ltz, 0);
+ ascii_space_mask =
+ ascii_space_mask |
+ (__lsx_vpickve2gr_hu(lasx_extracti128_hi(ascii_space_ltz), 0) << 16);
+ *error |= (mask != ascii_space_mask); // error unless the flagged bytes are exactly the bytes matching ascii_space_tbl
+ }
+
+ *src = out; // hand the translated bytes back to the caller
+ return (uint32_t)mask;
+}
+
+// Runs the 32-byte to_base64_mask over both chunks of *b (rewriting them in
+// place) and joins the two 32-bit masks into one 64-bit mask, the mask of
+// chunks[0] occupying the low 32 bits. *error is cleared first and ends up
+// set if either chunk reports an error.
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, bool *error) {
+  *error = 0;
+  const uint64_t low_mask = to_base64_mask<base64_url>(&b->chunks[0], error);
+  const uint64_t high_mask = to_base64_mask<base64_url>(&b->chunks[1], error);
+  return (high_mask << 32) | low_mask;
+}
+
+// Stores the 64 bytes held in *b at output: chunks[0] at byte offset 0 and
+// chunks[1] at byte offset 32.
+static inline void copy_block(block64 *b, char *output) {
+  __m256i *const dst = reinterpret_cast<__m256i *>(output);
+  __lasx_xvst(b->chunks[0], dst, 0);
+  __lasx_xvst(b->chunks[1], dst, 32);
+}
+
+// Drops from the 64-byte block the bytes whose bit is set in mask and writes
+// the remaining bytes contiguously at output, one 16-byte quarter at a time.
+// Returns the number of bytes kept, i.e. the number of zero bits in mask.
+// NOTE(review): compress() appears to store a full 16 bytes per call, so
+// output needs room beyond the returned count -- confirm against the callers.
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  uint64_t nmask = ~mask;
+  // Popcount of each 16-bit quarter of nmask, packed as four 16-bit fields.
+  uint64_t count =
+      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
+  // Pull the fields out with shifts rather than through a uint16_t* alias of
+  // count: this yields the same values as the alias did on a little-endian
+  // host, without violating strict aliasing or depending on byte order.
+  const uint16_t kept0 = uint16_t(count);
+  const uint16_t kept1 = uint16_t(count >> 16);
+  const uint16_t kept2 = uint16_t(count >> 32);
+  compress(lasx_extracti128_lo(b->chunks[0]), uint16_t(mask), output);
+  compress(lasx_extracti128_hi(b->chunks[0]), uint16_t(mask >> 16),
+           output + kept0);
+  compress(lasx_extracti128_lo(b->chunks[1]), uint16_t(mask >> 32),
+           output + kept0 + kept1);
+  compress(lasx_extracti128_hi(b->chunks[1]), uint16_t(mask >> 48),
+           output + kept0 + kept1 + kept2);
+  return count_ones(nmask);
+}
+
+// Reads 64 consecutive bytes starting at src into the two chunks of *b.
+// It is the caller's job to guarantee that those 64 bytes are readable.
+static inline void load_block(block64 *b, const char *src) {
+  const __m256i *const in = reinterpret_cast<const __m256i *>(src);
+  b->chunks[0] = __lasx_xvld(in, 0);
+  b->chunks[1] = __lasx_xvld(in, 32);
+}
+
+// Reads 64 UTF-16 code units (128 bytes, which the caller must guarantee are
+// readable) starting at src and narrows them into the 64 bytes of *b.
+static inline void load_block(block64 *b, const char16_t *src) {
+  const __m256i *const in = reinterpret_cast<const __m256i *>(src);
+  __m256i units0 = __lasx_xvld(in, 0);
+  __m256i units1 = __lasx_xvld(in, 32);
+  __m256i units2 = __lasx_xvld(in, 64);
+  __m256i units3 = __lasx_xvld(in, 96);
+  // Narrow two registers of 16-bit units into one register of bytes, then
+  // reorder the 64-bit quarters (0, 2, 1, 3).
+  // NOTE(review): the reorder presumably undoes the per-128-bit-lane layout of
+  // the narrowing instruction so the bytes end up in source order -- confirm.
+  __m256i narrowed_lo = __lasx_xvssrlni_bu_h(units1, units0, 0);
+  b->chunks[0] = __lasx_xvpermi_d(narrowed_lo, 0b11011000);
+  __m256i narrowed_hi = __lasx_xvssrlni_bu_h(units3, units2, 0);
+  b->chunks[1] = __lasx_xvpermi_d(narrowed_hi, 0b11011000);
+}
+
+// Packs the bytes of str (expected to hold one decoded 6-bit value each --
+// TODO confirm with the producer) into consecutive output bytes at out.
+// Two 16-byte stores are issued, at out and at out + 12, so 28 bytes are
+// written; the callers in this file advance by 24 bytes per call.
+static inline void base64_decode(char *out, __m256i str) {
+  // Within every 32-bit word, shift each field to its packed position.
+  __m256i field_a = __lasx_xvslli_w(str, 26);
+  __m256i field_b = __lasx_xvslli_w(
+      __lasx_xvand_v(str, __lasx_xvldi(-1758 /*0x0000FF00*/)), 12);
+  __m256i field_c = __lasx_xvsrli_w(
+      __lasx_xvand_v(str, __lasx_xvldi(-3521 /*0x003F0000*/)), 2);
+  __m256i field_d = __lasx_xvsrli_w(str, 16);
+  __m256i merged = __lasx_xvor_v(
+      __lasx_xvor_v(__lasx_xvor_v(field_a, field_b), field_c), field_d);
+
+  // Byte shuffle that keeps bytes 3, 2, 1 of every word, in that order.
+  __m256i pack_shuffle = ____m256i(
+      (__m128i)v16u8{3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, 0, 0, 0, 0});
+  __m256i packed = __lasx_xvshuf_b(merged, merged, (__m256i)pack_shuffle);
+
+  // Store both 128-bit halves; the second store overlaps the first by 4 bytes.
+  __lsx_vst(lasx_extracti128_lo(packed), out, 0);
+  __lsx_vst(lasx_extracti128_hi(packed), out, 12);
+}
+// Decodes the 64 bytes at src into 48 bytes at out. The second half is
+// decoded straight into out + 24, so the underlying stores reach a few bytes
+// past out + 48.
+static inline void base64_decode_block(char *out, const char *src) {
+  const __m256i *const in = reinterpret_cast<const __m256i *>(src);
+  base64_decode(out, __lasx_xvld(in, 0));
+  base64_decode(out + 24, __lasx_xvld(in, 32));
+}
+
+// Same result as base64_decode_block(char *, const char *), but the second
+// half goes through a local scratch array and only 24 bytes are copied to
+// out + 24, so nothing is written at or beyond out + 48.
+static inline void base64_decode_block_safe(char *out, const char *src) {
+  const __m256i *const in = reinterpret_cast<const __m256i *>(src);
+  base64_decode(out, __lasx_xvld(in, 0));
+  char scratch[32];
+  base64_decode(scratch, __lasx_xvld(in, 32));
+  std::memcpy(out + 24, scratch, 24);
+}
+
+// Decodes an already loaded block: chunks[0] yields the 24 bytes at out and
+// chunks[1] the 24 bytes at out + 24. The stores of the second call reach a
+// few bytes past out + 48.
+static inline void base64_decode_block(char *out, block64 *b) {
+  const __m256i first_half = b->chunks[0];
+  const __m256i second_half = b->chunks[1];
+  base64_decode(out, first_half);
+  base64_decode(out + 24, second_half);
+}
+// Same result as base64_decode_block(char *, block64 *), but the second half
+// is decoded into a local scratch array and only 24 bytes are copied to
+// out + 24, so nothing is written at or beyond out + 48.
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+  base64_decode(out, b->chunks[0]);
+  char scratch[32];
+  base64_decode(scratch, b->chunks[1]);
+  std::memcpy(out + 24, scratch, 24);
+}
+
+template <bool base64_url, typename chartype> // decodes srclen base64 characters at src into dst; the result carries an error code, an input position and the number of bytes written to dst
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value // per-character table: below, values <= 63 are kept, 64 is skipped and > 64 is rejected
+ : tables::base64::to_base64_value;
+ size_t equallocation =
+ srclen; // location of the first padding character if any
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ size_t equalsigns = 0; // number of trailing '=' found (0, 1 or 2); they are removed from srclen
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 1;
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 2;
+ }
+ }
+ if (srclen == 0) { // nothing left but padding and/or spaces
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding with no data is rejected at the position of the padding
+ }
+ return {SUCCESS, 0, 0};
+ }
+ char *end_of_safe_64byte_zone = // once dst reaches this pointer the *_safe decode variants are used; it lies 63 bytes before dst + (srclen + 3) / 4 * 3
+ (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
+
+ const chartype *const srcinit = src;
+ const char *const dstinit = dst;
+ const chartype *const srcend = src + srclen;
+
+ constexpr size_t block_size = 6; // the staging buffer below holds block_size blocks of 64 bytes
+ static_assert(block_size >= 2, "block_size must be at least two");
+ char buffer[block_size * 64]; // staging area for translated values once characters had to be removed
+ char *bufferptr = buffer; // next free byte in buffer
+ if (srclen >= 64) {
+ const chartype *const srcend64 = src + srclen - 64; // last position from which a full 64-character block can be loaded
+ while (src <= srcend64) {
+ block64 b;
+ load_block(&b, src);
+ src += 64;
+ bool error = false;
+ uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error); // translates b in place; set bits mark characters to drop
+ if (error) {
+ src -= 64; // rewind to the start of the block and scan for the first offending character
+ while (src < srcend && scalar::base64::is_eight_byte(*src) &&
+ to_base64[uint8_t(*src)] <= 64) {
+ src++;
+ }
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ if (badcharmask != 0) {
+ // optimization opportunity: check for simple masks like those made of
+ // continuous 1s followed by continuous 0s. And masks containing a
+ // single bad character.
+ bufferptr += compress_block(&b, badcharmask, bufferptr); // append only the kept bytes to the staging buffer
+ } else if (bufferptr != buffer) { // clean block, but earlier bytes are still staged: append to keep the order
+ copy_block(&b, bufferptr);
+ bufferptr += 64;
+ } else { // clean block and empty staging buffer: decode straight into dst
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, &b);
+ } else {
+ base64_decode_block(dst, &b);
+ }
+ dst += 48;
+ }
+ if (bufferptr >= (block_size - 1) * 64 + buffer) { // at least block_size - 1 full blocks are staged: flush them
+ for (size_t i = 0; i < (block_size - 2); i++) {
+ base64_decode_block(dst, buffer + i * 64);
+ dst += 48;
+ }
+ if (dst >= end_of_safe_64byte_zone) { // only the last flushed block may need the safe variant
+ base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+ } else {
+ base64_decode_block(dst, buffer + (block_size - 2) * 64);
+ }
+ dst += 48;
+ std::memcpy(buffer, buffer + (block_size - 1) * 64,
+ 64); // 64 might be too much
+ bufferptr -= (block_size - 1) * 64; // the bytes beyond the flushed blocks now sit at the start of buffer
+ }
+ }
+ }
+
+ char *buffer_start = buffer; // first staged byte not yet decoded
+ // Optimization note: if this is almost full, then it is worth our
+ // time, otherwise, we should just decode directly.
+ int last_block = (int)((bufferptr - buffer_start) % 64); // number of bytes in the partially filled staged block
+ if (last_block != 0 && srcend - src + last_block >= 64) { // enough input remains to try to complete that block
+
+ while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { // top the block up one character at a time
+ uint8_t val = to_base64[uint8_t(*src)];
+ *bufferptr = char(val);
+ if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ bufferptr += (val <= 63); // characters mapped to 64 are skipped: the slot is reused
+ src++;
+ }
+ }
+
+ for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every complete staged block
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer_start);
+ } else {
+ base64_decode_block(dst, buffer_start);
+ }
+ dst += 48;
+ }
+ if ((bufferptr - buffer_start) % 64 != 0) { // a partial staged block is left: decode it four values at a time
+ while (buffer_start + 4 < bufferptr) { // strictly more than four values remain
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 4); // writes 4 bytes although dst only advances by 3
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ if (buffer_start + 4 <= bufferptr) { // exactly four values remain
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 3); // this time only the 3 decoded bytes are written
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+ // backtrack
+ int leftover = int(bufferptr - buffer_start);
+ while (leftover > 0) { // move src back over one kept character per leftover value
+ while (to_base64[uint8_t(*(src - 1))] == 64) { // stepping over skipped characters on the way
+ src--;
+ }
+ src--;
+ leftover--;
+ }
+ }
+ if (src < srcend + equalsigns) { // input (or padding) remains: hand the rest to the scalar tail decoder
+ full_result r = scalar::base64::base64_tail_decode(
+ dst, src, srcend - src, equalsigns, options, last_chunk_options);
+ r.input_count += size_t(src - srcinit); // make the input position relative to the original src
+ if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+ r.error == error_code::BASE64_EXTRA_BITS) {
+ return r;
+ } else {
+ r.output_count += size_t(dst - dstinit); // add the bytes produced before the tail
+ }
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ if ((r.output_count % 3 == 0) || // the amount of padding must match the output length modulo 3
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ r.error = error_code::INVALID_BASE64_CHARACTER;
+ r.input_count = equallocation;
+ }
+ }
+ return r;
+ }
+ if (equalsigns > 0) { // no tail to decode: apply the same padding consistency check
+ if ((size_t(dst - dstinit) % 3 == 0) ||
+ ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+ }
+ }
+ return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf16.cpp
new file mode 100644
index 000000000..a784d364e
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf16.cpp
@@ -0,0 +1,76 @@
+// Converts Latin-1 bytes to UTF-16 code units by zero-extending every byte.
+// Returns the first unprocessed input byte together with the next output
+// position; at most 15 input bytes are left for the caller to convert.
+std::pair<const char *, char16_t *>
+lasx_convert_latin1_to_utf16le(const char *buf, size_t len,
+                               char16_t *utf16_output) {
+  const char *const stop = buf + len;
+
+  // Scalar prologue: stores to addresses that are not 32-byte aligned are
+  // slower, so convert one byte at a time until the output is aligned.
+  for (; ((uint64_t)utf16_output & 0x1F) && buf < stop; buf++) {
+    *utf16_output++ = uint8_t(*buf) & 0xFF;
+  }
+
+  // Main loop: 32 input bytes become 32 code units (two 32-byte stores).
+  for (; buf + 32 <= stop; buf += 32, utf16_output += 32) {
+    __m256i bytes = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m256i units_lo = __lasx_vext2xv_hu_bu(bytes);
+    __m256i upper_bytes = __lasx_xvpermi_q(bytes, bytes, 0b00000001);
+    __m256i units_hi = __lasx_vext2xv_hu_bu(upper_bytes);
+    __lasx_xvst(units_lo, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lasx_xvst(units_hi, reinterpret_cast<uint16_t *>(utf16_output), 32);
+  }
+
+  // One extra 16-byte step with 128-bit registers when enough input is left.
+  if (buf + 16 <= stop) {
+    __m128i zero = __lsx_vldi(0);
+    __m128i bytes = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    // Interleaving with zero puts each input byte in the low byte of a unit.
+    __m128i units_lo = __lsx_vilvl_b(zero, bytes);
+    __m128i units_hi = __lsx_vilvh_b(zero, bytes);
+    __lsx_vst(units_lo, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(units_hi, reinterpret_cast<uint16_t *>(utf16_output), 16);
+
+    buf += 16;
+    utf16_output += 16;
+  }
+  return std::make_pair(buf, utf16_output);
+}
+
+// Converts Latin-1 bytes to byte-swapped UTF-16 code units: every input byte
+// ends up in the high byte of its code unit and the low byte is zero.
+// Returns the first unprocessed input byte together with the next output
+// position; at most 15 input bytes are left for the caller to convert.
+std::pair<const char *, char16_t *>
+lasx_convert_latin1_to_utf16be(const char *buf, size_t len,
+                               char16_t *utf16_output) {
+  const char *const stop = buf + len;
+
+  // Scalar prologue, run until the output address is 32-byte aligned.
+  while (((uint64_t)utf16_output & 0x1F) && buf < stop) {
+    *utf16_output++ = (uint16_t(*buf++) << 8);
+  }
+
+  __m256i zero = __lasx_xvldi(0);
+  // Main loop: 32 input bytes become 32 code units (two 32-byte stores).
+  for (; buf + 32 <= stop; buf += 32, utf16_output += 32) {
+    __m256i bytes = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    // Reorder the 64-bit quarters (0, 2, 1, 3) ahead of the interleaves.
+    __m256i reordered = __lasx_xvpermi_d(bytes, 0b11011000);
+
+    __m256i units_lo = __lasx_xvilvl_b(reordered, zero);
+    __m256i units_hi = __lasx_xvilvh_b(reordered, zero);
+    __lasx_xvst(units_lo, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lasx_xvst(units_hi, reinterpret_cast<uint16_t *>(utf16_output), 32);
+  }
+
+  // One extra 16-byte step with 128-bit registers when enough input is left.
+  if (buf + 16 <= stop) {
+    __m128i zero_128 = __lsx_vldi(0);
+    __m128i bytes = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m128i units_lo = __lsx_vilvl_b(bytes, zero_128);
+    __m128i units_hi = __lsx_vilvh_b(bytes, zero_128);
+    __lsx_vst(units_lo, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(units_hi, reinterpret_cast<uint16_t *>(utf16_output), 16);
+    buf += 16;
+    utf16_output += 16;
+  }
+
+  return std::make_pair(buf, utf16_output);
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf32.cpp
new file mode 100644
index 000000000..80402e1df
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf32.cpp
@@ -0,0 +1,55 @@
+// Converts Latin-1 bytes to UTF-32 code points by zero-extending every byte.
+// Returns the first unprocessed input byte together with the next output
+// position; at most 15 input bytes are left for the caller to convert.
+std::pair<const char *, char32_t *>
+lasx_convert_latin1_to_utf32(const char *buf, size_t len,
+                             char32_t *utf32_output) {
+  const char *const stop = buf + len;
+
+  // Scalar prologue: unaligned accesses are slower, so convert one byte at a
+  // time until the output address is 32-byte aligned.
+  for (; ((uint64_t)utf32_output & 0x1F) && buf < stop; buf++) {
+    *utf32_output++ = ((uint32_t)*buf) & 0xFF;
+  }
+
+  // Main loop: 32 input bytes become 32 code points (four 32-byte stores).
+  // Each step brings one 8-byte quarter of the input to the bottom of a
+  // register and widens those eight bytes to eight 32-bit values.
+  for (; buf + 32 <= stop; buf += 32, utf32_output += 32) {
+    __m256i bytes = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m256i quarter0 = __lasx_vext2xv_wu_bu(bytes);
+    __lasx_xvst(quarter0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+
+    __m256i quarter1 =
+        __lasx_vext2xv_wu_bu(__lasx_xvpermi_d(bytes, 0b00000001));
+    __lasx_xvst(quarter1, reinterpret_cast<uint32_t *>(utf32_output), 32);
+
+    __m256i quarter2 =
+        __lasx_vext2xv_wu_bu(__lasx_xvpermi_d(bytes, 0b00000010));
+    __lasx_xvst(quarter2, reinterpret_cast<uint32_t *>(utf32_output), 64);
+
+    __m256i quarter3 =
+        __lasx_vext2xv_wu_bu(__lasx_xvpermi_d(bytes, 0b00000011));
+    __lasx_xvst(quarter3, reinterpret_cast<uint32_t *>(utf32_output), 96);
+  }
+
+  // One extra 16-byte step with 128-bit registers when enough input is left.
+  if (buf + 16 <= stop) {
+    __m128i bytes = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    // Widen in two rounds of interleaving with zero: bytes to 16-bit values,
+    // then 16-bit values to 32-bit values.
+    __m128i zero = __lsx_vldi(0);
+    __m128i halves_lo = __lsx_vilvl_b(zero, bytes);
+    __m128i halves_hi = __lsx_vilvh_b(zero, bytes);
+    __m128i words0 = __lsx_vilvl_h(zero, halves_lo);
+    __m128i words1 = __lsx_vilvh_h(zero, halves_lo);
+    __m128i words2 = __lsx_vilvl_h(zero, halves_hi);
+    __m128i words3 = __lsx_vilvh_h(zero, halves_hi);
+
+    __lsx_vst(words0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(words1, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    __lsx_vst(words2, reinterpret_cast<uint32_t *>(utf32_output), 32);
+    __lsx_vst(words3, reinterpret_cast<uint32_t *>(utf32_output), 48);
+
+    buf += 16;
+    utf32_output += 16;
+  }
+
+  return std::make_pair(buf, utf32_output);
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf8.cpp
new file mode 100644
index 000000000..f12270649
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_latin1_to_utf8.cpp
@@ -0,0 +1,65 @@
+/*
+  Converts Latin-1 bytes to UTF-8, 16 input bytes per iteration.
+  Returns a pair: the first unprocessed byte of latin1_input and the next
+  free position in the UTF-8 output.
+  A scalar routine should carry on the conversion of the tail.
+*/
+
+std::pair<const char *, char *>
+lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                            char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const size_t safety_margin = 12;
+  // Inputs shorter than the margin are left entirely to the scalar tail.
+  // Returning here also avoids forming `latin1_input + len - safety_margin`
+  // when it would point before the start of the buffer.
+  if (len < safety_margin) {
+    return std::make_pair(latin1_input, utf8_out);
+  }
+  const char *end = latin1_input + len - safety_margin;
+
+  // Every store below writes 16 bytes, of which at least the first 8 are
+  // valid output, so the last safety_margin input bytes are never processed
+  // here (the original note states that a margin of 8 would already do).
+  // The remaining length is compared instead of `latin1_input + 16 <= end`
+  // so that no pointer beyond the end of the input is ever formed.
+  while (size_t(end - latin1_input) >= 16) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
+    // One bit per input byte, set when the byte is >= 0 as a signed value,
+    // i.e. when it is ASCII.
+    uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0);
+    if (ascii_mask == 0xFFFF) {
+      // All 16 bytes are ASCII: copy them through unchanged.
+      __lsx_vst(in8, utf8_output, 0);
+      utf8_output += 16;
+      latin1_input += 16;
+      continue;
+    }
+    // We just fallback on UTF-16 code. This could be optimized/simplified
+    // further.
+    __m256i in16 = __lasx_vext2xv_hu_bu(____m256i(in8));
+    // 1. prepare 2-byte values
+    // input 8-bit word : [aabb|bbbb] x 16
+    // expected output : [1100|00aa|10bb|bbbb] x 16
+    // t0 = [0000|00aa|bbbb|bb00]
+    __m256i t0 = __lasx_xvslli_h(in16, 2);
+    // t1 = [0000|00aa|0000|0000]
+    __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785));
+    // t2 = [0000|00aa|00bb|bbbb]
+    __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f));
+    // t3 = [1100|00aa|10bb|bbbb]
+    __m256i t3 = __lasx_xvor_v(t2, __lasx_xvreplgr2vr_h(uint16_t(0xc080)));
+    // merge ASCII and 2-byte codewords
+    __m256i one_byte_bytemask = __lasx_xvsle_hu(in16, __lasx_xvrepli_h(0x7F));
+    __m256i utf8_unpacked = __lasx_xvbitsel_v(t3, in16, one_byte_bytemask);
+
+    // 2. compress the first eight code words: row0[0] is the number of
+    // output bytes and row0 + 1 the shuffle that packs them.
+    const uint8_t *row0 =
+        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+            [lasx_1_2_utf8_bytes_mask[(ascii_mask & 0xFF)]][0];
+    __m128i shuffle0 = __lsx_vld(row0 + 1, 0);
+    __m128i utf8_unpacked_lo = lasx_extracti128_lo(utf8_unpacked);
+    __m128i utf8_packed0 =
+        __lsx_vshuf_b(utf8_unpacked_lo, utf8_unpacked_lo, shuffle0);
+    __lsx_vst(utf8_packed0, utf8_output, 0);
+    utf8_output += row0[0];
+
+    // 3. same for the last eight code words.
+    const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                              [lasx_1_2_utf8_bytes_mask[(ascii_mask >> 8)]][0];
+    __m128i shuffle1 = __lsx_vld(row1 + 1, 0);
+    __m128i utf8_unpacked_hi = lasx_extracti128_hi(utf8_unpacked);
+    __m128i utf8_packed1 =
+        __lsx_vshuf_b(utf8_unpacked_hi, utf8_unpacked_hi, shuffle1);
+    __lsx_vst(utf8_packed1, utf8_output, 0);
+    utf8_output += row1[0];
+
+    latin1_input += 16;
+  } // while
+
+  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf16_to_latin1.cpp
new file mode 100644
index 000000000..97fcbc925
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf16_to_latin1.cpp
@@ -0,0 +1,66 @@
+// Narrows UTF-16 code units to Latin-1 bytes, 16 code units per iteration.
+// Returns the first unprocessed code unit and the next output position, or a
+// null input pointer as soon as a 16-unit group holds a value above 0xFF.
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lasx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                             char *latin1_output) {
+  const char16_t *const stop = buf + len;
+  for (; buf + 16 <= stop; buf += 16, latin1_output += 16) {
+    __m128i first = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i second = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if (!match_system(big_endian)) {
+      first = lsx_swap_bytes(first);
+      second = lsx_swap_bytes(second);
+    }
+    // The odd bytes are the high bytes of the code units: if any of them is
+    // non-zero, some code unit does not fit in Latin-1.
+    if (!__lsx_bz_v(__lsx_vpickod_b(second, first))) {
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+    }
+    // Keep the even (low) bytes and store the 16 resulting Latin-1 bytes.
+    __m128i latin1_packed = __lsx_vpickev_b(second, first);
+    __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+  }
+  return std::make_pair(buf, latin1_output);
+}
+
+// Narrows UTF-16 code units to Latin-1 bytes, 16 code units per iteration.
+// On success the result holds SUCCESS and the number of code units consumed.
+// If a code unit above 0xFF is met, the result holds TOO_LARGE and the index
+// of that code unit; the bytes converted before it have been written.
+template <endianness big_endian>
+std::pair<result, char *>
+lasx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                         char *latin1_output) {
+  const char16_t *const start = buf;
+  const char16_t *const stop = buf + len;
+  while (buf + 16 <= stop) {
+    __m128i first = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i second = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if (!match_system(big_endian)) {
+      first = lsx_swap_bytes(first);
+      second = lsx_swap_bytes(second);
+    }
+    if (__lsx_bz_v(__lsx_vpickod_b(second, first))) {
+      // All high bytes are zero: keep the low bytes and store 16 bytes.
+      __m128i latin1_packed = __lsx_vpickev_b(second, first);
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      buf += 16;
+      latin1_output += 16;
+      continue;
+    }
+    // Some code unit is too large: walk the group with scalar code to emit
+    // the valid prefix and report the exact position of the offender.
+    for (int k = 0; k < 16; k++) {
+      uint16_t word = buf[k];
+      if (!match_system(big_endian)) {
+        word = scalar::utf16::swap_bytes(buf[k]);
+      }
+      if (word > 0xff) {
+        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                              latin1_output);
+      }
+      *latin1_output++ = char(word);
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf32.cpp
new file mode 100644
index 000000000..85fe6c98d
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf32.cpp
@@ -0,0 +1,195 @@
+template <endianness big_endian> // converts UTF-16 (byte-swapped first when big_endian does not match the system) to UTF-32
+std::pair<const char16_t *, char32_t *> // first unprocessed code unit (nullptr on invalid input) and next output position
+lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_out) {
+ uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+ const char16_t *end = buf + len;
+
+ // Performance degradation when memory address is not 32-byte aligned
+ while (((uint64_t)utf32_output & 0x1f) && buf < end) { // scalar prologue until the output address is 32-byte aligned
+ uint16_t word =
+ !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[0]) : buf[0];
+ if ((word & 0xF800) != 0xD800) { // not in 0xD800..0xDFFF, so not a surrogate: copy as is
+ *utf32_output++ = char32_t(word);
+ buf++;
+ } else {
+ if (buf + 1 >= end) { // a surrogate with no code unit after it
+ return std::make_pair(nullptr,
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[1])
+ : buf[1];
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // both offsets must fit in 10 bits: 0xD800..0xDBFF followed by 0xDC00..0xDFFF
+ return std::make_pair(nullptr,
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000; // combine the pair into one code point
+ *utf32_output++ = char32_t(value);
+ buf += 2;
+ }
+ }
+
+ __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/
+ __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+
+ while (buf + 16 <= end) { // 16 code units per iteration
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+ if (!match_system(big_endian)) {
+ in = lasx_swap_bytes(in);
+ }
+
+ __m256i surrogates_bytemask = // all-ones in every 16-bit element whose value is a surrogate
+ __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (__lasx_xbz_v(surrogates_bytemask)) {
+ // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+ // units
+ __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001); // presumably brings the upper eight code units into the low half
+ __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
+ __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // only the first 15 code units are started here, so buf[k + 1] stays inside the loaded group
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // cannot hold here: the loop guard ensures end - buf >= 16
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ *utf32_output++ = char32_t(word);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k + 1])
+ : buf[k + 1];
+ k++; // the pair consumes two code units
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k; // k is 15 or 16, depending on whether a pair ended on the last code unit
+ }
+ } // while
+ return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+}
+
+/*
+ Returns a pair: a result struct and utf32_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed code unit in
+ buf (even if finished). A scalar routine should carry on the conversion of
+ the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+ char32_t *utf32_out) {
+ uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+ const char16_t *start = buf; // kept to express positions relative to the beginning of the input
+ const char16_t *end = buf + len;
+
+ // Performance degradation when memory address is not 32-byte aligned
+ while (((uint64_t)utf32_output & 0x1f) && buf < end) { // scalar prologue until the output address is 32-byte aligned
+ uint16_t word =
+ !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[0]) : buf[0];
+ if ((word & 0xF800) != 0xD800) { // not in 0xD800..0xDFFF, so not a surrogate: copy as is
+ *utf32_output++ = char32_t(word);
+ buf++;
+ } else if (buf + 1 < end) {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[1])
+ : buf[1];
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) { // both offsets must fit in 10 bits: 0xD800..0xDBFF followed by 0xDC00..0xDFFF
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000; // combine the pair into one code point
+ *utf32_output++ = char32_t(value);
+ buf += 2;
+ } else { // a surrogate with no code unit after it
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ }
+
+ __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/
+ __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+ while (buf + 16 <= end) { // 16 code units per iteration
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+ if (!match_system(big_endian)) {
+ in = lasx_swap_bytes(in);
+ }
+
+ __m256i surrogates_bytemask = // all-ones in every 16-bit element whose value is a surrogate
+ __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (__lasx_xbz_v(surrogates_bytemask)) {
+ // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+ // units
+ __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001); // presumably brings the upper eight code units into the low half
+ __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
+ __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
+ utf32_output += 16;
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // only the first 15 code units are started here, so buf[k + 1] stays inside the loaded group
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // cannot hold here: the loop guard ensures end - buf >= 16
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ *utf32_output++ = char32_t(word);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k + 1])
+ : buf[k + 1];
+ k++; // the pair consumes two code units
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // k was already advanced, so k - 1 is the first unit of the bad pair
+ reinterpret_cast<char32_t *>(utf32_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k; // k is 15 or 16, depending on whether a pair ended on the last code unit
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ reinterpret_cast<char32_t *>(utf32_output));
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf8.cpp
new file mode 100644
index 000000000..c5bf6e5e2
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf16_to_utf8.cpp
@@ -0,0 +1,558 @@
+/*
+ The vectorized algorithm works on a single LASX register, i.e., it
+ loads sixteen 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it's an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress each 128-bit half of the LASX register
+ with a single shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming a sixteen-element
+ array of 32-bit values. The array spans two LASX registers.
+ The bytes of each 128-bit half are compressed with one shuffle (four in total).
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+ Returns a pair: the first unprocessed code unit of buf (nullptr on error) and utf8_output
+ A scalar routine should carry on the conversion of the tail.
+*/
+
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char16_t *end = buf + len;
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff));
+ __m256i zero = __lasx_xvldi(0);
+ __m128i zero_128 = __lsx_vldi(0);
+ while (buf + 16 + safety_margin <= end) { // 16 code units per iteration
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+ if (!match_system(big_endian)) {
+ in = lasx_swap_bytes(in);
+ }
+ if (__lasx_xbnz_h(__lasx_xvslt_hu(
+ in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path; the compare is strict (< 0x7F), so a block holding 0x7F presumably falls through to the 1-2 byte path
+ // 1. pack the bytes (16-bit units narrowed to one byte each)
+ __m256i utf8_packed =
+ __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000);
+ // 2. store (16 bytes)
+ __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+
+ if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) { // no unit is above 0x07FF: only 1- or 2-byte outputs
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ // t0 = [000a|aaaa|bbbb|bb00]
+ __m256i t0 = __lasx_xvslli_h(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+ // t2 = [0000|0000|00bb|bbbb]
+ __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f));
+ // t3 = [000a|aaaa|00bb|bbbb]
+ __m256i t3 = __lasx_xvor_v(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080));
+ __m256i t4 = __lasx_xvor_v(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ __m256i one_byte_bytemask =
+ __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/));
+ __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask);
+ // 3. prepare bitmask for 8-bit lookup (one mask per 128-bit half: m1 low, m2 high)
+ __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+ uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+ uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+ // 4. pack the bytes; table rows hold the output length in byte 0 and the shuffle pattern from byte 1
+ const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m1]][0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_packed1 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+ const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m2]][0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_packed2 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+ // 5. store bytes: each store writes a whole 128-bit register, the pointer only advances by row[0]
+ __lsx_vst(utf8_packed1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ __lsx_vst(utf8_packed2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ buf += 16;
+ continue;
+ }
+ __m256i surrogates_bytemask =
+ __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)),
+ __lasx_xvldi(-2600 /*0xD800*/));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (__lasx_xbz_v(surrogates_bytemask)) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m256i t0 = __lasx_xvpickev_b(in, in);
+ t0 = __lasx_xvilvl_b(t0, t0);
+
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+ __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m256i s0 = __lasx_xvsrli_h(in, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+ __m256i s1 = __lasx_xvslli_h(in, 2);
+ // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+ s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+
+ // [00bb|bbbb|0000|aaaa]
+ __m256i s2 = __lasx_xvor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+ __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+ __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
+ __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+ __lasx_xvldi(-2752 /*0x4000*/));
+ __m256i s4 = __lasx_xvxor_v(s3, m0); // presumably flips 0x4000 on 3-byte lanes only: [11bb|bbbb] => [10bb|bbbb]
+
+ // 4. expand code units 16-bit => 32-bit
+ __m256i out0 = __lasx_xvilvl_h(s4, t2);
+ __m256i out1 = __lasx_xvilvh_h(s4, t2);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 4 x shuffle (one per 128-bit half of out0/out1)
+ __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F));
+ __m256i one_byte_bytemask_low =
+ __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+ __m256i one_byte_bytemask_high =
+ __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+ __m256i one_or_two_bytes_bytemask_low =
+ __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+ __m256i one_or_two_bytes_bytemask_high =
+ __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+ __m256i mask0 = __lasx_xvmskltz_h(
+ __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low));
+ __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v(
+ one_or_two_bytes_bytemask_high, one_byte_bytemask_high));
+
+ uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle0 = __lsx_vld(row0, 1);
+ __m128i utf8_0 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+ __lsx_vst(utf8_0, utf8_output, 0);
+ utf8_output += row0[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 0);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_1 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+ __lsx_vst(utf8_1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask0, 4);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_2 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+ __lsx_vst(utf8_2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 4);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle3 = __lsx_vld(row3, 1);
+ __m128i utf8_3 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+ __lsx_vst(utf8_3, utf8_output, 0);
+ utf8_output += row3[0];
+
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // at most 15 units here, so the buf[k + 1] look-ahead stays within this 16-unit block
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair: a unit in 0xD800..0xDBFF followed by one in 0xDC00..0xDFFF
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k + 1])
+ : buf[k + 1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+/*
+ Returns a pair: a result struct and utf8_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed code unit in buf
+ (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+ char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char16_t *start = buf;
+ const char16_t *end = buf + len;
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff));
+ __m256i zero = __lasx_xvldi(0);
+ __m128i zero_128 = __lsx_vldi(0);
+ while (buf + 16 + safety_margin <= end) { // 16 code units per iteration
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+ if (!match_system(big_endian)) {
+ in = lasx_swap_bytes(in);
+ }
+ if (__lasx_xbnz_h(__lasx_xvslt_hu(
+ in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path; the compare is strict (< 0x7F), so a block holding 0x7F presumably falls through to the 1-2 byte path
+ // 1. pack the bytes (16-bit units narrowed to one byte each)
+ __m256i utf8_packed =
+ __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000);
+ // 2. store (16 bytes)
+ __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+
+ if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) { // no unit is above 0x07FF: only 1- or 2-byte outputs
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ // t0 = [000a|aaaa|bbbb|bb00]
+ __m256i t0 = __lasx_xvslli_h(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+ // t2 = [0000|0000|00bb|bbbb]
+ __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f));
+ // t3 = [000a|aaaa|00bb|bbbb]
+ __m256i t3 = __lasx_xvor_v(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080));
+ __m256i t4 = __lasx_xvor_v(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ __m256i one_byte_bytemask =
+ __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/));
+ __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask);
+ // 3. prepare bitmask for 8-bit lookup (one mask per 128-bit half: m1 low, m2 high)
+ __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+ uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+ uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+ // 4. pack the bytes; table rows hold the output length in byte 0 and the shuffle pattern from byte 1
+ const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m1]][0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_packed1 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+ const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m2]][0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_packed2 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+ // 5. store bytes: each store writes a whole 128-bit register, the pointer only advances by row[0]
+ __lsx_vst(utf8_packed1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ __lsx_vst(utf8_packed2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ buf += 16;
+ continue;
+ }
+ __m256i surrogates_bytemask =
+ __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)),
+ __lasx_xvldi(-2600 /*0xD800*/));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (__lasx_xbz_v(surrogates_bytemask)) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m256i t0 = __lasx_xvpickev_b(in, in);
+ t0 = __lasx_xvilvl_b(t0, t0);
+
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+ __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m256i s0 = __lasx_xvsrli_h(in, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+ __m256i s1 = __lasx_xvslli_h(in, 2);
+ // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+ s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+
+ // [00bb|bbbb|0000|aaaa]
+ __m256i s2 = __lasx_xvor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+ __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+ __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
+ __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+ __lasx_xvldi(-2752 /*0x4000*/));
+ __m256i s4 = __lasx_xvxor_v(s3, m0); // presumably flips 0x4000 on 3-byte lanes only: [11bb|bbbb] => [10bb|bbbb]
+
+ // 4. expand code units 16-bit => 32-bit
+ __m256i out0 = __lasx_xvilvl_h(s4, t2);
+ __m256i out1 = __lasx_xvilvh_h(s4, t2);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 4 x shuffle (one per 128-bit half of out0/out1)
+ __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F));
+ __m256i one_byte_bytemask_low =
+ __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+ __m256i one_byte_bytemask_high =
+ __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+ __m256i one_or_two_bytes_bytemask_low =
+ __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+ __m256i one_or_two_bytes_bytemask_high =
+ __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+ __m256i mask0 = __lasx_xvmskltz_h(
+ __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low));
+ __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v(
+ one_or_two_bytes_bytemask_high, one_byte_bytemask_high));
+
+ uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle0 = __lsx_vld(row0, 1);
+ __m128i utf8_0 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+ __lsx_vst(utf8_0, utf8_output, 0);
+ utf8_output += row0[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 0);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_1 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+ __lsx_vst(utf8_1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask0, 4);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_2 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+ __lsx_vst(utf8_2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 4);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle3 = __lsx_vld(row3, 1);
+ __m128i utf8_3 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+ __lsx_vst(utf8_3, utf8_output, 0);
+ utf8_output += row3[0];
+
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // at most 15 units here, so the buf[k + 1] look-ahead stays within this 16-unit block
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair: a unit in 0xD800..0xDBFF followed by one in 0xDC00..0xDFFF
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k + 1])
+ : buf[k + 1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // k was already bumped: report the first unit of the pair
+ reinterpret_cast<char *>(utf8_output));
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf32_to_latin1.cpp
new file mode 100644
index 000000000..bfcc783a6
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf32_to_latin1.cpp
@@ -0,0 +1,73 @@
+std::pair<const char32_t *, char *> // {first unconverted input, or nullptr if a code point > 0xFF was met; output cursor}
+lasx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *end = buf + len;
+ const __m256i shuf_mask = ____m256i(
+ (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
+ __m256i v_ff = __lasx_xvrepli_w(0xFF);
+
+ while (buf + 16 <= end) { // 16 code points (two 256-bit loads) per iteration; the tail is left to the caller
+ __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
+
+ __m256i in12 = __lasx_xvor_v(in1, in2); // OR both halves: any bit above 0xFF in either input survives
+ if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) {
+ // 1. pack the bytes (low byte of each 32-bit code point, then restore input order)
+ __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
+ latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
+ __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
+ latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
+ // 2. store (16 bytes)
+ __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ // 3. adjust pointers
+ buf += 16;
+ latin1_output += 16;
+ } else {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+ }
+ } // while
+ return std::make_pair(buf, latin1_output);
+}
+
+std::pair<result, char *> // {SUCCESS + count of converted inputs, or TOO_LARGE + index of the offending input; output cursor}
+lasx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ const __m256i shuf_mask = ____m256i(
+ (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
+ __m256i v_ff = __lasx_xvrepli_w(0xFF);
+
+ while (buf + 16 <= end) { // 16 code points (two 256-bit loads) per iteration; the tail is left to the caller
+ __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
+
+ __m256i in12 = __lasx_xvor_v(in1, in2); // OR both halves: any bit above 0xFF in either input survives
+ if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) {
+ // 1. pack the bytes (low byte of each 32-bit code point, then restore input order)
+ __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
+ latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
+ __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
+ latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
+ // 2. store (16 bytes)
+ __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ // 3. adjust pointers
+ buf += 16;
+ latin1_output += 16;
+ } else {
+ // Scalar fallback: copy the valid prefix of this block, then report the first code point > 0xFF.
+ for (int k = 0; k < 16; k++) {
+ uint32_t word = buf[k];
+ if (word <= 0xff) {
+ *latin1_output++ = char(word);
+ } else {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+ latin1_output);
+ }
+ }
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf16.cpp
new file mode 100644
index 000000000..7d49ba1bb
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf16.cpp
@@ -0,0 +1,218 @@
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *> // {first unconverted input, or nullptr on a surrogate / value > 0x10FFFF; output cursor}
+lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_out) {
+ uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
+ const char32_t *end = buf + len;
+
+ // Scalar prologue, one code point at a time, until utf16_output is 32-byte aligned (performance degradation otherwise)
+ while (((uint64_t)utf16_output & 0x1F) && buf < end) {
+ uint32_t word = *buf++;
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(word >> 8 | word << 8)
+ : char16_t(word);
+ // (buf was already advanced by *buf++ above)
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+ low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ // (buf was already advanced by *buf++ above)
+ }
+ }
+
+ __m256i forbidden_bytemask = __lasx_xvrepli_h(0);
+ __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+ __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff));
+ while (buf + 16 <= end) { // 16 code points (two 256-bit loads) per iteration
+ __m256i in0 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
+
+ // Check if no bits set above 16th
+ if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) {
+ __m256i utf16_packed =
+ __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000);
+ forbidden_bytemask = __lasx_xvor_v(
+ __lasx_xvand_v(
+ __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+
+ if (!match_system(big_endian)) {
+ utf16_packed = lasx_swap_bytes(utf16_packed);
+ }
+ __lasx_xvst(utf16_packed, utf16_output, 0);
+ utf16_output += 16;
+ buf += 16;
+ } else {
+ size_t forward = 15; // scalar path converts at most 15 code points; the loop resumes from buf + k
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(word >> 8 | word << 8)
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate =
+ uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+ low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ // check for invalid input: surrogates met on the vector path are accumulated in forbidden_bytemask and only reported here
+ if (__lasx_xbnz_v(forbidden_bytemask)) {
+ return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
+ }
+ return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output));
+}
+
+template <endianness big_endian>
+std::pair<result, char16_t *> // {SUCCESS + count of converted inputs, or SURROGATE / TOO_LARGE + input position; output cursor}
+lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_out) {
+ uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ // Scalar prologue, one code point at a time, until utf16_output is 32-byte aligned (performance degradation otherwise)
+ while (((uint64_t)utf16_output & 0x1F) && buf < end) {
+ uint32_t word = *buf++;
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start - 1), // -1: buf was already advanced past the bad input
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(word >> 8 | word << 8)
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start - 1), // -1: buf was already advanced past the bad input
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+ low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+
+ __m256i forbidden_bytemask = __lasx_xvrepli_h(0);
+ __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+ __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff));
+ while (buf + 16 <= end) { // 16 code points (two 256-bit loads) per iteration
+ __m256i in0 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
+
+ // Check if no bits set above 16th
+ if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) {
+ __m256i utf16_packed =
+ __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000);
+ forbidden_bytemask = __lasx_xvor_v(
+ __lasx_xvand_v(
+ __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+ if (__lasx_xbnz_v(forbidden_bytemask)) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start), // count is the start of this 16-input block, not the exact offender; NOTE(review): presumably the caller re-scans from here -- confirm
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+
+ if (!match_system(big_endian)) {
+ utf16_packed = lasx_swap_bytes(utf16_packed);
+ }
+
+ __lasx_xvst(utf16_packed, utf16_output, 0);
+ utf16_output += 16;
+ buf += 16;
+ } else {
+ size_t forward = 15; // scalar path converts at most 15 code points; the loop resumes from buf + k
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k),
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(word >> 8 | word << 8)
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k),
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate =
+ uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+ low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ reinterpret_cast<char16_t *>(utf16_output));
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf8.cpp
new file mode 100644
index 000000000..355a5753c
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf32_to_utf8.cpp
@@ -0,0 +1,589 @@
+std::pair<const char32_t *, char *> // {next unconverted input, or nullptr on invalid input; end of the written output}
+lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // UTF-32 -> UTF-8; may return before reaching buf + len (see the main loop bound)
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char32_t *end = buf + len;
+
+ // Scalar prologue: convert one code point at a time until buf is 32-byte aligned (or the input ends)
+ while (((uint64_t)buf & 0x1F) && buf < end) {
+ uint32_t word = *buf;
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ buf++;
+ }
+
+ __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080));
+ __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF));
+ __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF));
+ __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+ __m256i zero = __lasx_xvldi(0);
+ __m128i zero_128 = __lsx_vldi(0);
+ __m256i forbidden_bytemask = __lasx_xvldi(0x0);
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (buf + 16 + safety_margin < end) {
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
+
+ // Check that none of the 16 loaded code points has a bit set above the 16th
+ if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
+ // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+ // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
+ __m256i utf16_packed =
+ __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);
+
+ if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
+ utf16_packed))) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ __m256i utf8_packed = __lasx_xvpermi_d(
+ __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
+ // 2. store 16 bytes (one per input code point)
+ __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+
+ if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = __lasx_xvor_v(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = __lasx_xvor_v(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ __m256i one_byte_bytemask =
+ __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
+ __m256i utf8_unpacked =
+ __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
+ // 3. prepare bitmask for 8-bit lookup (one mask per 128-bit half: m1 low, m2 high)
+ __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+ uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+ uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+ // 4. pack the bytes (row[0] is the output length, the shuffle starts at row + 1)
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m1]][0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_packed1 = __lsx_vshuf_b(
+ zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m2]][0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_packed2 = __lsx_vshuf_b(
+ zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+ // 5. store bytes (full-vector stores; the pointer advances only by the row's length)
+ __lsx_vst(utf8_packed1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ __lsx_vst(utf8_packed2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ buf += 16;
+ continue;
+ } else {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ forbidden_bytemask = __lasx_xvor_v(
+ __lasx_xvand_v(
+ __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 &
+ #3 in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
+ t0 = __lasx_xvilvl_b(t0, t0);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+ __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+ // [00bb|bbbb|0000|aaaa]
+ __m256i s2 = __lasx_xvor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+ __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+ // v_07ff (0x07FF in every 16-bit lane) is set up once before the main loop
+ __m256i one_or_two_bytes_bytemask =
+ __lasx_xvsle_hu(utf16_packed, v_07ff);
+ __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+ __lasx_xvldi(-2752 /*0x4000*/));
+ __m256i s4 = __lasx_xvxor_v(s3, m0);
+
+ // 4. expand code units 16-bit => 32-bit
+ __m256i out0 = __lasx_xvilvl_h(s4, t2);
+ __m256i out1 = __lasx_xvilvh_h(s4, t2);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ __m256i one_byte_bytemask =
+ __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));
+
+ __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
+ __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+ __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
+ __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+ __m256i one_byte_bytemask_u16_to_u32_low =
+ __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+ __m256i one_byte_bytemask_u16_to_u32_high =
+ __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+ __m256i mask0 = __lasx_xvmskltz_h(
+ __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
+ one_byte_bytemask_u16_to_u32_low));
+ __m256i mask1 = __lasx_xvmskltz_h(
+ __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
+ one_byte_bytemask_u16_to_u32_high));
+
+ uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle0 = __lsx_vld(row0, 1);
+ __m128i utf8_0 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+ __lsx_vst(utf8_0, utf8_output, 0);
+ utf8_output += row0[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 0);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_1 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+ __lsx_vst(utf8_1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask0, 4);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_2 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+ __lsx_vst(utf8_2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 4);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle3 = __lsx_vld(row3, 1);
+ __m128i utf8_3 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+ __lsx_vst(utf8_3, utf8_output, 0);
+ utf8_output += row3[0];
+
+ buf += 16;
+ }
+ // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes.
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // at most 15 code points are converted per fallback pass
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // surrogates met in the 1-2-3 byte SIMD branch were only accumulated in forbidden_bytemask; report them now
+ if (__lasx_xbnz_v(forbidden_bytemask)) {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+ }
+ return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+std::pair<result, char *> // {result(error code, input offset relative to the initial buf), end of the written output}
+lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_out) { // UTF-32 -> UTF-8 with error reporting; on SUCCESS the offset may be smaller than len (see the main loop bound)
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ // Scalar prologue: convert one code point at a time until buf is 32-byte aligned (or the input ends)
+ while (((uint64_t)buf & 0x1F) && buf < end) {
+ uint32_t word = *buf;
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ buf++;
+ }
+
+ __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080));
+ __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF));
+ __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF));
+ __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+ __m256i zero = __lasx_xvldi(0);
+ __m128i zero_128 = __lsx_vldi(0);
+ __m256i forbidden_bytemask = __lasx_xvldi(0x0);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (buf + 16 + safety_margin < end) {
+ __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
+
+ // Check that none of the 16 loaded code points has a bit set above the 16th
+ if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
+ // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+ // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
+ __m256i utf16_packed =
+ __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);
+
+ if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
+ utf16_packed))) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ __m256i utf8_packed = __lasx_xvpermi_d(
+ __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
+ // 2. store 16 bytes (one per input code point)
+ __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+
+ if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = __lasx_xvor_v(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = __lasx_xvor_v(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ __m256i one_byte_bytemask =
+ __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
+ __m256i utf8_unpacked =
+ __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
+ // 3. prepare bitmask for 8-bit lookup (one mask per 128-bit half: m1 low, m2 high)
+ __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+ uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+ uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+ // 4. pack the bytes (row[0] is the output length, the shuffle starts at row + 1)
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m1]][0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_packed1 = __lsx_vshuf_b(
+ zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lasx_1_2_utf8_bytes_mask[m2]][0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_packed2 = __lsx_vshuf_b(
+ zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+ // 5. store bytes (full-vector stores; the pointer advances only by the row's length)
+ __lsx_vst(utf8_packed1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ __lsx_vst(utf8_packed2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ buf += 16;
+ continue;
+ } else {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ forbidden_bytemask = __lasx_xvor_v(
+ __lasx_xvand_v(
+ __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+ if (__lasx_xbnz_v(forbidden_bytemask)) { // the reported offset is the start of this 16-code-point block, not the exact offending element
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 &
+ #3 in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** --
+ precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+ they differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence,
+ taking into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
+ t0 = __lasx_xvilvl_b(t0, t0);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+ __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+ // [00bb|bbbb|0000|aaaa]
+ __m256i s2 = __lasx_xvor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+ __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+ // v_07ff (0x07FF in every 16-bit lane) is set up once before the main loop
+ __m256i one_or_two_bytes_bytemask =
+ __lasx_xvsle_hu(utf16_packed, v_07ff);
+ __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+ __lasx_xvldi(-2752 /*0x4000*/));
+ __m256i s4 = __lasx_xvxor_v(s3, m0);
+
+ // 4. expand code units 16-bit => 32-bit
+ __m256i out0 = __lasx_xvilvl_h(s4, t2);
+ __m256i out1 = __lasx_xvilvh_h(s4, t2);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ __m256i one_byte_bytemask =
+ __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));
+
+ __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
+ __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+ __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
+ __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+ __m256i one_byte_bytemask_u16_to_u32_low =
+ __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+ __m256i one_byte_bytemask_u16_to_u32_high =
+ __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+ __m256i mask0 = __lasx_xvmskltz_h(
+ __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
+ one_byte_bytemask_u16_to_u32_low));
+ __m256i mask1 = __lasx_xvmskltz_h(
+ __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
+ one_byte_bytemask_u16_to_u32_high));
+
+ uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle0 = __lsx_vld(row0, 1);
+ __m128i utf8_0 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+ __lsx_vst(utf8_0, utf8_output, 0);
+ utf8_output += row0[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 0);
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle1 = __lsx_vld(row1, 1);
+ __m128i utf8_1 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+ __lsx_vst(utf8_1, utf8_output, 0);
+ utf8_output += row1[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask0, 4);
+ const uint8_t *row2 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle2 = __lsx_vld(row2, 1);
+ __m128i utf8_2 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+ __lsx_vst(utf8_2, utf8_output, 0);
+ utf8_output += row2[0];
+
+ mask = __lasx_xvpickve2gr_wu(mask1, 4);
+ const uint8_t *row3 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+ [0];
+ __m128i shuffle3 = __lsx_vld(row3, 1);
+ __m128i utf8_3 =
+ __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+ __lsx_vst(utf8_3, utf8_output, 0);
+ utf8_output += row3[0];
+
+ buf += 16;
+ }
+ // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes.
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // at most 15 code points are converted per fallback pass
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf8_to_latin1.cpp
new file mode 100644
index 000000000..cafc04946
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf8_to_latin1.cpp
@@ -0,0 +1,72 @@
+size_t convert_masked_utf8_to_latin1(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char *&latin1_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ // Returns the number of input bytes consumed; latin1_output is advanced past the bytes written.
+ __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+
+ // We first try a few fast paths.
+ // The obvious first test is ASCII, which actually consumes the full 16.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+ // We process in chunks of 16 bytes
+ __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ latin1_output += 16; // We wrote 16 8-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+ /// We do not have a fast path available, or the fast path is unimportant, so
+ /// we fallback.
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ // this indicates an invalid input:
+ if (idx >= 64) {
+ return consumed;
+ }
+ // Here we should have (idx < 64), if not, there is a bug in the validation or
+ // elsewhere. This is a relatively easy scenario: we process SIX (6) input
+ // code points. The max length in bytes of six code points spanning between
+ // 1 and 2 bytes each is 12 bytes. The shuffle below lays every code point
+ // out in its own 16-bit lane; the lanes are then narrowed to one Latin-1
+ // byte each, so 6 UTF-8 code points become 6 Latin-1 bytes. `consumed`
+ // (read from the same table row as idx) is returned as the bytes used.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // Shuffle
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 110aaaaa 10bbbbbb
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+ // ascii mask
+ // 1 byte: 11111111 11111111
+ // 2 byte: 00000000 00000000
+ __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
+ // utf8 mask
+ // 1 byte: 00000000 00000000
+ // 2 byte: 00111111 00111111
+ __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
+ __lsx_vldi(0b00111111));
+ // mask
+ // 1 byte: 11111111 11111111
+ // 2 byte: 00111111 00111111
+ __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
+
+ __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
+ // only the first 6 bytes matter, but the store below is the same full-vector store as the 16-byte ASCII path; NOTE(review): confirm callers leave that much slack in the output
+ __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
+
+ __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ latin1_output += 6; // We wrote 6 bytes.
+ return consumed;
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf16.cpp
new file mode 100644
index 000000000..2a97eb75f
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf16.cpp
@@ -0,0 +1,293 @@
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 16, usually 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char16_t *&utf16_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+
+ // We first try a few fast paths.
+ // The obvious first test is ASCII, which actually consumes the full 16.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+ __m128i zero = __lsx_vldi(0);
+ if (match_system(big_endian)) {
+ __lsx_vst(__lsx_vilvl_b(zero, in),
+ reinterpret_cast<uint16_t *>(utf16_output), 0);
+ __lsx_vst(__lsx_vilvh_b(zero, in),
+ reinterpret_cast<uint16_t *>(utf16_output), 16);
+ } else {
+ __lsx_vst(__lsx_vilvl_b(in, zero),
+ reinterpret_cast<uint16_t *>(utf16_output), 0);
+ __lsx_vst(__lsx_vilvh_b(in, zero),
+ reinterpret_cast<uint16_t *>(utf16_output), 16);
+ }
+ utf16_output += 16; // We wrote 16 16-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+
+ // 3 byte sequences are the next most common, as seen in CJK, which has long
+ // sequences of these.
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+ // UTF-16 code units.
+ __m128i composed = convert_utf8_3_byte_to_utf16(in);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 4; // We wrote 4 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+
+ // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) {
+ // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
+ // UTF-16 code units.
+ __m128i composed = convert_utf8_2_byte_to_utf16(in);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 8; // We wrote 6 16-bit characters.
+ return 16; // We consumed 12 bytes.
+ }
+
+ /// We do not have a fast path available, or the fast path is unimportant, so
+ /// we fallback.
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ const __m128i zero = __lsx_vldi(0);
+ if (idx < 64) {
+ // SIX (6) input code-code units
+ // Convert to UTF-16
+ __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+ // Store
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 6; // We wrote 6 16-bit characters.
+ return consumed;
+ } else if (idx < 145) {
+ // FOUR (4) input code-code units
+ // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // XXX: depending on the system scalar instructions might be faster.
+ // 1 byte: 00000000 00000000 0ccccccc
+ // 2 byte: 00000000 110bbbbb 10cccccc
+ // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: xx0bbbbb x0cccccc
+ // 3 byte: xxbbbbbb x0cccccc
+ __m128i lowperm = __lsx_vpickev_h(perm, perm);
+ // 1 byte: 00000000 00000000
+ // 2 byte: 00000000 00000000
+ // 3 byte: 00000000 1110aaaa
+ __m128i highperm = __lsx_vpickod_h(perm, perm);
+ // 3 byte: aaaa0000 00000000
+ highperm = __lsx_vslli_h(highperm, 12);
+ // ASCII
+ // 1 byte: 00000000 0ccccccc
+ // 2+byte: 00000000 00cccccc
+ __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f));
+ // 1 byte: 00000000 00000000
+ // 2 byte: xx0bbbbb 00000000
+ // 3 byte: xxbbbbbb 00000000
+ __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/);
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: 0010bbbb bbcccccc
+ // 3 byte: 0010bbbb bbcccccc
+ __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii);
+
+ __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff));
+ // aaaabbbb bbcccccc
+ composed = __lsx_vbitsel_v(highperm, composed, v0fff);
+
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 4; // We wrote 4 16-bit codepoints
+ return consumed;
+ } else if (idx < 209) {
+ // THREE (3) input code-code units
+ if (input_utf8_end_of_code_point_mask == 0x888) {
+ // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+ // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
+ // it is easier when we can assume they are all pairs. This version does
+ // not use the LUT, but 4 byte sequences are less common and the overhead
+ // of the extra memory access is less important than the early branch
+ // overhead in shorter sequences.
+
+ // Swap byte pairs
+ // 10dddddd 10cccccc|10bbbbbb 11110aaa
+ // 10cccccc 10dddddd|11110aaa 10bbbbbb
+ __m128i swap = lsx_swap_bytes(in);
+ // Shift left 2 bits
+ // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+ __m128i shift = __lsx_vslli_b(swap, 2);
+ // Create a magic number containing the low 2 bits of the trail surrogate
+ // and all the corrections needed to create the pair. UTF-8 4b prefix =
+ // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6)
+ // surrogate high = +0x0000|0xD800
+ // surrogate low = +0xDC00|0x0000
+ // -------------------------------
+ // = +0xDC00|0xE7C0
+ __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
+ // Generate unadjusted trail surrogate minus lowest 2 bits
+ // vec(0000FF00) = __lsx_vldi(-1758)
+ // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+ __m128i trail =
+ __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/));
+ // Insert low 2 bits of trail surrogate to magic number for later
+ // 11011100 00000000 11100111 110000cc
+ __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);
+
+ // Generate lead surrogate
+ // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+ // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+ __m128i lead = __lsx_vbitsel_v(
+ __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
+ __lsx_vrepli_h(0x3f /* 0x003f*/));
+
+ // Blend pairs
+ // __lsx_vldi(-1741) => vec(0x0000FFFF)
+ // 000000cc ccdddddd|11110aaa bbbbbb00
+ __m128i blend =
+ __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */);
+
+ // Add magic number to finish the result
+ // 110111CC CCDDDDDD|110110AA BBBBBBCC
+ __m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 6; // We 3 32-bit surrogate pairs.
+ return 12; // We consumed 12 bytes.
+ }
+ // 3 1-4 byte sequences
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // 1 byte: 00000000 00000000 00000000 0ddddddd
+ // 3 byte: 00000000 00000000 110ccccc 10dddddd
+ // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+ // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+ // added to fix issue https://github.com/simdutf/simdutf/issues/514
+ // We only want to write 2 * 16-bit code units when that is actually what we
+ // have. Unfortunately, we cannot trust the input. So it is possible to get
+ // 0xff as an input byte and it should not result in a surrogate pair. We
+ // need to check for that.
+ uint32_t permbuffer[4];
+ __lsx_vst(perm, permbuffer, 0);
+ // Mask the low and middle bytes
+ // 00000000 00000000 00000000 0ddddddd
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
+ // Because the surrogates need more work, the high surrogate is computed
+ // first.
+ __m128i middlehigh = __lsx_vslli_w(perm, 2);
+ // 00000000 00000000 00cccccc 00000000
+ __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */);
+ // Start assembling the sequence. Since the 4th byte is in the same position
+ // as it would be in a surrogate and there is no dependency, shift left
+ // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte:
+ // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+ __m128i ab =
+ __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/);
+ // Top 16 bits contains the high ten bits of the surrogate pair before
+ // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa
+ // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
+ __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000));
+ __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000);
+ // Combine the low 6 or 7 bits by a shift right accumulate
+ // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+ // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
+ // correction
+ __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6));
+ // After this is for surrogates
+ // Blend the low and high surrogates
+ // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+ __m128i mixed =
+ __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/);
+ // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
+ // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte:
+ // 11110aaa bbbbbbcc|000000cc ccdddddd
+ __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF));
+ __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff);
+ // Correct the remaining UTF-8 prefix, surrogate offset, and add the
+ // surrogate prefixes in one magic 16-bit addition. similar magic number but
+ // without the continue byte adjust and halfword swapped UTF-8 4b prefix =
+ // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6)
+ // surrogate high = +0xD800|0x0000
+ // surrogate low = +0x0000|0xDC00
+ // -----------------------------------
+ // = +0xE7C0|0xDC00
+ __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00));
+ // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+ __m128i surrogates = __lsx_vadd_w(masked_pair, magic);
+ // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+ __m128i is_pair = __lsx_vslt_w(perm, zero);
+ // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+ // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+ // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+ __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ selected = lsx_swap_bytes(selected);
+ }
+ // Attempting to shuffle and store would be complex, just scalarize.
+ uint32_t buffer_tmp[4];
+ __lsx_vst(selected, buffer_tmp, 0);
+ // Test for the top bit of the surrogate mask. Remove due to issue 514
+ // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
+ // 0x00800000;
+ for (size_t i = 0; i < 3; i++) {
+ // Surrogate
+ // Used to be if (buffer[i] & SURROGATE_MASK) {
+ // See discussion above.
+ // patch for issue https://github.com/simdutf/simdutf/issues/514
+ if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
+ utf16_output[0] = uint16_t(buffer_tmp[i] >> 16);
+ utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF);
+ utf16_output += 2;
+ } else {
+ utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF);
+ utf16_output++;
+ }
+ }
+ return consumed;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ return 12;
+ }
+}
diff --git a/contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf32.cpp
new file mode 100644
index 000000000..ca200e46c
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_convert_utf8_to_utf32.cpp
@@ -0,0 +1,193 @@
+// Convert a chunk of UTF-8 to UTF-32 using a mask indicating the end of the
+// code points. The table-driven paths look only at the least significant 12
+// bits of the mask; the all-ASCII fast path tests 16 bits and consumes 16 bytes.
+// It returns how many bytes were consumed and advances utf32_out past the output.
+size_t convert_masked_utf8_to_utf32(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char32_t *&utf32_out) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
+ __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xFFF;
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+ // All 16 input bytes are ASCII (every byte ends a code point).
+ // Widen each byte to 32 bits by interleaving with zero twice
+ // (bytes -> 16-bit -> 32-bit) and store 16 code points.
+ __m128i zero = __lsx_vldi(0);
+ __m128i in16low = __lsx_vilvl_b(zero, in);
+ __m128i in16high = __lsx_vilvh_b(zero, in);
+ __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
+ __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
+ __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
+ __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
+
+ __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
+ __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
+ __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);
+
+ utf32_output += 16; // We wrote 16 32-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+ __m128i zero = __lsx_vldi(0);
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
+ // UTF-32 code units. Convert to UTF-16
+ __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+ __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+
+ __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ utf32_output += 4; // We wrote 4 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+ if (input_utf8_end_of_code_point_mask == 0xaaa) {
+ // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
+ // UTF-32 code units. Convert to UTF-16
+ __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);
+
+ __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+ __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+ __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+ utf32_output += 6;
+ return 12; // We consumed 12 bytes.
+ }
+ // Either no fast path or an unimportant fast path.
+
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+
+ if (idx < 64) {
+ // SIX (6) input code points
+ // Convert to UTF-16
+ __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+ __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+ __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+ __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+ utf32_output += 6;
+ return consumed;
+ } else if (idx < 145) {
+ // FOUR (4) input code points
+ // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // Shuffle
+ // 1 byte: 00000000 00000000 0ccccccc
+ // 2 byte: 00000000 110bbbbb 10cccccc
+ // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+ // Split
+ // 00000000 00000000 0ccccccc
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
+ // Note: unmasked
+ // xxxxxxxx aaaaxxxx xxxxxxxx
+ __m128i high =
+ __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
+ // Isolate the second byte of each 32-bit lane.
+ // Its top (prefix) bits will be corrected later by the bitwise select
+ // 00000000 10bbbbbb 00000000
+ __m128i middle =
+ __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits
+ // Combine low and middle with shift right accumulate
+ // 00000000 00xxbbbb bbcccccc
+ __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
+ // Insert top 4 bits from high byte with bitwise select
+ // 00000000 aaaabbbb bbcccccc
+ __m128i composed =
+ __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/));
+ __lsx_vst(composed, utf32_output, 0);
+ utf32_output += 4; // We wrote 4 32-bit characters.
+ return consumed;
+ } else if (idx < 209) {
+ // THREE (3) input code points
+ if (input_utf8_end_of_code_point_mask == 0x888) {
+ // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+ // UTF-32 code units. This uses the same method as the fixed 3 byte
+ // version, reversing and shift left insert. However, there is no need for
+ // a shuffle mask now, just a byte swap within 16-bit lanes and shifts.
+ //
+ // This version does not use the LUT, but 4 byte sequences are less common
+ // and the overhead of the extra memory access is less important than the
+ // early branch overhead in shorter sequences, so it comes last.
+
+ // Swap pairs of bytes
+ // 10dddddd|10cccccc|10bbbbbb|11110aaa
+ // 10cccccc 10dddddd|11110aaa 10bbbbbb
+ __m128i swap = lsx_swap_bytes(in);
+ // Shift left and insert
+ // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+ __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
+ __lsx_vrepli_h(0x3f /*0x003F*/));
+ // Shift insert again
+ // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+ __m128i merge2 =
+ __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
+ __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
+ __lsx_vldi(-2545)); /*0x00000FFF*/
+ // Clear the garbage
+ // 00000000 000aaabb bbbbcccc ccdddddd
+ __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/));
+ // Store
+ __lsx_vst(composed, utf32_output, 0);
+ utf32_output += 3; // We wrote 3 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
+ // due to surrogates no longer being involved.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // 1 byte: 00000000 00000000 00000000 0ddddddd
+ // 2 byte: 00000000 00000000 110ccccc 10dddddd
+ // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+ // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+
+ // Ascii
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
+ __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/));
+ // 00000000 00000000 0000cccc ccdddddd
+ __m128i cd =
+ __lsx_vbitsel_v(__lsx_vsrli_w(middle, 2), ascii, __lsx_vrepli_w(0x3f));
+
+ __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/));
+ __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
+ // Insert twice
+ // 00000000 000aaabb bbbbxxxx xxxxxxxx
+ __m128i corrected_srli2 =
+ __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
+ __m128i ab =
+ __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
+ ab = __lsx_vsrli_w(ab, 4);
+ // 00000000 000aaabb bbbbcccc ccdddddd
+ __m128i composed =
+ __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/));
+ // Store
+ __lsx_vst(composed, utf32_output, 0);
+ utf32_output += 3; // We wrote 3 32-bit characters.
+ return consumed;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ return 12;
+ }
+}
diff --git a/contrib/simdutf/src/lasx/lasx_validate_utf16.cpp b/contrib/simdutf/src/lasx/lasx_validate_utf16.cpp
new file mode 100644
index 000000000..392a124ff
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_validate_utf16.cpp
@@ -0,0 +1,201 @@
+/*
+ In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ are 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the preceding nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be sole low surrogate nor high surrogate
+
+ We're going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+ 0 1 2 3 4 5 6 7 <--- word index
+ [ V | L | H | L | H | V | V | L ]
+ 1 0 0 0 0 1 1 0 - V = valid masks
+ 0 1 0 1 0 0 0 1 - L = low surrogate
+ 0 0 1 0 1 0 0 0 - H high surrogate
+
+
+ 1 0 0 0 0 1 1 0 V = valid masks
+ 0 1 0 1 0 0 0 0 a = L & (H >> 1)
+ 0 0 1 0 1 0 0 0 b = a << 1
+ 1 1 1 1 1 1 1 0 c = V | a | b
+ ^
+ the last bit can be zero, we just consume 7
+ code units and recheck this word in the next iteration
+*/
+
+/* Validates surrogate pairing in UTF-16 input, one pair of simd16 registers
+   per loop iteration. The bitmask algebra used below is derived in the
+   comment block preceding this function.
+
+   Returns:
+   - a pointer to the first code unit that was not validated (a scalar
+     fallback should check the rest);
+   - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *lasx_validate_utf16(const char16_t *input, size_t size) {
+  // Number of code units examined per iteration (two simd16 registers).
+  const size_t block_len = simd16<uint16_t>::ELEMENTS * 2;
+  const char16_t *const stop = input + size;
+
+  // Byte patterns applied to the high byte of every code unit:
+  //   (hi & 0xf8) == 0xd8  ->  any surrogate       (0xD800..0xDFFF)
+  //   (hi & 0xfc) == 0xdc  ->  second half of pair (0xDC00..0xDFFF)
+  const auto any_surrogate_tag = simd8<uint8_t>::splat(0xd8);
+  const auto any_surrogate_mask = simd8<uint8_t>::splat(0xf8);
+  const auto second_half_mask = simd8<uint8_t>::splat(0xfc);
+  const auto second_half_tag = simd8<uint8_t>::splat(0xdc);
+
+  // Strict '<': when exactly one block remains it is left to the caller's
+  // scalar fallback rather than processed here.
+  while (input + block_len < stop) {
+    auto first_reg = simd16<uint16_t>(input);
+    auto second_reg = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+    if (big_endian) {
+      first_reg = first_reg.swap_bytes();
+      second_reg = second_reg.swap_bytes();
+    }
+
+    // Only the high byte of each code unit matters for validation, so both
+    // registers are narrowed into a single vector holding just those bytes.
+    const auto high_bytes = simd8<uint8_t>(__lasx_xvpermi_d(
+        __lasx_xvssrlni_bu_h(second_reg.value, first_reg.value, 8), 0b11011000));
+
+    // One bit per code unit, set where the unit is a surrogate of any kind
+    // (0b1101'1xxx'yyyy'yyyy).
+    const uint32_t surrogate_bits =
+        ((high_bytes & any_surrogate_mask) == any_surrogate_tag).to_bitmask();
+    if (surrogate_bits == 0x0) {
+      // Fast path: no surrogates, so the whole block is valid.
+      input += block_len;
+      continue;
+    }
+
+    // Separate first halves (0b1101'10xx, 0xD800..0xDBFF) from second halves
+    // (0b1101'11xx, 0xDC00..0xDFFF).
+    const uint32_t second_bits =
+        ((high_bytes & second_half_mask) == second_half_tag).to_bitmask();
+    const uint32_t first_bits = surrogate_bits & ~second_bits;
+
+    // A first half is acceptable only when the following unit is a second
+    // half; shifting that mask left by one marks the matching second halves.
+    const uint32_t paired_first = first_bits & (second_bits >> 1);
+    const uint32_t paired_second = paired_first << 1;
+
+    // Valid = not a surrogate, or a member of a correctly ordered pair.
+    const uint32_t valid_bits = ~surrogate_bits | paired_first | paired_second;
+
+    if (valid_bits == 0xffffffff) {
+      // Every code unit is a non-surrogate or part of a proper pair.
+      input += block_len;
+    } else if (valid_bits == 0x7fffffff) {
+      // Only the last code unit is unresolved. Advance by one less than a
+      // full block so the next iteration (or the scalar fallback) re-checks
+      // it: a first half must then be followed by a second half, and a sole
+      // second half is rejected.
+      input += block_len - 1;
+    } else {
+      return nullptr;
+    }
+  }
+
+  // Whatever remains from `input` onward is left for the caller to validate.
+  return input;
+}
+
+template <endianness big_endian>
+const result lasx_validate_utf16_with_errors(const char16_t *input,
+                                             size_t size) {
+  if (simdutf_unlikely(size == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+
+  // Number of code units examined per iteration (two simd16 registers).
+  const size_t block_len = simd16<uint16_t>::ELEMENTS * 2;
+  const char16_t *const first = input;
+  const char16_t *const stop = input + size;
+
+  // Byte patterns applied to the high byte of every code unit:
+  //   (hi & 0xf8) == 0xd8  ->  any surrogate       (0xD800..0xDFFF)
+  //   (hi & 0xfc) == 0xdc  ->  second half of pair (0xDC00..0xDFFF)
+  const auto any_surrogate_tag = simd8<uint8_t>::splat(0xd8);
+  const auto any_surrogate_mask = simd8<uint8_t>::splat(0xf8);
+  const auto second_half_mask = simd8<uint8_t>::splat(0xfc);
+  const auto second_half_tag = simd8<uint8_t>::splat(0xdc);
+
+  // Strict '<': when exactly one block remains it is not processed here and
+  // is reported back as unvalidated through the returned count.
+  while (input + block_len < stop) {
+    auto first_reg = simd16<uint16_t>(input);
+    auto second_reg = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+    if (big_endian) {
+      first_reg = first_reg.swap_bytes();
+      second_reg = second_reg.swap_bytes();
+    }
+
+    // Only the high byte of each code unit matters for validation, so both
+    // registers are narrowed into a single vector holding just those bytes.
+    const auto high_bytes = simd8<uint8_t>(__lasx_xvpermi_d(
+        __lasx_xvssrlni_bu_h(second_reg.value, first_reg.value, 8), 0b11011000));
+
+    // One bit per code unit, set where the unit is a surrogate of any kind
+    // (0b1101'1xxx'yyyy'yyyy).
+    const uint32_t surrogate_bits =
+        ((high_bytes & any_surrogate_mask) == any_surrogate_tag).to_bitmask();
+    if (surrogate_bits == 0x0) {
+      // Fast path: no surrogates, so the whole block is valid.
+      input += block_len;
+      continue;
+    }
+
+    // Separate first halves (0b1101'10xx, 0xD800..0xDBFF) from second halves
+    // (0b1101'11xx, 0xDC00..0xDFFF).
+    const uint32_t second_bits =
+        ((high_bytes & second_half_mask) == second_half_tag).to_bitmask();
+    const uint32_t first_bits = surrogate_bits & ~second_bits;
+
+    // A first half is acceptable only when the following unit is a second
+    // half; shifting that mask left by one marks the matching second halves.
+    const uint32_t paired_first = first_bits & (second_bits >> 1);
+    const uint32_t paired_second = paired_first << 1;
+
+    // Valid = not a surrogate, or a member of a correctly ordered pair.
+    const uint32_t valid_bits = ~surrogate_bits | paired_first | paired_second;
+
+    if (valid_bits == 0xffffffff) {
+      // Every code unit is a non-surrogate or part of a proper pair.
+      input += block_len;
+    } else if (valid_bits == 0x7fffffff) {
+      // Only the last code unit is unresolved. Advance by one less than a
+      // full block so it is examined again as the first unit of the next
+      // block (if any): a first half must then be followed by a second half,
+      // and a sole second half is rejected.
+      input += block_len - 1;
+    } else {
+      // The reported count is the offset of the current block's start, not
+      // of the offending code unit itself.
+      return result(error_code::SURROGATE, input - first);
+    }
+  }
+
+  // Code units from `input` onward have not been validated here.
+  return result(error_code::SUCCESS, input - first);
+}
diff --git a/contrib/simdutf/src/lasx/lasx_validate_utf32le.cpp b/contrib/simdutf/src/lasx/lasx_validate_utf32le.cpp
new file mode 100644
index 000000000..aa8e24b34
--- /dev/null
+++ b/contrib/simdutf/src/lasx/lasx_validate_utf32le.cpp
@@ -0,0 +1,85 @@
+
+const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) {
+  const char32_t *const stop = input + size;
+
+  // Scalar prologue: check code points one at a time until `input` reaches a
+  // 32-byte boundary (unaligned vector loads are slower) or the data runs out.
+  for (; input < stop && ((uint64_t)input & 0x1F) != 0; ++input) {
+    const uint32_t cp = *input;
+    const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
+    if (is_surrogate || cp > 0x10FFFF) {
+      return nullptr;
+    }
+  }
+
+  // Upper bounds the two running maxima must not exceed.
+  const __m256i max_code_point = __lasx_xvldi(-2288); /*0x10ffff*/
+  const __m256i max_biased = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff));
+  // Adding this bias (mod 2^32) lifts exactly the surrogates above max_biased:
+  // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
+  const __m256i bias = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000));
+  __m256i running_max = __lasx_xvldi(0x0);
+  __m256i running_biased_max = __lasx_xvldi(0x0);
+
+  // Accumulate lane-wise unsigned maxima over blocks of 8 code points; the
+  // verdict is deferred until after the loop.
+  for (; input + 8 < stop; input += 8) {
+    const __m256i block = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
+    running_max = __lasx_xvmax_wu(block, running_max);
+    running_biased_max = __lasx_xvmax_wu(__lasx_xvadd_w(block, bias), running_biased_max);
+  }
+
+  // max(x, bound) ^ bound is non-zero in some lane iff x exceeded the bound.
+  const __m256i too_large = __lasx_xvxor_v(__lasx_xvmax_wu(running_max, max_code_point), max_code_point);
+  const __m256i surrogate = __lasx_xvxor_v(__lasx_xvmax_wu(running_biased_max, max_biased), max_biased);
+  if (__lasx_xbnz_v(too_large) || __lasx_xbnz_v(surrogate)) {
+    return nullptr;
+  }
+  return input;
+}
+
+const result lasx_validate_utf32le_with_errors(const char32_t *input,
+                                               size_t size) {
+  const char32_t *const first = input;
+  const char32_t *const stop = input + size;
+
+  // Scalar prologue: advance to a 32-byte boundary one code point at a time,
+  // reporting the exact index of the first offending value.
+  for (; input < stop && ((uint64_t)input & 0x1F) != 0; ++input) {
+    const uint32_t cp = *input;
+    if (cp > 0x10FFFF) {
+      return result(error_code::TOO_LARGE, input - first);
+    }
+    if (cp >= 0xD800 && cp <= 0xDFFF) {
+      return result(error_code::SURROGATE, input - first);
+    }
+  }
+
+  // Bounds for the running maxima, and the bias that (mod 2^32) lifts exactly
+  // the surrogates 0xD800..0xDFFF above 0xfffff7ff.
+  const __m256i max_code_point = __lasx_xvldi(-2288); /*0x10ffff*/
+  const __m256i max_biased = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff));
+  const __m256i bias = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000));
+  __m256i running_max = __lasx_xvldi(0x0);
+  __m256i running_biased_max = __lasx_xvldi(0x0);
+
+  // Blocks of 8 code points. On failure the returned count is the index of the
+  // start of the offending block, not of the exact code point.
+  for (; input + 8 < stop; input += 8) {
+    const __m256i block = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
+    running_max = __lasx_xvmax_wu(block, running_max);
+    running_biased_max = __lasx_xvmax_wu(__lasx_xvadd_w(block, bias), running_biased_max);
+
+    // max(x, bound) ^ bound has a non-zero lane iff x exceeded the bound.
+    const __m256i too_large = __lasx_xvxor_v(__lasx_xvmax_wu(running_max, max_code_point), max_code_point);
+    if (__lasx_xbnz_v(too_large)) {
+      return result(error_code::TOO_LARGE, input - first);
+    }
+    const __m256i surrogate = __lasx_xvxor_v(__lasx_xvmax_wu(running_biased_max, max_biased), max_biased);
+    if (__lasx_xbnz_v(surrogate)) {
+      return result(error_code::SURROGATE, input - first);
+    }
+  }
+
+  return result(error_code::SUCCESS, input - first);
+}
diff --git a/contrib/simdutf/src/lsx/implementation.cpp b/contrib/simdutf/src/lsx/implementation.cpp
new file mode 100644
index 000000000..b0055e642
--- /dev/null
+++ b/contrib/simdutf/src/lsx/implementation.cpp
@@ -0,0 +1,1178 @@
+#include "simdutf/lsx/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_LSX_H
+ #error "lsx.h must be included"
+#endif
+using namespace simd;
+
+// Lookup table: vmskltz/vmskgez/vmsknz mask -> pack_1_2_utf8_bytes index.
+/* The value read is an index for
+ * simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes.
+ * The table interleaves the two nibbles of its own index: index bits 0..3
+ * land on value bits 0, 2, 4, 6 and index bits 4..7 land on value bits
+ * 1, 3, 5, 7 (for example 0x02 -> 4, 0x10 -> 2, 0x80 -> 128). */
+const uint8_t lsx_1_2_utf8_bytes_mask[] = {
+ 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84,
+ 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83,
+ 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88,
+ 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79,
+ 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100,
+ 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99,
+ 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104,
+ 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63,
+ 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
+ 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
+ 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
+ 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
+ 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
+ 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
+ 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
+ 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+ 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
+ 255};
+
+// Swaps the two bytes inside every 16-bit lane of `vec`.
+// The immediate 0b10110001 selects bytes 1, 0, 3, 2 within each group of four
+// bytes. Equivalent formulations are a vshuf.b with the index vector
+// {1, 0, 3, 2, 5, 4, ...} or (vec << 8) | (vec >> 8) on halfwords.
+simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
+  const __m128i swapped = __lsx_vshuf4i_b(vec, 0b10110001);
+  return swapped;
+}
+
+// Forwards to simd8x64<uint8_t>::is_ascii() for the given 64-byte block.
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  const bool all_ascii = input.is_ascii();
+  return all_ascii;
+}
+
+// Marks the byte positions that must hold a UTF-8 continuation byte, given the
+// input shifted by one, two and three bytes (prev1, prev2, prev3).
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  // A lead byte >= 0xC0 announces a 2nd byte, >= 0xE0 a 3rd, >= 0xF0 a 4th.
+  simd8<bool> after_lead2 = prev1 >= uint8_t(0xC0u);
+  simd8<bool> after_lead3 = prev2 >= uint8_t(0xE0u);
+  simd8<bool> after_lead4 = prev3 >= uint8_t(0xF0u);
+  // XOR is used rather than OR because the caller combines with XOR as well
+  // and XOR is commutative. Only positions covered by zero or one lead byte
+  // need to be reported correctly: several lead bytes covering one position
+  // mean two multibyte characters overlap, and then at least one lead byte
+  // belongs to exactly one other character, where the error is caught.
+  return after_lead2 ^ after_lead3 ^ after_lead4;
+}
+
+// Marks the positions that must be the 3rd or 4th byte of a UTF-8 sequence,
+// given the input shifted by two (prev2) and three (prev3) bytes.
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                         const simd8<uint8_t> prev3) {
+  simd8<bool> after_lead3 = prev2 >= uint8_t(0xE0u); // 3-byte lead or higher
+  simd8<bool> after_lead4 = prev3 >= uint8_t(0xF0u); // 4-byte lead or higher
+  return after_lead3 ^ after_lead4;
+}
+
+/* common functions for utf8 conversions
+ *
+ * convert_utf8_3_byte_to_utf16: builds UTF-16 code units from 3-byte UTF-8
+ * sequences stored back to back in `in`. The shuffle below reads input bytes
+ * 0..11 (four sequences), so the four low 16-bit lanes of the result carry
+ * the converted characters. Assumes `in` holds only 3-byte sequences; no
+ * validation happens here -- TODO confirm against callers.
+ */
+simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
+ // After the shuffle the low half contains 10bbbbbb|10cccccc per character
+ // and the high half contains the lead byte twice: 1110aaaa|1110aaaa
+ const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
+ const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};
+
+ __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
+ // Move the high half down and shift each byte: 1110aaaa => aaaa0000
+ __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
+ // Keep cccccc from perm, take the rest from perm >> 2:
+ // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
+ __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
+ perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
+ // Low 12 bits from composed, top 4 bits from perm_high:
+ // 0010bbbb bbcccccc => aaaabbbb bbcccccc
+ composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);
+
+ return composed;
+}
+/* Builds one UTF-16 code unit per 16-bit lane from 2-byte UTF-8 sequences
+ * stored back to back in `in` (lead byte in the low byte of each lane, as
+ * the bit patterns below show). No validation happens here. */
+simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
+ // Clear the UTF-8 marker bits: 10bbbbbb 110aaaaa => 00bbbbbb 000aaaaa
+ __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
+ // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
+ composed = __lsx_vbitsel_v(
+ __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
+ __lsx_vsrli_h(composed, 8), /* bbbbbb moved to the low byte (>> 8) */
+ __lsx_vrepli_h(0x3f)); /* 0x003f: low 6 bits come from bbbbbb */
+ return composed;
+}
+/* Converts a run of 1- and 2-byte UTF-8 characters in `in` to UTF-16.
+ * `shufutf8_idx` selects the row of simdutf::tables::utf8_to_utf16::shufutf8
+ * that is used as the byte shuffle; how that index is derived is decided by
+ * the caller and is not visible here. */
+simdutf_really_inline __m128i
+convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
+ // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
+ // This is a relatively easy scenario:
+ // we process SIX (6) input code units. The max length in bytes of six
+ // code units spanning between 1 and 2 bytes each is 12 bytes.
+ __m128i sh =
+ __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
+ 0);
+ // Shuffle so that every character occupies one 16-bit lane:
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 110aaaaa 10bbbbbb
+ __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000000 00bbbbbb
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
+ // 1 byte: 00000000 00000000
+ // 2 byte: 00000aaa aa000000
+ const __m128i v1f00 = __lsx_vldi(-2785); // -2785 as a 13-bit immediate is 0x151f; presumably yields 0x1f00 per lane, as the name says
+ __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
+ // Combine both parts with an add; their bit fields do not overlap
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 00000aaa aabbbbbb
+ composed = __lsx_vadd_h(ascii, composed);
+ return composed;
+}
+
+#include "lsx/lsx_validate_utf16.cpp"
+#include "lsx/lsx_validate_utf32le.cpp"
+
+#include "lsx/lsx_convert_latin1_to_utf8.cpp"
+#include "lsx/lsx_convert_latin1_to_utf16.cpp"
+#include "lsx/lsx_convert_latin1_to_utf32.cpp"
+
+#include "lsx/lsx_convert_utf8_to_utf16.cpp"
+#include "lsx/lsx_convert_utf8_to_utf32.cpp"
+#include "lsx/lsx_convert_utf8_to_latin1.cpp"
+
+#include "lsx/lsx_convert_utf16_to_latin1.cpp"
+#include "lsx/lsx_convert_utf16_to_utf8.cpp"
+#include "lsx/lsx_convert_utf16_to_utf32.cpp"
+
+#include "lsx/lsx_convert_utf32_to_latin1.cpp"
+#include "lsx/lsx_convert_utf32_to_utf8.cpp"
+#include "lsx/lsx_convert_utf32_to_utf16.cpp"
+#include "lsx/lsx_base64.cpp"
+
+} // namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+
+// transcoding from UTF-8 to Latin 1
+#include "generic/utf8_to_latin1/utf8_to_latin1.h"
+#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+
+#include "scalar/utf32_to_utf16/valid_utf32_to_utf16.h"
+#include "scalar/utf32_to_utf16/utf32_to_utf16.h"
+
+// other functions
+#include "generic/utf8.h"
+#include "generic/utf16.h"
+#include "scalar/latin1.h"
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+// Returns a bit set of encoding_type values that `input` could be.
+// A byte-order mark, when present, decides the answer on its own. Otherwise
+// the input is validated as UTF-8, as UTF-16LE (even lengths only) and as
+// UTF-32 (lengths divisible by 4), and each passing encoding is OR-ed in.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  // todo: reimplement as a one-pass algorithm.
+  const auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if (bom_encoding != encoding_type::unspecified) {
+    return bom_encoding;
+  }
+
+  int candidates = 0;
+  if (validate_utf8(input, length)) {
+    candidates |= encoding_type::UTF8;
+  }
+  const bool fits_utf16 = (length % 2) == 0;
+  if (fits_utf16 &&
+      validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                       length / 2)) {
+    candidates |= encoding_type::UTF16_LE;
+  }
+  const bool fits_utf32 = (length % 4) == 0;
+  if (fits_utf32 &&
+      validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+    candidates |= encoding_type::UTF32_LE;
+  }
+  return candidates;
+}
+
+// Validates `buf[0..len)` as UTF-8 by delegating to the generic validator
+// instantiated in the lsx namespace.
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  const bool is_valid = lsx::utf8_validation::generic_validate_utf8(buf, len);
+  return is_valid;
+}
+
+// Validates `buf[0..len)` as UTF-8 and returns the generic validator's result
+// (error code plus count) unchanged.
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *buf, size_t len) const noexcept {
+  result outcome =
+      lsx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+  return outcome;
+}
+
+// Validates `buf[0..len)` as ASCII by delegating to the generic validator
+// instantiated in the lsx namespace.
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  const bool is_valid = lsx::utf8_validation::generic_validate_ascii(buf, len);
+  return is_valid;
+}
+
+// Validates `buf[0..len)` as ASCII and returns the generic validator's result
+// (error code plus count) unchanged.
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *buf, size_t len) const noexcept {
+  result outcome =
+      lsx::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+  return outcome;
+}
+
+// Validates little-endian UTF-16. The LSX kernel checks the bulk of the input
+// and returns where it stopped; the scalar validator checks the remainder.
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  // Empty input is valid; returning early also protects the kernel from a
+  // null pointer.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char16_t *rest = lsx_validate_utf16<endianness::LITTLE>(buf, len);
+  if (rest == nullptr) {
+    // a null pointer from the kernel means the input was rejected
+    return false;
+  }
+  const size_t remaining = len - (rest - buf);
+  return scalar::utf16::validate<endianness::LITTLE>(rest, remaining);
+}
+
+// Validates big-endian UTF-16. The LSX kernel checks the bulk of the input
+// and returns where it stopped; the scalar validator checks the remainder.
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  // Empty input is valid; returning early also protects the kernel from a
+  // null pointer.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char16_t *rest = lsx_validate_utf16<endianness::BIG>(buf, len);
+  if (rest == nullptr) {
+    // a null pointer from the kernel means the input was rejected
+    return false;
+  }
+  const size_t remaining = len - (rest - buf);
+  return scalar::utf16::validate<endianness::BIG>(rest, remaining);
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = lsx_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = lsx_validate_utf16_with_errors<endianness::BIG>(buf, len);
+ if (res.count != len) {
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+// Validates UTF-32. The LSX kernel checks the bulk of the input and returns
+// where it stopped; the scalar validator checks the remainder.
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  // Empty input is valid; returning early also protects the kernel from a
+  // null pointer.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char32_t *rest = lsx_validate_utf32le(buf, len);
+  if (rest == nullptr) {
+    // a null pointer from the kernel means the input was rejected
+    return false;
+  }
+  const size_t remaining = len - (rest - buf);
+  return scalar::utf32::validate(rest, remaining);
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = lsx_validate_utf32le_with_errors(buf, len);
+ if (res.count != len) {
+ result scalar_res =
+ scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count);
+ } else {
+ return res;
+ }
+}
+
+// Converts Latin-1 to UTF-8. The LSX kernel converts as much as it can and
+// the scalar routine finishes the rest. Returns the number of bytes written
+// to `utf8_output`.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char *, char *> simd =
+      lsx_convert_latin1_to_utf8(buf, len, utf8_output);
+  const char *src = simd.first; // first input byte not yet converted
+  char *dst = simd.second;      // next free output position
+  size_t written = dst - utf8_output;
+
+  if (src != buf + len) {
+    written += scalar::latin1_to_utf8::convert(src, len - (src - buf), dst);
+  }
+  return written;
+}
+
+// Converts Latin-1 to little-endian UTF-16. The LSX kernel converts as much
+// as it can and the scalar routine finishes the rest. Returns the number of
+// 16-bit code units written to `utf16_output`.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> simd =
+      lsx_convert_latin1_to_utf16le(buf, len, utf16_output);
+  const char *src = simd.first; // first input byte not yet converted
+  char16_t *dst = simd.second;  // next free output position
+  size_t written = dst - utf16_output;
+  if (src != buf + len) {
+    written += scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+        src, len - (src - buf), dst);
+  }
+  return written;
+}
+
+// Converts Latin-1 to big-endian UTF-16. The LSX kernel converts as much as
+// it can and the scalar routine finishes the rest. Returns the number of
+// 16-bit code units written to `utf16_output`.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> simd =
+      lsx_convert_latin1_to_utf16be(buf, len, utf16_output);
+  const char *src = simd.first; // first input byte not yet converted
+  char16_t *dst = simd.second;  // next free output position
+  size_t written = dst - utf16_output;
+  if (src != buf + len) {
+    written += scalar::latin1_to_utf16::convert<endianness::BIG>(
+        src, len - (src - buf), dst);
+  }
+  return written;
+}
+
+// Converts Latin-1 to UTF-32. The LSX kernel converts as much as it can and
+// the scalar routine finishes the rest. Returns the number of 32-bit code
+// units written to `utf32_output`.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> simd =
+      lsx_convert_latin1_to_utf32(buf, len, utf32_output);
+  const char *src = simd.first; // first input byte not yet converted
+  char32_t *dst = simd.second;  // next free output position
+  size_t written = dst - utf32_output;
+  if (src != buf + len) {
+    written += scalar::latin1_to_utf32::convert(src, len - (src - buf), dst);
+  }
+  return written;
+}
+
+// Transcodes UTF-8 to Latin-1 with the validating transcoder and returns the
+// value it reports.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, latin1_output);
+  return written;
+}
+
+// Transcodes UTF-8 to Latin-1 with the validating transcoder and returns its
+// result (error code plus count) unchanged.
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder transcoder;
+  result outcome = transcoder.convert_with_errors(buf, len, latin1_output);
+  return outcome;
+}
+
+// Transcodes UTF-8 that the caller vouches for to Latin-1 by delegating to
+// lsx::utf8_to_latin1::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written =
+      lsx::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+  return written;
+}
+
+// Transcodes UTF-8 to little-endian UTF-16 with the validating transcoder and
+// returns the value it reports.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
+  return written;
+}
+
+// Transcodes UTF-8 to big-endian UTF-16 with the validating transcoder and
+// returns the value it reports.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::BIG>(buf, len, utf16_output);
+  return written;
+}
+
+// Transcodes UTF-8 to little-endian UTF-16 with the validating transcoder and
+// returns its result (error code plus count) unchanged.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  result outcome = transcoder.convert_with_errors<endianness::LITTLE>(
+      buf, len, utf16_output);
+  return outcome;
+}
+
+// Transcodes UTF-8 to big-endian UTF-16 with the validating transcoder and
+// returns its result (error code plus count) unchanged.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  result outcome =
+      transcoder.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+  return outcome;
+}
+
+// Transcodes UTF-8 that the caller vouches for to little-endian UTF-16 by
+// delegating to utf8_to_utf16::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+                                                       utf16_output);
+  return written;
+}
+
+// Transcodes UTF-8 that the caller vouches for to big-endian UTF-16 by
+// delegating to utf8_to_utf16::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+  return written;
+}
+
+// Transcodes UTF-8 to UTF-32 with the validating transcoder and returns the
+// value it reports.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, utf32_output);
+  return written;
+}
+
+// Transcodes UTF-8 to UTF-32 with the validating transcoder and returns its
+// result (error code plus count) unchanged.
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder transcoder;
+  result outcome = transcoder.convert_with_errors(buf, len, utf32_output);
+  return outcome;
+}
+
+// Transcodes UTF-8 that the caller vouches for to UTF-32 by delegating to
+// utf8_to_utf32::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  const size_t written =
+      utf8_to_utf32::convert_valid(input, size, utf32_output);
+  return written;
+}
+
+// Converts little-endian UTF-16 to Latin-1. Returns the number of bytes
+// written, or 0 when the LSX kernel returns a null input pointer or the
+// scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      lsx_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+  const char16_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - latin1_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to Latin-1. Returns the number of bytes written,
+// or 0 when the LSX kernel returns a null input pointer or the scalar routine
+// reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      lsx_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  const char16_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - latin1_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::BIG>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts little-endian UTF-16 to Latin-1 and reports errors.
+// On error the count is an input position; on success it is the number of
+// bytes written to `latin1_output`.
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> simd =
+      lsx_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+          buf, len, latin1_output);
+  result status = simd.first;
+  char *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+            buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 8-bit code units written.
+  status.count = out - latin1_output;
+  return status;
+}
+
+// Converts big-endian UTF-16 to Latin-1 and reports errors.
+// On error the count is an input position; on success it is the number of
+// bytes written to `latin1_output`.
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> simd =
+      lsx_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+                                                               latin1_output);
+  result status = simd.first;
+  char *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+            buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 8-bit code units written.
+  status.count = out - latin1_output;
+  return status;
+}
+
+// "Valid input" variant for big-endian UTF-16 to Latin-1. It currently just
+// forwards to the validating conversion.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement a custom function.
+  return this->convert_utf16be_to_latin1(buf, len, latin1_output);
+}
+
+// "Valid input" variant for little-endian UTF-16 to Latin-1. It currently
+// just forwards to the validating conversion.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement a custom function.
+  return this->convert_utf16le_to_latin1(buf, len, latin1_output);
+}
+
+// Converts little-endian UTF-16 to UTF-8. Returns the number of bytes
+// written, or 0 when the LSX kernel returns a null input pointer or the
+// scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      lsx_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  const char16_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf8_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to UTF-8. Returns the number of bytes written,
+// or 0 when the LSX kernel returns a null input pointer or the scalar routine
+// reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      lsx_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  const char16_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf8_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(
+      src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts little-endian UTF-16 to UTF-8 and reports errors.
+// On error the count is an input position; on success it is the number of
+// bytes written to `utf8_output`.
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // The kernel's count is always a position in the input buffer, never the
+  // number of code units written, even when it finished.
+  std::pair<result, char *> simd =
+      lsx_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
+                                                                utf8_output);
+  result status = simd.first;
+  char *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 8-bit code units written.
+  status.count = out - utf8_output;
+  return status;
+}
+
+// Converts big-endian UTF-16 to UTF-8 and reports errors.
+// On error the count is an input position; on success it is the number of
+// bytes written to `utf8_output`.
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // The kernel's count is always a position in the input buffer, never the
+  // number of code units written, even when it finished.
+  std::pair<result, char *> simd =
+      lsx_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
+                                                             utf8_output);
+  result status = simd.first;
+  char *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+        buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 8-bit code units written.
+  status.count = out - utf8_output;
+  return status;
+}
+
+// "Valid input" variant for little-endian UTF-16 to UTF-8. It forwards to the
+// validating conversion.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return this->convert_utf16le_to_utf8(buf, len, utf8_output);
+}
+
+// "Valid input" variant for big-endian UTF-16 to UTF-8. It forwards to the
+// validating conversion.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return this->convert_utf16be_to_utf8(buf, len, utf8_output);
+}
+
+// Converts UTF-32 to UTF-8. Returns the number of bytes written, or 0 for
+// empty input, when the LSX kernel returns a null input pointer, or when the
+// scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
+  std::pair<const char32_t *, char *> simd =
+      lsx_convert_utf32_to_utf8(buf, len, utf8_output);
+  const char32_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf8_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf8::convert(src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts UTF-32 to UTF-8 and reports errors.
+// Unlike the UTF-16 variants, an error from the kernel is not returned at
+// once: whenever the kernel's count differs from `len`, the scalar routine is
+// run from that position and decides the outcome of the remainder.
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  // The kernel's count is always a position in the input buffer, never the
+  // number of code units written, even when it finished.
+  std::pair<result, char *> simd =
+      lsx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  result status = simd.first;
+  char *out = simd.second;
+  if (status.count != len) {
+    result tail = scalar::utf32_to_utf8::convert_with_errors(
+        buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 8-bit code units written.
+  status.count = out - utf8_output;
+  return status;
+}
+
+// Converts little-endian UTF-16 to UTF-32. Returns the number of 32-bit code
+// units written, or 0 when the LSX kernel returns a null input pointer or
+// the scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> simd =
+      lsx_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  const char16_t *src = simd.first; // first code unit not yet converted
+  char32_t *dst = simd.second;      // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts big-endian UTF-16 to UTF-32. Returns the number of 32-bit code
+// units written, or 0 when the LSX kernel returns a null input pointer or
+// the scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> simd =
+      lsx_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  const char16_t *src = simd.first; // first code unit not yet converted
+  char32_t *dst = simd.second;      // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf32_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
+      src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts little-endian UTF-16 to UTF-32 and reports errors.
+// On error the count is an input position; on success it is the number of
+// 32-bit code units written to `utf32_output`.
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // The kernel's count is always a position in the input buffer, never the
+  // number of code units written, even when it finished.
+  std::pair<result, char32_t *> simd =
+      lsx_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf32_output);
+  result status = simd.first;
+  char32_t *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 32-bit code units written.
+  status.count = out - utf32_output;
+  return status;
+}
+
+// Converts big-endian UTF-16 to UTF-32 and reports errors.
+// On error the count is an input position; on success it is the number of
+// 32-bit code units written to `utf32_output`.
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // The kernel's count is always a position in the input buffer, never the
+  // number of code units written, even when it finished.
+  std::pair<result, char32_t *> simd =
+      lsx_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
+                                                              utf32_output);
+  result status = simd.first;
+  char32_t *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+        buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 32-bit code units written.
+  status.count = out - utf32_output;
+  return status;
+}
+
+// Converts UTF-32 to Latin-1. Returns the number of bytes written, or 0 when
+// the LSX kernel returns a null input pointer or the scalar routine reports 0
+// for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char32_t *, char *> simd =
+      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
+  const char32_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - latin1_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+
+  const size_t tail_written =
+      scalar::utf32_to_latin1::convert(src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts UTF-32 to Latin-1 and reports errors.
+// On error the count is an input position; on success it is the number of
+// bytes written to `latin1_output`.
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> simd =
+      lsx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+  result status = simd.first;
+  char *out = simd.second;
+  // The kernel's own scalar fallback has already located the error, so its
+  // count can be returned as is.
+  if (status.error) {
+    return status;
+  }
+  // No error so far, but the kernel may have stopped before the end.
+  if (status.count != len) {
+    result tail = scalar::utf32_to_latin1::convert_with_errors(
+        buf + status.count, len - status.count, out);
+    if (tail.error) {
+      tail.count += status.count; // rebase onto the start of buf
+      return tail;
+    }
+    out += tail.count;
+  }
+  // Report the number of 8-bit code units written.
+  status.count = out - latin1_output;
+  return status;
+}
+
+// Converts UTF-32 that the caller vouches for to Latin-1. Returns the number
+// of bytes written, or 0 when the LSX kernel returns a null input pointer.
+// The remainder goes through the scalar convert_valid routine, whose return
+// value is added without a zero check.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char32_t *, char *> simd =
+      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
+  const char32_t *src = simd.first; // first code unit not yet converted
+  char *dst = simd.second;          // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  size_t written = dst - latin1_output;
+
+  if (src != buf + len) {
+    written +=
+        scalar::utf32_to_latin1::convert_valid(src, len - (src - buf), dst);
+  }
+  return written;
+}
+
+// "Valid input" variant for UTF-32 to UTF-8. It currently just forwards to
+// the validating conversion.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  // optimization opportunity: implement a custom function.
+  return this->convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+// Converts UTF-32 to little-endian UTF-16. Returns the number of 16-bit code
+// units written, or 0 when the LSX kernel returns a null input pointer or
+// the scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> simd =
+      lsx_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  const char32_t *src = simd.first; // first code unit not yet converted
+  char16_t *dst = simd.second;      // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf16_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+// Converts UTF-32 to big-endian UTF-16. Returns the number of 16-bit code
+// units written, or 0 when the LSX kernel returns a null input pointer or
+// the scalar routine reports 0 for the remainder.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> simd =
+      lsx_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  const char32_t *src = simd.first; // first code unit not yet converted
+  char16_t *dst = simd.second;      // next free output position
+  if (src == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = dst - utf16_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(
+      src, len - (src - buf), dst);
+  if (tail_written == 0) {
+    return 0;
+  }
+  return simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // The kernel reports in .first.count how far it got in the INPUT (in
+  // char32_t units), even when it finished; .second is the output cursor.
+  std::pair<result, char16_t *> simd =
+      lsx_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf16_output);
+  const size_t consumed = simd.first.count;
+  char16_t *out = simd.second;
+  if (consumed != len) {
+    result tail =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + consumed, len - consumed, out);
+    if (tail.error) {
+      // Rebase the scalar error offset onto the start of the whole input.
+      tail.count += consumed;
+      return tail;
+    }
+    out += tail.count;
+  }
+  // On this path count becomes the number of 16-bit code units written.
+  result done = simd.first;
+  done.count = out - utf16_output;
+  return done;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // The kernel reports in .first.count how far it got in the INPUT (in
+  // char32_t units), even when it finished; .second is the output cursor.
+  std::pair<result, char16_t *> simd =
+      lsx_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+                                                              utf16_output);
+  const size_t consumed = simd.first.count;
+  char16_t *out = simd.second;
+  if (consumed != len) {
+    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+        buf + consumed, len - consumed, out);
+    if (tail.error) {
+      // Rebase the scalar error offset onto the start of the whole input.
+      tail.count += consumed;
+      return tail;
+    }
+    out += tail.count;
+  }
+  // On this path count becomes the number of 16-bit code units written.
+  result done = simd.first;
+  done.count = out - utf16_output;
+  return done;
+}
+
+// "Valid input" UTF-32 -> UTF-16LE entry point. It has no kernel of its own:
+// the call is forwarded to convert_utf32_to_utf16le and that function's
+// return value is passed through unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
+
+// "Valid input" UTF-32 -> UTF-16BE entry point. It has no kernel of its own:
+// the call is forwarded to convert_utf32_to_utf16be and that function's
+// return value is passed through unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
+
+// "Valid input" UTF-16LE -> UTF-32 entry point. Forwards to
+// convert_utf16le_to_utf32 and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return convert_utf16le_to_utf32(buf, len, utf32_output);
+}
+
+// "Valid input" UTF-16BE -> UTF-32 entry point. Forwards to
+// convert_utf16be_to_utf32 and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return convert_utf16be_to_utf32(buf, len, utf32_output);
+}
+
+// Forwards to utf16::change_endianness_utf16 with the same input pointer,
+// length and output pointer; nothing is returned.
+void implementation::change_endianness_utf16(const char16_t *input,
+ size_t length,
+ char16_t *output) const noexcept {
+ utf16::change_endianness_utf16(input, length, output);
+}
+
+// Returns utf16::count_code_points<endianness::LITTLE>(input, length).
+simdutf_warn_unused size_t implementation::count_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+// Returns utf16::count_code_points<endianness::BIG>(input, length).
+simdutf_warn_unused size_t implementation::count_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::count_code_points<endianness::BIG>(input, length);
+}
+
+// Returns utf8::count_code_points(input, length).
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length);
+}
+
+// Latin-1 length needed for a UTF-8 buffer: computed as count_utf8(buf, len),
+// i.e. whatever count_utf8 reports for the same bytes.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+ const char *buf, size_t len) const noexcept {
+ return count_utf8(buf, len);
+}
+
+// Latin-1 length for `length` UTF-16 code units: returned unchanged, so the
+// result is a 1:1 mapping of the input length.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+ return length;
+}
+
+// Latin-1 length for `length` UTF-32 code units: returned unchanged, so the
+// result is a 1:1 mapping of the input length.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+ return length;
+}
+
+// Computes how many UTF-8 bytes are needed to encode `length` Latin-1 bytes.
+// Each input byte costs one output byte, plus one more when its top bit is
+// set. While more than 16 bytes remain they are counted 16 at a time with
+// LSX; the rest is delegated to scalar::latin1::utf8_length_from_latin1.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  const uint8_t *const data_end = data + length;
+  uint64_t total = 0;
+  // Test the remaining distance instead of forming `data + 16`: when fewer
+  // than 16 bytes remain that pointer would lie past the end of the buffer,
+  // which is undefined behaviour even if it is never dereferenced.
+  while (data_end - data > 16) {
+    const __m128i input_vec = __lsx_vld(data, 0);
+    // vmskltz_b gathers the sign bit of every byte; its popcount is the
+    // number of bytes >= 0x80 in this vector.
+    const uint64_t two_bytes =
+        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
+    total += 16 + two_bytes;
+    data += 16;
+  }
+  return total + scalar::latin1::utf8_length_from_latin1(
+                     reinterpret_cast<const char *>(data), data_end - data);
+}
+
+// Returns utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length).
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+// Returns utf16::utf8_length_from_utf16<endianness::BIG>(input, length).
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+// UTF-16 length for `length` Latin-1 bytes: returned unchanged (1:1).
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+ return length;
+}
+
+// UTF-32 length for `length` Latin-1 bytes: returned unchanged (1:1).
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+ return length;
+}
+
+// Returns utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length).
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+// Returns utf16::utf32_length_from_utf16<endianness::BIG>(input, length).
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+// Returns utf8::utf16_length_from_utf8(input, length).
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::utf16_length_from_utf8(input, length);
+}
+
+// Computes the number of UTF-8 bytes needed for `length` UTF-32 code units.
+// Four code units are classified per vector iteration; the remaining (< 4)
+// units are handled by scalar::utf32::utf8_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const __m128i v_80 = __lsx_vrepli_w(0x80); /*0x00000080*/
+ const __m128i v_800 = __lsx_vldi(-3832); /*0x00000800*/
+ const __m128i v_10000 = __lsx_vldi(-3583); /*0x00010000*/
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 4 <= length; pos += 4) {
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+ // Per-lane class masks: value < 0x80, value in [0x80, 0x800) and value in
+ // [0x800, 0x10000). The XORs turn the nested "less than" masks into
+ // disjoint ranges.
+ // NOTE(review): __lsx_vslt_w appears to be a signed compare, so lanes
+ // >= 0x80000000 would fall in the first class - confirm that callers only
+ // pass valid code points.
+ const __m128i ascii_bytes_bytemask = __lsx_vslt_w(in, v_80);
+ const __m128i one_two_bytes_bytemask = __lsx_vslt_w(in, v_800);
+ const __m128i two_bytes_bytemask =
+ __lsx_vxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask);
+ const __m128i three_bytes_bytemask =
+ __lsx_vxor_v(__lsx_vslt_w(in, v_10000), one_two_bytes_bytemask);
+
+ // Number of lanes (0..4) that belong to each class.
+ const uint32_t ascii_bytes_count = __lsx_vpickve2gr_bu(
+ __lsx_vpcnt_b(__lsx_vmskltz_w(ascii_bytes_bytemask)), 0);
+ const uint32_t two_bytes_count = __lsx_vpickve2gr_bu(
+ __lsx_vpcnt_b(__lsx_vmskltz_w(two_bytes_bytemask)), 0);
+ const uint32_t three_bytes_count = __lsx_vpickve2gr_bu(
+ __lsx_vpcnt_b(__lsx_vmskltz_w(three_bytes_bytemask)), 0);
+
+ // Start from 4 bytes for each of the 4 lanes (16) and subtract what the
+ // shorter encodings save: 3 per 1-byte lane, 2 per 2-byte lane, 1 per
+ // 3-byte lane.
+ count +=
+ 16 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count;
+ }
+ return count +
+ scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+// Computes the number of UTF-16 code units needed for `length` UTF-32 code
+// units. Four code units are processed per vector iteration; the remaining
+// (< 4) units are handled by scalar::utf32::utf16_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ const __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/
+ size_t pos = 0;
+ size_t count = 0;
+ for (; pos + 4 <= length; pos += 4) {
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+ // Lanes with 0xffff < value (unsigned compare) need a second code unit.
+ const __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in);
+ size_t surrogate_count = __lsx_vpickve2gr_bu(
+ __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0);
+ // One unit per lane, plus one extra for every lane flagged above.
+ count += 4 + surrogate_count;
+ }
+ return count +
+ scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+// UTF-32 length for a UTF-8 buffer: returns
+// utf8::count_code_points(input, length).
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length);
+}
+
+// 8-bit input overload. Forwards to
+// scalar::base64::maximal_binary_length_from_base64 and returns its result.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// Decodes base64 text held in a `char` buffer. The base64_url bit of
+// `options` selects which compress_decode_base64 instantiation runs. That
+// helper returns a full_result, which is converted to `result` on return.
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+// Same dispatch as base64_to_binary for `char` input, but the full_result
+// produced by compress_decode_base64 is returned as-is.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+// 16-bit input overload. Forwards to
+// scalar::base64::maximal_binary_length_from_base64 and returns its result.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// Decodes base64 text held in a `char16_t` buffer. The base64_url bit of
+// `options` selects which compress_decode_base64 instantiation runs. That
+// helper returns a full_result, which is converted to `result` on return.
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+// Same dispatch as base64_to_binary for `char16_t` input, but the full_result
+// produced by compress_decode_base64 is returned as-is.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ return (options & base64_url)
+ ? compress_decode_base64<true>(output, input, length, options,
+ last_chunk_options)
+ : compress_decode_base64<false>(output, input, length, options,
+ last_chunk_options);
+}
+
+// Forwards to scalar::base64::base64_length_from_binary(length, options) and
+// returns its result.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options);
+}
+
+// Encodes `length` bytes from `input` as base64 into `output`. The base64_url
+// bit of `options` picks the encode_base64 instantiation; its return value is
+// passed through unchanged.
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  return (options & base64_url)
+             ? encode_base64<true>(output, input, length, options)
+             : encode_base64<false>(output, input, length, options);
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/lsx/end.h"
diff --git a/contrib/simdutf/src/lsx/lsx_base64.cpp b/contrib/simdutf/src/lsx/lsx_base64.cpp
new file mode 100644
index 000000000..614cd850a
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_base64.cpp
@@ -0,0 +1,580 @@
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+// Encodes `srclen` bytes from `src` as base64 into `dst`, using the URL
+// alphabet when `isbase64url` is true. Two vector loops consume 48 and then
+// 12 input bytes per iteration (16 output characters per 12 input bytes);
+// scalar::base64::tail_encode_base64 finishes the remainder with `options`.
+// Returns i / 3 * 4 for the vectorized part plus the tail routine's return.
+template <bool isbase64url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+ base64_options options) {
+ // credit: Wojciech Muła
+ // SSE (lookup: pshufb improved unrolled)
+ const uint8_t *input = (const uint8_t *)src;
+ static const char *lookup_tbl =
+ isbase64url
+ ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+ : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+ uint8_t *out = (uint8_t *)dst;
+
+ v16u8 shuf;
+ __m128i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
+ base64_tbl2, base64_tbl3;
+ // The constants are only set up when a vector iteration can run. Both loops
+ // below require srclen >= 16, so the variables are never read while
+ // uninitialized.
+ if (srclen >= 16) {
+ shuf = v16u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};
+ v_fc0fc00 = __lsx_vreplgr2vr_w(uint32_t(0x0fc0fc00));
+ v_3f03f0 = __lsx_vreplgr2vr_w(uint32_t(0x003f03f0));
+ shift_r = __lsx_vreplgr2vr_w(uint32_t(0x0006000a));
+ shift_l = __lsx_vreplgr2vr_w(uint32_t(0x00080004));
+ base64_tbl0 = __lsx_vld(lookup_tbl, 0);
+ base64_tbl1 = __lsx_vld(lookup_tbl, 16);
+ base64_tbl2 = __lsx_vld(lookup_tbl, 32);
+ base64_tbl3 = __lsx_vld(lookup_tbl, 48);
+ }
+
+ size_t i = 0;
+ // Unrolled loop: 48 input bytes -> 64 output characters. The bound is 52
+ // rather than 48 because the last 16-byte load starts at offset 36.
+ for (; i + 52 <= srclen; i += 48) {
+ __m128i in0 =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
+ __m128i in1 =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
+ __m128i in2 =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
+ __m128i in3 =
+ __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);
+
+ in0 = __lsx_vshuf_b(in0, in0, (__m128i)shuf);
+ in1 = __lsx_vshuf_b(in1, in1, (__m128i)shuf);
+ in2 = __lsx_vshuf_b(in2, in2, (__m128i)shuf);
+ in3 = __lsx_vshuf_b(in3, in3, (__m128i)shuf);
+
+ __m128i t0_0 = __lsx_vand_v(in0, v_fc0fc00);
+ __m128i t0_1 = __lsx_vand_v(in1, v_fc0fc00);
+ __m128i t0_2 = __lsx_vand_v(in2, v_fc0fc00);
+ __m128i t0_3 = __lsx_vand_v(in3, v_fc0fc00);
+
+ __m128i t1_0 = __lsx_vsrl_h(t0_0, shift_r);
+ __m128i t1_1 = __lsx_vsrl_h(t0_1, shift_r);
+ __m128i t1_2 = __lsx_vsrl_h(t0_2, shift_r);
+ __m128i t1_3 = __lsx_vsrl_h(t0_3, shift_r);
+
+ __m128i t2_0 = __lsx_vand_v(in0, v_3f03f0);
+ __m128i t2_1 = __lsx_vand_v(in1, v_3f03f0);
+ __m128i t2_2 = __lsx_vand_v(in2, v_3f03f0);
+ __m128i t2_3 = __lsx_vand_v(in3, v_3f03f0);
+
+ __m128i t3_0 = __lsx_vsll_h(t2_0, shift_l);
+ __m128i t3_1 = __lsx_vsll_h(t2_1, shift_l);
+ __m128i t3_2 = __lsx_vsll_h(t2_2, shift_l);
+ __m128i t3_3 = __lsx_vsll_h(t2_3, shift_l);
+
+ // For each vector: look the 6-bit indices up in table halves 0-31 and
+ // 32-63, then select per byte depending on whether the index is <= 31.
+ __m128i input0 = __lsx_vor_v(t1_0, t3_0);
+ __m128i input0_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input0);
+ __m128i input0_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+ __lsx_vsub_b(input0, __lsx_vldi(32)));
+ __m128i input0_mask = __lsx_vslei_bu(input0, 31);
+ __m128i input0_result =
+ __lsx_vbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
+ __lsx_vst(input0_result, reinterpret_cast<__m128i *>(out), 0);
+ out += 16;
+
+ __m128i input1 = __lsx_vor_v(t1_1, t3_1);
+ __m128i input1_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input1);
+ __m128i input1_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+ __lsx_vsub_b(input1, __lsx_vldi(32)));
+ __m128i input1_mask = __lsx_vslei_bu(input1, 31);
+ __m128i input1_result =
+ __lsx_vbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
+ __lsx_vst(input1_result, reinterpret_cast<__m128i *>(out), 0);
+ out += 16;
+
+ __m128i input2 = __lsx_vor_v(t1_2, t3_2);
+ __m128i input2_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input2);
+ __m128i input2_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+ __lsx_vsub_b(input2, __lsx_vldi(32)));
+ __m128i input2_mask = __lsx_vslei_bu(input2, 31);
+ __m128i input2_result =
+ __lsx_vbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
+ __lsx_vst(input2_result, reinterpret_cast<__m128i *>(out), 0);
+ out += 16;
+
+ __m128i input3 = __lsx_vor_v(t1_3, t3_3);
+ __m128i input3_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input3);
+ __m128i input3_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+ __lsx_vsub_b(input3, __lsx_vldi(32)));
+ __m128i input3_mask = __lsx_vslei_bu(input3, 31);
+ __m128i input3_result =
+ __lsx_vbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
+ __lsx_vst(input3_result, reinterpret_cast<__m128i *>(out), 0);
+ out += 16;
+ }
+ // Single-vector loop: 12 input bytes -> 16 output characters. A full 16
+ // bytes are loaded, hence the i + 16 bound.
+ for (; i + 16 <= srclen; i += 12) {
+
+ __m128i in = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);
+
+ // bytes from groups A, B and C are needed in separate 32-bit lanes
+ // in = [DDDD|CCCC|BBBB|AAAA]
+ //
+ // an input triplet has layout
+ // [????????|ccdddddd|bbbbcccc|aaaaaabb]
+ // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next
+ // triplet
+ //
+ // shuffling changes the order of bytes: 1, 0, 2, 1
+ // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+ // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+ // processed bits
+ in = __lsx_vshuf_b(in, in, (__m128i)shuf);
+
+ // unpacking
+ // t0 = [0000cccc|cc000000|aaaaaa00|00000000]
+ __m128i t0 = __lsx_vand_v(in, v_fc0fc00);
+ // t1 = [00000000|00cccccc|00000000|00aaaaaa]
+ // ((c >> 6), (a >> 10))
+ __m128i t1 = __lsx_vsrl_h(t0, shift_r);
+
+ // t2 = [00000000|00dddddd|000000bb|bbbb0000]
+ __m128i t2 = __lsx_vand_v(in, v_3f03f0);
+ // t3 = [00dddddd|00000000|00bbbbbb|00000000]
+ // ((d << 8), (b << 4))
+ __m128i t3 = __lsx_vsll_h(t2, shift_l);
+
+ // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+ __m128i indices = __lsx_vor_v(t1, t3);
+
+ __m128i indices_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, indices);
+ __m128i indices_shuf1 = __lsx_vshuf_b(
+ base64_tbl3, base64_tbl2, __lsx_vsub_b(indices, __lsx_vldi(32)));
+ __m128i indices_mask = __lsx_vslei_bu(indices, 31);
+ __m128i indices_result =
+ __lsx_vbitsel_v(indices_shuf1, indices_shuf0, indices_mask);
+
+ __lsx_vst(indices_result, reinterpret_cast<__m128i *>(out), 0);
+ out += 16;
+ }
+
+ // i is a multiple of 12 here, so i / 3 * 4 is the exact number of
+ // characters emitted by the vector loops.
+ return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+ srclen - i, options);
+}
+
+// Compacts the 16 bytes of `data` according to `mask` and stores a full
+// 16-byte vector at `output`. With mask == 0 the data is stored unchanged.
+// NOTE(review): judging by compress_block, which advances by the popcount of
+// ~mask, bytes whose mask bit is set are the ones dropped - confirm against
+// the thintable_epi8 table definition.
+static inline void compress(__m128i data, uint16_t mask, char *output) {
+ if (mask == 0) {
+ __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
+ return;
+ }
+ // this particular implementation was inspired by work done by @animetosho
+ // we do it in two steps, first 8 bytes and then second 8 bytes
+ uint8_t mask1 = uint8_t(mask); // least significant 8 bits
+ uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+ // next line just loads the 64-bit values thintable_epi8[mask1] and
+ // thintable_epi8[mask2] into a 128-bit register, using only
+ // two instructions on most compilers.
+
+ v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
+ tables::base64::thintable_epi8[mask2]};
+
+ // we increment by 0x08 the second half of the mask
+ v4u32 hi = {0, 0, 0x08080808, 0x08080808};
+ __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);
+
+ // this is the version "nearly pruned"
+ __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
+ // we still need to put the two halves together.
+ // we compute the popcount of the first half:
+ int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+ // then load the corresponding mask, what it does is to write
+ // only the first pop1 bytes from the first 8 bytes, and then
+ // it fills in with the bytes from the second 8 bytes + some filling
+ // at the end.
+ __m128i compactmask =
+ __lsx_vld(reinterpret_cast<const __m128i *>(
+ tables::base64::pshufb_combine_table + pop1 * 8),
+ 0);
+ __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
+
+ // All 16 bytes are written regardless of how many are meaningful.
+ __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
+}
+
+// A 64-byte block held as four 128-bit LSX vectors; the unit the base64
+// decoder loads, classifies, compresses and decodes.
+struct block64 {
+ __m128i chunks[4];
+};
+
+// Classifies and translates 16 input characters in place.
+// - *src is overwritten with `out` (input plus a per-character delta from the
+//   delta_values lookup, using saturating addition).
+// - The return value is a 16-bit mask with one bit per byte, set where the
+//   check value `chk` has its sign bit set.
+// - *error is OR-ed with true when that mask differs from the mask of bytes
+//   matching the ascii_space_tbl lookup, i.e. when some flagged byte is not
+//   one of the characters listed in that table.
+// `base64_url` selects the lookup tables for the URL alphabet.
+template <bool base64_url>
+static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
+ const v16u8 ascii_space_tbl = {0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0};
+ // credit: aqrit
+ /*
+ '0'(0x30)-'9'(0x39) => delta_values_index = 4
+ 'A'(0x41)-'Z'(0x5a) => delta_values_index = 4/5/12(4+8)
+ 'a'(0x61)-'z'(0x7a) => delta_values_index = 6/7/14(6+8)
+ '+'(0x2b) => delta_values_index = 3
+ '/'(0x2f) => delta_values_index = 2+8 = 10
+ '-'(0x2d) => delta_values_index = 2+8 = 10
+ '_'(0x5f) => delta_values_index = 5+8 = 13
+ */
+ v16u8 delta_asso = {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF};
+ v16i8 delta_values;
+ if (base64_url) {
+ delta_values =
+ v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
+ int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)};
+ } else {
+ delta_values =
+ v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+ int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)};
+ }
+
+ v16u8 check_asso;
+ if (base64_url) {
+ check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x03, 0x07, 0x0B, 0x06, 0x0B, 0x12};
+ } else {
+ check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F};
+ }
+
+ v16i8 check_values;
+ if (base64_url) {
+ check_values = v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
+ int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
+ int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)};
+ } else {
+ check_values =
+ v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+ int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+ int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)};
+ }
+
+ // Two hashes per byte: the low nibble indexes an "asso" table, and the
+ // looked-up value is averaged (rounding) with the byte shifted right by 3.
+ const __m128i shifted = __lsx_vsrli_b(*src, 3);
+ __m128i asso_index = __lsx_vand_v(*src, __lsx_vldi(0xF));
+ const __m128i delta_hash =
+ __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)delta_asso, (__m128i)delta_asso,
+ (__m128i)asso_index),
+ shifted);
+ const __m128i check_hash =
+ __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)check_asso, (__m128i)check_asso,
+ (__m128i)asso_index),
+ shifted);
+
+ // The hashes select a delta and a check constant, each added to the input
+ // byte with signed saturation.
+ const __m128i out =
+ __lsx_vsadd_b(__lsx_vshuf_b((__m128i)delta_values, (__m128i)delta_values,
+ (__m128i)delta_hash),
+ *src);
+ const __m128i chk =
+ __lsx_vsadd_b(__lsx_vshuf_b((__m128i)check_values, (__m128i)check_values,
+ (__m128i)check_hash),
+ *src);
+ unsigned int mask = __lsx_vpickve2gr_hu(__lsx_vmskltz_b(chk), 0);
+ // The space-table comparison is only needed when some byte was flagged.
+ if (mask) {
+ __m128i ascii_space = __lsx_vseq_b(__lsx_vshuf_b((__m128i)ascii_space_tbl,
+ (__m128i)ascii_space_tbl,
+ (__m128i)asso_index),
+ *src);
+ *error |=
+ (mask != __lsx_vpickve2gr_hu(__lsx_vmskltz_b((__m128i)ascii_space), 0));
+ }
+
+ *src = out;
+ return (uint16_t)mask;
+}
+
+// Runs the 16-byte to_base64_mask over all four chunks of `b`, in order, and
+// packs the four 16-bit masks into one 64-bit mask (chunk k occupies bits
+// 16k..16k+15). *error is cleared first and then accumulated by each call.
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, bool *error) {
+  *error = 0;
+  uint64_t combined = 0;
+  for (int k = 0; k < 4; k++) {
+    const uint64_t chunk_mask =
+        to_base64_mask<base64_url>(&b->chunks[k], error);
+    combined |= chunk_mask << (16 * k);
+  }
+  return combined;
+}
+
+// Stores the four chunks of `b` back to back, 64 bytes in total, starting at
+// `output`.
+static inline void copy_block(block64 *b, char *output) {
+  for (int k = 0; k < 4; k++) {
+    __lsx_vst(b->chunks[k], reinterpret_cast<__m128i *>(output + 16 * k), 0);
+  }
+}
+
+// Compresses the four chunks of `b` into `output` according to `mask` (one
+// bit per byte, 16 bits per chunk). Each chunk is written right after the
+// bytes kept from the chunks before it. Returns count_ones(~mask), the total
+// number of kept bytes. Every compress() call stores a full 16-byte vector,
+// so the destination needs room beyond the returned count.
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  const uint64_t nmask = ~mask;
+  // Popcount of each 16-bit lane of nmask, packed into one 64-bit word.
+  const uint64_t counts =
+      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
+  // Extract the lanes with shifts rather than reading `counts` through a
+  // uint16_t pointer: that cast violated strict aliasing and relied on
+  // little-endian byte order.
+  const size_t kept0 = size_t(counts & 0xFFFF);
+  const size_t kept1 = size_t((counts >> 16) & 0xFFFF);
+  const size_t kept2 = size_t((counts >> 32) & 0xFFFF);
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16), output + kept0);
+  compress(b->chunks[2], uint16_t(mask >> 32), output + kept0 + kept1);
+  compress(b->chunks[3], uint16_t(mask >> 48), output + kept0 + kept1 + kept2);
+  return count_ones(nmask);
+}
+
+// Reads 64 bytes starting at `src` into the four chunks of `b`. The caller
+// must guarantee that all 64 bytes are readable.
+static inline void load_block(block64 *b, const char *src) {
+  for (int k = 0; k < 4; k++) {
+    b->chunks[k] =
+        __lsx_vld(reinterpret_cast<const __m128i *>(src + 16 * k), 0);
+  }
+}
+
+// The caller of this function is responsible to ensure that there are 128 bytes
+// available from reading at src. The data is read into a block64 structure.
+// The 64 char16_t units are narrowed to 64 bytes: each pair of input vectors
+// is packed into one chunk with __lsx_vssrlni_bu_h and a shift of 0.
+// NOTE(review): the "ss...bu" form presumably saturates to 0..255, so units
+// above 0xFF would become 0xFF - confirm against the LSX manual.
+static inline void load_block(block64 *b, const char16_t *src) {
+ __m128i m1 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0);
+ __m128i m2 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
+ __m128i m3 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
+ __m128i m4 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
+ __m128i m5 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 64);
+ __m128i m6 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 80);
+ __m128i m7 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 96);
+ __m128i m8 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 112);
+ b->chunks[0] = __lsx_vssrlni_bu_h(m2, m1, 0);
+ b->chunks[1] = __lsx_vssrlni_bu_h(m4, m3, 0);
+ b->chunks[2] = __lsx_vssrlni_bu_h(m6, m5, 0);
+ b->chunks[3] = __lsx_vssrlni_bu_h(m8, m7, 0);
+}
+
+// Turns 16 already-translated 6-bit values (one per byte of `str`) into 12
+// output bytes at `out`. Within each 32-bit lane the four values are merged
+// by shifts and masks, then a byte shuffle gathers the three meaningful bytes
+// of every lane at the front of the vector. Exactly 12 bytes are written.
+static inline void base64_decode(char *out, __m128i str) {
+ __m128i t0 = __lsx_vor_v(
+ __lsx_vslli_w(str, 26),
+ __lsx_vslli_w(__lsx_vand_v(str, __lsx_vldi(-1758 /*0x0000FF00*/)), 12));
+ __m128i t1 =
+ __lsx_vsrli_w(__lsx_vand_v(str, __lsx_vldi(-3521 /*0x003F0000*/)), 2);
+ __m128i t2 = __lsx_vor_v(t0, t1);
+ __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16));
+ // Picks bytes 3,2,1 of every lane; the last four indices are padding.
+ const v16u8 pack_shuffle = {3, 2, 1, 7, 6, 5, 11, 10,
+ 9, 15, 14, 13, 0, 0, 0, 0};
+ t3 = __lsx_vshuf_b(t3, t3, (__m128i)pack_shuffle);
+
+ // Store the output:
+ // we only need 12.
+ // (one 64-bit element at out, then one 32-bit element at out + 8)
+ __lsx_vstelm_d(t3, out, 0, 0);
+ __lsx_vstelm_w(t3, out + 8, 0, 2);
+}
+// Decodes the 64 bytes at `src` into 48 bytes at `out`: four 16-byte vectors
+// are loaded in turn and each is passed to base64_decode, which writes 12
+// bytes.
+static inline void base64_decode_block(char *out, const char *src) {
+  for (int k = 0; k < 4; k++) {
+    const __m128i quad =
+        __lsx_vld(reinterpret_cast<const __m128i *>(src + 16 * k), 0);
+    base64_decode(out + 12 * k, quad);
+  }
+}
+// "Safe" variant for a raw 64-byte source buffer. In this backend it simply
+// forwards to base64_decode_block(out, src).
+static inline void base64_decode_block_safe(char *out, const char *src) {
+ base64_decode_block(out, src);
+}
+// Decodes an already-loaded block: each of the four chunks of `b` yields 12
+// bytes, written consecutively (48 bytes in total) starting at `out`.
+static inline void base64_decode_block(char *out, block64 *b) {
+  for (int k = 0; k < 4; k++) {
+    base64_decode(out + 12 * k, b->chunks[k]);
+  }
+}
+// "Safe" variant for an already-loaded block. In this backend it simply
+// forwards to base64_decode_block(out, b).
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+ base64_decode_block(out, b);
+}
+
+// Decodes base64 text (`char` or `char16_t`) from `src` into `dst`.
+//
+// Steps, as implemented below:
+//  1. Strip trailing ignorable characters and up to two '=' padding signs,
+//     remembering where the first padding sign was.
+//  2. For every full 64-character block: load it, translate and classify it
+//     with to_base64_mask, drop flagged characters (compress_block) or copy
+//     the block verbatim, and append the 6-bit values to a staging buffer.
+//     Once nine blocks are staged they are decoded to `dst`.
+//  3. Top the staging buffer up to a 64-byte boundary with scalar lookups
+//     when enough input remains, decode the complete blocks, then decode
+//     the remaining quads four values at a time.
+//  4. Rewind `src` over the staged values that did not form a full quad and
+//     hand the tail to scalar::base64::base64_tail_decode.
+//
+// Returns a full_result {error, input_count, output_count}.
+template <bool base64_url, typename char_type>
+full_result
+compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+ : tables::base64::to_base64_value;
+ size_t equallocation =
+ srclen; // location of the first padding character if any
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ size_t equalsigns = 0;
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 1;
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 2;
+ }
+ }
+ // Nothing but padding/ignorable characters: padding alone is an error.
+ if (srclen == 0) {
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, 0, 0};
+ }
+ const char_type *const srcinit = src;
+ const char *const dstinit = dst;
+ const char_type *const srcend = src + srclen;
+
+ // Staging area for translated 6-bit values: room for block_size blocks of
+ // 64 bytes.
+ constexpr size_t block_size = 10;
+ char buffer[block_size * 64];
+ char *bufferptr = buffer;
+ if (srclen >= 64) {
+ const char_type *const srcend64 = src + srclen - 64;
+ while (src <= srcend64) {
+ block64 b;
+ load_block(&b, src);
+ src += 64;
+ bool error = false;
+ uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+ if (badcharmask) {
+ if (error) {
+ // Rescan the block with the scalar table to find the offending
+ // character so its position can be reported.
+ src -= 64;
+ while (src < srcend && scalar::base64::is_eight_byte(*src) &&
+ to_base64[uint8_t(*src)] <= 64) {
+ src++;
+ }
+ if (src < srcend) {
+ // should never happen
+ }
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ }
+
+ if (badcharmask != 0) {
+ // optimization opportunity: check for simple masks like those made of
+ // continuous 1s followed by continuous 0s. And masks containing a
+ // single bad character.
+ bufferptr += compress_block(&b, badcharmask, bufferptr);
+ } else {
+ // optimization opportunity: if bufferptr == buffer and mask == 0, we
+ // can avoid the call to compress_block and decode directly.
+ copy_block(&b, bufferptr);
+ bufferptr += 64;
+ }
+ // Flush once at least block_size - 1 blocks are staged, then move the
+ // partially filled last block to the front of the buffer.
+ if (bufferptr >= (block_size - 1) * 64 + buffer) {
+ for (size_t i = 0; i < (block_size - 1); i++) {
+ base64_decode_block(dst, buffer + i * 64);
+ dst += 48;
+ }
+ std::memcpy(buffer, buffer + (block_size - 1) * 64,
+ 64); // 64 might be too much
+ bufferptr -= (block_size - 1) * 64;
+ }
+ }
+ }
+ char *buffer_start = buffer;
+ // Optimization note: if this is almost full, then it is worth our
+ // time, otherwise, we should just decode directly.
+ int last_block = (int)((bufferptr - buffer_start) % 64);
+ if (last_block != 0 && srcend - src + last_block >= 64) {
+ // Fill the partial block up to a 64-byte boundary one character at a
+ // time; values above 63 (ignorable characters) are not kept.
+ while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+ uint8_t val = to_base64[uint8_t(*src)];
+ *bufferptr = char(val);
+ if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ bufferptr += (val <= 63);
+ src++;
+ }
+ }
+
+ // Decode every complete 64-byte block that is staged.
+ for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+ base64_decode_block(dst, buffer_start);
+ dst += 48;
+ }
+ if ((bufferptr - buffer_start) % 64 != 0) {
+ // Scalar decode of the staged quads. While more than four values remain,
+ // 4 bytes are copied although only 3 are meaningful; dst advances by 3.
+ while (buffer_start + 4 < bufferptr) {
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 4);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // The last complete quad is copied with exactly 3 bytes.
+ if (buffer_start + 4 <= bufferptr) {
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 3);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+ // backtrack
+ int leftover = int(bufferptr - buffer_start);
+ while (leftover > 0) {
+ // Step back over ignorable characters (table value 64), then over one
+ // real character per leftover value.
+ while (to_base64[uint8_t(*(src - 1))] == 64) {
+ src--;
+ }
+ src--;
+ leftover--;
+ }
+ }
+ // Remaining input (or pending padding) goes through the scalar tail decoder.
+ if (src < srcend + equalsigns) {
+ full_result r = scalar::base64::base64_tail_decode(
+ dst, src, srcend - src, equalsigns, options, last_chunk_options);
+ r.input_count += size_t(src - srcinit);
+ if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+ r.error == error_code::BASE64_EXTRA_BITS) {
+ return r;
+ } else {
+ r.output_count += size_t(dst - dstinit);
+ }
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ // (the padding count must match the number of bytes in the last group)
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ r.error = error_code::INVALID_BASE64_CHARACTER;
+ r.input_count = equallocation;
+ }
+ }
+ return r;
+ }
+ // No tail left: the same padding consistency check applies.
+ if (equalsigns > 0) {
+ if ((size_t(dst - dstinit) % 3 == 0) ||
+ ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+ }
+ }
+ return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf16.cpp
new file mode 100644
index 000000000..8586f6e6a
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf16.cpp
@@ -0,0 +1,39 @@
+/*
+  Widens Latin-1 bytes to little-endian UTF-16 code units, 16 input bytes
+  per iteration.
+  Returns a pair: the first unprocessed byte from buf and the next free
+  slot of utf16_output. Fewer than 16 trailing bytes are left unconverted.
+*/
+std::pair<const char *, char16_t *>
+lsx_convert_latin1_to_utf16le(const char *buf, size_t len,
+                              char16_t *utf16_output) {
+  const char *end = buf + len;
+
+  __m128i zero = __lsx_vldi(0);
+  // Compare the remaining length instead of forming buf + 16: that pointer
+  // would lie beyond one-past-the-end when fewer than 16 bytes remain.
+  while (size_t(end - buf) >= 16) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    // Interleave: input byte first, zero byte second (little-endian units).
+    __m128i inlow = __lsx_vilvl_b(zero, in8);
+    __m128i inhigh = __lsx_vilvh_b(zero, in8);
+    // The trailing immediate of __lsx_vst is a byte offset from the pointer.
+    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
+
+    utf16_output += 16;
+    buf += 16;
+  }
+
+  return std::make_pair(buf, utf16_output);
+}
+
+/*
+  Widens Latin-1 bytes to big-endian UTF-16 code units, 16 input bytes per
+  iteration.
+  Returns a pair: the first unprocessed byte from buf and the next free
+  slot of utf16_output. Fewer than 16 trailing bytes are left unconverted.
+*/
+std::pair<const char *, char16_t *>
+lsx_convert_latin1_to_utf16be(const char *buf, size_t len,
+                              char16_t *utf16_output) {
+  const char *end = buf + len;
+  __m128i zero = __lsx_vldi(0);
+  // Compare the remaining length instead of forming buf + 16: that pointer
+  // would lie beyond one-past-the-end when fewer than 16 bytes remain.
+  while (size_t(end - buf) >= 16) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    // Interleave: zero byte first, input byte second (big-endian units).
+    __m128i inlow = __lsx_vilvl_b(in8, zero);
+    __m128i inhigh = __lsx_vilvh_b(in8, zero);
+    // The trailing immediate of __lsx_vst is a byte offset from the pointer.
+    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
+    utf16_output += 16;
+    buf += 16;
+  }
+
+  return std::make_pair(buf, utf16_output);
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf32.cpp
new file mode 100644
index 000000000..d99ea7a28
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf32.cpp
@@ -0,0 +1,27 @@
+/*
+  Widens Latin-1 bytes to UTF-32 code units, 16 input bytes per iteration.
+  Returns a pair: the first unprocessed byte from buf and the next free
+  slot of utf32_output. Fewer than 16 trailing bytes are left unconverted.
+*/
+std::pair<const char *, char32_t *>
+lsx_convert_latin1_to_utf32(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char *end = buf + len;
+
+  // Loop-invariant, so built once (as the UTF-16 siblings do).
+  __m128i zero = __lsx_vldi(0);
+  // Compare the remaining length instead of forming buf + 16: that pointer
+  // would lie beyond one-past-the-end when fewer than 16 bytes remain.
+  while (size_t(end - buf) >= 16) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    // 8-bit -> 16-bit: interleave with zero bytes
+    __m128i in16low = __lsx_vilvl_b(zero, in8);
+    __m128i in16high = __lsx_vilvh_b(zero, in8);
+    // 16-bit -> 32-bit: interleave with zero halfwords
+    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
+    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
+    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
+    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
+
+    // four stores of four code units each
+    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output + 4), 0);
+    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output + 8), 0);
+    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output + 12), 0);
+
+    utf32_output += 16;
+    buf += 16;
+  }
+
+  return std::make_pair(buf, utf32_output);
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf8.cpp
new file mode 100644
index 000000000..a532bb729
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_latin1_to_utf8.cpp
@@ -0,0 +1,56 @@
+/*
+  Converts Latin-1 input to UTF-8.
+  Returns a pair: the first unprocessed byte from latin1_input and the next
+  free byte of utf8_out.
+  A scalar routine should carry on the conversion of the tail.
+*/
+
+std::pair<const char *, char *>
+lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                           char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char *end = latin1_input + len;
+
+  __m128i zero = __lsx_vldi(0);
+  // Every iteration stores a full 16-byte vector and consumes either 16
+  // (all-ASCII) or 8 input bytes, so we iterate only while at least 16 input
+  // bytes remain. The remaining length is compared instead of forming
+  // latin1_input + 16, which would lie beyond one-past-the-end near the tail.
+  while (size_t(end - latin1_input) >= 16) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
+    // one mask bit per input byte; all 16 bits set means all bytes are ASCII
+    uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0);
+    if (ascii == 0xffff) { // ASCII fast path!!!!
+      __lsx_vst(in8, utf8_output, 0);
+      utf8_output += 16;
+      latin1_input += 16;
+      continue;
+    }
+    // We just fallback on UTF-16 code. This could be optimized/simplified
+    // further.
+    // Only the low 8 input bytes are widened and converted in this round.
+    __m128i in16 = __lsx_vilvl_b(zero, in8);
+    // 1. prepare 2-byte values
+    // input 8-bit word : [aabb|bbbb] x 8
+    // expected output  : [1100|00aa|10bb|bbbb] x 8
+    // t0 = [0000|00aa|bbbb|bb00]
+    __m128i t0 = __lsx_vslli_h(in16, 2);
+    // t1 = [0000|00aa|0000|0000]
+    __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+    // t2 = [0000|00aa|00bb|bbbb]
+    __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f));
+    // t3 = [1100|00aa|10bb|bbbb]
+    __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080)));
+    // merge ASCII and 2-byte codewords
+    __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F));
+    __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask);
+
+    // row[0] holds the number of output bytes, row[1..16] the shuffle pattern
+    const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                             [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0];
+    __m128i shuffle = __lsx_vld(row + 1, 0);
+    __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+
+    // store bytes
+    __lsx_vst(utf8_packed, utf8_output, 0);
+    // adjust pointers
+    latin1_input += 8;
+    utf8_output += row[0];
+
+  } // while
+
+  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf16_to_latin1.cpp
new file mode 100644
index 000000000..ea30d34fd
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf16_to_latin1.cpp
@@ -0,0 +1,66 @@
+/*
+  Narrows UTF-16 code units to Latin-1 bytes, 16 code units per iteration.
+  Returns a pair: the first unprocessed code unit from buf and the next free
+  byte of latin1_output. When a block contains a code unit above 0xFF the
+  first member is nullptr.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lsx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) {
+  const char16_t *end = buf + len;
+  // Compare the remaining length instead of forming buf + 16: that pointer
+  // would lie beyond one-past-the-end when fewer than 16 units remain.
+  while (size_t(end - buf) >= 16) {
+    // second load is at byte offset 16, i.e. code units 8..15
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
+    }
+    // the odd-indexed bytes must all be zero for the block to be Latin-1
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the bytes
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
+    } else {
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+    }
+  } // while
+  return std::make_pair(buf, latin1_output);
+}
+
+/*
+  Narrows UTF-16 code units to Latin-1 bytes, 16 code units per iteration.
+  Returns a pair: a result struct and latin1_output.
+  If a code unit above 0xFF is found, the result carries TOO_LARGE and the
+  index of that code unit. Otherwise it carries SUCCESS and the index of the
+  first unprocessed code unit in buf.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+lsx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+  // Compare the remaining length instead of forming buf + 16: that pointer
+  // would lie beyond one-past-the-end when fewer than 16 units remain.
+  while (size_t(end - buf) >= 16) {
+    // second load is at byte offset 16, i.e. code units 8..15
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
+    }
+    // the odd-indexed bytes must all be zero for the block to be Latin-1
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the bytes
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
+    } else {
+      // Let us do a scalar fallback.
+      // It copies the valid prefix and reports the offending position.
+      for (int k = 0; k < 16; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word);
+        } else {
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
+        }
+      }
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf32.cpp
new file mode 100644
index 000000000..4f7679b70
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf32.cpp
@@ -0,0 +1,139 @@
+/*
+  Converts UTF-16 to UTF-32, eight code units per vector iteration.
+  Returns a pair: the first unprocessed code unit from buf and the next free
+  slot of utf32_out. When the scalar fallback meets an invalid surrogate
+  sequence the first member is nullptr.
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *end = buf + len;
+
+  __m128i zero = __lsx_vldi(0);
+  __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+
+  // Compare the remaining length instead of forming buf + 8: that pointer
+  // would lie beyond one-past-the-end when fewer than 8 units remain.
+  while (size_t(end - buf) >= 8) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+    }
+
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0);
+      __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16);
+      utf32_output += 8;
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      // At most 15 code units are started here, capped at one less than what
+      // remains, so that buf[k + 1] is always readable.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xF800) != 0xD800) {
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char32_t *>(utf32_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+}
+
+/*
+  Returns a pair: a result struct and utf32_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed code unit in
+  buf (even if finished). A scalar routine should carry on the conversion of
+  the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                       char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+
+  __m128i zero = __lsx_vldi(0);
+  __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+
+  // Compare the remaining length instead of forming buf + 8: that pointer
+  // would lie beyond one-past-the-end when fewer than 8 units remain.
+  while (size_t(end - buf) >= 8) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+    }
+
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800);
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0);
+      __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16);
+      utf32_output += 8;
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      // At most 15 code units are started here, capped at one less than what
+      // remains, so that buf[k + 1] is always readable.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xF800) != 0xD800) {
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            // k was already advanced, so k - 1 is the first unit of the pair
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1),
+                reinterpret_cast<char32_t *>(utf32_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char32_t *>(utf32_output));
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf8.cpp
new file mode 100644
index 000000000..11dd2ca49
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf16_to_utf8.cpp
@@ -0,0 +1,526 @@
+/*
+ The vectorized algorithm works on a single 128-bit LSX register, i.e., it
+ loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it's an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+  Returns a pair: the first unprocessed code unit from buf and utf8_output.
+  When the scalar fallback meets an invalid surrogate sequence the first
+  member is nullptr.
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t *end = buf + len;
+
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff));
+  // Compare the remaining length instead of forming
+  // buf + 16 + safety_margin: that pointer would lie beyond
+  // one-past-the-end when the input is shorter than that.
+  while (size_t(end - buf) >= 16 + safety_margin) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+    }
+    if (__lsx_bz_v(
+            __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII
+      // characters.
+      __m128i nextin = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+      if (!match_system(big_endian)) {
+        nextin = lsx_swap_bytes(nextin);
+      }
+      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(nextin, in);
+        // 2. store (16 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(in, in);
+        // 2. store (8 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        // carry on with the second, non-ASCII register
+        in = nextin;
+      }
+    }
+
+    __m128i zero = __lsx_vldi(0);
+    if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      // t0 = [000a|aaaa|bbbb|bb00]
+      __m128i t0 = __lsx_vslli_h(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+      // t2 = [0000|0000|00bb|bbbb]
+      __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f));
+      // t3 = [000a|aaaa|00bb|bbbb]
+      __m128i t3 = __lsx_vor_v(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080));
+      __m128i t4 = __lsx_vor_v(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      __m128i one_byte_bytemask =
+          __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/));
+      __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask);
+      // 3. prepare bitmask for 8-bit lookup
+      uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+      // 4. pack the bytes
+      // row[0] is the output byte count; the shuffle pattern starts at row + 1
+      const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                               [lsx_1_2_utf8_bytes_mask[m2]][0];
+      __m128i shuffle = __lsx_vld(row, 1);
+      __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+      // 5. store bytes
+      __lsx_vst(utf8_packed, utf8_output, 0);
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+    }
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)),
+                     __lsx_vldi(-2600 /*0xD800*/));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+         single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+         two UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+         three UTF-8 bytes
+
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** --
+        precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+        they differ by exactly one bit.
+
+        Finally from these two code units we build proper UTF-8 sequence,
+        taking into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      __m128i t0 = __lsx_vpickev_b(in, in);
+      t0 = __lsx_vilvl_b(t0, t0);
+
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc]
+      __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+      __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
+
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      __m128i s0 = __lsx_vsrli_h(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      __m128i s1 = __lsx_vslli_h(in, 2);
+      // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+      s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
+
+      // [00bb|bbbb|0000|aaaa]
+      __m128i s2 = __lsx_vor_v(s0, s1);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+      __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+      __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff);
+      __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+                                 __lsx_vldi(-2752 /*0x4000*/));
+      __m128i s4 = __lsx_vxor_v(s3, m0);
+
+      // 4. expand code units 16-bit => 32-bit
+      __m128i out0 = __lsx_vilvl_h(s4, t2);
+      __m128i out1 = __lsx_vilvh_h(s4, t2);
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F));
+
+      __m128i one_or_two_bytes_bytemask_low =
+          __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+      __m128i one_or_two_bytes_bytemask_high =
+          __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+      __m128i one_byte_bytemask_low =
+          __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+      __m128i one_byte_bytemask_high =
+          __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+      const uint32_t mask0 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low,
+                                      one_byte_bytemask_low)),
+          0);
+      const uint32_t mask1 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high,
+                                      one_byte_bytemask_high)),
+          0);
+
+      // each row: byte count at [0], shuffle pattern starting at row + 1
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      __m128i shuffle0 = __lsx_vld(row0, 1);
+      __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
+
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+      __lsx_vst(utf8_0, utf8_output, 0);
+      utf8_output += row0[0];
+      __lsx_vst(utf8_1, utf8_output, 0);
+      utf8_output += row1[0];
+
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      // At most 15 code units are started here, capped at one less than what
+      // remains, so that buf[k + 1] is always readable.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+/*
+  Returns a pair: a result struct and utf8_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed code unit in
+  buf (even if finished). A scalar routine should carry on the conversion of
+  the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+                                      char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  // Loop-invariant, so built once (as lsx_convert_utf16_to_utf8 does).
+  __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff));
+  // Compare the remaining length instead of forming
+  // buf + 16 + safety_margin: that pointer would lie beyond
+  // one-past-the-end when the input is shorter than that.
+  while (size_t(end - buf) >= 16 + safety_margin) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+    }
+    if (__lsx_bz_v(
+            __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII
+      // characters.
+      __m128i nextin = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+      if (!match_system(big_endian)) {
+        nextin = lsx_swap_bytes(nextin);
+      }
+      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(nextin, in);
+        // 2. store (16 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(in, in);
+        // 2. store (8 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        // carry on with the second, non-ASCII register
+        in = nextin;
+      }
+    }
+
+    __m128i zero = __lsx_vldi(0);
+    if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      // t0 = [000a|aaaa|bbbb|bb00]
+      __m128i t0 = __lsx_vslli_h(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+      // t2 = [0000|0000|00bb|bbbb]
+      __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f));
+      // t3 = [000a|aaaa|00bb|bbbb]
+      __m128i t3 = __lsx_vor_v(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080));
+      __m128i t4 = __lsx_vor_v(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      __m128i one_byte_bytemask =
+          __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/));
+      __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask);
+      // 3. prepare bitmask for 8-bit lookup
+      uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+      // 4. pack the bytes
+      // row[0] is the output byte count; the shuffle pattern starts at row + 1
+      const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                               [lsx_1_2_utf8_bytes_mask[m2]][0];
+      __m128i shuffle = __lsx_vld(row, 1);
+      __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+      // 5. store bytes
+      __lsx_vst(utf8_packed, utf8_output, 0);
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+    }
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)),
+                     __lsx_vldi(-2600 /*0xD800*/));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+         single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+         two UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+         three UTF-8 bytes
+
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** --
+        precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+        they differ by exactly one bit.
+
+        Finally from these two code units we build proper UTF-8 sequence,
+        taking into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      __m128i t0 = __lsx_vpickev_b(in, in);
+      t0 = __lsx_vilvl_b(t0, t0);
+
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc]
+      __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+      __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
+
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      __m128i s0 = __lsx_vsrli_h(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      __m128i s1 = __lsx_vslli_h(in, 2);
+      // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+      s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
+
+      // [00bb|bbbb|0000|aaaa]
+      __m128i s2 = __lsx_vor_v(s0, s1);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+      __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+      __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff);
+      __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+                                 __lsx_vldi(-2752 /*0x4000*/));
+      __m128i s4 = __lsx_vxor_v(s3, m0);
+
+      // 4. expand code units 16-bit => 32-bit
+      __m128i out0 = __lsx_vilvl_h(s4, t2);
+      __m128i out1 = __lsx_vilvh_h(s4, t2);
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F));
+
+      __m128i one_or_two_bytes_bytemask_low =
+          __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+      __m128i one_or_two_bytes_bytemask_high =
+          __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+      __m128i one_byte_bytemask_low =
+          __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+      __m128i one_byte_bytemask_high =
+          __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+      const uint32_t mask0 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low,
+                                      one_byte_bytemask_low)),
+          0);
+      const uint32_t mask1 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high,
+                                      one_byte_bytemask_high)),
+          0);
+
+      // each row: byte count at [0], shuffle pattern starting at row + 1
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      __m128i shuffle0 = __lsx_vld(row0, 1);
+      __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
+
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+      __lsx_vst(utf8_0, utf8_output, 0);
+      utf8_output += row0[0];
+      __lsx_vst(utf8_1, utf8_output, 0);
+      utf8_output += row1[0];
+
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      // At most 15 code units are started here, capped at one less than what
+      // remains, so that buf[k + 1] is always readable.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            // k was already advanced, so k - 1 is the first unit of the pair
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1),
+                reinterpret_cast<char *>(utf8_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf32_to_latin1.cpp
new file mode 100644
index 000000000..ee279a0ec
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf32_to_latin1.cpp
@@ -0,0 +1,66 @@
+std::pair<const char32_t *, char *> // {next unread input, or nullptr on error; end of written output}
+lsx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *end = buf + len;
+ const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; // intended to pick the low byte of each of the 8 code points
+ __m128i v_ff = __lsx_vrepli_w(0xFF);
+
+ while (buf + 16 <= end) { // requires 16 code points left although each round consumes only 8
+ __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16); // byte offset 16, i.e. buf[4..7]
+
+ __m128i in12 = __lsx_vor_v(in1, in2); // any bit above 0xFF in either half survives the OR
+ if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) { // true when no 32-bit lane exceeds 0xFF
+ // 1. pack the bytes
+ __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
+ // 2. store (a full vector is written; only the first 8 bytes are payload)
+ __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ // 3. adjust pointers
+ buf += 8;
+ latin1_output += 8;
+ } else {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output)); // some code point does not fit in Latin-1
+ }
+ } // while
+ return std::make_pair(buf, latin1_output); // up to 15 trailing code points are left unconverted
+}
+
+std::pair<result, char *> // {error code with input index, end of written output}
+lsx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *start = buf; // kept to report positions relative to the input start
+ const char32_t *end = buf + len;
+
+ const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; // intended to pick the low byte of each of the 8 code points
+ __m128i v_ff = __lsx_vrepli_w(0xFF);
+
+ while (buf + 16 <= end) { // requires 16 code points left although each round consumes only 8
+ __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16); // byte offset 16, i.e. buf[4..7]
+
+ __m128i in12 = __lsx_vor_v(in1, in2); // any bit above 0xFF in either half survives the OR
+
+ if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) { // true when no 32-bit lane exceeds 0xFF
+ // 1. pack the bytes
+ __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
+ // 2. store (a full vector is written; only the first 8 bytes are payload)
+ __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ // 3. adjust pointers
+ buf += 8;
+ latin1_output += 8;
+ } else {
+ // Scalar fallback: one of these 8 code points exceeds 0xFF, so the loop below always returns (buf is never advanced here).
+ for (int k = 0; k < 8; k++) {
+ uint32_t word = buf[k];
+ if (word <= 0xff) {
+ *latin1_output++ = char(word); // code points before the offending one are still emitted
+ } else {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+ latin1_output);
+ }
+ }
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), // count = code points consumed so far
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf16.cpp
new file mode 100644
index 000000000..ddad69594
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf16.cpp
@@ -0,0 +1,155 @@
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *> // {next unread input, or nullptr on error; end of written output}
+lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_out) {
+ uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
+ const char32_t *end = buf + len;
+
+ __m128i forbidden_bytemask = __lsx_vrepli_h(0); // accumulates surrogate hits from the SIMD path; tested once after the loop
+ __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+ __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff));
+ while (buf + 8 <= end) {
+ __m128i in0 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16); // byte offset 16, i.e. buf[4..7]
+
+ // Check if no bits set above 16th
+ if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) {
+ __m128i utf16_packed = __lsx_vpickev_h(in1, in0); // keep the low 16 bits of each of the 8 code points
+ forbidden_bytemask = __lsx_vor_v( // NOTE(review): vsle_h (no 'u') is presumably the signed compare; 0xD800 and 0xDFFF are both negative as int16, so the range test would still hold -- confirm
+ __lsx_vand_v(
+ __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+
+ if (!match_system(big_endian)) {
+ utf16_packed = lsx_swap_bytes(utf16_packed);
+ }
+ __lsx_vst(utf16_packed, utf16_output, 0); // written even if a surrogate was flagged; the error surfaces after the loop
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 3; // scalar path handles 3 code points, then the SIMD test is retried
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // defensive clamp; the loop guard already guarantees at least 8 remaining
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(word >> 8 | word << 8) // byte swap; the cast truncates to 16 bits
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate =
+ uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+ low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ // check for invalid input (surrogates seen by the SIMD path)
+ if (__lsx_bnz_v(forbidden_bytemask)) {
+ return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
+ }
+ return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output)); // fewer than 8 code points may remain unconverted
+}
+
+template <endianness big_endian>
+std::pair<result, char16_t *> // {error code with input index, end of written output}
+lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_out) {
+ uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
+ const char32_t *start = buf; // kept to report positions relative to the input start
+ const char32_t *end = buf + len;
+
+ __m128i forbidden_bytemask = __lsx_vrepli_h(0);
+ __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+ __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff));
+
+ while (buf + 8 <= end) {
+ __m128i in0 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16); // byte offset 16, i.e. buf[4..7]
+ // Check if no bits set above 16th
+ if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) {
+ __m128i utf16_packed = __lsx_vpickev_h(in1, in0); // keep the low 16 bits of each of the 8 code points
+
+ forbidden_bytemask = __lsx_vor_v( // NOTE(review): vsle_h (no 'u') is presumably the signed compare; 0xD800 and 0xDFFF are both negative as int16, so the range test would still hold -- confirm
+ __lsx_vand_v(
+ __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+ if (__lsx_bnz_v(forbidden_bytemask)) { // checked before the store, so nothing from this block is written
+ return std::make_pair(result(error_code::SURROGATE, buf - start), // position is the start of this 8-unit block, not the exact offender
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+
+ if (!match_system(big_endian)) {
+ utf16_packed = lsx_swap_bytes(utf16_packed);
+ }
+
+ __lsx_vst(utf16_packed, utf16_output, 0);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 3; // scalar path handles 3 code points, then the SIMD test is retried
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // defensive clamp; the loop guard already guarantees at least 8 remaining
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), // exact index of the offending code point
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(word >> 8 | word << 8) // byte swap; the cast truncates to 16 bits
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k),
+ reinterpret_cast<char16_t *>(utf16_output));
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate =
+ uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+ low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), // count = code points consumed; fewer than 8 may remain
+ reinterpret_cast<char16_t *>(utf16_output));
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf8.cpp
new file mode 100644
index 000000000..0636fa1d1
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf32_to_utf8.cpp
@@ -0,0 +1,459 @@
+std::pair<const char32_t *, char *> // {next unread input, or nullptr on error; end of written output}
+lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char32_t *end = buf + len;
+
+ __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080));
+ __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF));
+ __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF));
+ __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+ __m128i forbidden_bytemask = __lsx_vldi(0x0); // accumulates surrogate hits from the 3-byte SIMD path; tested once after the loop
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (buf + 16 + safety_margin < end) { // SIMD rounds consume 8 code points; the extra slack covers the wide stores
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16); // byte offset 16, i.e. buf[4..7]
+
+ // Check if no bits set above 16th
+ if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
+ // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+ // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
+ __m128i utf16_packed = __lsx_vpickev_h(nextin, in);
+
+ if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
+ utf16_packed))) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
+ // 2. store (a full vector is written; only the first 8 bytes are payload)
+ __lsx_vst(utf8_packed, utf8_output, 0);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ continue; // we are done for this round!
+ }
+ __m128i zero = __lsx_vldi(0);
+ if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { // every code point is <= 0x7FF: 1 or 2 UTF-8 bytes each
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m128i t3 = __lsx_vor_v(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m128i t4 = __lsx_vor_v(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ __m128i one_byte_bytemask =
+ __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
+ __m128i utf8_unpacked =
+ __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
+ // 3. prepare bitmask for 8-bit lookup
+ uint32_t m2 =
+ __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+ // 4. pack the bytes
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lsx_1_2_utf8_bytes_mask[m2]][0];
+ __m128i shuffle = __lsx_vld(row, 1); // row[0] is the output length; the shuffle pattern is loaded from row + 1
+ __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+ // 5. store bytes
+ __lsx_vst(utf8_packed, utf8_output, 0);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ } else {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ forbidden_bytemask = __lsx_vor_v( // NOTE(review): vsle_h (no 'u') is presumably the signed compare; 0xD800 and 0xDFFF are both negative as int16, so the range test would still hold -- confirm
+ __lsx_vand_v(
+ __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single
+ UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
+ UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
+ t0 = __lsx_vilvl_b(t0, t0);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+ __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ __m128i s1 = __lsx_vslli_h(utf16_packed, 2);
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
+ // [00bb|bbbb|0000|aaaa]
+ __m128i s2 = __lsx_vor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+ __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+ // v_07ff (0x07FF in every 16-bit lane) is initialised once, before the loop
+ __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
+ __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+ __lsx_vldi(-2752 /*0x4000*/));
+ __m128i s4 = __lsx_vxor_v(s3, m0); // the XOR flips the one bit that separates [11bb|bbbb] from [10bb|bbbb]
+
+ // 4. expand code units 16-bit => 32-bit
+ __m128i out0 = __lsx_vilvl_h(s4, t2);
+ __m128i out1 = __lsx_vilvh_h(s4, t2);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ __m128i one_byte_bytemask =
+ __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));
+
+ __m128i one_or_two_bytes_bytemask_u16_to_u32_low =
+ __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+ __m128i one_or_two_bytes_bytemask_u16_to_u32_high =
+ __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+ __m128i one_byte_bytemask_u16_to_u32_low =
+ __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+ __m128i one_byte_bytemask_u16_to_u32_high =
+ __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+ const uint32_t mask0 = // table index for the low four code points
+ __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+ one_or_two_bytes_bytemask_u16_to_u32_low,
+ one_byte_bytemask_u16_to_u32_low)),
+ 0);
+ const uint32_t mask1 = // table index for the high four code points
+ __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+ one_or_two_bytes_bytemask_u16_to_u32_high,
+ one_byte_bytemask_u16_to_u32_high)),
+ 0);
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ __m128i shuffle0 = __lsx_vld(row0, 1); // row0[0] is the output length; the shuffle pattern is loaded from row0 + 1
+ __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
+
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ __m128i shuffle1 = __lsx_vld(row1, 1); // same layout as row0
+ __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+ __lsx_vst(utf8_0, utf8_output, 0);
+ utf8_output += row0[0];
+ __lsx_vst(utf8_1, utf8_output, 0); // overwrites whatever the previous store wrote past its row0[0] payload bytes
+ utf8_output += row1[0];
+
+ buf += 8;
+ }
+ // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes.
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // defensive clamp; the loop guard already guarantees more than 28 remaining
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1 byte
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2 bytes
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3 bytes, surrogates rejected
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4 bytes, values above 0x10FFFF rejected
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr,
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // check for invalid input (surrogates seen by the 3-byte SIMD path)
+ if (__lsx_bnz_v(forbidden_bytemask)) {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+ }
+ return std::make_pair(buf, reinterpret_cast<char *>(utf8_output)); // the tail near the end of input is left unconverted
+}
+
+std::pair<result, char *> // {error code with input index, end of written output}
+lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_out) {
+ uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+ const char32_t *start = buf; // kept to report positions relative to the input start
+ const char32_t *end = buf + len;
+
+ __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080));
+ __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF));
+ __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF));
+ __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+ __m128i forbidden_bytemask = __lsx_vldi(0x0);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (buf + 16 + safety_margin < end) { // SIMD rounds consume 8 code points; the extra slack covers the wide stores
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+ __m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16); // byte offset 16, i.e. buf[4..7]
+
+ // Check if no bits set above 16th
+ if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
+ // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+ // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
+ __m128i utf16_packed = __lsx_vpickev_h(nextin, in);
+
+ if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
+ utf16_packed))) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
+ // 2. store (a full vector is written; only the first 8 bytes are payload)
+ __lsx_vst(utf8_packed, utf8_output, 0);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ continue; // we are done for this round!
+ }
+ __m128i zero = __lsx_vldi(0);
+ if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { // every code point is <= 0x7FF: 1 or 2 UTF-8 bytes each
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m128i t3 = __lsx_vor_v(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m128i t4 = __lsx_vor_v(t3, v_c080);
+ // 2. merge ASCII and 2-byte codewords
+ __m128i one_byte_bytemask =
+ __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
+ __m128i utf8_unpacked =
+ __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
+ // 3. prepare bitmask for 8-bit lookup
+ uint32_t m2 =
+ __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+ // 4. pack the bytes
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+ [lsx_1_2_utf8_bytes_mask[m2]][0];
+ __m128i shuffle = __lsx_vld(row, 1); // row[0] is the output length; the shuffle pattern is loaded from row + 1
+ __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+ // 5. store bytes
+ __lsx_vst(utf8_packed, utf8_output, 0);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ } else {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ forbidden_bytemask = __lsx_vor_v( // NOTE(review): vsle_h (no 'u') is presumably the signed compare; 0xD800 and 0xDFFF are both negative as int16, so the range test would still hold -- confirm
+ __lsx_vand_v(
+ __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff
+ __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+ forbidden_bytemask);
+ if (__lsx_bnz_v(forbidden_bytemask)) { // checked before any store, so nothing from this block is written
+ return std::make_pair(result(error_code::SURROGATE, buf - start), // position is the start of this 8-unit block, not the exact offender
+ reinterpret_cast<char *>(utf8_output));
+ }
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single
+ UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
+ UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
+ t0 = __lsx_vilvl_b(t0, t0);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+ __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
+
+ // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+ __m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
+ // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+ __m128i s1 = __lsx_vslli_h(utf16_packed, 2);
+ // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+ s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
+ // [00bb|bbbb|0000|aaaa]
+ __m128i s2 = __lsx_vor_v(s0, s1);
+ // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+ __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+ // v_07ff (0x07FF in every 16-bit lane) is initialised once, before the loop
+ __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
+ __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+ __lsx_vldi(-2752 /*0x4000*/));
+ __m128i s4 = __lsx_vxor_v(s3, m0); // the XOR flips the one bit that separates [11bb|bbbb] from [10bb|bbbb]
+
+ // 4. expand code units 16-bit => 32-bit
+ __m128i out0 = __lsx_vilvl_h(s4, t2);
+ __m128i out1 = __lsx_vilvh_h(s4, t2);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ __m128i one_byte_bytemask =
+ __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));
+
+ __m128i one_or_two_bytes_bytemask_u16_to_u32_low =
+ __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+ __m128i one_or_two_bytes_bytemask_u16_to_u32_high =
+ __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+ __m128i one_byte_bytemask_u16_to_u32_low =
+ __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+ __m128i one_byte_bytemask_u16_to_u32_high =
+ __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+ const uint32_t mask0 = // table index for the low four code points
+ __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+ one_or_two_bytes_bytemask_u16_to_u32_low,
+ one_byte_bytemask_u16_to_u32_low)),
+ 0);
+ const uint32_t mask1 = // table index for the high four code points
+ __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+ one_or_two_bytes_bytemask_u16_to_u32_high,
+ one_byte_bytemask_u16_to_u32_high)),
+ 0);
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ __m128i shuffle0 = __lsx_vld(row0, 1); // row0[0] is the output length; the shuffle pattern is loaded from row0 + 1
+ __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
+
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ __m128i shuffle1 = __lsx_vld(row1, 1); // same layout as row0
+ __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+ __lsx_vst(utf8_0, utf8_output, 0);
+ utf8_output += row0[0];
+ __lsx_vst(utf8_1, utf8_output, 0); // overwrites whatever the previous store wrote past its row0[0] payload bytes
+ utf8_output += row1[0];
+
+ buf += 8;
+ }
+ // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes.
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) { // defensive clamp; the loop guard already guarantees more than 28 remaining
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) { // 1 byte
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) { // 2 bytes
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) { // 3 bytes, surrogates rejected
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), // exact index of the offending code point
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else { // 4 bytes, values above 0x10FFFF rejected
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k),
+ reinterpret_cast<char *>(utf8_output));
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), // count = code points consumed; the tail is left unconverted
+ reinterpret_cast<char *>(utf8_output));
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf8_to_latin1.cpp
new file mode 100644
index 000000000..3f4ab8366
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf8_to_latin1.cpp
@@ -0,0 +1,75 @@
+size_t convert_masked_utf8_to_latin1(const char *input, // returns the number of input bytes consumed
+ uint64_t utf8_end_of_code_point_mask,
+ char *&latin1_output) { // advanced past the bytes written
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff; // only the low 12 bits index the lookup table
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+
+ // We first try a few fast paths.
+ // The obvious first test is ASCII, which actually consumes the full 16.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+ // We process in chunks of 16 bytes
+ __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
+ latin1_output += 16; // We wrote 16 8-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+ /// We do not have a fast path available, or the fast path is unimportant, so
+ /// we fallback.
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ // this indicates an invalid input:
+ if (idx >= 64) {
+ return consumed; // nothing is written on this path
+ }
+ // Here we should have (idx < 64), if not, there is a bug in the validation or
+ // elsewhere. This is the relatively easy scenario: we process SIX (6) input
+ // code points, each spanning 1 or 2 UTF-8 bytes, so at most 12 input bytes
+ // are involved. The shuffle below places every code point in its own 16-bit
+ // lane, the two payload parts of 2-byte sequences are merged, and the low
+ // byte of each lane becomes one Latin-1 character. Exactly 6 output bytes
+ // are then copied out through a small temporary buffer.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // Shuffle
+ // 1 byte: 00000000 0bbbbbbb
+ // 2 byte: 110aaaaa 10bbbbbb
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); // keeps the low 5 bits of each index; NOTE(review): presumably to fit vshuf_b's index range -- confirm
+ __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+ // ascii mask
+ // 1 byte: 11111111 11111111
+ // 2 byte: 00000000 00000000
+ __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
+ // utf8 mask
+ // 1 byte: 00000000 00000000
+ // 2 byte: 00111111 00111111
+ __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
+ __lsx_vldi(0b00111111));
+ // mask
+ // 1 byte: 11111111 11111111
+ // 2 byte: 00111111 00111111
+ __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
+
+ __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
+ // keep the low byte of each 16-bit lane; only the first 6 bytes are used.
+ __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
+
+ uint64_t buffer[2];
+ // the vector is staged in a local buffer instead of being stored straight to latin1_output
+ __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(buffer), 0);
+ std::memcpy(latin1_output, buffer, 6);
+ latin1_output += 6; // We wrote 6 bytes.
+ return consumed;
+}
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf16.cpp
new file mode 100644
index 000000000..243804fa1
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf16.cpp
@@ -0,0 +1,288 @@
+ // Convert up to 16 bytes of UTF-8 at `input` to UTF-16, guided by a mask whose set bits mark the last byte of each code point.
+ // The ASCII and 2-byte fast paths test the low 16 bits of the mask; every other path uses only its least significant 12 bits.
+ // `utf16_output` is advanced past the code units produced; several paths store a full 16-byte vector even when fewer units are kept.
+ // It returns how many bytes were consumed (up to 16, usually 12).
+ template <endianness big_endian>
+ size_t convert_masked_utf8_to_utf16(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char16_t *&utf16_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+
+ // We first try a few fast paths.
+ // The obvious first test is ASCII, which actually consumes the full 16.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+ // We process in chunks of 16 bytes
+ // The routine in simd.h is reused.
+ simd8<int8_t> temp{in};
+ temp.store_ascii_as_utf16<big_endian>(utf16_output);
+ utf16_output += 16; // We wrote 16 16-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+
+ uint64_t buffer[2];
+ // 3 byte sequences are the next most common, as seen in CJK, which has long
+ // sequences of these.
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 sequences and turn them into 4 2-byte
+ // UTF-16 code units.
+ __m128i composed = convert_utf8_3_byte_to_utf16(in);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 4; // We wrote 4 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+
+ // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) {
+ // We want to take 6 2-byte UTF-8 sequences and turn them into 6 2-byte
+ // UTF-16 code units (the mask shows 8 such sequences; the last two are left for the next call).
+ __m128i composed = convert_utf8_2_byte_to_utf16(in);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 6; // We wrote 6 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+
+ /// We do not have a fast path available, or the fast path is unimportant, so
+ /// we fallback.
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+ const __m128i zero = __lsx_vldi(0);
+ if (idx < 64) {
+ // SIX (6) input code-code units
+ // Convert to UTF-16
+ __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+ // Store
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 6; // We wrote 6 16-bit characters.
+ return consumed;
+ } else if (idx < 145) {
+ // FOUR (4) input code-code units
+ // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // XXX: depending on the system scalar instructions might be faster.
+ // 1 byte: 00000000 00000000 0ccccccc
+ // 2 byte: 00000000 110bbbbb 10cccccc
+ // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: xx0bbbbb x0cccccc
+ // 3 byte: xxbbbbbb x0cccccc
+ __m128i lowperm = __lsx_vpickev_h(perm, perm);
+ // 1 byte: 00000000 00000000
+ // 2 byte: 00000000 00000000
+ // 3 byte: 00000000 1110aaaa
+ __m128i highperm = __lsx_vpickod_h(perm, perm);
+ // 3 byte: aaaa0000 00000000
+ highperm = __lsx_vslli_h(highperm, 12);
+ // ASCII
+ // 1 byte: 00000000 0ccccccc
+ // 2+byte: 00000000 00cccccc
+ __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f));
+ // 1 byte: 00000000 00000000
+ // 2 byte: xx0bbbbb 00000000
+ // 3 byte: xxbbbbbb 00000000
+ __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/);
+ // 1 byte: 00000000 0ccccccc
+ // 2 byte: 0010bbbb bbcccccc
+ // 3 byte: 0010bbbb bbcccccc
+ __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii);
+
+ __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff));
+ // aaaabbbb bbcccccc
+ composed = __lsx_vbitsel_v(highperm, composed, v0fff);
+
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+ utf16_output += 4; // We wrote 4 16-bit codepoints
+ return consumed;
+ } else if (idx < 209) {
+ // THREE (3) input code-code units
+ if (input_utf8_end_of_code_point_mask == 0x888) {
+ // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+ // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
+ // it is easier when we can assume they are all pairs. This version does
+ // not use the LUT, but 4 byte sequences are less common and the overhead
+ // of the extra memory access is less important than the early branch
+ // overhead in shorter sequences.
+
+ // Swap byte pairs
+ // 10dddddd 10cccccc|10bbbbbb 11110aaa
+ // 10cccccc 10dddddd|11110aaa 10bbbbbb
+ __m128i swap = lsx_swap_bytes(in);
+ // Shift left 2 bits
+ // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+ __m128i shift = __lsx_vslli_b(swap, 2);
+ // Magic number: the low 2 bits of the trail surrogate plus all the corrections needed to create the pair.
+ // UTF-8 4b prefix  = -0x0000|0xF000
+ // surrogate offset = -0x0000|0x0040 (0x10000 << 6)
+ // surrogate high   = +0x0000|0xD800
+ // surrogate low    = +0xDC00|0x0000
+ // -------------------------------
+ //                  = +0xDC00|0xE7C0
+ __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
+ // Generate unadjusted trail surrogate minus lowest 2 bits
+ // vec(0000FF00) = __lsx_vldi(-1758)
+ // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+ __m128i trail =
+ __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/));
+ // Insert low 2 bits of trail surrogate to magic number for later
+ // 11011100 00000000 11100111 110000cc
+ __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);
+
+ // Generate lead surrogate
+ // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+ // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+ __m128i lead = __lsx_vbitsel_v(
+ __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
+ __lsx_vrepli_h(0x3f /* 0x003f*/));
+
+ // Blend pairs
+ // __lsx_vldi(-1741) => vec(0x0000FFFF)
+ // 000000cc ccdddddd|11110aaa bbbbbb00
+ __m128i blend =
+ __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */);
+
+ // Add magic number to finish the result
+ // 110111CC CCDDDDDD|110110AA BBBBBBCC
+ __m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ composed = lsx_swap_bytes(composed);
+ }
+ // Store through the 16-byte scratch buffer so that only 12 bytes reach the output.
+ __lsx_vst(composed, reinterpret_cast<uint16_t *>(buffer), 0);
+ std::memcpy(utf16_output, buffer, 12);
+ utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+ return 12; // We consumed 12 bytes.
+ }
+ // 3 1-4 byte sequences
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // 1 byte: 00000000 00000000 00000000 0ddddddd
+ // 2 byte: 00000000 00000000 110ccccc 10dddddd
+ // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+ // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+ // added to fix issue https://github.com/simdutf/simdutf/issues/514
+ // We only want to write 2 * 16-bit code units when that is actually what we
+ // have. Unfortunately, we cannot trust the input. So it is possible to get
+ // 0xff as an input byte and it should not result in a surrogate pair. We
+ // need to check for that.
+ uint32_t permbuffer[4];
+ __lsx_vst(perm, permbuffer, 0);
+ // Mask the low and middle bytes
+ // 00000000 00000000 00000000 0ddddddd
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
+ // Because the surrogates need more work, the high surrogate is computed
+ // first.
+ __m128i middlehigh = __lsx_vslli_w(perm, 2);
+ // 00000000 00000000 00cccccc 00000000
+ __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */);
+ // Start assembling the sequence. Since the 4th byte is in the same position as it would be in a surrogate
+ // and there is no dependency, shift left instead of right.
+ // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
+ // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+ __m128i ab =
+ __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/);
+ // Top 16 bits contain the high ten bits of the surrogate pair before correction
+ // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
+ // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
+ __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000));
+ __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000);
+ // Combine the low 6 or 7 bits by a shift right accumulate
+ // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+ // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
+ // correction
+ __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6));
+ // After this is for surrogates
+ // Blend the low and high surrogates
+ // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+ __m128i mixed =
+ __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/);
+ // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
+ // yet as 0x10000 was not subtracted from the codepoint yet.
+ // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
+ __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF));
+ __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff);
+ // Correct the remaining UTF-8 prefix, surrogate offset, and add the surrogate prefixes in one magic addition.
+ // Similar magic number, but without the continue byte adjust and halfword swapped; the 32-bit add below cannot carry out of the low halfword (<= 0x03FF + 0xDC00).
+ // UTF-8 4b prefix  = -0xF000|0x0000
+ // surrogate offset = -0x0040|0x0000 (0x10000 << 6)
+ // surrogate high   = +0xD800|0x0000
+ // surrogate low    = +0x0000|0xDC00
+ // -----------------------------------
+ //                  = +0xE7C0|0xDC00
+ __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00));
+ // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+ __m128i surrogates = __lsx_vadd_w(masked_pair, magic);
+ // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+ __m128i is_pair = __lsx_vslt_w(perm, zero);
+ // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+ // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+ // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+ __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair);
+ // Byte swap if necessary
+ if (!match_system(big_endian)) {
+ selected = lsx_swap_bytes(selected);
+ }
+ // Attempting to shuffle and store would be complex, just scalarize.
+ uint32_t buffer_tmp[4];
+ __lsx_vst(selected, buffer_tmp, 0);
+ // Test for the top bit of the surrogate mask. Remove due to issue 514
+ // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
+ // 0x00800000;
+ for (size_t i = 0; i < 3; i++) {
+ // Surrogate
+ // Used to be if (buffer[i] & SURROGATE_MASK) {
+ // See discussion above: only a lead byte of the form 11110xxx selects the pair.
+ // patch for issue https://github.com/simdutf/simdutf/issues/514
+ if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
+ utf16_output[0] = uint16_t(buffer_tmp[i] >> 16);
+ utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF);
+ utf16_output += 2;
+ } else {
+ utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF);
+ utf16_output++;
+ }
+ }
+ return consumed;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ return 12;
+ }
+ }
diff --git a/contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf32.cpp
new file mode 100644
index 000000000..ce05dd7b9
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_convert_utf8_to_utf32.cpp
@@ -0,0 +1,182 @@
+ // Convert up to 16 bytes of UTF-8 at `input` to UTF-32, guided by a mask whose set bits mark the last byte of each code point.
+ // The ASCII fast path tests the low 16 bits of the mask; every other path uses only its least significant 12 bits.
+ // `utf32_out` is advanced past the code units produced; several paths store full 16-byte vectors even when fewer units are kept.
+ // It returns how many bytes were consumed (16 on the ASCII path, otherwise up to 12).
+ size_t convert_masked_utf8_to_utf32(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char32_t *&utf32_out) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out); // alias: advancing it advances the caller's pointer
+ __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xFFF;
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+ // We process in chunks of 16 bytes.
+ // Use the simd8 helper store_ascii_as_utf32_tbl. NOTE(review): this comment used to point at arm64/simd.h - confirm the LSX helper's location.
+ // Ideally the compiler can keep the tables in registers.
+ simd8<int8_t> temp{in};
+ temp.store_ascii_as_utf32_tbl(utf32_out);
+ utf32_output += 16; // We wrote 16 32-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+ __m128i zero = __lsx_vldi(0);
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 sequences and turn them into 4 4-byte
+ // UTF-32 code units. Convert to UTF-16 first, then interleave with zero to widen.
+ __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+ __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+
+ __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ utf32_output += 4; // We wrote 4 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+ if (input_utf8_end_of_code_point_mask == 0xaaa) {
+ // We want to take 6 2-byte UTF-8 sequences and turn them into 6 4-byte
+ // UTF-32 code units. Convert to UTF-16 first, then interleave with zero to widen.
+ __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);
+
+ __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+ __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+ __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+ utf32_output += 6;
+ return 12; // We consumed 12 bytes.
+ }
+ /// Either no fast path or an unimportant fast path.
+
+ const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+ [input_utf8_end_of_code_point_mask][1];
+
+ if (idx < 64) {
+ // SIX (6) input code-code units
+ // Convert to UTF-16
+ __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+ __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+ __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+ __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+ __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+ utf32_output += 6;
+ return consumed;
+ } else if (idx < 145) {
+ // FOUR (4) input code-code units
+ // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // Shuffle
+ // 1 byte: 00000000 00000000 0ccccccc
+ // 2 byte: 00000000 110bbbbb 10cccccc
+ // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+ // Split
+ // 00000000 00000000 0ccccccc
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
+ // Note: unmasked
+ // xxxxxxxx aaaaxxxx xxxxxxxx
+ __m128i high =
+ __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
+ // Keep only the middle byte with a plain AND. NOTE(review): the earlier "bic"/"bsl" wording looks inherited from the arm64 version.
+ // The top bits will be corrected later in the bitwise select
+ // 00000000 10bbbbbb 00000000
+ __m128i middle =
+ __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits
+ // Combine low and middle with shift right accumulate
+ // 00000000 00xxbbbb bbcccccc
+ __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
+ // Insert top 4 bits from high byte with bitwise select
+ // 00000000 aaaabbbb bbcccccc
+ __m128i composed =
+ __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/));
+ __lsx_vst(composed, utf32_output, 0);
+ utf32_output += 4; // We wrote 4 32-bit characters.
+ return consumed;
+ } else if (idx < 209) {
+ // THREE (3) input code-code units
+ if (input_utf8_end_of_code_point_mask == 0x888) {
+ // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+ // UTF-32 code units. This uses the same method as the fixed 3 byte
+ // version, reversing and shift left insert. However, there is no need for
+ // a shuffle mask now, just a byte swap within halfwords and word shifts.
+ //
+ // This version does not use the LUT, but 4 byte sequences are less common
+ // and the overhead of the extra memory access is less important than the
+ // early branch overhead in shorter sequences, so it comes last.
+
+ // Swap pairs of bytes
+ // 10dddddd|10cccccc|10bbbbbb|11110aaa
+ // 10cccccc 10dddddd|11110aaa 10bbbbbb
+ __m128i swap = lsx_swap_bytes(in);
+ // Shift left and insert
+ // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+ __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
+ __lsx_vrepli_h(0x3f /*0x003F*/));
+ // Shift insert again
+ // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+ __m128i merge2 =
+ __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
+ __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
+ __lsx_vldi(-2545)); /*0x00000FFF*/
+ // Clear the garbage
+ // 00000000 000aaabb bbbbcccc ccdddddd
+ __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/));
+ // Store
+ __lsx_vst(composed, utf32_output, 0);
+ utf32_output += 3; // We wrote 3 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
+ // due to surrogates no longer being involved.
+ __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+ simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+ 0);
+ // 1 byte: 00000000 00000000 00000000 0ddddddd
+ // 2 byte: 00000000 00000000 110ccccc 10dddddd
+ // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+ // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+ sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+ __m128i perm = __lsx_vshuf_b(zero, in, sh);
+
+ // Ascii
+ __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
+ __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/));
+ // 00000000 00000000 0000cccc ccdddddd
+ __m128i cd =
+ __lsx_vbitsel_v(__lsx_vsrli_w(middle, 2), ascii, __lsx_vrepli_w(0x3f));
+
+ __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); // per the layouts above, bit 22 is set only by a 3-byte lead (1110bbbb)
+ __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); // the byte-wise add of 0x20 wraps 1110bbbb to 0000bbbb
+ // Insert twice
+ // 00000000 000aaabb bbbbxxxx xxxxxxxx
+ __m128i corrected_srli2 =
+ __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
+ __m128i ab =
+ __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
+ ab = __lsx_vsrli_w(ab, 4);
+ // 00000000 000aaabb bbbbcccc ccdddddd
+ __m128i composed =
+ __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/));
+ // Store
+ __lsx_vst(composed, utf32_output, 0);
+ utf32_output += 3; // We wrote 3 32-bit characters.
+ return consumed;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ return 12;
+ }
+ }
diff --git a/contrib/simdutf/src/lsx/lsx_validate_utf16.cpp b/contrib/simdutf/src/lsx/lsx_validate_utf16.cpp
new file mode 100644
index 000000000..9fd2d8081
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_validate_utf16.cpp
@@ -0,0 +1,201 @@
+/*
+ In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of highest nibbles
+ are 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the preceding nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
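+ 0xd800 .. 0xdbff - low surrogate (Unicode calls this range the high/lead surrogates)
+ 0xdc00 .. 0xdfff - high surrogate (Unicode calls this range the low/trail surrogates)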
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be sole low surrogate nor high surrogate
+
+ We're going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+ 0 1 2 3 4 5 6 7 <--- word index
+ [ V | L | H | L | H | V | V | L ]
+ 1 0 0 0 0 1 1 0 - V = valid masks
+ 0 1 0 1 0 0 0 1 - L = low surrogate
+ 0 0 1 0 1 0 0 0 - H high surrogate
+
+
+ 1 0 0 0 0 1 1 0 V = valid masks
+ 0 1 0 1 0 0 0 0 a = L & (H >> 1)
+ 0 0 1 0 1 0 0 0 b = a << 1
+ 1 1 1 1 1 1 1 0 c = V | a | b
+ ^
+ the last bit can be zero, we just consume 7
+ code units and recheck this word in the next iteration
+*/
+
+ /* Returns:
+ - pointer to the first code unit not validated here (a scalar fallback should
+ check the rest);
+ - nullptr if an error was detected.
+ */
+ template <endianness big_endian>
+ const char16_t *lsx_validate_utf16(const char16_t *input, size_t size) {
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::SIZE * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+ const auto in = simd8<uint8_t>(__lsx_vssrlni_bu_h(in1.value, in0.value, 8));
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast<uint16_t>(
+ L & (H >> 1)); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last code unit of the chunk
+ // is an exception we handle.)
+ const uint16_t b = static_cast<uint16_t>(
+ a << 1); // Just mark that the opposite fact holds as well,
+ // thanks to that we have only two masks for valid case.
+ const uint16_t c = static_cast<uint16_t>(
+ V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The last code unit (index 15) may be either a low or high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject sole high surrogate.
+ input += 15;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+ }
+ // Same vectorized check as lsx_validate_utf16, but reports a result: SURROGATE with the offset of the failing 16-unit chunk, or SUCCESS with the number of code units the vector loop covered.
+ template <endianness big_endian>
+ const result lsx_validate_utf16_with_errors(const char16_t *input,
+ size_t size) {
+ const char16_t *start = input; // kept so that reported positions are relative to the beginning
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::SIZE * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto in = simd8<uint8_t>(__lsx_vssrlni_bu_h(in1.value, in0.value, 8));
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast<uint16_t>(
+ L & (H >> 1)); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last code unit of the chunk
+ // is an exception we handle.)
+ const uint16_t b = static_cast<uint16_t>(
+ a << 1); // Just mark that the opposite fact holds as well,
+ // thanks to that we have only two masks for valid case.
+ const uint16_t c = static_cast<uint16_t>(
+ V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The last code unit (index 15) may be either a low or high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject sole high surrogate.
+ input += 15;
+ } else {
+ return result(error_code::SURROGATE, input - start); // offset of the chunk start, not of the exact offending unit
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, input - start); // count covered by the vector loop; the tail is not checked here
+ }
diff --git a/contrib/simdutf/src/lsx/lsx_validate_utf32le.cpp b/contrib/simdutf/src/lsx/lsx_validate_utf32le.cpp
new file mode 100644
index 000000000..6237431fc
--- /dev/null
+++ b/contrib/simdutf/src/lsx/lsx_validate_utf32le.cpp
@@ -0,0 +1,69 @@
+ // Vectorized UTF-32LE validation: returns nullptr if any code unit seen by the loop is above 0x10FFFF or a surrogate, otherwise a pointer to the first code unit the loop did not cover.
+ const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) {
+ const char32_t *end = input + size;
+
+ __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); // in + offset (mod 2^32) maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF
+ __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); // largest in + offset a non-surrogate can produce
+ __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/
+ __m128i currentmax = __lsx_vldi(0x0); // running per-lane maximum of the input
+ __m128i currentoffsetmax = __lsx_vldi(0x0); // running per-lane maximum of in + offset
+
+ while (input + 4 < end) {
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
+ currentmax = __lsx_vmax_wu(in, currentmax);
+ // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
+ currentoffsetmax =
+ __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);
+
+ input += 4;
+ }
+
+ __m128i is_zero =
+ __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax); // despite the name: nonzero when some lane exceeded 0x10ffff
+ if (__lsx_bnz_v(is_zero)) {
+ return nullptr;
+ }
+
+ is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
+ standardoffsetmax); // nonzero when some lane held a surrogate
+ if (__lsx_bnz_v(is_zero)) {
+ return nullptr;
+ }
+
+ return input;
+ }
+ // Vectorized UTF-32LE validation with error reporting: TOO_LARGE or SURROGATE with the offset of the 4-unit chunk in which the running maximum first went out of range, else SUCCESS with the count covered by the loop.
+ const result lsx_validate_utf32le_with_errors(const char32_t *input,
+ size_t size) {
+ const char32_t *start = input; // kept so that reported positions are relative to the beginning
+ const char32_t *end = input + size;
+
+ __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); // in + offset (mod 2^32) maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF
+ __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); // largest in + offset a non-surrogate can produce
+ __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/
+ __m128i currentmax = __lsx_vldi(0x0); // running per-lane maximum of the input
+ __m128i currentoffsetmax = __lsx_vldi(0x0); // running per-lane maximum of in + offset
+
+ while (input + 4 < end) {
+ __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
+ currentmax = __lsx_vmax_wu(in, currentmax);
+ currentoffsetmax =
+ __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);
+
+ __m128i is_zero =
+ __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax); // despite the name: nonzero when some lane exceeded 0x10ffff
+ if (__lsx_bnz_v(is_zero)) {
+ return result(error_code::TOO_LARGE, input - start);
+ }
+
+ is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
+ standardoffsetmax); // nonzero when some lane held a surrogate
+ if (__lsx_bnz_v(is_zero)) {
+ return result(error_code::SURROGATE, input - start);
+ }
+
+ input += 4;
+ }
+
+ return result(error_code::SUCCESS, input - start);
+ }
diff --git a/contrib/simdutf/src/ppc64/implementation.cpp b/contrib/simdutf/src/ppc64/implementation.cpp
new file mode 100644
index 000000000..c7cccd532
--- /dev/null
+++ b/contrib/simdutf/src/ppc64/implementation.cpp
@@ -0,0 +1,510 @@
+#include "scalar/latin1.h"
+#include "scalar/utf16.h"
+#include "scalar/utf8.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+
+#include "scalar/utf16_to_utf8/utf16_to_utf8.h"
+#include "scalar/utf16_to_utf8/valid_utf16_to_utf8.h"
+
+#include "scalar/utf16_to_utf32/utf16_to_utf32.h"
+#include "scalar/utf16_to_utf32/valid_utf16_to_utf32.h"
+
+#include "scalar/utf32_to_utf8/utf32_to_utf8.h"
+#include "scalar/utf32_to_utf8/valid_utf32_to_utf8.h"
+
+#include "scalar/utf32_to_utf16/utf32_to_utf16.h"
+#include "scalar/utf32_to_utf16/valid_utf32_to_utf16.h"
+
+#include "simdutf/ppc64/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_PPC64_H
+ #error "ppc64.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+ // careful: 0x80 is not ascii, so 0x7f is subtracted (saturating) from the OR-reduction and no bit may survive.
+ return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
+}
+
+simdutf_unused simdutf_really_inline simd8<bool> // true in lanes where prev1 >= 0xC0, prev2 >= 0xE0 or prev3 >= 0xF0
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) { // prevN: presumably the input shifted back by N bytes - confirm at the call site
+ simd8<uint8_t> is_second_byte =
+ prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+ simd8<uint8_t> is_third_byte =
+ prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+ simd8<uint8_t> is_fourth_byte =
+ prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+ // Caller requires a bool (all 1's). All values resulting from the subtraction
+ // will be <= 64 (0xFF - 0xBF), so signed comparison is fine.
+ return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
+ int8_t(0);
+}
+
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+ simd8<uint8_t> is_third_byte =
+ prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
+ simd8<uint8_t> is_fourth_byte =
+ prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
+ // The OR below has its high bit set exactly where prev2 >= 0xe0 or prev3 >= 0xf0.
+ // NOTE(review): no signed comparison happens here (unlike must_be_continuation); presumably simd8<bool>(simd8<uint8_t>) keys on that high bit - confirm in the ppc64 simd8 header.
+ return simd8<bool>(is_third_byte | is_fourth_byte);
+}
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+// other functions
+#include "generic/utf16.h"
+#include "generic/utf8.h"
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+ size_t length) const noexcept { // returns the BOM encoding, or a bitmask of every encoding the buffer validates as
+ // If there is a BOM, then we trust it.
+ auto bom_encoding = simdutf::BOM::check_bom(input, length);
+ if (bom_encoding != encoding_type::unspecified) {
+ return bom_encoding; // no validation is run in this case
+ }
+ // todo: reimplement as a one-pass algorithm.
+ int out = 0; // stays 0 when no candidate encoding validates
+ if (validate_utf8(input, length)) {
+ out |= encoding_type::UTF8;
+ }
+ if ((length % 2) == 0) { // UTF-16 is only considered for an even byte count
+ if (validate_utf16(reinterpret_cast<const char16_t *>(input), length / 2)) {
+ out |= encoding_type::UTF16_LE;
+ }
+ }
+ if ((length % 4) == 0) { // UTF-32 is only considered for a byte count divisible by four
+ if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+ out |= encoding_type::UTF32_LE;
+ }
+ }
+
+ return out;
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+ return ppc64::utf8_validation::generic_validate_utf8(buf, len); // generic validator from the ppc64::utf8_validation namespace
+}
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len); // same generic validator, result-returning variant
+}
+
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+ return ppc64::utf8_validation::generic_validate_ascii(buf, len); // generic ASCII check from the ppc64::utf8_validation namespace
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len); // generic ASCII check, result-returning variant
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept {
+ return scalar::utf16::validate<endianness::LITTLE>(buf, len); // no vector kernel here: delegates to the scalar validator
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept {
+ return scalar::utf16::validate<endianness::BIG>(buf, len); // no vector kernel here: delegates to the scalar validator
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len); // scalar validator, result-returning variant
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept {
+ return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len); // scalar validator, result-returning variant
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ return scalar::utf32::validate_with_errors(buf, len); // no vector kernel here: delegates to the scalar validator
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { // UTF-32 code units are char32_t, as in validate_utf32_with_errors and the detect_encodings call
+ return scalar::utf32::validate(buf, len); // no vector kernel here: delegates to the scalar validator
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+ const char * /*buf*/, size_t /*len*/,
+ char16_t * /*utf16_output*/) const noexcept {
+ return 0; // stub: not implemented here, ignores its arguments and always returns 0
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+ const char * /*buf*/, size_t /*len*/,
+ char16_t * /*utf16_output*/) const noexcept {
+ return 0; // stub: not implemented here, ignores its arguments and always returns 0
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char * /*buf*/, size_t /*len*/,
+ char16_t * /*utf16_output*/) const noexcept {
+ return result(error_code::OTHER, 0); // stub: not implemented here, always reports error_code::OTHER with count 0
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char * /*buf*/, size_t /*len*/,
+ char16_t * /*utf16_output*/) const noexcept {
+ return result(error_code::OTHER, 0); // stub: not implemented here, always reports error_code::OTHER with count 0
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+ const char * /*buf*/, size_t /*len*/,
+ char16_t * /*utf16_output*/) const noexcept {
+ return 0; // stub: not implemented here, ignores its arguments and always returns 0
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+ const char * /*buf*/, size_t /*len*/,
+ char16_t * /*utf16_output*/) const noexcept {
+ return 0; // stub: not implemented here, ignores its arguments and always returns 0
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+ const char * /*buf*/, size_t /*len*/,
+ char32_t * /*utf32_output*/) const noexcept {
+ return 0; // stub: not implemented here, ignores its arguments and always returns 0
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char * /*buf*/, size_t /*len*/,
+ char32_t * /*utf32_output*/) const noexcept {
+ return result(error_code::OTHER, 0); // stub: not implemented here, always reports error_code::OTHER with count 0
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+ const char * /*buf*/, size_t /*len*/,
+ char32_t * /*utf32_output*/) const noexcept {
+ return 0; // stub: not implemented here, ignores its arguments and always returns 0
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
+ utf8_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf8_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf, len, utf8_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
+ utf8_output); // delegates to the scalar convert_valid routine
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
+ utf8_output); // delegates to the scalar convert_valid routine
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf32_to_utf8::convert(buf, len, utf8_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); // delegates to the scalar convert_valid routine
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
+ utf16_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
+ utf16_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf, len, utf16_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
+ buf, len, utf16_output); // delegates to the scalar convert_valid routine
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len,
+ utf16_output); // delegates to the scalar convert_valid routine
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len,
+ utf32_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len,
+ utf32_output); // delegates to the scalar converter
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf, len, utf32_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf, len, utf32_output); // delegates to the scalar converter, result-returning variant
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
+ buf, len, utf32_output); // delegates to the scalar convert_valid routine
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len,
+ utf32_output); // delegates to the scalar convert_valid routine
+}
+
+void implementation::change_endianness_utf16(const char16_t *input,
+ size_t length,
+ char16_t *output) const noexcept {
+ scalar::utf16::change_endianness_utf16(input, length, output); // delegates to the scalar routine; nothing is returned
+}
+
+simdutf_warn_unused size_t implementation::count_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::count_code_points<endianness::LITTLE>(input, length); // delegates to the scalar counter
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::count_code_points<endianness::BIG>(input, length); // delegates to the scalar counter
+}
+
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+ return utf8::count_code_points(input, length); // NOTE(review): unqualified utf8::, presumably the generic/utf8.h version rather than scalar:: - confirm
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input,
+ length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input,
+ length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return scalar::utf8::utf16_length_from_utf8(input, length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ return scalar::utf32::utf8_length_from_utf32(input, length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+ const char32_t *input, size_t length) const noexcept {
+ return scalar::utf32::utf16_length_from_utf32(input, length); // delegates to the scalar length routine
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+ const char *input, size_t length) const noexcept {
+ return scalar::utf8::count_code_points(input, length); // the UTF-32 length is taken to be the scalar code-point count
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length); // delegates to the scalar routine (8-bit input overload)
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ // skip trailing spaces
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // white space is tolerated between the two '='
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ if (length == 0) { // nothing but white space and padding was supplied
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation}; // padding without any payload
+ }
+ return {SUCCESS, 0};
+ }
+ result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options); // the whole payload goes through the scalar decoder
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: padding is accepted only when r.count % 3 is 1 with two '=' or 2 with one '='
+ if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ }
+ return r;
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length); // delegates to the scalar routine (16-bit input overload)
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ // skip trailing spaces
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) {
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // white space is tolerated between the two '='
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ if (length == 0) { // nothing but white space and padding was supplied
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation}; // padding without any payload
+ }
+ return {SUCCESS, 0};
+ }
+ result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options); // the whole payload goes through the scalar decoder
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: padding is accepted only when r.count % 3 is 1 with two '=' or 2 with one '='
+ if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ }
+ return r;
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options); // delegates to the scalar routine
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+ char *output,
+ base64_options options) const noexcept {
+ return scalar::base64::tail_encode_base64(output, input, length, options); // scalar encoder; note it takes the destination first
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/ppc64/end.h"
diff --git a/contrib/simdutf/src/rvv/implementation.cpp b/contrib/simdutf/src/rvv/implementation.cpp
new file mode 100644
index 000000000..5ac745df2
--- /dev/null
+++ b/contrib/simdutf/src/rvv/implementation.cpp
@@ -0,0 +1,280 @@
+#include "scalar/latin1.h"
+#include "scalar/utf16.h"
+#include "scalar/utf8.h"
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+
+#include "scalar/utf16_to_utf8/utf16_to_utf8.h"
+#include "scalar/utf16_to_utf8/valid_utf16_to_utf8.h"
+
+#include "scalar/utf16_to_utf32/utf16_to_utf32.h"
+#include "scalar/utf16_to_utf32/valid_utf16_to_utf32.h"
+
+#include "scalar/utf32_to_utf8/utf32_to_utf8.h"
+#include "scalar/utf32_to_utf8/valid_utf32_to_utf8.h"
+
+#include "scalar/utf32_to_utf16/utf32_to_utf16.h"
+#include "scalar/utf32_to_utf16/valid_utf32_to_utf16.h"
+
+#include "simdutf/rvv/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_RVV_H
+ #error "rvv.h must be included"
+#endif
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+#include "rvv/rvv_helpers.inl.cpp"
+
+#include "rvv/rvv_length_from.inl.cpp"
+#include "rvv/rvv_validate.inl.cpp"
+
+#include "rvv/rvv_latin1_to.inl.cpp"
+#include "rvv/rvv_utf16_to.inl.cpp"
+#include "rvv/rvv_utf32_to.inl.cpp"
+#include "rvv/rvv_utf8_to.inl.cpp"
+
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+ size_t length) const noexcept { // returns the BOM encoding, or a bitmask of every encoding the buffer validates as
+ // If there is a BOM, then we trust it.
+ auto bom_encoding = simdutf::BOM::check_bom(input, length);
+ if (bom_encoding != encoding_type::unspecified)
+ return bom_encoding; // no validation is run in this case
+ // todo: reimplement as a one-pass algorithm.
+ int out = 0; // stays 0 when no candidate encoding validates
+ if (validate_utf8(input, length))
+ out |= encoding_type::UTF8;
+ if (length % 2 == 0) { // UTF-16 is only considered for an even byte count
+ if (validate_utf16(reinterpret_cast<const char16_t *>(input), length / 2))
+ out |= encoding_type::UTF16_LE;
+ }
+ if (length % 4 == 0) { // UTF-32 is only considered for a byte count divisible by four
+ if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4))
+ out |= encoding_type::UTF32_LE;
+ }
+
+ return out;
+}
+
+template <simdutf_ByteFlip bflip> // bflip selects which simdutf_byteflip variant is used
+simdutf_really_inline static void
+rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) {
+ for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { // strip-mined: each pass handles vl 16-bit units
+ vl = __riscv_vsetvl_e16m8(len); // number of 16-bit elements granted for this pass
+ vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+ __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip<bflip>(v, vl), vl); // store the byte-flipped units
+ }
+}
+
+void implementation::change_endianness_utf16(const char16_t *src, size_t len,
+ char16_t *dst) const noexcept {
+ if (supports_zvbb()) // ZVBB byte-flip variant when supports_zvbb() reports it, plain V variant otherwise
+ return rvv_change_endianness_utf16<simdutf_ByteFlip::ZVBB>(src, len, dst);
+ else
+ return rvv_change_endianness_utf16<simdutf_ByteFlip::V>(src, len, dst);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length); // delegates to the scalar routine (8-bit input overload)
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // skip trailing white space
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // white space is tolerated between the two '='
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ if (length == 0) { // nothing but white space and padding was supplied
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation}; // padding without any payload
+ }
+ return {SUCCESS, 0};
+ }
+ result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options); // the whole payload goes through the scalar decoder
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: padding is accepted only when r.count % 3 is 1 with two '=' or 2 with one '='
+ if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ }
+ return r;
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // skip trailing white space
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // white space is tolerated between the two '='
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ if (length == 0) { // nothing but white space and padding was supplied
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding without any payload; third field is 0
+ }
+ return {SUCCESS, 0, 0};
+ }
+ full_result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options); // the whole payload goes through the scalar decoder
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: padding is accepted only when r.output_count % 3 is 1 with two '=' or 2 with one '='
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; // the decoder's output count is still passed through
+ }
+ }
+ return r;
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept {
+ return scalar::base64::maximal_binary_length_from_base64(input, length); // delegates to the scalar routine (16-bit input overload)
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // skip trailing white space
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2); size_t like the sibling overloads
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // white space is tolerated between the two '='
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ if (length == 0) { // nothing but white space and padding was supplied
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation}; // padding without any payload
+ }
+ return {SUCCESS, 0};
+ }
+ result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options); // the whole payload goes through the scalar decoder
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: padding is accepted only when r.count % 3 is 1 with two '=' or 2 with one '='
+ if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation};
+ }
+ }
+ return r;
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+ const char16_t *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept {
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // skip trailing white space
+ length--;
+ }
+ size_t equallocation =
+ length; // location of the first padding character if any
+ size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ length -= 1;
+ equalsigns++;
+ while (length > 0 &&
+ scalar::base64::is_ascii_white_space(input[length - 1])) { // white space is tolerated between the two '='
+ length--;
+ }
+ if (length > 0 && input[length - 1] == '=') {
+ equallocation = length - 1;
+ equalsigns++;
+ length -= 1;
+ }
+ }
+ if (length == 0) { // nothing but white space and padding was supplied
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding without any payload; third field is 0
+ }
+ return {SUCCESS, 0, 0};
+ }
+ full_result r = scalar::base64::base64_tail_decode(
+ output, input, length, equalsigns, options, last_chunk_options); // the whole payload goes through the scalar decoder
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks: padding is accepted only when r.output_count % 3 is 1 with two '=' or 2 with one '='
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; // the decoder's output count is still passed through
+ }
+ }
+ return r;
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+ size_t length, base64_options options) const noexcept {
+ return scalar::base64::base64_length_from_binary(length, options); // delegates to the scalar routine
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+ char *output,
+ base64_options options) const noexcept {
+ return scalar::base64::tail_encode_base64(output, input, length, options); // scalar encoder; note it takes the destination first
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/rvv/end.h"
diff --git a/contrib/simdutf/src/rvv/rvv_helpers.inl.cpp b/contrib/simdutf/src/rvv/rvv_helpers.inl.cpp
new file mode 100644
index 000000000..dc1341847
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_helpers.inl.cpp
@@ -0,0 +1,23 @@
+template <simdutf_ByteFlip bflip> // bflip selects which simdutf_byteflip variant is applied before the store
+simdutf_really_inline static size_t
+rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl,
+ vbool4_t m4even) { // m4even: 16-bit-lane mask from the caller, presumably the low half of every 32-bit element - confirm
+ /* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
+ * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */
+ vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl);
+ sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl),
+ __riscv_vsrl_vx_u32m4(sur, 10, vl), vl);
+ sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl); // keep ten payload bits in each 16-bit half
+ sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl); // 0xD800 tag in the low half, 0xDC00 tag in the high half
+ /* merge single-unit utf32 and surrogate-pair sur */
+ vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl); // lanes above 0xFFFF take the surrogate pair
+ vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(
+ __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl));
+ /* compress and store */
+ vbool4_t mOut = __riscv_vmor_mm_b4(
+ __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2); // vl * 2: two 16-bit lanes per 32-bit element
+ vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2);
+ vl = __riscv_vcpop_m_b4(mOut, vl * 2); // vl now counts the 16-bit units that were kept
+ __riscv_vse16_v_u16m4(dst, simdutf_byteflip<bflip>(vout, vl), vl);
+ return vl; // number of 16-bit units written to dst
+}
diff --git a/contrib/simdutf/src/rvv/rvv_latin1_to.inl.cpp b/contrib/simdutf/src/rvv/rvv_latin1_to.inl.cpp
new file mode 100644
index 000000000..72603cf31
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_latin1_to.inl.cpp
@@ -0,0 +1,66 @@
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+ const char *src, size_t len, char *dst) const noexcept {
+ char *beg = dst; // kept to compute the number of bytes written
+ for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { // vl input bytes consumed, vlOut output bytes produced per pass
+ vl = __riscv_vsetvl_e8m2(len);
+ vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+ vbool4_t nascii =
+ __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl); // bytes with the high bit set (negative as int8)
+ size_t cnt = __riscv_vcpop_m_b4(nascii, vl);
+ vlOut = vl + cnt; // each non-ASCII byte becomes two output bytes
+ if (cnt == 0) { // all-ASCII pass: copy the bytes through unchanged
+ __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
+ continue;
+ }
+
+ vuint8m2_t v0 =
+ __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl); // candidate lead byte 110000xx (0xC0/0xC1 for ASCII lanes)
+ v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl); // non-ASCII lanes only: clear bit 6 to form 10xxxxxx
+
+ vuint8m4_t wide =
+ __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(
+ __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl)); // v0 + v1 + 0xFF * v1 = v0 + 256 * v1: v0 in the low byte, v1 in the high byte
+ vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(
+ __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2); // drops only bytes equal to 0xC0 or 0xC1, i.e. lead bytes of ASCII lanes
+ vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2);
+
+ __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut);
+ }
+ return dst - beg; // total bytes written
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ char16_t *beg = dst; // kept to compute the number of units written
+ for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { // one output unit per input byte
+ vl = __riscv_vsetvl_e8m4(len);
+ vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
+ __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl); // zero-extend each byte to 16 bits and store
+ }
+ return dst - beg; // number of char16_t units written
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ char16_t *beg = dst; // kept to compute the number of units written
+ for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { // one output unit per input byte
+ vl = __riscv_vsetvl_e8m4(len);
+ vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
+ __riscv_vse16_v_u16m8(
+ (uint16_t *)dst,
+ __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl); // zero-extend, then move the byte into the upper 8 bits of the lane
+ }
+ return dst - beg; // number of char16_t units written
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+ const char *src, size_t len, char32_t *dst) const noexcept {
+ char32_t *beg = dst; // kept to compute the number of units written
+ for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { // one output unit per input byte
+ vl = __riscv_vsetvl_e8m2(len);
+ vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+ __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl); // zero-extend each byte to 32 bits and store
+ }
+ return dst - beg; // number of char32_t units written
+}
diff --git a/contrib/simdutf/src/rvv/rvv_length_from.inl.cpp b/contrib/simdutf/src/rvv/rvv_length_from.inl.cpp
new file mode 100644
index 000000000..b0ffe0dd9
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_length_from.inl.cpp
@@ -0,0 +1,165 @@
+
+// Forwards to utf32_length_from_utf16le and returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::count_utf16le(const char16_t *src, size_t len) const noexcept {
+  const size_t counted = utf32_length_from_utf16le(src, len);
+  return counted;
+}
+
+// Forwards to utf32_length_from_utf16be and returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::count_utf16be(const char16_t *src, size_t len) const noexcept {
+  const size_t counted = utf32_length_from_utf16be(src, len);
+  return counted;
+}
+
+// Forwards to utf32_length_from_utf8 and returns its result unchanged.
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *src, size_t len) const noexcept {
+  const size_t counted = utf32_length_from_utf8(src, len);
+  return counted;
+}
+
+// The Latin-1 length is reported as whatever utf32_length_from_utf8 returns
+// for the same input.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *src, size_t len) const noexcept {
+  const size_t latin1_bytes = utf32_length_from_utf8(src, len);
+  return latin1_bytes;
+}
+
+// Returns len unchanged: one Latin-1 byte is reported per UTF-16 unit.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t len) const noexcept {
+  const size_t latin1_bytes = len;
+  return latin1_bytes;
+}
+
+// Returns len unchanged: one Latin-1 byte is reported per UTF-32 unit.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t len) const noexcept {
+  const size_t latin1_bytes = len;
+  return latin1_bytes;
+}
+
+// Returns len unchanged: one UTF-16 unit is reported per Latin-1 byte.
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t len) const noexcept {
+  const size_t utf16_units = len;
+  return utf16_units;
+}
+
+// Returns len unchanged: one UTF-32 unit is reported per Latin-1 byte.
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t len) const noexcept {
+  const size_t utf32_units = len;
+  return utf32_units;
+}
+
+// Counts the bytes of [src, src + len) that are greater than -65 when read as
+// signed 8-bit values, i.e. every byte outside 0x80..0xBF.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, chunk);
+    const vbool1_t not_continuation =
+        __riscv_vmsgt_vx_i8m8_b1(bytes, -65, chunk);
+    total += __riscv_vcpop_m_b1(not_continuation, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
+
+// Counts the 16-bit units (after simdutf_byteflip<bflip>) whose value lies
+// outside 0xDC00..0xDFFF.
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t
+rvv_utf32_length_from_utf16(const char16_t *src, size_t len) {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, chunk);
+    units = simdutf_byteflip<bflip>(units, chunk);
+    const vbool2_t above = __riscv_vmsgtu_vx_u16m8_b2(units, 0xDFFF, chunk);
+    const vbool2_t below = __riscv_vmsltu_vx_u16m8_b2(units, 0xDC00, chunk);
+    const vbool2_t counted = __riscv_vmor_mm_b2(above, below, chunk);
+    total += __riscv_vcpop_m_b2(counted, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *src, size_t len) const noexcept {
+  const size_t total =
+      rvv_utf32_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
+  return total;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf32_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_utf32_length_from_utf16<simdutf_ByteFlip::V>(src, len);
+}
+
+// Starts from len and adds one for every byte that is negative when read as a
+// signed 8-bit value (high bit set).
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *src, size_t len) const noexcept {
+  size_t total = len;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, chunk);
+    const vbool1_t high_bit = __riscv_vmslt_vx_i8m8_b1(bytes, 0, chunk);
+    total += __riscv_vcpop_m_b1(high_bit, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
+
+// Sums, per 16-bit unit (after simdutf_byteflip<bflip>): 1, plus 1 if the unit
+// is above 0x7F, plus 1 if it is above 0x7FF and outside 0xD800..0xDFFF.
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t
+rvv_utf8_length_from_utf16(const char16_t *src, size_t len) {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, chunk);
+    units = simdutf_byteflip<bflip>(units, chunk);
+
+    const vbool2_t over_7f = __riscv_vmsgtu_vx_u16m8_b2(units, 0x7F, chunk);
+    const vbool2_t over_7ff = __riscv_vmsgtu_vx_u16m8_b2(units, 0x7FF, chunk);
+    const vbool2_t under_d800 =
+        __riscv_vmsltu_vx_u16m8_b2(units, 0xD800, chunk);
+    const vbool2_t over_dfff = __riscv_vmsgtu_vx_u16m8_b2(units, 0xDFFF, chunk);
+    const vbool2_t not_surrogate =
+        __riscv_vmor_mm_b2(under_d800, over_dfff, chunk);
+    const vbool2_t three_bytes =
+        __riscv_vmand_mm_b2(over_7ff, not_surrogate, chunk);
+
+    total += chunk + __riscv_vcpop_m_b2(over_7f, chunk) +
+             __riscv_vcpop_m_b2(three_bytes, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *src, size_t len) const noexcept {
+  const size_t total =
+      rvv_utf8_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
+  return total;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf8_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_utf8_length_from_utf16<simdutf_ByteFlip::V>(src, len);
+}
+
+// Sums, per 32-bit unit: 1, plus 1 for each of the thresholds 0x7F, 0x7FF and
+// 0xFFFF that the unit exceeds.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t units = __riscv_vle32_v_u32m8((uint32_t *)src, chunk);
+    const vbool4_t over_7f = __riscv_vmsgtu_vx_u32m8_b4(units, 0x7F, chunk);
+    const vbool4_t over_7ff = __riscv_vmsgtu_vx_u32m8_b4(units, 0x7FF, chunk);
+    const vbool4_t over_ffff = __riscv_vmsgtu_vx_u32m8_b4(units, 0xFFFF, chunk);
+    total += chunk + __riscv_vcpop_m_b4(over_7f, chunk) +
+             __riscv_vcpop_m_b4(over_7ff, chunk) +
+             __riscv_vcpop_m_b4(over_ffff, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
+
+// Sums, per byte: 1 if it is greater than -65 as a signed value (outside
+// 0x80..0xBF), plus 1 if it is greater than 0xEF as an unsigned value.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, chunk);
+    const vbool1_t not_continuation =
+        __riscv_vmsgt_vx_i8m8_b1(bytes, -65, chunk);
+    const vbool1_t over_ef = __riscv_vmsgtu_vx_u8m8_b1(
+        __riscv_vreinterpret_u8m8(bytes), (uint8_t)0b11101111, chunk);
+    total += __riscv_vcpop_m_b1(not_continuation, chunk) +
+             __riscv_vcpop_m_b1(over_ef, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
+
+// Sums, per 32-bit unit: 1, plus 1 more when the unit exceeds 0xFFFF.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t units = __riscv_vle32_v_u32m8((uint32_t *)src, chunk);
+    const vbool4_t over_ffff = __riscv_vmsgtu_vx_u32m8_b4(units, 0xFFFF, chunk);
+    total += chunk + __riscv_vcpop_m_b4(over_ffff, chunk);
+    len -= chunk;
+    src += chunk;
+  }
+  return total;
+}
diff --git a/contrib/simdutf/src/rvv/rvv_utf16_to.inl.cpp b/contrib/simdutf/src/rvv/rvv_utf16_to.inl.cpp
new file mode 100644
index 000000000..de9831c19
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_utf16_to.inl.cpp
@@ -0,0 +1,393 @@
+#include <cstdio>
+
+// Narrows 16-bit units (after simdutf_byteflip<bflip>) to bytes at dst.
+// Stops at the first unit above 255 and returns result(TOO_LARGE, its index);
+// otherwise returns result(SUCCESS, number of units consumed).
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static result
+rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) {
+  const char16_t *const src_start = src;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, chunk);
+    units = simdutf_byteflip<bflip>(units, chunk);
+
+    const vbool2_t too_big = __riscv_vmsgtu_vx_u16m8_b2(units, 255, chunk);
+    const long bad = __riscv_vfirst_m_b2(too_big, chunk);
+    if (bad >= 0)
+      return result(error_code::TOO_LARGE, src - src_start + bad);
+
+    const vuint8m4_t narrowed = __riscv_vncvt_x_x_w_u8m4(units, chunk);
+    __riscv_vse8_v_u8m4((uint8_t *)dst, narrowed, chunk);
+    len -= chunk;
+    src += chunk;
+    dst += chunk;
+  }
+  return result(error_code::SUCCESS, src - src_start);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+ const char16_t *src, size_t len, char *dst) const noexcept {
+ result res = convert_utf16le_to_latin1_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+ const char16_t *src, size_t len, char *dst) const noexcept {
+ result res = convert_utf16be_to_latin1_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  const result outcome =
+      rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+  return outcome;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
+                                                                       dst)
+             : rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::V>(src, len,
+                                                                    dst);
+}
+
+// Narrows every 16-bit unit to its low byte with no range check.
+// Returns the number of units consumed.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  const char16_t *const src_start = src;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e16m8(len);
+    const vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, chunk);
+    const vuint8m4_t low_bytes = __riscv_vncvt_x_x_w_u8m4(units, chunk);
+    __riscv_vse8_v_u8m4((uint8_t *)dst, low_bytes, chunk);
+    len -= chunk;
+    src += chunk;
+    dst += chunk;
+  }
+  return src - src_start;
+}
+
+// Narrows every 16-bit unit to its upper byte (narrowing shift right by 8)
+// with no range check. Returns the number of units consumed.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  const char16_t *const src_start = src;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e16m8(len);
+    const vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, chunk);
+    const vuint8m4_t high_bytes = __riscv_vnsrl_wx_u8m4(units, 8, chunk);
+    __riscv_vse8_v_u8m4((uint8_t *)dst, high_bytes, chunk);
+    len -= chunk;
+    src += chunk;
+    dst += chunk;
+  }
+  return src - src_start;
+}
+
+template <simdutf_ByteFlip bflip> /*
+ * Transcodes `len` UTF-16 code units at `src` into UTF-8 bytes at `dst`.
+ * Every loaded vector and every scalar unit goes through
+ * simdutf_byteflip<bflip> before it is examined.
+ * Returns result(SUCCESS, number of bytes written to dst), or
+ * result(SURROGATE, index into src of the offending unit) when the scalar
+ * surrogate handling rejects a unit.
+ */
+simdutf_really_inline static result
+rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) {
+ size_t n = len; // code units still to convert
+ const char16_t *srcBeg = src;
+ const char *dstBeg = dst;
+ size_t vl8m4 = __riscv_vsetvlmax_e8m4();
+ vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( // set where byte index % 4 == 2
+ __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
+
+ for (size_t vl, vlOut; n > 0;) { // n/src/dst are advanced inside each branch
+ vl = __riscv_vsetvl_e16m2(n);
+
+ vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
+ v = simdutf_byteflip<bflip>(v, vl);
+ vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl); // unit > 0x7F
+
+ if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
+ vlOut = vl;
+ __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut),
+ vlOut);
+ n -= vl, src += vl, dst += vlOut;
+ continue;
+ }
+
+ vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl); // unit > 0x7FF
+
+ if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
+ /* 0: [ aaa|aabbbbbb]
+ * 1: [aabbbbbb| ] vsll 8
+ * 2: [ | aaaaa] vsrl 6
+ * 3: [00111111|00011111]
+ * 4: [ bbbbbb|000aaaaa] (1|2)&3
+ * 5: [10000000|11000000]
+ * 6: [10bbbbbb|110aaaaa] 4|5 */
+ vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
+ __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl),
+ __riscv_vsrl_vx_u16m2(v, 6, vl), vl),
+ 0b0011111100011111, vl);
+ vuint16m2_t vout16 = // masked OR: units outside m234 keep their value from v
+ __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl);
+ vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+
+ /* Every high byte that is zero should be compressed
+ * low bytes should never be compressed, so we set them
+ * to all ones, and then create a non-zero bytes mask */
+ vbool4_t mcomp =
+ __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
+ __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
+ 0, vl * 2);
+ vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); // number of bytes kept
+
+ vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
+ __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);
+
+ n -= vl, src += vl, dst += vlOut;
+ continue;
+ }
+
+ vbool8_t sur = __riscv_vmseq_vx_u16m2_b8( // unit in 0xD800..0xDFFF
+ __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl);
+ long first = __riscv_vfirst_m_b8(sur, vl); // index of first surrogate, negative if none
+ size_t tail = vl - first; // NOTE(review): looks nonzero in both cases (first < vl, or first negative) - confirm
+ vl = first < 0 ? vl : first; // vector path only covers the units before the first surrogate
+
+ if (vl > 0) { /* 1/2/3 byte utf8 */
+ /* in: [aaaabbbb|bbcccccc]
+ * v1: [0bcccccc| ] vsll 8
+ * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000
+ * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000
+ * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000
+ * v3: [ |1110aaaa] vsrl 12 | 0b11100000
+ * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
+ * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
+ * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
+ * [10cccccc]
+ */
+ vuint16m2_t v1, v2, v3, v12;
+ v1 = __riscv_vor_vx_u16m2_mu(
+ m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl);
+ v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
+
+ v2 = __riscv_vor_vx_u16m2(
+ __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111,
+ vl),
+ 0b10000000, vl);
+ v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, // only where unit <= 0x7FF
+ 0b01000000, vl);
+ v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000,
+ vl);
+ v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+
+ vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); // widen to 32 bits, shifted left by 8
+ vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); // add v3 only where unit > 0x7FF
+ vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+
+ vbool2_t mcomp = __riscv_vmor_mm_b2( // keep nonzero bytes, plus byte 2 of every 32-bit lane
+ m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
+ vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);
+
+ vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
+ __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);
+
+ n -= vl, src += vl, dst += vlOut;
+ }
+
+ if (tail)
+ while (n) { // scalar loop: converts surrogate pairs until a non-surrogate unit is seen
+ uint16_t word = simdutf_byteflip<bflip>(src[0]);
+ if ((word & 0xFF80) == 0) { // all three breaks hand non-surrogate units back to the vector path
+ break;
+ } else if ((word & 0xF800) == 0) {
+ break;
+ } else if ((word & 0xF800) != 0xD800) {
+ break;
+ } else {
+ // must be a surrogate pair
+ if (n <= 1)
+ return result(error_code::SURROGATE, src - srcBeg);
+ uint16_t diff = word - 0xD800;
+ if (diff > 0x3FF) // first unit is not in 0xD800..0xDBFF
+ return result(error_code::SURROGATE, src - srcBeg);
+ uint16_t diff2 = simdutf_byteflip<bflip>(src[1]) - 0xDC00;
+ if (diff2 > 0x3FF) // second unit is not in 0xDC00..0xDFFF
+ return result(error_code::SURROGATE, src - srcBeg);
+
+ uint32_t value = ((diff + 0x40) << 10) + diff2;
+
+ // will generate four UTF-8 bytes
+ // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+ *dst++ = (char)((value >> 18) | 0b11110000);
+ *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000);
+ *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000);
+ *dst++ = (char)((value & 0b111111) | 0b10000000);
+ src += 2;
+ n -= 2;
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, dst - dstBeg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+ const char16_t *src, size_t len, char *dst) const noexcept {
+ result res = convert_utf16le_to_utf8_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+ const char16_t *src, size_t len, char *dst) const noexcept {
+ result res = convert_utf16be_to_utf8_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  const result outcome =
+      rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+  return outcome;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
+                                                                     dst)
+             : rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::V>(src, len,
+                                                                  dst);
+}
+
+// The "valid input" variant reuses the checking converter unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf16le_to_utf8(src, len, dst);
+  return written;
+}
+
+// The "valid input" variant reuses the checking converter unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf16be_to_utf8(src, len, dst);
+  return written;
+}
+
+template <simdutf_ByteFlip bflip> /*
+ * Transcodes `len` UTF-16 code units at `src` into UTF-32 units at `dst`.
+ * Every loaded vector and every scalar unit goes through
+ * simdutf_byteflip<bflip> before it is examined.
+ * Returns result(SUCCESS, number of char32_t units written), or
+ * result(SURROGATE, index into src) when a surrogate is unpaired or out of
+ * order.
+ */
+simdutf_really_inline static result
+rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) {
+ const char16_t *const srcBeg = src;
+ char32_t *const dstBeg = dst;
+
+ constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800; // with value 0xd800: unit in 0xD800..0xDFFF
+ constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800;
+ constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00; // with value 0xdc00: unit in 0xDC00..0xDFFF
+ constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00;
+ constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00; // with value 0xd800: unit in 0xD800..0xDBFF
+ constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800;
+
+ uint16_t last = 0;
+ while (len > 0) {
+ size_t vl = __riscv_vsetvl_e16m2(len);
+ vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
+ v0 = simdutf_byteflip<bflip>(v0, vl);
+
+ { // check fast-path
+ const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl);
+ const vbool8_t any_surrogate =
+ __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl);
+ if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) {
+ /* no surrogates */
+ __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl),
+ vl);
+ len -= vl;
+ src += vl;
+ dst += vl;
+ continue;
+ }
+ }
+
+ if ((simdutf_byteflip<bflip>(src[0]) & LO_SURROGATE_MASK) == // chunk starts with a lo surrogate: error
+ LO_SURROGATE_VALUE) {
+ return result(error_code::SURROGATE, src - srcBeg);
+ }
+
+ // decode surrogates
+ vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl); // v1[i] = v0[i + 1], 0 shifted in at the end
+ vl = __riscv_vsetvl_e16m2(vl - 1); // drop the last unit so every processed unit has a successor in v1
+ if (vl == 0) { // the chunk held a single unit, and the fast path already found a surrogate in it
+ return result(error_code::SURROGATE, src - srcBeg);
+ }
+
+ const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8( // unit i is a hi surrogate
+ __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE,
+ vl);
+ const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8( // unit i + 1 is a lo surrogate
+ __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
+ vl);
+
+ // compress everything but lo surrogates
+ const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8(
+ __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
+ vl);
+
+ {
+ const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); // set where "is hi" and "next is lo" disagree
+ const long idx = __riscv_vfirst_m_b8(diff, vl);
+ if (idx >= 0) {
+ uint16_t word = simdutf_byteflip<bflip>(src[idx]);
+ if (word < 0xD800 || word > 0xDBFF) { // unit idx is not a hi surrogate: report the unit after it
+ return result(error_code::SURROGATE, src - srcBeg + idx + 1);
+ }
+ return result(error_code::SURROGATE, src - srcBeg + idx);
+ }
+ }
+
+ last = simdutf_byteflip<bflip>(src[vl]); // the unit that was dropped from this pass
+ vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl);
+
+ // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate
+ // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate
+
+ // t0 = u16( 0000_00yy_yyyy_yyyy)
+ const vuint32m4_t t0 =
+ __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl);
+ // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000)
+ const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl);
+
+ // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx)
+ const vuint32m4_t t2 =
+ __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl);
+
+ // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx)
+ const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl);
+
+ // t4 = utf32 from surrogate pairs
+ const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl);
+
+ const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl); // t4 where surhi, else the zero-extended unit; NOTE(review): this local shadows the `result` type until the end of the loop body
+
+ const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl);
+ const size_t vlOut = __riscv_vcpop_m_b8(compress, vl);
+ __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut);
+
+ len -= vl;
+ src += vl;
+ dst += vlOut;
+
+ if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) {
+ // last item is lo surrogate and got already consumed
+ len -= 1;
+ src += 1;
+ }
+ }
+
+ return result(error_code::SUCCESS, dst - dstBeg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+ const char16_t *src, size_t len, char32_t *dst) const noexcept {
+ result res = convert_utf16le_to_utf32_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+ const char16_t *src, size_t len, char32_t *dst) const noexcept {
+ result res = convert_utf16be_to_utf32_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  const result outcome =
+      rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+  return outcome;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
+                                                                      dst)
+             : rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::V>(src, len,
+                                                                   dst);
+}
+
+// The "valid input" variant reuses the checking converter unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  const size_t written = convert_utf16le_to_utf32(src, len, dst);
+  return written;
+}
+
+// The "valid input" variant reuses the checking converter unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  const size_t written = convert_utf16be_to_utf32(src, len, dst);
+  return written;
+}
diff --git a/contrib/simdutf/src/rvv/rvv_utf32_to.inl.cpp b/contrib/simdutf/src/rvv/rvv_utf32_to.inl.cpp
new file mode 100644
index 000000000..4d1afcc38
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_utf32_to.inl.cpp
@@ -0,0 +1,289 @@
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+ const char32_t *src, size_t len, char *dst) const noexcept {
+ result res = convert_utf32_to_latin1_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+// Narrows 32-bit units to bytes at dst. Stops at the first unit above 255 and
+// returns result(TOO_LARGE, its index); otherwise returns
+// result(SUCCESS, number of units consumed).
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *src, size_t len, char *dst) const noexcept {
+  const char32_t *const src_start = src;
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t units = __riscv_vle32_v_u32m8((uint32_t *)src, chunk);
+
+    const vbool4_t too_big = __riscv_vmsgtu_vx_u32m8_b4(units, 255, chunk);
+    const long bad = __riscv_vfirst_m_b4(too_big, chunk);
+    if (bad >= 0)
+      return result(error_code::TOO_LARGE, src - src_start + bad);
+
+    // Two narrowing steps (32 -> 16 -> 8 bits) are used rather than
+    // vcompress, whose performance varies widely on current platforms; this
+    // may be worth revisiting once more hardware is available.
+    const vuint16m4_t halves = __riscv_vncvt_x_x_w_u16m4(units, chunk);
+    const vuint8m2_t bytes = __riscv_vncvt_x_x_w_u8m2(halves, chunk);
+    __riscv_vse8_v_u8m2((uint8_t *)dst, bytes, chunk);
+
+    len -= chunk;
+    src += chunk;
+    dst += chunk;
+  }
+  return result(error_code::SUCCESS, src - src_start);
+}
+
+// The "valid input" variant reuses the checking converter unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf32_to_latin1(src, len, dst);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *src, size_t len, char *dst) const noexcept { /*
+ * Transcodes `len` UTF-32 units at `src` into UTF-8 bytes at `dst`.
+ * Returns result(SUCCESS, number of bytes written to dst),
+ * result(TOO_LARGE, index into src) for a unit above 0x10FFFF, or
+ * result(SURROGATE, index into src) for a unit in 0xD800..0xDFFF.
+ */
+ size_t n = len; // units still to convert
+ const char32_t *srcBeg = src;
+ const char *dstBeg = dst;
+ size_t vl8m4 = __riscv_vsetvlmax_e8m4();
+ vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( // set where byte index % 4 == 2
+ __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
+
+ for (size_t vl, vlOut; n > 0;) { // n/src/dst are advanced inside each branch
+ vl = __riscv_vsetvl_e32m4(n);
+
+ vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl);
+ vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl); // unit > 0x7F
+ vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl); // units narrowed to 16 bits
+
+ if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
+ vlOut = vl;
+ __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut),
+ vlOut);
+ n -= vl, src += vl, dst += vlOut;
+ continue;
+ }
+
+ vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl); // unit > 0x7FF
+
+ if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
+ /* 0: [ aaa|aabbbbbb]
+ * 1: [aabbbbbb| ] vsll 8
+ * 2: [ | aaaaa] vsrl 6
+ * 3: [00111111|00111111]
+ * 4: [ bbbbbb|000aaaaa] (1|2)&3
+ * 5: [10000000|11000000]
+ * 6: [10bbbbbb|110aaaaa] 4|5 */
+ vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
+ __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl),
+ __riscv_vsrl_vx_u16m2(vn, 6, vl), vl),
+ 0b0011111100111111, vl);
+ vuint16m2_t vout16 = // masked OR: units outside m234 keep their value from vn
+ __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
+ vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+
+ /* Every high byte that is zero should be compressed
+ * low bytes should never be compressed, so we set them
+ * to all ones, and then create a non-zero bytes mask */
+ vbool4_t mcomp =
+ __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
+ __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
+ 0, vl * 2);
+ vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); // number of bytes kept
+
+ vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
+ __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);
+
+ n -= vl, src += vl, dst += vlOut;
+ continue;
+ }
+ long idx1 = // first unit above 0x10FFFF, negative if none
+ __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
+ vbool8_t sur = __riscv_vmseq_vx_u32m4_b8( // unit in 0xD800..0xDFFF
+ __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
+ long idx2 = __riscv_vfirst_m_b8(sur, vl); // first surrogate, negative if none
+ if (idx1 >= 0 && idx2 >= 0) { // both kinds present: report whichever comes first
+ if (idx1 <= idx2) {
+ return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+ } else {
+ return result(error_code::SURROGATE, src - srcBeg + idx2);
+ }
+ }
+ if (idx1 >= 0) {
+ return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+ }
+ if (idx2 >= 0) {
+ return result(error_code::SURROGATE, src - srcBeg + idx2);
+ }
+
+ vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl); // unit > 0xFFFF
+ long first = __riscv_vfirst_m_b8(m4, vl); // index of first such unit, negative if none
+ size_t tail = vl - first; // NOTE(review): looks nonzero in both cases (first < vl, or first negative) - confirm
+ vl = first < 0 ? vl : first; // vector path only covers the units before the first one above 0xFFFF
+
+ if (vl > 0) { /* 1/2/3 byte utf8 */
+ /* vn: [aaaabbbb|bbcccccc]
+ * v1: [0bcccccc| ] vsll 8
+ * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000
+ * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000
+ * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000
+ * v3: [ |1110aaaa] vsrl 12 | 0b11100000
+ * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
+ * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
+ * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
+ * [10cccccc]
+ */
+ vuint16m2_t v1, v2, v3, v12;
+ v1 = __riscv_vor_vx_u16m2_mu(
+ m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl);
+ v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
+
+ v2 = __riscv_vor_vx_u16m2(
+ __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111,
+ vl),
+ 0b10000000, vl);
+ v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, // only where unit <= 0x7FF
+ 0b01000000, vl);
+ v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000,
+ vl);
+ v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+
+ vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); // widen to 32 bits, shifted left by 8
+ vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); // add v3 only where unit > 0x7FF
+ vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+
+ vbool2_t mcomp = __riscv_vmor_mm_b2( // keep nonzero bytes, plus byte 2 of every 32-bit lane
+ m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
+ vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);
+
+ vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
+ __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);
+
+ n -= vl, src += vl, dst += vlOut;
+ }
+
+ if (tail)
+ while (n) { // scalar loop: emits 4-byte sequences until a unit below 0x10000 is seen
+ uint32_t word = src[0];
+ if (word < 0x10000) // hand the unit back to the vector path
+ break;
+ if (word > 0x10FFFF)
+ return result(error_code::TOO_LARGE, src - srcBeg);
+ *dst++ = (uint8_t)((word >> 18) | 0b11110000);
+ *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000);
+ *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000);
+ *dst++ = (uint8_t)((word & 0b111111) | 0b10000000);
+ ++src;
+ --n;
+ }
+ }
+
+ return result(error_code::SUCCESS, dst - dstBeg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+ const char32_t *src, size_t len, char *dst) const noexcept {
+ result res = convert_utf32_to_utf8_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+// The "valid input" variant reuses the checking converter unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf32_to_utf8(src, len, dst);
+  return written;
+}
+
+// Converts UTF-32 units at src to UTF-16 units at dst.
+// Returns result(SUCCESS, number of char16_t units written),
+// result(TOO_LARGE, index) for a unit above 0x10FFFF, or
+// result(SURROGATE, index) for a unit in 0xD800..0xDFFF. When a chunk holds
+// both kinds of error, the one with the lower index is reported.
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static result
+rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len,
+                                       char16_t *dst) {
+  const size_t max_bytes = __riscv_vsetvlmax_e8m2();
+  // Mask with every even byte index set; handed to the store helper.
+  const vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
+      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(max_bytes), 1, max_bytes), 0,
+      max_bytes);
+  const char16_t *const dst_start = dst;
+  const char32_t *const src_start = src;
+
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e32m4(len);
+    const vuint32m4_t units = __riscv_vle32_v_u32m4((uint32_t *)src, chunk);
+
+    // Adding 0xFFFF2000 wraps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF, so
+    // a single unsigned compare detects surrogates.
+    const vuint32m4_t shifted = __riscv_vadd_vx_u32m4(units, 0xFFFF2000, chunk);
+    const long too_large = __riscv_vfirst_m_b8(
+        __riscv_vmsgtu_vx_u32m4_b8(units, 0x10FFFF, chunk), chunk);
+    const long surrogate = __riscv_vfirst_m_b8(
+        __riscv_vmsgtu_vx_u32m4_b8(shifted, 0xFFFFF7FF, chunk), chunk);
+
+    if (too_large >= 0 && (surrogate < 0 || too_large <= surrogate))
+      return result(error_code::TOO_LARGE, src - src_start + too_large);
+    if (surrogate >= 0)
+      return result(error_code::SURROGATE, src - src_start + surrogate);
+
+    const long wide = __riscv_vfirst_m_b8(
+        __riscv_vmsgtu_vx_u32m4_b8(units, 0xFFFF, chunk), chunk);
+    size_t produced;
+    if (wide < 0) {
+      // No unit exceeds 0xFFFF: narrow to 16 bits and store directly.
+      produced = chunk;
+      const vuint16m2_t narrow = simdutf_byteflip<bflip>(
+          __riscv_vncvt_x_x_w_u16m2(units, produced), produced);
+      __riscv_vse16_v_u16m2((uint16_t *)dst, narrow, produced);
+    } else {
+      produced =
+          rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, units, chunk, m4even);
+    }
+
+    len -= chunk;
+    src += chunk;
+    dst += produced;
+  }
+  return result(error_code::SUCCESS, dst - dst_start);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+ const char32_t *src, size_t len, char16_t *dst) const noexcept {
+ result res = convert_utf32_to_utf16le_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+ const char32_t *src, size_t len, char16_t *dst) const noexcept {
+ result res = convert_utf32_to_utf16be_with_errors(src, len, dst);
+ return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  const result outcome =
+      rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len,
+                                                                     dst);
+  return outcome;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  return supports_zvbb()
+             ? rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::ZVBB>(
+                   src, len, dst)
+             : rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::V>(
+                   src, len, dst);
+}
+
+// Converts UTF-32 units at src to UTF-16 units at dst with no range or
+// surrogate checks. Returns the number of char16_t units written.
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t
+rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len,
+                                 char16_t *dst) {
+  const size_t max_bytes = __riscv_vsetvlmax_e8m2();
+  // Mask with every even byte index set; handed to the store helper.
+  const vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
+      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(max_bytes), 1, max_bytes), 0,
+      max_bytes);
+  char16_t *const dst_start = dst;
+
+  while (len > 0) {
+    const size_t chunk = __riscv_vsetvl_e32m4(len);
+    const vuint32m4_t units = __riscv_vle32_v_u32m4((uint32_t *)src, chunk);
+    const long wide = __riscv_vfirst_m_b8(
+        __riscv_vmsgtu_vx_u32m4_b8(units, 0xFFFF, chunk), chunk);
+
+    size_t produced;
+    if (wide < 0) {
+      // No unit exceeds 0xFFFF: narrow to 16 bits and store directly.
+      produced = chunk;
+      const vuint16m2_t narrow = simdutf_byteflip<bflip>(
+          __riscv_vncvt_x_x_w_u16m2(units, produced), produced);
+      __riscv_vse16_v_u16m2((uint16_t *)dst, narrow, produced);
+    } else {
+      produced =
+          rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, units, chunk, m4even);
+    }
+
+    len -= chunk;
+    src += chunk;
+    dst += produced;
+  }
+  return dst - dst_start;
+}
+
+// Little-endian entry point: runs the shared kernel with no byte flip.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  const size_t written =
+      rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::NONE>(src, len, dst);
+  return written;
+}
+
+// Big-endian entry point: picks the ZVBB byte-flip variant when
+// supports_zvbb() reports it, otherwise the plain vector variant.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  return supports_zvbb()
+             ? rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::ZVBB>(src, len,
+                                                                        dst)
+             : rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::V>(src, len,
+                                                                     dst);
+}
diff --git a/contrib/simdutf/src/rvv/rvv_utf8_to.inl.cpp b/contrib/simdutf/src/rvv/rvv_utf8_to.inl.cpp
new file mode 100644
index 000000000..0860d1fe6
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_utf8_to.inl.cpp
@@ -0,0 +1,430 @@
+template <typename Tdst, simdutf_ByteFlip bflip, bool validate = true> // Tdst: uint16_t or uint32_t output unit; validate=false skips the error checks
+simdutf_really_inline static size_t rvv_utf8_to_common(char const *src,
+ size_t len, Tdst *dst) {
+ static_assert(std::is_same<Tdst, uint16_t>() ||
+ std::is_same<Tdst, uint32_t>(),
+ "invalid type");
+ constexpr bool is16 = std::is_same<Tdst, uint16_t>();
+ constexpr endianness endian =
+ bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG;
+ const auto scalar = [](char const *in, size_t count, Tdst *out) { // scalar fallback: UTF-16 or UTF-32 converter depending on is16
+ return is16 ? scalar::utf8_to_utf16::convert<endian>(in, count,
+ (char16_t *)out)
+ : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out);
+ };
+
+ if (len < 32) // short inputs go straight to the scalar converter
+ return scalar(src, len, dst);
+
+ /* validate the first three bytes (plus any continuation bytes directly following them) with the scalar validator */
+ if (validate) {
+ size_t idx = 3;
+ while (idx < len && (src[idx] >> 6) == 0b10) // NOTE(review): only matches continuation bytes if plain char is unsigned - presumably true for the RISC-V targets; confirm
+ ++idx;
+ if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
+ return 0; // 0 is this routine's failure value
+ }
+
+ size_t tail = 3; // the last 3 bytes are left to the scalar pass after the loop
+ size_t n = len - tail; // number of bytes consumed by the vector loop
+ Tdst *beg = dst;
+
+ static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; // 16-byte tables indexed by nibbles in the validate step below
+ static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
+ static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};
+
+ const vuint8m1_t err1tbl =
+ __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
+ const vuint8m1_t err2tbl =
+ __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
+ const vuint8m1_t err3tbl =
+ __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
+
+ size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+ vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( // mask: (element index & 1) == 0
+ __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
+
+ for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) { // vl input bytes consumed, vlOut output units written per iteration
+ vl = __riscv_vsetvl_e8m2(n);
+
+ vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl);
+ uint64_t max = __riscv_vmv_x_s_u8m1_u8( // largest byte value in this chunk
+ __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl));
+
+ uint8_t next0 = src[vl + 0]; // three look-ahead bytes; in bounds because vl <= n == len - 3
+ uint8_t next1 = src[vl + 1];
+ uint8_t next2 = src[vl + 2];
+
+ /* fast path: ASCII */
+ if ((max | next0 | next1 | next2) < 0b10000000) {
+ vlOut = vl; // one output unit per input byte
+ if (is16)
+ __riscv_vse16_v_u16m4(
+ (uint16_t *)dst,
+ simdutf_byteflip<bflip>(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut),
+ vlOut);
+ else
+ __riscv_vse32_v_u32m8((uint32_t *)dst,
+ __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut);
+ continue;
+ }
+
+ /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
+ * https://arxiv.org/abs/2010.03090 */
+ vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); // v1..v3: input shifted down by 1..3 bytes, look-ahead bytes shifted in
+ vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl);
+ vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl);
+
+ if (validate) {
+ vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2( // 16-bit shift by 4; the & 0xF below keeps each byte's high nibble
+ __riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2()));
+ vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(
+ __riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2()));
+
+ vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); // low nibble of v2
+ vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl); // high nibble of v2
+ vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl); // high nibble of v3
+
+ vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1);
+ vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2);
+ vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3);
+ vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2( // AND of the three table lookups, viewed as signed bytes
+ __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl));
+
+ vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl); // v1 byte >= 0xE0
+ vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl); // v0 byte >= 0xF0
+ vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
+ vbool4_t err34 =
+ __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); // is_34 must agree with the sign bit of errs
+ vbool4_t errm =
+ __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl);
+ if (__riscv_vfirst_m_b4(errm, vl) >= 0) // any flagged element: report failure
+ return 0;
+ }
+
+ /* decoding */
+
+ /* mask of non continuation bytes */
+ vbool4_t m =
+ __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl);
+ vlOut = __riscv_vcpop_m_b4(m, vl); // number of characters that start in this chunk
+
+ /* extract first and second bytes */
+ vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
+ vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);
+
+ /* fast path: one and two byte */
+ if (max < 0b11100000) {
+ b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
+
+ vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); // m1: first byte is a multi-byte lead (> 0xBF)
+ b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
+
+ vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( // widen b1, multiplying by 64 where m1 is set and by 1 elsewhere
+ b1,
+ __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
+ vlOut),
+ vlOut);
+ b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); // add the second byte's payload where m1 is set
+ if (is16)
+ __riscv_vse16_v_u16m4((uint16_t *)dst,
+ simdutf_byteflip<bflip>(b12, vlOut), vlOut);
+ else
+ __riscv_vse32_v_u32m8((uint32_t *)dst,
+ __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
+ continue;
+ }
+
+ /* fast path: one, two and three byte */
+ if (max < 0b11110000) {
+ vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
+
+ b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
+ b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);
+
+ vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); // m1: first byte > 0xBF (multi-byte lead)
+ vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); // m3: first byte > 0xDF (three-byte lead on this path)
+
+ vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
+ b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut);
+
+ vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
+ b1,
+ __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
+ vlOut),
+ vlOut);
+ b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
+ vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu( // where m3 is set: shift left by 6 and add the third byte's payload
+ m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
+ if (is16)
+ __riscv_vse16_v_u16m4((uint16_t *)dst,
+ simdutf_byteflip<bflip>(b123, vlOut), vlOut);
+ else
+ __riscv_vse32_v_u32m8((uint32_t *)dst,
+ __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
+ continue;
+ }
+
+ /* extract third and fourth bytes */
+ vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
+ vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);
+
+ /* remove prefix from leading bytes
+ *
+ * We could also use vrgather here, but it increases register pressure,
+ * and its performance varies widely on current platforms. It might be
+ * worth reconsidering, though, once there is more hardware available.
+ * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
+ *
+ * We shift left and then right by the number of bits in the prefix,
+ * which can be calculated as follows:
+ * x max(x-10, 0)
+ * 0xxx -> 0000-0111 -> shift by 0 or 1 -> 0
+ * 10xx -> 1000-1011 -> don't care
+ * 110x -> 1100,1101 -> shift by 3 -> 2,3
+ * 1110 -> 1110 -> shift by 4 -> 4
+ * 1111 -> 1111 -> shift by 5 -> 5
+ *
+ * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
+ * just need to manually detect and handle the one special case:
+ */
+#define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \
+ vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \
+ vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \
+ vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \
+ vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \
+ /* remove prefix from trailing bytes */ \
+ c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \
+ c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \
+ c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \
+ vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \
+ shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, \
+ __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), \
+ vlOut); \
+ c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \
+ c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \
+ /* unconditionally widen and combine to c1234 */ \
+ vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2( \
+ __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut); \
+ vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2( \
+ __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut); \
+ vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4( \
+ __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \
+ /* derive required right-shift amount from `shift` to reduce \
+ * c1234 to the required number of bytes */ \
+ c1234 = __riscv_vsrl_vv_u32m4( \
+ c1234, \
+ __riscv_vzext_vf4_u32m4( \
+ __riscv_vmul_vx_u8m1( \
+ __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), \
+ 3, vlOut), \
+ 6, vlOut), \
+ vlOut), \
+ vlOut); \
+ /* store result in desired format */ \
+ if (is16) \
+ vlDst = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, c1234, vlOut, \
+ m4even); \
+ else \
+ vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut);
+
+ /* Unrolling this manually reduces register pressure and allows
+ * us to terminate early. */
+ {
+ size_t vlOutm2 = vlOut, vlDst; // vlOutm2: total characters in the chunk; vlDst: units stored by the macro
+ vlOut = __riscv_vsetvl_e8m1(vlOut); // characters handled by the first (m1) half
+ SIMDUTF_RVV_UTF8_TO_COMMON_M1(0)
+ if (vlOutm2 == vlOut) { // everything fit into the first half
+ vlOut = vlDst; // the loop header advances dst by vlOut
+ continue;
+ }
+
+ dst += vlDst;
+ vlOut = vlOutm2 - vlOut; // characters remaining for the second half
+ }
+ {
+ size_t vlDst;
+ SIMDUTF_RVV_UTF8_TO_COMMON_M1(1)
+ vlOut = vlDst; // the loop header advances dst by vlOut
+ }
+
+#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1
+ }
+
+ /* validate the last character and reparse it + tail */
+ if (len > tail) {
+ if ((src[0] >> 6) == 0b10) // src points into the middle of a character: drop its already-written output unit
+ --dst;
+ while ((src[0] >> 6) == 0b10 && tail < len) // step back to the start of that character, growing the scalar tail
+ --src, ++tail;
+ if (is16) {
+ /* go back one more, when on high surrogate */
+ if (simdutf_byteflip<bflip>((uint16_t)dst[-1]) >= 0xD800 &&
+ simdutf_byteflip<bflip>((uint16_t)dst[-1]) <= 0xDBFF)
+ --dst;
+ }
+ }
+ size_t ret = scalar(src, tail, dst); // scalar conversion of the remaining tail bytes
+ if (ret == 0) // a zero result from the scalar converter is propagated as failure
+ return 0;
+ return (size_t)(dst - beg) + ret; // total output units: vector part plus scalar tail
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+ const char *src, size_t len, char *dst) const noexcept {
+ const char *beg = dst;
+ uint8_t last = 0; // final input byte of the previous chunk (0 before the first chunk)
+ for (size_t vl, vlOut; len > 0;
+ len -= vl, src += vl, dst += vlOut, last = src[-1]) {
+ vl = __riscv_vsetvl_e8m2(len);
+ vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+ // check which bytes are ASCII
+ vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
+ // count ASCII bytes
+ vlOut = __riscv_vcpop_m_b4(ascii, vl);
+ // The original code would only enter the next block after this check:
+ // vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+ // vlOut = __riscv_vcpop_m_b4(m, vl);
+ // if (vlOut != vl || last > 0b01111111) {...}
+ // So when everything was ASCII or continuation bytes, it just proceeded
+ // without any processing, going straight to __riscv_vse8_v_u8m2.
+ // But you need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII
+ // byte.
+ if (vlOut != vl) { // If not pure ASCII
+ // Non-ASCII characters
+ // We now want to mark the ascii and continuation bytes
+ vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+ // We count them, that's our new vlOut (output vector length)
+ vlOut = __riscv_vcpop_m_b4(m, vl);
+
+ vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); // v0[i] is the byte preceding v1[i]
+
+ vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl); // previous byte is a leading byte (> 0xBF)
+ vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4( // current byte, as signed, is below the converted 0b11000000, i.e. in 0x80..0xBF
+ __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
+ // -62 is 0b11000010 (0xC2): flag leading bytes in v0 other than 0xC2/0xC3
+ vbool4_t tobig = __riscv_vmand_mm_b4(
+ leading0,
+ __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl),
+ 1, vl),
+ vl);
+ if (__riscv_vfirst_m_b4( // error if a lead is too big, or lead/continuation pairing does not match
+ __riscv_vmor_mm_b4(
+ tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl),
+ vl) >= 0)
+ return 0;
+
+ v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), // after a 0xC3 lead, set bit 6 of the continuation byte
+ v1, v1, 0b01000000, vl);
+ v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); // keep only the ASCII and continuation positions
+ } else if (last >= 0b11000000) { // If last byte is a leading byte and we
+ // got only ASCII, error!
+ return 0;
+ }
+ __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
+ }
+ if (last > 0b10111111) // input ended on a leading byte
+ return 0;
+ return dst - beg; // number of Latin-1 bytes written
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+ const char *src, size_t len, char *dst) const noexcept {
+ size_t res = convert_utf8_to_latin1(src, len, dst); // try the vector path first
+ if (res) // non-zero count: report success
+ return result(error_code::SUCCESS, res);
+ return scalar::utf8_to_latin1::convert_with_errors(src, len, dst); // zero result: rerun with the scalar routine to build the result
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+ const char *src, size_t len, char *dst) const noexcept {
+ const char *beg = dst;
+ uint8_t last = 0; // final input byte of the previous chunk (0 before the first chunk)
+ for (size_t vl, vlOut; len > 0;
+ len -= vl, src += vl, dst += vlOut, last = src[-1]) {
+ vl = __riscv_vsetvl_e8m2(len);
+ vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+ vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
+ vlOut = __riscv_vcpop_m_b4(ascii, vl); // number of ASCII bytes in the chunk
+ if (vlOut != vl) { // If not pure ASCII
+ vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); // bytes below 0xC0: ASCII and continuation bytes
+ vlOut = __riscv_vcpop_m_b4(m, vl); // one output byte per such position
+ vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); // v0[i] is the byte preceding v1[i]
+ v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), // after a 0xC3 lead, set bit 6 of the continuation byte
+ v1, v1, 0b01000000, vl);
+ v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); // drop the leading bytes
+ }
+ __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
+ }
+ return dst - beg; // number of Latin-1 bytes written
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE>(src, len, // LE variant: ByteFlip::NONE, validate left at its default (true)
+ (uint16_t *)dst);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ if (supports_zvbb()) // runtime dispatch: ZVBB byte-flip variant when supports_zvbb() is true
+ return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB>(
+ src, len, (uint16_t *)dst);
+ else // otherwise the simdutf_ByteFlip::V byte-flip variant
+ return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V>(src, len,
+ (uint16_t *)dst);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ size_t res = convert_utf8_to_utf16le(src, len, dst); // try the vector path first
+ if (res) // non-zero count: report success
+ return result(error_code::SUCCESS, res);
+ return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>( // zero result: rerun with the scalar routine to build the result
+ src, len, dst);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ size_t res = convert_utf8_to_utf16be(src, len, dst); // try the vector path first
+ if (res) // non-zero count: report success
+ return result(error_code::SUCCESS, res);
+ return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(src, len, // zero result: rerun with the scalar routine to build the result
+ dst);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE, false>( // validate=false: input is taken as valid UTF-8
+ src, len, (uint16_t *)dst);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+ const char *src, size_t len, char16_t *dst) const noexcept {
+ if (supports_zvbb()) // runtime dispatch on supports_zvbb(); both branches pass validate=false
+ return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB, false>(
+ src, len, (uint16_t *)dst);
+ else // otherwise the simdutf_ByteFlip::V byte-flip variant
+ return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V, false>(
+ src, len, (uint16_t *)dst);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+ const char *src, size_t len, char32_t *dst) const noexcept {
+ return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE>(src, len, // 32-bit output units, validate left at its default (true)
+ (uint32_t *)dst);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *src, size_t len, char32_t *dst) const noexcept {
+ size_t res = convert_utf8_to_utf32(src, len, dst); // try the vector path first
+ if (res) // non-zero count: report success
+ return result(error_code::SUCCESS, res);
+ return scalar::utf8_to_utf32::convert_with_errors(src, len, dst); // zero result: rerun with the scalar routine to build the result
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+ const char *src, size_t len, char32_t *dst) const noexcept {
+ return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE, false>( // validate=false: input is taken as valid UTF-8
+ src, len, (uint32_t *)dst);
+}
diff --git a/contrib/simdutf/src/rvv/rvv_validate.inl.cpp b/contrib/simdutf/src/rvv/rvv_validate.inl.cpp
new file mode 100644
index 000000000..89510341f
--- /dev/null
+++ b/contrib/simdutf/src/rvv/rvv_validate.inl.cpp
@@ -0,0 +1,228 @@
+
+
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *src, size_t len) const noexcept {
+ size_t vlmax = __riscv_vsetvlmax_e8m8();
+ vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax); // running bitwise OR of all input bytes, per lane
+ for (size_t vl; len > 0; len -= vl, src += vl) {
+ vl = __riscv_vsetvl_e8m8(len);
+ vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+ mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl); // _tu: lanes past vl keep their accumulated value
+ }
+ return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) < // true when no accumulated lane is negative (high bit set)
+ 0;
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *src, size_t len) const noexcept {
+ const char *beg = src;
+ for (size_t vl; len > 0; len -= vl, src += vl) {
+ vl = __riscv_vsetvl_e8m8(len);
+ vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+ long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); // index of the first negative (high-bit) byte in the chunk, or negative if none
+ if (idx >= 0)
+ return result(error_code::TOO_LARGE, src - beg + idx); // offset of the offending byte from the start of the input
+ }
+ return result(error_code::SUCCESS, src - beg); // all len bytes were scanned
+}
+
+/* Returns a close estimate of the number of valid UTF-8 bytes up to the
+ * first invalid one, but never overestimates. */
+simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src,
+ size_t len) {
+ const char *beg = src;
+ if (len < 32) // short inputs: report 0 so the callers validate everything with the scalar routine
+ return 0;
+
+ /* validate the first three bytes (plus any continuation bytes directly following them) with the scalar validator */
+ {
+ size_t idx = 3;
+ while (idx < len && (src[idx] >> 6) == 0b10) // NOTE(review): only matches continuation bytes if plain char is unsigned - presumably true for the RISC-V targets; confirm
+ ++idx;
+ if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
+ return 0;
+ }
+
+ static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; // 16-byte tables indexed by nibbles in the loop below
+ static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
+ static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};
+
+ const vuint8m1_t err1tbl =
+ __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
+ const vuint8m1_t err2tbl =
+ __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
+ const vuint8m1_t err3tbl =
+ __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
+
+ size_t tail = 3; // the last 3 bytes are not consumed by the vector loop
+ size_t n = len - tail;
+
+ for (size_t vl; n > 0; n -= vl, src += vl) {
+ vl = __riscv_vsetvl_e8m4(n);
+ vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl);
+
+ uint8_t next0 = src[vl + 0]; // three look-ahead bytes; in bounds because vl <= n == len - 3
+ uint8_t next1 = src[vl + 1];
+ uint8_t next2 = src[vl + 2];
+
+ /* fast path: ASCII */
+ if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) <
+ 0 &&
+ (next0 | next1 | next2) < 0b10000000)
+ continue;
+
+ /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
+ * https://arxiv.org/abs/2010.03090 */
+ vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl); // v1..v3: input shifted down by 1..3 bytes, look-ahead bytes shifted in
+ vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl);
+ vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl);
+
+ vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4( // 16-bit shift by 4; the & 0xF below keeps each byte's high nibble
+ __riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4()));
+ vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(
+ __riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4()));
+
+ vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl); // low nibble of v2
+ vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl); // high nibble of v2
+ vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl); // high nibble of v3
+
+ vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1);
+ vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2);
+ vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3);
+ vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4( // AND of the three table lookups, viewed as signed bytes
+ __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl));
+
+ vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl); // v1 byte >= 0xE0
+ vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl); // v0 byte >= 0xF0
+ vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl);
+ vbool2_t err34 =
+ __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl); // is_34 must agree with the sign bit of errs
+ vbool2_t errm =
+ __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl);
+ if (__riscv_vfirst_m_b2(errm, vl) >= 0) // stop at the first chunk with a flagged element; src is not advanced past it
+ break;
+ }
+
+ /* we need to validate the last character */
+ while (tail < len && (src[0] >> 6) == 0b10) // step back over continuation bytes to the start of a character
+ --src, ++tail;
+ return src - beg; // bytes accepted by the vector pass; the callers hand the rest to the scalar validator
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *src, size_t len) const noexcept {
+ size_t count = rvv_count_valid_utf8(src, len); // prefix already accepted by the vector pass
+ return scalar::utf8::validate(src + count, len - count); // scalar validation of the remainder
+}
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *src, size_t len) const noexcept {
+ size_t count = rvv_count_valid_utf8(src, len); // prefix already accepted by the vector pass
+ result res = scalar::utf8::validate_with_errors(src + count, len - count); // scalar validation of the remainder
+ return result(res.error, count + res.count); // rebase the scalar count onto the start of the input
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *src,
+ size_t len) const noexcept {
+ return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS; // boolean view of the *_with_errors result
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *src,
+ size_t len) const noexcept {
+ return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS; // boolean view of the *_with_errors result
+}
+
+template <simdutf_ByteFlip bflip> // bflip: byte-flip mode applied to every unit before it is inspected
+simdutf_really_inline static result
+rvv_validate_utf16_with_errors(const char16_t *src, size_t len) {
+ const char16_t *beg = src;
+ uint16_t last = 0; // byte-flipped final unit of the previous chunk (0 before the first chunk)
+ for (size_t vl; len > 0;
+ len -= vl, src += vl, last = simdutf_byteflip<bflip>(src[-1])) {
+ vl = __riscv_vsetvl_e16m8(len);
+ vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl);
+ v1 = simdutf_byteflip<bflip>(v1, vl);
+ vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl); // v0[i] is the unit preceding v1[i]
+
+ vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2( // previous unit is in 0xD800..0xDBFF (high surrogate)
+ __riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl);
+ vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2( // current unit is in 0xDC00..0xDFFF (low surrogate)
+ __riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl);
+
+ long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl); // first position where the two masks disagree
+ if (idx >= 0) {
+ last = idx > 0 ? simdutf_byteflip<bflip>(src[idx - 1]) : last; // the unit immediately before the mismatch
+ return result(error_code::SURROGATE,
+ src - beg + idx - (last - 0xD800u < 0x400u)); // report one position earlier when that unit is a high surrogate
+ break; // not reached: follows the return above
+ }
+ }
+ if (last - 0xD800u < 0x400u) { // final unit is in 0xD800..0xDBFF
+ return result(error_code::SURROGATE,
+ src - beg - 1); /* end on high surrogate */
+ } else {
+ return result(error_code::SUCCESS, src - beg);
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *src, size_t len) const noexcept {
+ return rvv_validate_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len); // LE variant: instantiated with ByteFlip::NONE
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *src, size_t len) const noexcept {
+ if (supports_zvbb()) // runtime dispatch: ZVBB byte-flip variant when supports_zvbb() is true
+ return rvv_validate_utf16_with_errors<simdutf_ByteFlip::ZVBB>(src, len);
+ else // otherwise the simdutf_ByteFlip::V byte-flip variant
+ return rvv_validate_utf16_with_errors<simdutf_ByteFlip::V>(src, len);
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *src, size_t len) const noexcept {
+ size_t vlmax = __riscv_vsetvlmax_e32m8();
+ vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax); // running max of the values, seeded with the largest allowed value
+ vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax); // running max of value + 0xFFFF2000, seeded with its threshold
+ for (size_t vl; len > 0; len -= vl, src += vl) {
+ vl = __riscv_vsetvl_e32m8(len);
+ vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+ vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); // maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF (32-bit wraparound)
+ max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl); // _tu: lanes past vl keep their accumulated value
+ maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl);
+ }
+ return __riscv_vfirst_m_b4( // valid when neither accumulator moved above its seed in any lane
+ __riscv_vmor_mm_b4(
+ __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax),
+ __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax),
+ vlmax) < 0;
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *src, size_t len) const noexcept {
+ const char32_t *beg = src;
+ for (size_t vl; len > 0; len -= vl, src += vl) {
+ vl = __riscv_vsetvl_e32m8(len);
+ vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+ vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); // maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF (32-bit wraparound)
+ long idx1 = // first element above 0x10FFFF, or negative if none
+ __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
+ long idx2 = __riscv_vfirst_m_b4( // first element in the surrogate range, or negative if none
+ __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
+ if (idx1 >= 0 && idx2 >= 0) { // both kinds present: report whichever occurs first
+ if (idx1 <= idx2) {
+ return result(error_code::TOO_LARGE, src - beg + idx1);
+ } else {
+ return result(error_code::SURROGATE, src - beg + idx2);
+ }
+ }
+ if (idx1 >= 0) {
+ return result(error_code::TOO_LARGE, src - beg + idx1);
+ }
+ if (idx2 >= 0) {
+ return result(error_code::SURROGATE, src - beg + idx2);
+ }
+ }
+ return result(error_code::SUCCESS, src - beg); // all len elements were scanned
+}
diff --git a/contrib/simdutf/src/scalar/ascii.h b/contrib/simdutf/src/scalar/ascii.h
new file mode 100644
index 000000000..f7504f1c8
--- /dev/null
+++ b/contrib/simdutf/src/scalar/ascii.h
@@ -0,0 +1,67 @@
+#ifndef SIMDUTF_ASCII_H
+#define SIMDUTF_ASCII_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace ascii {
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+// Only used by the fallback kernel.
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { // true when no byte of buf[0..len) has its high bit set
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ uint64_t pos = 0;
+ // process in blocks of 16 bytes when possible
+ for (; pos + 16 <= len; pos += 16) {
+ uint64_t v1;
+ std::memcpy(&v1, data + pos, sizeof(uint64_t)); // memcpy-based load of the first 8 bytes of the block
+ uint64_t v2;
+ std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2};
+ if ((v & 0x8080808080808080) != 0) { // some byte in the block has its high bit set
+ return false;
+ }
+ }
+ // process the tail byte-by-byte
+ for (; pos < len; pos++) {
+ if (data[pos] >= 0b10000000) {
+ return false;
+ }
+ }
+ return true;
+}
+#endif
+
+inline simdutf_warn_unused result validate_with_errors(const char *buf, // returns TOO_LARGE with the index of the first byte >= 0x80, else SUCCESS
+ size_t len) noexcept {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ // process in blocks of 16 bytes when possible
+ for (; pos + 16 <= len; pos += 16) {
+ uint64_t v1;
+ std::memcpy(&v1, data + pos, sizeof(uint64_t)); // memcpy-based load of the first 8 bytes of the block
+ uint64_t v2;
+ std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2};
+ if ((v & 0x8080808080808080) != 0) { // some byte in this block has its high bit set
+ for (; pos < len; pos++) { // scan forward from the block start to locate it
+ if (data[pos] >= 0b10000000) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ }
+ }
+ }
+ // process the tail byte-by-byte
+ for (; pos < len; pos++) {
+ if (data[pos] >= 0b10000000) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ }
+ return result(error_code::SUCCESS, pos); // pos == len here
+}
+
+} // namespace ascii
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/base64.h b/contrib/simdutf/src/scalar/base64.h
new file mode 100644
index 000000000..57e770772
--- /dev/null
+++ b/contrib/simdutf/src/scalar/base64.h
@@ -0,0 +1,434 @@
+#ifndef SIMDUTF_BASE64_H
+#define SIMDUTF_BASE64_H
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace base64 {
+
+// True for space, tab, line feed, carriage return and form feed. This function is not expected to be fast. Do not use in long loops.
+template <class char_type> bool is_ascii_white_space(char_type c) {
+ return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
+}
+
+template <class char_type> bool is_ascii_white_space_or_padding(char_type c) { // same character set as is_ascii_white_space, plus '='
+ return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' ||
+ c == '=';
+}
+
+template <class char_type> bool is_eight_byte(char_type c) { // true when c survives a round trip through uint8_t
+ if (sizeof(char_type) == 1) { // single-byte character types always qualify
+ return true;
+ }
+ return uint8_t(c) == c; // wider types: the value must be unchanged by truncation to 8 bits
+}
+
+// Returns a full_result built from an error code, the input position (src - srcinit) and the output position (dst - dstinit). The destination buffer must be large enough.
+// This function assumes that the padding (=) has been removed.
+template <class char_type>
+full_result
+base64_tail_decode(char *dst, const char_type *src, size_t length,
+ size_t padded_characters, // number of padding characters
+ // '=', typically 0, 1, 2.
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ // This looks like 5 branches, but we expect the compiler to resolve this to a
+ // single branch:
+ const uint8_t *to_base64 = (options & base64_url)
+ ? tables::base64::to_base64_url_value
+ : tables::base64::to_base64_value;
+ const uint32_t *d0 = (options & base64_url)
+ ? tables::base64::base64_url::d0
+ : tables::base64::base64_default::d0;
+ const uint32_t *d1 = (options & base64_url)
+ ? tables::base64::base64_url::d1
+ : tables::base64::base64_default::d1;
+ const uint32_t *d2 = (options & base64_url)
+ ? tables::base64::base64_url::d2
+ : tables::base64::base64_default::d2;
+ const uint32_t *d3 = (options & base64_url)
+ ? tables::base64::base64_url::d3
+ : tables::base64::base64_default::d3;
+
+ const char_type *srcend = src + length;
+ const char_type *srcinit = src;
+ const char *dstinit = dst;
+
+ uint32_t x;
+ size_t idx;
+ uint8_t buffer[4]; // up to four decoded values collected by the slow path
+ while (true) {
+ while (src + 4 <= srcend && is_eight_byte(src[0]) && // fast path: four 8-bit characters whose combined table lookup stays below 0x01FFFFFF
+ is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
+ is_eight_byte(src[3]) &&
+ (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+ d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+ if (match_system(endianness::BIG)) {
+ x = scalar::utf32::swap_bytes(x);
+ }
+ std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+ dst += 3;
+ src += 4;
+ }
+ idx = 0;
+ // we need at least four characters.
+ while (idx < 4 && src < srcend) { // slow path: collect values one character at a time
+ char_type c = *src;
+ uint8_t code = to_base64[uint8_t(c)];
+ buffer[idx] = uint8_t(code);
+ if (is_eight_byte(c) && code <= 63) { // table value 0..63: accepted and stored
+ idx++;
+ } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { // table value above 64, or a character that does not fit in 8 bits
+ return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ } else {
+ // We have a space or a newline. We ignore it.
+ }
+ src++;
+ }
+ if (idx != 4) { // input ran out before a full group of four was collected
+ if (last_chunk_options == last_chunk_handling_options::strict &&
+ (idx != 1) && ((idx + padded_characters) & 3) != 0) {
+ // The partial chunk was at src - idx
+ return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ } else if (last_chunk_options ==
+ last_chunk_handling_options::stop_before_partial &&
+ (idx != 1) && ((idx + padded_characters) & 3) != 0) {
+ // Rewind src to before partial chunk
+ src -= idx;
+ return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
+ } else {
+ if (idx == 2) { // two values: one output byte
+ uint32_t triple =
+ (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
+ if ((last_chunk_options == last_chunk_handling_options::strict) &&
+ (triple & 0xffff)) { // strict mode rejects non-zero bits below the single output byte
+ return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ if (match_system(endianness::BIG)) {
+ triple <<= 8;
+ std::memcpy(dst, &triple, 1);
+ } else {
+ triple = scalar::utf32::swap_bytes(triple);
+ triple >>= 8;
+ std::memcpy(dst, &triple, 1);
+ }
+ dst += 1;
+ } else if (idx == 3) { // three values: two output bytes
+ uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+ (uint32_t(buffer[1]) << 2 * 6) +
+ (uint32_t(buffer[2]) << 1 * 6);
+ if ((last_chunk_options == last_chunk_handling_options::strict) &&
+ (triple & 0xff)) { // strict mode rejects non-zero bits below the two output bytes
+ return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ if (match_system(endianness::BIG)) {
+ triple <<= 8;
+ std::memcpy(dst, &triple, 2);
+ } else {
+ triple = scalar::utf32::swap_bytes(triple);
+ triple >>= 8;
+ std::memcpy(dst, &triple, 2);
+ }
+ dst += 2;
+ } else if (idx == 1) { // a single leftover value is always reported as a remainder error
+ return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
+ }
+ }
+
+ uint32_t triple = // full group: pack four 6-bit values into 24 bits
+ (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+ (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+ if (match_system(endianness::BIG)) {
+ triple <<= 8;
+ std::memcpy(dst, &triple, 3);
+ } else {
+ triple = scalar::utf32::swap_bytes(triple);
+ triple >>= 8;
+ std::memcpy(dst, &triple, 3);
+ }
+ dst += 3;
+ }
+}
+
+// Like base64_tail_decode, but it will not write past the end of the output
+// buffer. The outlen parameter is modified to reflect the number of bytes
+// written. This function assumes that the padding (=) has been removed.
+// Decodes the remaining base64 input into `dst` without writing more than
+// `outlen` bytes.
+//
+// dst                 output buffer.
+// outlen              in: capacity of dst in bytes; out: number of bytes
+//                     written to dst. It is updated on every return path,
+//                     including the BASE64_EXTRA_BITS errors.
+// srcr                in: start of the input; out: input position reached
+//                     when the function returned. In stop_before_partial mode
+//                     it is rewound to the start of the incomplete chunk.
+// length              number of characters available at srcr.
+// padded_characters   number of padding characters '=', typically 0, 1, 2.
+// options             base64_url selects the URL-safe lookup tables.
+// last_chunk_options  policy for a trailing chunk of fewer than 4 characters.
+//
+// Returns SUCCESS together with the number of bytes written (0 for an empty
+// input), or an error code together with an input offset. For
+// OUTPUT_BUFFER_TOO_SMALL the offset is the start of the chunk that did not
+// fit; for the other errors it is the input offset reached.
+template <class char_type>
+result base64_tail_decode_safe(
+    char *dst, size_t &outlen, const char_type *&srcr, size_t length,
+    size_t padded_characters, // number of padding characters '=', typically 0,
+                              // 1, 2.
+    base64_options options, last_chunk_handling_options last_chunk_options) {
+  const char_type *src = srcr;
+  if (length == 0) {
+    outlen = 0;
+    return {SUCCESS, 0};
+  }
+  // This looks like 5 branches, but we expect the compiler to resolve this to a
+  // single branch:
+  const uint8_t *to_base64 = (options & base64_url)
+                                 ? tables::base64::to_base64_url_value
+                                 : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url)
+                           ? tables::base64::base64_url::d0
+                           : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url)
+                           ? tables::base64::base64_url::d1
+                           : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url)
+                           ? tables::base64::base64_url::d2
+                           : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url)
+                           ? tables::base64::base64_url::d3
+                           : tables::base64::base64_default::d3;
+
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
+  const char *dstinit = dst;
+  const char *dstend = dst + outlen;
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4];
+  while (true) {
+    // Fast path: four valid characters in a row decode to three bytes through
+    // the d0..d3 tables; any invalid character yields a value >= 0x01FFFFFF.
+    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
+           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
+           is_eight_byte(src[3]) &&
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+      if (dstend - dst < 3) {
+        outlen = size_t(dst - dstinit);
+        srcr = src;
+        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      }
+      if (match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    // Slow path: gather up to four 6-bit values, skipping ignorable
+    // characters and stopping at the first invalid one.
+    idx = 0;
+    const char_type *srccur = src;
+    // We need at least four characters.
+    while (idx < 4 && src < srcend) {
+      char_type c = *src;
+      uint8_t code = to_base64[uint8_t(c)];
+
+      buffer[idx] = uint8_t(code);
+      if (is_eight_byte(c) && code <= 63) {
+        idx++;
+      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
+        outlen = size_t(dst - dstinit);
+        srcr = src;
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
+      }
+      src++;
+    }
+    if (idx != 4) {
+      // The input ended inside a chunk: apply the last-chunk policy.
+      if (last_chunk_options == last_chunk_handling_options::strict &&
+          ((idx + padded_characters) & 3) != 0) {
+        outlen = size_t(dst - dstinit);
+        srcr = src;
+        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
+      } else if (last_chunk_options ==
+                     last_chunk_handling_options::stop_before_partial &&
+                 ((idx + padded_characters) & 3) != 0) {
+        // Rewind src to before partial chunk
+        srcr = srccur;
+        outlen = size_t(dst - dstinit);
+        return {SUCCESS, size_t(dst - dstinit)};
+      } else { // loose mode (also strict mode with consistent padding)
+        if (idx == 0) {
+          // No data left; return success
+          outlen = size_t(dst - dstinit);
+          srcr = src;
+          return {SUCCESS, size_t(dst - dstinit)};
+        } else if (idx == 1) {
+          // Error: Incomplete chunk of length 1 is invalid in loose mode
+          outlen = size_t(dst - dstinit);
+          srcr = src;
+          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
+        } else if (idx == 2 || idx == 3) {
+          // Check if there's enough space in the destination buffer
+          size_t required_space = (idx == 2) ? 1 : 2;
+          if (size_t(dstend - dst) < required_space) {
+            outlen = size_t(dst - dstinit);
+            srcr = src;
+            return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+          }
+          uint32_t triple = 0;
+          if (idx == 2) {
+            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12);
+            if ((last_chunk_options == last_chunk_handling_options::strict) &&
+                (triple & 0xffff)) {
+              // Report the bytes written so far, like every other exit, so
+              // that outlen does not keep the caller's buffer capacity.
+              outlen = size_t(dst - dstinit);
+              srcr = src;
+              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
+            }
+            // Extract the first byte
+            triple >>= 16;
+            dst[0] = static_cast<char>(triple & 0xFF);
+            dst += 1;
+          } else if (idx == 3) {
+            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) +
+                     (uint32_t(buffer[2]) << 6);
+            if ((last_chunk_options == last_chunk_handling_options::strict) &&
+                (triple & 0xff)) {
+              // Same as above: keep outlen consistent on this error path.
+              outlen = size_t(dst - dstinit);
+              srcr = src;
+              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
+            }
+            // Extract the first two bytes
+            triple >>= 8;
+            dst[0] = static_cast<char>((triple >> 8) & 0xFF);
+            dst[1] = static_cast<char>(triple & 0xFF);
+            dst += 2;
+          }
+          outlen = size_t(dst - dstinit);
+          srcr = src;
+          return {SUCCESS, size_t(dst - dstinit)};
+        }
+      }
+    }
+
+    // A full chunk of four values was gathered: emit three bytes.
+    if (dstend - dst < 3) {
+      outlen = size_t(dst - dstinit);
+      srcr = src;
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+    }
+    uint32_t triple = (uint32_t(buffer[0]) << 18) +
+                      (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6) +
+                      (uint32_t(buffer[3]));
+    if (match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
+    } else {
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
+    }
+    dst += 3;
+  }
+}
+
+// Encodes `srclen` bytes from `src` as base64 into `dst` and returns the
+// number of characters written. The caller must supply a destination that is
+// large enough. Padding ('=') is appended when required by `options`.
+size_t tail_encode_base64(char *dst, const char *src, size_t srclen,
+                          base64_options options) {
+  // Padding is on by default for the standard alphabet and off for the URL
+  // alphabet; base64_reverse_padding flips that convention.
+  const bool url_alphabet = (options & base64_url) != 0;
+  const bool reversed =
+      (options & base64_reverse_padding) == base64_reverse_padding;
+  const bool use_padding = (!url_alphabet) != reversed;
+  // Encoding tables for the selected alphabet.
+  const char *e0 = url_alphabet ? tables::base64::base64_url::e0
+                                : tables::base64::base64_default::e0;
+  const char *e1 = url_alphabet ? tables::base64::base64_url::e1
+                                : tables::base64::base64_default::e1;
+  const char *e2 = url_alphabet ? tables::base64::base64_url::e2
+                                : tables::base64::base64_default::e2;
+  char *cursor = dst;
+  size_t consumed = 0;
+  // Full groups: three input bytes become four output characters.
+  while (consumed + 2 < srclen) {
+    const uint8_t b0 = uint8_t(src[consumed]);
+    const uint8_t b1 = uint8_t(src[consumed + 1]);
+    const uint8_t b2 = uint8_t(src[consumed + 2]);
+    cursor[0] = e0[b0];
+    cursor[1] = e1[((b0 & 0x03) << 4) | ((b1 >> 4) & 0x0F)];
+    cursor[2] = e1[((b1 & 0x0F) << 2) | ((b2 >> 6) & 0x03)];
+    cursor[3] = e2[b2];
+    cursor += 4;
+    consumed += 3;
+  }
+  const size_t leftover = srclen - consumed;
+  if (leftover == 1) {
+    // One trailing byte: two characters, then optionally "==".
+    const uint8_t b0 = uint8_t(src[consumed]);
+    *cursor++ = e0[b0];
+    *cursor++ = e1[(b0 & 0x03) << 4];
+    if (use_padding) {
+      *cursor++ = '=';
+      *cursor++ = '=';
+    }
+  } else if (leftover != 0) {
+    // Two trailing bytes: three characters, then optionally "=".
+    const uint8_t b0 = uint8_t(src[consumed]);
+    const uint8_t b1 = uint8_t(src[consumed + 1]);
+    *cursor++ = e0[b0];
+    *cursor++ = e1[((b0 & 0x03) << 4) | ((b1 >> 4) & 0x0F)];
+    *cursor++ = e2[(b1 & 0x0F) << 2];
+    if (use_padding) {
+      *cursor++ = '=';
+    }
+  }
+  return (size_t)(cursor - dst);
+}
+
+// Upper bound on the number of bytes produced by decoding `length` base64
+// characters at `input`. Up to two trailing '=' characters are discounted.
+// We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
+template <class char_type>
+simdutf_warn_unused size_t maximal_binary_length_from_base64(
+    const char_type *input, size_t length) noexcept {
+  // Strip at most two padding characters from the end.
+  size_t unpadded = length;
+  if (unpadded > 0 && input[unpadded - 1] == '=') {
+    unpadded--;
+    if (unpadded > 0 && input[unpadded - 1] == '=') {
+      unpadded--;
+    }
+  }
+  const size_t from_full_groups = unpadded / 4 * 3;
+  const size_t remainder = unpadded % 4;
+  // A remainder of 0 or 1 characters contributes no bytes; for valid input a
+  // remainder of 2 or 3 characters adds one or two extra bytes.
+  return remainder <= 1 ? from_full_groups : from_full_groups + remainder - 1;
+}
+
+// Number of base64 characters needed to encode `length` bytes under
+// `options`, including padding when padding is in use.
+simdutf_warn_unused size_t
+base64_length_from_binary(size_t length, base64_options options) noexcept {
+  // Padding is used by default for the standard alphabet and omitted for the
+  // URL alphabet. The base64_reverse_padding option inverts that convention:
+  // with it set, the URL variant is padded and the standard variant is not.
+  const bool standard_alphabet = (options & base64_url) == 0;
+  const bool reversed =
+      (options & base64_reverse_padding) == base64_reverse_padding;
+  const bool use_padding = standard_alphabet != reversed;
+  if (use_padding) {
+    // We use padding to make the length a multiple of 4.
+    return (length + 2) / 3 * 4;
+  }
+  // Without padding, 1 or 2 leftover bytes need 2 or 3 characters.
+  const size_t leftover = length % 3;
+  return length / 3 * 4 + (leftover ? leftover + 1 : 0);
+}
+
+} // namespace base64
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/latin1.h b/contrib/simdutf/src/scalar/latin1.h
new file mode 100644
index 000000000..9e35add79
--- /dev/null
+++ b/contrib/simdutf/src/scalar/latin1.h
@@ -0,0 +1,32 @@
+#ifndef SIMDUTF_LATIN1_H
+#define SIMDUTF_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1 {
+
+// Number of UTF-32 code units needed to hold `len` Latin-1 characters.
+inline size_t utf32_length_from_latin1(size_t len) {
+  // Each Latin-1 character maps to exactly one UTF-32 unit. No BOM handling.
+  const size_t utf32_units = len;
+  return utf32_units;
+}
+
+// Number of UTF-8 bytes needed to encode the `len` Latin-1 bytes at `buf`.
+inline size_t utf8_length_from_latin1(const char *buf, size_t len) {
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
+  // Every input byte needs one output byte; bytes with the top bit set need
+  // one more.
+  size_t total = len;
+  for (size_t i = 0; i != len; ++i) {
+    total += size_t(bytes[i] >> 7);
+  }
+  return total;
+}
+
+// Number of UTF-16 code units needed to hold `len` Latin-1 characters: one
+// unit per character.
+inline size_t utf16_length_from_latin1(size_t len) {
+  const size_t utf16_units = len;
+  return utf16_units;
+}
+
+} // namespace latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/latin1_to_utf16/latin1_to_utf16.h b/contrib/simdutf/src/scalar/latin1_to_utf16/latin1_to_utf16.h
new file mode 100644
index 000000000..b5ab9dc05
--- /dev/null
+++ b/contrib/simdutf/src/scalar/latin1_to_utf16/latin1_to_utf16.h
@@ -0,0 +1,49 @@
+#ifndef SIMDUTF_LATIN1_TO_UTF16_H
+#define SIMDUTF_LATIN1_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf16 {
+
+// Converts `len` Latin-1 bytes at `buf` to UTF-16 in the requested
+// endianness, writing one code unit per input byte to `utf16_output`.
+// Returns the number of code units written.
+template <endianness big_endian>
+inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
+  const uint8_t *input = reinterpret_cast<const uint8_t *>(buf);
+  char16_t *out = utf16_output;
+
+  for (size_t i = 0; i < len; i++) {
+    // Zero-extend the Latin-1 byte to a 16-bit code point.
+    const uint16_t unit = uint16_t(input[i]);
+    if (match_system(big_endian)) {
+      *out = char16_t(unit);
+    } else {
+      *out = char16_t(utf16::swap_bytes(unit));
+    }
+    out++;
+  }
+
+  return out - utf16_output;
+}
+
+// Same conversion as convert(), but reports through a `result`. This function
+// has no failing path: it always returns SUCCESS together with the number of
+// UTF-16 code units written.
+template <endianness big_endian>
+inline result convert_with_errors(const char *buf, size_t len,
+                                  char16_t *utf16_output) {
+  const uint8_t *input = reinterpret_cast<const uint8_t *>(buf);
+  char16_t *out = utf16_output;
+
+  for (size_t i = 0; i < len; i++) {
+    // Zero-extend the Latin-1 byte to a 16-bit code point.
+    const uint16_t unit = uint16_t(input[i]);
+    if (match_system(big_endian)) {
+      *out = char16_t(unit);
+    } else {
+      *out = char16_t(utf16::swap_bytes(unit));
+    }
+    out++;
+  }
+
+  return result(error_code::SUCCESS, out - utf16_output);
+}
+
+} // namespace latin1_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/latin1_to_utf32/latin1_to_utf32.h b/contrib/simdutf/src/scalar/latin1_to_utf32/latin1_to_utf32.h
new file mode 100644
index 000000000..568acefac
--- /dev/null
+++ b/contrib/simdutf/src/scalar/latin1_to_utf32/latin1_to_utf32.h
@@ -0,0 +1,23 @@
+#ifndef SIMDUTF_LATIN1_TO_UTF32_H
+#define SIMDUTF_LATIN1_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf32 {
+
+// Widens `len` Latin-1 bytes at `buf` to UTF-32 code units in `utf32_output`.
+// Returns the number of code units written.
+inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
+  const unsigned char *input = reinterpret_cast<const unsigned char *>(buf);
+  size_t written = 0;
+  while (written < len) {
+    utf32_output[written] = (char32_t)input[written];
+    written++;
+  }
+  return written;
+}
+
+} // namespace latin1_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/latin1_to_utf8/latin1_to_utf8.h b/contrib/simdutf/src/scalar/latin1_to_utf8/latin1_to_utf8.h
new file mode 100644
index 000000000..87aa49eac
--- /dev/null
+++ b/contrib/simdutf/src/scalar/latin1_to_utf8/latin1_to_utf8.h
@@ -0,0 +1,104 @@
+#ifndef SIMDUTF_LATIN1_TO_UTF8_H
+#define SIMDUTF_LATIN1_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf8 {
+
+// Converts `len` Latin-1 bytes at `buf` to UTF-8 in `utf8_output` and returns
+// the number of bytes written. No bound is checked on the output; each input
+// byte produces one or two output bytes.
+inline size_t convert(const char *buf, size_t len, char *utf8_output) {
+  const unsigned char *input = reinterpret_cast<const unsigned char *>(buf);
+  char *out = utf8_output;
+  size_t i = 0;
+  while (i < len) {
+    // Fast path: when 16 more bytes are readable, test them all at once.
+    if (i + 16 <= len) {
+      uint64_t first_half;
+      uint64_t second_half;
+      ::memcpy(&first_half, input + i, sizeof(uint64_t));
+      ::memcpy(&second_half, input + i + sizeof(uint64_t), sizeof(uint64_t));
+      // Only the top bit of each byte matters, so OR both halves together.
+      const uint64_t combined{first_half | second_half};
+      if ((combined & 0x8080808080808080) == 0) {
+        // No top bit is set: all 16 bytes are ASCII and are copied as they
+        // are.
+        for (const size_t stop = i + 16; i < stop; i++) {
+          *out++ = char(buf[i]);
+        }
+        continue;
+      }
+    }
+
+    const unsigned char byte = input[i];
+    i++;
+    if ((byte & 0x80) == 0) {
+      // ASCII: one UTF-8 byte.
+      *out++ = char(byte);
+    } else {
+      // 0x80..0xFF: two UTF-8 bytes, 0b110XXXXX 0b10XXXXXX.
+      *out++ = char((byte >> 6) | 0b11000000);
+      *out++ = char((byte & 0b111111) | 0b10000000);
+    }
+  }
+  return size_t(out - utf8_output);
+}
+
+// Converts Latin-1 to UTF-8 like convert(), but never writes more than
+// `utf8_len` bytes to `utf8_output`. Conversion stops as soon as the next
+// character would not fit. Returns the number of bytes written.
+inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
+                           size_t utf8_len) {
+  const unsigned char *input = reinterpret_cast<const unsigned char *>(buf);
+  size_t pos = 0;
+  // Input position from which the 16-byte fast path may be tried again.
+  size_t retry_block_at = 0;
+  size_t written = 0;
+  while (pos < len && written < utf8_len) {
+    // The fast path needs 16 readable input bytes and 16 writable output
+    // bytes.
+    const bool try_block = pos >= retry_block_at && pos + 16 <= len &&
+                           written + 16 <= utf8_len;
+    if (try_block) {
+      uint64_t first_half;
+      uint64_t second_half;
+      ::memcpy(&first_half, input + pos, sizeof(uint64_t));
+      ::memcpy(&second_half, input + pos + sizeof(uint64_t), sizeof(uint64_t));
+      // Only the top bit of each byte matters, so OR both halves together.
+      const uint64_t combined{first_half | second_half};
+      if ((combined & 0x8080808080808080) == 0) {
+        // All 16 bytes are ASCII: copy them in one go.
+        ::memcpy(utf8_output + written, buf + pos, 16);
+        written += 16;
+        pos += 16;
+      } else {
+        // At least one of the next 16 bytes is not ASCII; process them one
+        // by one before trying the fast path again.
+        retry_block_at = pos + 16;
+      }
+      continue;
+    }
+
+    const auto byte = input[pos];
+    const bool is_ascii = (byte & 0x80) == 0;
+    if (!is_ascii && written + 2 > utf8_len) {
+      // A two-byte sequence does not fit in the remaining output.
+      break;
+    }
+    if (is_ascii) {
+      // One UTF-8 byte.
+      utf8_output[written++] = char(byte);
+    } else {
+      // Two UTF-8 bytes, 0b110XXXXX 0b10XXXXXX.
+      utf8_output[written++] = char((byte >> 6) | 0b11000000);
+      utf8_output[written++] = char((byte & 0b111111) | 0b10000000);
+    }
+    pos++;
+  }
+  return written;
+}
+
+} // namespace latin1_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16.h b/contrib/simdutf/src/scalar/utf16.h
new file mode 100644
index 000000000..838e95dc7
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16.h
@@ -0,0 +1,142 @@
+#ifndef SIMDUTF_UTF16_H
+#define SIMDUTF_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16 {
+
+// Returns `word` with its two bytes exchanged.
+inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
+  const uint16_t high_to_low = uint16_t(word >> 8);
+  const uint16_t low_to_high = uint16_t(word << 8);
+  return uint16_t(high_to_low | low_to_high);
+}
+
+// Returns true when the `len` code units at `buf`, read in the given
+// endianness, form well-formed UTF-16: every unit in 0xD800..0xDBFF must be
+// directly followed by a unit in 0xDC00..0xDFFF, and a unit in
+// 0xDC00..0xDFFF may not appear in any other position.
+template <endianness big_endian>
+inline simdutf_warn_unused bool validate(const char16_t *buf,
+                                         size_t len) noexcept {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  // The index has the same type as `len`, as in validate_with_errors.
+  size_t pos = 0;
+  while (pos < len) {
+    uint16_t word =
+        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xF800) == 0xD800) {
+      // Surrogate range: a second unit must be available.
+      if (pos + 1 >= len) {
+        return false;
+      }
+      // The first unit must be a high surrogate (0xD800..0xDBFF).
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return false;
+      }
+      // The second unit must be a low surrogate (0xDC00..0xDFFF).
+      uint16_t next_word =
+          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return false;
+      }
+      pos += 2;
+    } else {
+      pos++;
+    }
+  }
+  return true;
+}
+
+// Validates the `len` UTF-16 code units at `buf` in the given endianness.
+// Returns SUCCESS with the number of units examined, or SURROGATE with the
+// index of the unit that starts the first malformed surrogate sequence.
+template <endianness big_endian>
+inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
+                                                       size_t len) noexcept {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  size_t i = 0;
+  while (i < len) {
+    const uint16_t lead =
+        match_system(big_endian) ? units[i] : swap_bytes(units[i]);
+    if ((lead & 0xF800) != 0xD800) {
+      // Not in the surrogate range: nothing to check.
+      i++;
+      continue;
+    }
+    // Surrogate range: a second unit must be available.
+    if (i + 1 >= len) {
+      return result(error_code::SURROGATE, i);
+    }
+    // The first unit must be a high surrogate (0xD800..0xDBFF).
+    if (uint16_t(lead - 0xD800) > 0x3FF) {
+      return result(error_code::SURROGATE, i);
+    }
+    // The second unit must be a low surrogate (0xDC00..0xDFFF).
+    const uint16_t trail =
+        match_system(big_endian) ? units[i + 1] : swap_bytes(units[i + 1]);
+    if (uint16_t(trail - 0xDC00) > 0x3FF) {
+      return result(error_code::SURROGATE, i);
+    }
+    i += 2;
+  }
+  return result(error_code::SUCCESS, i);
+}
+
+// Counts the code points in `len` UTF-16 code units: every unit except a low
+// surrogate (0xDC00..0xDFFF) counts as one. We are not BOM aware.
+template <endianness big_endian>
+inline size_t count_code_points(const char16_t *buf, size_t len) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  size_t code_points = 0;
+  for (size_t i = 0; i < len; i++) {
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : swap_bytes(units[i]);
+    if ((word & 0xFC00) != 0xDC00) {
+      code_points++;
+    }
+  }
+  return code_points;
+}
+
+// Number of UTF-8 bytes needed to transcode `len` UTF-16 code units. Each
+// unit is counted on its own, so a surrogate pair adds 2 + 2 = 4 bytes.
+// We are not BOM aware.
+template <endianness big_endian>
+inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  size_t total = 0;
+  for (size_t i = 0; i < len; i++) {
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : swap_bytes(units[i]);
+    if (word <= 0x7F) {
+      total += 1; // ASCII
+    } else if (word <= 0x7FF) {
+      total += 2; // two-byte sequence
+    } else if (word >= 0xD800 && word <= 0xDFFF) {
+      total += 2; // one half of a surrogate pair
+    } else {
+      total += 3; // three-byte sequence
+    }
+  }
+  return total;
+}
+
+// Number of UTF-32 code units needed to transcode `len` UTF-16 code units:
+// every unit except a low surrogate (0xDC00..0xDFFF) yields one output unit.
+// We are not BOM aware.
+template <endianness big_endian>
+inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  size_t utf32_units = 0;
+  for (size_t i = 0; i < len; i++) {
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : swap_bytes(units[i]);
+    if ((word & 0xFC00) != 0xDC00) {
+      utf32_units++;
+    }
+  }
+  return utf32_units;
+}
+
+// Number of Latin-1 bytes produced from `len` UTF-16 code units: one byte per
+// unit.
+inline size_t latin1_length_from_utf16(size_t len) {
+  const size_t latin1_bytes = len;
+  return latin1_bytes;
+}
+
+// Copies `size` UTF-16 code units from `in` to `out`, exchanging the two
+// bytes of every unit.
+simdutf_really_inline void change_endianness_utf16(const char16_t *in,
+                                                   size_t size, char16_t *out) {
+  const uint16_t *source = reinterpret_cast<const uint16_t *>(in);
+  uint16_t *target = reinterpret_cast<uint16_t *>(out);
+  for (size_t i = 0; i < size; i++) {
+    target[i] = swap_bytes(source[i]);
+  }
+}
+
+// Returns `length` reduced by one when the last code unit of `input` is a
+// high surrogate (0xD800..0xDBFF). Inputs of length 0 or 1 are returned
+// unchanged.
+template <endianness big_endian>
+simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input,
+                                                     size_t length) {
+  if (length < 2) {
+    return length;
+  }
+  const uint16_t raw = uint16_t(input[length - 1]);
+  const uint16_t last = match_system(big_endian) ? raw : swap_bytes(raw);
+  const bool ends_with_high_surrogate = (last & 0xFC00) == 0xD800;
+  return ends_with_high_surrogate ? length - 1 : length;
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16_to_latin1/utf16_to_latin1.h b/contrib/simdutf/src/scalar/utf16_to_latin1/utf16_to_latin1.h
new file mode 100644
index 000000000..23cac54ec
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16_to_latin1/utf16_to_latin1.h
@@ -0,0 +1,95 @@
+#ifndef SIMDUTF_UTF16_TO_LATIN1_H
+#define SIMDUTF_UTF16_TO_LATIN1_H
+
+#include <cstring> // for std::memcpy
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+// Converts `len` UTF-16 code units at `buf` to Latin-1 bytes in
+// `latin_output`. Returns the number of bytes written, or 0 when any unit is
+// above 0xFF. The whole input is processed before that check, so
+// `latin_output` is written to even when 0 is returned.
+template <endianness big_endian>
+inline size_t convert(const char16_t *buf, size_t len, char *latin_output) {
+  if (len == 0) {
+    return 0;
+  }
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  // OR of every unit seen; a non-zero high byte means some unit was too
+  // large.
+  uint16_t seen_bits = 0;
+
+  for (size_t i = 0; i < len; i++) {
+    const uint16_t unit =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    seen_bits |= unit;
+    latin_output[i] = char(unit & 0xFF);
+  }
+  if ((seen_bits & 0xFF00) != 0) {
+    return 0;
+  }
+
+  return len;
+}
+
+// Converts `len` UTF-16 code units at `buf` to Latin-1 bytes in
+// `latin_output`. Returns SUCCESS with the number of bytes written, or
+// TOO_LARGE with the index of the first unit above 0xFF.
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t *buf, size_t len,
+                                  char *latin_output) {
+  if (len == 0) {
+    return result(error_code::SUCCESS, 0);
+  }
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  char *out = latin_output;
+  size_t i = 0;
+
+  while (i < len) {
+    // Fast path: when 16 more units (32 bytes) are readable, check in one
+    // test that all of them fit in Latin-1.
+    if (i + 16 <= len) {
+      uint64_t chunk[4];
+      for (size_t k = 0; k < 4; k++) {
+        ::memcpy(&chunk[k], units + i + 4 * k, sizeof(uint64_t));
+        if (!match_system(big_endian)) {
+          // Rotate by one byte so that the high byte of every unit lands
+          // under the mask below.
+          chunk[k] = (chunk[k] >> 8) | (chunk[k] << (64 - 8));
+        }
+      }
+
+      if (((chunk[0] | chunk[1] | chunk[2] | chunk[3]) & 0xFF00FF00FF00FF00) ==
+          0) {
+        for (const size_t stop = i + 16; i < stop; i++) {
+          *out++ = match_system(big_endian)
+                       ? char(units[i])
+                       : char(utf16::swap_bytes(units[i]));
+        }
+        continue;
+      }
+    }
+    // Slow path: one unit at a time, stopping at the first one that does not
+    // fit.
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    if ((word & 0xFF00) != 0) {
+      return result(error_code::TOO_LARGE, i);
+    }
+    *out++ = char(word & 0xFF);
+    i++;
+  }
+  return result(error_code::SUCCESS, out - latin_output);
+}
+
+} // namespace utf16_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h b/contrib/simdutf/src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h
new file mode 100644
index 000000000..f418250cb
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h
@@ -0,0 +1,31 @@
+#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+// Converts `len` UTF-16 code units at `buf` to Latin-1 without any checking:
+// each unit is truncated to a char. Returns the number of bytes written.
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t *buf, size_t len,
+                            char *latin_output) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+
+  for (size_t i = 0; i < len; i++) {
+    const uint16_t unit =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    latin_output[i] = char(unit);
+  }
+
+  return len;
+}
+
+} // namespace utf16_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16_to_utf32/utf16_to_utf32.h b/contrib/simdutf/src/scalar/utf16_to_utf32/utf16_to_utf32.h
new file mode 100644
index 000000000..7d712fd83
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16_to_utf32/utf16_to_utf32.h
@@ -0,0 +1,87 @@
+#ifndef SIMDUTF_UTF16_TO_UTF32_H
+#define SIMDUTF_UTF16_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf32 {
+
+// Converts `len` UTF-16 code units at `buf` to UTF-32 in `utf32_output`.
+// Returns the number of UTF-32 units written, or 0 when a malformed or
+// truncated surrogate sequence is found.
+template <endianness big_endian>
+inline size_t convert(const char16_t *buf, size_t len, char32_t *utf32_output) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  char32_t *out = utf32_output;
+  size_t i = 0;
+  while (i < len) {
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    if ((word & 0xF800) != 0xD800) {
+      // Not a surrogate: widen the 16-bit unit to 32 bits.
+      *out++ = char32_t(word);
+      i++;
+      continue;
+    }
+    // Surrogate range: expect a high surrogate followed by a low surrogate.
+    const uint16_t high = uint16_t(word - 0xD800);
+    if (high > 0x3FF) {
+      return 0;
+    }
+    if (i + 1 >= len) {
+      return 0; // minimal bound checking
+    }
+    const uint16_t next_word = match_system(big_endian)
+                                   ? units[i + 1]
+                                   : utf16::swap_bytes(units[i + 1]);
+    const uint16_t low = uint16_t(next_word - 0xDC00);
+    if (low > 0x3FF) {
+      return 0;
+    }
+    const uint32_t code_point = (high << 10) + low + 0x10000;
+    *out++ = char32_t(code_point);
+    i += 2;
+  }
+  return out - utf32_output;
+}
+
+// Converts `len` UTF-16 code units at `buf` to UTF-32 in `utf32_output`.
+// Returns SUCCESS with the number of UTF-32 units written, or SURROGATE with
+// the index of the unit that starts the malformed or truncated surrogate
+// sequence.
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t *buf, size_t len,
+                                  char32_t *utf32_output) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  char32_t *out = utf32_output;
+  size_t i = 0;
+  while (i < len) {
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    if ((word & 0xF800) != 0xD800) {
+      // Not a surrogate: widen the 16-bit unit to 32 bits.
+      *out++ = char32_t(word);
+      i++;
+      continue;
+    }
+    // Surrogate range: expect a high surrogate followed by a low surrogate.
+    const uint16_t high = uint16_t(word - 0xD800);
+    if (high > 0x3FF) {
+      return result(error_code::SURROGATE, i);
+    }
+    if (i + 1 >= len) {
+      return result(error_code::SURROGATE, i); // minimal bound checking
+    }
+    const uint16_t next_word = match_system(big_endian)
+                                   ? units[i + 1]
+                                   : utf16::swap_bytes(units[i + 1]);
+    const uint16_t low = uint16_t(next_word - 0xDC00);
+    if (low > 0x3FF) {
+      return result(error_code::SURROGATE, i);
+    }
+    const uint32_t code_point = (high << 10) + low + 0x10000;
+    *out++ = char32_t(code_point);
+    i += 2;
+  }
+  return result(error_code::SUCCESS, out - utf32_output);
+}
+
+} // namespace utf16_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h b/contrib/simdutf/src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h
new file mode 100644
index 000000000..a9e107356
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h
@@ -0,0 +1,45 @@
+#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
+#define SIMDUTF_VALID_UTF16_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf32 {
+
+// Converts `len` UTF-16 code units at `buf` to UTF-32 in `utf32_output`
+// without checking surrogate values. Returns the number of UTF-32 units
+// written, or 0 when the input ends in the middle of a surrogate pair.
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t *buf, size_t len,
+                            char32_t *utf32_output) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  char32_t *out = utf32_output;
+  size_t i = 0;
+  while (i < len) {
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    if ((word & 0xF800) != 0xD800) {
+      // Not a surrogate: widen the 16-bit unit to 32 bits.
+      *out++ = char32_t(word);
+      i++;
+      continue;
+    }
+    // Surrogate range: combine this unit with the next one.
+    const uint16_t high = uint16_t(word - 0xD800);
+    if (i + 1 >= len) {
+      return 0; // minimal bound checking
+    }
+    const uint16_t next_word = match_system(big_endian)
+                                   ? units[i + 1]
+                                   : utf16::swap_bytes(units[i + 1]);
+    const uint16_t low = uint16_t(next_word - 0xDC00);
+    const uint32_t code_point = (high << 10) + low + 0x10000;
+    *out++ = char32_t(code_point);
+    i += 2;
+  }
+  return out - utf32_output;
+}
+
+} // namespace utf16_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16_to_utf8/utf16_to_utf8.h b/contrib/simdutf/src/scalar/utf16_to_utf8/utf16_to_utf8.h
new file mode 100644
index 000000000..49ba4feb6
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16_to_utf8/utf16_to_utf8.h
@@ -0,0 +1,160 @@
+#ifndef SIMDUTF_UTF16_TO_UTF8_H
+#define SIMDUTF_UTF16_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf8 {
+
+// Converts `len` UTF-16 code units at `buf` to UTF-8 in `utf8_output`.
+// Returns the number of bytes written, or 0 when a malformed or truncated
+// surrogate sequence is found. No bound is checked on the output.
+template <endianness big_endian>
+inline size_t convert(const char16_t *buf, size_t len, char *utf8_output) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  char *out = utf8_output;
+  size_t i = 0;
+  while (i < len) {
+    // Fast path: when 4 more units (8 bytes) are readable, test in one go
+    // whether all of them are ASCII.
+    if (i + 4 <= len) {
+      uint64_t block;
+      ::memcpy(&block, units + i, sizeof(uint64_t));
+      if (!match_system(big_endian)) {
+        // Rotate by one byte so that the mask below lines up with the
+        // byte-swapped units.
+        block = (block >> 8) | (block << (64 - 8));
+      }
+      if ((block & 0xFF80FF80FF80FF80) == 0) {
+        for (const size_t stop = i + 4; i < stop; i++) {
+          *out++ = match_system(big_endian)
+                       ? char(units[i])
+                       : char(utf16::swap_bytes(units[i]));
+        }
+        continue;
+      }
+    }
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    if ((word & 0xFF80) == 0) {
+      // One UTF-8 byte.
+      *out++ = char(word);
+      i++;
+      continue;
+    }
+    if ((word & 0xF800) == 0) {
+      // Two UTF-8 bytes: 0b110XXXXX 0b10XXXXXX
+      *out++ = char((word >> 6) | 0b11000000);
+      *out++ = char((word & 0b111111) | 0b10000000);
+      i++;
+      continue;
+    }
+    if ((word & 0xF800) != 0xD800) {
+      // Three UTF-8 bytes: 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *out++ = char((word >> 12) | 0b11100000);
+      *out++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *out++ = char((word & 0b111111) | 0b10000000);
+      i++;
+      continue;
+    }
+    // Surrogate range: expect a high surrogate followed by a low surrogate.
+    if (i + 1 >= len) {
+      return 0;
+    }
+    const uint16_t high = uint16_t(word - 0xD800);
+    if (high > 0x3FF) {
+      return 0;
+    }
+    const uint16_t next_word = match_system(big_endian)
+                                   ? units[i + 1]
+                                   : utf16::swap_bytes(units[i + 1]);
+    const uint16_t low = uint16_t(next_word - 0xDC00);
+    if (low > 0x3FF) {
+      return 0;
+    }
+    const uint32_t code_point = (high << 10) + low + 0x10000;
+    // Four UTF-8 bytes: 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+    *out++ = char((code_point >> 18) | 0b11110000);
+    *out++ = char(((code_point >> 12) & 0b111111) | 0b10000000);
+    *out++ = char(((code_point >> 6) & 0b111111) | 0b10000000);
+    *out++ = char((code_point & 0b111111) | 0b10000000);
+    i += 2;
+  }
+  return out - utf8_output;
+}
+
+// Converts `len` UTF-16 code units at `buf` to UTF-8 in `utf8_output`.
+// Returns SUCCESS with the number of bytes written, or SURROGATE with the
+// index of the unit that starts the malformed or truncated surrogate
+// sequence. No bound is checked on the output.
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t *buf, size_t len,
+                                  char *utf8_output) {
+  const uint16_t *units = reinterpret_cast<const uint16_t *>(buf);
+  char *out = utf8_output;
+  size_t i = 0;
+  while (i < len) {
+    // Fast path: when 4 more units (8 bytes) are readable, test in one go
+    // whether all of them are ASCII.
+    if (i + 4 <= len) {
+      uint64_t block;
+      ::memcpy(&block, units + i, sizeof(uint64_t));
+      if (!match_system(big_endian)) {
+        // Rotate by one byte so that the mask below lines up with the
+        // byte-swapped units.
+        block = (block >> 8) | (block << (64 - 8));
+      }
+      if ((block & 0xFF80FF80FF80FF80) == 0) {
+        for (const size_t stop = i + 4; i < stop; i++) {
+          *out++ = match_system(big_endian)
+                       ? char(units[i])
+                       : char(utf16::swap_bytes(units[i]));
+        }
+        continue;
+      }
+    }
+    const uint16_t word =
+        match_system(big_endian) ? units[i] : utf16::swap_bytes(units[i]);
+    if ((word & 0xFF80) == 0) {
+      // One UTF-8 byte.
+      *out++ = char(word);
+      i++;
+      continue;
+    }
+    if ((word & 0xF800) == 0) {
+      // Two UTF-8 bytes: 0b110XXXXX 0b10XXXXXX
+      *out++ = char((word >> 6) | 0b11000000);
+      *out++ = char((word & 0b111111) | 0b10000000);
+      i++;
+      continue;
+    }
+    if ((word & 0xF800) != 0xD800) {
+      // Three UTF-8 bytes: 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *out++ = char((word >> 12) | 0b11100000);
+      *out++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *out++ = char((word & 0b111111) | 0b10000000);
+      i++;
+      continue;
+    }
+    // Surrogate range: expect a high surrogate followed by a low surrogate.
+    if (i + 1 >= len) {
+      return result(error_code::SURROGATE, i);
+    }
+    const uint16_t high = uint16_t(word - 0xD800);
+    if (high > 0x3FF) {
+      return result(error_code::SURROGATE, i);
+    }
+    const uint16_t next_word = match_system(big_endian)
+                                   ? units[i + 1]
+                                   : utf16::swap_bytes(units[i + 1]);
+    const uint16_t low = uint16_t(next_word - 0xDC00);
+    if (low > 0x3FF) {
+      return result(error_code::SURROGATE, i);
+    }
+    const uint32_t code_point = (high << 10) + low + 0x10000;
+    // Four UTF-8 bytes: 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+    *out++ = char((code_point >> 18) | 0b11110000);
+    *out++ = char(((code_point >> 12) & 0b111111) | 0b10000000);
+    *out++ = char(((code_point >> 6) & 0b111111) | 0b10000000);
+    *out++ = char((code_point & 0b111111) | 0b10000000);
+    i += 2;
+  }
+  return result(error_code::SUCCESS, out - utf8_output);
+}
+
+} // namespace utf16_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h b/contrib/simdutf/src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h
new file mode 100644
index 000000000..102c40ea4
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h
@@ -0,0 +1,83 @@
+#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
+#define SIMDUTF_VALID_UTF16_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf8 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t *buf, size_t len,
+ char *utf8_output) { // converts len UTF-16 code units to UTF-8 with no surrogate validation; returns the number of bytes written
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+ size_t pos = 0;
+ char *start{utf8_output};
+ while (pos < len) {
+ // try to convert the next block of 4 ASCII characters
+ if (pos + 4 <=
+ len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if (!match_system(big_endian)) {
+ v = (v >> 8) | (v << (64 - 8)); // one-byte rotate, not a per-unit swap: every unit's high byte still lands in the upper half of some 16-bit lane, which is all the mask test below needs
+ }
+ if ((v & 0xFF80FF80FF80FF80) == 0) {
+ size_t final_pos = pos + 4;
+ while (pos < final_pos) {
+ *utf8_output++ = !match_system(big_endian)
+ ? char(utf16::swap_bytes(buf[pos]))
+ : char(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+
+ uint16_t word =
+ !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+ if ((word & 0xFF80) == 0) {
+ // will generate one UTF-8 byte
+ *utf8_output++ = char(word);
+ pos++;
+ } else if ((word & 0xF800) == 0) {
+ // will generate two UTF-8 bytes
+ // we have 0b110XXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else if ((word & 0xF800) != 0xD800) {
+ // will generate three UTF-8 bytes
+ // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else {
+ // must be a surrogate pair (the two halves are not range-checked here)
+ uint16_t diff = uint16_t(word - 0xD800);
+ if (pos + 1 >= len) {
+ return 0;
+ } // minimal bound checking: a surrogate in the last position makes the whole call return 0
+ uint16_t next_word = !match_system(big_endian)
+ ? utf16::swap_bytes(data[pos + 1])
+ : data[pos + 1];
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ // will generate four UTF-8 bytes
+ // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+ *utf8_output++ = char((value >> 18) | 0b11110000);
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ pos += 2;
+ }
+ }
+ return utf8_output - start;
+}
+
+} // namespace utf16_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32.h b/contrib/simdutf/src/scalar/utf32.h
new file mode 100644
index 000000000..38b8240f8
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32.h
@@ -0,0 +1,80 @@
+#ifndef SIMDUTF_UTF32_H
+#define SIMDUTF_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32 {
+
+inline simdutf_warn_unused bool validate(const char32_t *buf,
+ size_t len) noexcept { // true iff none of the len words is above 0x10FFFF or inside the surrogate range
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ uint64_t pos = 0;
+ for (; pos < len; pos++) {
+ uint32_t word = data[pos];
+ if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { // too large, or a UTF-16 surrogate value
+ return false;
+ }
+ }
+ return true;
+}
+
+inline simdutf_warn_unused result validate_with_errors(const char32_t *buf,
+ size_t len) noexcept { // like validate(), but reports the error kind and the index of the first offending word
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ for (; pos < len; pos++) {
+ uint32_t word = data[pos];
+ if (word > 0x10FFFF) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return result(error_code::SURROGATE, pos);
+ }
+ }
+ return result(error_code::SUCCESS, pos); // pos == len here
+}
+
+inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) {
+ // We are not BOM aware. Returns the number of UTF-8 bytes needed for the len words; values are not validated.
+ const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ // credit: @ttsugriy for the vectorizable approach
+ counter++; // every word needs at least one byte (ASCII)
+ counter += static_cast<size_t>(p[i] > 0x7F); // two-byte or longer
+ counter += static_cast<size_t>(p[i] > 0x7FF); // three-byte or longer
+ counter += static_cast<size_t>(p[i] > 0xFFFF); // four-byte
+ }
+ return counter;
+}
+
+inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) {
+ // We are not BOM aware. Returns the number of UTF-16 code units needed for the len words.
+ const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ counter++; // every word needs at least one UTF-16 code unit
+ counter += static_cast<size_t>(p[i] > 0xFFFF); // second unit of a surrogate pair
+ }
+ return counter;
+}
+
+inline size_t latin1_length_from_utf32(size_t len) {
+ // We are not BOM aware.
+ return len; // each UTF-32 word maps to exactly one Latin-1 byte, so the length is unchanged
+}
+
+inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) { // returns word with its four bytes in reverse order
+ return ((word >> 24) & 0xff) | // move byte 3 to byte 0
+ ((word << 8) & 0xff0000) | // move byte 1 to byte 2
+ ((word >> 8) & 0xff00) | // move byte 2 to byte 1
+ ((word << 24) & 0xff000000); // move byte 0 to byte 3
+}
+
+} // namespace utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32_to_latin1/utf32_to_latin1.h b/contrib/simdutf/src/scalar/utf32_to_latin1/utf32_to_latin1.h
new file mode 100644
index 000000000..f09bc9d1d
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32_to_latin1/utf32_to_latin1.h
@@ -0,0 +1,62 @@
+#ifndef SIMDUTF_UTF32_TO_LATIN1_H
+#define SIMDUTF_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) { // returns the number of bytes written, or 0 if any word is above 0xFF
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ char *start = latin1_output;
+ uint32_t utf32_char;
+ size_t pos = 0;
+ uint32_t too_large = 0; // bitwise OR of every input word; inspected once after the loop
+
+ while (pos < len) {
+ utf32_char = (uint32_t)data[pos];
+ too_large |= utf32_char;
+ *latin1_output++ = (char)(utf32_char & 0xFF); // written unconditionally, even if this word later turns out to be too large
+ pos++;
+ }
+ if ((too_large & 0xFFFFFF00) != 0) { // some word had a bit set above the low byte
+ return 0;
+ }
+ return latin1_output - start;
+}
+
+inline result convert_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) { // on success the result count is the number of bytes written; on error it is the index of the offending word
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ char *start{latin1_output};
+ size_t pos = 0;
+ while (pos < len) {
+ if (pos + 2 <=
+ len) { // if it is safe to read 8 more bytes, check that they are Latin1
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0xFFFFFF00FFFFFF00) == 0) { // both words fit in one byte
+ *latin1_output++ = char(buf[pos]);
+ *latin1_output++ = char(buf[pos + 1]);
+ pos += 2;
+ continue;
+ }
+ }
+ uint32_t utf32_char = data[pos]; // slow path: one word at a time so the exact error position is known
+ if ((utf32_char & 0xFFFFFF00) ==
+ 0) { // Check if the character can be represented in Latin-1
+ *latin1_output++ = (char)(utf32_char & 0xFF);
+ pos++;
+ } else {
+ return result(error_code::TOO_LARGE, pos);
+ };
+ }
+ return result(error_code::SUCCESS, latin1_output - start);
+}
+
+} // namespace utf32_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h b/contrib/simdutf/src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h
new file mode 100644
index 000000000..c983b6be0
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h
@@ -0,0 +1,49 @@
+#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+inline size_t convert_valid(const char32_t *buf, size_t len,
+ char *latin1_output) { // returns the number of bytes written; still returns 0 when a word does not fit in Latin-1
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ char *start = latin1_output;
+ uint32_t utf32_char;
+ size_t pos = 0;
+
+ while (pos < len) {
+ utf32_char = (uint32_t)data[pos]; // only used by the single-word path at the bottom of the loop
+
+ if (pos + 2 <=
+ len) { // if it is safe to read 8 more bytes, check that they are Latin1
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0xFFFFFF00FFFFFF00) == 0) {
+ *latin1_output++ = char(buf[pos]);
+ *latin1_output++ = char(buf[pos + 1]);
+ pos += 2;
+ continue;
+ } else {
+ // output can not be represented in latin1
+ return 0;
+ }
+ }
+ if ((utf32_char & 0xFFFFFF00) == 0) { // reached only for the final word, when fewer than two words remain
+ *latin1_output++ = char(utf32_char);
+ } else {
+ // output can not be represented in latin1
+ return 0;
+ }
+ pos++;
+ }
+ return latin1_output - start;
+}
+
+} // namespace utf32_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32_to_utf16/utf32_to_utf16.h b/contrib/simdutf/src/scalar/utf32_to_utf16/utf32_to_utf16.h
new file mode 100644
index 000000000..ded9ff818
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32_to_utf16/utf32_to_utf16.h
@@ -0,0 +1,85 @@
+#ifndef SIMDUTF_UTF32_TO_UTF16_H
+#define SIMDUTF_UTF32_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert(const char32_t *buf, size_t len, char16_t *utf16_output) { // returns the number of UTF-16 code units written, or 0 on a surrogate or too-large input word
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ while (pos < len) {
+ uint32_t word = data[pos];
+ if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) { // surrogate values are rejected as input
+ return 0;
+ }
+ // will not generate a surrogate pair
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(uint16_t(word)))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return 0;
+ }
+ word -= 0x10000; // leaves a 20-bit value: upper 10 bits go to the high surrogate, lower 10 to the low surrogate
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = utf16::swap_bytes(high_surrogate);
+ low_surrogate = utf16::swap_bytes(low_surrogate);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ pos++;
+ }
+ return utf16_output - start;
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_output) { // on success the result count is the number of code units written; on error it is the index of the offending input word
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ while (pos < len) {
+ uint32_t word = data[pos];
+ if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return result(error_code::SURROGATE, pos);
+ }
+ // will not generate a surrogate pair
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(uint16_t(word)))
+ : char16_t(word);
+ } else {
+ // will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ word -= 0x10000; // leaves a 20-bit value: upper 10 bits go to the high surrogate, lower 10 to the low surrogate
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = utf16::swap_bytes(high_surrogate);
+ low_surrogate = utf16::swap_bytes(low_surrogate);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ pos++;
+ }
+ return result(error_code::SUCCESS, utf16_output - start);
+}
+
+} // namespace utf32_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h b/contrib/simdutf/src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h
new file mode 100644
index 000000000..625d23c56
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h
@@ -0,0 +1,45 @@
+#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
+#define SIMDUTF_VALID_UTF32_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char32_t *buf, size_t len,
+ char16_t *utf16_output) { // returns the number of code units written; performs no surrogate or 0x10FFFF range check on the input
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ while (pos < len) {
+ uint32_t word = data[pos];
+ if ((word & 0xFFFF0000) == 0) {
+ // will not generate a surrogate pair
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(uint16_t(word)))
+ : char16_t(word);
+ pos++;
+ } else {
+ // will generate a surrogate pair
+ word -= 0x10000; // upper bits go to the high surrogate, lower 10 bits to the low surrogate
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = utf16::swap_bytes(high_surrogate);
+ low_surrogate = utf16::swap_bytes(low_surrogate);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ pos++;
+ }
+ }
+ return utf16_output - start;
+}
+
+} // namespace utf32_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32_to_utf8/utf32_to_utf8.h b/contrib/simdutf/src/scalar/utf32_to_utf8/utf32_to_utf8.h
new file mode 100644
index 000000000..efd812156
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32_to_utf8/utf32_to_utf8.h
@@ -0,0 +1,123 @@
+#ifndef SIMDUTF_UTF32_TO_UTF8_H
+#define SIMDUTF_UTF32_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf8 {
+
+inline size_t convert(const char32_t *buf, size_t len, char *utf8_output) { // returns the number of UTF-8 bytes written, or 0 on a surrogate or too-large input word
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ char *start{utf8_output};
+ while (pos < len) {
+ // try to convert the next block of 2 ASCII characters
+ if (pos + 2 <=
+ len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+ *utf8_output++ = char(buf[pos]);
+ *utf8_output++ = char(buf[pos + 1]);
+ pos += 2;
+ continue;
+ }
+ }
+ uint32_t word = data[pos];
+ if ((word & 0xFFFFFF80) == 0) {
+ // will generate one UTF-8 byte
+ *utf8_output++ = char(word);
+ pos++;
+ } else if ((word & 0xFFFFF800) == 0) {
+ // will generate two UTF-8 bytes
+ // we have 0b110XXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else if ((word & 0xFFFF0000) == 0) {
+ // will generate three UTF-8 bytes
+ // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+ if (word >= 0xD800 && word <= 0xDFFF) { // surrogate values are rejected
+ return 0;
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else {
+ // will generate four UTF-8 bytes
+ // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+ if (word > 0x10FFFF) {
+ return 0;
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ }
+ }
+ return utf8_output - start;
+}
+
+inline result convert_with_errors(const char32_t *buf, size_t len,
+ char *utf8_output) { // on success the result count is the number of bytes written; on error it is the index of the offending input word
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ char *start{utf8_output};
+ while (pos < len) {
+ // try to convert the next block of 2 ASCII characters
+ if (pos + 2 <=
+ len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+ *utf8_output++ = char(buf[pos]);
+ *utf8_output++ = char(buf[pos + 1]);
+ pos += 2;
+ continue;
+ }
+ }
+ uint32_t word = data[pos];
+ if ((word & 0xFFFFFF80) == 0) {
+ // will generate one UTF-8 byte
+ *utf8_output++ = char(word);
+ pos++;
+ } else if ((word & 0xFFFFF800) == 0) {
+ // will generate two UTF-8 bytes
+ // we have 0b110XXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else if ((word & 0xFFFF0000) == 0) {
+ // will generate three UTF-8 bytes
+ // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return result(error_code::SURROGATE, pos);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else {
+ // will generate four UTF-8 bytes
+ // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+ if (word > 0x10FFFF) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ }
+ }
+ return result(error_code::SUCCESS, utf8_output - start);
+}
+
+} // namespace utf32_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h b/contrib/simdutf/src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h
new file mode 100644
index 000000000..7a7cdd568
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h
@@ -0,0 +1,66 @@
+#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
+#define SIMDUTF_VALID_UTF32_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf8 {
+
+#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
+// only used by the fallback and POWER kernel; returns the number of UTF-8 bytes written and performs no surrogate or range check
+inline size_t convert_valid(const char32_t *buf, size_t len,
+ char *utf8_output) {
+ const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+ size_t pos = 0;
+ char *start{utf8_output};
+ while (pos < len) {
+ // try to convert the next block of 2 ASCII characters
+ if (pos + 2 <=
+ len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+ *utf8_output++ = char(buf[pos]);
+ *utf8_output++ = char(buf[pos + 1]);
+ pos += 2;
+ continue;
+ }
+ }
+ uint32_t word = data[pos];
+ if ((word & 0xFFFFFF80) == 0) {
+ // will generate one UTF-8 byte
+ *utf8_output++ = char(word);
+ pos++;
+ } else if ((word & 0xFFFFF800) == 0) {
+ // will generate two UTF-8 bytes
+ // we have 0b110XXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else if ((word & 0xFFFF0000) == 0) {
+ // will generate three UTF-8 bytes
+ // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ } else {
+ // will generate four UTF-8 bytes
+ // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ pos++;
+ }
+ }
+ return utf8_output - start;
+}
+#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
+
+} // namespace utf32_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8.h b/contrib/simdutf/src/scalar/utf8.h
new file mode 100644
index 000000000..404548621
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8.h
@@ -0,0 +1,295 @@
+#ifndef SIMDUTF_UTF8_H
+#define SIMDUTF_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8 {
+#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
+// only compiled for the fallback and RVV kernels (see the enclosing #if).
+// credit: based on code from Google Fuchsia (Apache Licensed)
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ uint64_t pos = 0;
+ uint32_t code_point = 0;
+ while (pos < len) {
+ // check if the next 16 bytes are ascii.
+ uint64_t next_pos = pos + 16;
+ if (next_pos <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ std::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2};
+ if ((v & 0x8080808080808080) == 0) {
+ pos = next_pos;
+ continue;
+ }
+ }
+ unsigned char byte = data[pos];
+
+ while (byte < 0b10000000) { // skip ASCII bytes one at a time
+ if (++pos == len) {
+ return true;
+ }
+ byte = data[pos];
+ }
+
+ if ((byte & 0b11100000) == 0b11000000) { // lead byte of a 2-byte sequence
+ next_pos = pos + 2;
+ if (next_pos > len) {
+ return false;
+ }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return false;
+ }
+ // range check
+ code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if ((code_point < 0x80) || (0x7ff < code_point)) {
+ return false;
+ }
+ } else if ((byte & 0b11110000) == 0b11100000) { // lead byte of a 3-byte sequence
+ next_pos = pos + 3;
+ if (next_pos > len) {
+ return false;
+ }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return false;
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return false;
+ }
+ // range check
+ code_point = (byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if ((code_point < 0x800) || (0xffff < code_point) ||
+ (0xd7ff < code_point && code_point < 0xe000)) { // overlong encoding or surrogate
+ return false;
+ }
+ } else if ((byte & 0b11111000) == 0b11110000) { // lead byte of a 4-byte sequence (0b11110xxx)
+ next_pos = pos + 4;
+ if (next_pos > len) {
+ return false;
+ }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return false;
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return false;
+ }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+ return false;
+ }
+ // range check
+ code_point =
+ (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff || 0x10ffff < code_point) {
+ return false;
+ }
+ } else {
+ // stray continuation byte or an invalid leading byte
+ return false;
+ }
+ pos = next_pos;
+ }
+ return true;
+}
+#endif
+
+inline simdutf_warn_unused result validate_with_errors(const char *buf,
+ size_t len) noexcept { // on error the result count is the index of the first byte of the bad sequence; on success it is len
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ uint32_t code_point = 0;
+ while (pos < len) {
+ // check if the next 16 bytes are ascii.
+ size_t next_pos = pos + 16;
+ if (next_pos <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ std::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2};
+ if ((v & 0x8080808080808080) == 0) {
+ pos = next_pos;
+ continue;
+ }
+ }
+ unsigned char byte = data[pos];
+
+ while (byte < 0b10000000) { // skip ASCII bytes one at a time
+ if (++pos == len) {
+ return result(error_code::SUCCESS, len);
+ }
+ byte = data[pos];
+ }
+
+ if ((byte & 0b11100000) == 0b11000000) { // lead byte of a 2-byte sequence
+ next_pos = pos + 2;
+ if (next_pos > len) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check
+ code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if ((code_point < 0x80) || (0x7ff < code_point)) {
+ return result(error_code::OVERLONG, pos);
+ }
+ } else if ((byte & 0b11110000) == 0b11100000) { // lead byte of a 3-byte sequence
+ next_pos = pos + 3;
+ if (next_pos > len) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check
+ code_point = (byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if ((code_point < 0x800) || (0xffff < code_point)) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0xd7ff < code_point && code_point < 0xe000) {
+ return result(error_code::SURROGATE, pos);
+ }
+ } else if ((byte & 0b11111000) == 0b11110000) { // lead byte of a 4-byte sequence (0b11110xxx)
+ next_pos = pos + 4;
+ if (next_pos > len) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check
+ code_point =
+ (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0x10ffff < code_point) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ } else {
+ // we either have too many continuation bytes or an invalid leading byte
+ if ((byte & 0b11000000) == 0b10000000) {
+ return result(error_code::TOO_LONG, pos);
+ } else {
+ return result(error_code::HEADER_BITS, pos);
+ }
+ }
+ pos = next_pos;
+ }
+ return result(error_code::SUCCESS, len);
+}
+
+// Finds the previous leading byte, searching backward from buf, and validates
+// with errors from there. Used to pinpoint the location of an error when an
+// invalid chunk is detected. We assume that the stream starts with a leading
+// byte, and to check that it is the case, we ask that you pass a pointer to
+// the start of the stream (start).
+inline simdutf_warn_unused result rewind_and_validate_with_errors(
+ const char *start, const char *buf, size_t len) noexcept {
+ // First check that we start with a leading byte
+ if ((*start & 0b11000000) == 0b10000000) {
+ return result(error_code::TOO_LONG, 0);
+ }
+ size_t extra_len{0}; // number of bytes buf was moved back
+ // A leading byte cannot be further than 4 bytes away
+ for (int i = 0; i < 5; i++) {
+ unsigned char byte = *buf;
+ if ((byte & 0b11000000) != 0b10000000) { // not a continuation byte: stop rewinding here
+ break;
+ } else {
+ buf--;
+ extra_len++;
+ }
+ }
+
+ result res = validate_with_errors(buf, len + extra_len);
+ res.count -= extra_len; // re-express the position relative to the buf that was passed in
+ return res;
+}
+
+inline size_t count_code_points(const char *buf, size_t len) { // counts the bytes that are not UTF-8 continuation bytes; no validation is done
+ const int8_t *p = reinterpret_cast<const int8_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ // -65 is 0b10111111 in two's complement; any larger signed value is not a
+ // continuation byte and therefore starts a new code point.
+ if (p[i] > -65) {
+ counter++;
+ }
+ }
+ return counter;
+}
+
+inline size_t utf16_length_from_utf8(const char *buf, size_t len) { // number of UTF-16 code units the UTF-8 input would need; no validation is done
+ const int8_t *p = reinterpret_cast<const int8_t *>(buf);
+ size_t counter{0};
+ for (size_t i = 0; i < len; i++) {
+ if (p[i] > -65) { // not a continuation byte: starts a code point, one code unit
+ counter++;
+ }
+ if (uint8_t(p[i]) >= 240) { // 0xF0 and above: lead byte of a 4-byte sequence, needs a second code unit (surrogate pair)
+ counter++;
+ }
+ }
+ return counter;
+}
+
+simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
+ size_t length) { // returns length reduced by 0..3 so that the input does not end inside an incomplete multi-byte sequence
+ if (length < 3) { // short inputs: only look at the bytes that exist
+ switch (length) {
+ case 2:
+ if (uint8_t(input[length - 1]) >= 0xc0) {
+ return length - 1;
+ } // 2-, 3- and 4-byte characters with only 1 byte left
+ if (uint8_t(input[length - 2]) >= 0xe0) {
+ return length - 2;
+ } // 3- and 4-byte characters with only 2 bytes left
+ return length;
+ case 1:
+ if (uint8_t(input[length - 1]) >= 0xc0) {
+ return length - 1;
+ } // 2-, 3- and 4-byte characters with only 1 byte left
+ return length;
+ case 0:
+ return length;
+ }
+ }
+ if (uint8_t(input[length - 1]) >= 0xc0) {
+ return length - 1;
+ } // 2-, 3- and 4-byte characters with only 1 byte left
+ if (uint8_t(input[length - 2]) >= 0xe0) {
+ return length - 2;
+ } // 3- and 4-byte characters with only 2 bytes left
+ if (uint8_t(input[length - 3]) >= 0xf0) {
+ return length - 3;
+ } // 4-byte characters with only 3 bytes left
+ return length;
+}
+
+} // namespace utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8_to_latin1/utf8_to_latin1.h b/contrib/simdutf/src/scalar/utf8_to_latin1/utf8_to_latin1.h
new file mode 100644
index 000000000..cefb1dda9
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8_to_latin1/utf8_to_latin1.h
@@ -0,0 +1,207 @@
+#ifndef SIMDUTF_UTF8_TO_LATIN1_H
+#define SIMDUTF_UTF8_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert(const char *buf, size_t len, char *latin_output) { // returns the number of Latin-1 bytes written, or 0 if the input is invalid or not representable
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ char *start{latin_output};
+
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
+ // 1000 1000 .... etc
+ if ((v & 0x8080808080808080) ==
+ 0) { // if NONE of these are set, e.g. all of them are zero, then
+ // everything is ASCII
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *latin_output++ = char(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+
+ // suppose it is not an all ASCII byte sequence
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *latin_output++ = char(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) ==
+ 0b11000000) { // the first three bits indicate:
+ // We have a two-byte UTF-8
+ if (pos + 1 >= len) {
+ return 0;
+ } // minimal bound checking
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ } // checks if the next byte is a valid continuation byte in UTF-8. A
+ // valid continuation byte starts with 10.
+ // range check -
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 |
+ (data[pos + 1] &
+ 0b00111111); // assembles the Unicode code point from the two bytes.
+ // It does this by discarding the leading 110 and 10
+ // bits from the two bytes, shifting the remaining bits
+ // of the first byte, and then combining the results
+ // with a bitwise OR operation.
+ if (code_point < 0x80 || 0xFF < code_point) {
+ return 0; // We only care about the range 128-255 which is Non-ASCII
+ // latin1 characters. A code_point beneath 0x80 is invalid as
+ // it is already covered by bytes whose leading bit is zero.
+ }
+ *latin_output++ = char(code_point);
+ pos += 2;
+ } else { // continuation byte, 3- or 4-byte lead, or invalid lead: none can yield a Latin-1 character
+ return 0;
+ }
+ }
+ return latin_output - start;
+}
+
+inline result convert_with_errors(const char *buf, size_t len,
+ char *latin_output) { // on success the result count is the number of bytes written; on error it is the input position of the bad sequence
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ char *start{latin_output};
+
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
+ // 1000 1000...etc
+ if ((v & 0x8080808080808080) ==
+ 0) { // if NONE of these are set, e.g. all of them are zero, then
+ // everything is ASCII
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *latin_output++ = char(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+ // suppose it is not an all ASCII byte sequence
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *latin_output++ = char(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) ==
+ 0b11000000) { // the first three bits indicate:
+ // We have a two-byte UTF-8
+ if (pos + 1 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // minimal bound checking
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ } // checks if the next byte is a valid continuation byte in UTF-8. A
+ // valid continuation byte starts with 10.
+ // range check -
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 |
+ (data[pos + 1] &
+ 0b00111111); // assembles the Unicode code point from the two bytes.
+ // It does this by discarding the leading 110 and 10
+ // bits from the two bytes, shifting the remaining bits
+ // of the first byte, and then combining the results
+ // with a bitwise OR operation.
+ if (code_point < 0x80) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0xFF < code_point) {
+ return result(error_code::TOO_LARGE, pos);
+ } // We only care about the range 128-255 which is Non-ASCII latin1
+ // characters
+ *latin_output++ = char(code_point);
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8 sequence: always outside Latin-1
+ return result(error_code::TOO_LARGE, pos);
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // lead byte 0b11110xxx
+ // we have a 4-byte UTF-8 word: always outside Latin-1
+ return result(error_code::TOO_LARGE, pos);
+ } else {
+ // we either have too many continuation bytes or an invalid leading byte
+ if ((leading_byte & 0b11000000) == 0b10000000) {
+ return result(error_code::TOO_LONG, pos);
+ }
+
+ return result(error_code::HEADER_BITS, pos);
+ }
+ }
+ return result(error_code::SUCCESS, latin_output - start);
+}
+
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+ const char *buf, size_t len,
+ char *latin1_output) { // rewinds buf to the nearest earlier non-continuation byte (at most prior_bytes back), then runs convert_with_errors from there
+ size_t extra_len{0};
+ // We potentially need to go back in time and find a leading byte.
+ // In theory '3' would be sufficient, but sometimes the error can go back
+ // quite far.
+ size_t how_far_back = prior_bytes; // the search may cover every prior byte, not just 3
+ // size_t how_far_back = 3; // 3 bytes in the past + current position
+ // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+ bool found_leading_bytes{false};
+ // important: it is i <= how_far_back and not 'i < how_far_back'.
+ for (size_t i = 0; i <= how_far_back; i++) {
+ unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
+ found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+ if (found_leading_bytes) {
+ if (i > 0 && byte < 128) {
+ // If we had to go back and the leading byte is ascii
+ // then we can stop right away.
+ return result(error_code::TOO_LONG, 0 - i + 1); // unsigned wrap-around: represents a position before buf
+ }
+ buf -= i;
+ extra_len = i;
+ break;
+ }
+ }
+ //
+ // It is possible for this function to return a negative count in its result.
+ // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
+ // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
+ // unsigned integral type of the result of the sizeof operator
+ //
+ // An unsigned type will simply wrap round arithmetically (well defined).
+ //
+ if (!found_leading_bytes) {
+ // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+ // [....] [continuation] [continuation] [continuation] | [buf is
+ // continuation] Or we possibly have a stream that does not start with a
+ // leading byte.
+ return result(error_code::TOO_LONG, 0 - how_far_back);
+ }
+ result res = convert_with_errors(buf, len + extra_len, latin1_output);
+ if (res.error) {
+ res.count -= extra_len; // on error, re-express the position relative to the buf passed in; on success the count (bytes written) is left as is
+ }
+ return res;
+}
+
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h b/contrib/simdutf/src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h
new file mode 100644
index 000000000..e5186b042
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h
@@ -0,0 +1,78 @@
+#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert_valid(const char *buf, size_t len, char *latin_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ // Converts UTF-8 that is assumed valid to Latin-1; returns bytes written.
+ size_t pos = 0;
+ char *start{latin_output};
+
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 |
+ v2}; // Only the high bit of each byte (0x80) matters, so OR-ing
+ // the two 8-byte words lets us test all 16 bytes at once
+ if ((v & 0x8080808080808080) ==
+ 0) { // if NONE of these are set, i.e. all of them are zero, then
+ // everything is ASCII
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *latin_output++ = char(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+
+ // suppose it is not an all ASCII byte sequence
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *latin_output++ = char(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) ==
+ 0b11000000) { // the first three bits indicate:
+ // We have a two-byte UTF-8
+ if (pos + 1 >= len) {
+ break;
+ } // the continuation byte is missing: stop and return what was written
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ } // checks if the next byte is a valid continuation byte in UTF-8. A
+ // valid continuation byte starts with 10.
+ // note: no range check is done; the value is narrowed to char below
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 |
+ (data[pos + 1] &
+ 0b00111111); // assembles the Unicode code point from the two bytes.
+ // It does this by discarding the leading 110 and 10
+ // bits from the two bytes, shifting the remaining bits
+ // of the first byte, and then combining the results
+ // with a bitwise OR operation.
+ *latin_output++ = char(code_point);
+ pos += 2;
+ } else {
+ // any other byte (e.g. a continuation) is not handled: return 0
+ return 0;
+ }
+ }
+ return latin_output - start;
+}
+
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8_to_utf16/utf8_to_utf16.h b/contrib/simdutf/src/scalar/utf8_to_utf16/utf8_to_utf16.h
new file mode 100644
index 000000000..a5b1c5f15
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8_to_utf16/utf8_to_utf16.h
@@ -0,0 +1,326 @@
+#ifndef SIMDUTF_UTF8_TO_UTF16_H
+#define SIMDUTF_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+template <endianness big_endian> // UTF-8 -> UTF-16; every failure returns 0
+inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2}; // only the high bit of each byte matters
+ if ((v & 0x8080808080808080) == 0) {
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(buf[pos]))
+ : char16_t(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(leading_byte))
+ : char16_t(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) == 0b11000000) {
+ // We have a two-byte UTF-8, it should become
+ // a single UTF-16 word.
+ if (pos + 1 >= len) {
+ return 0;
+ } // the sequence is cut off by the end of the input
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ // range check: a value below 0x80 would be an overlong encoding
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if (code_point < 0x80 || 0x7ff < code_point) {
+ return 0;
+ }
+ if (!match_system(big_endian)) {
+ code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+ }
+ *utf16_output++ = char16_t(code_point);
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8, it should become
+ // a single UTF-16 word.
+ if (pos + 2 >= len) {
+ return 0;
+ } // the sequence is cut off by the end of the input
+
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ // range check: reject overlong encodings and surrogates (D800..DFFF)
+ uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if (code_point < 0x800 || 0xffff < code_point ||
+ (0xd7ff < code_point && code_point < 0xe000)) {
+ return 0;
+ }
+ if (!match_system(big_endian)) {
+ code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+ }
+ *utf16_output++ = char16_t(code_point);
+ pos += 3;
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+ // we have a 4-byte UTF-8 word.
+ if (pos + 3 >= len) {
+ return 0;
+ } // the sequence is cut off by the end of the input
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+
+ // range check: reject overlong encodings and values above 0x10ffff
+ uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+ (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 |
+ (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff || 0x10ffff < code_point) {
+ return 0;
+ }
+ code_point -= 0x10000; // split the remaining 20 bits into a surrogate pair
+ uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = utf16::swap_bytes(high_surrogate);
+ low_surrogate = utf16::swap_bytes(low_surrogate);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ pos += 4;
+ } else {
+ return 0; // continuation byte or invalid leading byte
+ }
+ }
+ return utf16_output - start; // number of char16_t units written
+}
+
+template <endianness big_endian> // UTF-8 -> UTF-16 with error reporting
+inline result convert_with_errors(const char *buf, size_t len,
+ char16_t *utf16_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2}; // only the high bit of each byte matters
+ if ((v & 0x8080808080808080) == 0) {
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(buf[pos]))
+ : char16_t(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(leading_byte))
+ : char16_t(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) == 0b11000000) {
+ // We have a two-byte UTF-8, it should become
+ // a single UTF-16 word.
+ if (pos + 1 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // on error, the count is the input offset of the offending sequence
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check: a value below 0x80 would be an overlong encoding
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if (code_point < 0x80 || 0x7ff < code_point) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (!match_system(big_endian)) {
+ code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+ }
+ *utf16_output++ = char16_t(code_point);
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8, it should become
+ // a single UTF-16 word.
+ if (pos + 2 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // the sequence is cut off by the end of the input
+
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check: overlong encodings first, then surrogates (D800..DFFF)
+ uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if ((code_point < 0x800) || (0xffff < code_point)) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0xd7ff < code_point && code_point < 0xe000) {
+ return result(error_code::SURROGATE, pos);
+ }
+ if (!match_system(big_endian)) {
+ code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+ }
+ *utf16_output++ = char16_t(code_point);
+ pos += 3;
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+ // we have a 4-byte UTF-8 word.
+ if (pos + 3 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // the sequence is cut off by the end of the input
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+
+ // range check: overlong encodings first, then values above 0x10ffff
+ uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+ (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 |
+ (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0x10ffff < code_point) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ code_point -= 0x10000; // split the remaining 20 bits into a surrogate pair
+ uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = utf16::swap_bytes(high_surrogate);
+ low_surrogate = utf16::swap_bytes(low_surrogate);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ pos += 4;
+ } else {
+ // we either have too many continuation bytes or an invalid leading byte
+ if ((leading_byte & 0b11000000) == 0b10000000) {
+ return result(error_code::TOO_LONG, pos);
+ } else {
+ return result(error_code::HEADER_BITS, pos);
+ }
+ }
+ }
+ return result(error_code::SUCCESS, utf16_output - start); // units written
+}
+
+/**
+ * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
+ * we have up to len input bytes left, and we encountered some error. It is
+ * possible that the error is at 'buf' exactly, but it could also be in the
+ * previous bytes (this version may look back as far as prior_bytes).
+ *
+ * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
+ * current memory section and can be safely accessed. We use prior_bytes to
+ * bound how far before 'buf' we are allowed to read.
+ *
+ * The caller is responsible to ensure that len > 0.
+ *
+ * If the error is believed to have occurred k bytes prior to 'buf', the count
+ * value contained in the result wraps around to (size_t)0 - k.
+ */
+template <endianness endian>
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+ const char *buf, size_t len,
+ char16_t *utf16_output) {
+ size_t extra_len{0};
+ // We potentially need to go back in time and find a leading byte.
+ // In theory '3' would be sufficient, but sometimes the error can go back
+ // quite far.
+ size_t how_far_back = prior_bytes;
+ // size_t how_far_back = 3; // 3 bytes in the past + current position
+ // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+ bool found_leading_bytes{false};
+ // important: it is i <= how_far_back and not 'i < how_far_back'.
+ for (size_t i = 0; i <= how_far_back; i++) {
+ unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
+ found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+ if (found_leading_bytes) {
+ if (i > 0 && byte < 128) {
+ // If we had to go back and the leading byte is ascii
+ // then we can stop right away.
+ return result(error_code::TOO_LONG, 0 - i + 1);
+ }
+ buf -= i;
+ extra_len = i;
+ break;
+ }
+ }
+ //
+ // It is possible for this function to return a negative count in its result.
+ // C++ Standard Section 18.1 defines size_t in <cstddef> which is described
+ // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
+ // unsigned integral type of the result of the sizeof operator
+ //
+ // An unsigned type will simply wrap round arithmetically (well defined).
+ //
+ if (!found_leading_bytes) {
+ // No leading byte was found between buf - how_far_back and buf, e.g.
+ // [....] [continuation] [continuation] [continuation] | [buf is
+ // continuation]. Or we possibly have a stream that does not start with a
+ // leading byte.
+ return result(error_code::TOO_LONG, 0 - how_far_back);
+ }
+ result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
+ if (res.error) {
+ res.count -= extra_len; // make the error offset relative to the original buf
+ }
+ return res;
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h b/contrib/simdutf/src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h
new file mode 100644
index 000000000..d0ed78456
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h
@@ -0,0 +1,98 @@
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
+#define SIMDUTF_VALID_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+template <endianness big_endian> // UTF-8 assumed valid -> UTF-16
+inline size_t convert_valid(const char *buf, size_t len,
+ char16_t *utf16_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0;
+ char16_t *start{utf16_output};
+ while (pos < len) {
+ // try to convert the next block of 8 ASCII bytes
+ if (pos + 8 <=
+ len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0x8080808080808080) == 0) {
+ size_t final_pos = pos + 8;
+ while (pos < final_pos) {
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(buf[pos]))
+ : char16_t(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *utf16_output++ = !match_system(big_endian)
+ ? char16_t(utf16::swap_bytes(leading_byte))
+ : char16_t(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) == 0b11000000) {
+ // We have a two-byte UTF-8, it should become
+ // a single UTF-16 word.
+ if (pos + 1 >= len) {
+ break;
+ } // sequence cut off by the end of input: stop, keep what was written
+ uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
+ (data[pos + 1] & 0b00111111));
+ if (!match_system(big_endian)) {
+ code_point = utf16::swap_bytes(uint16_t(code_point));
+ }
+ *utf16_output++ = char16_t(code_point);
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8, it should become
+ // a single UTF-16 word.
+ if (pos + 2 >= len) {
+ break;
+ } // sequence cut off by the end of input: stop, keep what was written
+ uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) |
+ ((data[pos + 1] & 0b00111111) << 6) |
+ (data[pos + 2] & 0b00111111));
+ if (!match_system(big_endian)) {
+ code_point = utf16::swap_bytes(uint16_t(code_point));
+ }
+ *utf16_output++ = char16_t(code_point);
+ pos += 3;
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+ // we have a 4-byte UTF-8 word.
+ if (pos + 3 >= len) {
+ break;
+ } // sequence cut off by the end of input: stop, keep what was written
+ uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
+ ((data[pos + 1] & 0b00111111) << 12) |
+ ((data[pos + 2] & 0b00111111) << 6) |
+ (data[pos + 3] & 0b00111111);
+ code_point -= 0x10000; // split the remaining 20 bits into a surrogate pair
+ uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+ if (!match_system(big_endian)) {
+ high_surrogate = utf16::swap_bytes(high_surrogate);
+ low_surrogate = utf16::swap_bytes(low_surrogate);
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ pos += 4;
+ } else {
+ // we may have a continuation but we do not do error checking
+ return 0;
+ }
+ }
+ return utf16_output - start; // number of char16_t units written
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8_to_utf32/utf8_to_utf32.h b/contrib/simdutf/src/scalar/utf8_to_utf32/utf8_to_utf32.h
new file mode 100644
index 000000000..85dba93d5
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8_to_utf32/utf8_to_utf32.h
@@ -0,0 +1,282 @@
+#ifndef SIMDUTF_UTF8_TO_UTF32_H
+#define SIMDUTF_UTF8_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf32 {
+
+inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0; // UTF-8 -> UTF-32; every failure path below returns 0
+ char32_t *start{utf32_output};
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2}; // only the high bit of each byte matters
+ if ((v & 0x8080808080808080) == 0) {
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *utf32_output++ = char32_t(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *utf32_output++ = char32_t(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) == 0b11000000) {
+ // We have a two-byte UTF-8
+ if (pos + 1 >= len) {
+ return 0;
+ } // the sequence is cut off by the end of the input
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ // range check: a value below 0x80 would be an overlong encoding
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if (code_point < 0x80 || 0x7ff < code_point) {
+ return 0;
+ }
+ *utf32_output++ = char32_t(code_point);
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8
+ if (pos + 2 >= len) {
+ return 0;
+ } // the sequence is cut off by the end of the input
+
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ // range check: reject overlong encodings and surrogates (D800..DFFF)
+ uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if (code_point < 0x800 || 0xffff < code_point ||
+ (0xd7ff < code_point && code_point < 0xe000)) {
+ return 0;
+ }
+ *utf32_output++ = char32_t(code_point);
+ pos += 3;
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+ // we have a 4-byte UTF-8 word.
+ if (pos + 3 >= len) {
+ return 0;
+ } // the sequence is cut off by the end of the input
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+ return 0;
+ }
+
+ // range check: reject overlong encodings and values above 0x10ffff
+ uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+ (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 |
+ (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff || 0x10ffff < code_point) {
+ return 0;
+ }
+ *utf32_output++ = char32_t(code_point);
+ pos += 4;
+ } else {
+ return 0; // continuation byte or invalid leading byte
+ }
+ }
+ return utf32_output - start; // number of char32_t values written
+}
+
+inline result convert_with_errors(const char *buf, size_t len,
+ char32_t *utf32_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0; // UTF-8 -> UTF-32 with error reporting
+ char32_t *start{utf32_output};
+ while (pos < len) {
+ // try to convert the next block of 16 ASCII bytes
+ if (pos + 16 <=
+ len) { // if it is safe to read 16 more bytes, check that they are ascii
+ uint64_t v1;
+ ::memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2}; // only the high bit of each byte matters
+ if ((v & 0x8080808080808080) == 0) {
+ size_t final_pos = pos + 16;
+ while (pos < final_pos) {
+ *utf32_output++ = char32_t(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *utf32_output++ = char32_t(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) == 0b11000000) {
+ // We have a two-byte UTF-8
+ if (pos + 1 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // on error, the count is the input offset of the offending sequence
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check: a value below 0x80 would be an overlong encoding
+ uint32_t code_point =
+ (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if (code_point < 0x80 || 0x7ff < code_point) {
+ return result(error_code::OVERLONG, pos);
+ }
+ *utf32_output++ = char32_t(code_point);
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8
+ if (pos + 2 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // the sequence is cut off by the end of the input
+
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ // range check: overlong encodings first, then surrogates (D800..DFFF)
+ uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if (code_point < 0x800 || 0xffff < code_point) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0xd7ff < code_point && code_point < 0xe000) {
+ return result(error_code::SURROGATE, pos);
+ }
+ *utf32_output++ = char32_t(code_point);
+ pos += 3;
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+ // we have a 4-byte UTF-8 word.
+ if (pos + 3 >= len) {
+ return result(error_code::TOO_SHORT, pos);
+ } // the sequence is cut off by the end of the input
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+ return result(error_code::TOO_SHORT, pos);
+ }
+
+ // range check: overlong encodings first, then values above 0x10ffff
+ uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+ (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 |
+ (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff) {
+ return result(error_code::OVERLONG, pos);
+ }
+ if (0x10ffff < code_point) {
+ return result(error_code::TOO_LARGE, pos);
+ }
+ *utf32_output++ = char32_t(code_point);
+ pos += 4;
+ } else {
+ // we either have too many continuation bytes or an invalid leading byte
+ if ((leading_byte & 0b11000000) == 0b10000000) {
+ return result(error_code::TOO_LONG, pos);
+ } else {
+ return result(error_code::HEADER_BITS, pos);
+ }
+ }
+ }
+ return result(error_code::SUCCESS, utf32_output - start); // values written
+}
+
+/**
+ * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
+ * we have up to len input bytes left, and we encountered some error. It is
+ * possible that the error is at 'buf' exactly, but it could also be in the
+ * previous bytes (up to 3 bytes back).
+ *
+ * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
+ * current memory section and can be safely accessed. We use prior_bytes to
+ * safely access up to three bytes before 'buf'.
+ *
+ * The caller is responsible to ensure that len > 0.
+ *
+ * If the error is believed to have occurred prior to 'buf', the count value
+ * contained in the result will be (size_t)-1, (size_t)-2, or (size_t)-3.
+ */
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+ const char *buf, size_t len,
+ char32_t *utf32_output) {
+ size_t extra_len{0};
+ // We potentially need to go back in time and find a leading byte.
+ size_t how_far_back = 3; // 3 bytes in the past + current position
+ if (how_far_back > prior_bytes) {
+ how_far_back = prior_bytes;
+ }
+ bool found_leading_bytes{false};
+ // important: it is i <= how_far_back and not 'i < how_far_back'.
+ for (size_t i = 0; i <= how_far_back; i++) {
+ unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
+ found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+ if (found_leading_bytes) {
+ if (i > 0 && byte < 128) {
+ // If we had to go back and the leading byte is ascii
+ // then we can stop right away.
+ return result(error_code::TOO_LONG, 0 - i + 1);
+ }
+ buf -= i;
+ extra_len = i;
+ break;
+ }
+ }
+ //
+ // It is possible for this function to return a negative count in its result.
+ // C++ Standard Section 18.1 defines size_t in <cstddef> which is described
+ // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
+ // unsigned integral type of the result of the sizeof operator
+ //
+ // An unsigned type will simply wrap round arithmetically (well defined).
+ //
+ if (!found_leading_bytes) {
+ // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+ // [....] [continuation] [continuation] [continuation] | [buf is
+ // continuation] Or we possibly have a stream that does not start with a
+ // leading byte.
+ return result(error_code::TOO_LONG, 0 - how_far_back);
+ }
+
+ result res = convert_with_errors(buf, len + extra_len, utf32_output);
+ if (res.error) {
+ res.count -= extra_len; // make the error offset relative to the original buf
+ }
+ return res;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h b/contrib/simdutf/src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h
new file mode 100644
index 000000000..4110e6903
--- /dev/null
+++ b/contrib/simdutf/src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h
@@ -0,0 +1,75 @@
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
+#define SIMDUTF_VALID_UTF8_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf32 {
+
+inline size_t convert_valid(const char *buf, size_t len,
+ char32_t *utf32_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ size_t pos = 0; // UTF-8 assumed valid -> UTF-32; no validation is done
+ char32_t *start{utf32_output};
+ while (pos < len) {
+ // try to convert the next block of 8 ASCII bytes
+ if (pos + 8 <=
+ len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v;
+ ::memcpy(&v, data + pos, sizeof(uint64_t));
+ if ((v & 0x8080808080808080) == 0) {
+ size_t final_pos = pos + 8;
+ while (pos < final_pos) {
+ *utf32_output++ = char32_t(buf[pos]);
+ pos++;
+ }
+ continue;
+ }
+ }
+ uint8_t leading_byte = data[pos]; // leading byte
+ if (leading_byte < 0b10000000) {
+ // converting one ASCII byte !!!
+ *utf32_output++ = char32_t(leading_byte);
+ pos++;
+ } else if ((leading_byte & 0b11100000) == 0b11000000) {
+ // We have a two-byte UTF-8
+ if (pos + 1 >= len) {
+ break;
+ } // sequence cut off by the end of input: stop, keep what was written
+ *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
+ (data[pos + 1] & 0b00111111));
+ pos += 2;
+ } else if ((leading_byte & 0b11110000) == 0b11100000) {
+ // We have a three-byte UTF-8
+ if (pos + 2 >= len) {
+ break;
+ } // sequence cut off by the end of input: stop, keep what was written
+ *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
+ ((data[pos + 1] & 0b00111111) << 6) |
+ (data[pos + 2] & 0b00111111));
+ pos += 3;
+ } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+ // we have a 4-byte UTF-8 word.
+ if (pos + 3 >= len) {
+ break;
+ } // sequence cut off by the end of input: stop, keep what was written
+ uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
+ ((data[pos + 1] & 0b00111111) << 12) |
+ ((data[pos + 2] & 0b00111111) << 6) |
+ (data[pos + 3] & 0b00111111);
+ *utf32_output++ = char32_t(code_word);
+ pos += 4;
+ } else {
+ // we may have a continuation but we do not do error checking
+ return 0;
+ }
+ }
+ return utf32_output - start; // number of char32_t values written
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
diff --git a/contrib/simdutf/src/simdutf.cpp b/contrib/simdutf/src/simdutf.cpp
new file mode 100644
index 000000000..81b2355aa
--- /dev/null
+++ b/contrib/simdutf/src/simdutf.cpp
@@ -0,0 +1,82 @@
+#include "simdutf.h"
+// We include base64_tables once.
+#include "tables/base64_tables.h"
+#include "implementation.cpp"
+#include "encoding_types.cpp"
+#include "error.cpp"
+// The large tables should be included once and they
+// should not depend on a kernel.
+#include "tables/utf8_to_utf16_tables.h"
+#include "tables/utf16_to_utf8_tables.h"
+// End of tables.
+
+// The scalar routines should be included once.
+#include "scalar/ascii.h"
+#include "scalar/utf8.h"
+#include "scalar/utf16.h"
+#include "scalar/utf32.h"
+#include "scalar/latin1.h"
+#include "scalar/base64.h"
+
+#include "scalar/utf32_to_utf8/valid_utf32_to_utf8.h"
+#include "scalar/utf32_to_utf8/utf32_to_utf8.h"
+
+#include "scalar/utf32_to_utf16/valid_utf32_to_utf16.h"
+#include "scalar/utf32_to_utf16/utf32_to_utf16.h"
+
+#include "scalar/utf16_to_utf8/valid_utf16_to_utf8.h"
+#include "scalar/utf16_to_utf8/utf16_to_utf8.h"
+
+#include "scalar/utf16_to_utf32/valid_utf16_to_utf32.h"
+#include "scalar/utf16_to_utf32/utf16_to_utf32.h"
+
+#include "scalar/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "scalar/utf8_to_utf16/utf8_to_utf16.h"
+
+#include "scalar/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "scalar/utf8_to_utf32/utf8_to_utf32.h"
+
+#include "scalar/latin1_to_utf8/latin1_to_utf8.h"
+#include "scalar/latin1_to_utf16/latin1_to_utf16.h"
+#include "scalar/latin1_to_utf32/latin1_to_utf32.h"
+
+#include "scalar/utf8_to_latin1/utf8_to_latin1.h"
+#include "scalar/utf16_to_latin1/utf16_to_latin1.h"
+#include "scalar/utf32_to_latin1/utf32_to_latin1.h"
+
+#include "scalar/utf8_to_latin1/valid_utf8_to_latin1.h"
+#include "scalar/utf16_to_latin1/valid_utf16_to_latin1.h"
+#include "scalar/utf32_to_latin1/valid_utf32_to_latin1.h"
+
+SIMDUTF_PUSH_DISABLE_WARNINGS
+SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+
+#if SIMDUTF_IMPLEMENTATION_ARM64
+ #include "arm64/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+ #include "fallback/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+ #include "icelake/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+ #include "haswell/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+ #include "ppc64/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+ #include "rvv/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+ #include "westmere/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX
+ #include "lsx/implementation.cpp"
+#endif
+#if SIMDUTF_IMPLEMENTATION_LASX
+ #include "lasx/implementation.cpp"
+#endif
+
+SIMDUTF_POP_DISABLE_WARNINGS
diff --git a/contrib/simdutf/src/simdutf/arm64.h b/contrib/simdutf/src/simdutf/arm64.h
new file mode 100644
index 000000000..5955c8bb9
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64.h
@@ -0,0 +1,43 @@
+#ifndef SIMDUTF_ARM64_H
+#define SIMDUTF_ARM64_H
+
+#ifdef SIMDUTF_FALLBACK_H
+ #error "arm64.h must be included before fallback.h"
+#endif
+
+#include "simdutf/portability.h"
+
+#ifndef SIMDUTF_IMPLEMENTATION_ARM64
+ #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
+#endif
+#if SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
+ #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 1
+#else
+ #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 0
+#endif
+
+#include "simdutf/internal/isadetection.h"
+
+#if SIMDUTF_IMPLEMENTATION_ARM64
+
+namespace simdutf {
+/**
+ * Implementation for NEON (ARMv8).
+ */
+namespace arm64 {} // namespace arm64
+} // namespace simdutf
+
+ #include "simdutf/arm64/implementation.h"
+
+ #include "simdutf/arm64/begin.h"
+
+ // Declarations
+ #include "simdutf/arm64/intrinsics.h"
+ #include "simdutf/arm64/bitmanipulation.h"
+ #include "simdutf/arm64/simd.h"
+
+ #include "simdutf/arm64/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_ARM64
+
+#endif // SIMDUTF_ARM64_H
diff --git a/contrib/simdutf/src/simdutf/arm64/begin.h b/contrib/simdutf/src/simdutf/arm64/begin.h
new file mode 100644
index 000000000..3ad489457
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/begin.h
@@ -0,0 +1 @@
+// Names the namespace that the arm64 headers open through
+// `namespace SIMDUTF_IMPLEMENTATION`; arm64/end.h removes the definition.
+#define SIMDUTF_IMPLEMENTATION arm64
diff --git a/contrib/simdutf/src/simdutf/arm64/bitmanipulation.h b/contrib/simdutf/src/simdutf/arm64/bitmanipulation.h
new file mode 100644
index 000000000..fc51c0b53
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/bitmanipulation.h
@@ -0,0 +1,31 @@
+#ifndef SIMDUTF_ARM64_BITMANIPULATION_H
+#define SIMDUTF_ARM64_BITMANIPULATION_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+/* Population count: the number of bits set to 1 in input_num (0 for zero). */
+simdutf_really_inline int count_ones(uint64_t input_num) {
+ // vcnt_u8 counts the set bits of each of the eight bytes; vaddv_u8 then
+ // sums those eight per-byte counts into a single value.
+ const uint8x8_t per_byte_counts = vcnt_u8(vcreate_u8(input_num));
+ return vaddv_u8(per_byte_counts);
+}
+
+#if SIMDUTF_NEED_TRAILING_ZEROES
+/*
+ * Index of the lowest set bit of input_num, i.e. the number of zero bits
+ * below it. The result is undefined when input_num is zero.
+ */
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+ #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
+ return __builtin_ctzll(input_num);
+ #else // SIMDUTF_REGULAR_VISUAL_STUDIO
+ // Visual Studio path: scan from the least significant bit (LSB) towards
+ // the most significant bit (MSB) for the first set bit.
+ unsigned long lowest_set_bit;
+ _BitScanForward64(&lowest_set_bit, input_num);
+ return static_cast<int>(lowest_set_bit);
+ #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_ARM64_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/arm64/end.h b/contrib/simdutf/src/simdutf/arm64/end.h
new file mode 100644
index 000000000..58fd810d4
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/end.h
@@ -0,0 +1 @@
+// Removes the implementation name defined by arm64/begin.h.
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/arm64/implementation.h b/contrib/simdutf/src/simdutf/arm64/implementation.h
new file mode 100644
index 000000000..7066ccff9
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/implementation.h
@@ -0,0 +1,221 @@
+#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
+#define SIMDUTF_ARM64_IMPLEMENTATION_H
+
+#include "simdutf.h"
+#include "simdutf/internal/isadetection.h"
+
+namespace simdutf {
+namespace arm64 {
+
+namespace {
+using namespace simdutf;
+}
+
+/**
+ * The arm64 kernel class. It derives from simdutf::implementation and
+ * constructs its base with the name "arm64", the description "ARM NEON" and
+ * internal::instruction_set::NEON. Apart from the constructor, this header
+ * only declares the member functions; no definitions appear here.
+ */
+class implementation final : public simdutf::implementation {
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("arm64", "ARM NEON",
+ internal::instruction_set::NEON) {}
+ // Encoding detection and validation.
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ // Transcoding between Latin1, UTF-8, UTF-16 (LE/BE) and UTF-32.
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ // Counting and *_length_from_* queries.
+ // NOTE(review): from here to the end of the class the declarations omit
+ // `final`, unlike those above. Presumably they still override virtual
+ // functions of simdutf::implementation; confirm against the base class.
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ // Base64 decoding and encoding. The decoders are declared twice, once for
+ // char input and once for char16_t input; last_chunk_options defaults to
+ // last_chunk_handling_options::loose.
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+};
+
+} // namespace arm64
+} // namespace simdutf
+
+#endif // SIMDUTF_ARM64_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/arm64/intrinsics.h b/contrib/simdutf/src/simdutf/arm64/intrinsics.h
new file mode 100644
index 000000000..bd239633f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/intrinsics.h
@@ -0,0 +1,10 @@
+#ifndef SIMDUTF_ARM64_INTRINSICS_H
+#define SIMDUTF_ARM64_INTRINSICS_H
+
+#include "simdutf.h"
+
+// This should be the correct header whether
+// you use visual studio or other compilers.
+#include <arm_neon.h>
+
+#endif // SIMDUTF_ARM64_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/arm64/simd.h b/contrib/simdutf/src/simdutf/arm64/simd.h
new file mode 100644
index 000000000..12612553a
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/simd.h
@@ -0,0 +1,725 @@
+#ifndef SIMDUTF_ARM64_SIMD_H
+#define SIMDUTF_ARM64_SIMD_H
+
+#include "simdutf.h"
+#include "simdutf/arm64/bitmanipulation.h"
+#include <type_traits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace simd {
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+namespace {
+ // Start of private section with Visual Studio workaround
+ //
+ // Each simdutf_make_* macro expands to an immediately invoked lambda that
+ // copies its arguments into a local array and loads that array into a NEON
+ // register (vld1q_* for 128-bit vectors, vld1_* for 64-bit vectors).
+ // Every macro is guarded by #ifndef, so an earlier definition wins.
+ // NOTE(review): presumably this exists because the brace-initialization of
+ // NEON vector types used on the non-Visual-Studio paths is not accepted by
+ // Visual Studio; confirm.
+
+ #ifndef simdutf_make_uint8x16_t
+ #define simdutf_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, \
+ x11, x12, x13, x14, x15, x16) \
+ ([=]() { \
+ uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9, x10, x11, x12, x13, x14, x15, x16}; \
+ return vld1q_u8(array); \
+ }())
+ #endif
+ #ifndef simdutf_make_int8x16_t
+ #define simdutf_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, \
+ x11, x12, x13, x14, x15, x16) \
+ ([=]() { \
+ int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9, x10, x11, x12, x13, x14, x15, x16}; \
+ return vld1q_s8(array); \
+ }())
+ #endif
+
+ #ifndef simdutf_make_uint8x8_t
+ #define simdutf_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
+ ([=]() { \
+ uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
+ return vld1_u8(array); \
+ }())
+ #endif
+ #ifndef simdutf_make_int8x8_t
+ #define simdutf_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
+ ([=]() { \
+ int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
+ return vld1_s8(array); \
+ }())
+ #endif
+ #ifndef simdutf_make_uint16x8_t
+ #define simdutf_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
+ ([=]() { \
+ uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
+ return vld1q_u16(array); \
+ }())
+ #endif
+ #ifndef simdutf_make_int16x8_t
+ #define simdutf_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
+ ([=]() { \
+ int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
+ return vld1q_s16(array); \
+ }())
+ #endif
+
+// End of private section with Visual Studio workaround
+} // namespace
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+
+template <typename T> struct simd8;
+
+//
+// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t
+// internally.
+//
+// T is the element type of the derived simd8<T>; Mask is the type returned by
+// the lane-wise equality comparison (simd8<bool> by default).
+//
+template <typename T, typename Mask = simd8<bool>> struct base_u8 {
+ uint8x16_t value;
+ // Size of the register in bytes.
+ static const int SIZE = sizeof(value);
+
+ // Conversion from/to SIMD register
+ simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
+ simdutf_really_inline operator const uint8x16_t &() const {
+ return this->value;
+ }
+ simdutf_really_inline operator uint8x16_t &() { return this->value; }
+ // Lane 0 and lane 15, respectively.
+ simdutf_really_inline T first() const { return vgetq_lane_u8(*this, 0); }
+ simdutf_really_inline T last() const { return vgetq_lane_u8(*this, 15); }
+
+ // Bit operations
+ simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
+ return vorrq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
+ return vandq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
+ return veorq_u8(*this, other);
+ }
+ // Computes *this & ~other (vbicq_u8 clears the bits that are set in other).
+ simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const {
+ return vbicq_u8(*this, other);
+ }
+ // Bitwise NOT, written as XOR with 0xFFu; the constant reaches operator^
+ // through an implicit conversion to simd8<T>.
+ simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+ // The compound assignments downcast `this` to the derived simd8<T> so that
+ // both the operation and the assignment happen on the derived type.
+ simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
+ auto this_cast = static_cast<simd8<T> *>(this);
+ *this_cast = *this_cast | other;
+ return *this_cast;
+ }
+ simdutf_really_inline simd8<T> &operator&=(const simd8<T> other) {
+ auto this_cast = static_cast<simd8<T> *>(this);
+ *this_cast = *this_cast & other;
+ return *this_cast;
+ }
+ simdutf_really_inline simd8<T> &operator^=(const simd8<T> other) {
+ auto this_cast = static_cast<simd8<T> *>(this);
+ *this_cast = *this_cast ^ other;
+ return *this_cast;
+ }
+
+ // Lane-wise equality; the result is returned as Mask.
+ friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+ const simd8<T> rhs) {
+ return vceqq_u8(lhs, rhs);
+ }
+
+ // Shifts the byte stream by N: the result is the last N bytes of prev_chunk
+ // followed by the first 16 - N bytes of *this (vextq_u8 at offset 16 - N).
+ template <int N = 1>
+ simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+ return vextq_u8(prev_chunk, *this, 16 - N);
+ }
+};
+
+// SIMD byte mask type (returned by things like eq and gt).
+// The bitmask helpers below assume that every lane holds either 0x00 (false)
+// or 0xFF (true), which is what the NEON comparison intrinsics produce.
+template <> struct simd8<bool> : base_u8<bool> {
+ typedef uint16_t bitmask_t;
+ typedef uint32_t bitmask2_t;
+
+ // Broadcasts _value to all 16 lanes: 0xFF for true, 0x00 for false.
+ static simdutf_really_inline simd8<bool> splat(bool _value) {
+ return vmovq_n_u8(uint8_t(-(!!_value)));
+ }
+
+ simdutf_really_inline simd8(const uint8x16_t _value)
+ : base_u8<bool>(_value) {}
+ // False constructor
+ simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
+ // Splat constructor
+ simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
+ // Writes the 16 mask bytes to dst.
+ simdutf_really_inline void store(uint8_t dst[16]) const {
+ return vst1q_u8(dst, *this);
+ }
+
+ // Packs the mask into 16 bits; for 0x00/0xFF lanes, bit i is set exactly
+ // when lane i is true.
+ //
+ // We return uint32_t instead of uint16_t because that seems to be more
+ // efficient for most purposes (cutting it down to uint16_t costs performance
+ // in some compilers).
+ simdutf_really_inline uint32_t to_bitmask() const {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint8x16_t bit_mask =
+ simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+ const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
+ // Keep one distinct bit per lane, then fold with three rounds of pairwise
+ // adds: lanes 0-7 end up summed in byte 0 and lanes 8-15 in byte 1.
+ auto minput = *this & bit_mask;
+ uint8x16_t tmp = vpaddq_u8(minput, minput);
+ tmp = vpaddq_u8(tmp, tmp);
+ tmp = vpaddq_u8(tmp, tmp);
+ return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+ }
+
+ // Returns a 64-bit mask holding 4 bits per lane. Each 16-bit group is
+ // shifted right by 4 and narrowed to 8 bits, which keeps the high nibble of
+ // its even byte and the low nibble of its odd byte. When the register is the
+ // result of a comparison (lanes 0x00 or 0xFF), lane i therefore maps to
+ // nibble i of the result (0x0 or 0xF), and testing the result against zero
+ // is equivalent to none(); it is expected to be faster than none().
+ simdutf_really_inline uint64_t to_bitmask64() const {
+ return vget_lane_u64(
+ vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
+ }
+
+ // True when at least one bit of the register is set.
+ simdutf_really_inline bool any() const {
+ return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0;
+ }
+ // True when no bit of the register is set.
+ simdutf_really_inline bool none() const {
+ return vmaxvq_u32(vreinterpretq_u32_u8(*this)) == 0;
+ }
+ // True when every lane is 0xFF: the smallest of the four 32-bit groups must
+ // have all 32 bits set. (The previous constant 0xFFFFF covered only 20 bits,
+ // so the test could never succeed for a full mask.)
+ simdutf_really_inline bool all() const {
+ return vminvq_u32(vreinterpretq_u32_u8(*this)) == 0xFFFFFFFFu;
+ }
+};
+
+// Unsigned bytes: 16 uint8_t lanes held in one uint8x16_t register.
+template <> struct simd8<uint8_t> : base_u8<uint8_t> {
+ // Broadcasts _value to all 16 lanes.
+ static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
+ return vmovq_n_u8(_value);
+ }
+ static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
+ // Loads 16 bytes starting at values.
+ static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
+ return vld1q_u8(values);
+ }
+ simdutf_really_inline simd8(const uint8x16_t _value)
+ : base_u8<uint8_t>(_value) {}
+ // Zero constructor
+ simdutf_really_inline simd8() : simd8(zero()) {}
+ // Array constructor
+ simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
+ // Splat constructor
+ simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+ // Member-by-member initialization
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+ : simd8(simdutf_make_uint8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15)) {}
+#else
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+ : simd8(uint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15}) {}
+#endif
+
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ // With a 16-byte register this is exactly one copy of the 16 values.
+ simdutf_really_inline static simd8<uint8_t>
+ repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+ uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+ uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+ uint8_t v15) {
+ return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Store to array
+ simdutf_really_inline void store(uint8_t dst[16]) const {
+ return vst1q_u8(dst, *this);
+ }
+
+ // Saturated math
+ simdutf_really_inline simd8<uint8_t>
+ saturating_add(const simd8<uint8_t> other) const {
+ return vqaddq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ saturating_sub(const simd8<uint8_t> other) const {
+ return vqsubq_u8(*this, other);
+ }
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd8<uint8_t>
+ operator+(const simd8<uint8_t> other) const {
+ return vaddq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ operator-(const simd8<uint8_t> other) const {
+ return vsubq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t> &operator+=(const simd8<uint8_t> other) {
+ *this = *this + other;
+ return *this;
+ }
+ simdutf_really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) {
+ *this = *this - other;
+ return *this;
+ }
+
+ // Order-specific operations
+ // The no-argument max_val/min_val reduce across the 16 lanes to a single
+ // byte; the one-argument overloads work lane by lane against other.
+ simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
+ simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); }
+ simdutf_really_inline simd8<uint8_t>
+ max_val(const simd8<uint8_t> other) const {
+ return vmaxq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ min_val(const simd8<uint8_t> other) const {
+ return vminq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint8_t> other) const {
+ return vcleq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint8_t> other) const {
+ return vcgeq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<(const simd8<uint8_t> other) const {
+ return vcltq_u8(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator>(const simd8<uint8_t> other) const {
+ return vcgtq_u8(*this, other);
+ }
+ // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true
+ // = nonzero. For ARM, returns all 1's.
+ simdutf_really_inline simd8<uint8_t>
+ gt_bits(const simd8<uint8_t> other) const {
+ return simd8<uint8_t>(*this > other);
+ }
+ // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true
+ // = nonzero. For ARM, returns all 1's.
+ simdutf_really_inline simd8<uint8_t>
+ lt_bits(const simd8<uint8_t> other) const {
+ return simd8<uint8_t>(*this < other);
+ }
+
+ // Bit-specific operations
+ // Per lane: true when the lane shares at least one set bit with bits.
+ simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+ return vtstq_u8(*this, bits);
+ }
+ // True when every lane is below 0x80.
+ simdutf_really_inline bool is_ascii() const {
+ return this->max_val() < 0b10000000u;
+ }
+
+ // True when any lane is nonzero; the overload first masks with bits.
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return this->max_val() != 0;
+ }
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+ return (*this & bits).any_bits_set_anywhere();
+ }
+ // Shift every lane right/left by the compile-time constant N.
+ template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+ return vshrq_n_u8(*this, N);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+ return vshlq_n_u8(*this, N);
+ }
+
+ // Perform a lookup assuming the value is in the range 0..15 (undefined
+ // behavior for out of range values)
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return lookup_table.apply_lookup_16_to(*this);
+ }
+
+ // Convenience overload: builds the 16-entry table from the arguments with
+ // simd8<L>::repeat_16 and forwards to the overload above.
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const {
+ return lookup_16(simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15));
+ }
+
+ // Treats *this as the 16-entry table and uses each lane of original as an
+ // index into it (vqtbl1q_u8).
+ template <typename T>
+ simdutf_really_inline simd8<uint8_t>
+ apply_lookup_16_to(const simd8<T> original) const {
+ return vqtbl1q_u8(*this, simd8<uint8_t>(original));
+ }
+};
+
+// Signed bytes
+template <> struct simd8<int8_t> {
+ int8x16_t value;
+
+ static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
+ return vmovq_n_s8(_value);
+ }
+ static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
+ static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
+ return vld1q_s8(values);
+ }
+
+ // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. UXTL is actually a
+ // USHLL #0, and shifting in NEON is actually quite slow.
+ //
+ // While this needs the registers to be in a specific order, bigger cores can
+ // interleave these with no overhead, and it still performs decently on little
+ // cores.
+ // movi v1.3d, #0
+ // mov v0.16b, value[0]
+ // st2 {v0.16b, v1.16b}, [ptr], #32
+ // mov v0.16b, value[1]
+ // st2 {v0.16b, v1.16b}, [ptr], #32
+ // ...
+ template <endianness big_endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
+ int8x16x2_t pair = match_system(big_endian)
+ ? int8x16x2_t{{this->value, vmovq_n_s8(0)}}
+ : int8x16x2_t{{vmovq_n_s8(0), this->value}};
+ vst2q_s8(reinterpret_cast<int8_t *>(p), pair);
+ }
+
+ // currently unused
+ // Technically this could be done with ST4 like in store_ascii_as_utf16, but
+ // it is very much not worth it, as explicitly mentioned in the ARM Cortex-X1
+ // Core Software Optimization Guide:
+ // 4.18 Complex ASIMD instructions
+ // The bandwidth of [ST4 with element size less than 64b] is limited by
+ // decode constraints and it is advisable to avoid them when high
+ // performing code is desired.
+ // Instead, it is better to use ZIP1+ZIP2 and two ST2.
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
+ const uint16x8_t low =
+ vreinterpretq_u16_s8(vzip1q_s8(this->value, vmovq_n_s8(0)));
+ const uint16x8_t high =
+ vreinterpretq_u16_s8(vzip2q_s8(this->value, vmovq_n_s8(0)));
+ const uint16x8x2_t low_pair{{low, vmovq_n_u16(0)}};
+ vst2q_u16(reinterpret_cast<uint16_t *>(p), low_pair);
+ const uint16x8x2_t high_pair{{high, vmovq_n_u16(0)}};
+ vst2q_u16(reinterpret_cast<uint16_t *>(p + 8), high_pair);
+ }
+
+ // In places where the table can be reused, which is most uses in simdutf, it
+ // is worth it to do 4 table lookups, as there is no direct zero extension
+ // from u8 to u32.
+ simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
+ const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255,
+ 2, 255, 255, 255, 3, 255, 255, 255};
+ const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255,
+ 6, 255, 255, 255, 7, 255, 255, 255};
+ const simd8<uint8_t> tb3{8, 255, 255, 255, 9, 255, 255, 255,
+ 10, 255, 255, 255, 11, 255, 255, 255};
+ const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255,
+ 14, 255, 255, 255, 15, 255, 255, 255};
+
+ // encourage store pairing and interleaving
+ const auto shuf1 = this->apply_lookup_16_to(tb1);
+ const auto shuf2 = this->apply_lookup_16_to(tb2);
+ shuf1.store(reinterpret_cast<int8_t *>(p));
+ shuf2.store(reinterpret_cast<int8_t *>(p + 4));
+
+ const auto shuf3 = this->apply_lookup_16_to(tb3);
+ const auto shuf4 = this->apply_lookup_16_to(tb4);
+ shuf3.store(reinterpret_cast<int8_t *>(p + 8));
+ shuf4.store(reinterpret_cast<int8_t *>(p + 12));
+ }
+ // Conversion from/to SIMD register
+ simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
+ simdutf_really_inline operator const int8x16_t &() const {
+ return this->value;
+ }
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
+ simdutf_really_inline operator const uint8x16_t() const {
+ return vreinterpretq_u8_s8(this->value);
+ }
+#endif
+ simdutf_really_inline operator int8x16_t &() { return this->value; }
+
+ // Zero constructor
+ simdutf_really_inline simd8() : simd8(zero()) {}
+ // Splat constructor
+ simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+ int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+ int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+ : simd8(simdutf_make_int8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15)) {}
+#else
+ simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+ int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+ int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+ : simd8(int8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15}) {}
+#endif
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<int8_t>
+ repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+ int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+ return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Store to array
+ simdutf_really_inline void store(int8_t dst[16]) const {
+ return vst1q_s8(dst, value);
+ }
+ // Explicit conversion to/from unsigned
+ //
+ // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same
+ // type. In theory, we could detect this case with std::is_same and
+ // std::enable_if, but that is relatively ugly and hard to read.
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
+ simdutf_really_inline explicit simd8(const uint8x16_t other)
+ : simd8(vreinterpretq_s8_u8(other)) {}
+#endif
+ simdutf_really_inline operator simd8<uint8_t>() const {
+ return vreinterpretq_u8_s8(this->value);
+ }
+
+ // Bitwise OR, lane by lane.
+ simdutf_really_inline simd8<int8_t>
+ operator|(const simd8<int8_t> other) const {
+ return vorrq_s8(this->value, other.value);
+ }
+ // Bitwise AND, lane by lane.
+ simdutf_really_inline simd8<int8_t>
+ operator&(const simd8<int8_t> other) const {
+ return vandq_s8(this->value, other.value);
+ }
+ // Bitwise XOR, lane by lane.
+ simdutf_really_inline simd8<int8_t>
+ operator^(const simd8<int8_t> other) const {
+ return veorq_s8(this->value, other.value);
+ }
+ // Bit clear (vbicq_s8): this AND NOT other.
+ simdutf_really_inline simd8<int8_t>
+ bit_andnot(const simd8<int8_t> other) const {
+ return vbicq_s8(this->value, other.value);
+ }
+
+ // Arithmetic (8-bit lanes, via vaddq_s8 / vsubq_s8)
+ simdutf_really_inline simd8<int8_t>
+ operator+(const simd8<int8_t> other) const {
+ return vaddq_s8(this->value, other.value);
+ }
+ simdutf_really_inline simd8<int8_t>
+ operator-(const simd8<int8_t> other) const {
+ return vsubq_s8(this->value, other.value);
+ }
+ // Compound forms are expressed through the binary operators above.
+ simdutf_really_inline simd8<int8_t> &operator+=(const simd8<int8_t> other) {
+ simd8<int8_t> &self = *this;
+ self = self + other;
+ return self;
+ }
+ simdutf_really_inline simd8<int8_t> &operator-=(const simd8<int8_t> other) {
+ simd8<int8_t> &self = *this;
+ self = self - other;
+ return self;
+ }
+
+ // Horizontal reductions across the 16 signed lanes.
+ simdutf_really_inline int8_t max_val() const {
+ return vmaxvq_s8(this->value);
+ }
+ simdutf_really_inline int8_t min_val() const {
+ return vminvq_s8(this->value);
+ }
+ // True when no lane is negative as a signed byte (i.e. no high bit set).
+ simdutf_really_inline bool is_ascii() const { return !(min_val() < 0); }
+
+ // Order-sensitive operations (signed, lane by lane)
+ // Lane-wise maximum of this and other.
+ simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+ return vmaxq_s8(this->value, other.value);
+ }
+ // Lane-wise minimum of this and other.
+ simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+ return vminq_s8(this->value, other.value);
+ }
+ // Lane-wise signed comparisons; each returns a simd8<bool> mask.
+ simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+ return vcgtq_s8(this->value, other.value);
+ }
+ simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+ return vcltq_s8(this->value, other.value);
+ }
+ simdutf_really_inline simd8<bool>
+ operator==(const simd8<int8_t> other) const {
+ return vceqq_s8(this->value, other.value);
+ }
+
+ // Shift this chunk by N bytes, pulling the missing bytes from prev_chunk:
+ // vextq_s8 extracts 16 consecutive bytes from the pair (prev_chunk, *this)
+ // starting at byte 16 - N. N is a template parameter (default 1).
+ template <int N = 1>
+ simdutf_really_inline simd8<int8_t>
+ prev(const simd8<int8_t> prev_chunk) const {
+ return vextq_s8(prev_chunk, *this, 16 - N);
+ }
+
+ // Use each lane of this vector as an index into a 16-entry table
+ // (indices are assumed to be below 16).
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return lookup_table.apply_lookup_16_to(*this);
+ }
+ // Convenience overload: the 16 table entries are given one by one and
+ // assembled with simd8<L>::repeat_16 before the lookup.
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const {
+ simd8<L> table = simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15);
+ return this->lookup_16(table);
+ }
+
+ // Treat this vector as the table and look up every lane of `original` in it
+ // with vqtbl1q_s8; `original` is converted to simd8<uint8_t> to serve as the
+ // index vector.
+ template <typename T>
+ simdutf_really_inline simd8<int8_t>
+ apply_lookup_16_to(const simd8<T> original) const {
+ simd8<uint8_t> indices(original);
+ return vqtbl1q_s8(*this, indices);
+ }
+};
+
+// A 64-byte block held as four simd8<T> registers. The comparison helpers
+// return a 64-bit mask produced by to_bitmask().
+template <typename T> struct simd8x64 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+ static_assert(NUM_CHUNKS == 4,
+ "ARM kernel should use four registers per 64-byte block.");
+ simd8<T> chunks[NUM_CHUNKS];
+
+ simd8x64() = delete; // no default constructor allowed
+ simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+ simd8x64<T> &
+ operator=(const simd8<T> other) = delete; // no assignment allowed
+
+ // Assemble the block from four already-loaded registers.
+ simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+ const simd8<T> chunk2, const simd8<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+ // Load four consecutive registers starting at ptr.
+ simdutf_really_inline simd8x64(const T *ptr)
+ : chunks{simd8<T>::load(ptr + 0 * sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 1 * sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}
+
+ // Write the four registers back to consecutive memory starting at ptr.
+ simdutf_really_inline void store(T *ptr) const {
+ for (int i = 0; i < NUM_CHUNKS; i++) {
+ this->chunks[i].store(ptr + sizeof(simd8<T>) * i / sizeof(T));
+ }
+ }
+
+ // OR another block into this one, register by register.
+ simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
+ for (int i = 0; i < NUM_CHUNKS; i++) {
+ this->chunks[i] |= other.chunks[i];
+ }
+ return *this;
+ }
+
+ // OR the four registers together into a single register.
+ simdutf_really_inline simd8<T> reduce_or() const {
+ const auto lower = this->chunks[0] | this->chunks[1];
+ const auto upper = this->chunks[2] | this->chunks[3];
+ return lower | upper;
+ }
+
+ // The block is ASCII when the OR of all its registers is ASCII.
+ simdutf_really_inline bool is_ascii() const {
+ return this->reduce_or().is_ascii();
+ }
+
+ // Forward each register to simd8<T>::store_ascii_as_utf16<endian>; the
+ // output pointer advances by sizeof(simd8<T>) code units per register.
+ template <endianness endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ for (int i = 0; i < NUM_CHUNKS; i++) {
+ this->chunks[i].template store_ascii_as_utf16<endian>(
+ ptr + sizeof(simd8<T>) * i);
+ }
+ }
+
+ // Forward each register to simd8<T>::store_ascii_as_utf32_tbl; the output
+ // pointer advances by sizeof(simd8<T>) code units per register.
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+ for (int i = 0; i < NUM_CHUNKS; i++) {
+ this->chunks[i].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * i);
+ }
+ }
+
+ // Collapse the 64 bytes into a 64-bit mask. Every byte is ANDed with a
+ // single-bit constant (0x01 .. 0x80, repeating), then three rounds of
+ // pairwise addition fold 64 bytes down to 8.
+ simdutf_really_inline uint64_t to_bitmask() const {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint8x16_t bit_mask =
+ simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+ const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
+ // Round 1: chunks 0/1 and chunks 2/3, adjacent bytes added together.
+ const uint8x16_t pair01 =
+ vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask),
+ vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
+ const uint8x16_t pair23 =
+ vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask),
+ vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
+ // Rounds 2 and 3: keep folding until the low 8 bytes hold the whole mask.
+ const uint8x16_t folded = vpaddq_u8(pair01, pair23);
+ const uint8x16_t packed = vpaddq_u8(folded, folded);
+ return vgetq_lane_u64(vreinterpretq_u64_u8(packed), 0);
+ }
+
+ // Bitmask of the lanes equal to m.
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd8<T> needle = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] == needle, this->chunks[1] == needle,
+ this->chunks[2] == needle, this->chunks[3] == needle)
+ .to_bitmask();
+ }
+
+ // Bitmask of the lanes less than or equal to m.
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd8<T> bound = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] <= bound, this->chunks[1] <= bound,
+ this->chunks[2] <= bound, this->chunks[3] <= bound)
+ .to_bitmask();
+ }
+
+ // Bitmask of the lanes within [low, high], both bounds included.
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd8<T> lower = simd8<T>::splat(low);
+ const simd8<T> upper = simd8<T>::splat(high);
+
+ return simd8x64<bool>(
+ (this->chunks[0] <= upper) & (this->chunks[0] >= lower),
+ (this->chunks[1] <= upper) & (this->chunks[1] >= lower),
+ (this->chunks[2] <= upper) & (this->chunks[2] >= lower),
+ (this->chunks[3] <= upper) & (this->chunks[3] >= lower))
+ .to_bitmask();
+ }
+ // Bitmask of the lanes strictly outside [low, high].
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd8<T> lower = simd8<T>::splat(low);
+ const simd8<T> upper = simd8<T>::splat(high);
+ return simd8x64<bool>(
+ (this->chunks[0] > upper) | (this->chunks[0] < lower),
+ (this->chunks[1] > upper) | (this->chunks[1] < lower),
+ (this->chunks[2] > upper) | (this->chunks[2] < lower),
+ (this->chunks[3] > upper) | (this->chunks[3] < lower))
+ .to_bitmask();
+ }
+ // Bitmask of the lanes strictly less than m.
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd8<T> bound = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] < bound, this->chunks[1] < bound,
+ this->chunks[2] < bound, this->chunks[3] < bound)
+ .to_bitmask();
+ }
+ // Bitmask of the lanes strictly greater than m.
+ simdutf_really_inline uint64_t gt(const T m) const {
+ const simd8<T> bound = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] > bound, this->chunks[1] > bound,
+ this->chunks[2] > bound, this->chunks[3] > bound)
+ .to_bitmask();
+ }
+ // Bitmask of the lanes greater than or equal to m.
+ simdutf_really_inline uint64_t gteq(const T m) const {
+ const simd8<T> bound = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] >= bound, this->chunks[1] >= bound,
+ this->chunks[2] >= bound, this->chunks[3] >= bound)
+ .to_bitmask();
+ }
+ // Same as gteq, but every register is first viewed as uint8x16_t so the
+ // comparison is done on simd8<uint8_t>.
+ simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+ const simd8<uint8_t> bound = simd8<uint8_t>::splat(m);
+ return simd8x64<bool>(simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= bound,
+ simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= bound,
+ simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= bound,
+ simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= bound)
+ .to_bitmask();
+ }
+}; // struct simd8x64<T>
+#include "simdutf/arm64/simd16-inl.h"
+} // namespace simd
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_ARM64_SIMD_H
diff --git a/contrib/simdutf/src/simdutf/arm64/simd16-inl.h b/contrib/simdutf/src/simdutf/arm64/simd16-inl.h
new file mode 100644
index 000000000..d21c445b1
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/arm64/simd16-inl.h
@@ -0,0 +1,407 @@
+template <typename T> struct simd16;
+
+// Common base of every simd16 flavour: wraps one uint16x8_t register (eight
+// 16-bit lanes) and provides the bitwise operators shared by all of them.
+// T is the lane type of the derived simd16<T>; Mask is the type returned by
+// operator==.
+template <typename T, typename Mask = simd16<bool>> struct base_u16 {
+ uint16x8_t value;
+ static const int SIZE = sizeof(value);
+
+ // Conversion from/to SIMD register
+ simdutf_really_inline base_u16() = default;
+ simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
+ simdutf_really_inline operator const uint16x8_t &() const {
+ return this->value;
+ }
+ simdutf_really_inline operator uint16x8_t &() { return this->value; }
+ // Bit operations
+ simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
+ return vorrq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
+ return vandq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<T> operator^(const simd16<T> other) const {
+ return veorq_u16(*this, other);
+ }
+ // Bit clear (vbicq_u16): this AND NOT other.
+ simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const {
+ return vbicq_u16(*this, other);
+ }
+ // Bitwise NOT of all 16 bits of every lane. XOR-ing with a splatted 0xFF
+ // would leave the high byte of numeric lanes untouched, so use vmvnq_u16.
+ simdutf_really_inline simd16<T> operator~() const { return vmvnq_u16(*this); }
+ // Compound assignments operate on the derived simd16<T> object.
+ simdutf_really_inline simd16<T> &operator|=(const simd16<T> other) {
+ auto this_cast = static_cast<simd16<T> *>(this);
+ *this_cast = *this_cast | other;
+ return *this_cast;
+ }
+ simdutf_really_inline simd16<T> &operator&=(const simd16<T> other) {
+ auto this_cast = static_cast<simd16<T> *>(this);
+ *this_cast = *this_cast & other;
+ return *this_cast;
+ }
+ simdutf_really_inline simd16<T> &operator^=(const simd16<T> other) {
+ auto this_cast = static_cast<simd16<T> *>(this);
+ *this_cast = *this_cast ^ other;
+ return *this_cast;
+ }
+
+ // Lane-wise equality; returns a mask.
+ friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
+ const simd16<T> rhs) {
+ return vceqq_u16(lhs, rhs);
+ }
+
+ // Shift this chunk by N lanes, pulling the missing lanes from prev_chunk.
+ // The register has eight 16-bit lanes, hence vextq_u16 and 8 - N.
+ template <int N = 1>
+ simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+ return vextq_u16(prev_chunk, *this, 8 - N);
+ }
+};
+
+// Adds the bitmask typedefs, a pointer-loading constructor and prev() on top
+// of base_u16<T>.
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base_u16<T> {
+ typedef uint16_t bitmask_t;
+ typedef uint32_t bitmask2_t;
+
+ simdutf_really_inline base16() : base_u16<T>() {}
+ simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
+ // Load eight 16-bit lanes from ptr (passed straight to vld1q_u16).
+ template <typename Pointer>
+ simdutf_really_inline base16(const Pointer *ptr) : base16(vld1q_u16(ptr)) {}
+
+ static const int SIZE = sizeof(base_u16<T>::value);
+
+ // Shift this chunk by N lanes, pulling the missing lanes from prev_chunk.
+ // The register has eight 16-bit lanes, hence vextq_u16 and 8 - N.
+ template <int N = 1>
+ simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+ return vextq_u16(prev_chunk, *this, 8 - N);
+ }
+};
+
+// SIMD mask type with 16-bit lanes (returned by things like eq and gt)
+template <> struct simd16<bool> : base16<bool> {
+ // Every lane becomes 0xFFFF when _value is true and 0 otherwise.
+ static simdutf_really_inline simd16<bool> splat(bool _value) {
+ return vmovq_n_u16(_value ? uint16_t(0xFFFF) : uint16_t(0));
+ }
+
+ simdutf_really_inline simd16() : base16<bool>() {}
+ // Wrap an existing register.
+ simdutf_really_inline simd16(const uint16x8_t _value)
+ : base16<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+};
+
+// Operations shared by the signed and unsigned 16-bit lane types: splat,
+// zero, load/store, bitwise NOT and wrapping addition/subtraction.
+template <typename T> struct base16_numeric : base16<T> {
+ // Broadcast _value to all eight lanes.
+ static simdutf_really_inline simd16<T> splat(T _value) {
+ return vmovq_n_u16(_value);
+ }
+ static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
+ // Load eight lanes from values.
+ static simdutf_really_inline simd16<T> load(const T values[8]) {
+ return vld1q_u16(reinterpret_cast<const uint16_t *>(values));
+ }
+
+ simdutf_really_inline base16_numeric() : base16<T>() {}
+ simdutf_really_inline base16_numeric(const uint16x8_t _value)
+ : base16<T>(_value) {}
+
+ // Store to array. The pointer is cast to uint16_t * (mirroring load) so
+ // that lane types other than uint16_t are accepted by vst1q_u16.
+ simdutf_really_inline void store(T dst[8]) const {
+ vst1q_u16(reinterpret_cast<uint16_t *>(dst), *this);
+ }
+
+ // Override to distinguish from bool version. vmvnq_u16 flips all 16 bits of
+ // every lane; XOR-ing with a splatted 0xFF would only flip the low byte.
+ simdutf_really_inline simd16<T> operator~() const { return vmvnq_u16(*this); }
+
+ // Addition/subtraction are the same for signed and unsigned; they must use
+ // the 16-bit intrinsics so carries propagate across the whole lane.
+ simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+ return vaddq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+ return vsubq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+ *this = *this + other;
+ return *static_cast<simd16<T> *>(this);
+ }
+ simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+ *this = *this - other;
+ return *static_cast<simd16<T> *>(this);
+ }
+};
+
+// Signed code units. The register is stored as uint16x8_t and reinterpreted
+// as int16x8_t wherever a signed operation is needed.
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+ simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
+ simdutf_really_inline simd16(const uint16x8_t _value)
+ : base16_numeric<int16_t>(_value) {}
+#endif
+ simdutf_really_inline simd16(const int16x8_t _value)
+ : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
+
+ // Splat constructor
+ simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+ // Defined out of line, once simd16<uint16_t> is complete.
+ simdutf_really_inline operator simd16<uint16_t>() const;
+ simdutf_really_inline operator const uint16x8_t &() const {
+ return this->value;
+ }
+ simdutf_really_inline operator const int16x8_t() const {
+ return vreinterpretq_s16_u16(this->value);
+ }
+
+ // Horizontal signed maximum / minimum across the eight lanes.
+ simdutf_really_inline int16_t max_val() const {
+ return vmaxvq_s16(vreinterpretq_s16_u16(this->value));
+ }
+ simdutf_really_inline int16_t min_val() const {
+ return vminvq_s16(vreinterpretq_s16_u16(this->value));
+ }
+ // Order-sensitive comparisons
+ // Lane-wise signed maximum of this and other.
+ simdutf_really_inline simd16<int16_t>
+ max_val(const simd16<int16_t> other) const {
+ return vmaxq_s16(vreinterpretq_s16_u16(this->value),
+ vreinterpretq_s16_u16(other.value));
+ }
+ // Lane-wise signed minimum of this and other (vminq_s16, not vmaxq_s16).
+ simdutf_really_inline simd16<int16_t>
+ min_val(const simd16<int16_t> other) const {
+ return vminq_s16(vreinterpretq_s16_u16(this->value),
+ vreinterpretq_s16_u16(other.value));
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<int16_t> other) const {
+ return vcgtq_s16(vreinterpretq_s16_u16(this->value),
+ vreinterpretq_s16_u16(other.value));
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<int16_t> other) const {
+ return vcltq_s16(vreinterpretq_s16_u16(this->value),
+ vreinterpretq_s16_u16(other.value));
+ }
+};
+
+// Unsigned code units
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+ simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+ simdutf_really_inline simd16(const uint16x8_t _value)
+ : base16_numeric<uint16_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
+
+ // Horizontal unsigned maximum / minimum across the eight lanes. The result
+ // is returned as uint16_t so that values of 0x8000 and above are not turned
+ // into negative numbers.
+ simdutf_really_inline uint16_t max_val() const { return vmaxvq_u16(*this); }
+ simdutf_really_inline uint16_t min_val() const { return vminvq_u16(*this); }
+ // Saturated math
+ simdutf_really_inline simd16<uint16_t>
+ saturating_add(const simd16<uint16_t> other) const {
+ return vqaddq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ saturating_sub(const simd16<uint16_t> other) const {
+ return vqsubq_u16(*this, other);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd16<uint16_t>
+ max_val(const simd16<uint16_t> other) const {
+ return vmaxq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ min_val(const simd16<uint16_t> other) const {
+ return vminq_u16(*this, other);
+ }
+ // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ gt_bits(const simd16<uint16_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ lt_bits(const simd16<uint16_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ // Lane-wise unsigned comparisons; each returns a simd16<bool> mask.
+ simdutf_really_inline simd16<bool>
+ operator<=(const simd16<uint16_t> other) const {
+ return vcleq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator>=(const simd16<uint16_t> other) const {
+ return vcgeq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<uint16_t> other) const {
+ return vcgtq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<uint16_t> other) const {
+ return vcltq_u16(*this, other);
+ }
+
+ // Bit-specific operations
+ // Mask of the lanes that are exactly zero.
+ simdutf_really_inline simd16<bool> bits_not_set() const {
+ return *this == uint16_t(0);
+ }
+ // Shift every lane right / left by the compile-time constant N.
+ template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+ return simd16<uint16_t>(vshrq_n_u16(*this, N));
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+ return simd16<uint16_t>(vshlq_n_u16(*this, N));
+ }
+
+ // logical operations
+ simdutf_really_inline simd16<uint16_t>
+ operator|(const simd16<uint16_t> other) const {
+ return vorrq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ operator&(const simd16<uint16_t> other) const {
+ return vandq_u16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ operator^(const simd16<uint16_t> other) const {
+ return veorq_u16(*this, other);
+ }
+
+ // Pack with the unsigned saturation of two uint16_t code units into single
+ // uint8_t vector
+ static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+ const simd16<uint16_t> &v1) {
+ return vqmovn_high_u16(vqmovn_u16(v0), v1);
+ }
+
+ // Change the endianness
+ simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+ return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this)));
+ }
+};
+// Signed -> unsigned view: the underlying uint16x8_t register is handed over
+// unchanged.
+simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
+ return simd16<uint16_t>(this->value);
+}
+
+// A 64-byte block held as four simd16<T> registers, i.e. 32 sixteen-bit
+// lanes. The comparison helpers return a 64-bit mask produced by
+// to_bitmask().
+template <typename T> struct simd16x32 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+ static_assert(NUM_CHUNKS == 4,
+ "ARM kernel should use four registers per 64-byte block.");
+ simd16<T> chunks[NUM_CHUNKS];
+
+ simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
+ simd16x32<T> &
+ operator=(const simd16<T> other) = delete; // no assignment allowed
+ simd16x32() = delete; // no default constructor allowed
+
+ // Assemble the block from four already-loaded registers.
+ simdutf_really_inline
+ simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
+ const simd16<T> chunk2, const simd16<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+ // Load four consecutive registers starting at ptr; the pointer advances by
+ // sizeof(simd16<T>) / sizeof(T) elements per register.
+ simdutf_really_inline simd16x32(const T *ptr)
+ : chunks{simd16<T>::load(ptr),
+ simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
+ simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
+ simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}
+
+ // Write the four registers back to consecutive memory starting at ptr.
+ simdutf_really_inline void store(T *ptr) const {
+ this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+ this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+ this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
+ }
+
+ // OR the four registers together into a single register.
+ simdutf_really_inline simd16<T> reduce_or() const {
+ return (this->chunks[0] | this->chunks[1]) |
+ (this->chunks[2] | this->chunks[3]);
+ }
+
+ // NOTE(review): relies on simd16<T>::is_ascii(), which the simd16
+ // specializations in this file do not appear to define - confirm before use.
+ simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }
+
+ // Forward each register to simd16<T>::store_ascii_as_utf16. ptr counts
+ // 16-bit code units, so it advances by the number of lanes per register
+ // (sizeof(simd16<T>) / sizeof(T)), matching store() above, rather than by
+ // the register size in bytes.
+ // NOTE(review): simd16<T>::store_ascii_as_utf16 is not defined by the simd16
+ // specializations in this file - confirm before use.
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ this->chunks[0].store_ascii_as_utf16(ptr +
+ sizeof(simd16<T>) * 0 / sizeof(T));
+ this->chunks[1].store_ascii_as_utf16(ptr +
+ sizeof(simd16<T>) * 1 / sizeof(T));
+ this->chunks[2].store_ascii_as_utf16(ptr +
+ sizeof(simd16<T>) * 2 / sizeof(T));
+ this->chunks[3].store_ascii_as_utf16(ptr +
+ sizeof(simd16<T>) * 3 / sizeof(T));
+ }
+
+ // Collapse the 64 bytes into a 64-bit mask. The single-bit constants are
+ // applied per byte, so every 16-bit lane contributes two bits to the result.
+ simdutf_really_inline uint64_t to_bitmask() const {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ const uint8x16_t bit_mask =
+ simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+ const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
+ // Add each of the elements next to each other, successively, to stuff each
+ // 8 byte mask into one.
+ uint8x16_t sum0 = vpaddq_u8(
+ vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)),
+ vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
+ uint8x16_t sum1 = vpaddq_u8(
+ vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)),
+ vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
+ sum0 = vpaddq_u8(sum0, sum1);
+ sum0 = vpaddq_u8(sum0, sum0);
+ return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+ }
+
+ // Swap the two bytes of every lane, in place.
+ simdutf_really_inline void swap_bytes() {
+ this->chunks[0] = this->chunks[0].swap_bytes();
+ this->chunks[1] = this->chunks[1].swap_bytes();
+ this->chunks[2] = this->chunks[2].swap_bytes();
+ this->chunks[3] = this->chunks[3].swap_bytes();
+ }
+
+ // Bitmask of the lanes equal to m.
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+ this->chunks[2] == mask, this->chunks[3] == mask)
+ .to_bitmask();
+ }
+
+ // Bitmask of the lanes less than or equal to m.
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+ this->chunks[2] <= mask, this->chunks[3] <= mask)
+ .to_bitmask();
+ }
+
+ // Bitmask of the lanes within [low, high], both bounds included.
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+
+ return simd16x32<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+ (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+ (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+ .to_bitmask();
+ }
+ // Bitmask of the lanes strictly outside [low, high].
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+ return simd16x32<bool>(
+ (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+ (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+ (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+ (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+ .to_bitmask();
+ }
+ // Bitmask of the lanes strictly less than m.
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+ this->chunks[2] < mask, this->chunks[3] < mask)
+ .to_bitmask();
+ }
+
+}; // struct simd16x32<T>
+// Specialization for uint16_t lanes: the per-register "outside [low, high]"
+// masks are converted to simd16<uint16_t> and collected in a
+// simd16x32<uint16_t>, whose to_bitmask() produces the result.
+template <>
+simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(
+ const uint16_t low, const uint16_t high) const {
+ const simd16<uint16_t> lower = simd16<uint16_t>::splat(low);
+ const simd16<uint16_t> upper = simd16<uint16_t>::splat(high);
+ const simd16<uint16_t> outside0(
+ (this->chunks[0] > upper) | (this->chunks[0] < lower));
+ const simd16<uint16_t> outside1(
+ (this->chunks[1] > upper) | (this->chunks[1] < lower));
+ const simd16<uint16_t> outside2(
+ (this->chunks[2] > upper) | (this->chunks[2] < lower));
+ const simd16<uint16_t> outside3(
+ (this->chunks[3] > upper) | (this->chunks[3] < lower));
+ simd16x32<uint16_t> masks(outside0, outside1, outside2, outside3);
+ return masks.to_bitmask();
+}
diff --git a/contrib/simdutf/src/simdutf/fallback.h b/contrib/simdutf/src/simdutf/fallback.h
new file mode 100644
index 000000000..8a9e365a1
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/fallback.h
@@ -0,0 +1,42 @@
+#ifndef SIMDUTF_FALLBACK_H
+#define SIMDUTF_FALLBACK_H
+
+#include "simdutf/portability.h"
+
+// Note that fallback.h is always imported last.
+
+// The fallback implementation is enabled by default. It is disabled only when
+// one of the SIMDUTF_CAN_ALWAYS_RUN_* flags listed below is nonzero. Defining
+// SIMDUTF_IMPLEMENTATION_FALLBACK beforehand overrides this default.
+#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
+ #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || \
+ SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || \
+ SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV || \
+ SIMDUTF_CAN_ALWAYS_RUN_LSX || SIMDUTF_CAN_ALWAYS_RUN_LASX
+ #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
+ #else
+ #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
+ #endif
+#endif
+
+// Whenever the fallback is compiled in, it can always run.
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
+
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+
+namespace simdutf {
+/**
+ * Fallback implementation (runs on any machine).
+ */
+namespace fallback {} // namespace fallback
+} // namespace simdutf
+
+ #include "simdutf/fallback/implementation.h"
+
+ // begin.h defines SIMDUTF_IMPLEMENTATION as `fallback`; end.h undefines it
+ // again, so the headers in between are compiled with that name.
+ #include "simdutf/fallback/begin.h"
+
+ // Declarations
+ #include "simdutf/fallback/bitmanipulation.h"
+
+ #include "simdutf/fallback/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
+#endif // SIMDUTF_FALLBACK_H
diff --git a/contrib/simdutf/src/simdutf/fallback/begin.h b/contrib/simdutf/src/simdutf/fallback/begin.h
new file mode 100644
index 000000000..d300ce051
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/fallback/begin.h
@@ -0,0 +1 @@
+#define SIMDUTF_IMPLEMENTATION fallback
diff --git a/contrib/simdutf/src/simdutf/fallback/bitmanipulation.h b/contrib/simdutf/src/simdutf/fallback/bitmanipulation.h
new file mode 100644
index 000000000..f3777f5ca
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/fallback/bitmanipulation.h
@@ -0,0 +1,13 @@
+#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
+#define SIMDUTF_FALLBACK_BITMANIPULATION_H
+
+#include "simdutf.h"
+#include <limits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/fallback/end.h b/contrib/simdutf/src/simdutf/fallback/end.h
new file mode 100644
index 000000000..58fd810d4
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/fallback/end.h
@@ -0,0 +1 @@
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/fallback/implementation.h b/contrib/simdutf/src/simdutf/fallback/implementation.h
new file mode 100644
index 000000000..b89128a95
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/fallback/implementation.h
@@ -0,0 +1,217 @@
+#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
+#define SIMDUTF_FALLBACK_IMPLEMENTATION_H
+
+#include "simdutf/implementation.h"
+
+namespace simdutf {
+namespace fallback {
+
+namespace {
+using namespace simdutf;
+}
+
+class implementation final : public simdutf::implementation {
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("fallback", "Generic fallback implementation",
+ 0) {}
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+};
+} // namespace fallback
+} // namespace simdutf
+
+#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/haswell.h b/contrib/simdutf/src/simdutf/haswell.h
new file mode 100644
index 000000000..369e36bd6
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell.h
@@ -0,0 +1,63 @@
+#ifndef SIMDUTF_HASWELL_H
+#define SIMDUTF_HASWELL_H
+
+#ifdef SIMDUTF_WESTMERE_H
+ #error "haswell.h must be included before westmere.h"
+#endif
+#ifdef SIMDUTF_FALLBACK_H
+ #error "haswell.h must be included before fallback.h"
+#endif
+
+#include "simdutf/portability.h"
+
+// Default Haswell to on if this is x86-64. Even if we are not compiled for it,
+// it could be selected at runtime.
+#ifndef SIMDUTF_IMPLEMENTATION_HASWELL
+ //
+ // You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
+ // because we want to rely on *runtime dispatch*.
+ //
+ #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+ #define SIMDUTF_IMPLEMENTATION_HASWELL 0
+ #else
+ #define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
+ #endif
+
+#endif
+// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
+// https://github.com/simdutf/simdutf/issues/1247
+#if ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
+ #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 1
+#else
+ #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 0
+#endif
+
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+
+ #define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
+
+namespace simdutf {
+/**
+ * Implementation for Haswell (Intel AVX2).
+ */
+namespace haswell {} // namespace haswell
+} // namespace simdutf
+
+ //
+ // These two need to be included outside SIMDUTF_TARGET_REGION
+ //
+ #include "simdutf/haswell/implementation.h"
+ #include "simdutf/haswell/intrinsics.h"
+
+ //
+ // The rest need to be inside the region
+ //
+ #include "simdutf/haswell/begin.h"
+ // Declarations
+ #include "simdutf/haswell/bitmanipulation.h"
+ #include "simdutf/haswell/simd.h"
+
+ #include "simdutf/haswell/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_HASWELL
+#endif // SIMDUTF_HASWELL_H
diff --git a/contrib/simdutf/src/simdutf/haswell/begin.h b/contrib/simdutf/src/simdutf/haswell/begin.h
new file mode 100644
index 000000000..70d67135b
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/begin.h
@@ -0,0 +1,14 @@
+#define SIMDUTF_IMPLEMENTATION haswell
+
+#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+// nothing needed.
+#else
+SIMDUTF_TARGET_HASWELL
+#endif
+
+#if SIMDUTF_GCC11ORMORE // workaround for
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+// clang-format off
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+// clang-format on
+#endif // end of workaround
diff --git a/contrib/simdutf/src/simdutf/haswell/bitmanipulation.h b/contrib/simdutf/src/simdutf/haswell/bitmanipulation.h
new file mode 100644
index 000000000..3336502f9
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/bitmanipulation.h
@@ -0,0 +1,33 @@
+#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
+#define SIMDUTF_HASWELL_BITMANIPULATION_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
+ // note: we do not support legacy 32-bit Windows
+ return __popcnt64(input_num); // Visual Studio wants two underscores
+}
+#else
+simdutf_really_inline long long int count_ones(uint64_t input_num) {
+ return _popcnt64(input_num);
+}
+#endif
+
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_inline int trailing_zeroes(uint64_t input_num) {
+ #if SIMDUTF_REGULAR_VISUAL_STUDIO
+ return (int)_tzcnt_u64(input_num);
+ #else // SIMDUTF_REGULAR_VISUAL_STUDIO
+ return __builtin_ctzll(input_num);
+ #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_HASWELL_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/haswell/end.h b/contrib/simdutf/src/simdutf/haswell/end.h
new file mode 100644
index 000000000..22f3e1041
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/end.h
@@ -0,0 +1,12 @@
+#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+// nothing needed.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif
+
+#undef SIMDUTF_IMPLEMENTATION
+
+#if SIMDUTF_GCC11ORMORE // workaround for
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+SIMDUTF_POP_DISABLE_WARNINGS
+#endif // end of workaround
diff --git a/contrib/simdutf/src/simdutf/haswell/implementation.h b/contrib/simdutf/src/simdutf/haswell/implementation.h
new file mode 100644
index 000000000..5152555cf
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/implementation.h
@@ -0,0 +1,226 @@
+#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
+#define SIMDUTF_HASWELL_IMPLEMENTATION_H
+
+#include "simdutf/implementation.h"
+
+// The constructor may be executed on any host, so we take care not to use
+// SIMDUTF_TARGET_REGION
+namespace simdutf {
+namespace haswell {
+
+using namespace simdutf;
+
+class implementation final : public simdutf::implementation {
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("haswell", "Intel/AMD AVX2",
+ internal::instruction_set::AVX2 |
+ internal::instruction_set::BMI1 |
+ internal::instruction_set::BMI2) {}
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused virtual size_t
+ maximal_binary_length_from_base64(const char *input,
+ size_t length) const noexcept;
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual size_t
+ maximal_binary_length_from_base64(const char16_t *input,
+ size_t length) const noexcept;
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual size_t
+ base64_length_from_binary(size_t length,
+ base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+};
+
+} // namespace haswell
+} // namespace simdutf
+
+#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/haswell/intrinsics.h b/contrib/simdutf/src/simdutf/haswell/intrinsics.h
new file mode 100644
index 000000000..af38b0b28
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/intrinsics.h
@@ -0,0 +1,62 @@
+#ifndef SIMDUTF_HASWELL_INTRINSICS_H
+#define SIMDUTF_HASWELL_INTRINSICS_H
+
+#include "simdutf.h"
+
+#ifdef SIMDUTF_VISUAL_STUDIO
+ // under clang within visual studio, this will include <x86intrin.h>
+ #include <intrin.h> // visual studio or clang
+#else
+
+ #if SIMDUTF_GCC11ORMORE
+// We should not get warnings while including <x86intrin.h> yet we do
+// under some versions of GCC.
+// If the x86intrin.h header has uninitialized values that are problematic,
+// it is a GCC issue, we want to ignore these warnings.
+SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
+ #endif
+
+ #include <x86intrin.h> // elsewhere
+
+ #if SIMDUTF_GCC11ORMORE
+// cancels the suppression of the -Wuninitialized
+SIMDUTF_POP_DISABLE_WARNINGS
+ #endif
+
+#endif // SIMDUTF_VISUAL_STUDIO
+
+#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
+ /**
+ * You are not supposed, normally, to include these
+ * headers directly. Instead you should either include intrin.h
+ * or x86intrin.h. However, when compiling with clang
+ * under Windows (i.e., when _MSC_VER is set), these headers
+ * only get included *if* the corresponding features are detected
+ * from macros:
+ * e.g., if __AVX2__ is set... in turn, we normally set these
+ * macros by compiling against the corresponding architecture
+ * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
+ * software with these advanced instructions. In simdutf, we
+ * want to compile the whole program for a generic target,
+ * and only target our specific kernels. As a workaround,
+ * we directly include the needed headers. These headers would
+ * normally guard against such usage, but we carefully included
+ * <x86intrin.h> (or <intrin.h>) before, so the headers
+ * are fooled.
+ */
+ #include <bmiintrin.h> // for _blsr_u64
+ #include <lzcntintrin.h> // for __lzcnt64
+ #include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
+ #include <smmintrin.h>
+ #include <tmmintrin.h>
+ #include <avxintrin.h>
+ #include <avx2intrin.h>
+ // unfortunately, we may not get _blsr_u64, but, thankfully, clang
+ // has it as a macro.
+ #ifndef _blsr_u64
+ // we roll our own: clears the lowest set bit; n is parenthesized so expression arguments expand safely
+ #define _blsr_u64(n) (((n) - 1) & (n))
+ #endif // _blsr_u64
+#endif // SIMDUTF_CLANG_VISUAL_STUDIO
+
+#endif // SIMDUTF_HASWELL_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/haswell/simd.h b/contrib/simdutf/src/simdutf/haswell/simd.h
new file mode 100644
index 000000000..4a1c807cb
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/simd.h
@@ -0,0 +1,502 @@
+#ifndef SIMDUTF_HASWELL_SIMD_H
+#define SIMDUTF_HASWELL_SIMD_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace simd {
+
+// Common base of the 256-bit wrappers; Child is the derived vector type.
+template <typename Child> struct base {
+ __m256i value;
+
+ // Zero constructor
+ simdutf_really_inline base() : value{__m256i()} {}
+
+ // Conversion from SIMD register
+ simdutf_really_inline base(const __m256i _value) : value(_value) {}
+ // Conversion to SIMD register
+ simdutf_really_inline operator const __m256i &() const { return this->value; }
+ simdutf_really_inline operator __m256i &() { return this->value; }
+ template <endianness big_endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
+ __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1));
+ if (big_endian) {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ first = _mm256_shuffle_epi8(first, swap);
+ second = _mm256_shuffle_epi8(second, swap);
+ }
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
+ }
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
+ _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 8),
+ _mm256_cvtepu8_epi32(_mm256_castsi256_si128(
+ _mm256_srli_si256(*this, 8))));
+ _mm256_storeu_si256(
+ reinterpret_cast<__m256i *>(ptr + 16),
+ _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1)));
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24),
+ _mm256_cvtepu8_epi32(_mm_srli_si128(
+ _mm256_extractf128_si256(*this, 1), 8)));
+ }
+ // Bit operations
+ simdutf_really_inline Child operator|(const Child other) const {
+ return _mm256_or_si256(*this, other);
+ }
+ simdutf_really_inline Child operator&(const Child other) const {
+ return _mm256_and_si256(*this, other);
+ }
+ simdutf_really_inline Child operator^(const Child other) const {
+ return _mm256_xor_si256(*this, other);
+ }
+ simdutf_really_inline Child bit_andnot(const Child other) const {
+ return _mm256_andnot_si256(other, *this);
+ }
+ simdutf_really_inline Child &operator|=(const Child other) {
+ auto this_cast = static_cast<Child *>(this);
+ *this_cast = *this_cast | other;
+ return *this_cast;
+ }
+ simdutf_really_inline Child &operator&=(const Child other) {
+ auto this_cast = static_cast<Child *>(this);
+ *this_cast = *this_cast & other;
+ return *this_cast;
+ }
+ simdutf_really_inline Child &operator^=(const Child other) {
+ auto this_cast = static_cast<Child *>(this);
+ *this_cast = *this_cast ^ other;
+ return *this_cast;
+ }
+};
+
+// Forward-declared so they can be used by splat and friends.
+template <typename T> struct simd8;
+
+template <typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
+ typedef uint32_t bitmask_t;
+ typedef uint64_t bitmask2_t;
+
+ simdutf_really_inline base8() : base<simd8<T>>() {}
+ simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
+ simdutf_really_inline T first() const {
+ return _mm256_extract_epi8(*this, 0);
+ }
+ simdutf_really_inline T last() const {
+ return _mm256_extract_epi8(*this, 31);
+ }
+ friend simdutf_always_inline Mask operator==(const simd8<T> lhs,
+ const simd8<T> rhs) {
+ return _mm256_cmpeq_epi8(lhs, rhs);
+ }
+
+  // Size in bytes of the wrapped vector. Measured on base<simd8<T>>, the
+  // actual base class of this struct, rather than on the unrelated
+  // specialization base<T>.
+  static const int SIZE = sizeof(base<simd8<T>>::value);
+
+ template <int N = 1>
+ simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+ return _mm256_alignr_epi8(
+ *this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
+ }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+template <> struct simd8<bool> : base8<bool> {
+ static simdutf_really_inline simd8<bool> splat(bool _value) {
+ return _mm256_set1_epi8(uint8_t(-(!!_value)));
+ }
+
+ simdutf_really_inline simd8() : base8() {}
+ simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
+
+ simdutf_really_inline uint32_t to_bitmask() const {
+ return uint32_t(_mm256_movemask_epi8(*this));
+ }
+ simdutf_really_inline bool any() const {
+ return !_mm256_testz_si256(*this, *this);
+ }
+ simdutf_really_inline bool none() const {
+ return _mm256_testz_si256(*this, *this);
+ }
+ simdutf_really_inline bool all() const {
+ return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF;
+ }
+ simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
+};
+
+template <typename T> struct base8_numeric : base8<T> {
+ static simdutf_really_inline simd8<T> splat(T _value) {
+ return _mm256_set1_epi8(_value);
+ }
+ static simdutf_really_inline simd8<T> zero() {
+ return _mm256_setzero_si256();
+ }
+ static simdutf_really_inline simd8<T> load(const T values[32]) {
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+ }
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+ T v5, T v6, T v7, T v8, T v9,
+ T v10, T v11, T v12, T v13,
+ T v14, T v15) {
+ return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+ v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15);
+ }
+
+ simdutf_really_inline base8_numeric() : base8<T>() {}
+ simdutf_really_inline base8_numeric(const __m256i _value)
+ : base8<T>(_value) {}
+
+ // Store to array
+ simdutf_really_inline void store(T dst[32]) const {
+ return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
+ }
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
+ return _mm256_add_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
+ return _mm256_sub_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
+ *this = *this + other;
+ return *static_cast<simd8<T> *>(this);
+ }
+ simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
+ *this = *this - other;
+ return *static_cast<simd8<T> *>(this);
+ }
+
+ // Override to distinguish from bool version
+ simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+
+ // Perform a lookup assuming the value is between 0 and 15 (undefined behavior
+ // for out of range values)
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return _mm256_shuffle_epi8(lookup_table, *this);
+ }
+
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const {
+ return lookup_16(simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15));
+ }
+};
+
+// Signed bytes
+template <> struct simd8<int8_t> : base8_numeric<int8_t> {
+ simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+ simdutf_really_inline simd8(const __m256i _value)
+ : base8_numeric<int8_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
+ simdutf_really_inline operator simd8<uint8_t>() const;
+ // Member-by-member initialization
+ simdutf_really_inline
+ simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+ int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16, int8_t v17,
+ int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+ int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29,
+ int8_t v30, int8_t v31)
+ : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+ v22, v23, v24, v25, v26, v27, v28, v29, v30,
+ v31)) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<int8_t>
+ repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+ int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+ return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15);
+ }
+ simdutf_really_inline bool is_ascii() const {
+ return _mm256_movemask_epi8(*this) == 0;
+ }
+ // Order-sensitive comparisons
+ simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+ return _mm256_max_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+ return _mm256_min_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+ return _mm256_cmpgt_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+ return _mm256_cmpgt_epi8(other, *this);
+ }
+};
+
+// Unsigned bytes
+template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+ simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+ simdutf_really_inline simd8(const __m256i _value)
+ : base8_numeric<uint8_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
+ // Member-by-member initialization
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+ uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
+ uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
+ uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
+ uint8_t v31)
+ : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+ v22, v23, v24, v25, v26, v27, v28, v29, v30,
+ v31)) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<uint8_t>
+ repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+ uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+ uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+ uint8_t v15) {
+ return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15);
+ }
+
+ // Saturated math
+ simdutf_really_inline simd8<uint8_t>
+ saturating_add(const simd8<uint8_t> other) const {
+ return _mm256_adds_epu8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ saturating_sub(const simd8<uint8_t> other) const {
+ return _mm256_subs_epu8(*this, other);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd8<uint8_t>
+ max_val(const simd8<uint8_t> other) const {
+ return _mm256_max_epu8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ min_val(const simd8<uint8_t> other) const {
+ return _mm256_min_epu8(other, *this);
+ }
+ // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ gt_bits(const simd8<uint8_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ lt_bits(const simd8<uint8_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint8_t> other) const {
+ return other.max_val(*this) == other;
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint8_t> other) const {
+ return other.min_val(*this) == other;
+ }
+ simdutf_really_inline simd8<bool>
+ operator>(const simd8<uint8_t> other) const {
+ return this->gt_bits(other).any_bits_set();
+ }
+ simdutf_really_inline simd8<bool>
+ operator<(const simd8<uint8_t> other) const {
+ return this->lt_bits(other).any_bits_set();
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd8<bool> bits_not_set() const {
+ return *this == uint8_t(0);
+ }
+ simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
+ return (*this & bits).bits_not_set();
+ }
+ simdutf_really_inline simd8<bool> any_bits_set() const {
+ return ~this->bits_not_set();
+ }
+ simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+ return ~this->bits_not_set(bits);
+ }
+ simdutf_really_inline bool is_ascii() const {
+ return _mm256_movemask_epi8(*this) == 0;
+ }
+ simdutf_really_inline bool bits_not_set_anywhere() const {
+ return _mm256_testz_si256(*this, *this);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return !bits_not_set_anywhere();
+ }
+ simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
+ return _mm256_testz_si256(*this, bits);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+ return !bits_not_set_anywhere(bits);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+ return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+ return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N);
+ }
+ // Get one of the bits and make a bitmask out of it.
+ // e.g. value.get_bit<7>() gets the high bit
+ template <int N> simdutf_really_inline int get_bit() const {
+ return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7 - N));
+ }
+};
+// Reinterprets the signed byte vector as an unsigned one: the same 256-bit
+// register is wrapped in a simd8<uint8_t> without changing any bits.
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
+  return simd8<uint8_t>(this->value);
+}
+
+template <typename T> struct simd8x64 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+ static_assert(NUM_CHUNKS == 2,
+ "Haswell kernel should use two registers per 64-byte block.");
+ simd8<T> chunks[NUM_CHUNKS];
+
+ simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+ simd8x64<T> &
+ operator=(const simd8<T> other) = delete; // no assignment allowed
+ simd8x64() = delete; // no default constructor allowed
+
+ simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
+ : chunks{chunk0, chunk1} {}
+ simdutf_really_inline simd8x64(const T *ptr)
+ : chunks{simd8<T>::load(ptr),
+ simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}
+
+ simdutf_really_inline void store(T *ptr) const {
+ this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+ }
+
+  // Combines the bitmasks of both chunks into one 64-bit value: chunk 0
+  // supplies bits 0-31 and chunk 1 supplies bits 32-63.
+  simdutf_really_inline uint64_t to_bitmask() const {
+    const uint64_t low_half = uint32_t(this->chunks[0].to_bitmask());
+    const uint64_t high_half = this->chunks[1].to_bitmask();
+    return (high_half << 32) | low_half;
+  }
+
+ simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
+ this->chunks[0] |= other.chunks[0];
+ this->chunks[1] |= other.chunks[1];
+ return *this;
+ }
+
+ simdutf_really_inline simd8<T> reduce_or() const {
+ return this->chunks[0] | this->chunks[1];
+ }
+
+ simdutf_really_inline bool is_ascii() const {
+ return this->reduce_or().is_ascii();
+ }
+
+ template <endianness endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 0);
+ this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 1);
+ }
+
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+ this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+ this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+ }
+
+ simdutf_really_inline simd8x64<T> bit_or(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<T>(this->chunks[0] | mask, this->chunks[1] | mask);
+ }
+
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+ return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+ this->chunks[1] == other.chunks[1])
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+
+ return simd8x64<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+ return simd8x64<bool>(
+ (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+ (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t gt(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+ const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+ return simd8x64<bool>((simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
+ (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
+ .to_bitmask();
+ }
+}; // struct simd8x64<T>
+
+#include "simdutf/haswell/simd16-inl.h"
+
+} // namespace simd
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_HASWELL_SIMD_H
diff --git a/contrib/simdutf/src/simdutf/haswell/simd16-inl.h b/contrib/simdutf/src/simdutf/haswell/simd16-inl.h
new file mode 100644
index 000000000..48304d568
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/haswell/simd16-inl.h
@@ -0,0 +1,355 @@
+#ifdef __GNUC__
+ #if __GNUC__ < 8
+ #define _mm256_set_m128i(xmm1, xmm2) \
+ _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), \
+ _mm256_castsi128_si256(xmm2), 2)
+ #define _mm256_setr_m128i(xmm2, xmm1) \
+ _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), \
+ _mm256_castsi128_si256(xmm2), 2)
+ #endif
+#endif
+
+template <typename T> struct simd16;
+
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> {
+ using bitmask_type = uint32_t;
+
+ simdutf_really_inline base16() : base<simd16<T>>() {}
+ simdutf_really_inline base16(const __m256i _value)
+ : base<simd16<T>>(_value) {}
+ template <typename Pointer>
+ simdutf_really_inline base16(const Pointer *ptr)
+ : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}
+ friend simdutf_always_inline Mask operator==(const simd16<T> lhs,
+ const simd16<T> rhs) {
+ return _mm256_cmpeq_epi16(lhs, rhs);
+ }
+
+ /// the size of vector in bytes
+ static const int SIZE = sizeof(base<simd16<T>>::value);
+
+ /// the number of elements of type T a vector can hold
+ static const int ELEMENTS = SIZE / sizeof(T);
+
+ // Returns _mm256_alignr_epi8(*this, prev_chunk, 16 - N), i.e. a byte-wise
+ // alignr of this vector and prev_chunk with a shift of 16 - N bytes.
+ // NOTE(review): base8::prev builds its second operand with
+ // _mm256_permute2x128_si256(prev_chunk, *this, 0x21) before the alignr; this
+ // version passes prev_chunk directly, so the result is formed independently
+ // in each 128-bit lane. Confirm that is intended, and whether N is meant to
+ // count bytes or 16-bit elements.
+ template <int N = 1>
+ simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+ return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
+ }
+};
+
+// SIMD 16-bit mask type (returned by things like eq and gt)
+template <> struct simd16<bool> : base16<bool> {
+ static simdutf_really_inline simd16<bool> splat(bool _value) {
+ return _mm256_set1_epi16(uint16_t(-(!!_value)));
+ }
+
+ simdutf_really_inline simd16() : base16() {}
+ simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+
+ simdutf_really_inline bitmask_type to_bitmask() const {
+ return _mm256_movemask_epi8(*this);
+ }
+ simdutf_really_inline bool any() const {
+ return !_mm256_testz_si256(*this, *this);
+ }
+ simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+};
+
+template <typename T> struct base16_numeric : base16<T> {
+ static simdutf_really_inline simd16<T> splat(T _value) {
+ return _mm256_set1_epi16(_value);
+ }
+ static simdutf_really_inline simd16<T> zero() {
+ return _mm256_setzero_si256();
+ }
+  // Loads one full vector from values with an unaligned 256-bit load. A
+  // 256-bit register holds 16 lanes of 16 bits, so the array bound is written
+  // as 16; the parameter still decays to const T *, so callers are unaffected.
+  static simdutf_really_inline simd16<T> load(const T values[16]) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+  }
+
+ simdutf_really_inline base16_numeric() : base16<T>() {}
+ simdutf_really_inline base16_numeric(const __m256i _value)
+ : base16<T>(_value) {}
+
+  // Store to array: writes the whole 256-bit vector (16 lanes of 16 bits) to
+  // dst with an unaligned store. The array bound is written as 16 to match
+  // the number of lanes; the parameter still decays to T *.
+  simdutf_really_inline void store(T dst[16]) const {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
+  }
+
+ // Override to distinguish from bool version
+ simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+ return _mm256_add_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+ return _mm256_sub_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+ *this = *this + other;
+ return *static_cast<simd16<T> *>(this);
+ }
+ simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+ *this = *this - other;
+ return *static_cast<simd16<T> *>(this);
+ }
+};
+
+// Signed code units
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+ simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+ simdutf_really_inline simd16(const __m256i _value)
+ : base16_numeric<int16_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+ // Order-sensitive comparisons
+ simdutf_really_inline simd16<int16_t>
+ max_val(const simd16<int16_t> other) const {
+ return _mm256_max_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<int16_t>
+ min_val(const simd16<int16_t> other) const {
+ return _mm256_min_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<int16_t> other) const {
+ return _mm256_cmpgt_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<int16_t> other) const {
+ return _mm256_cmpgt_epi16(other, *this);
+ }
+};
+
+// Unsigned code units
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+ simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+ simdutf_really_inline simd16(const __m256i _value)
+ : base16_numeric<uint16_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
+
+ // Saturated math
+ simdutf_really_inline simd16<uint16_t>
+ saturating_add(const simd16<uint16_t> other) const {
+ return _mm256_adds_epu16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ saturating_sub(const simd16<uint16_t> other) const {
+ return _mm256_subs_epu16(*this, other);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd16<uint16_t>
+ max_val(const simd16<uint16_t> other) const {
+ return _mm256_max_epu16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ min_val(const simd16<uint16_t> other) const {
+ return _mm256_min_epu16(*this, other);
+ }
+ // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ gt_bits(const simd16<uint16_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ lt_bits(const simd16<uint16_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<=(const simd16<uint16_t> other) const {
+ return other.max_val(*this) == other;
+ }
+ simdutf_really_inline simd16<bool>
+ operator>=(const simd16<uint16_t> other) const {
+ return other.min_val(*this) == other;
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<uint16_t> other) const {
+ return this->gt_bits(other).any_bits_set();
+ }
+  // Lane-wise unsigned "less than": each 16-bit lane of the result is all
+  // ones where *this < other and zero elsewhere. Built on lt_bits, because
+  // other - *this with unsigned saturation is nonzero exactly when
+  // *this < other; using gt_bits here would compute operator> instead.
+  simdutf_really_inline simd16<bool>
+  operator<(const simd16<uint16_t> other) const {
+    return this->lt_bits(other).any_bits_set();
+  }
+
+ // Bit-specific operations
+ simdutf_really_inline simd16<bool> bits_not_set() const {
+ return *this == uint16_t(0);
+ }
+ simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const {
+ return (*this & bits).bits_not_set();
+ }
+ simdutf_really_inline simd16<bool> any_bits_set() const {
+ return ~this->bits_not_set();
+ }
+ simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const {
+ return ~this->bits_not_set(bits);
+ }
+
+ simdutf_really_inline bool bits_not_set_anywhere() const {
+ return _mm256_testz_si256(*this, *this);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return !bits_not_set_anywhere();
+ }
+ simdutf_really_inline bool
+ bits_not_set_anywhere(simd16<uint16_t> bits) const {
+ return _mm256_testz_si256(*this, bits);
+ }
+ simdutf_really_inline bool
+ any_bits_set_anywhere(simd16<uint16_t> bits) const {
+ return !bits_not_set_anywhere(bits);
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+ return simd16<uint16_t>(_mm256_srli_epi16(*this, N));
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+ return simd16<uint16_t>(_mm256_slli_epi16(*this, N));
+ }
+ // Get one of the bits and make a bitmask out of it.
+ // e.g. value.get_bit<15>() gets the high bit
+ template <int N> simdutf_really_inline int get_bit() const {
+ return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15 - N));
+ }
+
+ // Change the endianness
+ simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+ const __m256i swap = _mm256_setr_epi8(
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+ 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+ return _mm256_shuffle_epi8(*this, swap);
+ }
+
+ // Pack with the unsigned saturation of two uint16_t code units into single
+ // uint8_t vector
+ static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+ const simd16<uint16_t> &v1) {
+ // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
+ // we have to shuffle lanes in order to produce bytes in the
+ // correct order.
+
+ // get the 0th lanes
+ const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
+ const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
+
+ // get the 1st lanes
+ const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
+ const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
+
+ // build new vectors (shuffle lanes)
+ const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
+ const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
+
+ // pack code units in linear order from v0 and v1
+ return _mm256_packus_epi16(t0, t1);
+ }
+};
+
+template <typename T> struct simd16x32 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+ static_assert(NUM_CHUNKS == 2,
+ "Haswell kernel should use two registers per 64-byte block.");
+ simd16<T> chunks[NUM_CHUNKS];
+
+ simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
+ simd16x32<T> &
+ operator=(const simd16<T> other) = delete; // no assignment allowed
+ simd16x32() = delete; // no default constructor allowed
+
+ simdutf_really_inline simd16x32(const simd16<T> chunk0,
+ const simd16<T> chunk1)
+ : chunks{chunk0, chunk1} {}
+ simdutf_really_inline simd16x32(const T *ptr)
+ : chunks{simd16<T>::load(ptr),
+ simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}
+
+ simdutf_really_inline void store(T *ptr) const {
+ this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+ }
+
+ simdutf_really_inline uint64_t to_bitmask() const {
+ uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+ uint64_t r_hi = this->chunks[1].to_bitmask();
+ return r_lo | (r_hi << 32);
+ }
+
+ simdutf_really_inline simd16<T> reduce_or() const {
+ return this->chunks[0] | this->chunks[1];
+ }
+
+ // Returns is_ascii() of the OR of both chunks (see reduce_or()).
+ // NOTE(review): none of the simd16<T> types visible in this header declares
+ // an is_ascii() member, so this presumably only compiles as long as it is
+ // never instantiated - confirm before calling.
+ simdutf_really_inline bool is_ascii() const {
+ return this->reduce_or().is_ascii();
+ }
+
+ // Forwards to store_ascii_as_utf16 on each chunk; the second chunk is
+ // written sizeof(simd16<T>) char16_t elements after ptr.
+ // NOTE(review): the store_ascii_as_utf16 member visible on base<> in
+ // haswell/simd.h is a template that takes an explicit endianness argument,
+ // which is not supplied here - confirm whether this function is unused or
+ // needs that argument.
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+ this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>));
+ }
+
+ simdutf_really_inline simd16x32<T> bit_or(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<T>(this->chunks[0] | mask, this->chunks[1] | mask);
+ }
+
+ simdutf_really_inline void swap_bytes() {
+ this->chunks[0] = this->chunks[0].swap_bytes();
+ this->chunks[1] = this->chunks[1].swap_bytes();
+ }
+
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
+ return simd16x32<bool>(this->chunks[0] == other.chunks[0],
+ this->chunks[1] == other.chunks[1])
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+
+ return simd16x32<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+ .to_bitmask();
+ }
+  // Returns the to_bitmask() of the lanes whose value lies outside the
+  // inclusive range [low, high]. The bounds are compared directly rather
+  // than against low - 1 and high + 1: those wrap around when low is the
+  // smallest or high the largest value of T, which would flag every lane.
+  // Only operator> is used; "x < low" is written as "low > x".
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+    const simd16<T> mask_low = simd16<T>::splat(low);
+    const simd16<T> mask_high = simd16<T>::splat(high);
+    return simd16x32<bool>(
+               (this->chunks[0] > mask_high) | (mask_low > this->chunks[0]),
+               (this->chunks[1] > mask_high) | (mask_low > this->chunks[1]))
+        .to_bitmask();
+  }
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
+ .to_bitmask();
+ }
+}; // struct simd16x32<T>
diff --git a/contrib/simdutf/src/simdutf/icelake.h b/contrib/simdutf/src/simdutf/icelake.h
new file mode 100644
index 000000000..e63c4413e
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/icelake.h
@@ -0,0 +1,71 @@
+#ifndef SIMDUTF_ICELAKE_H
+#define SIMDUTF_ICELAKE_H
+
+#include "simdutf/portability.h"
+
+#ifdef __has_include
+ // Heuristic for compiler VBMI2 support: if <avx512vbmi2intrin.h> can be
+ // found, we assume the compiler is able to target AVX-512 VBMI2.
+ #if __has_include(<avx512vbmi2intrin.h>)
+ #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
+ #endif
+#endif
+
+#ifdef _MSC_VER
+ #if _MSC_VER >= 1930
+ // Visual Studio 2022 and up support VBMI2 under x64 even if the header
+ // avx512vbmi2intrin.h is not found.
+ // Visual Studio 2019 technically supports VBMI2, but the implementation
+ // might be unreliable. Search for visualstudio2019icelakeissue in our
+ // tests.
+ #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
+ #endif
+#endif
+
+// We allow icelake on x64 as long as the compiler is known to support VBMI2.
+#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE
+ #define SIMDUTF_IMPLEMENTATION_ICELAKE \
+ ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
+#endif
+
+// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
+// https://github.com/simdutf/simdutf/issues/1247
+#if ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && \
+ (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL && \
+ SIMDUTF_HAS_AVX512VBMI2) && \
+ (!SIMDUTF_IS_32BITS))
+ #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 1
+#else
+ #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 0
+#endif
+
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+ #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+ #define SIMDUTF_TARGET_ICELAKE
+ #else
+ #define SIMDUTF_TARGET_ICELAKE \
+ SIMDUTF_TARGET_REGION( \
+ "avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2," \
+ "avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq")
+ #endif
+
+namespace simdutf {
+namespace icelake {} // namespace icelake
+} // namespace simdutf
+
+ //
+ // These two need to be included outside SIMDUTF_TARGET_REGION
+ //
+ #include "simdutf/icelake/intrinsics.h"
+ #include "simdutf/icelake/implementation.h"
+
+ //
+ // The rest need to be inside the region
+ //
+ #include "simdutf/icelake/begin.h"
+ // Declarations
+ #include "simdutf/icelake/bitmanipulation.h"
+ #include "simdutf/icelake/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_ICELAKE
+#endif // SIMDUTF_ICELAKE_H
diff --git a/contrib/simdutf/src/simdutf/icelake/begin.h b/contrib/simdutf/src/simdutf/icelake/begin.h
new file mode 100644
index 000000000..abe13d28b
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/icelake/begin.h
@@ -0,0 +1,14 @@
+#define SIMDUTF_IMPLEMENTATION icelake // used as a namespace name, e.g. in icelake/bitmanipulation.h
+
+#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+// nothing needed.
+#else
+SIMDUTF_TARGET_ICELAKE // counterpart of SIMDUTF_UNTARGET_REGION in icelake/end.h
+#endif
+
+#if SIMDUTF_GCC11ORMORE // workaround for
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+// clang-format off
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) // counterpart of SIMDUTF_POP_DISABLE_WARNINGS in icelake/end.h
+// clang-format on
+#endif // end of workaround
diff --git a/contrib/simdutf/src/simdutf/icelake/bitmanipulation.h b/contrib/simdutf/src/simdutf/icelake/bitmanipulation.h
new file mode 100644
index 000000000..286193e8c
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/icelake/bitmanipulation.h
@@ -0,0 +1,33 @@
+#ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
+#define SIMDUTF_ICELAKE_BITMANIPULATION_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { // Visual Studio branch: forwards to __popcnt64
+ // note: we do not support legacy 32-bit Windows
+ return __popcnt64(input_num); // Visual Studio wants two underscores
+}
+#else
+simdutf_really_inline long long int count_ones(uint64_t input_num) { // other compilers: forwards to _popcnt64; note the return type differs from the branch above
+ return _popcnt64(input_num);
+}
+#endif
+
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) { // only compiled when SIMDUTF_NEED_TRAILING_ZEROES is nonzero
+ #if SIMDUTF_REGULAR_VISUAL_STUDIO
+ return (int)_tzcnt_u64(input_num); // result narrowed to int
+ #else // SIMDUTF_REGULAR_VISUAL_STUDIO
+ return __builtin_ctzll(input_num); // NOTE(review): GCC documents __builtin_ctzll(0) as undefined - confirm callers never pass 0
+ #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/icelake/end.h b/contrib/simdutf/src/simdutf/icelake/end.h
new file mode 100644
index 000000000..92b1cd599
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/icelake/end.h
@@ -0,0 +1,12 @@
+#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+// nothing needed.
+#else
+SIMDUTF_UNTARGET_REGION // counterpart of SIMDUTF_TARGET_ICELAKE in icelake/begin.h
+#endif
+
+#undef SIMDUTF_IMPLEMENTATION
+
+#if SIMDUTF_GCC11ORMORE // workaround for
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+SIMDUTF_POP_DISABLE_WARNINGS // counterpart of SIMDUTF_DISABLE_GCC_WARNING in icelake/begin.h
+#endif // end of workaround
diff --git a/contrib/simdutf/src/simdutf/icelake/implementation.h b/contrib/simdutf/src/simdutf/icelake/implementation.h
new file mode 100644
index 000000000..2f66b637f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/icelake/implementation.h
@@ -0,0 +1,229 @@
+#ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
+#define SIMDUTF_ICELAKE_IMPLEMENTATION_H
+
+#include "simdutf/implementation.h"
+
+namespace simdutf {
+namespace icelake {
+
+namespace {
+using namespace simdutf;
+}
+
+class implementation final : public simdutf::implementation { // icelake-specific subclass of the generic simdutf::implementation interface
+public:
+ simdutf_really_inline implementation() // passes the name "icelake", a description string and the OR of instruction_set flags to the base constructor
+ : simdutf::implementation(
+ "icelake",
+ "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 "
+ "extensions)",
+ internal::instruction_set::AVX2 | internal::instruction_set::BMI1 |
+ internal::instruction_set::BMI2 |
+ internal::instruction_set::AVX512BW |
+ internal::instruction_set::AVX512CD |
+ internal::instruction_set::AVX512VL |
+ internal::instruction_set::AVX512VBMI2 |
+ internal::instruction_set::AVX512VPOPCNTDQ) {}
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t length) const noexcept; // NOTE(review): this and all following declarations omit `final`, unlike those above - confirm intended
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept; // the only declaration here without simdutf_warn_unused besides change_endianness_utf16
+};
+
+} // namespace icelake
+} // namespace simdutf
+
+#endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/icelake/intrinsics.h b/contrib/simdutf/src/simdutf/icelake/intrinsics.h
new file mode 100644
index 000000000..d4a58a0f7
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/icelake/intrinsics.h
@@ -0,0 +1,138 @@
+#ifndef SIMDUTF_ICELAKE_INTRINSICS_H
+#define SIMDUTF_ICELAKE_INTRINSICS_H
+
+#include "simdutf.h"
+
+#ifdef SIMDUTF_VISUAL_STUDIO
+ // under clang within visual studio, this will include <x86intrin.h>
+ #include <intrin.h> // visual studio or clang
+ #include <immintrin.h>
+#else
+
+ #if SIMDUTF_GCC11ORMORE
+// We should not get warnings while including <x86intrin.h>, yet we do
+// under some versions of GCC.
+// If the x86intrin.h header has uninitialized values that are problematic,
+// that is a GCC issue, so we ignore these warnings.
+SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
+ #endif
+
+ #include <x86intrin.h> // elsewhere
+
+ #if SIMDUTF_GCC11ORMORE
+// cancels the suppression of the -Wuninitialized
+SIMDUTF_POP_DISABLE_WARNINGS
+ #endif
+
+ #ifndef _tzcnt_u64
+ #define _tzcnt_u64(x) __tzcnt_u64(x)
+ #endif // _tzcnt_u64
+#endif // SIMDUTF_VISUAL_STUDIO
+
+#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
+ /**
+ * You are not supposed, normally, to include these
+ * headers directly. Instead you should either include intrin.h
+ * or x86intrin.h. However, when compiling with clang
+ * under Windows (i.e., when _MSC_VER is set), these headers
+ * only get included *if* the corresponding features are detected
+ * from macros:
+ * e.g., if __AVX2__ is set... in turn, we normally set these
+ * macros by compiling against the corresponding architecture
+ * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
+ * software with these advanced instructions. In simdutf, we
+ * want to compile the whole program for a generic target,
+ * and only target our specific kernels. As a workaround,
+ * we directly include the needed headers. These headers would
+ * normally guard against such usage, but we carefully included
+ * <x86intrin.h> (or <intrin.h>) before, so the headers
+ * are fooled.
+ */
+ #include <bmiintrin.h> // for _blsr_u64
+ #include <bmi2intrin.h> // for _pext_u64, _pdep_u64
+ #include <lzcntintrin.h> // for __lzcnt64
+ #include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
+ #include <smmintrin.h>
+ #include <tmmintrin.h>
+ #include <avxintrin.h>
+ #include <avx2intrin.h>
+ // Important: we need the AVX-512 headers:
+ #include <avx512fintrin.h>
+ #include <avx512dqintrin.h>
+ #include <avx512cdintrin.h>
+ #include <avx512bwintrin.h>
+ #include <avx512vlintrin.h>
+ #include <avx512vlbwintrin.h>
+ #include <avx512vbmiintrin.h>
+ #include <avx512vbmi2intrin.h>
+ #include <avx512vpopcntdqintrin.h>
+ #include <avx512vpopcntdqvlintrin.h>
+ // unfortunately, we may not get _blsr_u64, but, thankfully, clang
+ // has it as a macro.
+ #ifndef _blsr_u64
+ // we roll our own: (n - 1) & n. The argument is parenthesized so that expressions such as a | b expand correctly; it is evaluated twice.
+ #define _blsr_u64(n) (((n) - 1) & (n))
+ #endif // _blsr_u64
+#endif // SIMDUTF_CLANG_VISUAL_STUDIO
+
+#if defined(__GNUC__) && !defined(__clang__)
+
+ #if __GNUC__ == 8
+ #define SIMDUTF_GCC8 1
+ #elif __GNUC__ == 9
+ #define SIMDUTF_GCC9 1
+ #endif // __GNUC__ == 8 || __GNUC__ == 9
+
+#endif // defined(__GNUC__) && !defined(__clang__)
+
+#if SIMDUTF_GCC8
+ #pragma GCC push_options
+ #pragma GCC target("avx512f")
+/**
+ * GCC 8 fails to provide _mm512_set_epi8. We roll our own on top of _mm512_set_epi64: each run of eight bytes (a0..a7, a8..a15, ...) forms one 64-bit argument, with the first byte of the run in the top 8 bits.
+ */
+inline __m512i
+_mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4,
+ uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9,
+ uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14,
+ uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19,
+ uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
+ uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29,
+ uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34,
+ uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
+ uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44,
+ uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49,
+ uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54,
+ uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59,
+ uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
+ return _mm512_set_epi64(
+ uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) +
+ (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) +
+ (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
+ uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) +
+ (uint64_t(a12) << 24) + (uint64_t(a11) << 32) +
+ (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
+ uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) +
+ (uint64_t(a20) << 24) + (uint64_t(a19) << 32) +
+ (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
+ uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) +
+ (uint64_t(a28) << 24) + (uint64_t(a27) << 32) +
+ (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
+ uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) +
+ (uint64_t(a36) << 24) + (uint64_t(a35) << 32) +
+ (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
+ uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) +
+ (uint64_t(a44) << 24) + (uint64_t(a43) << 32) +
+ (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
+ uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) +
+ (uint64_t(a52) << 24) + (uint64_t(a51) << 32) +
+ (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
+ uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) +
+ (uint64_t(a60) << 24) + (uint64_t(a59) << 32) +
+ (uint64_t(a58) << 40) + (uint64_t(a57) << 48) +
+ (uint64_t(a56) << 56));
+}
+ #pragma GCC pop_options
+#endif // SIMDUTF_GCC8
+
+#endif // SIMDUTF_ICELAKE_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/lasx.h b/contrib/simdutf/src/simdutf/lasx.h
new file mode 100644
index 000000000..c1f66a30d
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx.h
@@ -0,0 +1,44 @@
+#ifndef SIMDUTF_LASX_H
+#define SIMDUTF_LASX_H
+
+#ifdef SIMDUTF_FALLBACK_H
+ #error "lasx.h must be included before fallback.h"
+#endif
+
+#include "simdutf/portability.h"
+
+#ifndef SIMDUTF_IMPLEMENTATION_LASX
+ #define SIMDUTF_IMPLEMENTATION_LASX (SIMDUTF_IS_LASX)
+#endif
+#if SIMDUTF_IMPLEMENTATION_LASX && SIMDUTF_IS_LASX
+ #define SIMDUTF_CAN_ALWAYS_RUN_LASX 1
+#else
+ #define SIMDUTF_CAN_ALWAYS_RUN_LASX 0
+#endif
+
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
+#include "simdutf/internal/isadetection.h"
+
+#if SIMDUTF_IMPLEMENTATION_LASX
+
+namespace simdutf {
+/**
+ * Implementation for LoongArch ASX.
+ */
+namespace lasx {} // namespace lasx
+} // namespace simdutf
+
+ #include "simdutf/lasx/implementation.h"
+
+ #include "simdutf/lasx/begin.h"
+
+ // Declarations
+ #include "simdutf/lasx/intrinsics.h"
+ #include "simdutf/lasx/bitmanipulation.h"
+ #include "simdutf/lasx/simd.h"
+
+ #include "simdutf/lasx/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_LASX
+
+#endif // SIMDUTF_LASX_H
diff --git a/contrib/simdutf/src/simdutf/lasx/begin.h b/contrib/simdutf/src/simdutf/lasx/begin.h
new file mode 100644
index 000000000..2484c6c7f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/begin.h
@@ -0,0 +1 @@
+#define SIMDUTF_IMPLEMENTATION lasx
diff --git a/contrib/simdutf/src/simdutf/lasx/bitmanipulation.h b/contrib/simdutf/src/simdutf/lasx/bitmanipulation.h
new file mode 100644
index 000000000..e974413b9
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/bitmanipulation.h
@@ -0,0 +1,25 @@
+#ifndef SIMDUTF_LASX_BITMANIPULATION_H
+#define SIMDUTF_LASX_BITMANIPULATION_H
+
+#include "simdutf.h"
+#include <limits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+simdutf_really_inline int count_ones(uint64_t input_num) { // chains __lsx_vreplgr2vr_d -> __lsx_vpcnt_d -> __lsx_vpickve2gr_w(..., 0); presumably a 64-bit population count - confirm against the LSX intrinsics reference
+ return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
+}
+
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) { // only compiled when SIMDUTF_NEED_TRAILING_ZEROES is nonzero
+ return __builtin_ctzll(input_num); // NOTE(review): GCC documents __builtin_ctzll(0) as undefined - confirm callers never pass 0
+}
+#endif
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_LASX_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/lasx/end.h b/contrib/simdutf/src/simdutf/lasx/end.h
new file mode 100644
index 000000000..58fd810d4
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/end.h
@@ -0,0 +1 @@
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/lasx/implementation.h b/contrib/simdutf/src/simdutf/lasx/implementation.h
new file mode 100644
index 000000000..6d2b18b67
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/implementation.h
@@ -0,0 +1,230 @@
+#ifndef SIMDUTF_LASX_IMPLEMENTATION_H
+#define SIMDUTF_LASX_IMPLEMENTATION_H
+
+#include "simdutf.h"
+#include "simdutf/internal/isadetection.h"
+
+namespace simdutf {
+namespace lasx {
+
+namespace {
+using namespace simdutf;
+}
+
+class implementation final : public simdutf::implementation {
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("lasx", "LOONGARCH ASX",
+ internal::instruction_set::LSX |
+ internal::instruction_set::LASX) {}
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+};
+
+} // namespace lasx
+} // namespace simdutf
+
+#endif // SIMDUTF_LASX_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/lasx/intrinsics.h b/contrib/simdutf/src/simdutf/lasx/intrinsics.h
new file mode 100644
index 000000000..9965504da
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/intrinsics.h
@@ -0,0 +1,101 @@
+#ifndef SIMDUTF_LASX_INTRINSICS_H
+#define SIMDUTF_LASX_INTRINSICS_H
+
+#include "simdutf.h"
+
+// LoongArch SIMD intrinsic headers: <lsxintrin.h> declares the __lsx_*
+// intrinsics and <lasxintrin.h> the __lasx_* intrinsics used in this backend.
+#include <lsxintrin.h>
+#include <lasxintrin.h>
+
+#if defined(__loongarch_asx)
+ // Register-name prefixes that the inline-asm helpers in this header compare
+ // (with `.ifc`) against the text substituted for their "f"-constrained
+ // operands. Clang and GCC get different spellings here.
+ // NOTE(review): presumably these match how each compiler prints such
+ // operands -- confirm against the compiler documentation.
+ #ifdef __clang__
+ #define VREGS_PREFIX "$vr"
+ #define XREGS_PREFIX "$xr"
+ #else // GCC
+ #define VREGS_PREFIX "$f"
+ #define XREGS_PREFIX "$f"
+ #endif
+ // Comma-separated register numbers 0..31, used as the value list of the
+ // `.irp` loops in the inline-asm helpers in this header.
+ #define __ALL_REGS \
+ "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26," \
+ "27,28,29,30,31"
+// Convert __m128i to __m256i
+//
+// The asm body runs two nested `.irp` loops over __ALL_REGS. `.ifc` compares
+// the text of %[out] with XREGS_PREFIX<i> and the text of %[in] with
+// VREGS_PREFIX<j>; for the matching pair it emits exactly one
+// `xvpermi.q $xr<i>, $xr<j>, 0x0`.
+// NOTE(review): which 128-bit halves end up where for immediate 0x0 is not
+// visible in this file -- confirm against the LASX instruction reference.
+static inline __m256i ____m256i(__m128i in) {
+ // `out` is pre-loaded with __lasx_xvldi(0) and passed as a read-write ("+f")
+ // operand, so the instruction sees that initial value.
+ __m256i out = __lasx_xvldi(0);
+ __asm__ volatile(".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[in], " VREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x0 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ : [out] "+f"(out)
+ : [in] "f"(in));
+ return out;
+}
+// Convert two __m128i to __m256i
+//
+// The asm body has two parts, both using `.irp`/`.ifc` to turn operand text
+// into register numbers:
+//  1. For the registers holding %[hi] (i) and %[lo] (j) it emits
+//     `xvpermi.q $xr<i>, $xr<j>, 0x20`, which writes the register of `inhi`;
+//     that is why `inhi` is declared as a read-write ("+f") operand.
+//  2. Unless %[out] and %[hi] are textually the same register (`.ifnc`), it
+//     emits `xvori.b $xr<out>, $xr<hi>, 0` (OR with 0, i.e. a register copy)
+//     to move the combined value into `out`.
+// NOTE(review): the half selection encoded by immediate 0x20 is not visible
+// here -- confirm against the LASX instruction reference.
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
+ __m256i out;
+ __asm__ volatile(".irp i," __ALL_REGS "\n\t"
+ " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".ifnc %[out], %[hi] \n\t"
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
+ " xvori.b $xr\\i, $xr\\j, 0 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".endif \n\t"
+ : [out] "=f"(out), [hi] "+f"(inhi)
+ : [lo] "f"(inlo));
+ return out;
+}
+// Convert __m256i low part to __m128i
+//
+// When %[out] and %[in] are textually different registers (`.ifnc`), the asm
+// body resolves their register numbers with `.irp`/`.ifc` and emits
+// `vori.b $vr<out>, $vr<in>, 0` (OR with 0, i.e. a copy). When the compiler
+// assigns both operands the same register, no instruction is emitted.
+// NOTE(review): this relies on $vr<n> naming the low 128 bits of $xr<n> --
+// confirm against the LoongArch register model.
+static inline __m128i lasx_extracti128_lo(__m256i in) {
+ __m128i out;
+ __asm__ volatile(".ifnc %[out], %[in] \n\t"
+ ".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
+ " vori.b $vr\\i, $vr\\j, 0 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".endif \n\t"
+ : [out] "=f"(out)
+ : [in] "f"(in));
+ return out;
+}
+// Convert __m256i high part to __m128i
+//
+// The asm body resolves the register numbers of %[out] (i) and %[in] (j) with
+// nested `.irp`/`.ifc` and emits `xvpermi.q $xr<i>, $xr<j>, 0x11`. Unlike
+// lasx_extracti128_lo there is no `.ifnc` guard, so the instruction is emitted
+// even when both operands share a register.
+// NOTE(review): the half selection encoded by immediate 0x11 is not visible
+// here -- confirm against the LASX instruction reference.
+static inline __m128i lasx_extracti128_hi(__m256i in) {
+ __m128i out;
+ __asm__ volatile(".irp i," __ALL_REGS "\n\t"
+ " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
+ " .irp j," __ALL_REGS "\n\t"
+ " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ : [out] "=f"(out)
+ : [in] "f"(in));
+ return out;
+}
+#endif
+
+#endif // SIMDUTF_LASX_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/lasx/simd.h b/contrib/simdutf/src/simdutf/lasx/simd.h
new file mode 100644
index 000000000..493b7a452
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/simd.h
@@ -0,0 +1,707 @@
+#ifndef SIMDUTF_LASX_SIMD_H
+#define SIMDUTF_LASX_SIMD_H
+
+#include "simdutf.h"
+#include "simdutf/lasx/bitmanipulation.h"
+#include <type_traits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace simd {
+
+// Byte-shuffle selectors for base8::prev<N>(): row N is loaded with
+// __lasx_xvld and passed as the selector operand of __lasx_xvshuf_b.
+// Each row is 32 bytes and the table is 32-byte aligned; the first 16 entries
+// of a row apply to the low half of the vector, the last 16 to the high half.
+// In the active code of this file only rows with N < 16 are loaded.
+__attribute__((aligned(32))) static const uint8_t prev_shuf_table[32][32] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
+ {0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
+ {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+ {0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+ {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+ {0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+ {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7,
+ 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5,
+ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0},
+ {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+ 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+ 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0},
+ {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0},
+ {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0},
+ {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0},
+ {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0},
+ {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0},
+ {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0},
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+};
+
+// Byte-select masks for base8::prev<N>(): row N has its first N bytes set to
+// 0xFF and the remaining bytes set to 0x0. prev<N>() loads row N with
+// __lasx_xvld and uses it as the mask operand of __lasx_xvbitsel_v.
+// Each row is 32 bytes and the table is 32-byte aligned.
+__attribute__((aligned(32))) static const uint8_t bitsel_mask_table[32][32] = {
+ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0},
+ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0}};
+
+// Forward-declared so they can be used by splat and friends.
+template <typename Child> struct base {
+  // Raw 256-bit register wrapped by every derived vector type.
+  __m256i value;
+
+  // Default construction: value-initialised register.
+  simdutf_really_inline base() : value{__m256i()} {}
+
+  // Wrap an existing SIMD register.
+  simdutf_really_inline base(const __m256i _value) : value(_value) {}
+  // Expose the wrapped SIMD register (read-only and mutable views).
+  simdutf_really_inline operator const __m256i &() const { return this->value; }
+  simdutf_really_inline operator __m256i &() { return this->value; }
+
+  // Widen the 32 bytes of this vector to 32 UTF-16 code units and store them
+  // at ptr[0..31] as two consecutive 32-byte stores (byte offsets 0 and 32).
+  template <endianness big_endian>
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+    if (!big_endian) {
+      // Little-endian: widen the low 16 bytes, then the high 16 bytes after
+      // swapping the two 128-bit halves.
+      const __m256i halves_swapped =
+          __lasx_xvpermi_q(this->value, this->value, 0b00000001);
+      const __m256i units_lo = __lasx_vext2xv_hu_bu(this->value);
+      const __m256i units_hi = __lasx_vext2xv_hu_bu(halves_swapped);
+      __lasx_xvst(units_lo, reinterpret_cast<__m256i *>(ptr), 0);
+      __lasx_xvst(units_hi, reinterpret_cast<__m256i *>(ptr), 32);
+      return;
+    }
+    // Big-endian: interleave every input byte with a zero byte.
+    const __m256i zeros = __lasx_xvldi(0);
+    const __m256i permuted = __lasx_xvpermi_d(this->value, 0b11011000);
+    const __m256i units_lo = __lasx_xvilvl_b(permuted, zeros);
+    const __m256i units_hi = __lasx_xvilvh_b(permuted, zeros);
+    __lasx_xvst(units_lo, reinterpret_cast<uint16_t *>(ptr), 0);
+    __lasx_xvst(units_hi, reinterpret_cast<uint16_t *>(ptr), 32);
+  }
+
+  // Widen the 32 bytes of this vector to 32 UTF-32 code units and store them
+  // at ptr[0..31] as four 32-byte stores (byte offsets 0, 32, 64 and 96).
+  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+    uint32_t *const dst = reinterpret_cast<uint32_t *>(ptr);
+
+    // Each group of 8 input bytes is first moved into the lowest 64-bit
+    // element (groups 1..3 via xvpermi_d) and then widened to eight words.
+    const __m256i words0 = __lasx_vext2xv_wu_bu(this->value);
+    const __m256i words1 =
+        __lasx_vext2xv_wu_bu(__lasx_xvpermi_d(this->value, 0b00000001));
+    const __m256i words2 =
+        __lasx_vext2xv_wu_bu(__lasx_xvpermi_d(this->value, 0b00000010));
+    const __m256i words3 =
+        __lasx_vext2xv_wu_bu(__lasx_xvpermi_d(this->value, 0b00000011));
+
+    __lasx_xvst(words0, dst, 0);
+    __lasx_xvst(words1, dst, 32);
+    __lasx_xvst(words2, dst, 64);
+    __lasx_xvst(words3, dst, 96);
+  }
+
+  // Bitwise operators; each returns the derived type `Child`.
+  simdutf_really_inline Child operator|(const Child other) const {
+    return __lasx_xvor_v(this->value, other);
+  }
+  simdutf_really_inline Child operator&(const Child other) const {
+    return __lasx_xvand_v(this->value, other);
+  }
+  simdutf_really_inline Child operator^(const Child other) const {
+    return __lasx_xvxor_v(this->value, other);
+  }
+  simdutf_really_inline Child bit_andnot(const Child other) const {
+    return __lasx_xvandn_v(this->value, other);
+  }
+
+  // Compound assignments: apply the matching binary operator to the derived
+  // object and hand back a reference to it.
+  simdutf_really_inline Child &operator|=(const Child other) {
+    Child &self = *static_cast<Child *>(this);
+    self = self | other;
+    return self;
+  }
+  simdutf_really_inline Child &operator&=(const Child other) {
+    Child &self = *static_cast<Child *>(this);
+    self = self & other;
+    return self;
+  }
+  simdutf_really_inline Child &operator^=(const Child other) {
+    Child &self = *static_cast<Child *>(this);
+    self = self ^ other;
+    return self;
+  }
+};
+
+template <typename T> struct simd8;
+
+// Byte-lane layer on top of base<>: the register is treated as 32 lanes of
+// type T, and lane-wise comparisons produce `Mask`.
+template <typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
+  typedef uint32_t bitmask_t;
+  typedef uint64_t bitmask2_t;
+
+  simdutf_really_inline base8() : base<simd8<T>>() {}
+  simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
+
+  // Lane 0. The intrinsic returns the whole 32-bit word 0 (lanes 0..3), so
+  // the low byte is isolated before converting to T; otherwise a `bool`
+  // conversion would also look at lanes 1..3.
+  simdutf_really_inline T first() const {
+    return T(__lasx_xvpickve2gr_wu(this->value, 0) & 0xFFu);
+  }
+  // Lane 31. The intrinsic returns the whole 32-bit word 7 (lanes 28..31);
+  // lane 31 is its most significant byte, so shift it down first. Converting
+  // the unshifted word to T would return lane 28 instead.
+  simdutf_really_inline T last() const {
+    return T(__lasx_xvpickve2gr_wu(this->value, 7) >> 24);
+  }
+  // Lane-wise equality.
+  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+                                               const simd8<T> rhs) {
+    return __lasx_xvseq_b(lhs, rhs);
+  }
+
+  static const int SIZE = sizeof(base<T>::value);
+
+  // Combine this vector with `prev_chunk` using a byte offset of N.
+  // NOTE(review): by its name and the table layout this is meant to yield
+  // the last N bytes of `prev_chunk` followed by the first 32 - N bytes of
+  // *this -- confirm against the callers.
+  //
+  // Only 0 <= N <= 16 is implemented. Larger values previously fell off the
+  // end of the function without returning a value (undefined behaviour);
+  // they are now rejected at compile time.
+  template <int N = 1>
+  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+    static_assert(N >= 0 && N <= 16,
+                  "base8::prev<N> is only implemented for 0 <= N <= 16");
+    if (N == 0)
+      return this->value;
+    if (N == 16)
+      return __lasx_xvpermi_q(this->value, prev_chunk.value, 0b00100001);
+
+    // 0 < N < 16: shuffle the bytes of *this according to prev_shuf_table[N],
+    // then replace the bytes selected by bitsel_mask_table[N] (its first N
+    // bytes) with bytes derived from prev_chunk.
+    const __m256i zero = __lasx_xvldi(0);
+    const __m256i shuf = __lasx_xvld(prev_shuf_table[N], 0);
+    const __m256i shuffled = __lasx_xvshuf_b(
+        __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value,
+        shuf);
+    const __m256i srl_prev = __lasx_xvbsrl_v(
+        __lasx_xvpermi_q(zero, prev_chunk.value, 0b00110001), (16 - N));
+    const __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0);
+    return __lasx_xvbitsel_v(shuffled, srl_prev, mask);
+  }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+template <> struct simd8<bool> : base8<bool> {
+  // Broadcast: every byte becomes 0xFF when `_value` is true, 0x00 otherwise.
+  static simdutf_really_inline simd8<bool> splat(bool _value) {
+    return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value)));
+  }
+
+  simdutf_really_inline simd8() : base8() {}
+  simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
+
+  // Collapse the mask to one bit per byte lane: the two 16-bit halves read
+  // from 32-bit words 0 and 4 of the xvmsknz_b result are combined into the
+  // low and high halves of the returned value.
+  simdutf_really_inline uint32_t to_bitmask() const {
+    __m256i mask = __lasx_xvmsknz_b(this->value);
+    uint32_t mask0 = __lasx_xvpickve2gr_wu(mask, 0);
+    uint32_t mask1 = __lasx_xvpickve2gr_wu(mask, 4);
+    return (mask0 | (mask1 << 16));
+  }
+  // True when at least one lane is set, i.e. the register as a whole is
+  // non-zero. (The byte-wise test __lasx_xbz_b reports "some byte is zero",
+  // which would make any() behave like all().)
+  simdutf_really_inline bool any() const {
+    return __lasx_xbnz_v(this->value) != 0;
+  }
+  // True when no lane is set, i.e. the register as a whole is zero.
+  simdutf_really_inline bool none() const {
+    return __lasx_xbz_v(this->value) != 0;
+  }
+  // True when every byte lane is non-zero.
+  simdutf_really_inline bool all() const {
+    return __lasx_xbnz_b(this->value) != 0;
+  }
+  // Lane-wise logical negation: XOR with an all-true mask.
+  simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
+};
+
+// Operations shared by the signed and unsigned 8-bit lane vectors.
+template <typename T> struct base8_numeric : base8<T> {
+  // Broadcast one value to every lane.
+  static simdutf_really_inline simd8<T> splat(T _value) {
+    return __lasx_xvreplgr2vr_b(_value);
+  }
+  // Vector loaded with the immediate 0.
+  static simdutf_really_inline simd8<T> zero() { return __lasx_xvldi(0); }
+  // Load 32 lanes from memory.
+  static simdutf_really_inline simd8<T> load(const T values[32]) {
+    return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
+  }
+  // Build a vector whose lanes 0..15 and lanes 16..31 both hold the given 16
+  // values (usually for lookup tables).
+  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+                                                  T v5, T v6, T v7, T v8, T v9,
+                                                  T v10, T v11, T v12, T v13,
+                                                  T v14, T v15) {
+    return simd8<T>(
+        // lanes 0..15
+        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+        // lanes 16..31
+        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+  }
+
+  simdutf_really_inline base8_numeric() : base8<T>() {}
+  simdutf_really_inline base8_numeric(const __m256i _value)
+      : base8<T>(_value) {}
+
+  // Write all 32 lanes to memory.
+  simdutf_really_inline void store(T dst[32]) const {
+    __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0);
+  }
+
+  // Lane-wise add/subtract; the same instruction serves signed and unsigned.
+  simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
+    return __lasx_xvadd_b(this->value, other);
+  }
+  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
+    return __lasx_xvsub_b(this->value, other);
+  }
+  simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
+    simd8<T> &self = *static_cast<simd8<T> *>(this);
+    self = self + other;
+    return self;
+  }
+  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
+    simd8<T> &self = *static_cast<simd8<T> *>(this);
+    self = self - other;
+    return self;
+  }
+
+  // Bitwise complement of every lane (shadows the bool-mask version).
+  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+
+  // Table lookup: each lane of *this, masked with 0x1f, is used as the
+  // selector of a byte shuffle over `lookup_table` and an all-zero vector.
+  // Callers are expected to pass indices in the range of a 16-entry table;
+  // behaviour for other values is not specified here.
+  template <typename L>
+  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+    const __m256i selector = __lasx_xvand_v(this->value, __lasx_xvldi(0x1f));
+    const __m256i zeros = __lasx_xvldi(0);
+    return __lasx_xvshuf_b(zeros, lookup_table, selector);
+  }
+
+  // Convenience overload: build the table from 16 scalars, then look up.
+  template <typename L>
+  simdutf_really_inline simd8<L>
+  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+            L replace5, L replace6, L replace7, L replace8, L replace9,
+            L replace10, L replace11, L replace12, L replace13, L replace14,
+            L replace15) const {
+    const simd8<L> table = simd8<L>::repeat_16(
+        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+        replace7, replace8, replace9, replace10, replace11, replace12,
+        replace13, replace14, replace15);
+    return lookup_16(table);
+  }
+};
+
+// Vector of 32 signed 8-bit lanes.
+template <> struct simd8<int8_t> : base8_numeric<int8_t> {
+  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+  simdutf_really_inline simd8(const __m256i _value)
+      : base8_numeric<int8_t>(_value) {}
+
+  // Broadcast a single value to every lane.
+  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+  // Load 32 lanes from an array.
+  simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
+  // Reinterpret as unsigned lanes (defined after simd8<uint8_t>).
+  simdutf_really_inline operator simd8<uint8_t>() const;
+  // One argument per lane, lane 0 first.
+  simdutf_really_inline
+  simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+        int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+        int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16, int8_t v17,
+        int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+        int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29,
+        int8_t v30, int8_t v31)
+      : simd8((__m256i)v32i8{v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,
+                             v8,  v9,  v10, v11, v12, v13, v14, v15,
+                             v16, v17, v18, v19, v20, v21, v22, v23,
+                             v24, v25, v26, v27, v28, v29, v30, v31}) {}
+  // Lanes 0..15 and lanes 16..31 both receive the given 16 values (usually
+  // for lookup tables).
+  simdutf_really_inline static simd8<int8_t>
+  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+            int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+            int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+    return simd8<int8_t>(
+        // lanes 0..15
+        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+        // lanes 16..31
+        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+  }
+  // True when no lane is negative, i.e. no byte has its top bit set.
+  simdutf_really_inline bool is_ascii() const {
+    const __m256i negative_lanes = __lasx_xvslti_b(this->value, 0);
+    return !__lasx_xbnz_v(negative_lanes);
+  }
+  // Lane-wise signed maximum / minimum.
+  simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+    return __lasx_xvmax_b(this->value, other);
+  }
+  simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+    return __lasx_xvmin_b(this->value, other);
+  }
+  // Lane-wise signed comparisons. `a > b` is evaluated as `b < a`.
+  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+    return __lasx_xvslt_b(other, this->value);
+  }
+  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+    return __lasx_xvslt_b(this->value, other);
+  }
+};
+
+// Unsigned bytes
+template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+ simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+ simdutf_really_inline simd8(const __m256i _value)
+ : base8_numeric<uint8_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
+ // Member-by-member initialization
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+ uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
+ uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
+ uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
+ uint8_t v31)
+ : simd8((__m256i)v32u8{v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23,
+ v24, v25, v26, v27, v28, v29, v30, v31}) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<uint8_t>
+ repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+ uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+ uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+ uint8_t v15) {
+ return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15);
+ }
+
+ // Saturated math
+ simdutf_really_inline simd8<uint8_t>
+ saturating_add(const simd8<uint8_t> other) const {
+ return __lasx_xvsadd_bu(this->value, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ saturating_sub(const simd8<uint8_t> other) const {
+ return __lasx_xvssub_bu(this->value, other);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd8<uint8_t>
+ max_val(const simd8<uint8_t> other) const {
+ return __lasx_xvmax_bu(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ min_val(const simd8<uint8_t> other) const {
+ return __lasx_xvmin_bu(*this, other);
+ }
+ // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ gt_bits(const simd8<uint8_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ lt_bits(const simd8<uint8_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint8_t> other) const {
+ return __lasx_xvsle_bu(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint8_t> other) const {
+ return __lasx_xvsle_bu(other, *this);
+ }
+ simdutf_really_inline simd8<bool>
+ operator>(const simd8<uint8_t> other) const {
+ return __lasx_xvslt_bu(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<(const simd8<uint8_t> other) const {
+ return __lasx_xvslt_bu(other, *this);
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd8<bool> bits_not_set() const {
+ return *this == uint8_t(0);
+ }
+ // Mask of the bytes in which none of the given bits are set.
+ simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
+ const simd8<uint8_t> selected = *this & bits;
+ return selected.bits_not_set();
+ }
+ // Mask of the bytes that are nonzero (complement of bits_not_set()).
+ simdutf_really_inline simd8<bool> any_bits_set() const {
+ const simd8<bool> is_zero = this->bits_not_set();
+ return ~is_zero;
+ }
+ // Mask of the bytes in which at least one of the given bits is set.
+ simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+ const simd8<bool> none_set = this->bits_not_set(bits);
+ return ~none_set;
+ }
+ // True when no byte has its top bit set, i.e. no byte is negative when
+ // read as a signed value.
+ simdutf_really_inline bool is_ascii() const {
+ const __m256i negative_bytes = __lasx_xvslti_b(this->value, 0);
+ return !__lasx_xbnz_v(negative_bytes);
+ }
+ // True when the register is not entirely zero.
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return __lasx_xbnz_v(this->value) ? true : false;
+ }
+ // True when at least one of the given bits is set somewhere in the
+ // register.
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+ const simd8<uint8_t> selected = *this & bits;
+ return selected.any_bits_set_anywhere();
+ }
+ // Logical right shift of every byte by the compile-time amount N.
+ template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+ return simd8<uint8_t>(__lasx_xvsrli_b(this->value, N));
+ }
+ // Left shift of every byte by the compile-time amount N.
+ template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+ return simd8<uint8_t>(__lasx_xvslli_b(this->value, N));
+ }
+};
+// Views the signed byte vector as an unsigned one; the register contents are
+// passed through unchanged.
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
+ return simd8<uint8_t>(this->value);
+}
+
+// A 64-byte block kept in two LASX registers: chunks[0] holds the first
+// sizeof(simd8<T>) bytes and chunks[1] the following ones. Copying and default
+// construction are disabled.
+template <typename T> struct simd8x64 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+ static_assert(NUM_CHUNKS == 2,
+ "LASX kernel should use two registers per 64-byte block.");
+ simd8<T> chunks[NUM_CHUNKS];
+
+ simd8x64() = delete; // no default constructor allowed
+ simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+ simd8x64<T> &
+ operator=(const simd8<T> other) = delete; // no assignment allowed
+
+ // Build from two registers that are already loaded.
+ simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
+ : chunks{chunk0, chunk1} {}
+ // Load both registers from consecutive memory starting at ptr.
+ simdutf_really_inline simd8x64(const T *ptr)
+ : chunks{simd8<T>::load(ptr),
+ simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}
+
+ // Write both registers back to consecutive memory starting at ptr.
+ simdutf_really_inline void store(T *ptr) const {
+ constexpr auto elems_per_chunk = sizeof(simd8<T>) / sizeof(T);
+ this->chunks[0].store(ptr);
+ this->chunks[1].store(ptr + elems_per_chunk);
+ }
+
+ // 64-bit mask: chunk 0 fills bits 0..31 and chunk 1 fills bits 32..63.
+ simdutf_really_inline uint64_t to_bitmask() const {
+ const uint64_t low_half = uint32_t(this->chunks[0].to_bitmask());
+ const uint64_t high_half = this->chunks[1].to_bitmask();
+ return (high_half << 32) | low_half;
+ }
+
+ // In-place bitwise OR with another block, chunk by chunk.
+ simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
+ for (int i = 0; i < NUM_CHUNKS; i++) {
+ this->chunks[i] |= other.chunks[i];
+ }
+ return *this;
+ }
+
+ // OR of the two registers, folded into a single one.
+ simdutf_really_inline simd8<T> reduce_or() const {
+ const simd8<T> folded = this->chunks[0] | this->chunks[1];
+ return folded;
+ }
+
+ // ASCII test performed once on the OR of both registers.
+ simdutf_really_inline bool is_ascii() const {
+ const simd8<T> folded = this->reduce_or();
+ return folded.is_ascii();
+ }
+
+ // Widen each chunk to UTF-16; chunk 1 is written sizeof(simd8<T>) code
+ // units after chunk 0.
+ template <endianness endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ char16_t *const second_half = ptr + sizeof(simd8<T>);
+ this->chunks[0].template store_ascii_as_utf16<endian>(ptr);
+ this->chunks[1].template store_ascii_as_utf16<endian>(second_half);
+ }
+
+ // Widen each chunk to UTF-32; chunk 1 is written sizeof(simd8<T>) code
+ // units after chunk 0.
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+ char32_t *const second_half = ptr + sizeof(simd8<T>);
+ this->chunks[0].store_ascii_as_utf32(ptr);
+ this->chunks[1].store_ascii_as_utf32(second_half);
+ }
+
+ // New block with m OR-ed into every element.
+ simdutf_really_inline simd8x64<T> bit_or(const T m) const {
+ const simd8<T> broadcast = simd8<T>::splat(m);
+ return simd8x64<T>(this->chunks[0] | broadcast,
+ this->chunks[1] | broadcast);
+ }
+
+ // Bitmask of the elements equal to m.
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd8<T> broadcast = simd8<T>::splat(m);
+ const simd8x64<bool> hits(this->chunks[0] == broadcast,
+ this->chunks[1] == broadcast);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of the positions where both blocks hold the same element.
+ simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+ const simd8x64<bool> hits(this->chunks[0] == other.chunks[0],
+ this->chunks[1] == other.chunks[1]);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of the elements <= m.
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd8<T> broadcast = simd8<T>::splat(m);
+ const simd8x64<bool> hits(this->chunks[0] <= broadcast,
+ this->chunks[1] <= broadcast);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of the elements inside the closed range [low, high].
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd8<T> lower = simd8<T>::splat(low);
+ const simd8<T> upper = simd8<T>::splat(high);
+ const simd8x64<bool> hits(
+ (this->chunks[0] <= upper) & (this->chunks[0] >= lower),
+ (this->chunks[1] <= upper) & (this->chunks[1] >= lower));
+ return hits.to_bitmask();
+ }
+ // Bitmask of the elements outside the closed range [low, high].
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd8<T> lower = simd8<T>::splat(low);
+ const simd8<T> upper = simd8<T>::splat(high);
+ const simd8x64<bool> hits(
+ (this->chunks[0] > upper) | (this->chunks[0] < lower),
+ (this->chunks[1] > upper) | (this->chunks[1] < lower));
+ return hits.to_bitmask();
+ }
+ // Bitmask of the elements < m.
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd8<T> broadcast = simd8<T>::splat(m);
+ const simd8x64<bool> hits(this->chunks[0] < broadcast,
+ this->chunks[1] < broadcast);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of the elements > m.
+ simdutf_really_inline uint64_t gt(const T m) const {
+ const simd8<T> broadcast = simd8<T>::splat(m);
+ const simd8x64<bool> hits(this->chunks[0] > broadcast,
+ this->chunks[1] > broadcast);
+ return hits.to_bitmask();
+ }
+ // Bitmask of the elements >= m.
+ simdutf_really_inline uint64_t gteq(const T m) const {
+ const simd8<T> broadcast = simd8<T>::splat(m);
+ const simd8x64<bool> hits(this->chunks[0] >= broadcast,
+ this->chunks[1] >= broadcast);
+ return hits.to_bitmask();
+ }
+ // Bitmask of the elements >= m after reinterpreting each chunk as
+ // unsigned bytes, whatever T is.
+ simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+ const simd8<uint8_t> broadcast = simd8<uint8_t>::splat(m);
+ const simd8<uint8_t> first(__m256i(this->chunks[0]));
+ const simd8<uint8_t> second(__m256i(this->chunks[1]));
+ const simd8x64<bool> hits((first >= broadcast), (second >= broadcast));
+ return hits.to_bitmask();
+ }
+}; // struct simd8x64<T>
+
+#include "simdutf/lasx/simd16-inl.h"
+} // namespace simd
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_LASX_SIMD_H
diff --git a/contrib/simdutf/src/simdutf/lasx/simd16-inl.h b/contrib/simdutf/src/simdutf/lasx/simd16-inl.h
new file mode 100644
index 000000000..4b0a4f4fa
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lasx/simd16-inl.h
@@ -0,0 +1,348 @@
+template <typename T> struct simd16;
+
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> { // common base of the 16-bit-lane vector types (simd16<bool>, base16_numeric<T>)
+ using bitmask_type = uint32_t;
+
+ simdutf_really_inline base16() : base<simd16<T>>() {}
+ simdutf_really_inline base16(const __m256i _value)
+ : base<simd16<T>>(_value) {}
+ template <typename Pointer>
+ simdutf_really_inline base16(const Pointer *ptr) // loads one __m256i from ptr (offset 0) regardless of the pointee type
+ : base16(__lasx_xvld(reinterpret_cast<const __m256i *>(ptr), 0)) {}
+ friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
+ const simd16<T> rhs) {
+ return __lasx_xvseq_h(lhs.value, rhs.value); // 16-bit lane equality
+ }
+
+ /// the size of vector in bytes
+ static const int SIZE = sizeof(base<simd16<T>>::value);
+
+ /// the number of elements of type T a vector can hold
+ static const int ELEMENTS = SIZE / sizeof(T);
+
+ template <int N = 1> // presumably N = how many 16-bit elements are taken from prev_chunk; TODO confirm against callers
+ simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+ if (!N) // N == 0: nothing is taken from prev_chunk
+ return this->value;
+
+ __m256i zero = __lasx_xvldi(0);
+ __m256i result, shuf;
+ if (N < 8) { // fewer than 16 bytes (N * 2) are involved
+ shuf = __lasx_xvld(prev_shuf_table[N * 2], 0);
+
+ result = __lasx_xvshuf_b(
+ __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value,
+ shuf);
+ __m256i srl_prev = __lasx_xvbsrl_v(
+ __lasx_xvpermi_q(zero, prev_chunk, 0b00110001), (16 - N * 2));
+ __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0); // NOTE(review): indexed by N here but by N * 2 in the last branch and for prev_shuf_table; confirm against the table layout
+ result = __lasx_xvbitsel_v(result, srl_prev, mask);
+
+ return result;
+ } else if (N == 8) { // exactly 16 bytes: handled with a single __lasx_xvpermi_q
+ return __lasx_xvpermi_q(this->value, prev_chunk, 0b00100001);
+ } else { // more than 16 bytes (N * 2) are involved
+ __m256i sll_value = __lasx_xvbsll_v(
+ __lasx_xvpermi_q(zero, this->value, 0b00000011), (N * 2 - 16));
+ __m256i mask = __lasx_xvld(bitsel_mask_table[N * 2], 0);
+ shuf = __lasx_xvld(prev_shuf_table[N * 2], 0);
+ result =
+ __lasx_xvshuf_b(__lasx_xvpermi_q(prev_chunk, prev_chunk, 0b00000001),
+ prev_chunk, shuf);
+ result = __lasx_xvbitsel_v(sll_value, result, mask);
+ return result;
+ }
+ }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+// Mask vector with 16-bit lanes, as produced by the simd16 comparison
+// operators.
+template <> struct simd16<bool> : base16<bool> {
+ // Broadcast a boolean to every 16-bit lane: all ones for true, zero for
+ // false. The lane value is built as uint16_t so that a true lane is 0xFFFF;
+ // the previous uint8_t cast produced 0x00FF, which also made operator~
+ // (*this ^ true) flip only the low byte of each lane.
+ static simdutf_really_inline simd16<bool> splat(bool _value) {
+ return __lasx_xvreplgr2vr_h(uint16_t(-(!!_value)));
+ }
+
+ simdutf_really_inline simd16() : base16() {}
+ simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+
+ // 32-bit mask built from __lasx_xvmsknz_b: word 0 of its result supplies
+ // bits 0..15 and word 4 supplies bits 16..31. The mask is byte-granular,
+ // so every 16-bit lane contributes two bits.
+ simdutf_really_inline bitmask_type to_bitmask() const {
+ __m256i mask = __lasx_xvmsknz_b(this->value);
+ bitmask_type mask0 = __lasx_xvpickve2gr_wu(mask, 0);
+ bitmask_type mask1 = __lasx_xvpickve2gr_wu(mask, 4);
+ return (mask0 | (mask1 << 16));
+ }
+ // True when at least one bit of the register is set.
+ simdutf_really_inline bool any() const {
+ if (__lasx_xbz_v(this->value))
+ return false;
+ return true;
+ }
+ // Lane-wise logical negation (XOR with an all-true mask).
+ simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+};
+
+// Splat/load/store and wrap-around arithmetic shared by the numeric 16-bit
+// vector types.
+template <typename T> struct base16_numeric : base16<T> {
+ // Vector with every lane set to _value.
+ static simdutf_really_inline simd16<T> splat(T _value) {
+ const uint16_t lane = (uint16_t)_value;
+ return __lasx_xvreplgr2vr_h(lane);
+ }
+ // All-zero vector.
+ static simdutf_really_inline simd16<T> zero() { return __lasx_xvldi(0); }
+ // Load one full register starting at values.
+ static simdutf_really_inline simd16<T> load(const T values[8]) {
+ const __m256i *const src = reinterpret_cast<const __m256i *>(values);
+ return __lasx_xvld(src, 0);
+ }
+
+ simdutf_really_inline base16_numeric() : base16<T>() {}
+ simdutf_really_inline base16_numeric(const __m256i _value)
+ : base16<T>(_value) {}
+
+ // Write the full register to dst.
+ simdutf_really_inline void store(T dst[8]) const {
+ __m256i *const target = reinterpret_cast<__m256i *>(dst);
+ __lasx_xvst(this->value, target, 0);
+ }
+
+ // Bitwise complement; declared here to distinguish it from the bool
+ // version.
+ simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+ // Addition and subtraction use the same instruction for signed and
+ // unsigned lanes.
+ simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+ const __m256i sum = __lasx_xvadd_h(*this, other);
+ return sum;
+ }
+ simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+ const __m256i difference = __lasx_xvsub_h(*this, other);
+ return difference;
+ }
+ simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+ *this = *this + other;
+ return *static_cast<simd16<T> *>(this);
+ }
+ simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+ *this = *this - other;
+ return *static_cast<simd16<T> *>(this);
+ }
+};
+
+// Signed code units
+// Vector of signed 16-bit code units.
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+ simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+ simdutf_really_inline simd16(const __m256i _value)
+ : base16_numeric<int16_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+ // Order-sensitive comparisons
+ simdutf_really_inline simd16<int16_t>
+ max_val(const simd16<int16_t> other) const {
+ return __lasx_xvmax_h(*this, other);
+ }
+ simdutf_really_inline simd16<int16_t>
+ min_val(const simd16<int16_t> other) const {
+ return __lasx_xvmin_h(*this, other);
+ }
+ // Signed "this > other", written as "other < this". The strict
+ // __lasx_xvslt_h is required here: the previous __lasx_xvsle_h computed
+ // "other <= this", so equal lanes were wrongly reported as greater.
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<int16_t> other) const {
+ return __lasx_xvslt_h(other.value, this->value);
+ }
+ // Signed "this < other".
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<int16_t> other) const {
+ return __lasx_xvslt_h(this->value, other.value);
+ }
+};
+
+// Unsigned code units
+// Vector of unsigned 16-bit code units.
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+ simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+ simdutf_really_inline simd16(const __m256i _value)
+ : base16_numeric<uint16_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+ // Array constructors
+ simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
+
+ // Saturated math
+ simdutf_really_inline simd16<uint16_t>
+ saturating_add(const simd16<uint16_t> other) const {
+ const __m256i sum = __lasx_xvsadd_hu(this->value, other.value);
+ return sum;
+ }
+ simdutf_really_inline simd16<uint16_t>
+ saturating_sub(const simd16<uint16_t> other) const {
+ const __m256i difference = __lasx_xvssub_hu(this->value, other.value);
+ return difference;
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd16<uint16_t>
+ max_val(const simd16<uint16_t> other) const {
+ const __m256i larger = __lasx_xvmax_hu(this->value, other.value);
+ return larger;
+ }
+ simdutf_really_inline simd16<uint16_t>
+ min_val(const simd16<uint16_t> other) const {
+ const __m256i smaller = __lasx_xvmin_hu(this->value, other.value);
+ return smaller;
+ }
+ // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ gt_bits(const simd16<uint16_t> other) const {
+ const simd16<uint16_t> excess = this->saturating_sub(other);
+ return excess;
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ lt_bits(const simd16<uint16_t> other) const {
+ const simd16<uint16_t> shortfall = other.saturating_sub(*this);
+ return shortfall;
+ }
+ // Unsigned lane comparisons; >= and > are expressed by swapping the
+ // operands of the <= and < intrinsics.
+ simdutf_really_inline simd16<bool>
+ operator<=(const simd16<uint16_t> other) const {
+ const __m256i le_mask = __lasx_xvsle_hu(this->value, other.value);
+ return le_mask;
+ }
+ simdutf_really_inline simd16<bool>
+ operator>=(const simd16<uint16_t> other) const {
+ const __m256i ge_mask = __lasx_xvsle_hu(other.value, this->value);
+ return ge_mask;
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<uint16_t> other) const {
+ const __m256i gt_mask = __lasx_xvslt_hu(other.value, this->value);
+ return gt_mask;
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<uint16_t> other) const {
+ const __m256i lt_mask = __lasx_xvslt_hu(this->value, other.value);
+ return lt_mask;
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd16<bool> bits_not_set() const {
+ const simd16<uint16_t> all_zero = uint16_t(0);
+ return *this == all_zero;
+ }
+ simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const {
+ const simd16<uint16_t> selected = *this & bits;
+ return selected.bits_not_set();
+ }
+ simdutf_really_inline simd16<bool> any_bits_set() const {
+ const simd16<bool> is_zero = this->bits_not_set();
+ return ~is_zero;
+ }
+ simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const {
+ const simd16<bool> none_set = this->bits_not_set(bits);
+ return ~none_set;
+ }
+
+ // True when the register is not entirely zero.
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return __lasx_xbnz_v(this->value) ? true : false;
+ }
+ simdutf_really_inline bool
+ any_bits_set_anywhere(simd16<uint16_t> bits) const {
+ const simd16<uint16_t> selected = *this & bits;
+ return selected.any_bits_set_anywhere();
+ }
+
+ // Per-lane shifts by the compile-time amount N.
+ template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+ const __m256i shifted = __lasx_xvsrli_h(this->value, N);
+ return simd16<uint16_t>(shifted);
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+ const __m256i shifted = __lasx_xvslli_h(this->value, N);
+ return simd16<uint16_t>(shifted);
+ }
+
+ // Change the endianness
+ simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+ const __m256i swapped = __lasx_xvshuf4i_b(this->value, 0b10110001);
+ return swapped;
+ }
+
+ // Pack with the unsigned saturation of two uint16_t code units into single
+ // uint8_t vector
+ static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+ const simd16<uint16_t> &v1) {
+ const __m256i narrowed = __lasx_xvssrlni_bu_h(v1.value, v0.value, 0);
+ return __lasx_xvpermi_d(narrowed, 0b11011000);
+ }
+};
+
+// A 64-byte block of 16-bit elements kept in two LASX registers. Copying and
+// default construction are disabled.
+template <typename T> struct simd16x32 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+ static_assert(NUM_CHUNKS == 2,
+ "LASX kernel should use two registers per 64-byte block.");
+ simd16<T> chunks[NUM_CHUNKS];
+
+ simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
+ simd16x32<T> &
+ operator=(const simd16<T> other) = delete; // no assignment allowed
+ simd16x32() = delete; // no default constructor allowed
+
+ // Build from two registers that are already loaded.
+ simdutf_really_inline simd16x32(const simd16<T> chunk0,
+ const simd16<T> chunk1)
+ : chunks{chunk0, chunk1} {}
+ // Load both registers from consecutive memory starting at ptr.
+ simdutf_really_inline simd16x32(const T *ptr)
+ : chunks{simd16<T>::load(ptr),
+ simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}
+
+ // Write both registers back to consecutive memory starting at ptr.
+ simdutf_really_inline void store(T *ptr) const {
+ this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+ }
+
+ // 64-bit mask: chunk 0 fills bits 0..31 and chunk 1 fills bits 32..63.
+ simdutf_really_inline uint64_t to_bitmask() const {
+ uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+ uint64_t r_hi = this->chunks[1].to_bitmask();
+ return r_lo | (r_hi << 32);
+ }
+
+ // OR of the two registers, folded into a single one.
+ simdutf_really_inline simd16<T> reduce_or() const {
+ return this->chunks[0] | this->chunks[1];
+ }
+
+ // NOTE(review): relies on simd16<T>::is_ascii(), which is not declared in
+ // the simd16 types visible in this header; confirm it can be instantiated.
+ simdutf_really_inline bool is_ascii() const {
+ return this->reduce_or().is_ascii();
+ }
+
+ // NOTE(review): relies on simd16<T>::store_ascii_as_utf16(), which is not
+ // declared in the simd16 types visible in this header, and advances ptr by
+ // sizeof(simd16<T>) code units for the second chunk; confirm before use.
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+ this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>));
+ }
+
+ // New block with m OR-ed into every element.
+ simdutf_really_inline simd16x32<T> bit_or(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<T>(this->chunks[0] | mask, this->chunks[1] | mask);
+ }
+
+ // Swap the two bytes of every element, in place.
+ simdutf_really_inline void swap_bytes() {
+ this->chunks[0] = this->chunks[0].swap_bytes();
+ this->chunks[1] = this->chunks[1].swap_bytes();
+ }
+
+ // Bitmask of the elements equal to m.
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
+ .to_bitmask();
+ }
+
+ // Bitmask of the positions where both blocks hold the same element.
+ simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
+ return simd16x32<bool>(this->chunks[0] == other.chunks[0],
+ this->chunks[1] == other.chunks[1])
+ .to_bitmask();
+ }
+
+ // Bitmask of the elements <= m.
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
+ .to_bitmask();
+ }
+
+ // Bitmask of the elements inside the closed range [low, high].
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+
+ return simd16x32<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+ .to_bitmask();
+ }
+ // Bitmask of the elements outside the closed range [low, high].
+ // Strict comparisons against the bounds themselves are used instead of
+ // ">= high + 1" / "<= low - 1": those adjusted bounds wrap around when low
+ // is the smallest or high the largest value of T, which marked every
+ // element as out of range.
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+ return simd16x32<bool>(
+ (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+ (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low))
+ .to_bitmask();
+ }
+ // Bitmask of the elements < m.
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
+ .to_bitmask();
+ }
+}; // struct simd16x32<T>
diff --git a/contrib/simdutf/src/simdutf/lsx.h b/contrib/simdutf/src/simdutf/lsx.h
new file mode 100644
index 000000000..21ee0ac4d
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx.h
@@ -0,0 +1,44 @@
+#ifndef SIMDUTF_LSX_H
+#define SIMDUTF_LSX_H
+
+#ifdef SIMDUTF_FALLBACK_H
+ #error "lsx.h must be included before fallback.h"
+#endif
+
+#include "simdutf/portability.h"
+
+#ifndef SIMDUTF_IMPLEMENTATION_LSX // a value defined before this header is left untouched
+ #define SIMDUTF_IMPLEMENTATION_LSX (SIMDUTF_IS_LSX) // default: follow SIMDUTF_IS_LSX
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX && SIMDUTF_IS_LSX // both the kernel switch and SIMDUTF_IS_LSX must be non-zero
+ #define SIMDUTF_CAN_ALWAYS_RUN_LSX 1
+#else
+ #define SIMDUTF_CAN_ALWAYS_RUN_LSX 0
+#endif
+
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
+#include "simdutf/internal/isadetection.h"
+
+#if SIMDUTF_IMPLEMENTATION_LSX
+
+namespace simdutf {
+/**
+ * Implementation for LoongArch SX.
+ */
+namespace lsx {} // namespace lsx
+} // namespace simdutf
+
+ #include "simdutf/lsx/implementation.h"
+
+ #include "simdutf/lsx/begin.h"
+
+ // Declarations
+ #include "simdutf/lsx/intrinsics.h"
+ #include "simdutf/lsx/bitmanipulation.h"
+ #include "simdutf/lsx/simd.h"
+
+ #include "simdutf/lsx/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_LSX
+
+#endif // SIMDUTF_LSX_H
diff --git a/contrib/simdutf/src/simdutf/lsx/begin.h b/contrib/simdutf/src/simdutf/lsx/begin.h
new file mode 100644
index 000000000..b2db1b3db
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/begin.h
@@ -0,0 +1 @@
+#define SIMDUTF_IMPLEMENTATION lsx
diff --git a/contrib/simdutf/src/simdutf/lsx/bitmanipulation.h b/contrib/simdutf/src/simdutf/lsx/bitmanipulation.h
new file mode 100644
index 000000000..5df0bffc9
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/bitmanipulation.h
@@ -0,0 +1,25 @@
+#ifndef SIMDUTF_LSX_BITMANIPULATION_H
+#define SIMDUTF_LSX_BITMANIPULATION_H
+
+#include "simdutf.h"
+#include <limits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+// Number of set bits in input_num, computed with the LSX per-lane
+// population count and read back from element 0.
+simdutf_really_inline int count_ones(uint64_t input_num) {
+ const __m128i replicated = __lsx_vreplgr2vr_d(input_num);
+ const __m128i lane_counts = __lsx_vpcnt_d(replicated);
+ return __lsx_vpickve2gr_w(lane_counts, 0);
+}
+
+#if SIMDUTF_NEED_TRAILING_ZEROES
+// Count of trailing zero bits of input_num, as reported by
+// __builtin_ctzll.
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+ const int zero_count = __builtin_ctzll(input_num);
+ return zero_count;
+}
+#endif
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_LSX_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/lsx/end.h b/contrib/simdutf/src/simdutf/lsx/end.h
new file mode 100644
index 000000000..58fd810d4
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/end.h
@@ -0,0 +1 @@
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/lsx/implementation.h b/contrib/simdutf/src/simdutf/lsx/implementation.h
new file mode 100644
index 000000000..a0521039f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/implementation.h
@@ -0,0 +1,229 @@
+#ifndef SIMDUTF_LSX_IMPLEMENTATION_H
+#define SIMDUTF_LSX_IMPLEMENTATION_H
+
+#include "simdutf.h"
+#include "simdutf/internal/isadetection.h"
+
+namespace simdutf {
+namespace lsx {
+
+namespace {
+using namespace simdutf;
+}
+
+class implementation final : public simdutf::implementation { // LSX kernel: declarations only, the definitions are not in this header
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("lsx", "LOONGARCH SX", // presumably (name, description, required instruction set); confirm against the base class
+ internal::instruction_set::LSX) {}
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf, // --- validation entry points ---
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8( // --- conversions from Latin-1 ---
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1( // --- conversions from UTF-8 ---
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t // --- conversions from UTF-16 ---
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t // --- conversions from UTF-32 ---
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t // --- UTF-16 to UTF-32 ---
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf, // --- counting and length helpers (declared without `final`) ---
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64( // --- base64 ---
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char *input, size_t length, char *output, // NOTE(review): a 4-argument call also matches the overload below that defaults last_chunk_options; confirm both are intended
+ base64_options options) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output, // NOTE(review): same overlap with the char16_t overload below
+ base64_options options) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options = // defaults to loose handling of the last chunk
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused virtual full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+};
+
+} // namespace lsx
+} // namespace simdutf
+
+#endif // SIMDUTF_LSX_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/lsx/intrinsics.h b/contrib/simdutf/src/simdutf/lsx/intrinsics.h
new file mode 100644
index 000000000..f33ecff48
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/intrinsics.h
@@ -0,0 +1,10 @@
+#ifndef SIMDUTF_LSX_INTRINSICS_H
+#define SIMDUTF_LSX_INTRINSICS_H
+
+#include "simdutf.h"
+
+// <lsxintrin.h> declares the LoongArch SX (LSX, 128-bit SIMD) intrinsics,
+// i.e. the __m128i type and the __lsx_* functions used by this kernel.
+#include <lsxintrin.h>
+
+#endif // SIMDUTF_LSX_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/lsx/simd.h b/contrib/simdutf/src/simdutf/lsx/simd.h
new file mode 100644
index 000000000..3aa180679
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/simd.h
@@ -0,0 +1,600 @@
+#ifndef SIMDUTF_LSX_SIMD_H
+#define SIMDUTF_LSX_SIMD_H
+
+#include "simdutf.h"
+#include "simdutf/lsx/bitmanipulation.h"
+#include <type_traits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace simd {
+
+template <typename T> struct simd8;
+
+//
+// Base class of simd8<uint8_t> and simd8<bool>, both of which use __m128i
+// internally.
+//
+template <typename T, typename Mask = simd8<bool>> struct base_u8 {
+  // The wrapped 128-bit LSX register, treated as 16 byte lanes.
+  __m128i value;
+  static const int SIZE = sizeof(value);
+
+  // Conversion from/to SIMD register
+  simdutf_really_inline base_u8(const __m128i _value) : value(_value) {}
+  simdutf_really_inline operator const __m128i &() const { return this->value; }
+  simdutf_really_inline operator __m128i &() { return this->value; }
+  // Byte lane 0, extracted as an unsigned byte and converted to T.
+  simdutf_really_inline T first() const {
+    return __lsx_vpickve2gr_bu(this->value, 0);
+  }
+  // Byte lane 15, extracted as an unsigned byte and converted to T.
+  simdutf_really_inline T last() const {
+    return __lsx_vpickve2gr_bu(this->value, 15);
+  }
+
+  // Bit operations
+  simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
+    return __lsx_vor_v(this->value, other);
+  }
+  simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
+    return __lsx_vand_v(this->value, other);
+  }
+  simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
+    return __lsx_vxor_v(this->value, other);
+  }
+  // Returns (*this) & ~other, the same contract as simd8<int8_t>::bit_andnot.
+  // __lsx_vandn_v(a, b) computes (~a) & b, so the operand that must be
+  // inverted (other) has to be passed first.
+  simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const {
+    return __lsx_vandn_v(other, this->value);
+  }
+  // Flip every bit: 0xFFu is splatted to all byte lanes by simd8<T>'s
+  // converting constructor before the XOR.
+  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+  // Compound assignments operate on the derived simd8<T> object so that the
+  // returned reference has the derived type.
+  simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
+    auto this_cast = static_cast<simd8<T> *>(this);
+    *this_cast = *this_cast | other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd8<T> &operator&=(const simd8<T> other) {
+    auto this_cast = static_cast<simd8<T> *>(this);
+    *this_cast = *this_cast & other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd8<T> &operator^=(const simd8<T> other) {
+    auto this_cast = static_cast<simd8<T> *>(this);
+    *this_cast = *this_cast ^ other;
+    return *this_cast;
+  }
+
+  // Lane-wise byte equality; the result is returned as a Mask.
+  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+                                               const simd8<T> rhs) {
+    return __lsx_vseq_b(lhs, rhs);
+  }
+
+  // Shifts this chunk up by N bytes and fills the vacated low lanes with the
+  // top N bytes of prev_chunk, i.e. lane i of the result is the byte that sits
+  // N positions before lane i in the concatenation prev_chunk:this.
+  template <int N = 1>
+  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N));
+  }
+};
+
+// Byte-lane boolean mask: the type produced by comparisons such as == and >.
+template <> struct simd8<bool> : base_u8<bool> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
+
+  // Broadcast a single bool: true fills every byte with 0xFF, false with 0.
+  static simdutf_really_inline simd8<bool> splat(bool _value) {
+    const uint8_t lane = _value ? uint8_t(0xFF) : uint8_t(0x00);
+    return __lsx_vreplgr2vr_b(lane);
+  }
+
+  // Wrap an existing register.
+  simdutf_really_inline simd8(const __m128i _value) : base_u8<bool>(_value) {}
+  // Default construction yields an all-false mask.
+  simdutf_really_inline simd8() : simd8(__lsx_vldi(0)) {}
+  // Construct from one bool replicated to all lanes.
+  simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
+
+  // Write the 16 mask bytes to dst.
+  simdutf_really_inline void store(uint8_t dst[16]) const {
+    __lsx_vst(this->value, dst, 0);
+  }
+
+  // Collapse the mask to an integer: word 0 of the __lsx_vmsknz_b result.
+  simdutf_really_inline uint32_t to_bitmask() const {
+    const __m128i nonzero_lanes = __lsx_vmsknz_b(this->value);
+    return __lsx_vpickve2gr_wu(nonzero_lanes, 0);
+  }
+
+  // At least one lane is nonzero.
+  simdutf_really_inline bool any() const {
+    const __m128i nonzero_lanes = __lsx_vmsknz_b(this->value);
+    return __lsx_vpickve2gr_hu(nonzero_lanes, 0) != 0;
+  }
+  // No lane is nonzero.
+  simdutf_really_inline bool none() const {
+    const __m128i nonzero_lanes = __lsx_vmsknz_b(this->value);
+    return __lsx_vpickve2gr_hu(nonzero_lanes, 0) == 0;
+  }
+  // Every one of the 16 lanes is nonzero.
+  simdutf_really_inline bool all() const {
+    const __m128i nonzero_lanes = __lsx_vmsknz_b(this->value);
+    return __lsx_vpickve2gr_hu(nonzero_lanes, 0) == 0xFFFF;
+  }
+};
+
+// Unsigned bytes: sixteen uint8_t lanes held in one __m128i, with wrapping and
+// saturating arithmetic, unsigned comparisons, per-lane shifts and 16-entry
+// table lookups.
+template <> struct simd8<uint8_t> : base_u8<uint8_t> {
+ // Broadcast _value to all 16 lanes.
+ static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
+ return __lsx_vreplgr2vr_b(_value);
+ }
+ static simdutf_really_inline simd8<uint8_t> zero() { return __lsx_vldi(0); }
+ // Load 16 bytes starting at values.
+ static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
+ return __lsx_vld(values, 0);
+ }
+ simdutf_really_inline simd8(const __m128i _value)
+ : base_u8<uint8_t>(_value) {}
+ // Zero constructor
+ simdutf_really_inline simd8() : simd8(zero()) {}
+ // Array constructor
+ simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
+ // Splat constructor
+ simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+ // Member-by-member initialization
+
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+ : simd8((__m128i)v16u8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15}) {}
+
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ // The register holds exactly 16 bytes, so here the values are used once.
+ simdutf_really_inline static simd8<uint8_t>
+ repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+ uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+ uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+ uint8_t v15) {
+ return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Store to array
+ simdutf_really_inline void store(uint8_t dst[16]) const {
+ return __lsx_vst(this->value, dst, 0);
+ }
+
+ // Saturated math
+ simdutf_really_inline simd8<uint8_t>
+ saturating_add(const simd8<uint8_t> other) const {
+ return __lsx_vsadd_bu(this->value, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ saturating_sub(const simd8<uint8_t> other) const {
+ return __lsx_vssub_bu(this->value, other);
+ }
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd8<uint8_t>
+ operator+(const simd8<uint8_t> other) const {
+ return __lsx_vadd_b(this->value, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ operator-(const simd8<uint8_t> other) const {
+ return __lsx_vsub_b(this->value, other);
+ }
+ simdutf_really_inline simd8<uint8_t> &operator+=(const simd8<uint8_t> other) {
+ *this = *this + other;
+ return *this;
+ }
+ simdutf_really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) {
+ *this = *this - other;
+ return *this;
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd8<uint8_t>
+ max_val(const simd8<uint8_t> other) const {
+ return __lsx_vmax_bu(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ min_val(const simd8<uint8_t> other) const {
+ return __lsx_vmin_bu(*this, other);
+ }
+ // Unsigned comparisons. Only "less or equal" and "less than" intrinsics are
+ // used; >= and > are obtained by swapping the operands.
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint8_t> other) const {
+ return __lsx_vsle_bu(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint8_t> other) const {
+ return __lsx_vsle_bu(other, *this);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<(const simd8<uint8_t> other) const {
+ return __lsx_vslt_bu(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator>(const simd8<uint8_t> other) const {
+ return __lsx_vslt_bu(other, *this);
+ }
+ // Same as >, but the contract only promises false = 0 and true = nonzero.
+ // This implementation simply returns the full comparison mask of operator>.
+ simdutf_really_inline simd8<uint8_t>
+ gt_bits(const simd8<uint8_t> other) const {
+ return simd8<uint8_t>(*this > other);
+ }
+ // Same as <, but the contract only promises false = 0 and true = nonzero.
+ // This implementation simply returns the full comparison mask of operator<.
+ simdutf_really_inline simd8<uint8_t>
+ lt_bits(const simd8<uint8_t> other) const {
+ return simd8<uint8_t>(*this < other);
+ }
+
+ // Bit-specific operations
+ // Per-lane mask: true where (*this & bits) is nonzero, tested as 0 < x with
+ // an unsigned compare.
+ simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+ return __lsx_vslt_bu(__lsx_vldi(0), __lsx_vand_v(this->value, bits));
+ }
+ // True when the 16-bit result of __lsx_vmskgez_b is 0xFFFF; presumably one
+ // bit per lane that is >= 0 as a signed byte, i.e. has its top bit clear.
+ simdutf_really_inline bool is_ascii() const {
+ return __lsx_vpickve2gr_hu(__lsx_vmskgez_b(this->value), 0) == 0xFFFF;
+ }
+
+ // True when the __lsx_vmsknz_b mask is nonzero, i.e. some lane is nonzero.
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(this->value), 0) > 0;
+ }
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+ return (*this & bits).any_bits_set_anywhere();
+ }
+ // Logical shift of every byte lane by the compile-time amount N.
+ template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+ return __lsx_vsrli_b(this->value, N);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+ return __lsx_vslli_b(this->value, N);
+ }
+
+ // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
+ // for out of range values)
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return lookup_table.apply_lookup_16_to(*this);
+ }
+
+ // Convenience overload: builds the table from 16 scalars via repeat_16.
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const {
+ return lookup_16(simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15));
+ }
+
+ // Uses *this as the table and `original` as per-lane indices. Indices are
+ // first masked to their low 5 bits; the shuffle is given an all-zero vector
+ // as its other source, so masked indices 16..31 presumably yield 0.
+ template <typename T>
+ simdutf_really_inline simd8<uint8_t>
+ apply_lookup_16_to(const simd8<T> original) const {
+ __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
+ return __lsx_vshuf_b(__lsx_vldi(0), *this, simd8<uint8_t>(original_tmp));
+ }
+};
+
+// Signed bytes: sixteen int8_t lanes held in one __m128i. Unlike the unsigned
+// and bool variants this specialization does not derive from base_u8 and
+// defines its own conversions and operators.
+template <> struct simd8<int8_t> {
+ __m128i value;
+
+ // Broadcast _value to all 16 lanes.
+ static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
+ return __lsx_vreplgr2vr_b(_value);
+ }
+ static simdutf_really_inline simd8<int8_t> zero() { return __lsx_vldi(0); }
+ // Load 16 bytes starting at values.
+ static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
+ return __lsx_vld(values, 0);
+ }
+
+ // Writes the 16 bytes as 16 UTF-16 code units (two 16-byte stores) by
+ // interleaving every byte with a zero byte. Which of the two ends up first
+ // in memory is selected by match_system(big_endian).
+ template <endianness big_endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
+ __m128i zero = __lsx_vldi(0);
+ if (match_system(big_endian)) {
+ __lsx_vst(__lsx_vilvl_b(zero, (__m128i)this->value),
+ reinterpret_cast<uint16_t *>(p), 0);
+ __lsx_vst(__lsx_vilvh_b(zero, (__m128i)this->value),
+ reinterpret_cast<uint16_t *>(p + 8), 0);
+ } else {
+ __lsx_vst(__lsx_vilvl_b((__m128i)this->value, zero),
+ reinterpret_cast<uint16_t *>(p), 0);
+ __lsx_vst(__lsx_vilvh_b((__m128i)this->value, zero),
+ reinterpret_cast<uint16_t *>(p + 8), 0);
+ }
+ }
+
+ // Writes the 16 bytes as 16 UTF-32 code units: bytes are interleaved with
+ // zeros to 16 bits, then again to 32 bits, and stored as four 16-byte
+ // vectors of 4 code units each.
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
+ __m128i zero = __lsx_vldi(0);
+ __m128i in16low = __lsx_vilvl_b(zero, (__m128i)this->value);
+ __m128i in16high = __lsx_vilvh_b(zero, (__m128i)this->value);
+ __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
+ __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
+ __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
+ __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
+ __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(p), 0);
+ __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(p + 4), 0);
+ __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(p + 8), 0);
+ __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(p + 12), 0);
+ }
+
+ // In places where the table can be reused, which is most uses in simdutf, it
+ // is worth it to do 4 table lookups, as there is no direct zero extension
+ // from u8 to u32.
+ // Each index table places source byte k in the first byte of a 4-byte group;
+ // the 255 entries are masked to 31 inside apply_lookup_16_to, which
+ // presumably makes the shuffle emit a zero byte there.
+ simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
+ const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255,
+ 2, 255, 255, 255, 3, 255, 255, 255};
+ const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255,
+ 6, 255, 255, 255, 7, 255, 255, 255};
+ const simd8<uint8_t> tb3{8, 255, 255, 255, 9, 255, 255, 255,
+ 10, 255, 255, 255, 11, 255, 255, 255};
+ const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255,
+ 14, 255, 255, 255, 15, 255, 255, 255};
+
+ // encourage store pairing and interleaving
+ const auto shuf1 = this->apply_lookup_16_to(tb1);
+ const auto shuf2 = this->apply_lookup_16_to(tb2);
+ shuf1.store(reinterpret_cast<int8_t *>(p));
+ shuf2.store(reinterpret_cast<int8_t *>(p + 4));
+
+ const auto shuf3 = this->apply_lookup_16_to(tb3);
+ const auto shuf4 = this->apply_lookup_16_to(tb4);
+ shuf3.store(reinterpret_cast<int8_t *>(p + 8));
+ shuf4.store(reinterpret_cast<int8_t *>(p + 12));
+ }
+ // Conversion from/to SIMD register
+ simdutf_really_inline simd8(const __m128i _value) : value(_value) {}
+ simdutf_really_inline operator const __m128i &() const { return this->value; }
+
+ simdutf_really_inline operator const __m128i() const { return this->value; }
+
+ simdutf_really_inline operator __m128i &() { return this->value; }
+
+ // Zero constructor
+ simdutf_really_inline simd8() : simd8(zero()) {}
+ // Splat constructor
+ simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization
+
+ simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+ int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+ int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+ : simd8((__m128i)v16i8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15}) {}
+
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ // The register holds exactly 16 bytes, so here the values are used once.
+ simdutf_really_inline static simd8<int8_t>
+ repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+ int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+ return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Store to array
+ simdutf_really_inline void store(int8_t dst[16]) const {
+ return __lsx_vst(value, dst, 0);
+ }
+
+ // Reinterpret the same register bits as unsigned bytes.
+ simdutf_really_inline operator simd8<uint8_t>() const {
+ return ((__m128i)this->value);
+ }
+
+ // Bitwise operations on the whole register.
+ simdutf_really_inline simd8<int8_t>
+ operator|(const simd8<int8_t> other) const {
+ return __lsx_vor_v((__m128i)value, (__m128i)other.value);
+ }
+ simdutf_really_inline simd8<int8_t>
+ operator&(const simd8<int8_t> other) const {
+ return __lsx_vand_v((__m128i)value, (__m128i)other.value);
+ }
+ simdutf_really_inline simd8<int8_t>
+ operator^(const simd8<int8_t> other) const {
+ return __lsx_vxor_v((__m128i)value, (__m128i)other.value);
+ }
+ // *this & ~other: __lsx_vandn_v inverts its first operand, hence `other`
+ // is passed first.
+ simdutf_really_inline simd8<int8_t>
+ bit_andnot(const simd8<int8_t> other) const {
+ return __lsx_vandn_v((__m128i)other.value, (__m128i)value);
+ }
+
+ // Math
+ simdutf_really_inline simd8<int8_t>
+ operator+(const simd8<int8_t> other) const {
+ return __lsx_vadd_b((__m128i)value, (__m128i)other.value);
+ }
+ simdutf_really_inline simd8<int8_t>
+ operator-(const simd8<int8_t> other) const {
+ return __lsx_vsub_b((__m128i)value, (__m128i)other.value);
+ }
+ simdutf_really_inline simd8<int8_t> &operator+=(const simd8<int8_t> other) {
+ *this = *this + other;
+ return *this;
+ }
+ simdutf_really_inline simd8<int8_t> &operator-=(const simd8<int8_t> other) {
+ *this = *this - other;
+ return *this;
+ }
+
+ // True when the 16-bit result of __lsx_vmskgez_b is 0xffff; presumably one
+ // bit per lane that is >= 0, i.e. every byte is in the ASCII range.
+ simdutf_really_inline bool is_ascii() const {
+ return (__lsx_vpickve2gr_hu(__lsx_vmskgez_b((__m128i)this->value), 0) ==
+ 0xffff);
+ }
+
+ // Order-sensitive comparisons
+ simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+ return __lsx_vmax_b((__m128i)value, (__m128i)other.value);
+ }
+ simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+ return __lsx_vmin_b((__m128i)value, (__m128i)other.value);
+ }
+ // Signed "greater than", expressed as other < *this.
+ simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+ return __lsx_vslt_b((__m128i)other.value, (__m128i)value);
+ }
+ simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+ return __lsx_vslt_b((__m128i)value, (__m128i)other.value);
+ }
+ simdutf_really_inline simd8<bool>
+ operator==(const simd8<int8_t> other) const {
+ return __lsx_vseq_b((__m128i)value, (__m128i)other.value);
+ }
+
+ // Shifts this chunk up by N bytes and fills the vacated low lanes with the
+ // top N bytes of prev_chunk.
+ template <int N = 1>
+ simdutf_really_inline simd8<int8_t>
+ prev(const simd8<int8_t> prev_chunk) const {
+ return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
+ __lsx_vbsrl_v(prev_chunk.value, 16 - N));
+ }
+
+ // Perform a lookup assuming no value is larger than 16
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return lookup_table.apply_lookup_16_to(*this);
+ }
+ // Convenience overload: builds the table from 16 scalars via repeat_16.
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const {
+ return lookup_16(simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15));
+ }
+
+ // Uses *this as the table and `original` as per-lane indices (masked to
+ // their low 5 bits); the shuffle's other source is an all-zero vector.
+ template <typename T>
+ simdutf_really_inline simd8<int8_t>
+ apply_lookup_16_to(const simd8<T> original) const {
+ __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
+ return __lsx_vshuf_b(__lsx_vldi(0), (__m128i)this->value,
+ simd8<uint8_t>(original_tmp));
+ }
+};
+
+// A 64-byte block represented as four simd8<T> registers (chunks[0] holds the
+// lowest-addressed 16 bytes). Comparison helpers return a 64-bit mask built by
+// to_bitmask() from the four per-chunk results.
+template <typename T> struct simd8x64 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+ static_assert(
+ NUM_CHUNKS == 4,
+ "LoongArch kernel should use four registers per 64-byte block.");
+ simd8<T> chunks[NUM_CHUNKS];
+
+ simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+ simd8x64<T> &
+ operator=(const simd8<T> other) = delete; // no assignment allowed
+ simd8x64() = delete; // no default constructor allowed
+
+ simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+ const simd8<T> chunk2, const simd8<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+ // Load four consecutive registers starting at ptr; the offsets are
+ // expressed in elements of T.
+ simdutf_really_inline simd8x64(const T *ptr)
+ : chunks{simd8<T>::load(ptr),
+ simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}
+
+ // Store the four chunks back to back starting at ptr.
+ simdutf_really_inline void store(T *ptr) const {
+ this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+ this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+ this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+ }
+
+ simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
+ this->chunks[0] |= other.chunks[0];
+ this->chunks[1] |= other.chunks[1];
+ this->chunks[2] |= other.chunks[2];
+ this->chunks[3] |= other.chunks[3];
+ return *this;
+ }
+
+ // Bitwise OR of all four chunks.
+ simdutf_really_inline simd8<T> reduce_or() const {
+ return (this->chunks[0] | this->chunks[1]) |
+ (this->chunks[2] | this->chunks[3]);
+ }
+
+ // ORing the chunks first means a set top bit in any byte of the block
+ // survives into the single register that is tested.
+ simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }
+
+ // Each chunk expands to sizeof(simd8<T>) (16) output code units, so the
+ // destination pointer advances by that many char16_t per chunk.
+ template <endianness endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 0);
+ this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 1);
+ this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 2);
+ this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 3);
+ }
+
+ // Same layout as above for char32_t output; uses the table-lookup variant
+ // of the per-chunk store.
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+ this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 0);
+ this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 1);
+ this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 2);
+ this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 3);
+ }
+
+ // Builds one 64-bit mask from the four chunks: the __lsx_vmsknz_b result of
+ // chunk k is moved up by 2*k bytes and the pieces are ORed together, then
+ // the low 64 bits are extracted. Presumably each piece is a 16-bit
+ // "lane is nonzero" mask, giving chunk k bits 16k..16k+15.
+ simdutf_really_inline uint64_t to_bitmask() const {
+ __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[3]), 6);
+ mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[2]), 4));
+ mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[1]), 2));
+ mask = __lsx_vor_v(mask, __lsx_vmsknz_b(this->chunks[0]));
+ return __lsx_vpickve2gr_du(mask, 0);
+ }
+
+ // The helpers below splat the scalar operand(s), compare every chunk
+ // lane-wise and return the combined result as a 64-bit mask.
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+ this->chunks[2] == mask, this->chunks[3] == mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+ this->chunks[2] <= mask, this->chunks[3] <= mask)
+ .to_bitmask();
+ }
+
+ // Lanes with low <= value <= high (both bounds inclusive).
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+
+ return simd8x64<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+ (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+ (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+ .to_bitmask();
+ }
+ // Lanes with value < low or value > high.
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+ return simd8x64<bool>(
+ (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+ (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+ (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+ (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+ this->chunks[2] < mask, this->chunks[3] < mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gt(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
+ this->chunks[2] > mask, this->chunks[3] > mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq(const T m) const {
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
+ this->chunks[2] >= mask, this->chunks[3] >= mask)
+ .to_bitmask();
+ }
+ // Like gteq, but each chunk's register is re-wrapped as simd8<uint8_t> so
+ // the comparison is unsigned regardless of T.
+ simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+ const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+ return simd8x64<bool>(simd8<uint8_t>(this->chunks[0].value) >= mask,
+ simd8<uint8_t>(this->chunks[1].value) >= mask,
+ simd8<uint8_t>(this->chunks[2].value) >= mask,
+ simd8<uint8_t>(this->chunks[3].value) >= mask)
+ .to_bitmask();
+ }
+}; // struct simd8x64<T>
+#include "simdutf/lsx/simd16-inl.h"
+} // namespace simd
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_LSX_SIMD_H
diff --git a/contrib/simdutf/src/simdutf/lsx/simd16-inl.h b/contrib/simdutf/src/simdutf/lsx/simd16-inl.h
new file mode 100644
index 000000000..6d0ca6a47
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/lsx/simd16-inl.h
@@ -0,0 +1,378 @@
+template <typename T> struct simd16;
+
+// Common storage and bitwise operators for the simd16<T> wrappers: one
+// __m128i treated as eight 16-bit lanes. This type has no implicit
+// conversion to __m128i, so intrinsics are always given the raw `value`.
+template <typename T, typename Mask = simd16<bool>> struct base_u16 {
+  __m128i value;
+  static const int SIZE = sizeof(value);
+
+  // Conversion from/to SIMD register
+  simdutf_really_inline base_u16() = default;
+  simdutf_really_inline base_u16(const __m128i _value) : value(_value) {}
+  // Bit operations
+  simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
+    return __lsx_vor_v(this->value, other.value);
+  }
+  simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
+    return __lsx_vand_v(this->value, other.value);
+  }
+  simdutf_really_inline simd16<T> operator^(const simd16<T> other) const {
+    return __lsx_vxor_v(this->value, other.value);
+  }
+  // Returns (*this) & ~other. __lsx_vandn_v(a, b) computes (~a) & b, so the
+  // operand that must be inverted (other) is passed first.
+  simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const {
+    return __lsx_vandn_v(other.value, this->value);
+  }
+  // Flip every bit. The constant is splatted per 16-bit lane, so it has to be
+  // 0xFFFF: 0xFF would only invert the low byte of each lane.
+  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+  // Compound assignments operate on the derived simd16<T> object so that the
+  // returned reference has the derived type.
+  simdutf_really_inline simd16<T> &operator|=(const simd16<T> other) {
+    auto this_cast = static_cast<simd16<T> *>(this);
+    *this_cast = *this_cast | other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd16<T> &operator&=(const simd16<T> other) {
+    auto this_cast = static_cast<simd16<T> *>(this);
+    *this_cast = *this_cast & other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd16<T> &operator^=(const simd16<T> other) {
+    auto this_cast = static_cast<simd16<T> *>(this);
+    *this_cast = *this_cast ^ other;
+    return *this_cast;
+  }
+
+  // Lane-wise 16-bit equality; the result is returned as a Mask.
+  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
+                                               const simd16<T> rhs) {
+    return __lsx_vseq_h(lhs.value, rhs.value);
+  }
+
+  // Shifts this chunk up by N 16-bit lanes (N * 2 bytes) and fills the
+  // vacated low lanes with the top N lanes of prev_chunk.
+  template <int N = 1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N * 2),
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N * 2));
+  }
+};
+
+// Adds the bitmask typedefs and a load-from-pointer constructor on top of
+// base_u16<T>.
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base_u16<T> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
+
+  simdutf_really_inline base16() : base_u16<T>() {}
+  simdutf_really_inline base16(const __m128i _value) : base_u16<T>(_value) {}
+  // Load 16 bytes from ptr, whatever its element type.
+  template <typename Pointer>
+  simdutf_really_inline base16(const Pointer *ptr)
+      : base16(__lsx_vld(ptr, 0)) {}
+
+  static const int SIZE = sizeof(base_u16<T>::value);
+
+  // Shifts this chunk up by N 16-bit lanes (N * 2 bytes) and fills the
+  // vacated low lanes with the top N lanes of prev_chunk. The raw registers
+  // are passed explicitly because these wrappers do not convert to __m128i.
+  template <int N = 1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N * 2),
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N * 2));
+  }
+};
+
+// Mask type with eight 16-bit lanes (returned by things like eq and gt)
+template <> struct simd16<bool> : base16<bool> {
+  // Default constructor; defers to the base16 default constructor.
+  simdutf_really_inline simd16() : base16<bool>() {}
+  // Wrap an existing register.
+  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+
+  // Broadcast a single bool: true fills every 16-bit lane with 0xFFFF, false
+  // with 0.
+  static simdutf_really_inline simd16<bool> splat(bool _value) {
+    const uint16_t lane = _value ? uint16_t(0xFFFF) : uint16_t(0x0000);
+    return __lsx_vreplgr2vr_h(lane);
+  }
+};
+
+// Shared implementation for the numeric simd16 specializations (int16_t and
+// uint16_t): construction helpers, store, bitwise NOT and wrapping
+// addition/subtraction on eight 16-bit lanes.
+template <typename T> struct base16_numeric : base16<T> {
+  // Broadcast _value to all eight lanes.
+  static simdutf_really_inline simd16<T> splat(T _value) {
+    return __lsx_vreplgr2vr_h(_value);
+  }
+  static simdutf_really_inline simd16<T> zero() { return __lsx_vldi(0); }
+  // Load eight 16-bit values starting at values.
+  static simdutf_really_inline simd16<T> load(const T values[8]) {
+    return __lsx_vld(reinterpret_cast<const uint16_t *>(values), 0);
+  }
+
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const __m128i _value)
+      : base16<T>(_value) {}
+
+  // Store to array
+  simdutf_really_inline void store(T dst[8]) const {
+    __lsx_vst(this->value, dst, 0);
+  }
+
+  // Override to distinguish from bool version. The constant is splatted per
+  // 16-bit lane, so it must be 0xFFFF to invert both bytes of every lane.
+  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+  // Addition/subtraction are the same for signed and unsigned. The halfword
+  // (_h) intrinsics are required here: the byte (_b) forms would drop the
+  // carry/borrow between the low and high byte of each 16-bit lane.
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+    return __lsx_vadd_h(this->value, other.value);
+  }
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+    return __lsx_vsub_h(this->value, other.value);
+  }
+  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+    *this = *this + other;
+    return *static_cast<simd16<T> *>(this);
+  }
+  simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+    *this = *this - other;
+    return *static_cast<simd16<T> *>(this);
+  }
+};
+
+// Signed code units: eight int16_t lanes.
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value)
+      : base16_numeric<int16_t>(_value) {}
+  // Reinterpret a comparison mask as signed lanes (bits unchanged).
+  simdutf_really_inline simd16(simd16<bool> other)
+      : base16_numeric<int16_t>(other.value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t *values)
+      : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+  // Defined out of line, after simd16<uint16_t> is complete.
+  simdutf_really_inline operator simd16<uint16_t>() const;
+
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t>
+  max_val(const simd16<int16_t> other) const {
+    return __lsx_vmax_h(this->value, other.value);
+  }
+  simdutf_really_inline simd16<int16_t>
+  min_val(const simd16<int16_t> other) const {
+    return __lsx_vmin_h(this->value, other.value);
+  }
+  // Strict signed "greater than", expressed as other < *this. A
+  // less-or-equal compare here would also report equal lanes as greater.
+  simdutf_really_inline simd16<bool>
+  operator>(const simd16<int16_t> other) const {
+    return __lsx_vslt_h(other.value, this->value);
+  }
+  simdutf_really_inline simd16<bool>
+  operator<(const simd16<int16_t> other) const {
+    return __lsx_vslt_h(this->value, other.value);
+  }
+};
+
+// Unsigned code unitstemplate<>
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+ simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+ simdutf_really_inline simd16(const __m128i _value)
+ : base16_numeric<uint16_t>((__m128i)_value) {}
+ simdutf_really_inline simd16(simd16<bool> other)
+ : base16_numeric<uint16_t>(other.value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
+
+ // Saturated math
+ simdutf_really_inline simd16<uint16_t>
+ saturating_add(const simd16<uint16_t> other) const {
+ return __lsx_vsadd_hu(this->value, other.value);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ saturating_sub(const simd16<uint16_t> other) const {
+ return __lsx_vssub_hu(this->value, other.value);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd16<uint16_t>
+ max_val(const simd16<uint16_t> other) const {
+ return __lsx_vmax_hu(this->value, other.value);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ min_val(const simd16<uint16_t> other) const {
+ return __lsx_vmin_hu(this->value, other.value);
+ }
+ // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ gt_bits(const simd16<uint16_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ lt_bits(const simd16<uint16_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<=(const simd16<uint16_t> other) const {
+ return __lsx_vsle_hu(this->value, other.value);
+ }
+ simdutf_really_inline simd16<bool>
+ operator>=(const simd16<uint16_t> other) const {
+ return __lsx_vsle_hu(other.value, this->value);
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<uint16_t> other) const {
+ return __lsx_vslt_hu(other.value, this->value);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<uint16_t> other) const {
+ return __lsx_vslt_hu(this->value, other.value);
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd16<bool> bits_not_set() const {
+ return *this == uint16_t(0);
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+ return simd16<uint16_t>(__lsx_vsrli_h(this->value, N));
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+ return simd16<uint16_t>(__lsx_vslli_h(this->value, N));
+ }
+
+ // logical operations
+ simdutf_really_inline simd16<uint16_t>
+ operator|(const simd16<uint16_t> other) const {
+ return __lsx_vor_v(this->value, other.value);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ operator&(const simd16<uint16_t> other) const {
+ return __lsx_vand_v(this->value, other.value);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ operator^(const simd16<uint16_t> other) const {
+ return __lsx_vxor_v(this->value, other.value); // bitwise XOR of the two registers
+ }
+
+ // Pack two vectors of uint16_t code units into a single uint8_t vector,
+ // using unsigned saturation (shift amount 0, so no bits are discarded first)
+ static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+ const simd16<uint16_t> &v1) {
+ return __lsx_vssrlni_bu_h(v1.value, v0.value, 0); // presumably v0 fills the low half and v1 the high half -- TODO confirm operand order
+ }
+
+ // Change the endianness: swap the two bytes of every 16-bit lane
+ simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+ return __lsx_vshuf4i_b(this->value, 0b10110001); // 2-bit selectors 1,0,3,2: bytes 0<->1 and 2<->3 in each 4-byte group
+ }
+};
+
+simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
+ return this->value; // same register handed to the unsigned wrapper; no per-lane conversion is applied here
+}
+
+template <typename T> struct simd16x32 { // one 64-byte block held as four simd16<T> chunks
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+ static_assert(
+ NUM_CHUNKS == 4,
+ "LOONGARCH kernel should use four registers per 64-byte block.");
+ simd16<T> chunks[NUM_CHUNKS];
+
+ simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
+ simd16x32<T> &
+ operator=(const simd16<T> other) = delete; // no assignment from a single simd16<T>; NOTE(review): parameter is simd16<T>, not simd16x32<T> -- confirm intended
+ simd16x32() = delete; // no default constructor allowed
+
+ simdutf_really_inline
+ simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
+ const simd16<T> chunk2, const simd16<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+ simdutf_really_inline simd16x32(const T *ptr) // loads four consecutive chunks starting at ptr
+ : chunks{simd16<T>::load(ptr),
+ simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
+ simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
+ simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}
+
+ simdutf_really_inline void store(T *ptr) const { // writes the four chunks back to back; offsets are in elements of T
+ this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+ this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+ this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
+ }
+
+ simdutf_really_inline simd16<T> reduce_or() const { // bitwise OR of all four chunks
+ return (this->chunks[0] | this->chunks[1]) |
+ (this->chunks[2] | this->chunks[3]);
+ }
+
+ simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }
+
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { // NOTE(review): offsets are sizeof(simd16<T>) without the / sizeof(T) used in store() -- confirm intended
+ this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+ this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+ this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+ this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
+ }
+
+ simdutf_really_inline uint64_t to_bitmask() const { // merges the per-chunk masks into one 64-bit value
+ __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[3]).value), 6); // chunk 3 mask moved up by 6 bytes
+ mask = __lsx_vor_v(
+ mask, __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[2]).value), 4)); // chunk 2 mask moved up by 4 bytes
+ mask = __lsx_vor_v(
+ mask, __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[1]).value), 2)); // chunk 1 mask moved up by 2 bytes
+ mask = __lsx_vor_v(mask, __lsx_vmsknz_b((this->chunks[0]).value)); // chunk 0 mask stays in the lowest bytes
+ return __lsx_vpickve2gr_du(mask, 0); // extract 64-bit element 0 as the result
+ }
+
+ simdutf_really_inline void swap_bytes() { // in-place byte swap of every chunk
+ this->chunks[0] = this->chunks[0].swap_bytes();
+ this->chunks[1] = this->chunks[1].swap_bytes();
+ this->chunks[2] = this->chunks[2].swap_bytes();
+ this->chunks[3] = this->chunks[3].swap_bytes();
+ }
+
+ simdutf_really_inline uint64_t eq(const T m) const { // bitmask of lanes equal to m
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+ this->chunks[2] == mask, this->chunks[3] == mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t lteq(const T m) const { // bitmask of lanes <= m
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+ this->chunks[2] <= mask, this->chunks[3] <= mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const { // bitmask of lanes with low <= lane <= high
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+
+ return simd16x32<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+ (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+ (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // bitmask of lanes with lane > high or lane < low
+ const simd16<T> mask_low = simd16<T>::splat(low);
+ const simd16<T> mask_high = simd16<T>::splat(high);
+ return simd16x32<bool>(
+ (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+ (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+ (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+ (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t lt(const T m) const { // bitmask of lanes < m
+ const simd16<T> mask = simd16<T>::splat(m);
+ return simd16x32<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+ this->chunks[2] < mask, this->chunks[3] < mask)
+ .to_bitmask();
+ }
+
+}; // struct simd16x32<T>
+
+// Specialization for uint16_t lanes: reports the lanes that fall outside
+// [low, high] as the bitmask produced by to_bitmask().
+template <>
+simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(
+ const uint16_t low, const uint16_t high) const {
+ const simd16<uint16_t> lo = simd16<uint16_t>::splat(low);
+ const simd16<uint16_t> hi = simd16<uint16_t>::splat(high);
+ // A lane is flagged when it is above `high` or below `low`.
+ simd16x32<uint16_t> outside(
+ simd16<uint16_t>((this->chunks[0] > hi) | (this->chunks[0] < lo)),
+ simd16<uint16_t>((this->chunks[1] > hi) | (this->chunks[1] < lo)),
+ simd16<uint16_t>((this->chunks[2] > hi) | (this->chunks[2] < lo)),
+ simd16<uint16_t>((this->chunks[3] > hi) | (this->chunks[3] < lo)));
+ return outside.to_bitmask();
+}
diff --git a/contrib/simdutf/src/simdutf/ppc64.h b/contrib/simdutf/src/simdutf/ppc64.h
new file mode 100644
index 000000000..970fcd16c
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64.h
@@ -0,0 +1,40 @@
+#ifndef SIMDUTF_PPC64_H
+#define SIMDUTF_PPC64_H
+
+#ifdef SIMDUTF_FALLBACK_H
+ #error "ppc64.h must be included before fallback.h"
+#endif
+
+#include "simdutf/portability.h"
+
+#ifndef SIMDUTF_IMPLEMENTATION_PPC64
+ #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64) // built by default whenever the target is PPC64; may be predefined to override
+#endif
+#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 \
+ (SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64) // parenthesized so `!X` or `X || Y` at the use site binds to the whole condition
+
+#include "simdutf/internal/isadetection.h"
+
+#if SIMDUTF_IMPLEMENTATION_PPC64
+
+namespace simdutf {
+/**
+ * Implementation for ALTIVEC (PPC64).
+ */
+namespace ppc64 {} // namespace ppc64
+} // namespace simdutf
+
+ #include "simdutf/ppc64/implementation.h"
+
+ #include "simdutf/ppc64/begin.h"
+
+ // Declarations
+ #include "simdutf/ppc64/intrinsics.h"
+ #include "simdutf/ppc64/bitmanipulation.h"
+ #include "simdutf/ppc64/simd.h"
+
+ #include "simdutf/ppc64/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_PPC64
+
+#endif // SIMDUTF_PPC64_H
diff --git a/contrib/simdutf/src/simdutf/ppc64/begin.h b/contrib/simdutf/src/simdutf/ppc64/begin.h
new file mode 100644
index 000000000..c39fd0812
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64/begin.h
@@ -0,0 +1 @@
+#define SIMDUTF_IMPLEMENTATION ppc64
diff --git a/contrib/simdutf/src/simdutf/ppc64/bitmanipulation.h b/contrib/simdutf/src/simdutf/ppc64/bitmanipulation.h
new file mode 100644
index 000000000..64366d0ad
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64/bitmanipulation.h
@@ -0,0 +1,23 @@
+#ifndef SIMDUTF_PPC64_BITMANIPULATION_H
+#define SIMDUTF_PPC64_BITMANIPULATION_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+// Returns the number of 1 bits in input_num, using whichever population-count
+// builtin the active compiler provides.
+simdutf_really_inline int count_ones(uint64_t input_num) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+ // 64-bit only: legacy 32-bit Windows is not supported.
+ return __popcnt64(input_num); // Visual Studio spells it with two underscores
+#else
+ return __builtin_popcountll(input_num);
+#endif
+}
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_PPC64_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/ppc64/end.h b/contrib/simdutf/src/simdutf/ppc64/end.h
new file mode 100644
index 000000000..58fd810d4
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64/end.h
@@ -0,0 +1 @@
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/ppc64/implementation.h b/contrib/simdutf/src/simdutf/ppc64/implementation.h
new file mode 100644
index 000000000..0d749baba
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64/implementation.h
@@ -0,0 +1,168 @@
+#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
+#define SIMDUTF_PPC64_IMPLEMENTATION_H
+
+#include "simdutf.h"
+#include "simdutf/internal/isadetection.h"
+
+namespace simdutf {
+namespace ppc64 {
+
+namespace {
+using namespace simdutf;
+} // namespace
+
+class implementation final : public simdutf::implementation { // ppc64 kernel interface; declarations only, no bodies in this header
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
+ internal::instruction_set::ALTIVEC) {} // presumably (name, description, required instruction set) -- confirm against the base class
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf, // NOTE(review): declarations from here on omit `final`, unlike those above -- confirm intended
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept; // last-chunk handling defaults to loose
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept; // char16_t overloads of the base64 entry points follow
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output, // the only declaration here without simdutf_warn_unused
+ base64_options options) const noexcept;
+};
+
+} // namespace ppc64
+} // namespace simdutf
+
+#endif // SIMDUTF_PPC64_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/ppc64/intrinsics.h b/contrib/simdutf/src/simdutf/ppc64/intrinsics.h
new file mode 100644
index 000000000..51523d9dc
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64/intrinsics.h
@@ -0,0 +1,19 @@
+#ifndef SIMDUTF_PPC64_INTRINSICS_H
+#define SIMDUTF_PPC64_INTRINSICS_H
+
+#include "simdutf.h"
+
+// This should be the correct header whether
+// you use visual studio or other compilers.
+#include <altivec.h>
+
+// altivec.h in the GCC toolchain defines these as macros; it is safe to undef them.
+#ifdef bool
+ #undef bool
+#endif
+
+#ifdef vector
+ #undef vector
+#endif
+
+#endif // SIMDUTF_PPC64_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/ppc64/simd.h b/contrib/simdutf/src/simdutf/ppc64/simd.h
new file mode 100644
index 000000000..e0e6eee0e
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/ppc64/simd.h
@@ -0,0 +1,479 @@
+#ifndef SIMDUTF_PPC64_SIMD_H
+#define SIMDUTF_PPC64_SIMD_H
+
+#include "simdutf.h"
+#include "simdutf/ppc64/bitmanipulation.h"
+#include <type_traits>
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace simd {
+
+using __m128i = __vector unsigned char;
+
+template <typename Child> struct base { // CRTP-style base: Child is the derived wrapper type the operators return
+ __m128i value; // the wrapped vector (__m128i is an alias for __vector unsigned char)
+
+ // Zero constructor
+ simdutf_really_inline base() : value{__m128i()} {}
+
+ // Conversion from SIMD register
+ simdutf_really_inline base(const __m128i _value) : value(_value) {}
+
+ // Conversion to SIMD register
+ simdutf_really_inline operator const __m128i &() const { return this->value; }
+ simdutf_really_inline operator __m128i &() { return this->value; }
+
+ // Bit operations (each result converts back to Child from the raw vector)
+ simdutf_really_inline Child operator|(const Child other) const {
+ return vec_or(this->value, (__m128i)other);
+ }
+ simdutf_really_inline Child operator&(const Child other) const {
+ return vec_and(this->value, (__m128i)other);
+ }
+ simdutf_really_inline Child operator^(const Child other) const {
+ return vec_xor(this->value, (__m128i)other);
+ }
+ simdutf_really_inline Child bit_andnot(const Child other) const {
+ return vec_andc(this->value, (__m128i)other); // this & ~other
+ }
+ simdutf_really_inline Child &operator|=(const Child other) {
+ auto this_cast = static_cast<Child *>(this); // downcast so the assignment goes through Child
+ *this_cast = *this_cast | other;
+ return *this_cast;
+ }
+ simdutf_really_inline Child &operator&=(const Child other) {
+ auto this_cast = static_cast<Child *>(this); // downcast so the assignment goes through Child
+ *this_cast = *this_cast & other;
+ return *this_cast;
+ }
+ simdutf_really_inline Child &operator^=(const Child other) {
+ auto this_cast = static_cast<Child *>(this); // downcast so the assignment goes through Child
+ *this_cast = *this_cast ^ other;
+ return *this_cast;
+ }
+};
+
+// Forward-declared so they can be used by splat and friends.
+template <typename T> struct simd8;
+
+template <typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> { // shared base of the byte-lane wrappers; Mask is the type returned by ==
+ typedef uint16_t bitmask_t;
+ typedef uint32_t bitmask2_t;
+
+ simdutf_really_inline base8() : base<simd8<T>>() {}
+ simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+
+ friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+ const simd8<T> rhs) {
+ return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs); // per-byte equality mask
+ }
+
+ static const int SIZE = sizeof(base<simd8<T>>::value); // size of the wrapped vector in bytes
+
+ template <int N = 1>
+ simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const { // combines the tail of prev_chunk with the head of this chunk
+ __m128i chunk = this->value;
+#ifdef __LITTLE_ENDIAN__
+ chunk = (__m128i)vec_reve(this->value); // reverse lanes first; presumably because vec_sld shifts in big-endian element order
+ prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
+#endif
+ chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); // shift the prev_chunk:chunk pair left by 16 - N bytes
+#ifdef __LITTLE_ENDIAN__
+ chunk = (__m128i)vec_reve((__m128i)chunk); // undo the lane reversal
+#endif
+ return chunk;
+ }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+template <> struct simd8<bool> : base8<bool> {
+ static simdutf_really_inline simd8<bool> splat(bool _value) {
+ return (__m128i)vec_splats((unsigned char)(-(!!_value))); // every byte becomes 0xFF for true, 0x00 for false
+ }
+
+ simdutf_really_inline simd8() : base8() {}
+ simdutf_really_inline simd8(const __m128i _value) : base8<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
+
+ simdutf_really_inline int to_bitmask() const { // gathers one bit per byte lane into an integer
+ __vector unsigned long long result;
+ const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+ 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; // bit indices 0x78 down to 0x00 in steps of 8: one bit from each byte
+
+ result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
+ (__m128i)perm_mask));
+#ifdef __LITTLE_ENDIAN__
+ return static_cast<int>(result[1]); // the gathered bits sit in a different doubleword depending on endianness
+#else
+ return static_cast<int>(result[0]);
+#endif
+ }
+ simdutf_really_inline bool any() const { // true when the register is not all zero
+ return !vec_all_eq(this->value, (__m128i)vec_splats(0));
+ }
+ simdutf_really_inline simd8<bool> operator~() const {
+ return this->value ^ (__m128i)splat(true); // XOR with all-ones flips every bit
+ }
+};
+
+template <typename T> struct base8_numeric : base8<T> { // operations shared by simd8<int8_t> and simd8<uint8_t>
+ static simdutf_really_inline simd8<T> splat(T value) {
+ (void)value; // NOTE(review): no-op; value is used on the next line
+ return (__m128i)vec_splats(value);
+ }
+ static simdutf_really_inline simd8<T> zero() { return splat(0); }
+ static simdutf_really_inline simd8<T> load(const T values[16]) {
+ return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values))); // VSX load at byte offset 0 from values
+ }
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+ T v5, T v6, T v7, T v8, T v9,
+ T v10, T v11, T v12, T v13,
+ T v14, T v15) {
+ return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+ v14, v15); // the register holds exactly 16 values, so one copy fills it
+ }
+
+ simdutf_really_inline base8_numeric() : base8<T>() {}
+ simdutf_really_inline base8_numeric(const __m128i _value)
+ : base8<T>(_value) {}
+
+ // Store to array
+ simdutf_really_inline void store(T dst[16]) const {
+ vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); // VSX store at byte offset 0 into dst
+ }
+
+ // Override to distinguish from bool version
+ simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
+ return (__m128i)((__m128i)this->value + (__m128i)other);
+ }
+ simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
+ return (__m128i)((__m128i)this->value - (__m128i)other);
+ }
+ simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
+ *this = *this + other;
+ return *static_cast<simd8<T> *>(this);
+ }
+ simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
+ *this = *this - other;
+ return *static_cast<simd8<T> *>(this);
+ }
+
+ // Perform a lookup assuming the value is between 0 and 15 (do not rely on
+ // the result for out of range values)
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table,
+ this->value); // each byte of this->value selects one byte of lookup_table
+ }
+
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const { // convenience overload: builds the table from 16 scalars
+ return lookup_16(simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15));
+ }
+};
+
+// Signed bytes: 16 int8_t lanes stored in the shared unsigned-char vector type
+template <> struct simd8<int8_t> : base8_numeric<int8_t> {
+ simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+ simdutf_really_inline simd8(const __m128i _value)
+ : base8_numeric<int8_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+ // Array constructor (loads 16 values starting at `values`)
+ simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization
+ simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+ int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+ int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+ : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10, v11, v12, v13, v14,
+ v15}) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<int8_t>
+ repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+ int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+ return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Order-sensitive comparisons (operands are cast to signed char vectors first)
+ simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+ return (__m128i)vec_max((__vector signed char)this->value,
+ (__vector signed char)(__m128i)other);
+ }
+ simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+ return (__m128i)vec_min((__vector signed char)this->value,
+ (__vector signed char)(__m128i)other);
+ }
+ simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+ return (__m128i)vec_cmpgt((__vector signed char)this->value,
+ (__vector signed char)(__m128i)other);
+ }
+ simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+ return (__m128i)vec_cmplt((__vector signed char)this->value,
+ (__vector signed char)(__m128i)other);
+ }
+};
+
+// Unsigned bytes: 16 uint8_t lanes with saturating math, ordering and bit tests
+template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+ simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+ simdutf_really_inline simd8(const __m128i _value)
+ : base8_numeric<uint8_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+ // Array constructor (loads 16 values starting at `values`)
+ simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+ : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15}) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<uint8_t>
+ repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+ uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+ uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+ uint8_t v15) {
+ return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Saturated math
+ simdutf_really_inline simd8<uint8_t>
+ saturating_add(const simd8<uint8_t> other) const {
+ return (__m128i)vec_adds(this->value, (__m128i)other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ saturating_sub(const simd8<uint8_t> other) const {
+ return (__m128i)vec_subs(this->value, (__m128i)other);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd8<uint8_t>
+ max_val(const simd8<uint8_t> other) const {
+ return (__m128i)vec_max(this->value, (__m128i)other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ min_val(const simd8<uint8_t> other) const {
+ return (__m128i)vec_min(this->value, (__m128i)other);
+ }
+ // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ gt_bits(const simd8<uint8_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ lt_bits(const simd8<uint8_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint8_t> other) const {
+ return other.max_val(*this) == other; // max(other, this) == other exactly when this <= other
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint8_t> other) const {
+ return other.min_val(*this) == other; // min(other, this) == other exactly when this >= other
+ }
+ simdutf_really_inline simd8<bool>
+ operator>(const simd8<uint8_t> other) const {
+ return this->gt_bits(other).any_bits_set(); // widen "nonzero" lanes to a full true mask
+ }
+ simdutf_really_inline simd8<bool>
+ operator<(const simd8<uint8_t> other) const {
+ return this->lt_bits(other).any_bits_set(); // must use lt_bits: gt_bits here would make < behave like >
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd8<bool> bits_not_set() const {
+ return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
+ }
+ simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
+ return (*this & bits).bits_not_set();
+ }
+ simdutf_really_inline simd8<bool> any_bits_set() const {
+ return ~this->bits_not_set();
+ }
+ simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+ return ~this->bits_not_set(bits);
+ }
+
+ simdutf_really_inline bool is_ascii() const {
+ return this->saturating_sub(0b01111111u).bits_not_set_anywhere(); // a nonzero lane after subtracting 0x7F means a byte > 0x7F
+ }
+
+ simdutf_really_inline bool bits_not_set_anywhere() const {
+ return vec_all_eq(this->value, (__m128i)vec_splats(0));
+ }
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return !bits_not_set_anywhere();
+ }
+ simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
+ return vec_all_eq(vec_and(this->value, (__m128i)bits),
+ (__m128i)vec_splats(0));
+ }
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+ return !bits_not_set_anywhere(bits);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+ return simd8<uint8_t>(
+ (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); // per-byte right shift by N
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+ return simd8<uint8_t>(
+ (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N))); // per-byte left shift by N
+ }
+};
+
+template <typename T> struct simd8x64 { // one 64-byte block held as four simd8<T> chunks
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+ static_assert(NUM_CHUNKS == 4,
+ "PPC64 kernel should use four registers per 64-byte block.");
+ simd8<T> chunks[NUM_CHUNKS];
+
+ simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+ simd8x64<T> &
+ operator=(const simd8<T> other) = delete; // no assignment from a single simd8<T>
+ simd8x64() = delete; // no default constructor allowed
+
+ simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+ const simd8<T> chunk2, const simd8<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+ simdutf_really_inline simd8x64(const T *ptr) // loads four consecutive chunks starting at ptr
+ : chunks{simd8<T>::load(ptr),
+ simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}
+
+ simdutf_really_inline void store(T *ptr) const { // writes the four chunks back to back
+ this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+ this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+ this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+ }
+
+ simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
+ this->chunks[0] |= other.chunks[0];
+ this->chunks[1] |= other.chunks[1];
+ this->chunks[2] |= other.chunks[2];
+ this->chunks[3] |= other.chunks[3];
+ return *this;
+ }
+
+ simdutf_really_inline simd8<T> reduce_or() const { // bitwise OR of all four chunks
+ return (this->chunks[0] | this->chunks[1]) |
+ (this->chunks[2] | this->chunks[3]);
+ }
+
+ simdutf_really_inline bool is_ascii() const {
+ return this->reduce_or().is_ascii(); // any bit set in some chunk survives the OR, so one check covers all four
+ }
+
+ simdutf_really_inline uint64_t to_bitmask() const { // chunk i contributes bits 16*i .. 16*i+15
+ uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+ uint64_t r1 = this->chunks[1].to_bitmask();
+ uint64_t r2 = this->chunks[2].to_bitmask();
+ uint64_t r3 = this->chunks[3].to_bitmask();
+ return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+ }
+
+ simdutf_really_inline uint64_t eq(const T m) const { // bitmask of bytes equal to m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+ this->chunks[2] == mask, this->chunks[3] == mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const { // bitmask of positions where both blocks hold the same byte
+ return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+ this->chunks[1] == other.chunks[1],
+ this->chunks[2] == other.chunks[2],
+ this->chunks[3] == other.chunks[3])
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t lteq(const T m) const { // bitmask of bytes <= m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+ this->chunks[2] <= mask, this->chunks[3] <= mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const { // bitmask of bytes with low <= byte <= high
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+
+ return simd8x64<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+ (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+ (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // bitmask of bytes with byte > high or byte < low
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+ return simd8x64<bool>(
+ (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+ (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+ (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+ (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t lt(const T m) const { // bitmask of bytes < m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+ this->chunks[2] < mask, this->chunks[3] < mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t gt(const T m) const { // bitmask of bytes > m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
+ this->chunks[2] > mask, this->chunks[3] > mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq(const T m) const { // bitmask of bytes >= m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
+ this->chunks[2] >= mask, this->chunks[3] >= mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { // like gteq, but chunks are converted to simd8<uint8_t> before comparing
+ const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+ return simd8x64<bool>(simd8<uint8_t>(this->chunks[0]) >= mask,
+ simd8<uint8_t>(this->chunks[1]) >= mask,
+ simd8<uint8_t>(this->chunks[2]) >= mask,
+ simd8<uint8_t>(this->chunks[3]) >= mask)
+ .to_bitmask();
+ }
+}; // struct simd8x64<T>
+
+} // namespace simd
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_PPC64_SIMD_H
diff --git a/contrib/simdutf/src/simdutf/rvv.h b/contrib/simdutf/src/simdutf/rvv.h
new file mode 100644
index 000000000..4792de10f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/rvv.h
@@ -0,0 +1,41 @@
+#ifndef SIMDUTF_RVV_H
+#define SIMDUTF_RVV_H
+
+#ifdef SIMDUTF_FALLBACK_H
+ #error "rvv.h must be included before fallback.h"
+#endif // SIMDUTF_FALLBACK_H
+
+#include "simdutf/portability.h"
+// When SIMDUTF_IS_RVV is set, SIMDUTF_TARGET_RVV below expands to nothing.
+#define SIMDUTF_CAN_ALWAYS_RUN_RVV SIMDUTF_IS_RVV
+// Overridable default: enabled if RVV can always run, or on riscv64 with RVV intrinsics and target regions.
+#ifndef SIMDUTF_IMPLEMENTATION_RVV
+ #define SIMDUTF_IMPLEMENTATION_RVV \
+ (SIMDUTF_CAN_ALWAYS_RUN_RVV || \
+ (SIMDUTF_IS_RISCV64 && SIMDUTF_HAS_RVV_INTRINSICS && \
+ SIMDUTF_HAS_RVV_TARGET_REGION))
+#endif // SIMDUTF_IMPLEMENTATION_RVV default
+
+#if SIMDUTF_IMPLEMENTATION_RVV
+
+ #if SIMDUTF_CAN_ALWAYS_RUN_RVV
+ #define SIMDUTF_TARGET_RVV
+ #else // V must be enabled through a target region
+ #define SIMDUTF_TARGET_RVV SIMDUTF_TARGET_REGION("arch=+v")
+ #endif // SIMDUTF_CAN_ALWAYS_RUN_RVV
+ #if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS
+ #define SIMDUTF_TARGET_ZVBB SIMDUTF_TARGET_REGION("arch=+v,+zvbb")
+ #endif // !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS
+
+namespace simdutf {
+namespace rvv {} // namespace rvv
+} // namespace simdutf
+// implementation.h is included before rvv/begin.h, i.e. outside the target region that begin.h may open.
+ #include "simdutf/rvv/implementation.h"
+ #include "simdutf/rvv/begin.h"
+ #include "simdutf/rvv/intrinsics.h"
+ #include "simdutf/rvv/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_RVV
+
+#endif // SIMDUTF_RVV_H
diff --git a/contrib/simdutf/src/simdutf/rvv/begin.h b/contrib/simdutf/src/simdutf/rvv/begin.h
new file mode 100644
index 000000000..1eed366cf
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/rvv/begin.h
@@ -0,0 +1,7 @@
+#define SIMDUTF_IMPLEMENTATION rvv
+// Names the backend and, unless RVV can always run, opens its target region.
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// Nothing needed: rvv.h defines SIMDUTF_TARGET_RVV as empty in this case.
+#else
+SIMDUTF_TARGET_RVV
+#endif // SIMDUTF_CAN_ALWAYS_RUN_RVV
diff --git a/contrib/simdutf/src/simdutf/rvv/end.h b/contrib/simdutf/src/simdutf/rvv/end.h
new file mode 100644
index 000000000..39efe3323
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/rvv/end.h
@@ -0,0 +1,7 @@
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// Nothing needed: rvv/begin.h opened no target region in this case.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif // SIMDUTF_CAN_ALWAYS_RUN_RVV
+// Drop the backend name defined by rvv/begin.h.
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/rvv/implementation.h b/contrib/simdutf/src/simdutf/rvv/implementation.h
new file mode 100644
index 000000000..e3757285e
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/rvv/implementation.h
@@ -0,0 +1,234 @@
+#ifndef SIMDUTF_RVV_IMPLEMENTATION_H
+#define SIMDUTF_RVV_IMPLEMENTATION_H
+
+#include "simdutf.h"
+#include "simdutf/internal/isadetection.h"
+
+namespace simdutf {
+namespace rvv {
+
+namespace {
+using namespace simdutf;
+} // namespace
+
+class implementation final : public simdutf::implementation { // "rvv" backend declarations
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("rvv", "RISC-V Vector Extension",
+ internal::instruction_set::RVV),
+ _supports_zvbb(internal::detect_supported_architectures() &
+ internal::instruction_set::ZVBB) {} // ZVBB bit of the detected set, cached once
+ simdutf_warn_unused int detect_encodings(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t len,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *buf, size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *buf, size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf16le(const char16_t *buf, size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf16be(const char16_t *buf, size_t len) const noexcept;
+ simdutf_warn_unused size_t utf16_length_from_utf8(const char *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf8(const char *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t len) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t len) const noexcept;
+ simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf,
+ size_t len) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+
+private:
+ const bool _supports_zvbb; // set in the constructor from the detected instruction sets
+ // supports_zvbb(): true under SIMDUTF_IS_ZVBB, the cached flag if only the intrinsics exist, else false.
+#if SIMDUTF_IS_ZVBB
+ bool supports_zvbb() const { return true; }
+#elif SIMDUTF_HAS_ZVBB_INTRINSICS
+ bool supports_zvbb() const { return _supports_zvbb; }
+#else
+ bool supports_zvbb() const { return false; }
+#endif // SIMDUTF_IS_ZVBB / SIMDUTF_HAS_ZVBB_INTRINSICS
+};
+
+} // namespace rvv
+} // namespace simdutf
+
+#endif // SIMDUTF_RVV_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/rvv/intrinsics.h b/contrib/simdutf/src/simdutf/rvv/intrinsics.h
new file mode 100644
index 000000000..9f927739f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/rvv/intrinsics.h
@@ -0,0 +1,131 @@
+#ifndef SIMDUTF_RVV_INTRINSICS_H
+#define SIMDUTF_RVV_INTRINSICS_H
+
+#include "simdutf.h"
+
+#include <riscv_vector.h>
+
+#if __riscv_v_intrinsic >= 1000000 || __GNUC__ >= 14 // vcreate-based variants
+  #define simdutf_vrgather_u8m1x2(tbl, idx) \
+    __riscv_vcreate_v_u8m1_u8m2( \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), \
+                                 __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), \
+                                 __riscv_vsetvlmax_e8m1()))
+// x4: the same u8m1 table gather, applied to each of the four m1 parts of idx.
+  #define simdutf_vrgather_u8m1x4(tbl, idx) \
+    __riscv_vcreate_v_u8m1_u8m4( \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), \
+                                 __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1), \
+                                 __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), \
+                                 __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), \
+                                 __riscv_vsetvlmax_e8m1()))
+#else // toolchains not matched above: assemble the result with vlmul_ext + vset
+  // This has worse codegen on gcc
+  #define simdutf_vrgather_u8m1x2(tbl, idx) \
+    __riscv_vset_v_u8m1_u8m2( \
+        __riscv_vlmul_ext_v_u8m1_u8m2(__riscv_vrgather_vv_u8m1( \
+            tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), \
+        1, \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), \
+                                 __riscv_vsetvlmax_e8m1()))
+
+  #define simdutf_vrgather_u8m1x4(tbl, idx) \
+    __riscv_vset_v_u8m1_u8m4( \
+        __riscv_vset_v_u8m1_u8m4( \
+            __riscv_vset_v_u8m1_u8m4( \
+                __riscv_vlmul_ext_v_u8m1_u8m4(__riscv_vrgather_vv_u8m1( \
+                    tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), \
+                    __riscv_vsetvlmax_e8m1())), \
+                1, \
+                __riscv_vrgather_vv_u8m1(tbl, \
+                                         __riscv_vget_v_u8m4_u8m1(idx, 1), \
+                                         __riscv_vsetvlmax_e8m1())), \
+            2, \
+            __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), \
+                                     __riscv_vsetvlmax_e8m1())), \
+        3, \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), \
+                                 __riscv_vsetvlmax_e8m1()))
+#endif // every variant expands to a plain expression (no trailing ';')
+
+/* Zvbb adds dedicated support for endianness swaps with vrev8, but if we can't
+ * use that, we have to emulate it with the standard V extension.
+ * Using LMUL=1 vrgathers could be faster than the srl+macc variant, but that
+ * would increase register pressure, and the performance of vrgather
+ * implementations varies a lot. */
+enum class simdutf_ByteFlip { NONE, V, ZVBB }; // no swap / srl+macc / vrev8
+
+template <simdutf_ByteFlip method>
+simdutf_really_inline static uint16_t simdutf_byteflip(uint16_t v) {
+  // NONE returns v untouched; V and ZVBB both swap the two bytes of v.
+  const unsigned wide = v; // shift in unsigned int, then narrow back
+  return method == simdutf_ByteFlip::NONE ? v : (uint16_t)(wide << 8 | wide >> 8);
+}
+
+#ifdef SIMDUTF_TARGET_ZVBB // rvv.h defines it only if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS
+SIMDUTF_UNTARGET_REGION // NOTE(review): rvv/begin.h opens no region when SIMDUTF_CAN_ALWAYS_RUN_RVV - confirm this close is harmless then
+SIMDUTF_TARGET_ZVBB // the vector byteflip overloads below are compiled with "arch=+v,+zvbb"
+#endif // SIMDUTF_TARGET_ZVBB
+
+template <simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m1_t simdutf_byteflip(vuint16m1_t v,
+                                                          size_t vl) {
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m1(v, vl);
+#endif
+  if (method != simdutf_ByteFlip::V)
+    return v; // NONE, or ZVBB requested without Zvbb intrinsics: no swap
+  const vuint16m1_t shifted = __riscv_vsrl_vx_u16m1(v, 8, vl);
+  return __riscv_vmacc_vx_u16m1(shifted, 0x100, v, vl); // shifted + 0x100 * v
+}
+
+template <simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m2_t simdutf_byteflip(vuint16m2_t v,
+                                                          size_t vl) {
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m2(v, vl);
+#endif
+  if (method != simdutf_ByteFlip::V)
+    return v; // NONE, or ZVBB requested without Zvbb intrinsics: no swap
+  const vuint16m2_t shifted = __riscv_vsrl_vx_u16m2(v, 8, vl);
+  return __riscv_vmacc_vx_u16m2(shifted, 0x100, v, vl); // shifted + 0x100 * v
+}
+
+template <simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m4_t simdutf_byteflip(vuint16m4_t v,
+                                                          size_t vl) {
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m4(v, vl);
+#endif
+  if (method != simdutf_ByteFlip::V)
+    return v; // NONE, or ZVBB requested without Zvbb intrinsics: no swap
+  const vuint16m4_t shifted = __riscv_vsrl_vx_u16m4(v, 8, vl);
+  return __riscv_vmacc_vx_u16m4(shifted, 0x100, v, vl); // shifted + 0x100 * v
+}
+
+template <simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m8_t simdutf_byteflip(vuint16m8_t v,
+                                                          size_t vl) {
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m8(v, vl);
+#endif
+  if (method != simdutf_ByteFlip::V)
+    return v; // NONE, or ZVBB requested without Zvbb intrinsics: no swap
+  const vuint16m8_t shifted = __riscv_vsrl_vx_u16m8(v, 8, vl);
+  return __riscv_vmacc_vx_u16m8(shifted, 0x100, v, vl); // shifted + 0x100 * v
+}
+
+#ifdef SIMDUTF_TARGET_ZVBB
+SIMDUTF_UNTARGET_REGION // closes the SIMDUTF_TARGET_ZVBB region opened before the vector byteflip overloads
+SIMDUTF_TARGET_RVV // back to the plain RVV target (empty when SIMDUTF_CAN_ALWAYS_RUN_RVV, see rvv.h)
+#endif // SIMDUTF_TARGET_ZVBB
+
+#endif // SIMDUTF_RVV_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/westmere.h b/contrib/simdutf/src/simdutf/westmere.h
new file mode 100644
index 000000000..c46ddf513
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere.h
@@ -0,0 +1,59 @@
+#ifndef SIMDUTF_WESTMERE_H
+#define SIMDUTF_WESTMERE_H
+
+#ifdef SIMDUTF_FALLBACK_H
+ #error "westmere.h must be included before fallback.h"
+#endif // SIMDUTF_FALLBACK_H
+
+#include "simdutf/portability.h"
+
+// Default Westmere to on if this is x86-64, unless we'll always select Icelake or Haswell.
+#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
+ //
+ // You do not want to set it to (SIMDUTF_IS_X86_64 &&
+ // !SIMDUTF_REQUIRES_HASWELL) because you want to rely on runtime dispatch!
+ //
+ #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+ #define SIMDUTF_IMPLEMENTATION_WESTMERE 0
+ #else
+ #define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
+ #endif // SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+
+#endif // SIMDUTF_IMPLEMENTATION_WESTMERE default
+// "Always run" requires the backend to be enabled on x86-64 and __SSE4_2__ to be set by the compiler.
+#if (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
+ #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 1
+#else
+ #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 0
+#endif
+
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+
+ #define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
+
+namespace simdutf {
+/**
+ * Implementation for Westmere (Intel SSE4.2).
+ */
+namespace westmere {} // namespace westmere
+} // namespace simdutf
+
+ //
+ // These two need to be included outside SIMDUTF_TARGET_REGION
+ //
+ #include "simdutf/westmere/implementation.h"
+ #include "simdutf/westmere/intrinsics.h"
+
+ //
+ // The rest need to be inside the region
+ //
+ #include "simdutf/westmere/begin.h"
+
+ // Declarations
+ #include "simdutf/westmere/bitmanipulation.h"
+ #include "simdutf/westmere/simd.h"
+
+ #include "simdutf/westmere/end.h"
+
+#endif // SIMDUTF_IMPLEMENTATION_WESTMERE
+#endif // SIMDUTF_WESTMERE_H
diff --git a/contrib/simdutf/src/simdutf/westmere/begin.h b/contrib/simdutf/src/simdutf/westmere/begin.h
new file mode 100644
index 000000000..9c51608f3
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/begin.h
@@ -0,0 +1,7 @@
+#define SIMDUTF_IMPLEMENTATION westmere
+// Names the backend and, unless Westmere can always run, opens its target region.
+#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
+// nothing needed.
+#else
+SIMDUTF_TARGET_WESTMERE
+#endif // SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
diff --git a/contrib/simdutf/src/simdutf/westmere/bitmanipulation.h b/contrib/simdutf/src/simdutf/westmere/bitmanipulation.h
new file mode 100644
index 000000000..7190ef97f
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/bitmanipulation.h
@@ -0,0 +1,35 @@
+#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
+#define SIMDUTF_WESTMERE_BITMANIPULATION_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO // count_ones: 64-bit popcount of input_num via the compiler intrinsic
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
+ // note: we do not support legacy 32-bit Windows
+ return __popcnt64(input_num); // Visual Studio wants two underscores
+}
+#else // other compilers: single-underscore intrinsic; note the different return type
+simdutf_really_inline long long int count_ones(uint64_t input_num) {
+ return _popcnt64(input_num);
+}
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+
+#if SIMDUTF_NEED_TRAILING_ZEROES // index of the lowest set bit of input_num
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO // same test form as count_ones above
+  unsigned long ret; // left unset by _BitScanForward64 when input_num == 0
+  _BitScanForward64(&ret, input_num);
+  return (int)ret;
+  #else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  return __builtin_ctzll(input_num); // result is undefined for input_num == 0
+  #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif // SIMDUTF_NEED_TRAILING_ZEROES
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
diff --git a/contrib/simdutf/src/simdutf/westmere/end.h b/contrib/simdutf/src/simdutf/westmere/end.h
new file mode 100644
index 000000000..ee2b8315a
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/end.h
@@ -0,0 +1,7 @@
+#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
+// Nothing needed: westmere/begin.h opened no target region in this case.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif // SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
+// Drop the backend name defined by westmere/begin.h.
+#undef SIMDUTF_IMPLEMENTATION
diff --git a/contrib/simdutf/src/simdutf/westmere/implementation.h b/contrib/simdutf/src/simdutf/westmere/implementation.h
new file mode 100644
index 000000000..039c4061b
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/implementation.h
@@ -0,0 +1,222 @@
+#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
+#define SIMDUTF_WESTMERE_IMPLEMENTATION_H
+
+#include "simdutf/implementation.h"
+
+// The constructor may be executed on any host, so we take care not to use
+// SIMDUTF_TARGET_REGION
+namespace simdutf {
+namespace westmere {
+
+namespace {
+using namespace simdutf;
+}
+
+class implementation final : public simdutf::implementation { // "westmere" backend declarations
+public:
+ simdutf_really_inline implementation()
+ : simdutf::implementation("westmere", "Intel/AMD SSE4.2",
+ internal::instruction_set::SSE42) {} // registers name, description and required instruction set
+ simdutf_warn_unused int detect_encodings(const char *input,
+ size_t length) const noexcept final;
+ simdutf_warn_unused bool validate_utf8(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_ascii(const char *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result
+ validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+ size_t len) const noexcept final;
+ simdutf_warn_unused result validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_latin1_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+ const char *buf, size_t len, char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+ const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+ const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+ const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused result
+ convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+ char16_t *utf16_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ simdutf_warn_unused size_t
+ convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_buffer) const noexcept final;
+ void change_endianness_utf16(const char16_t *buf, size_t length,
+ char16_t *output) const noexcept final;
+ simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t count_utf8(const char *buf,
+ size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16le(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t utf32_length_from_utf16be(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf16(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ latin1_length_from_utf32(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf32_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf16_length_from_latin1(size_t length) const noexcept;
+ simdutf_warn_unused size_t
+ utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char *input, size_t length) const noexcept;
+ simdutf_warn_unused result base64_to_binary(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char *input, size_t length, char *output, base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t maximal_binary_length_from_base64(
+ const char16_t *input, size_t length) const noexcept;
+ simdutf_warn_unused result
+ base64_to_binary(const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused full_result base64_to_binary_details(
+ const char16_t *input, size_t length, char *output,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options =
+ last_chunk_handling_options::loose) const noexcept;
+ simdutf_warn_unused size_t base64_length_from_binary(
+ size_t length, base64_options options) const noexcept;
+ size_t binary_to_base64(const char *input, size_t length, char *output,
+ base64_options options) const noexcept;
+};
+
+} // namespace westmere
+} // namespace simdutf
+
+#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
diff --git a/contrib/simdutf/src/simdutf/westmere/intrinsics.h b/contrib/simdutf/src/simdutf/westmere/intrinsics.h
new file mode 100644
index 000000000..54fc22b9c
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/intrinsics.h
@@ -0,0 +1,38 @@
+#ifndef SIMDUTF_WESTMERE_INTRINSICS_H
+#define SIMDUTF_WESTMERE_INTRINSICS_H
+
+#ifdef SIMDUTF_VISUAL_STUDIO
+ // under clang within visual studio, this will include <x86intrin.h>
+ #include <intrin.h> // visual studio or clang
+#else
+
+ #if SIMDUTF_GCC11ORMORE
+// We should not get warnings while including <x86intrin.h> yet we do
+// under some versions of GCC.
+// If the x86intrin.h header has uninitialized values that are problematic,
+// it is a GCC issue, we want to ignore these warnings.
+SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
+ #endif
+
+ #include <x86intrin.h> // elsewhere
+
+ #if SIMDUTF_GCC11ORMORE
+// cancels the suppression of the -Wuninitialized
+SIMDUTF_POP_DISABLE_WARNINGS
+ #endif
+
+#endif // SIMDUTF_VISUAL_STUDIO
+
+#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
+ /**
+ * You are not supposed, normally, to include these
+ * headers directly. Instead you should either include intrin.h
+ * or x86intrin.h. However, when compiling with clang
+ * under Windows (i.e., when _MSC_VER is set), these headers
+ * only get included *if* the corresponding features are detected
+ * from macros:
+ */
+ #include <smmintrin.h> // for _mm_alignr_epi8
+#endif
+
+#endif // SIMDUTF_WESTMERE_INTRINSICS_H
diff --git a/contrib/simdutf/src/simdutf/westmere/simd.h b/contrib/simdutf/src/simdutf/westmere/simd.h
new file mode 100644
index 000000000..503cb8861
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/simd.h
@@ -0,0 +1,593 @@
+#ifndef SIMDUTF_WESTMERE_SIMD_H
+#define SIMDUTF_WESTMERE_SIMD_H
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+namespace simd {
+
+template <typename Child> struct base { // wraps one __m128i; Child is the type the bitwise operators below return
+ __m128i value; // the wrapped 128-bit register
+
+ // Zero constructor: value-initializes the register
+ simdutf_really_inline base() : value{__m128i()} {}
+
+ // Conversion from SIMD register
+ simdutf_really_inline base(const __m128i _value) : value(_value) {}
+ // Conversion to SIMD register (lets *this be passed directly to _mm_* intrinsics)
+ simdutf_really_inline operator const __m128i &() const { return this->value; }
+ simdutf_really_inline operator __m128i &() { return this->value; }
+ template <endianness big_endian> // NOTE(review): tested as a boolean below; presumably nonzero selects big-endian output - confirm against the endianness enum
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { // writes 16 char16_t (32 bytes) starting at p
+ __m128i first = _mm_cvtepu8_epi16(*this); // zero-extend bytes 0..7 to 16-bit lanes
+ __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8)); // zero-extend bytes 8..15
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // shuffle mask exchanging the two bytes of each 16-bit lane
+ first = _mm_shuffle_epi8(first, swap);
+ second = _mm_shuffle_epi8(second, swap);
+ }
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first); // unaligned store of units 0..7
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8), second); // unaligned store of units 8..15
+ }
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const { // writes 16 char32_t (64 bytes) starting at p
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this)); // bytes 0..3 zero-extended to 32 bits
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 4),
+ _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4))); // bytes 4..7
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8),
+ _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8))); // bytes 8..11
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 12),
+ _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12))); // bytes 12..15
+ }
+ // Bit operations (whole-register; results are returned as Child)
+ simdutf_really_inline Child operator|(const Child other) const {
+ return _mm_or_si128(*this, other);
+ }
+ simdutf_really_inline Child operator&(const Child other) const {
+ return _mm_and_si128(*this, other);
+ }
+ simdutf_really_inline Child operator^(const Child other) const {
+ return _mm_xor_si128(*this, other);
+ }
+ simdutf_really_inline Child bit_andnot(const Child other) const {
+ return _mm_andnot_si128(other, *this); // computes (*this) & ~other: the intrinsic negates its first argument
+ }
+ simdutf_really_inline Child &operator|=(const Child other) {
+ auto this_cast = static_cast<Child *>(this); // downcast to the derived type so the Child-typed result can be assigned
+ *this_cast = *this_cast | other;
+ return *this_cast;
+ }
+ simdutf_really_inline Child &operator&=(const Child other) {
+ auto this_cast = static_cast<Child *>(this); // same downcast as in operator|=
+ *this_cast = *this_cast & other;
+ return *this_cast;
+ }
+ simdutf_really_inline Child &operator^=(const Child other) {
+ auto this_cast = static_cast<Child *>(this); // same downcast as in operator|=
+ *this_cast = *this_cast ^ other;
+ return *this_cast;
+ }
+};
+
+// Forward-declared so they can be used by splat and friends.
+template <typename T> struct simd8;
+
+template <typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> { // shared behavior for a register viewed as sixteen 8-bit lanes of T
+ typedef uint16_t bitmask_t;
+ typedef uint32_t bitmask2_t;
+
+ simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); } // lane 0
+ simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); } // lane 15
+ simdutf_really_inline base8() : base<simd8<T>>() {}
+ simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+
+ friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+ const simd8<T> rhs) {
+ return _mm_cmpeq_epi8(lhs, rhs); // per-byte equality: 0xFF where equal, 0x00 elsewhere
+ }
+
+ static const int SIZE = sizeof(base<simd8<T>>::value); // size of the wrapped register in bytes
+
+ template <int N = 1>
+ simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+ return _mm_alignr_epi8(*this, prev_chunk, 16 - N); // last N bytes of prev_chunk followed by the first 16 - N bytes of *this
+ }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+template <> struct simd8<bool> : base8<bool> {
+ static simdutf_really_inline simd8<bool> splat(bool _value) {
+ return _mm_set1_epi8(uint8_t(-(!!_value))); // every byte becomes 0xFF for true, 0x00 for false
+ }
+
+ simdutf_really_inline simd8() : base8() {}
+ simdutf_really_inline simd8(const __m128i _value) : base8<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
+
+ simdutf_really_inline int to_bitmask() const { // 16-bit mask: bit i is the top bit of byte i
+ return _mm_movemask_epi8(*this);
+ }
+ simdutf_really_inline bool any() const { // true when at least one bit of the register is set
+ return !_mm_testz_si128(*this, *this);
+ }
+ simdutf_really_inline bool none() const { // true when the whole register is zero
+ return _mm_testz_si128(*this, *this);
+ }
+ simdutf_really_inline bool all() const { // true when the top bit of all 16 bytes is set
+ return _mm_movemask_epi8(*this) == 0xFFFF;
+ }
+ simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; } // XOR with splat(true), i.e. all ones
+};
+
+template <typename T> struct base8_numeric : base8<T> { // operations shared by the signed and unsigned 8-bit lane types
+ static simdutf_really_inline simd8<T> splat(T _value) { // broadcast _value to all 16 lanes
+ return _mm_set1_epi8(_value);
+ }
+ static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
+ static simdutf_really_inline simd8<T> load(const T values[16]) { // unaligned 16-byte load
+ return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+ }
+ // Repeat 16 values as many times as necessary (usually for lookup tables);
+ // a 128-bit register holds exactly one copy, so this forwards to the 16-value constructor
+ static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+ T v5, T v6, T v7, T v8, T v9,
+ T v10, T v11, T v12, T v13,
+ T v14, T v15) {
+ return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+ v14, v15);
+ }
+
+ simdutf_really_inline base8_numeric() : base8<T>() {}
+ simdutf_really_inline base8_numeric(const __m128i _value)
+ : base8<T>(_value) {}
+
+ // Store to array (unaligned, 16 bytes)
+ simdutf_really_inline void store(T dst[16]) const {
+ return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
+ }
+
+ // Override to distinguish from bool version
+ simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; } // 0xFFu is converted to simd8<T> (splat), so every byte is complemented
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
+ return _mm_add_epi8(*this, other); // wrapping per-byte add
+ }
+ simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
+ return _mm_sub_epi8(*this, other); // wrapping per-byte subtract
+ }
+ simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
+ *this = *this + other;
+ return *static_cast<simd8<T> *>(this);
+ }
+ simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
+ *this = *this - other;
+ return *static_cast<simd8<T> *>(this);
+ }
+
+ // Perform a lookup assuming the value is between 0 and 15 (undefined behavior
+ // for out of range values); each lane of *this selects a byte of lookup_table
+ template <typename L>
+ simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+ return _mm_shuffle_epi8(lookup_table, *this);
+ }
+
+ template <typename L>
+ simdutf_really_inline simd8<L>
+ lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+ L replace5, L replace6, L replace7, L replace8, L replace9,
+ L replace10, L replace11, L replace12, L replace13, L replace14,
+ L replace15) const { // convenience overload: builds the table from 16 scalars, then looks up
+ return lookup_16(simd8<L>::repeat_16(
+ replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+ replace7, replace8, replace9, replace10, replace11, replace12,
+ replace13, replace14, replace15));
+ }
+};
+
+// Signed bytes
+template <> struct simd8<int8_t> : base8_numeric<int8_t> {
+ simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+ simdutf_really_inline simd8(const __m128i _value)
+ : base8_numeric<int8_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+ // Array constructor (unaligned load of 16 values)
+ simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization (v0 becomes lane 0)
+ simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+ int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+ int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+ : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15)) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<int8_t>
+ repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+ int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+ int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+ return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+ simdutf_really_inline operator simd8<uint8_t>() const; // declared here, defined out of line
+ simdutf_really_inline bool is_ascii() const { // true when no byte has its top bit set
+ return _mm_movemask_epi8(*this) == 0;
+ }
+
+ // Order-sensitive comparisons (signed per-byte)
+ simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+ return _mm_max_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+ return _mm_min_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+ return _mm_cmpgt_epi8(*this, other);
+ }
+ simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+ return _mm_cmpgt_epi8(other, *this); // a < b expressed as b > a
+ }
+};
+
+// Unsigned bytes
+template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+ simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+ simdutf_really_inline simd8(const __m128i _value)
+ : base8_numeric<uint8_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+ // Array constructor (unaligned load of 16 values)
+ simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization (v0 becomes lane 0)
+ simdutf_really_inline
+ simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+ uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+ uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+ : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15)) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd8<uint8_t>
+ repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+ uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+ uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+ uint8_t v15) {
+ return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+ }
+
+ // Saturated math (results clamp to [0, 255] instead of wrapping)
+ simdutf_really_inline simd8<uint8_t>
+ saturating_add(const simd8<uint8_t> other) const {
+ return _mm_adds_epu8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ saturating_sub(const simd8<uint8_t> other) const {
+ return _mm_subs_epu8(*this, other);
+ }
+
+ // Order-specific operations (unsigned per-byte)
+ simdutf_really_inline simd8<uint8_t>
+ max_val(const simd8<uint8_t> other) const {
+ return _mm_max_epu8(*this, other);
+ }
+ simdutf_really_inline simd8<uint8_t>
+ min_val(const simd8<uint8_t> other) const {
+ return _mm_min_epu8(*this, other);
+ }
+ // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ gt_bits(const simd8<uint8_t> other) const {
+ return this->saturating_sub(other); // nonzero exactly where *this > other
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd8<uint8_t>
+ lt_bits(const simd8<uint8_t> other) const {
+ return other.saturating_sub(*this); // nonzero exactly where *this < other
+ }
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint8_t> other) const {
+ return other.max_val(*this) == other; // max(a, b) == b exactly when a <= b
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint8_t> other) const {
+ return other.min_val(*this) == other; // min(a, b) == b exactly when a >= b
+ }
+ simdutf_really_inline simd8<bool>
+ operator>(const simd8<uint8_t> other) const {
+ return this->gt_bits(other).any_bits_set();
+ }
+ simdutf_really_inline simd8<bool>
+ operator<(const simd8<uint8_t> other) const {
+ return this->lt_bits(other).any_bits_set(); // was gt_bits, which made operator< return the same mask as operator>
+ }
+
+ // Bit-specific operations (per-lane masks)
+ simdutf_really_inline simd8<bool> bits_not_set() const {
+ return *this == uint8_t(0);
+ }
+ simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
+ return (*this & bits).bits_not_set();
+ }
+ simdutf_really_inline simd8<bool> any_bits_set() const {
+ return ~this->bits_not_set();
+ }
+ simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+ return ~this->bits_not_set(bits);
+ }
+ simdutf_really_inline bool is_ascii() const { // true when no byte has its top bit set
+ return _mm_movemask_epi8(*this) == 0;
+ }
+
+ simdutf_really_inline bool bits_not_set_anywhere() const { // whole register is zero
+ return _mm_testz_si128(*this, *this);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return !bits_not_set_anywhere();
+ }
+ simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { // (*this & bits) is zero
+ return _mm_testz_si128(*this, bits);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+ return !bits_not_set_anywhere(bits);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shr() const { // 16-bit shift, then mask off bits that crossed in from the neighbouring byte
+ return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N);
+ }
+ template <int N> simdutf_really_inline simd8<uint8_t> shl() const { // 16-bit shift, then mask off bits that crossed in from the neighbouring byte
+ return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N);
+ }
+ // Get one of the bits and make a bitmask out of it.
+ // e.g. value.get_bit<7>() gets the high bit
+ template <int N> simdutf_really_inline int get_bit() const {
+ return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); // move bit N of each byte to the top bit, then gather the top bits
+ }
+};
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { // reinterpret the same 128 bits as unsigned lanes
+ return this->value;
+}
+
+// Unsigned 16-bit lanes (eight uint16_t values per register)
+template <> struct simd8<uint16_t> : base<uint16_t> { // NOTE(review): derives from base<uint16_t>, so the inherited bitwise operators are declared to return uint16_t; only the overrides below look usable - confirm
+ static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { // broadcast to all eight 16-bit lanes
+ return _mm_set1_epi16(_value);
+ }
+ static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) { // unaligned 16-byte load
+ return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+ }
+
+ simdutf_really_inline simd8() : base<uint16_t>() {}
+ simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd8(const uint16_t *values) : simd8(load(values)) {}
+ // Member-by-member initialization (v0 becomes lane 0)
+ simdutf_really_inline simd8(uint16_t v0, uint16_t v1, uint16_t v2,
+ uint16_t v3, uint16_t v4, uint16_t v5,
+ uint16_t v6, uint16_t v7)
+ : simd8(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+ // Saturated math (results clamp to [0, 65535] instead of wrapping)
+ simdutf_really_inline simd8<uint16_t>
+ saturating_add(const simd8<uint16_t> other) const {
+ return _mm_adds_epu16(*this, other);
+ }
+ simdutf_really_inline simd8<uint16_t>
+ saturating_sub(const simd8<uint16_t> other) const {
+ return _mm_subs_epu16(*this, other);
+ }
+
+ // Order-specific operations (unsigned per 16-bit lane)
+ simdutf_really_inline simd8<uint16_t>
+ max_val(const simd8<uint16_t> other) const {
+ return _mm_max_epu16(*this, other);
+ }
+ simdutf_really_inline simd8<uint16_t>
+ min_val(const simd8<uint16_t> other) const {
+ return _mm_min_epu16(*this, other);
+ }
+ // Same as >, but only guarantees true is nonzero
+ simdutf_really_inline simd8<uint16_t>
+ gt_bits(const simd8<uint16_t> other) const {
+ return this->saturating_sub(other); // nonzero exactly where *this > other
+ }
+ // Same as <, but only guarantees true is nonzero
+ simdutf_really_inline simd8<uint16_t>
+ lt_bits(const simd8<uint16_t> other) const {
+ return other.saturating_sub(*this); // nonzero exactly where *this < other
+ }
+ simdutf_really_inline simd8<bool>
+ operator<=(const simd8<uint16_t> other) const {
+ return other.max_val(*this) == other; // max(a, b) == b exactly when a <= b
+ }
+ simdutf_really_inline simd8<bool>
+ operator>=(const simd8<uint16_t> other) const {
+ return other.min_val(*this) == other; // min(a, b) == b exactly when a >= b
+ }
+ simdutf_really_inline simd8<bool>
+ operator==(const simd8<uint16_t> other) const {
+ return _mm_cmpeq_epi16(*this, other); // per-lane equality: 0xFFFF where equal, 0 elsewhere
+ }
+ simdutf_really_inline simd8<bool>
+ operator&(const simd8<uint16_t> other) const { // NOTE(review): plain bitwise AND returned as a mask type - confirm this is intended
+ return _mm_and_si128(*this, other);
+ }
+ simdutf_really_inline simd8<bool>
+ operator|(const simd8<uint16_t> other) const { // NOTE(review): plain bitwise OR returned as a mask type - confirm this is intended
+ return _mm_or_si128(*this, other);
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd8<bool> bits_not_set() const {
+ return *this == uint16_t(0);
+ }
+ simdutf_really_inline simd8<bool> any_bits_set() const {
+ return ~this->bits_not_set();
+ }
+
+ simdutf_really_inline bool bits_not_set_anywhere() const { // whole register is zero
+ return _mm_testz_si128(*this, *this);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return !bits_not_set_anywhere();
+ }
+ simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { // (*this & bits) is zero
+ return _mm_testz_si128(*this, bits);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const {
+ return !bits_not_set_anywhere(bits);
+ }
+};
+template <typename T> struct simd8x64 { // a 64-byte block held as four simd8<T> registers
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+ static_assert(NUM_CHUNKS == 4,
+ "Westmere kernel should use four registers per 64-byte block.");
+ simd8<T> chunks[NUM_CHUNKS];
+
+ simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+ simd8x64<T> &
+ operator=(const simd8<T> other) = delete; // no assignment allowed (NOTE(review): the deleted overload takes a simd8<T>, not a simd8x64<T> - confirm intent)
+ simd8x64() = delete; // no default constructor allowed
+
+ simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+ const simd8<T> chunk2, const simd8<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+ simdutf_really_inline simd8x64(const T *ptr) // loads 64 consecutive bytes starting at ptr
+ : chunks{simd8<T>::load(ptr),
+ simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
+ simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}
+
+ simdutf_really_inline void store(T *ptr) const { // writes the four chunks back to back (64 bytes)
+ this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+ this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+ this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+ this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+ }
+
+ simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) { // chunk-wise OR
+ this->chunks[0] |= other.chunks[0];
+ this->chunks[1] |= other.chunks[1];
+ this->chunks[2] |= other.chunks[2];
+ this->chunks[3] |= other.chunks[3];
+ return *this;
+ }
+
+ simdutf_really_inline simd8<T> reduce_or() const { // OR of all four chunks into one register
+ return (this->chunks[0] | this->chunks[1]) |
+ (this->chunks[2] | this->chunks[3]);
+ }
+
+ simdutf_really_inline bool is_ascii() const { // no byte of the block has its top bit set
+ return this->reduce_or().is_ascii();
+ }
+
+ template <endianness endian>
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { // each chunk writes 16 char16_t; offsets are in char16_t units and sizeof(simd8<T>) is 16
+ this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 0);
+ this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 1);
+ this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 2);
+ this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
+ sizeof(simd8<T>) * 3);
+ }
+
+ simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { // each chunk writes 16 char32_t; offsets are in char32_t units
+ this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+ this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+ this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
+ this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
+ }
+
+ simdutf_really_inline uint64_t to_bitmask() const { // concatenates the four 16-bit chunk masks; chunk 0 lands in the low bits
+ uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+ uint64_t r1 = this->chunks[1].to_bitmask();
+ uint64_t r2 = this->chunks[2].to_bitmask();
+ uint64_t r3 = this->chunks[3].to_bitmask();
+ return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+ }
+
+ simdutf_really_inline uint64_t eq(const T m) const { // bitmask of bytes equal to m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+ this->chunks[2] == mask, this->chunks[3] == mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const { // bitmask of positions where the two blocks hold equal bytes
+ return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+ this->chunks[1] == other.chunks[1],
+ this->chunks[2] == other.chunks[2],
+ this->chunks[3] == other.chunks[3])
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t lteq(const T m) const { // bitmask of bytes <= m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+ this->chunks[2] <= mask, this->chunks[3] <= mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const { // bitmask of bytes in the inclusive range [low, high]
+ const simd8<T> mask_low = simd8<T>::splat(low);
+ const simd8<T> mask_high = simd8<T>::splat(high);
+
+ return simd8x64<bool>(
+ (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+ (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+ (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+ (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // bitmask of bytes outside [low, high]
+ const simd8<T> mask_low = simd8<T>::splat(low - 1); // NOTE(review): converted back to T by splat, so low at T's minimum would wrap - confirm callers avoid it
+ const simd8<T> mask_high = simd8<T>::splat(high + 1); // NOTE(review): likewise wraps when high is T's maximum
+ return simd8x64<bool>(
+ (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+ (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+ (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+ (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t lt(const T m) const { // bitmask built from simd8<T>::operator< against m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+ this->chunks[2] < mask, this->chunks[3] < mask)
+ .to_bitmask();
+ }
+
+ simdutf_really_inline uint64_t gt(const T m) const { // bitmask of bytes > m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
+ this->chunks[2] > mask, this->chunks[3] > mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq(const T m) const { // bitmask of bytes >= m
+ const simd8<T> mask = simd8<T>::splat(m);
+ return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
+ this->chunks[2] >= mask, this->chunks[3] >= mask)
+ .to_bitmask();
+ }
+ simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { // like gteq, but reinterprets every chunk as unsigned bytes first
+ const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+ return simd8x64<bool>(simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
+ simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
+ simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
+ simd8<uint8_t>(__m128i(this->chunks[3])) >= mask)
+ .to_bitmask();
+ }
+}; // struct simd8x64<T>
+
+#include "simdutf/westmere/simd16-inl.h"
+
+} // namespace simd
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#endif // SIMDUTF_WESTMERE_SIMD_H
diff --git a/contrib/simdutf/src/simdutf/westmere/simd16-inl.h b/contrib/simdutf/src/simdutf/westmere/simd16-inl.h
new file mode 100644
index 000000000..646f4f165
--- /dev/null
+++ b/contrib/simdutf/src/simdutf/westmere/simd16-inl.h
@@ -0,0 +1,358 @@
+template <typename T> struct simd16;
+
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> { // shared behavior for a register viewed as eight 16-bit lanes of T
+ typedef uint16_t bitmask_t;
+ typedef uint32_t bitmask2_t;
+
+ simdutf_really_inline base16() : base<simd16<T>>() {}
+ simdutf_really_inline base16(const __m128i _value)
+ : base<simd16<T>>(_value) {}
+ template <typename Pointer>
+ simdutf_really_inline base16(const Pointer *ptr) // unaligned 16-byte load from any pointer type
+ : base16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}
+
+ friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
+ const simd16<T> rhs) {
+ return _mm_cmpeq_epi16(lhs, rhs); // per-lane equality: 0xFFFF where equal, 0 elsewhere
+ }
+
+ static const int SIZE = sizeof(base<simd16<T>>::value); // size of the wrapped register in bytes
+
+ template <int N = 1>
+ simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+ return _mm_alignr_epi8(*this, prev_chunk, 16 - N); // NOTE(review): the shift is in bytes, so N counts bytes rather than 16-bit lanes - confirm callers expect that
+ }
+};
+
+// SIMD 16-bit lane mask type (returned by things like == and >)
+template <> struct simd16<bool> : base16<bool> {
+ static simdutf_really_inline simd16<bool> splat(bool _value) {
+ return _mm_set1_epi16(uint16_t(-(!!_value))); // every lane becomes 0xFFFF for true, 0x0000 for false
+ }
+
+ simdutf_really_inline simd16() : base16() {}
+ simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+
+ simdutf_really_inline int to_bitmask() const { // byte-granular mask: one bit per byte, so each 16-bit lane contributes two bits
+ return _mm_movemask_epi8(*this);
+ }
+ simdutf_really_inline bool any() const { // true when at least one bit of the register is set
+ return !_mm_testz_si128(*this, *this);
+ }
+ simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; } // XOR with splat(true), i.e. all ones
+};
+
+template <typename T> struct base16_numeric : base16<T> { // operations shared by the signed and unsigned 16-bit lane types
+ static simdutf_really_inline simd16<T> splat(T _value) { // broadcast _value to all eight lanes
+ return _mm_set1_epi16(_value);
+ }
+ static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
+ static simdutf_really_inline simd16<T> load(const T values[8]) { // unaligned 16-byte load
+ return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+ }
+
+ simdutf_really_inline base16_numeric() : base16<T>() {}
+ simdutf_really_inline base16_numeric(const __m128i _value)
+ : base16<T>(_value) {}
+
+ // Store to array (unaligned, 16 bytes)
+ simdutf_really_inline void store(T dst[8]) const {
+ return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
+ }
+
+ // Override to distinguish from bool version; XOR with all-ones lanes (the old 0xFFu splat only flipped the low byte of each lane)
+ simdutf_really_inline simd16<T> operator~() const { return *this ^ T(-1); }
+
+ // Addition/subtraction are the same for signed and unsigned
+ simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+ return _mm_add_epi16(*this, other); // wrapping per-lane add
+ }
+ simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+ return _mm_sub_epi16(*this, other); // wrapping per-lane subtract
+ }
+ simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+ *this = *this + other;
+ return *static_cast<simd16<T> *>(this);
+ }
+ simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+ *this = *this - other;
+ return *static_cast<simd16<T> *>(this);
+ }
+};
+
+// Signed code units
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+ simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+ simdutf_really_inline simd16(const __m128i _value)
+ : base16_numeric<int16_t>(_value) {}
+ // Splat constructor
+ simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+ // Array constructors (unaligned load of eight values; char16_t data is reinterpreted as int16_t)
+ simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+ // Member-by-member initialization (v0 becomes lane 0)
+ simdutf_really_inline simd16(int16_t v0, int16_t v1, int16_t v2, int16_t v3,
+ int16_t v4, int16_t v5, int16_t v6, int16_t v7)
+ : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+ simdutf_really_inline operator simd16<uint16_t>() const; // declared here, defined out of line
+
+ // Order-sensitive comparisons (signed per 16-bit lane)
+ simdutf_really_inline simd16<int16_t>
+ max_val(const simd16<int16_t> other) const {
+ return _mm_max_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<int16_t>
+ min_val(const simd16<int16_t> other) const {
+ return _mm_min_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<int16_t> other) const {
+ return _mm_cmpgt_epi16(*this, other);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<int16_t> other) const {
+ return _mm_cmpgt_epi16(other, *this); // a < b expressed as b > a
+ }
+};
+
+// Unsigned code units
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+ simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+ simdutf_really_inline simd16(const __m128i _value)
+ : base16_numeric<uint16_t>(_value) {}
+
+ // Splat constructor
+ simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+ // Array constructor
+ simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+ simdutf_really_inline simd16(const char16_t *values)
+ : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
+ // Member-by-member initialization
+ simdutf_really_inline simd16(uint16_t v0, uint16_t v1, uint16_t v2,
+ uint16_t v3, uint16_t v4, uint16_t v5,
+ uint16_t v6, uint16_t v7)
+ : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+ // Repeat 16 values as many times as necessary (usually for lookup tables)
+ simdutf_really_inline static simd16<uint16_t>
+ repeat_16(uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4,
+ uint16_t v5, uint16_t v6, uint16_t v7) {
+ return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+
+ // Saturated math
+ simdutf_really_inline simd16<uint16_t>
+ saturating_add(const simd16<uint16_t> other) const {
+ return _mm_adds_epu16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ saturating_sub(const simd16<uint16_t> other) const {
+ return _mm_subs_epu16(*this, other);
+ }
+
+ // Order-specific operations
+ simdutf_really_inline simd16<uint16_t>
+ max_val(const simd16<uint16_t> other) const {
+ return _mm_max_epu16(*this, other);
+ }
+ simdutf_really_inline simd16<uint16_t>
+ min_val(const simd16<uint16_t> other) const {
+ return _mm_min_epu16(*this, other);
+ }
+ // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ gt_bits(const simd16<uint16_t> other) const {
+ return this->saturating_sub(other);
+ }
+ // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+ simdutf_really_inline simd16<uint16_t>
+ lt_bits(const simd16<uint16_t> other) const {
+ return other.saturating_sub(*this);
+ }
+ simdutf_really_inline simd16<bool>
+ operator<=(const simd16<uint16_t> other) const {
+ return other.max_val(*this) == other;
+ }
+ simdutf_really_inline simd16<bool>
+ operator>=(const simd16<uint16_t> other) const {
+ return other.min_val(*this) == other;
+ }
+ simdutf_really_inline simd16<bool>
+ operator>(const simd16<uint16_t> other) const {
+ return this->gt_bits(other).any_bits_set();
+ }
+ simdutf_really_inline simd16<bool>
+ operator<(const simd16<uint16_t> other) const {
+ return this->gt_bits(other).any_bits_set();
+ }
+
+ // Bit-specific operations
+ simdutf_really_inline simd16<bool> bits_not_set() const {
+ return *this == uint16_t(0);
+ }
+ simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const {
+ return (*this & bits).bits_not_set();
+ }
+ simdutf_really_inline simd16<bool> any_bits_set() const {
+ return ~this->bits_not_set();
+ }
+ simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const {
+ return ~this->bits_not_set(bits);
+ }
+
+ simdutf_really_inline bool bits_not_set_anywhere() const {
+ return _mm_testz_si128(*this, *this);
+ }
+ simdutf_really_inline bool any_bits_set_anywhere() const {
+ return !bits_not_set_anywhere();
+ }
+ simdutf_really_inline bool
+ bits_not_set_anywhere(simd16<uint16_t> bits) const {
+ return _mm_testz_si128(*this, bits);
+ }
+ simdutf_really_inline bool
+ any_bits_set_anywhere(simd16<uint16_t> bits) const {
+ return !bits_not_set_anywhere(bits);
+ }
+ template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+ return simd16<uint16_t>(_mm_srli_epi16(*this, N));
+ }
+ // Left shift of every 16-bit lane by the compile-time count N.
+ template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+ const __m128i shifted = _mm_slli_epi16(*this, N);
+ return simd16<uint16_t>(shifted);
+ }
+ // Get one of the bits and make a bitmask out of it.
+ // e.g. value.get_bit<7>() gets the high bit of each lane's low byte.
+ // NOTE(review): _mm_movemask_epi8 samples the top bit of every *byte*, so
+ // the result carries two bits per 16-bit lane: after shifting each lane left
+ // by 7 - N, the low byte contributes original bit N and the high byte
+ // contributes original bit N + 8. N is expected to be in [0, 7] so that the
+ // shift count is non-negative -- confirm this matches what callers expect.
+ template <int N> simdutf_really_inline int get_bit() const {
+ return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N));
+ }
+
+ // Swap the two bytes of every 16-bit lane (endianness change).
+ simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+ // Shuffle control: each adjacent byte pair (2k, 2k + 1) trades places.
+ const __m128i pairwise_swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ return _mm_shuffle_epi8(*this, pairwise_swap);
+ }
+
+ // Narrow two vectors of 16-bit code units into one vector of bytes using
+ // _mm_packus_epi16 (unsigned saturation); v0 fills the low half of the
+ // result and v1 the high half.
+ static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+ const simd16<uint16_t> &v1) {
+ const __m128i narrowed = _mm_packus_epi16(v0, v1);
+ return narrowed;
+ }
+};
+// Conversion from the signed to the unsigned 16-bit wrapper: the stored value
+// is handed to simd16<uint16_t> as-is (presumably a plain reinterpretation of
+// the same register).
+simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
+ const auto &raw = this->value;
+ return raw;
+}
+
+// Four simd16<T> registers handled together as one 64-byte block. Comparison
+// helpers return a 64-bit mask assembled from the per-register bitmasks.
+template <typename T> struct simd16x32 {
+ static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+ static_assert(NUM_CHUNKS == 4,
+ "Westmere kernel should use four registers per 64-byte block.");
+ simd16<T> chunks[NUM_CHUNKS];
+
+ // Copying, assignment from a register, and default construction are all
+ // disabled.
+ simd16x32(const simd16x32<T> &o) = delete;
+ simd16x32<T> &operator=(const simd16<T> other) = delete;
+ simd16x32() = delete;
+
+ // Assemble a block from four registers that are already loaded.
+ simdutf_really_inline
+ simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
+ const simd16<T> chunk2, const simd16<T> chunk3)
+ : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+ // Load four consecutive registers starting at ptr; successive loads are
+ // sizeof(simd16<T>) bytes apart, expressed in units of T.
+ simdutf_really_inline simd16x32(const T *ptr)
+ : chunks{simd16<T>::load(ptr),
+ simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
+ simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
+ simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}
+
+ // Write the four registers back to ptr, using the same spacing as the
+ // loading constructor.
+ simdutf_really_inline void store(T *ptr) const {
+ chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+ chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+ chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+ chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
+ }
+
+ // Bitwise OR of all four registers, combined pairwise.
+ simdutf_really_inline simd16<T> reduce_or() const {
+ auto first_pair = chunks[0] | chunks[1];
+ auto second_pair = chunks[2] | chunks[3];
+ return first_pair | second_pair;
+ }
+
+ // ASCII check performed once on the OR-reduction of the block.
+ simdutf_really_inline bool is_ascii() const {
+ return reduce_or().is_ascii();
+ }
+
+ // Forward to each register's store_ascii_as_utf16, advancing the output by
+ // sizeof(simd16<T>) char16_t elements per register.
+ simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+ chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+ chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+ chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+ chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
+ }
+
+ // Concatenate the four per-register bitmasks into one 64-bit value, placing
+ // register k at bit offset 16 * k. Register 0's mask goes through uint32_t
+ // before being widened.
+ simdutf_really_inline uint64_t to_bitmask() const {
+ uint64_t field0 = uint32_t(chunks[0].to_bitmask());
+ uint64_t field1 = chunks[1].to_bitmask();
+ uint64_t field2 = chunks[2].to_bitmask();
+ uint64_t field3 = chunks[3].to_bitmask();
+ uint64_t combined = field0;
+ combined |= field1 << 16;
+ combined |= field2 << 32;
+ combined |= field3 << 48;
+ return combined;
+ }
+
+ // Byte-swap every register of the block in place.
+ simdutf_really_inline void swap_bytes() {
+ for (simd16<T> &chunk : chunks) {
+ chunk = chunk.swap_bytes();
+ }
+ }
+
+ // Bitmask of positions equal to the scalar m.
+ simdutf_really_inline uint64_t eq(const T m) const {
+ const simd16<T> needle = simd16<T>::splat(m);
+ const simd16x32<bool> hits(chunks[0] == needle, chunks[1] == needle,
+ chunks[2] == needle, chunks[3] == needle);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of positions where this block equals the other block.
+ simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
+ const simd16x32<bool> hits(
+ chunks[0] == other.chunks[0], chunks[1] == other.chunks[1],
+ chunks[2] == other.chunks[2], chunks[3] == other.chunks[3]);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of positions that compare <= the scalar m.
+ simdutf_really_inline uint64_t lteq(const T m) const {
+ const simd16<T> bound = simd16<T>::splat(m);
+ const simd16x32<bool> hits(chunks[0] <= bound, chunks[1] <= bound,
+ chunks[2] <= bound, chunks[3] <= bound);
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of positions inside the closed interval [low, high].
+ simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+ const simd16<T> floor_v = simd16<T>::splat(low);
+ const simd16<T> ceil_v = simd16<T>::splat(high);
+
+ const simd16x32<bool> hits(
+ (chunks[0] <= ceil_v) & (chunks[0] >= floor_v),
+ (chunks[1] <= ceil_v) & (chunks[1] >= floor_v),
+ (chunks[2] <= ceil_v) & (chunks[2] >= floor_v),
+ (chunks[3] <= ceil_v) & (chunks[3] >= floor_v));
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of positions outside [low, high], tested as
+ // (x >= high + 1) | (x <= low - 1) with both bounds cast back to T.
+ simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+ const simd16<T> below = simd16<T>::splat(static_cast<T>(low - 1));
+ const simd16<T> above = simd16<T>::splat(static_cast<T>(high + 1));
+ const simd16x32<bool> hits(
+ (chunks[0] >= above) | (chunks[0] <= below),
+ (chunks[1] >= above) | (chunks[1] <= below),
+ (chunks[2] >= above) | (chunks[2] <= below),
+ (chunks[3] >= above) | (chunks[3] <= below));
+ return hits.to_bitmask();
+ }
+
+ // Bitmask of positions for which simd16<T>::operator< against the scalar m
+ // yields true.
+ simdutf_really_inline uint64_t lt(const T m) const {
+ const simd16<T> bound = simd16<T>::splat(m);
+ const simd16x32<bool> hits(chunks[0] < bound, chunks[1] < bound,
+ chunks[2] < bound, chunks[3] < bound);
+ return hits.to_bitmask();
+ }
+}; // struct simd16x32<T>
diff --git a/contrib/simdutf/src/tables/base64_tables.h b/contrib/simdutf/src/tables/base64_tables.h
new file mode 100644
index 000000000..c54cf9b63
--- /dev/null
+++ b/contrib/simdutf/src/tables/base64_tables.h
@@ -0,0 +1,688 @@
+#ifndef SIMDUTF_BASE64_TABLES_H
+#define SIMDUTF_BASE64_TABLES_H
+#include <array>
+#include <cstdint>
+
+namespace simdutf {
+namespace {
+namespace tables {
+namespace base64 {
+namespace base64_default {
+
+const char e0[256] = {
+ 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+ 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+ 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+ 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+ 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+ 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+ 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+ 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+ 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+ 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+ 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+ 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+ 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+ 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+ '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+ '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+ '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/',
+ '/'};
+
+const char e1[256] = {
+ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+ 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+ 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+ 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+ '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+ 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+ 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
+ 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+ 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
+ '/'};
+
+const char e2[256] = {
+ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+ 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+ 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+ 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+ '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+ 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+ 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
+ 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+ 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
+ '/'};
+
+const uint32_t d0[256] = {
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
+ 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+ 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+ 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+ 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+ 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+ 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+ 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+ 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+ 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+ 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+
+const uint32_t d1[256] = {
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
+ 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+ 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+ 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+ 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+ 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+ 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+ 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+ 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+ 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+ 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+
+const uint32_t d2[256] = {
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
+ 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+ 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+ 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+ 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+ 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+ 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+ 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+ 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+ 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+ 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+
+const uint32_t d3[256] = {
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
+ 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+ 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+ 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+ 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+ 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+ 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+ 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+ 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+ 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+ 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_default
+
+namespace base64_url {
+
+const char e0[256] = {
+ 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+ 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+ 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+ 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+ 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+ 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+ 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+ 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+ 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+ 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+ 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+ 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+ 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+ 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+ '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+ '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+ '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_',
+ '_'};
+
+const char e1[256] = {
+ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+ 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+ 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+ 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+ '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+ 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+ 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+ 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+ 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+ '_'};
+
+const char e2[256] = {
+ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+ 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+ 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+ 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+ '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+ 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+ 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+ 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+ 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+ '_'};
+
+const uint32_t d0[256] = {
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff,
+ 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+ 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+ 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+ 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+ 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+ 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
+ 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+ 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+ 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+ 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+ 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d1[256] = {
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff,
+ 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+ 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+ 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+ 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+ 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+ 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
+ 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+ 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+ 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+ 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+ 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d2[256] = { // base64url byte with 6-bit value v -> ((v >> 2) << 8) | ((v & 3) << 22); every other byte -> 0x01ffffff
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff,
+ 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+ 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+ 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+ 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+ 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+ 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
+ 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+ 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+ 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+ 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+ 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; // NOTE(review): presumably OR-ed with sibling d-tables to decode one 4-char group; confirm at the call site
+const uint32_t d3[256] = { // base64url byte with 6-bit value v -> v << 16; every other byte -> 0x01ffffff
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff,
+ 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+ 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+ 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+ 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+ 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+ 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+ 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
+ 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+ 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+ 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+ 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+ 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+ 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; // NOTE(review): presumably OR-ed with sibling d-tables to decode one 4-char group; confirm at the call site
+} // namespace base64_url
+const uint64_t thintable_epi8[256] = { // entry m: positions (0..7) of the bits that are 0 in m, one per byte starting at the least significant byte; unused high bytes are 0
+ 0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
+ 0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
+ 0x0000070605040300, 0x0000000706050403, 0x0007060504020100,
+ 0x0000070605040201, 0x0000070605040200, 0x0000000706050402,
+ 0x0000070605040100, 0x0000000706050401, 0x0000000706050400,
+ 0x0000000007060504, 0x0007060503020100, 0x0000070605030201,
+ 0x0000070605030200, 0x0000000706050302, 0x0000070605030100,
+ 0x0000000706050301, 0x0000000706050300, 0x0000000007060503,
+ 0x0000070605020100, 0x0000000706050201, 0x0000000706050200,
+ 0x0000000007060502, 0x0000000706050100, 0x0000000007060501,
+ 0x0000000007060500, 0x0000000000070605, 0x0007060403020100,
+ 0x0000070604030201, 0x0000070604030200, 0x0000000706040302,
+ 0x0000070604030100, 0x0000000706040301, 0x0000000706040300,
+ 0x0000000007060403, 0x0000070604020100, 0x0000000706040201,
+ 0x0000000706040200, 0x0000000007060402, 0x0000000706040100,
+ 0x0000000007060401, 0x0000000007060400, 0x0000000000070604,
+ 0x0000070603020100, 0x0000000706030201, 0x0000000706030200,
+ 0x0000000007060302, 0x0000000706030100, 0x0000000007060301,
+ 0x0000000007060300, 0x0000000000070603, 0x0000000706020100,
+ 0x0000000007060201, 0x0000000007060200, 0x0000000000070602,
+ 0x0000000007060100, 0x0000000000070601, 0x0000000000070600,
+ 0x0000000000000706, 0x0007050403020100, 0x0000070504030201,
+ 0x0000070504030200, 0x0000000705040302, 0x0000070504030100,
+ 0x0000000705040301, 0x0000000705040300, 0x0000000007050403,
+ 0x0000070504020100, 0x0000000705040201, 0x0000000705040200,
+ 0x0000000007050402, 0x0000000705040100, 0x0000000007050401,
+ 0x0000000007050400, 0x0000000000070504, 0x0000070503020100,
+ 0x0000000705030201, 0x0000000705030200, 0x0000000007050302,
+ 0x0000000705030100, 0x0000000007050301, 0x0000000007050300,
+ 0x0000000000070503, 0x0000000705020100, 0x0000000007050201,
+ 0x0000000007050200, 0x0000000000070502, 0x0000000007050100,
+ 0x0000000000070501, 0x0000000000070500, 0x0000000000000705,
+ 0x0000070403020100, 0x0000000704030201, 0x0000000704030200,
+ 0x0000000007040302, 0x0000000704030100, 0x0000000007040301,
+ 0x0000000007040300, 0x0000000000070403, 0x0000000704020100,
+ 0x0000000007040201, 0x0000000007040200, 0x0000000000070402,
+ 0x0000000007040100, 0x0000000000070401, 0x0000000000070400,
+ 0x0000000000000704, 0x0000000703020100, 0x0000000007030201,
+ 0x0000000007030200, 0x0000000000070302, 0x0000000007030100,
+ 0x0000000000070301, 0x0000000000070300, 0x0000000000000703,
+ 0x0000000007020100, 0x0000000000070201, 0x0000000000070200,
+ 0x0000000000000702, 0x0000000000070100, 0x0000000000000701,
+ 0x0000000000000700, 0x0000000000000007, 0x0006050403020100,
+ 0x0000060504030201, 0x0000060504030200, 0x0000000605040302,
+ 0x0000060504030100, 0x0000000605040301, 0x0000000605040300,
+ 0x0000000006050403, 0x0000060504020100, 0x0000000605040201,
+ 0x0000000605040200, 0x0000000006050402, 0x0000000605040100,
+ 0x0000000006050401, 0x0000000006050400, 0x0000000000060504,
+ 0x0000060503020100, 0x0000000605030201, 0x0000000605030200,
+ 0x0000000006050302, 0x0000000605030100, 0x0000000006050301,
+ 0x0000000006050300, 0x0000000000060503, 0x0000000605020100,
+ 0x0000000006050201, 0x0000000006050200, 0x0000000000060502,
+ 0x0000000006050100, 0x0000000000060501, 0x0000000000060500,
+ 0x0000000000000605, 0x0000060403020100, 0x0000000604030201,
+ 0x0000000604030200, 0x0000000006040302, 0x0000000604030100,
+ 0x0000000006040301, 0x0000000006040300, 0x0000000000060403,
+ 0x0000000604020100, 0x0000000006040201, 0x0000000006040200,
+ 0x0000000000060402, 0x0000000006040100, 0x0000000000060401,
+ 0x0000000000060400, 0x0000000000000604, 0x0000000603020100,
+ 0x0000000006030201, 0x0000000006030200, 0x0000000000060302,
+ 0x0000000006030100, 0x0000000000060301, 0x0000000000060300,
+ 0x0000000000000603, 0x0000000006020100, 0x0000000000060201,
+ 0x0000000000060200, 0x0000000000000602, 0x0000000000060100,
+ 0x0000000000000601, 0x0000000000000600, 0x0000000000000006,
+ 0x0000050403020100, 0x0000000504030201, 0x0000000504030200,
+ 0x0000000005040302, 0x0000000504030100, 0x0000000005040301,
+ 0x0000000005040300, 0x0000000000050403, 0x0000000504020100,
+ 0x0000000005040201, 0x0000000005040200, 0x0000000000050402,
+ 0x0000000005040100, 0x0000000000050401, 0x0000000000050400,
+ 0x0000000000000504, 0x0000000503020100, 0x0000000005030201,
+ 0x0000000005030200, 0x0000000000050302, 0x0000000005030100,
+ 0x0000000000050301, 0x0000000000050300, 0x0000000000000503,
+ 0x0000000005020100, 0x0000000000050201, 0x0000000000050200,
+ 0x0000000000000502, 0x0000000000050100, 0x0000000000000501,
+ 0x0000000000000500, 0x0000000000000005, 0x0000000403020100,
+ 0x0000000004030201, 0x0000000004030200, 0x0000000000040302,
+ 0x0000000004030100, 0x0000000000040301, 0x0000000000040300,
+ 0x0000000000000403, 0x0000000004020100, 0x0000000000040201,
+ 0x0000000000040200, 0x0000000000000402, 0x0000000000040100,
+ 0x0000000000000401, 0x0000000000000400, 0x0000000000000004,
+ 0x0000000003020100, 0x0000000000030201, 0x0000000000030200,
+ 0x0000000000000302, 0x0000000000030100, 0x0000000000000301,
+ 0x0000000000000300, 0x0000000000000003, 0x0000000000020100,
+ 0x0000000000000201, 0x0000000000000200, 0x0000000000000002,
+ 0x0000000000000100, 0x0000000000000001, 0x0000000000000000,
+ 0x0000000000000000,
+}; // NOTE(review): looks like a byte-compaction shuffle mask keyed by an 8-bit "drop" mask; confirm against the users of this table
+
+const uint8_t pshufb_combine_table[272] = { // rows of 16 indices; row k (0..8) = bytes 0..7-k, then bytes 8..15, then k bytes of 0xff
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
+ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
+ 0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
+ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
+ 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+}; // only 144 of the 272 declared bytes have initializers; the remaining 128 are zero-initialized. NOTE(review): confirm the extra size is intentional padding
+
+const unsigned char BitsSetTable256mul2[256] = { // entry i = 2 * (number of set bits in i), e.g. [0] = 0, [3] = 4, [255] = 16
+ 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4,
+ 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6,
+ 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6,
+ 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8,
+ 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10,
+ 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8,
+ 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4,
+ 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10,
+ 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8,
+ 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12,
+ 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6,
+ 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10,
+ 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12,
+ 14, 10, 12, 12, 14, 12, 14, 14, 16};
+
+constexpr uint8_t to_base64_value[] = { // byte -> 6-bit value in the standard alphabet ('+' = 62, '/' = 63); 64 = tab, LF, FF, CR or space; 255 = any other byte
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255,
+ 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255,
+ 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255};
+
+constexpr uint8_t to_base64_url_value[] = { // byte -> 6-bit value in the URL-safe alphabet ('-' = 62, '_' = 63); 64 = tab, LF, FF, CR or space; 255 = any other byte
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 64, 64, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255,
+ 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255};
+static_assert(sizeof(to_base64_value) == 256, // compile-time sanity checks: table sizes, whitespace bytes -> 64, last two alphabet symbols -> 62/63
+ "to_base64_value must have 256 elements");
+static_assert(sizeof(to_base64_url_value) == 256,
+ "to_base64_url_value must have 256 elements");
+static_assert(to_base64_value[uint8_t(' ')] == 64,
+ "space must be == 64 in to_base64_value");
+static_assert(to_base64_url_value[uint8_t(' ')] == 64,
+ "space must be == 64 in to_base64_url_value");
+static_assert(to_base64_value[uint8_t('\t')] == 64,
+ "tab must be == 64 in to_base64_value");
+static_assert(to_base64_url_value[uint8_t('\t')] == 64,
+ "tab must be == 64 in to_base64_url_value");
+static_assert(to_base64_value[uint8_t('\r')] == 64,
+ "cr must be == 64 in to_base64_value");
+static_assert(to_base64_url_value[uint8_t('\r')] == 64,
+ "cr must be == 64 in to_base64_url_value");
+static_assert(to_base64_value[uint8_t('\n')] == 64,
+ "lf must be == 64 in to_base64_value");
+static_assert(to_base64_url_value[uint8_t('\n')] == 64,
+ "lf must be == 64 in to_base64_url_value");
+static_assert(to_base64_value[uint8_t('\f')] == 64,
+ "ff must be == 64 in to_base64_value");
+static_assert(to_base64_url_value[uint8_t('\f')] == 64,
+ "ff must be == 64 in to_base64_url_value");
+static_assert(to_base64_value[uint8_t('+')] == 62,
+ "+ must be == 62 in to_base64_value");
+static_assert(to_base64_url_value[uint8_t('-')] == 62,
+ "- must be == 62 in to_base64_url_value");
+static_assert(to_base64_value[uint8_t('/')] == 63,
+ "/ must be == 63 in to_base64_value"); // message previously said 62, contradicting the condition
+static_assert(to_base64_url_value[uint8_t('_')] == 63,
+ "_ must be == 63 in to_base64_url_value"); // message previously said 62, contradicting the condition
+} // namespace base64
+} // namespace tables
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_BASE64_TABLES_H
diff --git a/contrib/simdutf/src/tables/utf16_to_utf8_tables.h b/contrib/simdutf/src/tables/utf16_to_utf8_tables.h
new file mode 100644
index 000000000..c4ea071b0
--- /dev/null
+++ b/contrib/simdutf/src/tables/utf16_to_utf8_tables.h
@@ -0,0 +1,768 @@
+// file generated by scripts/sse_convert_utf16_to_utf8.py
+#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
+#define SIMDUTF_UTF16_TO_UTF8_TABLES_H
+
+namespace simdutf {
+namespace {
+namespace tables {
+namespace utf16_to_utf8 {
+
+// 1 byte for length, 16 bytes for mask
+const uint8_t pack_1_2_utf8_bytes[256][17] = {
+ {16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
+ {15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
+ {15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80},
+ {14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
+ {14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80},
+ {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
+ {14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80},
+ {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
+ {14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80},
+ {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+ {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+ {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+ {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+ {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80}};
+
+// Row layout: [0] = count of used mask bytes, [1..16] = mask, 0x80-padded
+const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
+ {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80},
+ {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80},
+ {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80},
+ {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80},
+ {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80},
+ {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80},
+ {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80},
+ {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+ {6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80},
+ {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80},
+ {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80},
+ {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80}};
+
+} // namespace utf16_to_utf8
+} // namespace tables
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
diff --git a/contrib/simdutf/src/tables/utf8_to_utf16_tables.h b/contrib/simdutf/src/tables/utf8_to_utf16_tables.h
new file mode 100644
index 000000000..8c782253d
--- /dev/null
+++ b/contrib/simdutf/src/tables/utf8_to_utf16_tables.h
@@ -0,0 +1,826 @@
+#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
+#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
+#include <cstdint>
+
+namespace simdutf {
+namespace {
+namespace tables {
+namespace utf8_to_utf16 {
+/**
+ * utf8bigindex uses about 8 kB
+ * shufutf8 uses about 3344 B
+ *
+ * So we use a bit over 11 kB. It would be
+ * easy to save about 4 kB by only
+ * storing the index in utf8bigindex, and
+ * deriving the consumed bytes otherwise.
+ * However, this may come at a significant (10% to 20%)
+ * performance penalty.
+ */
+
+const uint8_t shufutf8[209][16] = {
+ {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+ {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+ {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+ {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
+ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
+ {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
+ {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+ {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
+ {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+ {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
+ {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+ {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
+ {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
+ {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
+ {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
+ {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
+ {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+ {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
+ {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
+ {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
+ {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+ {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+ {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
+ {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
+ {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+ {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+ {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
+ {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
+ {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
+ {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+ {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
+ {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
+ {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
+ {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
+/* number of two bytes : 64 */
+/* number of two + three bytes : 145 */
+/* number of two + three + four bytes : 209 */
+const uint8_t utf8bigindex[4096][2] = {
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4},
+ {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12},
+ {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5},
+ {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12},
+ {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6},
+ {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12},
+ {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5},
+ {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7},
+ {164, 7}, {145, 3}, {209, 12}, {155, 7}, {167, 7}, {69, 7}, {179, 7},
+ {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7},
+ {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7},
+ {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12},
+ {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
+ {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7},
+ {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6},
+ {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7},
+ {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {145, 3}, {209, 12}, {156, 8}, {168, 8}, {146, 4},
+ {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {171, 8},
+ {72, 8}, {183, 8}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8},
+ {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
+ {209, 12}, {174, 8}, {148, 6}, {186, 8}, {80, 8}, {98, 8}, {66, 6},
+ {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6},
+ {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6},
+ {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6},
+ {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8},
+ {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
+ {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7},
+ {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7},
+ {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7},
+ {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8},
+ {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
+ {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7},
+ {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12},
+ {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
+ {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5},
+ {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5},
+ {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {187, 9}, {81, 9},
+ {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9},
+ {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6},
+ {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6},
+ {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9},
+ {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4},
+ {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {77, 7}, {95, 7},
+ {7, 9}, {194, 7}, {83, 7}, {101, 7}, {11, 9}, {119, 7}, {19, 9},
+ {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9},
+ {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {13, 9},
+ {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
+ {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6},
+ {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+ {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4},
+ {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8},
+ {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8},
+ {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6},
+ {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8},
+ {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6},
+ {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6},
+ {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7},
+ {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7},
+ {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8},
+ {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8},
+ {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
+ {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
+ {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12},
+ {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7},
+ {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12},
+ {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5},
+ {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5},
+ {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
+ {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10},
+ {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4},
+ {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6},
+ {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5},
+ {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10},
+ {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7},
+ {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10},
+ {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7},
+ {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12},
+ {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6},
+ {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6},
+ {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6},
+ {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6},
+ {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8},
+ {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8},
+ {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8},
+ {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4},
+ {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8},
+ {66, 6}, {198, 8}, {86, 8}, {104, 8}, {15, 10}, {122, 8}, {23, 10},
+ {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8},
+ {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8},
+ {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8},
+ {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
+ {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8},
+ {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8},
+ {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7},
+ {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7},
+ {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
+ {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8},
+ {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12},
+ {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4},
+ {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5},
+ {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5},
+ {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10},
+ {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6},
+ {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9},
+ {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6},
+ {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3},
+ {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7},
+ {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10},
+ {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7},
+ {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
+ {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10},
+ {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6},
+ {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6},
+ {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4},
+ {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9},
+ {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5},
+ {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8},
+ {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8},
+ {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12},
+ {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8},
+ {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7},
+ {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8},
+ {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7},
+ {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9},
+ {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12},
+ {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
+ {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7},
+ {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8},
+ {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7},
+ {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4},
+ {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12},
+ {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5},
+ {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
+ {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6},
+ {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6},
+ {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6},
+ {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6},
+ {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {192, 11}, {152, 7}, {164, 7}, {145, 3}, {204, 11}, {155, 7}, {167, 7},
+ {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
+ {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7},
+ {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7},
+ {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7},
+ {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6},
+ {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
+ {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7},
+ {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {207, 11}, {156, 8},
+ {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
+ {159, 8}, {117, 11}, {72, 8}, {135, 11}, {78, 8}, {96, 8}, {65, 5},
+ {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5},
+ {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {141, 11}, {80, 8},
+ {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8},
+ {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6},
+ {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6},
+ {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8},
+ {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4},
+ {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7},
+ {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8},
+ {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8},
+ {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8},
+ {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
+ {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6},
+ {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+ {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4},
+ {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5},
+ {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5},
+ {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6},
+ {143, 11}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9},
+ {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6},
+ {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6},
+ {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7},
+ {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7},
+ {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9},
+ {31, 11}, {47, 11}, {7, 9}, {194, 7}, {83, 7}, {55, 11}, {11, 9},
+ {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
+ {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
+ {59, 11}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12},
+ {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7},
+ {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8},
+ {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8},
+ {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8},
+ {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
+ {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8},
+ {86, 8}, {61, 11}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8},
+ {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9},
+ {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8},
+ {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
+ {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7},
+ {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8},
+ {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7},
+ {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12},
+ {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6},
+ {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8},
+ {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8},
+ {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6},
+ {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12},
+ {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12},
+ {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12},
+ {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4},
+ {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6},
+ {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6},
+ {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6},
+ {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5},
+ {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10},
+ {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
+ {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5},
+ {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5},
+ {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7},
+ {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7},
+ {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
+ {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6},
+ {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10},
+ {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4},
+ {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8},
+ {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5},
+ {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10},
+ {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {62, 11}, {15, 10},
+ {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8},
+ {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6},
+ {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3},
+ {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7},
+ {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10},
+ {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7},
+ {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
+ {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10},
+ {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6},
+ {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6},
+ {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4},
+ {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9},
+ {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5},
+ {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9},
+ {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9},
+ {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12},
+ {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5},
+ {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7},
+ {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9},
+ {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7},
+ {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10},
+ {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12},
+ {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
+ {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7},
+ {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9},
+ {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7},
+ {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4},
+ {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9},
+ {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8},
+ {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
+ {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6},
+ {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9},
+ {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9},
+ {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6},
+ {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8},
+ {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
+ {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7},
+ {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7},
+ {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7},
+ {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8},
+ {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
+ {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7},
+ {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12},
+ {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
+ {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5},
+ {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5},
+ {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6},
+ {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6},
+ {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6},
+ {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6},
+ {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {152, 7}, {164, 7}, {145, 3}, {209, 12},
+ {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4},
+ {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7},
+ {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5},
+ {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7},
+ {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6},
+ {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
+ {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6},
+ {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+ {208, 12}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4},
+ {64, 4}, {209, 12}, {159, 8}, {171, 8}, {72, 8}, {183, 8}, {78, 8},
+ {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8},
+ {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6},
+ {186, 8}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8},
+ {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6},
+ {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6},
+ {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7},
+ {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7},
+ {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8},
+ {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8},
+ {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
+ {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
+ {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12},
+ {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7},
+ {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12},
+ {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5},
+ {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5},
+ {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
+ {175, 9}, {148, 6}, {144, 12}, {81, 9}, {99, 9}, {66, 6}, {199, 9},
+ {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4},
+ {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6},
+ {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5},
+ {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},
+ {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7},
+ {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9},
+ {71, 7}, {131, 9}, {77, 7}, {95, 7}, {7, 9}, {194, 7}, {83, 7},
+ {101, 7}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12},
+ {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6},
+ {197, 7}, {85, 7}, {103, 7}, {13, 9}, {121, 7}, {21, 9}, {37, 9},
+ {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9},
+ {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6},
+ {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8},
+ {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8},
+ {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8},
+ {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4},
+ {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8},
+ {66, 6}, {198, 8}, {86, 8}, {104, 8}, {14, 9}, {122, 8}, {22, 9},
+ {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8},
+ {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8},
+ {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8},
+ {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
+ {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8},
+ {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8},
+ {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7},
+ {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7},
+ {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
+ {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8},
+ {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12},
+ {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4},
+ {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5},
+ {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5},
+ {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10},
+ {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6},
+ {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6},
+ {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6},
+ {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3},
+ {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7},
+ {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7},
+ {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7},
+ {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
+ {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7},
+ {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6},
+ {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6},
+ {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4},
+ {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10},
+ {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5},
+ {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8},
+ {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8},
+ {63, 12}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12},
+ {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8},
+ {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7},
+ {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8},
+ {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7},
+ {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10},
+ {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12},
+ {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
+ {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7},
+ {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8},
+ {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7},
+ {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4},
+ {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9},
+ {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5},
+ {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
+ {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6},
+ {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6},
+ {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6},
+ {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6},
+ {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9},
+ {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
+ {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7},
+ {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7},
+ {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7},
+ {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9},
+ {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
+ {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7},
+ {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8},
+ {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
+ {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5},
+ {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5},
+ {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8},
+ {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8},
+ {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6},
+ {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9},
+ {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8},
+ {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4},
+ {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9},
+ {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8},
+ {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8},
+ {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8},
+ {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
+ {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6},
+ {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+ {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4},
+ {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5},
+ {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5},
+ {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6},
+ {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6},
+ {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6},
+ {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6},
+ {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7}, {164, 7},
+ {145, 3}, {204, 11}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7},
+ {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7},
+ {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5},
+ {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
+ {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
+ {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12},
+ {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7},
+ {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {145, 3}, {207, 11}, {156, 8}, {168, 8}, {146, 4}, {180, 8},
+ {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {117, 11}, {72, 8},
+ {135, 11}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8},
+ {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12},
+ {174, 8}, {148, 6}, {141, 11}, {80, 8}, {98, 8}, {66, 6}, {198, 8},
+ {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8},
+ {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6},
+ {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8},
+ {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
+ {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7},
+ {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8},
+ {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7},
+ {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12},
+ {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6},
+ {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8},
+ {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8},
+ {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6},
+ {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12},
+ {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9},
+ {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9},
+ {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4},
+ {209, 12}, {209, 12}, {175, 9}, {148, 6}, {143, 11}, {81, 9}, {99, 9},
+ {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6},
+ {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9},
+ {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5},
+ {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9},
+ {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12},
+ {158, 7}, {113, 9}, {71, 7}, {131, 9}, {31, 11}, {47, 11}, {7, 9},
+ {194, 7}, {83, 7}, {55, 11}, {11, 9}, {119, 7}, {19, 9}, {35, 9},
+ {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7},
+ {97, 7}, {66, 6}, {197, 7}, {85, 7}, {59, 11}, {13, 9}, {121, 7},
+ {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6},
+ {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9},
+ {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9},
+ {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4},
+ {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8},
+ {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5},
+ {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9},
+ {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {61, 11}, {14, 9},
+ {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8},
+ {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6},
+ {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3},
+ {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7},
+ {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9},
+ {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7},
+ {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6},
+ {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9},
+ {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6},
+ {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6},
+ {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4},
+ {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12},
+ {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5},
+ {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10},
+ {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6},
+ {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12},
+ {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5},
+ {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5},
+ {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},
+ {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10},
+ {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7},
+ {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7},
+ {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12},
+ {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7},
+ {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7},
+ {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6},
+ {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7},
+ {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4},
+ {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10},
+ {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8},
+ {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12},
+ {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6},
+ {198, 8}, {86, 8}, {62, 11}, {15, 10}, {122, 8}, {23, 10}, {39, 10},
+ {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10},
+ {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6},
+ {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+ {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8},
+ {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7},
+ {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7},
+ {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7},
+ {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7},
+ {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8},
+ {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7},
+ {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7},
+ {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12},
+ {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12},
+ {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5},
+ {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5},
+ {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9},
+ {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9},
+ {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6},
+ {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6},
+ {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12},
+ {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9},
+ {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4},
+ {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10},
+ {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9},
+ {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9},
+ {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9},
+ {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7},
+ {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6},
+ {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+ {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4},
+ {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8},
+ {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8},
+ {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6},
+ {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10},
+ {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6},
+ {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6},
+ {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6},
+ {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7},
+ {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7},
+ {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8},
+ {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8},
+ {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7},
+ {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7},
+ {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12},
+ {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7},
+ {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7},
+ {0, 6}};
+} // namespace utf8_to_utf16
+} // namespace tables
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
diff --git a/contrib/simdutf/src/westmere/implementation.cpp b/contrib/simdutf/src/westmere/implementation.cpp
new file mode 100644
index 000000000..026a225ae
--- /dev/null
+++ b/contrib/simdutf/src/westmere/implementation.cpp
@@ -0,0 +1,1142 @@
+#include "simdutf/westmere/begin.h"
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+namespace {
+#ifndef SIMDUTF_WESTMERE_H
+ #error "westmere.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+ return input.reduce_or().is_ascii(); // collapse the 64-byte block with reduce_or(), then apply is_ascii() to that single result
+}
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+ simd8<uint8_t> is_second_byte =
+ prev1.saturating_sub(0b11000000u - 1); // Non-zero only when prev1 is 11______ (>= 0xC0)
+ simd8<uint8_t> is_third_byte =
+ prev2.saturating_sub(0b11100000u - 1); // Non-zero only when prev2 is 111_____ (>= 0xE0)
+ simd8<uint8_t> is_fourth_byte =
+ prev3.saturating_sub(0b11110000u - 1); // Non-zero only when prev3 is 1111____ (>= 0xF0)
+ // Caller requires a bool (all 1's). The largest possible difference is
+ // 0xFF - 0xBF = 64, so reinterpreting as int8_t never goes negative.
+ return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
+ int8_t(0); // lane is true where any of the three differences is non-zero
+}
+
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+ const simd8<uint8_t> prev3) {
+ simd8<uint8_t> is_third_byte =
+ prev2.saturating_sub(0xe0u - 0x80); // Result is >= 0x80 only when prev2 is 111_____ (>= 0xE0)
+ simd8<uint8_t> is_fourth_byte =
+ prev3.saturating_sub(0xf0u - 0x80); // Result is >= 0x80 only when prev3 is 1111____ (>= 0xF0)
+ return simd8<bool>(is_third_byte | is_fourth_byte); // NOTE(review): lanes below 0x80 can still be non-zero here; presumably consumers only test the high bit - confirm
+}
+
+#include "westmere/internal/loader.cpp"
+
+#include "westmere/sse_validate_utf16.cpp"
+#include "westmere/sse_validate_utf32le.cpp"
+
+#include "westmere/sse_convert_latin1_to_utf8.cpp"
+#include "westmere/sse_convert_latin1_to_utf16.cpp"
+#include "westmere/sse_convert_latin1_to_utf32.cpp"
+
+#include "westmere/sse_convert_utf8_to_utf16.cpp"
+#include "westmere/sse_convert_utf8_to_utf32.cpp"
+#include "westmere/sse_convert_utf8_to_latin1.cpp"
+
+#include "westmere/sse_convert_utf16_to_latin1.cpp"
+#include "westmere/sse_convert_utf16_to_utf8.cpp"
+#include "westmere/sse_convert_utf16_to_utf32.cpp"
+
+#include "westmere/sse_convert_utf32_to_latin1.cpp"
+#include "westmere/sse_convert_utf32_to_utf8.cpp"
+#include "westmere/sse_convert_utf32_to_utf16.cpp"
+#include "westmere/sse_base64.cpp"
+
+} // unnamed namespace
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "generic/buf_block_reader.h"
+#include "generic/utf8_validation/utf8_lookup4_algorithm.h"
+#include "generic/utf8_validation/utf8_validator.h"
+// transcoding from UTF-8 to UTF-16
+#include "generic/utf8_to_utf16/valid_utf8_to_utf16.h"
+#include "generic/utf8_to_utf16/utf8_to_utf16.h"
+// transcoding from UTF-8 to UTF-32
+#include "generic/utf8_to_utf32/valid_utf8_to_utf32.h"
+#include "generic/utf8_to_utf32/utf8_to_utf32.h"
+// other functions
+#include "generic/utf8.h"
+#include "generic/utf16.h"
+// transcoding from UTF-8 to Latin 1
+#include "generic/utf8_to_latin1/utf8_to_latin1.h"
+#include "generic/utf8_to_latin1/valid_utf8_to_latin1.h"
+
+//
+// Implementation-specific overrides
+//
+
+namespace simdutf {
+namespace SIMDUTF_IMPLEMENTATION {
+
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+ size_t length) const noexcept {
+ // If there is a BOM, then we trust it.
+ auto bom_encoding = simdutf::BOM::check_bom(input, length);
+ // TODO: the validators below each rescan the input; reimplement as a one-pass algorithm.
+ if (bom_encoding != encoding_type::unspecified) {
+ return bom_encoding; // BOM found: report it without running any validator
+ }
+ int out = 0; // OR of every encoding_type flag whose validator accepts the input
+ if (validate_utf8(input, length)) {
+ out |= encoding_type::UTF8;
+ }
+ if ((length % 2) == 0) { // only try UTF-16LE when the bytes split evenly into 16-bit units
+ if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
+ length / 2)) {
+ out |= encoding_type::UTF16_LE;
+ }
+ }
+ if ((length % 4) == 0) { // only try UTF-32 when the bytes split evenly into 32-bit units
+ if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+ out |= encoding_type::UTF32_LE;
+ }
+ }
+ return out; // 0 when no candidate encoding validated
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+ return westmere::utf8_validation::generic_validate_utf8(buf, len); // pure forwarder: no work is done in this wrapper
+}
+
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len); // pure forwarder: the result is returned unchanged
+}
+
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+ return westmere::utf8_validation::generic_validate_ascii(buf, len); // pure forwarder: no work is done in this wrapper
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+ const char *buf, size_t len) const noexcept {
+ return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,
+ len); // pure forwarder: the result is returned unchanged
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+ size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-16. protect the implementation from
+ // handling nullptr
+ return true;
+ }
+ const char16_t *tail = sse_validate_utf16<endianness::LITTLE>(buf, len); // SSE stage; a null return is reported as invalid below
+ if (tail) { // non-null: the scalar validator checks the remaining len - (tail - buf) units
+ return scalar::utf16::validate<endianness::LITTLE>(tail,
+ len - (tail - buf));
+ } else {
+ return false;
+ }
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+ size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-16. protect the implementation from
+ // handling nullptr
+ return true;
+ }
+ const char16_t *tail = sse_validate_utf16<endianness::BIG>(buf, len); // SSE stage; a null return is reported as invalid below
+ if (tail) { // non-null: the scalar validator checks the remaining len - (tail - buf) units
+ return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+ } else {
+ return false;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+ const char16_t *buf, size_t len) const noexcept { // NOTE(review): no len == 0 guard here, unlike validate_utf16le - confirm the SSE routine tolerates a null buf
+ result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+ if (res.count != len) { // SSE stage did not account for every unit: run the scalar validator from res.count
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count); // rebase the scalar count onto the start of buf
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+ const char16_t *buf, size_t len) const noexcept { // NOTE(review): no len == 0 guard here, unlike validate_utf16be - confirm the SSE routine tolerates a null buf
+ result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
+ if (res.count != len) { // SSE stage did not account for every unit: run the scalar validator from res.count
+ result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+ buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count); // rebase the scalar count onto the start of buf
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+ if (simdutf_unlikely(len == 0)) {
+ // empty input is valid UTF-32. protect the implementation from
+ // handling nullptr
+ return true;
+ }
+ const char32_t *tail = sse_validate_utf32le(buf, len); // SSE stage; a null return is reported as invalid below
+ if (tail) { // non-null: the scalar validator checks the remaining len - (tail - buf) units
+ return scalar::utf32::validate(tail, len - (tail - buf));
+ } else {
+ return false;
+ }
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+ const char32_t *buf, size_t len) const noexcept {
+ if (len == 0) {
+ // empty input is valid UTF-32. protect the implementation from
+ // handling nullptr
+ return result(error_code::SUCCESS, 0);
+ }
+ result res = sse_validate_utf32le_with_errors(buf, len);
+ if (res.count != len) { // SSE stage did not account for every unit: run the scalar validator from res.count
+ result scalar_res =
+ scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+ return result(scalar_res.error, res.count + scalar_res.count); // rebase the scalar count onto the start of buf
+ } else {
+ return res;
+ }
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+ const char *buf, size_t len, char *utf8_output) const noexcept {
+
+ std::pair<const char *, char *> ret =
+ sse_convert_latin1_to_utf8(buf, len, utf8_output); // ret.first: next unread input, ret.second: next free output slot (as used below)
+ size_t converted_chars = ret.second - utf8_output; // output bytes produced by the SSE stage
+
+ if (ret.first != buf + len) { // input not fully consumed: convert the remainder with the scalar routine
+ const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
+ ret.first, len - (ret.first - buf), ret.second);
+ converted_chars += scalar_converted_chars;
+ }
+
+ return converted_chars; // total output bytes from both stages
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ std::pair<const char *, char16_t *> ret =
+ sse_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output); // ret.first: next unread input, ret.second: next free output slot (as used below)
+ if (ret.first == nullptr) { // SSE stage returned a null input pointer: report 0 units written
+ return 0;
+ }
+ size_t converted_chars = ret.second - utf16_output; // char16_t units produced by the SSE stage
+ if (ret.first != buf + len) { // input not fully consumed: convert the remainder with the scalar routine
+ const size_t scalar_converted_chars =
+ scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_converted_chars == 0) { // scalar stage produced nothing: the whole call returns 0 (presumably a failure signal - confirm)
+ return 0;
+ }
+ converted_chars += scalar_converted_chars;
+ }
+ return converted_chars; // total char16_t units from both stages
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ std::pair<const char *, char16_t *> ret =
+ sse_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output); // ret.first: next unread input, ret.second: next free output slot (as used below)
+ if (ret.first == nullptr) { // SSE stage returned a null input pointer: report 0 units written
+ return 0;
+ }
+ size_t converted_chars = ret.second - utf16_output; // char16_t units produced by the SSE stage
+ if (ret.first != buf + len) { // input not fully consumed: convert the remainder with the scalar routine
+ const size_t scalar_converted_chars =
+ scalar::latin1_to_utf16::convert<endianness::BIG>(
+ ret.first, len - (ret.first - buf), ret.second);
+ if (scalar_converted_chars == 0) { // scalar stage produced nothing: the whole call returns 0 (presumably a failure signal - confirm)
+ return 0;
+ }
+ converted_chars += scalar_converted_chars;
+ }
+ return converted_chars; // total char16_t units from both stages
+}
+
+// Converts Latin-1 to UTF-32 and returns the number of char32_t units
+// produced, or 0 when either conversion stage signals failure.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> simd =
+      sse_convert_latin1_to_utf32(buf, len, utf32_output);
+  const char *in_pos = simd.first;
+  char32_t *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_units = out_pos - utf32_output;
+  if (in_pos == buf + len) {
+    return simd_units; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_units =
+      scalar::latin1_to_utf32::convert(in_pos, len - (in_pos - buf), out_pos);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+// UTF-8 -> Latin-1 through a freshly constructed validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, latin1_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+ const char *buf, size_t len, char *latin1_output) const noexcept {
+ utf8_to_latin1::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, latin1_output);
+}
+
+// Forwards to the westmere UTF-8 -> Latin-1 convert_valid routine.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written =
+      westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+  return written;
+}
+
+// UTF-8 -> little-endian UTF-16 through a validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
+  return written;
+}
+
+// UTF-8 -> big-endian UTF-16 through a validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder transcoder;
+  const size_t written =
+      transcoder.convert<endianness::BIG>(buf, len, utf16_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+ utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+ const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+ utf8_to_utf16::validating_transcoder converter;
+ return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+
+// Forwards to utf8_to_utf16::convert_valid for little-endian output.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+                                                       utf16_output);
+  return written;
+}
+
+// Forwards to utf8_to_utf16::convert_valid for big-endian output.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  const size_t written =
+      utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+  return written;
+}
+
+// UTF-8 -> UTF-32 through a validating transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder transcoder;
+  const size_t written = transcoder.convert(buf, len, utf32_output);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+ const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+ utf8_to_utf32::validating_transcoder converter;
+ return converter.convert_with_errors(buf, len, utf32_output);
+}
+
+// Forwards to utf8_to_utf32::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  const size_t written =
+      utf8_to_utf32::convert_valid(input, size, utf32_output);
+  return written;
+}
+
+// Converts little-endian UTF-16 to Latin-1. Returns the number of bytes
+// written to latin1_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      sse_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+  const char16_t *in_pos = simd.first;
+  char *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_bytes = out_pos - latin1_output;
+  if (in_pos == buf + len) {
+    return simd_bytes; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_bytes =
+      scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+          in_pos, len - (in_pos - buf), out_pos);
+  return tail_bytes == 0 ? 0 : simd_bytes + tail_bytes;
+}
+
+// Converts big-endian UTF-16 to Latin-1. Returns the number of bytes written
+// to latin1_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      sse_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  const char16_t *in_pos = simd.first;
+  char *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_bytes = out_pos - latin1_output;
+  if (in_pos == buf + len) {
+    return simd_bytes; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_bytes = scalar::utf16_to_latin1::convert<endianness::BIG>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return tail_bytes == 0 ? 0 : simd_bytes + tail_bytes;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ sse_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+ buf, len, latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+ const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+ std::pair<result, char *> ret =
+ sse_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+ latin1_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// No dedicated fast path for pre-validated input exists yet (an optimized
+// routine could be added); reuse the checking big-endian converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16be_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// No dedicated fast path for pre-validated input exists yet (an optimized
+// routine could be added); reuse the checking little-endian converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf16le_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// Converts little-endian UTF-16 to UTF-8. Returns the number of bytes
+// written to utf8_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  const char16_t *in_pos = simd.first;
+  char *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_bytes = out_pos - utf8_output;
+  if (in_pos == buf + len) {
+    return simd_bytes; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return tail_bytes == 0 ? 0 : simd_bytes + tail_bytes;
+}
+
+// Converts big-endian UTF-16 to UTF-8. Returns the number of bytes written
+// to utf8_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> simd =
+      sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  const char16_t *in_pos = simd.first;
+  char *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_bytes = out_pos - utf8_output;
+  if (in_pos == buf + len) {
+    return simd_bytes; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return tail_bytes == 0 ? 0 : simd_bytes + tail_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
+ buf, len, utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+ const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(
+ buf, len, utf8_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Pre-validated little-endian UTF-16 takes the same path as the checking
+// converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf16le_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Pre-validated big-endian UTF-16 takes the same path as the checking
+// converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf16be_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Converts UTF-32 to Latin-1. Returns the number of bytes written to
+// latin1_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char32_t *, char *> simd =
+      sse_convert_utf32_to_latin1(buf, len, latin1_output);
+  const char32_t *in_pos = simd.first;
+  char *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_bytes = out_pos - latin1_output;
+  // Unlike the sibling converters this one tests with `<`, so the scalar
+  // stage runs only while the input cursor is strictly before the end.
+  if (!(in_pos < buf + len)) {
+    return simd_bytes;
+  }
+  const size_t tail_bytes =
+      scalar::utf32_to_latin1::convert(in_pos, len - (in_pos - buf), out_pos);
+  return tail_bytes == 0 ? 0 : simd_bytes + tail_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+ const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ westmere::sse_convert_utf32_to_latin1_with_errors(buf, len,
+ latin1_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ latin1_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// No dedicated fast path for pre-validated input exists yet (an optimized
+// routine could be added); reuse the checking converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = convert_utf32_to_latin1(buf, len, latin1_output);
+  return written;
+}
+
+// Converts UTF-32 to UTF-8. Returns the number of bytes written to
+// utf8_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char32_t *, char *> simd =
+      sse_convert_utf32_to_utf8(buf, len, utf8_output);
+  const char32_t *in_pos = simd.first;
+  char *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_bytes = out_pos - utf8_output;
+  if (in_pos == buf + len) {
+    return simd_bytes; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_bytes =
+      scalar::utf32_to_utf8::convert(in_pos, len - (in_pos - buf), out_pos);
+  return tail_bytes == 0 ? 0 : simd_bytes + tail_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+ const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char *> ret =
+ westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+ if (ret.first.count != len) {
+ result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf8_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Converts little-endian UTF-16 to UTF-32. Returns the number of char32_t
+// units written to utf32_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> simd =
+      sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  const char16_t *in_pos = simd.first;
+  char32_t *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_units = out_pos - utf32_output;
+  if (in_pos == buf + len) {
+    return simd_units; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_units =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          in_pos, len - (in_pos - buf), out_pos);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+// Converts big-endian UTF-16 to UTF-32. Returns the number of char32_t units
+// written to utf32_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> simd =
+      sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  const char16_t *in_pos = simd.first;
+  char32_t *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_units = out_pos - utf32_output;
+  if (in_pos == buf + len) {
+    return simd_units; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_units = scalar::utf16_to_utf32::convert<endianness::BIG>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
+ buf, len, utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+ const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char32_t *> ret =
+ westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(
+ buf, len, utf32_output);
+ if (ret.first.error) {
+ return ret.first;
+ } // Can return directly since scalar fallback already found correct
+ // ret.first.count
+ if (ret.first.count != len) { // All good so far, but not finished
+ result scalar_res =
+ scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf32_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Pre-validated UTF-32 takes the same path as the checking converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = convert_utf32_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// Converts UTF-32 to little-endian UTF-16. Returns the number of char16_t
+// units written to utf16_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> simd =
+      sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  const char32_t *in_pos = simd.first;
+  char16_t *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_units = out_pos - utf16_output;
+  if (in_pos == buf + len) {
+    return simd_units; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_units =
+      scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+          in_pos, len - (in_pos - buf), out_pos);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+// Converts UTF-32 to big-endian UTF-16. Returns the number of char16_t units
+// written to utf16_output, or 0 when either stage signals failure.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> simd =
+      sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  const char32_t *in_pos = simd.first;
+  char16_t *out_pos = simd.second;
+  if (in_pos == nullptr) {
+    return 0;
+  }
+  const size_t simd_units = out_pos - utf16_output;
+  if (in_pos == buf + len) {
+    return simd_units; // the vector kernel consumed everything
+  }
+  // Let the scalar converter finish the unconsumed tail.
+  const size_t tail_units = scalar::utf32_to_utf16::convert<endianness::BIG>(
+      in_pos, len - (in_pos - buf), out_pos);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+ const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+ // ret.first.count is always the position in the buffer, not the number of
+ // code units written even if finished
+ std::pair<result, char16_t *> ret =
+ westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(
+ buf, len, utf16_output);
+ if (ret.first.count != len) {
+ result scalar_res =
+ scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+ buf + ret.first.count, len - ret.first.count, ret.second);
+ if (scalar_res.error) {
+ scalar_res.count += ret.first.count;
+ return scalar_res;
+ } else {
+ ret.second += scalar_res.count;
+ }
+ }
+ ret.first.count =
+ ret.second -
+ utf16_output; // Set count to the number of 8-bit code units written
+ return ret.first;
+}
+
+// Pre-validated UTF-32 takes the same path as the checking little-endian
+// converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t written = convert_utf32_to_utf16le(buf, len, utf16_output);
+  return written;
+}
+
+// Pre-validated UTF-32 takes the same path as the checking big-endian
+// converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t written = convert_utf32_to_utf16be(buf, len, utf16_output);
+  return written;
+}
+
+// Pre-validated little-endian UTF-16 takes the same path as the checking
+// converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const size_t written = convert_utf16le_to_utf32(buf, len, utf32_output);
+  return written;
+}
+
+// Pre-validated big-endian UTF-16 takes the same path as the checking
+// converter.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const size_t written = convert_utf16be_to_utf32(buf, len, utf32_output);
+  return written;
+}
+
+// Forwards to utf16::change_endianness_utf16 with the same three arguments.
+void implementation::change_endianness_utf16(
+    const char16_t *input, size_t length, char16_t *output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
+}
+
+// Code-point count of a little-endian UTF-16 buffer, as computed by
+// utf16::count_code_points.
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points =
+      utf16::count_code_points<endianness::LITTLE>(input, length);
+  return code_points;
+}
+
+// Code-point count of a big-endian UTF-16 buffer, as computed by
+// utf16::count_code_points.
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points =
+      utf16::count_code_points<endianness::BIG>(input, length);
+  return code_points;
+}
+
+// Code-point count of a UTF-8 buffer, as computed by utf8::count_code_points.
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  const size_t code_points = utf8::count_code_points(input, length);
+  return code_points;
+}
+
+// The Latin-1 length of a UTF-8 buffer is taken to be its code-point count,
+// so this simply reuses count_utf8.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  const size_t code_points = count_utf8(buf, len);
+  return code_points;
+}
+
+// Forwards to the scalar helper scalar::utf16::latin1_length_from_utf16.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  const size_t latin1_len = scalar::utf16::latin1_length_from_utf16(length);
+  return latin1_len;
+}
+
+// Forwards to the scalar helper scalar::utf32::latin1_length_from_utf32.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  const size_t latin1_len = scalar::utf32::latin1_length_from_utf32(length);
+  return latin1_len;
+}
+
+// UTF-8 length of a little-endian UTF-16 buffer, as computed by
+// utf16::utf8_length_from_utf16.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf8_len =
+      utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+  return utf8_len;
+}
+
+// UTF-8 length of a big-endian UTF-16 buffer, as computed by
+// utf16::utf8_length_from_utf16.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf8_len =
+      utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+  return utf8_len;
+}
+
+// Forwards to the scalar helper scalar::latin1::utf16_length_from_latin1.
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  const size_t utf16_len = scalar::latin1::utf16_length_from_latin1(length);
+  return utf16_len;
+}
+
+// Forwards to the scalar helper scalar::latin1::utf32_length_from_latin1.
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  const size_t utf32_len = scalar::latin1::utf32_length_from_latin1(length);
+  return utf32_len;
+}
+
+// Computes the UTF-8 length of the Latin-1 buffer `input` of `len` bytes:
+// every byte counts once, and each byte with its high bit set counts one
+// extra time. Whole 16-byte vectors are processed with SSE; the remaining
+// bytes are handed to the scalar routine.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+ const char *input, size_t len) const noexcept {
+ const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+ // Start from the number of bytes covered by whole vectors (len rounded down
+ // to a multiple of 16); the loops below only add the extra bytes.
+ size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
+ size_t i = 0;
+ if (answer >= 2048) { // long strings optimization
+ // Two 64-bit accumulators holding the running total of high-bit bytes.
+ __m128i two_64bits = _mm_setzero_si128();
+ while (i + sizeof(__m128i) <= len) {
+ // Sixteen 8-bit counters; at most 255 vectors are folded in per round so
+ // that no lane can wrap around.
+ __m128i runner = _mm_setzero_si128();
+ size_t iterations = (len - i) / sizeof(__m128i);
+ if (iterations > 255) {
+ iterations = 255;
+ }
+ // Offset of the last vector that belongs to this round.
+ size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
+ // Four vectors per step: cmpgt(0, x) yields 0xFF (-1) in every lane whose
+ // byte is negative as a signed value, i.e. has its high bit set.
+ for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) {
+ __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
+ __m128i input2 =
+ _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
+ __m128i input3 =
+ _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i)));
+ __m128i input4 =
+ _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i)));
+ __m128i input12 =
+ _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1),
+ _mm_cmpgt_epi8(_mm_setzero_si128(), input2));
+ __m128i input34 =
+ _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3),
+ _mm_cmpgt_epi8(_mm_setzero_si128(), input4));
+ __m128i input1234 = _mm_add_epi8(input12, input34);
+ // Subtracting the summed -1 values increments the matching counters.
+ runner = _mm_sub_epi8(runner, input1234);
+ }
+ // Remaining vectors of this round, one at a time.
+ for (; i <= max_i; i += sizeof(__m128i)) {
+ __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
+ runner = _mm_sub_epi8(runner,
+ _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
+ }
+ // Horizontally add the byte counters into the two 64-bit lanes.
+ two_64bits =
+ _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
+ }
+ answer +=
+ _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1);
+ } else if (answer > 0) { // short string optimization
+ // movemask collects the high bit of each byte, so its popcount is the
+ // number of high-bit bytes in the vector. Two vectors per step first.
+ for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) {
+ __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
+ uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+ answer += count_ones(non_ascii);
+ latin = _mm_loadu_si128((const __m128i *)(input + i) + 1);
+ non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+ answer += count_ones(non_ascii);
+ }
+ // At most one whole vector is left at this point.
+ for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) {
+ __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
+ uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+ answer += count_ones(non_ascii);
+ }
+ }
+ // Bytes past the last whole vector are counted by the scalar routine.
+ return answer + scalar::latin1::utf8_length_from_latin1(
+ reinterpret_cast<const char *>(str + i), len - i);
+}
+
+// UTF-32 length of a little-endian UTF-16 buffer, as computed by
+// utf16::utf32_length_from_utf16.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf32_len =
+      utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+  return utf32_len;
+}
+
+// UTF-32 length of a big-endian UTF-16 buffer, as computed by
+// utf16::utf32_length_from_utf16.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf32_len =
+      utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+  return utf32_len;
+}
+
+// UTF-16 length of a UTF-8 buffer, as computed by
+// utf8::utf16_length_from_utf8.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t utf16_len = utf8::utf16_length_from_utf8(input, length);
+  return utf16_len;
+}
+
+// Computes the UTF-8 byte length of a UTF-32 buffer. Four code points are
+// classified per SSE step; the remainder goes to the scalar routine.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m128i zero = _mm_setzero_si128();
+  // Masks selecting the bits above the 7-, 11- and 16-bit payloads.
+  const __m128i above_7_bits = _mm_set1_epi32((uint32_t)0xffffff80);
+  const __m128i above_11_bits = _mm_set1_epi32((uint32_t)0xfffff800);
+  const __m128i above_16_bits = _mm_set1_epi32((uint32_t)0xffff0000);
+  size_t total = 0;
+  size_t pos = 0;
+  while (pos + 4 <= length) {
+    const __m128i in = _mm_loadu_si128((const __m128i *)(input + pos));
+    // Lanes are all-ones when the value fits in 7 / 11 / 16 bits.
+    const __m128i fits_7 =
+        _mm_cmpeq_epi32(_mm_and_si128(in, above_7_bits), zero);
+    const __m128i fits_11 =
+        _mm_cmpeq_epi32(_mm_and_si128(in, above_11_bits), zero);
+    const __m128i fits_16 =
+        _mm_cmpeq_epi32(_mm_and_si128(in, above_16_bits), zero);
+    // XOR of nested classes isolates the exactly-two and exactly-three cases.
+    const __m128i exactly_two = _mm_xor_si128(fits_11, fits_7);
+    const __m128i exactly_three = _mm_xor_si128(fits_16, fits_11);
+    const uint16_t one_bits = static_cast<uint16_t>(_mm_movemask_epi8(fits_7));
+    const uint16_t two_bits =
+        static_cast<uint16_t>(_mm_movemask_epi8(exactly_two));
+    const uint16_t three_bits =
+        static_cast<uint16_t>(_mm_movemask_epi8(exactly_three));
+
+    // A matching 32-bit lane sets four movemask bits, hence the division.
+    const size_t one_byte_cps = count_ones(one_bits) / 4;
+    const size_t two_byte_cps = count_ones(two_bits) / 4;
+    const size_t three_byte_cps = count_ones(three_bits) / 4;
+    // Assume four bytes for each of the four lanes, then subtract the savings.
+    total += 16 - 3 * one_byte_cps - 2 * two_byte_cps - three_byte_cps;
+    pos += 4;
+  }
+  return total +
+         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+// Computes the UTF-16 code-unit length of a UTF-32 buffer. Four code points
+// are examined per SSE step; the remainder goes to the scalar routine.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i above_16_bits = _mm_set1_epi32((uint32_t)0xffff0000);
+  size_t total = 0;
+  size_t pos = 0;
+  while (pos + 4 <= length) {
+    const __m128i in = _mm_loadu_si128((const __m128i *)(input + pos));
+    // All-ones lanes mark values that fit in 16 bits (one UTF-16 unit).
+    const __m128i fits_16 =
+        _mm_cmpeq_epi32(_mm_and_si128(in, above_16_bits), zero);
+    const uint16_t fits_16_bits =
+        static_cast<uint16_t>(_mm_movemask_epi8(fits_16));
+    // Lanes that do not fit need a second unit; four mask bits per lane.
+    const size_t two_unit_cps = (16 - count_ones(fits_16_bits)) / 4;
+    total += 4 + two_unit_cps;
+    pos += 4;
+  }
+  return total +
+         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+// The UTF-32 length of a UTF-8 buffer is its code-point count, as computed
+// by utf8::count_code_points.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t code_points = utf8::count_code_points(input, length);
+  return code_points;
+}
+
+// 8-bit input overload: forwards to the scalar base64 helper of the same
+// name.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept {
+  const size_t max_len =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return max_len;
+}
+
+// Decodes base64 from 8-bit input. The base64_url bit of `options` selects
+// which compress_decode_base64 instantiation is used.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes base64 from 8-bit input and returns the full_result. The
+// base64_url bit of `options` selects which compress_decode_base64
+// instantiation is used.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// 16-bit input overload: forwards to the scalar base64 helper of the same
+// name.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t max_len =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return max_len;
+}
+
+// Decodes base64 from 16-bit input. The base64_url bit of `options` selects
+// which compress_decode_base64 instantiation is used.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes base64 from 16-bit input and returns the full_result. The
+// base64_url bit of `options` selects which compress_decode_base64
+// instantiation is used.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Forwards to the scalar helper scalar::base64::base64_length_from_binary.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  const size_t encoded_len =
+      scalar::base64::base64_length_from_binary(length, options);
+  return encoded_len;
+}
+
+// Encodes `length` bytes of `input` as base64 into `output` and returns the
+// encoder's result. The base64_url bit of `options` selects which
+// encode_base64 instantiation is used.
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  const bool url_variant = (options & base64_url) != 0;
+  return url_variant ? encode_base64<true>(output, input, length, options)
+                     : encode_base64<false>(output, input, length, options);
+}
+} // namespace SIMDUTF_IMPLEMENTATION
+} // namespace simdutf
+
+#include "simdutf/westmere/end.h"
diff --git a/contrib/simdutf/src/westmere/internal/loader.cpp b/contrib/simdutf/src/westmere/internal/loader.cpp
new file mode 100644
index 000000000..459e9aa55
--- /dev/null
+++ b/contrib/simdutf/src/westmere/internal/loader.cpp
@@ -0,0 +1,7 @@
+// The helper source is included textually *inside* the namespaces below, so
+// everything it defines is declared in internal::westmere.
+namespace internal {
+namespace westmere {
+
+#include "westmere/internal/write_v_u16_11bits_to_utf8.cpp"
+
+} // namespace westmere
+} // namespace internal
diff --git a/contrib/simdutf/src/westmere/internal/write_v_u16_11bits_to_utf8.cpp b/contrib/simdutf/src/westmere/internal/write_v_u16_11bits_to_utf8.cpp
new file mode 100644
index 000000000..718b1140d
--- /dev/null
+++ b/contrib/simdutf/src/westmere/internal/write_v_u16_11bits_to_utf8.cpp
@@ -0,0 +1,66 @@
+/*
+ * Encodes a vector of eight uint16 values as UTF-8, one or two bytes each.
+ * Bits after the 11th are ignored; the first 11 bits are encoded into UTF-8.
+ *
+ * v_u16             - the eight 16-bit values to encode
+ * utf8_output       - output cursor (passed by reference); 16 bytes are always
+ *                     stored, then the cursor advances by the byte count read
+ *                     from the lookup row
+ * one_byte_bytemask - per-byte blend mask; lanes where it is set are taken
+ *                     from v_u16 unchanged instead of the 2-byte encoding
+ * one_byte_bitmask  - the same mask in bit form (see step 3 for the layout)
+ *
+ * !important! utf8_output must have at least 16 writable bytes
+ */
+
+inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
+ const __m128i one_byte_bytemask,
+ const uint16_t one_byte_bitmask) {
+ // 0b1100_0000_1000_0000
+ const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+ // 0b0001_1111_0000_0000
+ const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+ // 0b0000_0000_0011_1111
+ const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m128i t0 = _mm_slli_epi16(v_u16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m128i t1 = _mm_and_si128(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m128i t2 = _mm_and_si128(v_u16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m128i t3 = _mm_or_si128(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ // (blendv picks v_u16 where one_byte_bytemask is set, t4 elsewhere)
+ const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
+ // - LSB)
+ const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+ const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+ const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
+ // 4. pack the bytes
+ // row[0] is the number of bytes to advance by; row[1..16] is the shuffle
+ // that packs the useful bytes to the front.
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+ // 5. store bytes (always a full 16-byte store, hence the size requirement)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ utf8_output += row[0];
+}
+
+// Convenience overload: derives the one-byte lane mask from v_u16 and then
+// forwards to the overload taking the precomputed byte mask and bit mask.
+// Callers in this code base pass an all-zero vector as v_0000 and 0xff80 in
+// every 16-bit lane as v_ff80.
+inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
+                                       const __m128i v_0000,
+                                       const __m128i v_ff80) {
+  // a lane needs a single byte when none of the bits selected by v_ff80 is set
+  const __m128i upper_bits = _mm_and_si128(v_u16, v_ff80);
+  const __m128i single_byte_lanes = _mm_cmpeq_epi16(upper_bits, v_0000);
+  const int single_byte_bits = _mm_movemask_epi8(single_byte_lanes);
+
+  write_v_u16_11bits_to_utf8(v_u16, utf8_output, single_byte_lanes,
+                             static_cast<uint16_t>(single_byte_bits));
+}
diff --git a/contrib/simdutf/src/westmere/sse_base64.cpp b/contrib/simdutf/src/westmere/sse_base64.cpp
new file mode 100644
index 000000000..4c1befa97
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_base64.cpp
@@ -0,0 +1,591 @@
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+// Translates sixteen 6-bit values (one per byte of `input`, per the ranges
+// listed below) into base64 ASCII characters. base64_url selects the
+// '-' / '_' symbols instead of '+' / '/' for the values 62 and 63.
+template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
+ // credit: Wojciech Muła
+ // reduce 0..51 -> 0
+ // 52..61 -> 1 .. 10
+ // 62 -> 11
+ // 63 -> 12
+ __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51));
+
+ // distinguish between ranges 0..25 and 26..51:
+ // 0 .. 25 -> becomes 13 (`less` is all-ones exactly when input < 26)
+ // 26 .. 51 -> remains 0
+ const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
+ result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
+
+ // shift_LUT[k] is the offset added to the 6-bit value to reach its ASCII
+ // code: slot 0 serves 26..51 ('a'..'z'), slots 1..10 the digits, slots 11
+ // and 12 the two alphabet-specific symbols, slot 13 serves 0..25 ('A'..'Z').
+ __m128i shift_LUT;
+ if (base64_url) {
+ shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+ } else {
+ shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+ '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+ }
+
+ // read shift
+ result = _mm_shuffle_epi8(shift_LUT, result);
+
+ return _mm_add_epi8(result, input);
+}
+
+// Encodes `srclen` bytes from `src` as base64 text into `dst`.
+// The SIMD loops emit 16 characters for every 12 input bytes; the remaining
+// input is handed to scalar::base64::tail_encode_base64 along with `options`.
+// Returns i / 3 * 4 (characters produced by the SIMD loops) plus the tail
+// encoder's return value.
+// isbase64url selects the alphabet used by lookup_pshufb_improved.
+template <bool isbase64url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+ base64_options options) {
+ // credit: Wojciech Muła
+ // SSE (lookup: pshufb improved unrolled)
+ const uint8_t *input = (const uint8_t *)src;
+
+ uint8_t *out = (uint8_t *)dst;
+ const __m128i shuf =
+ _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
+
+ size_t i = 0;
+ // Unrolled path: 4 x 12 = 48 input bytes per iteration. The last 16-byte
+ // load starts at i + 36, hence the i + 52 <= srclen bound.
+ for (; i + 52 <= srclen; i += 48) {
+ __m128i in0 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+ __m128i in1 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+ __m128i in2 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+ __m128i in3 = _mm_loadu_si128(
+ reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
+
+ in0 = _mm_shuffle_epi8(in0, shuf);
+ in1 = _mm_shuffle_epi8(in1, shuf);
+ in2 = _mm_shuffle_epi8(in2, shuf);
+ in3 = _mm_shuffle_epi8(in3, shuf);
+
+ const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
+ const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
+ const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
+ const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));
+
+ const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
+ const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
+ const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
+ const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));
+
+ const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
+ const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
+ const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
+ const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));
+
+ const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
+ const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
+ const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
+ const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));
+
+ const __m128i input0 = _mm_or_si128(t1_0, t3_0);
+ const __m128i input1 = _mm_or_si128(t1_1, t3_1);
+ const __m128i input2 = _mm_or_si128(t1_2, t3_2);
+ const __m128i input3 = _mm_or_si128(t1_3, t3_3);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+ lookup_pshufb_improved<isbase64url>(input0));
+ out += 16;
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+ lookup_pshufb_improved<isbase64url>(input1));
+ out += 16;
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+ lookup_pshufb_improved<isbase64url>(input2));
+ out += 16;
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+ lookup_pshufb_improved<isbase64url>(input3));
+ out += 16;
+ }
+ // Single-vector path: consumes 12 bytes per iteration but loads 16, hence
+ // the i + 16 <= srclen bound.
+ for (; i + 16 <= srclen; i += 12) {
+
+ __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+
+ // bytes from groups A, B and C are needed in separate 32-bit lanes
+ // in = [DDDD|CCCC|BBBB|AAAA]
+ //
+ // an input triplet has layout
+ // [????????|ccdddddd|bbbbcccc|aaaaaabb]
+ // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next
+ // triplet
+ //
+ // shuffling changes the order of bytes: 1, 0, 2, 1
+ // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+ // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+ // processed bits
+ in = _mm_shuffle_epi8(in, shuf);
+
+ // unpacking
+
+ // t0 = [0000cccc|cc000000|aaaaaa00|00000000]
+ const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
+ // t1 = [00000000|00cccccc|00000000|00aaaaaa]
+ // (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned
+ // multiplication)
+ const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
+
+ // t2 = [00000000|00dddddd|000000bb|bbbb0000]
+ const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
+ // t3 = [00dddddd|00000000|00bbbbbb|00000000]
+ // (d * (1 << 8), b * (1 << 4))
+ const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+
+ // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+ const __m128i indices = _mm_or_si128(t1, t3);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+ lookup_pshufb_improved<isbase64url>(indices));
+ out += 16;
+ }
+
+ // i is a multiple of 12 here, so i / 3 * 4 is the exact number of
+ // characters written by the two loops above.
+ return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+ srclen - i, options);
+}
+// Stores 16 bytes at `output` derived from `data` and `mask`.
+// With mask == 0 the data is stored unchanged. Otherwise the bytes are
+// rearranged through table-driven shuffles; compress_block advances its
+// output by the number of *zero* bits in the mask, so set bits presumably
+// mark bytes to drop and the kept bytes end up packed at the front.
+// A full 16 bytes is written in every case.
+static inline void compress(__m128i data, uint16_t mask, char *output) {
+ if (mask == 0) {
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+ return;
+ }
+
+ // this particular implementation was inspired by work done by @animetosho
+ // we do it in two steps, first 8 bytes and then second 8 bytes
+ uint8_t mask1 = uint8_t(mask); // least significant 8 bits
+ uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+ // next line just loads the 64-bit values thintable_epi8[mask1] and
+ // thintable_epi8[mask2] into a 128-bit register, using only
+ // two instructions on most compilers.
+
+ __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
+ tables::base64::thintable_epi8[mask1]);
+ // we increment by 0x08 the second half of the mask
+ // (so that its indices address bytes 8..15 of `data`)
+ shufmask =
+ _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+ // this is the version "nearly pruned"
+ __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+ // we still need to put the two halves together.
+ // we compute the popcount of the first half:
+ // NOTE(review): the table name suggests the entry is twice the popcount,
+ // which together with the "* 8" below would give 16-byte rows -- confirm
+ // against the table definition.
+ int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+ // then load the corresponding mask, what it does is to write
+ // only the first pop1 bytes from the first 8 bytes, and then
+ // it fills in with the bytes from the second 8 bytes + some filling
+ // at the end.
+ __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
+ tables::base64::pshufb_combine_table + pop1 * 8));
+ __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
+}
+
+// Four 128-bit registers treated as one 64-byte block; chunks[k] holds bytes
+// 16*k .. 16*k+15 as filled by load_block and written by copy_block.
+struct block64 {
+ __m128i chunks[4];
+};
+
+// Translates and classifies the 16 characters held in *src.
+//
+// On return *src is replaced by `out`: the input with a table-selected delta
+// added (callers feed it to the base64 decoder as 6-bit values).
+// The returned 16-bit mask has bit i set when byte i of `chk` has its top bit
+// set; callers treat those positions as characters to be removed.
+// When the mask is non-zero, *error is set to the mask XOR the bit mask of the
+// ASCII white-space bytes matched by ascii_space_tbl (0x20, 0x09, 0x0a, 0x0c,
+// 0x0d). *error is NOT written when the mask is zero, so callers must
+// initialize it.
+// base64_url selects the lookup tables for the URL-safe alphabet.
+template <bool base64_url>
+static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) {
+ // indexed by the low nibble of a byte; a byte equals its table entry only
+ // for 0x20, 0x09, 0x0a, 0x0c and 0x0d
+ const __m128i ascii_space_tbl =
+ _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
+ 0xc, 0xd, 0x0, 0x0);
+ // credit: aqrit
+ // (both branches below hold the same sixteen values)
+ __m128i delta_asso;
+ if (base64_url) {
+ delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+ } else {
+
+ delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+ }
+ __m128i delta_values;
+ if (base64_url) {
+ delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+ uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
+ 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
+ uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+ } else {
+
+ delta_values =
+ _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+ int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+ int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+ int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
+ }
+ __m128i check_asso;
+ if (base64_url) {
+ check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
+ } else {
+
+ check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+ }
+ __m128i check_values;
+ if (base64_url) {
+ check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+ uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
+ uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5),
+ uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
+ uint8_t(0x80), 0x0, uint8_t(0x80));
+ } else {
+
+ check_values =
+ _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+ int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+ int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+ int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+ }
+ const __m128i shifted = _mm_srli_epi32(*src, 3);
+
+ // two hashes per byte: average of a low-nibble table lookup and the shifted
+ // input; each hash indexes the matching *_values table below
+ const __m128i delta_hash =
+ _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
+ const __m128i check_hash =
+ _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted);
+
+ const __m128i out =
+ _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src);
+ const __m128i chk =
+ _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src);
+ // one bit per byte: the top bit of each byte of chk
+ const int mask = _mm_movemask_epi8(chk);
+ if (mask) {
+ __m128i ascii_space =
+ _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src);
+ *error = (mask ^ _mm_movemask_epi8(ascii_space));
+ }
+ *src = out;
+ return (uint16_t)mask;
+}
+
+// 64-byte variant: runs the 16-byte to_base64_mask on each of the four chunks
+// (in order) and concatenates the four 16-bit masks and the four 16-bit error
+// words into 64-bit values, chunk k occupying bits 16*k .. 16*k+15.
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) {
+  uint64_t combined_mask = 0;
+  uint64_t combined_error = 0;
+  for (int lane = 0; lane < 4; lane++) {
+    uint32_t lane_error = 0; // the callee only writes it on a non-zero mask
+    const uint64_t lane_mask =
+        to_base64_mask<base64_url>(&b->chunks[lane], &lane_error);
+    combined_mask |= lane_mask << (16 * lane);
+    combined_error |= uint64_t(lane_error) << (16 * lane);
+  }
+  *error = combined_error;
+  return combined_mask;
+}
+
+// Number of trailing zero bits in `num`; returns 64 when `num` is zero (the
+// underlying compiler primitives are not used for a zero input).
+static inline size_t simdutf_tzcnt_u64(uint64_t num) {
+  if (num == 0) {
+    return 64;
+  }
+#if defined(_MSC_VER) && !defined(__clang__)
+  unsigned long bit_index;
+  _BitScanForward64(&bit_index, num);
+  return bit_index;
+#else // GCC or Clang
+  return (size_t)__builtin_ctzll(num);
+#endif
+}
+
+// Writes the four chunks of `b` to 64 consecutive bytes starting at `output`.
+static inline void copy_block(block64 *b, char *output) {
+  for (int k = 0; k < 4; k++) {
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16 * k),
+                     b->chunks[k]);
+  }
+}
+
+// Compresses the four chunks of `b` one after another into `output`.
+// Chunk k is written at the offset given by the number of zero bits of `mask`
+// among the bits belonging to the preceding chunks. Returns the number of
+// zero bits in the whole 64-bit mask. Each compress() call stores 16 bytes.
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  const uint64_t kept = ~mask;
+  char *const out1 = output + _mm_popcnt_u64(kept & 0xFFFF);
+  char *const out2 = output + _mm_popcnt_u64(kept & 0xFFFFFFFF);
+  char *const out3 = output + _mm_popcnt_u64(kept & 0xFFFFFFFFFFFFULL);
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16), out1);
+  compress(b->chunks[2], uint16_t(mask >> 32), out2);
+  compress(b->chunks[3], uint16_t(mask >> 48), out3);
+  return _mm_popcnt_u64(kept);
+}
+
+// Reads 64 bytes starting at `src` into the four chunks of `b`.
+// The caller must guarantee that 64 bytes are readable at `src`.
+static inline void load_block(block64 *b, const char *src) {
+  for (int k = 0; k < 4; k++) {
+    b->chunks[k] =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16 * k));
+  }
+}
+
+// Reads 64 UTF-16 code units (128 bytes) starting at `src` and narrows them
+// to 64 bytes with _mm_packus_epi16 (saturating pack), filling the four chunks
+// of `b`. The caller must guarantee that 128 bytes are readable at `src`.
+static inline void load_block(block64 *b, const char16_t *src) {
+  for (int k = 0; k < 4; k++) {
+    const char16_t *const chunk_src = src + 16 * k;
+    const __m128i low_half =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(chunk_src));
+    const __m128i high_half =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(chunk_src + 8));
+    b->chunks[k] = _mm_packus_epi16(low_half, high_half);
+  }
+}
+
+// Packs sixteen 6-bit values (one per byte of `str`) into 12 output bytes.
+// A full 16 bytes is stored at `out`: 12 bytes of payload followed by 4 zero
+// bytes, so `out` needs 16 writable bytes.
+static inline void base64_decode(char *out, __m128i str) {
+ // credit: aqrit
+
+ // picks bytes 2,1,0 of every 32-bit lane (most significant first); the -1
+ // entries zero the last four output bytes
+ const __m128i pack_shuffle =
+ _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+
+ // per byte pair: first * 0x40 + second -> 12-bit value in each 16-bit lane
+ const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140));
+ // per 16-bit pair: first * 0x1000 + second -> 24-bit value in each 32-bit
+ // lane
+ const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000));
+ const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle);
+ // Store the output:
+ // this writes 16 bytes, but we only need 12.
+ _mm_storeu_si128((__m128i *)out, t2);
+}
+// Decodes 64 bytes of 6-bit values at `src` into 48 bytes at `out`.
+// Every base64_decode call stores 16 bytes, so the final call writes up to
+// out + 52; use base64_decode_block_safe when that overrun is not acceptable.
+static inline void base64_decode_block(char *out, const char *src) {
+  const __m128i *const vectors = reinterpret_cast<const __m128i *>(src);
+  for (int k = 0; k < 4; k++) {
+    base64_decode(out + 12 * k, _mm_loadu_si128(vectors + k));
+  }
+}
+// Same result as base64_decode_block(out, src), but the last 12 bytes go
+// through a stack buffer so that nothing is written beyond out + 48.
+static inline void base64_decode_block_safe(char *out, const char *src) {
+  const __m128i *const vectors = reinterpret_cast<const __m128i *>(src);
+  for (int k = 0; k < 3; k++) {
+    base64_decode(out + 12 * k, _mm_loadu_si128(vectors + k));
+  }
+  char scratch[16];
+  base64_decode(scratch, _mm_loadu_si128(vectors + 3));
+  std::memcpy(out + 36, scratch, 12);
+}
+// Decodes the four chunks of `b` into 48 bytes at `out`. The last
+// base64_decode call stores 16 bytes, i.e. writes up to out + 52.
+static inline void base64_decode_block(char *out, block64 *b) {
+  for (int k = 0; k < 4; k++) {
+    base64_decode(out + 12 * k, b->chunks[k]);
+  }
+}
+// Same result as base64_decode_block(out, b), but the last chunk is decoded
+// into a stack buffer first so that nothing is written beyond out + 48.
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+  for (int k = 0; k < 3; k++) {
+    base64_decode(out + 12 * k, b->chunks[k]);
+  }
+  char scratch[16];
+  base64_decode(scratch, b->chunks[3]);
+  std::memcpy(out + 36, scratch, 12);
+}
+
+// Decodes base64 text (`chartype` code units) from `src` into `dst`.
+//
+// Outline:
+//  1. trim trailing ignorable characters and up to two '=' padding signs;
+//  2. process the input 64 characters at a time: translate/validate with
+//     to_base64_mask, squeeze out ignorable characters into a staging buffer
+//     when needed, and decode full 64-value blocks to 48 output bytes;
+//  3. top up and flush the staging buffer, decode leftover quartets, rewind
+//     `src` over an incomplete final group;
+//  4. let scalar::base64::base64_tail_decode finish and validate the padding.
+//
+// Returns a full_result initialized as {error, input position, output count}
+// (see the brace-initialized returns below).
+// base64_url selects the URL-safe tables.
+template <bool base64_url, typename chartype>
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+ base64_options options,
+ last_chunk_handling_options last_chunk_options) {
+ const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+ : tables::base64::to_base64_value;
+ size_t equallocation =
+ srclen; // location of the first padding character if any
+ // skip trailing spaces
+ // (characters whose table entry is 64)
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ size_t equalsigns = 0;
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 1;
+ // skip trailing spaces
+ while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+ to_base64[uint8_t(src[srclen - 1])] == 64) {
+ srclen--;
+ }
+ if (srclen > 0 && src[srclen - 1] == '=') {
+ equallocation = srclen - 1;
+ srclen--;
+ equalsigns = 2;
+ }
+ }
+ if (srclen == 0) {
+ // padding with no payload in front of it is an error
+ if (equalsigns > 0) {
+ return {INVALID_BASE64_CHARACTER, equallocation, 0};
+ }
+ return {SUCCESS, 0, 0};
+ }
+ // base64_decode_block stores 16 bytes for its last 12 bytes of output; at or
+ // past this address the *_safe variants are used instead.
+ char *end_of_safe_64byte_zone =
+ (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
+
+ const chartype *const srcinit = src;
+ const char *const dstinit = dst;
+ const chartype *const srcend = src + srclen;
+
+ constexpr size_t block_size = 6;
+ static_assert(block_size >= 2, "block should of size 2 or more");
+ // staging area for 6-bit values once ignorable characters have been removed
+ char buffer[block_size * 64];
+ char *bufferptr = buffer;
+ if (srclen >= 64) {
+ const chartype *const srcend64 = src + srclen - 64;
+ while (src <= srcend64) {
+ block64 b;
+ load_block(&b, src);
+ src += 64;
+ uint64_t error = 0;
+ uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+ if (error) {
+ // report the position of the first offending character in this block
+ src -= 64;
+ size_t error_offset = simdutf_tzcnt_u64(error);
+ return {error_code::INVALID_BASE64_CHARACTER,
+ size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
+ }
+ if (badcharmask != 0) {
+ // optimization opportunity: check for simple masks like those made of
+ // continuous 1s followed by continuous 0s. And masks containing a
+ // single bad character.
+ bufferptr += compress_block(&b, badcharmask, bufferptr);
+ } else if (bufferptr != buffer) {
+ // the buffer already holds data: keep the values in order behind it
+ copy_block(&b, bufferptr);
+ bufferptr += 64;
+ } else {
+ // clean block and empty buffer: decode straight into dst
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, &b);
+ } else {
+ base64_decode_block(dst, &b);
+ }
+ dst += 48;
+ }
+ // once block_size - 1 full blocks are staged, decode them and move the
+ // remainder to the front of the buffer
+ if (bufferptr >= (block_size - 1) * 64 + buffer) {
+ for (size_t i = 0; i < (block_size - 2); i++) {
+ base64_decode_block(dst, buffer + i * 64);
+ dst += 48;
+ }
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+ } else {
+ base64_decode_block(dst, buffer + (block_size - 2) * 64);
+ }
+ dst += 48;
+ std::memcpy(buffer, buffer + (block_size - 1) * 64,
+ 64); // 64 might be too much
+ bufferptr -= (block_size - 1) * 64;
+ }
+ }
+ }
+
+ char *buffer_start = buffer;
+ // Optimization note: if this is almost full, then it is worth our
+ // time, otherwise, we should just decode directly.
+ int last_block = (int)((bufferptr - buffer_start) % 64);
+ if (last_block != 0 && srcend - src + last_block >= 64) {
+ // top up the partially filled block one character at a time
+ while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+ uint8_t val = to_base64[uint8_t(*src)];
+ *bufferptr = char(val);
+ if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+ return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+ size_t(dst - dstinit)};
+ }
+ // only values 0..63 are kept; a value of 64 is skipped
+ bufferptr += (val <= 63);
+ src++;
+ }
+ }
+
+ // decode every complete 64-value block left in the buffer
+ for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+ if (dst >= end_of_safe_64byte_zone) {
+ base64_decode_block_safe(dst, buffer_start);
+ } else {
+ base64_decode_block(dst, buffer_start);
+ }
+ dst += 48;
+ }
+ if ((bufferptr - buffer_start) % 64 != 0) {
+ // scalar decoding of the remaining quartets; these copy 4 bytes (one more
+ // than needed) because at least one more quartet follows
+ while (buffer_start + 4 < bufferptr) {
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 4);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // final complete quartet: copy exactly 3 bytes
+ if (buffer_start + 4 <= bufferptr) {
+ uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+ (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+ (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+ (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+ << 8;
+ triple = scalar::utf32::swap_bytes(triple);
+ std::memcpy(dst, &triple, 3);
+
+ dst += 3;
+ buffer_start += 4;
+ }
+ // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+ // backtrack
+ // (move src back over `leftover` kept characters, stepping over skipped
+ // ones, so that the scalar tail decoder below sees them again)
+ int leftover = int(bufferptr - buffer_start);
+ while (leftover > 0) {
+ while (to_base64[uint8_t(*(src - 1))] == 64) {
+ src--;
+ }
+ src--;
+ leftover--;
+ }
+ }
+ if (src < srcend + equalsigns) {
+ full_result r = scalar::base64::base64_tail_decode(
+ dst, src, srcend - src, equalsigns, options, last_chunk_options);
+ // make the input position relative to the start of the whole input
+ r.input_count += size_t(src - srcinit);
+ if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+ r.error == error_code::BASE64_EXTRA_BITS) {
+ return r;
+ } else {
+ r.output_count += size_t(dst - dstinit);
+ }
+ if (last_chunk_options != stop_before_partial &&
+ r.error == error_code::SUCCESS && equalsigns > 0) {
+ // additional checks
+ // (the number of '=' signs must match the output length modulo 3)
+ if ((r.output_count % 3 == 0) ||
+ ((r.output_count % 3) + 1 + equalsigns != 4)) {
+ r.error = error_code::INVALID_BASE64_CHARACTER;
+ r.input_count = equallocation;
+ }
+ }
+ return r;
+ }
+ if (equalsigns > 0) {
+ // same padding consistency check when no tail decoding was needed
+ if ((size_t(dst - dstinit) % 3 == 0) ||
+ ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+ return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+ }
+ }
+ return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf16.cpp b/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf16.cpp
new file mode 100644
index 000000000..b830c42c7
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf16.cpp
@@ -0,0 +1,21 @@
+// Widens Latin-1 bytes to UTF-16 code units, 16 characters per iteration.
+// Only the largest multiple of 16 not exceeding `len` is converted; the
+// returned pair points just past the converted input and output so the caller
+// can finish the remainder. For the big-endian flavour the zero byte is
+// interleaved in front of each character, otherwise behind it.
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+sse_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+                            char16_t *utf16_output) {
+  const size_t vector_len = len - (len % 16);
+  const __m128i zero = _mm_setzero_si128();
+  for (size_t pos = 0; pos < vector_len; pos += 16) {
+    const __m128i bytes = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(latin1_input + pos));
+    // first eight characters
+    const __m128i lower = big_endian ? _mm_unpacklo_epi8(zero, bytes)
+                                     : _mm_unpacklo_epi8(bytes, zero);
+    // last eight characters
+    const __m128i upper = big_endian ? _mm_unpackhi_epi8(zero, bytes)
+                                     : _mm_unpackhi_epi8(bytes, zero);
+    char16_t *const dest = utf16_output + pos;
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dest), lower);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dest + 8), upper);
+  }
+  return std::make_pair(latin1_input + vector_len, utf16_output + vector_len);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf32.cpp b/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf32.cpp
new file mode 100644
index 000000000..c25675e3e
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf32.cpp
@@ -0,0 +1,31 @@
+// Zero-extends Latin-1 bytes to UTF-32 code units, 16 characters per
+// iteration, for as long as at least 16 input bytes remain. Returns the input
+// and output positions reached; the caller converts what is left.
+std::pair<const char *, char32_t *>
+sse_convert_latin1_to_utf32(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char *const stop = buf + len;
+
+  for (; stop - buf >= 16; buf += 16, utf32_output += 16) {
+    const __m128i bytes =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
+    __m128i *const dest = reinterpret_cast<__m128i *>(utf32_output);
+
+    // each group of four bytes is shifted down to the low lanes and then
+    // expanded from 8-bit to 32-bit units
+    _mm_storeu_si128(dest, _mm_cvtepu8_epi32(bytes));
+    _mm_storeu_si128(dest + 1, _mm_cvtepu8_epi32(_mm_srli_si128(bytes, 4)));
+    _mm_storeu_si128(dest + 2, _mm_cvtepu8_epi32(_mm_srli_si128(bytes, 8)));
+    _mm_storeu_si128(dest + 3, _mm_cvtepu8_epi32(_mm_srli_si128(bytes, 12)));
+  }
+
+  return std::make_pair(buf, utf32_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf8.cpp b/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf8.cpp
new file mode 100644
index 000000000..e92fa9101
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_latin1_to_utf8.cpp
@@ -0,0 +1,71 @@
+// Converts Latin-1 to UTF-8 in 16-byte (or, at the very end, 8-byte) steps.
+// Returns the input and output positions reached; any input left over is not
+// converted here.
+std::pair<const char *const, char *const>
+sse_convert_latin1_to_utf8(const char *latin_input,
+ const size_t latin_input_length, char *utf8_output) {
+ const char *end = latin_input + latin_input_length;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ // 0b1000_0000
+ const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
+ // 0b1111_1111_1000_0000
+ const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
+
+ // shuffle masks that spread input bytes 0..7 (resp. 8..15) into the low
+ // byte of eight 16-bit lanes; the 0x80 entries zero the high byte
+ const __m128i latin_1_half_into_u16_byte_mask =
+ _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5,
+ '\x80', 6, '\x80', 7, '\x80');
+
+ const __m128i latin_2_half_into_u16_byte_mask =
+ _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80',
+ 13, '\x80', 14, '\x80', 15, '\x80');
+
+ // Each Latin-1 character takes 1-2 UTF-8 bytes.
+ // The slow path writes 8-15 useful bytes twice: each write eagerly stores
+ // 16 bytes and then adjusts the pointer, so the last store can exceed the
+ // useful utf8_output size by 1 to 8 bytes. By reserving 8 extra input
+ // bytes, we expect the output to still have 8-16 bytes free.
+ while (end - latin_input >= 16 + 8) {
+ // Load 16 Latin1 characters (16 bytes) into a 128-bit register
+ __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);
+
+ if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
+ _mm_storeu_si128((__m128i *)utf8_output, v_latin);
+ latin_input += 16;
+ utf8_output += 16;
+ continue;
+ }
+
+ // assuming a/b are bytes and A/B are uint16 of the same value
+ // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
+ __m128i v_u16_latin_1_half =
+ _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
+ // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
+ __m128i v_u16_latin_2_half =
+ _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);
+
+ // each call advances utf8_output (taken by reference) itself
+ internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half,
+ utf8_output, v_0000, v_ff80);
+ internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half,
+ utf8_output, v_0000, v_ff80);
+ latin_input += 16;
+ }
+
+ // Fewer than 24 bytes remain: process at most one more vector, and in the
+ // non-ASCII case only its first 8 characters.
+ if (end - latin_input >= 16) {
+ // Load 16 Latin1 characters (16 bytes) into a 128-bit register
+ __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);
+
+ if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
+ _mm_storeu_si128((__m128i *)utf8_output, v_latin);
+ latin_input += 16;
+ utf8_output += 16;
+ } else {
+ // assuming a/b are bytes and A/B are uint16 of the same value
+ // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
+ __m128i v_u16_latin_1_half =
+ _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
+ internal::westmere::write_v_u16_11bits_to_utf8(
+ v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
+ latin_input += 8;
+ }
+ }
+
+ return std::make_pair(latin_input, utf8_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf16_to_latin1.cpp b/contrib/simdutf/src/westmere/sse_convert_utf16_to_latin1.cpp
new file mode 100644
index 000000000..4c25b7221
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf16_to_latin1.cpp
@@ -0,0 +1,72 @@
+template <endianness big_endian> // UTF-16 -> Latin-1 conversion, 8 code units per iteration
+std::pair<const char16_t *, char *> // {first unconverted input unit, or nullptr on failure; next free output byte}
+sse_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *end = buf + len;
+ while (end - buf >= 8) { // a tail of fewer than 8 units is left unconverted for the caller
+ // Load 8 UTF-16 code units (16 bytes) into a 128-bit SSE register with an unaligned load
+ __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
+
+ if (!match_system(big_endian)) { // presumably: input byte order differs from the host's -- confirm against match_system
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ in = _mm_shuffle_epi8(in, swap); // exchange the two bytes of every 16-bit unit
+ }
+
+ __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
+ if (_mm_testz_si128(in, high_byte_mask)) { // true when no unit has a bit set in its upper byte, i.e. all are <= 0xFF
+ // Pack 16-bit characters into 8-bit and store the low 8 bytes in latin1_output
+ __m128i latin1_packed = _mm_packus_epi16(in, in);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+ latin1_packed);
+ // Adjust pointers for next iteration
+ buf += 8;
+ latin1_output += 8;
+ } else {
+ return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output)); // some unit is > 0xFF: report failure through a null input pointer
+ }
+ } // while
+ return std::make_pair(buf, latin1_output);
+}
+
+template <endianness big_endian> // UTF-16 -> Latin-1 conversion that reports where the first non-Latin-1 unit is
+std::pair<result, char *> // {result(code, index in code units from the start of buf); next free output byte}
+sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+ char *latin1_output) {
+ const char16_t *start = buf; // kept to turn pointers back into indices for the result
+ const char16_t *end = buf + len;
+ while (end - buf >= 8) { // a tail of fewer than 8 units is left unconverted for the caller
+ __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
+
+ if (!match_system(big_endian)) { // presumably: input byte order differs from the host's -- confirm against match_system
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ in = _mm_shuffle_epi8(in, swap); // exchange the two bytes of every 16-bit unit
+ }
+
+ __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
+ if (_mm_testz_si128(in, high_byte_mask)) { // true when all 8 units are <= 0xFF
+ __m128i latin1_packed = _mm_packus_epi16(in, in);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+ latin1_packed); // only the low 8 bytes of the packed register are written
+ buf += 8;
+ latin1_output += 8;
+ } else {
+ // Fallback to scalar code for handling errors: copy the valid prefix of this block, then report the offender
+ for (int k = 0; k < 8; k++) {
+ uint16_t word = !match_system(big_endian)
+ ? scalar::utf16::swap_bytes(buf[k])
+ : buf[k];
+ if (word <= 0xff) {
+ *latin1_output++ = char(word);
+ } else {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+ latin1_output); // index of the first unit above 0xFF; output points past the valid prefix
+ }
+ }
+ buf += 8; // NOTE(review): looks unreachable, since the failed vector test implies one of these 8 units is > 0xff -- confirm
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf16_to_utf32.cpp b/contrib/simdutf/src/westmere/sse_convert_utf16_to_utf32.cpp
new file mode 100644
index 000000000..6a1e3da80
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf16_to_utf32.cpp
@@ -0,0 +1,206 @@
+/*
+ The vectorized algorithm works on single SSE register i.e., it
+ loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it's an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+ Returns a pair: a pointer to the first unprocessed code unit of buf (nullptr if an invalid surrogate sequence is found) and the advanced utf32_output.
+ A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *end = buf + len;
+
+ const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+
+ while (end - buf >= 8) { // a tail of fewer than 8 units is left unconverted for the caller
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+
+ if (big_endian) { // byte-swap every 16-bit unit when the template argument is non-zero
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ in = _mm_shuffle_epi8(in, swap);
+ }
+
+ // 1. Check if there is any surrogate word in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m128i surrogates_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); // a lane is all-ones when its unit lies in 0xD800..0xDFFF
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if only the last word is a surrogate (two mask bits per 16-bit lane)
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x0000) {
+ // case: no surrogate pair, extend 16-bit code units to 32-bit code units
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+ _mm_cvtepu16_epi32(in)); // zero-extend units 0..3
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+ _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); // shift right by 8 bytes, then zero-extend units 4..7
+ utf32_output += 8;
+ buf += 8;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // the loop below starts a character only at an index below this bound
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1); // clamp so that buf[k + 1] below never reads past end
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ *utf32_output++ = char32_t(word); // not a surrogate: the code point equals the code unit
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800); // 0..0x3FF only when word is a high surrogate
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // the second unit of the pair is consumed as well
+ uint16_t diff2 = uint16_t(next_word - 0xDC00); // 0..0x3FF only when next_word is a low surrogate
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr, utf32_output); // not a high surrogate followed by a low one: signal failure
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both 10-bit halves and add the 0x10000 offset
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k; // advance by the number of code units the scalar loop consumed
+ }
+ } // while
+ return std::make_pair(buf, utf32_output);
+}
+
+/*
+ Returns a pair: a result struct and utf32_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed code unit in buf
+ (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+ char32_t *utf32_output) {
+ const char16_t *start = buf; // kept to turn pointers back into indices for the result
+ const char16_t *end = buf + len;
+
+ const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+
+ while (end - buf >= 8) { // a tail of fewer than 8 units is left unconverted for the caller
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+
+ if (big_endian) { // byte-swap every 16-bit unit when the template argument is non-zero
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ in = _mm_shuffle_epi8(in, swap);
+ }
+
+ // 1. Check if there is any surrogate word in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m128i surrogates_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); // a lane is all-ones when its unit lies in 0xD800..0xDFFF
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if only the last word is a surrogate (two mask bits per 16-bit lane)
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x0000) {
+ // case: no surrogate pair, extend 16-bit code units to 32-bit code units
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+ _mm_cvtepu16_epi32(in)); // zero-extend units 0..3
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+ _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); // shift right by 8 bytes, then zero-extend units 4..7
+ utf32_output += 8;
+ buf += 8;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // the loop below starts a character only at an index below this bound
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1); // clamp so that buf[k + 1] below never reads past end
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xF800) != 0xD800) {
+ *utf32_output++ = char32_t(word); // not a surrogate: the code point equals the code unit
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800); // 0..0x3FF only when word is a high surrogate
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // the second unit of the pair is consumed as well
+ uint16_t diff2 = uint16_t(next_word - 0xDC00); // 0..0x3FF only when next_word is a low surrogate
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // k was already bumped, so k - 1 is the first unit of the bad pair
+ utf32_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both 10-bit halves and add the 0x10000 offset
+ *utf32_output++ = char32_t(value);
+ }
+ }
+ buf += k; // advance by the number of code units the scalar loop consumed
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf16_to_utf8.cpp b/contrib/simdutf/src/westmere/sse_convert_utf16_to_utf8.cpp
new file mode 100644
index 000000000..440e006da
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf16_to_utf8.cpp
@@ -0,0 +1,504 @@
+/*
+ The vectorized algorithm works on single SSE register i.e., it
+ loads eight 16-bit code units.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit code unit
+ can be converted into: 1) single UTF8 byte (when it is an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit code units, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved forming eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+ Returns a pair: a pointer to the first unprocessed code unit of buf (nullptr if an invalid surrogate sequence is found) and the advanced utf8_output.
+ A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+
+ const char16_t *end = buf + len;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // needs at least 28 code units left; the rest is left to the caller
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+ if (big_endian) { // byte-swap every 16-bit unit when the template argument is non-zero
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ in = _mm_shuffle_epi8(in, swap);
+ }
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+ if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+ __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); // the following 8 code units (the cast binds before the + 1)
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ nextin = _mm_shuffle_epi8(nextin, swap);
+ }
+ if (!_mm_testz_si128(nextin, v_ff80)) { // the second block is not pure ASCII
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(in, in);
+ // 2. store (16 bytes, of which only the first 8 are kept)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ in = nextin; // fall through and convert the second block with the general code below
+ } else {
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ }
+
+ // no bits set above 7th bit
+ const __m128i one_byte_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+ const uint16_t one_byte_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m128i one_or_two_bytes_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+ const uint16_t one_or_two_bytes_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+ if (one_or_two_bytes_bitmask == 0xffff) { // every unit is below 0x800
+ internal::westmere::write_v_u16_11bits_to_utf8(
+ in, utf8_output, one_byte_bytemask, one_byte_bitmask);
+ buf += 8; // utf8_output is not advanced here; presumably the helper takes it by reference and advances it -- confirm
+ continue;
+ }
+
+ // 1. Check if there is any surrogate word in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m128i surrogates_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if only the last word is a surrogate (two mask bits per 16-bit lane)
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x0000) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m128i s0 = _mm_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+ const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000)); // bit 14 is set only in lanes that need three bytes
+ const __m128i s4 = _mm_xor_si128(s3, m0); // in those lanes turn [11bb|bbbb] into the continuation byte [10bb|bbbb]
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+ const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16_t mask =
+ (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); // two bits per unit: even bit = one byte, odd bit = at most two bytes
+ if (mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+ 15, 13, -1, -1, -1, -1);
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += 12; // 16 bytes were written, 12 of them are kept (4 units x 3 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }
+ const uint8_t mask0 = uint8_t(mask); // pattern bits of units 0..3
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; // row0[0] is used as the output length, row0 + 1 as the 16-byte shuffle pattern
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // pattern bits of units 4..7
+
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0]; // always a full 16-byte store; only row0[0] bytes are kept
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // the loop below starts a character only at an index below this bound
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1); // clamp so that buf[k + 1] below never reads past end
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word); // one byte: value below 0x80
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000); // two bytes: value below 0x800
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000); // three bytes: any other non-surrogate unit
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800); // 0..0x3FF only when word is a high surrogate
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // the second unit of the pair is consumed as well
+ uint16_t diff2 = uint16_t(next_word - 0xDC00); // 0..0x3FF only when next_word is a low surrogate
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(nullptr, utf8_output); // not a high surrogate followed by a low one: signal failure
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both 10-bit halves and add the 0x10000 offset
+ *utf8_output++ = char((value >> 18) | 0b11110000); // four bytes
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k; // advance by the number of code units the scalar loop consumed
+ }
+ } // while
+
+ return std::make_pair(buf, utf8_output);
+}
+
+/*
+ Returns a pair: a result struct and utf8_output.
+ If there is an error, the count field of the result is the position of the
+ error. Otherwise, it is the position of the first unprocessed code unit in buf
+ (even if finished). A scalar routine should carry on the conversion of the
+ tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+ char *utf8_output) {
+ const char16_t *start = buf; // kept to turn pointers back into indices for the result
+ const char16_t *end = buf + len;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // needs at least 28 code units left; the rest is left to the caller
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+ if (big_endian) { // byte-swap every 16-bit unit when the template argument is non-zero
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ in = _mm_shuffle_epi8(in, swap);
+ }
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+ if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+ __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); // the following 8 code units (the cast binds before the + 1)
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ nextin = _mm_shuffle_epi8(nextin, swap);
+ }
+ if (!_mm_testz_si128(nextin, v_ff80)) { // the second block is not pure ASCII
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(in, in);
+ // 2. store (16 bytes, of which only the first 8 are kept)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ in = nextin; // fall through and convert the second block with the general code below
+ } else {
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ }
+
+ // no bits set above 7th bit
+ const __m128i one_byte_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+ const uint16_t one_byte_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m128i one_or_two_bytes_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+ const uint16_t one_or_two_bytes_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+ if (one_or_two_bytes_bitmask == 0xffff) { // every unit is below 0x800
+ internal::westmere::write_v_u16_11bits_to_utf8(
+ in, utf8_output, one_byte_bytemask, one_byte_bitmask);
+ buf += 8; // utf8_output is not advanced here; presumably the helper takes it by reference and advances it -- confirm
+ continue;
+ }
+
+ // 1. Check if there is any surrogate word in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m128i surrogates_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if only the last word is a surrogate (two mask bits per 16-bit lane)
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+ // However, it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x0000) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two
+ UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m128i s0 = _mm_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+ const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000)); // bit 14 is set only in lanes that need three bytes
+ const __m128i s4 = _mm_xor_si128(s3, m0); // in those lanes turn [11bb|bbbb] into the continuation byte [10bb|bbbb]
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+ const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16_t mask =
+ (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); // two bits per unit: even bit = one byte, odd bit = at most two bytes
+ if (mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+ 15, 13, -1, -1, -1, -1);
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += 12; // 16 bytes were written, 12 of them are kept (4 units x 3 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }
+ const uint8_t mask0 = uint8_t(mask); // pattern bits of units 0..3
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; // row0[0] is used as the output length, row0 + 1 as the 16-byte shuffle pattern
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // pattern bits of units 4..7
+
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0]; // always a full 16-byte store; only row0[0] bytes are kept
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15; // the loop below starts a character only at an index below this bound
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1); // clamp so that buf[k + 1] below never reads past end
+ }
+ for (; k < forward; k++) {
+ uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+ if ((word & 0xFF80) == 0) {
+ *utf8_output++ = char(word); // one byte: value below 0x80
+ } else if ((word & 0xF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000); // two bytes: value below 0x800
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word >> 12) | 0b11100000); // three bytes: any other non-surrogate unit
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800); // 0..0x3FF only when word is a high surrogate
+ uint16_t next_word =
+ big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+ k++; // the second unit of the pair is consumed as well
+ uint16_t diff2 = uint16_t(next_word - 0xDC00); // 0..0x3FF only when next_word is a low surrogate
+ if ((diff | diff2) > 0x3FF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k - 1), // k was already bumped, so k - 1 is the first unit of the bad pair
+ utf8_output);
+ }
+ uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both 10-bit halves and add the 0x10000 offset
+ *utf8_output++ = char((value >> 18) | 0b11110000); // four bytes
+ *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k; // advance by the number of code units the scalar loop consumed
+ }
+ } // while
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf32_to_latin1.cpp b/contrib/simdutf/src/westmere/sse_convert_utf32_to_latin1.cpp
new file mode 100644
index 000000000..02bd7c98e
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf32_to_latin1.cpp
@@ -0,0 +1,82 @@
+std::pair<const char32_t *, char *> // {first unconverted input code point, or nullptr on failure; next free output byte}
+sse_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16; the remaining code points are left to the caller
+
+ __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); // bits that must all be clear for a value to fit in one byte
+ __m128i shufmask =
+ _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); // gathers the low byte of each 32-bit lane into bytes 0..3 and zeroes the rest
+
+ for (size_t i = 0; i < rounded_len; i += 16) { // i only counts 16-code-point blocks; buf itself is advanced below
+ __m128i in1 = _mm_loadu_si128((__m128i *)buf);
+ __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
+ __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
+ __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
+
+ __m128i check_combined = _mm_or_si128(in1, in2); // OR all four registers so that a single test covers 16 code points
+ check_combined = _mm_or_si128(check_combined, in3);
+ check_combined = _mm_or_si128(check_combined, in4);
+
+ if (!_mm_testz_si128(check_combined, high_bytes_mask)) { // some code point in this block is above 0xFF
+ return std::make_pair(nullptr, latin1_output); // report failure through a null input pointer
+ }
+ __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
+ _mm_shuffle_epi8(in2, shufmask)); // low 8 bytes: the 4 bytes of in1 followed by the 4 bytes of in2
+ __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
+ _mm_shuffle_epi8(in4, shufmask)); // low 8 bytes: the 4 bytes of in3 followed by the 4 bytes of in4
+ __m128i pack = _mm_unpacklo_epi64(pack1, pack2); // 16 Latin-1 bytes in input order
+ _mm_storeu_si128((__m128i *)latin1_output, pack);
+ latin1_output += 16;
+ buf += 16;
+ }
+
+ return std::make_pair(buf, latin1_output);
+}
+
+std::pair<result, char *> // {result(code, index in code points from the start of buf); next free output byte}
+sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+ char *latin1_output) {
+ const char32_t *start = buf; // kept to turn pointers back into indices for the result
+ const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16; the remaining code points are left to the caller
+
+ __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); // bits that must all be clear for a value to fit in one byte
+ __m128i shufmask =
+ _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); // gathers the low byte of each 32-bit lane into bytes 0..3 and zeroes the rest
+
+ for (size_t i = 0; i < rounded_len; i += 16) { // i only counts 16-code-point blocks; buf itself is advanced below
+ __m128i in1 = _mm_loadu_si128((__m128i *)buf);
+ __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
+ __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
+ __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
+
+ __m128i check_combined = _mm_or_si128(in1, in2); // OR all four registers so that a single test covers 16 code points
+ check_combined = _mm_or_si128(check_combined, in3);
+ check_combined = _mm_or_si128(check_combined, in4);
+
+ if (!_mm_testz_si128(check_combined, high_bytes_mask)) { // some code point in this block is above 0xFF
+ // Fallback to scalar code for handling errors: copy the valid prefix of this block, then report the offender
+ for (int k = 0; k < 16; k++) {
+ char32_t codepoint = buf[k];
+ if (codepoint <= 0xff) {
+ *latin1_output++ = char(codepoint);
+ } else {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+ latin1_output); // index of the first code point above 0xFF; output points past the valid prefix
+ }
+ }
+ buf += 16; // NOTE(review): looks unreachable, since the failed vector test implies one of these 16 values is > 0xff -- confirm
+ continue;
+ }
+ __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
+ _mm_shuffle_epi8(in2, shufmask)); // low 8 bytes: the 4 bytes of in1 followed by the 4 bytes of in2
+ __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
+ _mm_shuffle_epi8(in4, shufmask)); // low 8 bytes: the 4 bytes of in3 followed by the 4 bytes of in4
+ __m128i pack = _mm_unpacklo_epi64(pack1, pack2); // 16 Latin-1 bytes in input order
+ _mm_storeu_si128((__m128i *)latin1_output, pack);
+ latin1_output += 16;
+ buf += 16;
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start),
+ latin1_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf32_to_utf16.cpp b/contrib/simdutf/src/westmere/sse_convert_utf32_to_utf16.cpp
new file mode 100644
index 000000000..4d18a563c
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf32_to_utf16.cpp
@@ -0,0 +1,170 @@
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+sse_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+
+ const char32_t *end = buf + len;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+ __m128i forbidden_bytemask = _mm_setzero_si128();
+
+ while (end - buf >= 8) {
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+ __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+ const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+ _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+ // Check that none of the 8 code units has a bit set above the 16th
+ if (saturation_bitmask == 0xffff) {
+ // Pack UTF-32 to UTF-16
+ __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+ const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+ forbidden_bytemask = _mm_or_si128(
+ forbidden_bytemask,
+ _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
+
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 7;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // fits in one 16-bit unit: will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // above 0xFFFF: will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ // check for invalid input: a value in 0xD800-0xDFFF was seen on the vectorized path
+ if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf16_output);
+ }
+
+ return std::make_pair(buf, utf16_output);
+}
+
+template <endianness big_endian>
+std::pair<result, char16_t *>
+sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+ char16_t *utf16_output) {
+ const char32_t *start = buf;
+ const char32_t *end = buf + len;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+
+ while (end - buf >= 8) {
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+ __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+ const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+ _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+ // Check that none of the 8 code units has a bit set above the 16th
+ if (saturation_bitmask == 0xffff) {
+ // Pack UTF-32 to UTF-16
+ __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+ const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+ const __m128i forbidden_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf16_output);
+ }
+
+ if (big_endian) {
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+ }
+
+ _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+ utf16_output += 8;
+ buf += 8;
+ } else {
+ size_t forward = 7;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFF0000) == 0) {
+ // fits in one 16-bit unit: will not generate a surrogate pair
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf16_output);
+ }
+ *utf16_output++ =
+ big_endian
+ ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+ : char16_t(word);
+ } else {
+ // above 0xFFFF: will generate a surrogate pair
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+ }
+ word -= 0x10000;
+ uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+ uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+ if (big_endian) {
+ high_surrogate =
+ uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+ low_surrogate =
+ uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+ }
+ *utf16_output++ = char16_t(high_surrogate);
+ *utf16_output++ = char16_t(low_surrogate);
+ }
+ }
+ buf += k;
+ }
+ }
+
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf32_to_utf8.cpp b/contrib/simdutf/src/westmere/sse_convert_utf32_to_utf8.cpp
new file mode 100644
index 000000000..521cc67de
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf32_to_utf8.cpp
@@ -0,0 +1,590 @@
+std::pair<const char32_t *, char *>
+sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
+ const char32_t *end = buf + len;
+
+ const __m128i v_0000 = _mm_setzero_si128(); // __m128i = 128 bits
+ const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000
+ // 0000
+ const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000
+ // 0000
+ const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000
+ // 0000
+ const __m128i v_ffff0000 = _mm_set1_epi32(
+ (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
+ const __m128i v_7fffffff = _mm_set1_epi32(
+ (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
+ __m128i running_max = _mm_setzero_si128();
+ __m128i forbidden_bytemask = _mm_setzero_si128();
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >=
+ std::ptrdiff_t(
+ 16 + safety_margin)) { // buf is a char32_t pointer, each char32_t
+ // has 4 bytes or 32 bits, thus buf + 16 *
+ // char32_t = 512 bits = 64 bytes
+ // We load two 16 bytes registers for a total of 32 bytes or 8 characters.
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+ __m128i nextin = _mm_loadu_si128(
+ (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars
+ running_max = _mm_max_epu32(
+ _mm_max_epu32(in, running_max), // take element-wise max char32_t from
+ // in and running_max vector
+ nextin); // and take element-wise max element from nextin and
+ // running_max vector
+
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m128i in_16 = _mm_packus_epi32(
+ _mm_and_si128(in, v_7fffffff),
+ _mm_and_si128(
+ nextin,
+ v_7fffffff)); // pack the two vectors into a single vector of 16-bit units
+ // By clearing the highest bit (& v_7fffffff), we make sure all values are
+ // interpreted as non-negative by the pack: a leading bit of 0 means a
+ // non-negative number in the two's complement system. Values that had
+ // the highest bit set are not lost for validation purposes, because
+ // running_max above is computed from the unmasked inputs. Unicode is well
+ // beneath the range where you'll start getting issues so that's OK.
+
+ // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+ // Check for ASCII fast path
+
+ // ASCII fast path!!!!
+ // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+ // The intuition is that we try to collect 16 ASCII characters which
+ // requires a total of 64 bytes of input. If we fail, we just pass thirdin
+ // and fourthin as our new inputs.
+ if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
+ __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2);
+ __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3);
+ running_max = _mm_max_epu32(
+ _mm_max_epu32(thirdin, running_max),
+ fourthin); // take the running max of all 4 vectors thus far
+ __m128i nextin_16 = _mm_packus_epi32(
+ _mm_and_si128(thirdin, v_7fffffff),
+ _mm_and_si128(fourthin,
+ v_7fffffff)); // pack into 1 vector, now you have two
+ if (!_mm_testz_si128(
+ nextin_16,
+ v_ff80)) { // checks if the second packed vector is ASCII, if not:
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(
+ in_16, in_16); // narrows to bytes; both 8-byte halves hold the same 8 characters
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output,
+ utf8_packed); // put them into the output
+ // 3. adjust pointers
+ buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32
+ // bits = 256 bits
+ utf8_output +=
+ 8; // same with output: only the first 8 ASCII bytes are kept.
+ // Proceed with next input
+ in_16 = nextin_16;
+ // We need to update in and nextin because they are used later.
+ in = thirdin;
+ nextin = fourthin;
+ } else {
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ }
+
+ // no bits set above 7th bit -- find out all the ASCII characters
+ const __m128i one_byte_bytemask =
+ _mm_cmpeq_epi16( // this compares each 16-bit lane:
+ _mm_and_si128(in_16, v_ff80), // the vector that keeps only the upper
+ // 9 bits of each 16-bit/2-byte units
+ v_0000 //
+ ); // they should be all zero if they are ASCII. E.g. ASCII in a 16-bit
+ // unit is of format 0000 0000 0XXX XXXX
+ // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and
+ // 0000 0000 0000 0000 if not for each 16-bit/2-byte units
+ const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(
+ one_byte_bytemask)); // collect the MSB of each byte of the previous
+ // vector and put them into a uint16_t mask
+
+ // no bits set above 11th bit
+ const __m128i one_or_two_bytes_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+ const uint16_t one_or_two_bytes_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+ if (one_or_two_bytes_bitmask == 0xffff) {
+ // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
+ // produces 2 bytes)
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m128i v_1f00 =
+ _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
+ const __m128i v_003f =
+ _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
+ // t1 = [000a|aaaa|0000|0000]
+ const __m128i t1 =
+ _mm_and_si128(t0, v_1f00); // potential first utf8 byte
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m128i t2 =
+ _mm_and_si128(in_16, v_003f); // potential second utf8 byte
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m128i t3 =
+ _mm_or_si128(t1, t2); // first and second potential utf8 byte together
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m128i t4 = _mm_or_si128(
+ t3,
+ v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m128i utf8_unpacked =
+ _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
+ // MSB, a - LSB)
+ const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+ const uint16_t m1 =
+ static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+ const uint8_t m2 =
+ static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
+ // 4. pack the bytes
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ }
+
+ // Check for overflow in packing
+
+ const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+ _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+ if (saturation_bitmask == 0xffff) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+ forbidden_bytemask =
+ _mm_or_si128(forbidden_bytemask,
+ _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
+
+ const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m128i s0 = _mm_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+ const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+ const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16_t mask =
+ (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+ if (mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+ 15, 13, -1, -1, -1, -1);
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }
+ const uint8_t mask0 = uint8_t(mask);
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ } else {
+ // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD in the
+ // presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+
+ // check for invalid input: some loaded value was above 0x10FFFF
+ const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+ if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(
+ _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(nullptr, utf8_output);
+ }
+
+ return std::make_pair(buf, utf8_output);
+}
+
+std::pair<result, char *>
+sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+ char *utf8_output) {
+ const char32_t *end = buf + len;
+ const char32_t *start = buf;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+ const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
+ const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
+ const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+ const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
+ const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+
+ const size_t safety_margin =
+ 12; // to avoid overruns, see issue
+ // https://github.com/simdutf/simdutf/issues/92
+
+ while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+ // We load two 16 bytes registers for a total of 32 bytes or 8 characters.
+ __m128i in = _mm_loadu_si128((__m128i *)buf);
+ __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+ // Check for too large input (any of the 8 values above 0x10FFFF)
+ __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
+ if (static_cast<uint16_t>(_mm_movemask_epi8(
+ _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
+ return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+ utf8_output);
+ }
+
+ // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+ // saturation
+ __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff),
+ _mm_and_si128(nextin, v_7fffffff));
+
+ // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+ // Check for ASCII fast path
+ if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ continue;
+ }
+
+ // no bits set above 7th bit
+ const __m128i one_byte_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
+ const uint16_t one_byte_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m128i one_or_two_bytes_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+ const uint16_t one_or_two_bytes_bitmask =
+ static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+ if (one_or_two_bytes_bitmask == 0xffff) {
+ // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
+ // produces 2 bytes)
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+ const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m128i t0 = _mm_slli_epi16(in_16, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m128i t1 = _mm_and_si128(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m128i t2 = _mm_and_si128(in_16, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m128i t3 = _mm_or_si128(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m128i utf8_unpacked =
+ _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
+ // MSB, a - LSB)
+ const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+ const uint16_t m1 =
+ static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+ const uint8_t m2 =
+ static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
+ // 4. pack the bytes
+ const uint8_t *row =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+ const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+ // 5. store bytes
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+ }
+
+ // Check for overflow in packing
+ const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+ _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+ const uint32_t saturation_bitmask =
+ static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+ if (saturation_bitmask == 0xffff) {
+ // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+ // Check for illegal surrogate code units
+ const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+ const __m128i forbidden_bytemask =
+ _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
+ if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+ return std::make_pair(result(error_code::SURROGATE, buf - start),
+ utf8_output);
+ }
+
+ const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] -
+ single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] -
+ two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+ three UTF-8 bytes
+
+ We expand the input word (16-bit) into two code units (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two code units we build proper UTF-8 sequence, taking
+ into account the case (i.e, the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m128i s0 = _mm_srli_epi16(in_16, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+ const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+ simdutf_vec(0b0100000000000000));
+ const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+ // 4. expand code units 16-bit => 32-bit
+ const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+ const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint16_t mask =
+ (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+ if (mask == 0) {
+ // We only have three-byte code units. Use fast path.
+ const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+ 15, 13, -1, -1, -1, -1);
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += 12;
+ buf += 8;
+ continue;
+ }
+ const uint8_t mask0 = uint8_t(mask);
+
+ const uint8_t *row0 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+ const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+ const uint8_t *row1 =
+ &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+ const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+ utf8_output += row1[0];
+
+ buf += 8;
+ } else {
+ // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=>
+ // will produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+ // wasteful to use scalar code, but being efficient with SIMD in the
+ // presence of surrogate pairs may require non-trivial tables.
+ size_t forward = 15;
+ size_t k = 0;
+ if (size_t(end - buf) < forward + 1) {
+ forward = size_t(end - buf - 1);
+ }
+ for (; k < forward; k++) {
+ uint32_t word = buf[k];
+ if ((word & 0xFFFFFF80) == 0) {
+ *utf8_output++ = char(word);
+ } else if ((word & 0xFFFFF800) == 0) {
+ *utf8_output++ = char((word >> 6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if ((word & 0xFFFF0000) == 0) {
+ if (word >= 0xD800 && word <= 0xDFFF) {
+ return std::make_pair(
+ result(error_code::SURROGATE, buf - start + k), utf8_output);
+ }
+ *utf8_output++ = char((word >> 12) | 0b11100000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ if (word > 0x10FFFF) {
+ return std::make_pair(
+ result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+ }
+ *utf8_output++ = char((word >> 18) | 0b11110000);
+ *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf8_to_latin1.cpp b/contrib/simdutf/src/westmere/sse_convert_utf8_to_latin1.cpp
new file mode 100644
index 000000000..29145f6d1
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf8_to_latin1.cpp
@@ -0,0 +1,58 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_latin1(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char *&latin1_output) {
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask &
+ 0xfff; // we are only processing 12 bytes in case it is not all ASCII
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // We process the data in chunks of 12 bytes (16 are stored, 12 are kept).
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
+ latin1_output += 12; // We wrote 12 characters.
+ return 12; // We consumed 12 bytes.
+ }
+ /// We do not have a fast path available, so we fallback.
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ // this indicates an invalid input:
+ if (idx >= 64) {
+ return consumed;
+ }
+ // Here we should have (idx < 64), if not, there is a bug in the validation or
+ // elsewhere. This is a relatively easy scenario: we process SIX (6) input
+ // code units. The max length in bytes of six code units spanning between 1
+ // and 2 bytes each is 12 bytes.
+ // On processors where pdep/pext is fast, we might be able to use a small
+ // lookup table.
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
+ // writing 8 bytes even though we only care about the first 6 bytes.
+ // performance note: it would be faster to use _mm_storeu_si128, we should
+ // investigate.
+ _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
+ latin1_output += 6; // We wrote 6 bytes.
+ return consumed;
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf8_to_utf16.cpp b/contrib/simdutf/src/westmere/sse_convert_utf8_to_utf16.cpp
new file mode 100644
index 000000000..3bea26d96
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf8_to_utf16.cpp
@@ -0,0 +1,197 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert a short run of utf8 bytes to utf16 using a mask indicating the end
+// of the code points. The table fallback reads the low 12 bits of the mask;
+// the ASCII fast path compares the whole mask, the 2-byte one its low 16 bits.
+// It returns how many bytes were consumed (16 on the 2-byte fast path).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char16_t *&utf16_output) { // utf16_output is advanced past the kept output
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i swap =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // swaps the two bytes of each 16-bit lane
+ const __m128i in = _mm_loadu_si128((__m128i *)input); // always loads 16 bytes from input
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // ASCII fast path: we process the data in chunks of 12 bytes. The two
+ // stores below write 16 code units, but only 12 of them are kept.
+ // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
+ __m128i ascii_first = _mm_cvtepu8_epi16(in);
+ __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
+ if (big_endian) {
+ ascii_first = _mm_shuffle_epi8(ascii_first, swap);
+ ascii_second = _mm_shuffle_epi8(ascii_second, swap);
+ }
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8),
+ ascii_second);
+ utf16_output += 12; // We wrote 12 16-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 code points and turn them into 8 2-byte
+ // UTF-16 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ if (big_endian)
+ composed = _mm_shuffle_epi8(composed, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 8; // We wrote 16 bytes, 8 code points.
+ return 16; // We consumed 16 bytes.
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code points and turn them into 4 2-byte
+ // UTF-16 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ if (big_endian)
+ composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4; // We keep 4 code units (the store wrote 16 bytes).
+ return 12; // We consumed 12 bytes.
+ }
+ /// We do not have a fast path available, so we fall back to the tables.
+
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+ // SIX (6) input code points
+ // this is a relatively easy scenario:
+ // we process SIX (6) input code points. The max length in bytes of six
+ // code points spanning between 1 and 2 bytes each is 12 bytes. On
+ // processors where pdep/pext is fast, we might be able to use a small
+ // lookup table.
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ if (big_endian)
+ composed = _mm_shuffle_epi8(composed, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 6; // We keep 12 bytes, 6 code points (the store wrote 16).
+ } else if (idx < 145) {
+ // FOUR (4) input code points
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ if (big_endian)
+ composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4;
+ } else if (idx < 209) {
+ // THREE (3) input code points (the loop below emits three of them)
+ //////////////
+ // There might be garbage inputs where a leading byte masquerades as a
+ // four-byte leading byte (by being followed by 3 continuation bytes), but
+ // is smaller than 0xf0. This could trigger a buffer overflow if we only
+ // counted leading bytes of the form 0xf0 as generating surrogate pairs,
+ // without further UTF-8 validation. Thus we must be careful to ensure that
+ // only leading bytes at least as large as 0xf0 generate surrogate pairs. We
+ // do so at the cost of an extra mask.
+ /////////////
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+ const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+ // correct for spurious high bit
+ const __m128i correct =
+ _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+ middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+ const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+ // We deliberately carry the leading four bits in highbyte if they are
+ // present, we remove them later when computing hightenbits.
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+ // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+ // the corresponding 32-bit value in 'composed' will be greater than
+ // (0xf0000000 >> 6), that is > 0x3c00000. This is used below to identify
+ // the location of the surrogate pairs.
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+ _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+ const __m128i composedminus =
+ _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+ const __m128i lowtenbits =
+ _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+ // Notice the 0x3ff mask:
+ const __m128i hightenbits =
+ _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+ const __m128i lowtenbitsadd =
+ _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+ const __m128i hightenbitsadd =
+ _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+ const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+ __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+ uint32_t basic_buffer[4];
+ uint32_t basic_buffer_swap[4]; // only filled (and only read) when big_endian
+ if (big_endian) {
+ _mm_storeu_si128((__m128i *)basic_buffer_swap,
+ _mm_shuffle_epi8(composed, swap));
+ surrogates = _mm_shuffle_epi8(surrogates, swap);
+ }
+ _mm_storeu_si128((__m128i *)basic_buffer, composed);
+ uint32_t surrogate_buffer[4];
+ _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+ for (size_t i = 0; i < 3; i++) {
+ if (basic_buffer[i] > 0x3c00000) { // the threshold test always uses the unswapped value
+ utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+ utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+ utf16_output += 2;
+ } else {
+ utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
+ : uint16_t(basic_buffer[i]);
+ utf16_output++;
+ }
+ }
+ } else {
+ // here we know that there is an error but we do not handle errors
+ }
+ return consumed;
+}
diff --git a/contrib/simdutf/src/westmere/sse_convert_utf8_to_utf32.cpp b/contrib/simdutf/src/westmere/sse_convert_utf8_to_utf32.cpp
new file mode 100644
index 000000000..df1733e56
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_convert_utf8_to_utf32.cpp
@@ -0,0 +1,141 @@
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert a short run of utf8 bytes to utf32 using a mask indicating the end
+// of the code points. The table fallback reads the low 12 bits of the mask;
+// the ASCII fast path compares the whole mask, the 2-byte one its low 16 bits.
+// It returns how many bytes were consumed (16 on the 2-byte fast path).
+size_t convert_masked_utf8_to_utf32(const char *input,
+ uint64_t utf8_end_of_code_point_mask,
+ char32_t *&utf32_output) { // utf32_output is advanced past the kept output
+ // we use an approach where we try to process up to 12 input bytes.
+ // Why 12 input bytes and not 16? Because we are concerned with the size of
+ // the lookup tables. Also 12 is nicely divisible by two and three.
+ //
+ //
+ // Optimization note: our main path below is load-latency dependent. Thus it
+ // is maybe beneficial to have fast paths that depend on branch prediction but
+ // have less latency. This results in more instructions but, potentially, also
+ // higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i in = _mm_loadu_si128((__m128i *)input); // always loads 16 bytes from input
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xfff;
+ if (utf8_end_of_code_point_mask == 0xfff) {
+ // ASCII fast path: we process the data in chunks of 12 bytes. The four stores below write 16 code units, but only 12 of them are kept.
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+ _mm_cvtepu8_epi32(in));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+ _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8),
+ _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12),
+ _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
+ utf32_output += 12; // We wrote 12 32-bit characters.
+ return 12; // We consumed 12 bytes.
+ }
+ if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 code points and turn them into 8 4-byte
+ // UTF-32 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+ _mm_cvtepu16_epi32(composed));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+ _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+ utf32_output += 8; // We wrote 32 bytes, 8 code points.
+ return 16; // We consumed 16 bytes.
+ }
+ if (input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 code points and turn them into 4 4-byte
+ // UTF-32 code units. There is probably a more efficient sequence, but the
+ // following might do.
+ const __m128i sh =
+ _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output += 4; // We wrote 16 bytes, 4 code points.
+ return 12; // We consumed 12 bytes.
+ }
+ /// We do not have a fast path available, so we fall back to the tables.
+
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+ // SIX (6) input code points
+ // this is a relatively easy scenario:
+ // we process SIX (6) input code points. The max length in bytes of six
+ // code points spanning between 1 and 2 bytes each is 12 bytes. On
+ // processors where pdep/pext is fast, we might be able to use a small
+ // lookup table.
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+ _mm_cvtepu16_epi32(composed));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+ _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+ utf32_output += 6; // We keep 24 bytes, 6 code points (the stores wrote 32).
+ } else if (idx < 145) {
+ // FOUR (4) input code points
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ _mm_storeu_si128((__m128i *)utf32_output, composed);
+ utf32_output += 4;
+ } else if (idx < 209) {
+ // THREE (3) input code points (utf32_output advances by 3 below)
+ const __m128i sh =
+ _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+ const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+ // correct for spurious high bit
+ const __m128i correct =
+ _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+ middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+ const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+ _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+ _mm_storeu_si128((__m128i *)utf32_output, composed); // writes 4 lanes, 3 are kept
+ utf32_output += 3;
+ } else {
+ // here we know that there is an error but we do not handle errors
+ }
+ return consumed;
+}
diff --git a/contrib/simdutf/src/westmere/sse_validate_utf16.cpp b/contrib/simdutf/src/westmere/sse_validate_utf16.cpp
new file mode 100644
index 000000000..35d6af51d
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_validate_utf16.cpp
@@ -0,0 +1,211 @@
+/*
+ In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ is 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the preceding nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be sole low surrogate nor high surrogate
+
+ We are going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+     0   1   2   3   4   5   6   7    <--- word index
+   [ V | L | H | L | H | V | V | L ]
+     1   0   0   0   0   1   1   0     - V = valid masks
+     0   1   0   1   0   0   0   1     - L = low surrogate
+     0   0   1   0   1   0   0   0     - H = high surrogate
+
+
+     1   0   0   0   0   1   1   0   V = valid masks
+     0   1   0   1   0   0   0   0   a = L & (H >> 1)
+     0   0   1   0   1   0   0   0   b = a << 1
+     1   1   1   1   1   1   1   0   c = V | a | b
+                                 ^
+                                 the last bit can be zero, we just consume 7
+                                 code units and recheck this word in the next iteration
+*/
+
+/* Validates UTF-16 surrogate pairing, 16 code units per iteration. Returns:
+ - pointer to the first unprocessed code unit (a scalar fallback should check
+ the rest);
+ - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *sse_validate_utf16(const char16_t *input, size_t size) {
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::SIZE * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16; // no surrogate at all in these 16 code units
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast<uint16_t>(
+ L & (H >> 1)); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last word of the
+ // register is an exception we handle.)
+ const uint16_t b = static_cast<uint16_t>(
+ a << 1); // Just mark that the opposite fact also holds,
+ // thanks to that we have only two masks for valid case.
+ const uint16_t c = static_cast<uint16_t>(
+ V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The last word (index 15) may be either a low or high surrogate. In the
+ // next iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject sole high surrogate.
+ input += 15;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+}
+
+template <endianness big_endian> // same checks as sse_validate_utf16, but reports an error code and an offset
+const result sse_validate_utf16_with_errors(const char16_t *input,
+ size_t size) {
+ if (simdutf_unlikely(size == 0)) {
+ return result(error_code::SUCCESS, 0);
+ }
+ const char16_t *start = input;
+ const char16_t *end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::SIZE * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ auto in0 = simd16<uint16_t>(input);
+ auto in1 =
+ simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+ if (big_endian) {
+ in0 = in0.swap_bytes();
+ in1 = in1.swap_bytes();
+ }
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask =
+ static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16; // no surrogate at all in these 16 code units
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate code units
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast<uint16_t>(
+ L & (H >> 1)); // A low surrogate must be followed by high one.
+ // (A low surrogate placed in the last word of the
+ // register is an exception we handle.)
+ const uint16_t b = static_cast<uint16_t>(
+ a << 1); // Just mark that the opposite fact also holds,
+ // thanks to that we have only two masks for valid case.
+ const uint16_t c = static_cast<uint16_t>(
+ V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single code units or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower code units of the input register contain valid UTF-16.
+ // The last word (index 15) may be either a low or high surrogate. In the
+ // next iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject sole high surrogate.
+ input += 15;
+ } else {
+ return result(error_code::SURROGATE, input - start); // offset of the start of the failing 16-unit chunk
+ }
+ }
+ }
+
+ return result(error_code::SUCCESS, input - start); // count = code units covered by the loop above
+}
diff --git a/contrib/simdutf/src/westmere/sse_validate_utf32le.cpp b/contrib/simdutf/src/westmere/sse_validate_utf32le.cpp
new file mode 100644
index 000000000..71feff66a
--- /dev/null
+++ b/contrib/simdutf/src/westmere/sse_validate_utf32le.cpp
@@ -0,0 +1,69 @@
+/* Validates UTF-32 four code units at a time; both range checks run once, after the loop. Returns:
+ - pointer to the first unprocessed code unit (a scalar fallback should check
+ the rest);
+ - nullptr if an error was detected.
+*/
+const char32_t *sse_validate_utf32le(const char32_t *input, size_t size) {
+ const char32_t *end = input + size;
+
+ const __m128i standardmax = _mm_set1_epi32(0x10ffff); // largest value accepted
+ const __m128i offset = _mm_set1_epi32(0xffff2000); // adding it maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF
+ const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); // largest (in + offset) that is not a surrogate
+ __m128i currentmax = _mm_setzero_si128();
+ __m128i currentoffsetmax = _mm_setzero_si128();
+
+ while (input + 4 < end) {
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ currentmax = _mm_max_epu32(in, currentmax); // running per-lane unsigned maximum
+ currentoffsetmax =
+ _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
+ input += 4;
+ }
+ __m128i is_zero =
+ _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); // non-zero lane <=> a value above 0x10ffff was seen
+ if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+ return nullptr;
+ }
+
+ is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
+ standardoffsetmax); // non-zero lane <=> some (in + offset) exceeded 0xfffff7ff
+ if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+ return nullptr;
+ }
+
+ return input;
+}
+
+const result sse_validate_utf32le_with_errors(const char32_t *input, // validates four code units per iteration, checking after every load
+ size_t size) {
+ const char32_t *start = input;
+ const char32_t *end = input + size;
+
+ const __m128i standardmax = _mm_set1_epi32(0x10ffff); // largest value accepted
+ const __m128i offset = _mm_set1_epi32(0xffff2000); // adding it maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF
+ const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); // largest (in + offset) that is not a surrogate
+ __m128i currentmax = _mm_setzero_si128();
+ __m128i currentoffsetmax = _mm_setzero_si128();
+
+ while (input + 4 < end) {
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ currentmax = _mm_max_epu32(in, currentmax); // running per-lane unsigned maximum
+ currentoffsetmax =
+ _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
+
+ __m128i is_zero =
+ _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); // non-zero lane <=> a value above 0x10ffff was seen
+ if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+ return result(error_code::TOO_LARGE, input - start); // offset of the current 4-unit chunk, not of the exact element
+ }
+
+ is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
+ standardoffsetmax); // non-zero lane <=> some (in + offset) exceeded 0xfffff7ff
+ if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+ return result(error_code::SURROGATE, input - start); // offset of the current 4-unit chunk, not of the exact element
+ }
+ input += 4;
+ }
+
+ return result(error_code::SUCCESS, input - start); // count = code units covered by the loop above
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f7fdcef7b..92edb0b6a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -201,7 +201,7 @@ IF(SYSTEM_ZSTD MATCHES "OFF")
ELSE()
TARGET_LINK_LIBRARIES(rspamd-server zstd)
ENDIF()
-TARGET_LINK_LIBRARIES(rspamd-server rspamd-fastutf8)
+TARGET_LINK_LIBRARIES(rspamd-server rspamd-simdutf)
IF (ENABLE_CLANG_PLUGIN MATCHES "ON")
ADD_DEPENDENCIES(rspamd-server rspamd-clang)
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 8b4c45436..995706d10 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -22,7 +22,7 @@
#include "libserver/task.h"
#include "mime_encoding.h"
#include "message.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#include "contrib/google-ced/ced_c.h"
#include <unicode/ucnv.h>
#if U_ICU_VERSION_MAJOR_NUM >= 44
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
index ac35cffe3..1fe8b86e3 100644
--- a/src/libmime/mime_parser.c
+++ b/src/libmime/mime_parser.c
@@ -25,7 +25,7 @@
#include "contrib/uthash/utlist.h"
#include <openssl/cms.h>
#include <openssl/pkcs7.h>
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
struct rspamd_mime_parser_lib_ctx {
struct rspamd_multipattern *mp_boundary;
diff --git a/src/libmime/mime_string.hxx b/src/libmime/mime_string.hxx
index 7476816c6..b181576d3 100644
--- a/src/libmime/mime_string.hxx
+++ b/src/libmime/mime_string.hxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2024 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -28,7 +28,7 @@
#include "libutil/mem_pool.h"
#include "function2/function2.hpp"
#include "unicode/utf8.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
namespace rspamd::mime {
/*
diff --git a/src/libmime/scan_result.c b/src/libmime/scan_result.c
index 894ae4f9e..992a8ea49 100644
--- a/src/libmime/scan_result.c
+++ b/src/libmime/scan_result.c
@@ -21,7 +21,7 @@
#include "lua/lua_common.h"
#include "libserver/cfg_file_private.h"
#include "libmime/scan_result_private.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#include <math.h>
#include "contrib/uthash/utlist.h"
diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx
index 38adf8390..9612cdae4 100644
--- a/src/libserver/cfg_utils.cxx
+++ b/src/libserver/cfg_utils.cxx
@@ -35,7 +35,7 @@
#include "cryptobox.h"
#include "ssl_util.h"
#include "contrib/libottery/ottery.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#ifdef SYS_ZSTD
#include "zstd.h"
@@ -2658,14 +2658,6 @@ rspamd_init_libs(void)
/* Configure utf8 library */
unsigned int utf8_flags = 0;
-
- if ((ctx->crypto_ctx->cpu_config & CPUID_SSE41)) {
- utf8_flags |= RSPAMD_FAST_UTF8_FLAG_SSE41;
- }
- if ((ctx->crypto_ctx->cpu_config & CPUID_AVX2)) {
- utf8_flags |= RSPAMD_FAST_UTF8_FLAG_AVX2;
- }
-
rspamd_fast_utf8_library_init(utf8_flags);
#ifdef HAVE_LOCALE_H
diff --git a/src/libserver/maps/map_helpers.c b/src/libserver/maps/map_helpers.c
index 505932563..6f14a797a 100644
--- a/src/libserver/maps/map_helpers.c
+++ b/src/libserver/maps/map_helpers.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2024 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
#include "rspamd.h"
#include "cryptobox.h"
#include "mempool_vars_internal.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#include "contrib/cdb/cdb.h"
#ifdef WITH_HYPERSCAN
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 1196d2d14..b683547a1 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -26,7 +26,7 @@
#include "protocol_internal.h"
#include "libserver/mempool_vars_internal.h"
#include "libserver/worker_util.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#include "task.h"
#include "lua/lua_classnames.h"
#include <math.h>
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 0644980da..06e9f3328 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -46,7 +46,7 @@
#include <pcre2.h>
#endif
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 67b7e948f..acf082708 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -18,6 +18,7 @@ SET(LIBRSPAMDUTILSRC
${CMAKE_CURRENT_SOURCE_DIR}/heap.c
${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c
${CMAKE_CURRENT_SOURCE_DIR}/cxx/utf8_util.cxx
+ ${CMAKE_CURRENT_SOURCE_DIR}/cxx/rspamd-simdutf.cxx
${CMAKE_CURRENT_SOURCE_DIR}/cxx/util_tests.cxx
${CMAKE_CURRENT_SOURCE_DIR}/cxx/file_util.cxx)
# Rspamdutil
diff --git a/src/libutil/cxx/rspamd-simdutf.cxx b/src/libutil/cxx/rspamd-simdutf.cxx
new file mode 100644
index 000000000..67b585812
--- /dev/null
+++ b/src/libutil/cxx/rspamd-simdutf.cxx
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * A simple interface to the simdutf library that allows the old functions to keep working
+ */
+
+#include "config.h"
+#include "simdutf.h"
+
+extern "C" {
+
+void rspamd_fast_utf8_library_init(unsigned flags)
+{
+ // Intentionally empty: this library requires no initialisation, so `flags` is accepted but ignored
+}
+
+off_t rspamd_fast_utf8_validate(const unsigned char *data, size_t len)
+{
+ auto res = simdutf::validate_utf8_with_errors((const char *) data, len);
+ // Valid input is reported as 0; every failure is reported as a value derived from res.count
+ if (res.error == simdutf::error_code::SUCCESS) {
+ return 0;
+ }
+ // NOTE(review): assumes res.count is the 0-based position of the error, as simdutf documents - confirm
+ return res.count + 1;// Offset of the first invalid character, shifted by one so it differs from the 0 used for success
+}
+} \ No newline at end of file
diff --git a/src/libutil/fstring.c b/src/libutil/fstring.c
index 082620c27..8da6b0068 100644
--- a/src/libutil/fstring.c
+++ b/src/libutil/fstring.c
@@ -15,7 +15,7 @@
*/
#include "fstring.h"
#include "str_util.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#include "contrib/mumhash/mum.h"
diff --git a/src/libutil/regexp.c b/src/libutil/regexp.c
index 9e98699fe..0646285ae 100644
--- a/src/libutil/regexp.c
+++ b/src/libutil/regexp.c
@@ -19,7 +19,7 @@
#include "ref.h"
#include "util.h"
#include "rspamd.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#ifndef WITH_PCRE2
/* Normal pcre path */
diff --git a/src/libutil/rspamd_simdutf.h b/src/libutil/rspamd_simdutf.h
new file mode 100644
index 000000000..c1fa07892
--- /dev/null
+++ b/src/libutil/rspamd_simdutf.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_SIMDUTF_H
+#define RSPAMD_RSPAMD_SIMDUTF_H
+
+#pragma once
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void rspamd_fast_utf8_library_init(unsigned flags); /* does nothing; `flags` is ignored */
+off_t rspamd_fast_utf8_validate(const unsigned char *data, size_t len); /* returns 0 if `data` is valid UTF-8, otherwise the reported error position plus one */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//RSPAMD_RSPAMD_SIMDUTF_H
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index f8fff0dca..b3e47b7d4 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -31,7 +31,7 @@
#include <immintrin.h>
#endif
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
const unsigned char lc_map[256] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
diff --git a/src/lua/lua_text.c b/src/lua/lua_text.c
index 3342fc95c..7ce7440c7 100644
--- a/src/lua/lua_text.c
+++ b/src/lua/lua_text.c
@@ -16,7 +16,7 @@
#include "lua_common.h"
#include "libcryptobox/cryptobox.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
#include "unix-std.h"
/***
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index e92e4977a..14994751c 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -32,7 +32,7 @@
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
-#include "contrib/fastutf8/fastutf8.h"
+#include "rspamd_simdutf.h"
/***
* @module rspamd_util