std::pair avx2_convert_latin1_to_utf32(const char *buf, size_t len, char32_t *utf32_output) { size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 for (size_t i = 0; i < rounded_len; i += 8) { // Load 8 Latin1 characters into a 64-bit register __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]); // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using // vpmovzxbd __m256i out = _mm256_cvtepu8_epi32(in); // Store the results back to memory _mm256_storeu_si256((__m256i *)&utf32_output[i], out); } // return pointers pointing to where we left off return std::make_pair(buf + rounded_len, utf32_output + rounded_len); }