From 53bbf800f30c623a8750d4b9957d108a739542dc Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Mon, 4 Mar 2024 14:43:44 +0000
Subject: [PATCH] [Minor] Update stringzilla library to 3.5.0

---
 contrib/DEPENDENCY_INFO.md                    |   2 +-
 contrib/stringzilla/LICENSE                   | 201 ----
 .../include/stringzilla/stringzilla.h         | 998 ++++++++++++++----
 .../include/stringzilla/stringzilla.hpp       | 159 ++-
 contrib/stringzilla/lib.c                     |  45 +-
 5 files changed, 956 insertions(+), 449 deletions(-)
 delete mode 100644 contrib/stringzilla/LICENSE

diff --git a/contrib/DEPENDENCY_INFO.md b/contrib/DEPENDENCY_INFO.md
index bd4e6742b..40aa4686d 100644
--- a/contrib/DEPENDENCY_INFO.md
+++ b/contrib/DEPENDENCY_INFO.md
@@ -38,5 +38,5 @@
 | ankerl/svector         | 1.0.2   | MIT                 | NO      |                     |
 | ankerl/unordered_dense | 4.4.0   | MIT                 | NO      |                     |
 | backward-cpp           | 1.6     | MIT                 | NO      |                     |
-| stringzilla            | 3.0.0   | Apache2             | NO      |                     |
+| stringzilla            | 3.5.0   | Apache2             | NO      |                     |
 
diff --git a/contrib/stringzilla/LICENSE b/contrib/stringzilla/LICENSE
deleted file mode 100644
index 261eeb9e9..000000000
--- a/contrib/stringzilla/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/contrib/stringzilla/include/stringzilla/stringzilla.h b/contrib/stringzilla/include/stringzilla/stringzilla.h
index 283b0ca76..1b4d3e285 100644
--- a/contrib/stringzilla/include/stringzilla/stringzilla.h
+++ b/contrib/stringzilla/include/stringzilla/stringzilla.h
@@ -24,7 +24,7 @@
 #define STRINGZILLA_H_
 
 #define STRINGZILLA_VERSION_MAJOR 3
-#define STRINGZILLA_VERSION_MINOR 0
+#define STRINGZILLA_VERSION_MINOR 5
 #define STRINGZILLA_VERSION_PATCH 0
 
 /**
@@ -41,14 +41,18 @@
 
 /**
  *  @brief  A misaligned load can be - trying to fetch eight consecutive bytes from an address
- *          that is not divisible by eight.
+ *          that is not divisible by eight. On x86 enabled by default. On ARM it's not.
  *
  *  Most platforms support it, but there is no industry standard way to check for those.
  *  This value will mostly affect the performance of the serial (SWAR) backend.
  */
 #ifndef SZ_USE_MISALIGNED_LOADS
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#define SZ_USE_MISALIGNED_LOADS (1) // true or false
+#else
 #define SZ_USE_MISALIGNED_LOADS (0) // true or false
 #endif
+#endif
 
 /**
  *  @brief  Removes compile-time dispatching, and replaces it with runtime dispatching.
@@ -75,6 +79,25 @@
 #define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits.
 #endif
 
+/**
+ *  @brief  On Big-Endian machines StringZilla will work in compatibility mode.
+ *          This disables SWAR hacks to minimize code duplication, assuming practically
+ *          all modern popular platforms are Little-Endian.
+ *
+ *  This variable is hard to infer from macros reliably. It's best to set it manually.
+ *  For that CMake provides the `TestBigEndian` and `CMAKE_<LANG>_BYTE_ORDER` (from 3.20 onwards).
+ *  In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro.
+ *  https://stackoverflow.com/a/27054190
+ */
+#ifndef SZ_DETECT_BIG_ENDIAN
+#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \
+    defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__)
+#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture
+#else
+#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture
+#endif
+#endif
+
 /*
  *  Debugging and testing.
  */
@@ -86,6 +109,19 @@
 #endif
 #endif
 
+/**
+ *  @brief  Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops.
+ *          On very short strings, under 16 bytes long, at most a single word will be processed with SWAR.
+ *          Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes.
+ */
+#ifndef SZ_SWAR_THRESHOLD
+#if SZ_DEBUG
+#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds
+#else
+#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds
+#endif
+#endif
+
 /*  Annotation for the public API symbols:
  *
  *  - `SZ_PUBLIC` is used for functions that are part of the public API.
@@ -168,6 +204,8 @@ typedef char *sz_ptr_t;          // A type alias for `char *`
 typedef char const *sz_cptr_t;   // A type alias for `char const *`
 typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions
 
+typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings
+
 typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t;                        // Only one relevant bit
 typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=>
 
@@ -263,6 +301,8 @@ typedef struct sz_memory_allocator_t {
 
 /**
  *  @brief  Initializes a memory allocator to use the system default `malloc` and `free`.
+ *          ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`.
+ *
  *  @param alloc    Memory allocator to initialize.
  */
 SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc);
@@ -285,13 +325,13 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void
 #ifdef SZ_STRING_INTERNAL_SPACE
 #undef SZ_STRING_INTERNAL_SPACE
 #endif
-#define SZ_STRING_INTERNAL_SPACE (23)
+#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length
 
 /**
  *  @brief  Tiny memory-owning string structure with a Small String Optimization (SSO).
  *          Differs in layout from Folly, Clang, GCC, and probably most other implementations.
  *          It's designed to avoid any branches on read-only operations, and can store up
- *          to 22 characters on stack, followed by the SZ_NULL-termination character.
+ *          to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character.
  *
  *  @section Changing Length
  *
@@ -302,21 +342,39 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void
  */
 typedef union sz_string_t {
 
+#if !SZ_DETECT_BIG_ENDIAN
+
+    struct external {
+        sz_ptr_t start;
+        sz_size_t length;
+        sz_size_t space;
+        sz_size_t padding;
+    } external;
+
     struct internal {
         sz_ptr_t start;
         sz_u8_t length;
         char chars[SZ_STRING_INTERNAL_SPACE];
     } internal;
 
+#else
+
     struct external {
         sz_ptr_t start;
-        sz_size_t length;
-        /// @brief Number of bytes, that have been allocated for this string, equals to (capacity + 1).
         sz_size_t space;
         sz_size_t padding;
+        sz_size_t length;
     } external;
 
-    sz_u64_t u64s[4];
+    struct internal {
+        sz_ptr_t start;
+        char chars[SZ_STRING_INTERNAL_SPACE];
+        sz_u8_t length;
+    } internal;
+
+#endif
+
+    sz_size_t words[4];
 
 } sz_string_t;
 
@@ -413,6 +471,15 @@ SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
  */
 SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
 
+/**
+ *  @brief  Checks if all characters in the range are valid ASCII characters.
+ *
+ *  @param text     String to be analyzed.
+ *  @param length   Number of bytes in the string.
+ *  @return         Whether all characters are valid ASCII characters.
+ */
+SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length);
+
 /**
  *  @brief  Generates a random string for a given alphabet, avoiding integer division and modulo operations.
  *          Similar to `text[i] = alphabet[rand() % cardinality]`.
@@ -430,8 +497,12 @@ SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
  *  @param generate     Callback producing random numbers given the generator state.
  *  @param generator    Generator state, can be a pointer to a seed, or a pointer to a random number generator.
  */
-SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length,
-                           sz_random_generator_t generate, void *generator);
+SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length,
+                            sz_random_generator_t generate, void *generator);
+
+/** @copydoc sz_generate */
+SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length,
+                                  sz_random_generator_t generate, void *generator);
 
 /**
  *  @brief  Similar to `memcpy`, copies contents of one string into another.
@@ -681,6 +752,54 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz
 
 #pragma region String Similarity Measures API
 
+/**
+ *  @brief  Computes the Hamming distance between two strings - number of not matching characters.
+ *          Difference in length is is counted as a mismatch.
+ *
+ *  @param a        First string to compare.
+ *  @param a_length Number of bytes in the first string.
+ *  @param b        Second string to compare.
+ *  @param b_length Number of bytes in the second string.
+ *
+ *  @param bound    Upper bound on the distance, that allows us to exit early.
+ *                  If zero is passed, the maximum possible distance will be equal to the length of the longer input.
+ *  @return         Unsigned integer for the distance, the `bound` if was exceeded.
+ *
+ *  @see    sz_hamming_distance_utf8
+ *  @see    https://en.wikipedia.org/wiki/Hamming_distance
+ */
+SZ_DYNAMIC sz_size_t sz_hamming_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
+                                         sz_size_t bound);
+
+/** @copydoc sz_hamming_distance */
+SZ_PUBLIC sz_size_t sz_hamming_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
+                                               sz_size_t bound);
+
+/**
+ *  @brief  Computes the Hamming distance between two @b UTF8 strings - number of not matching characters.
+ *          Difference in length is is counted as a mismatch.
+ *
+ *  @param a        First string to compare.
+ *  @param a_length Number of bytes in the first string.
+ *  @param b        Second string to compare.
+ *  @param b_length Number of bytes in the second string.
+ *
+ *  @param bound    Upper bound on the distance, that allows us to exit early.
+ *                  If zero is passed, the maximum possible distance will be equal to the length of the longer input.
+ *  @return         Unsigned integer for the distance, the `bound` if was exceeded.
+ *
+ *  @see    sz_hamming_distance
+ *  @see    https://en.wikipedia.org/wiki/Hamming_distance
+ */
+SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
+                                              sz_size_t bound);
+
+/** @copydoc sz_hamming_distance_utf8 */
+SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
+                                                    sz_size_t bound);
+
+typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t);
+
 /**
  *  @brief  Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm.
  *          Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching.
@@ -708,8 +827,35 @@ SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t
 SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
                                             sz_size_t bound, sz_memory_allocator_t *alloc);
 
+/**
+ *  @brief  Computes the Levenshtein edit-distance between two @b UTF8 strings.
+ *          Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes.
+ *
+ *  @param a        First string to compare.
+ *  @param a_length Number of bytes in the first string.
+ *  @param b        Second string to compare.
+ *  @param b_length Number of bytes in the second string.
+ *
+ *  @param alloc    Temporary memory allocator. Only some of the rows of the matrix will be allocated,
+ *                  so the memory usage is linear in relation to ::a_length and ::b_length.
+ *                  If SZ_NULL is passed, will initialize to the systems default `malloc`.
+ *  @param bound    Upper bound on the distance, that allows us to exit early.
+ *                  If zero is passed, the maximum possible distance will be equal to the length of the longer input.
+ *  @return         Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX`
+ *                  if the memory allocation failed.
+ *
+ *  @see    sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance
+ *  @see    https://en.wikipedia.org/wiki/Levenshtein_distance
+ */
+SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
+                                           sz_size_t bound, sz_memory_allocator_t *alloc);
+
 typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *);
 
+/** @copydoc sz_edit_distance_utf8 */
+SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
+                                                 sz_size_t bound, sz_memory_allocator_t *alloc);
+
 /**
  *  @brief  Computes NeedlemanâWunsch alignment score for two string. Often used in bioinformatics and cheminformatics.
  *          Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties.
@@ -870,7 +1016,7 @@ typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_s
 typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t);
 
 typedef struct sz_sequence_t {
-    sz_u64_t *order;
+    sz_sorted_idx_t *order;
     sz_size_t count;
     sz_sequence_member_start_t get_start;
     sz_sequence_member_length_t get_length;
@@ -1091,8 +1237,10 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_c
  */
 #ifdef __GNUG__
 #define SZ_NULL __null
+#define SZ_NULL_CHAR __null
 #else
 #define SZ_NULL ((void *)0)
+#define SZ_NULL_CHAR ((char *)0)
 #endif
 
 /**
@@ -1120,27 +1268,61 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_c
 #define sz_assert(condition) ((void)0)
 #endif
 
-/*
- *  Intrinsics aliases for MSVC, GCC, and Clang.
+/*  Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl.
+ *  The following section of compiler intrinsics comes in 2 flavors.
  */
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL
 #include <intrin.h>
-SZ_INTERNAL sz_size_t sz_u64_popcount(sz_u64_t x) { return __popcnt64(x); }
-SZ_INTERNAL sz_size_t sz_u64_ctz(sz_u64_t x) { return _tzcnt_u64(x); }
-SZ_INTERNAL sz_size_t sz_u64_clz(sz_u64_t x) { return _lzcnt_u64(x); }
-SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); }
+
+// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`,
+// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop.
+// In the future we can switch to a more efficient De Bruijn's algorithm.
+// https://www.chessprogramming.org/BitScan
+// https://www.chessprogramming.org/De_Bruijn_Sequence
+// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f
+//
+// Use the serial version on 32-bit x86 and on Arm.
+#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64)
+SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) {
+    sz_assert(x != 0);
+    int n = 0;
+    while ((x & 1) == 0) { n++, x >>= 1; }
+    return n;
+}
+SZ_INTERNAL int sz_u64_clz(sz_u64_t x) {
+    sz_assert(x != 0);
+    int n = 0;
+    while ((x & 0x8000000000000000ULL) == 0) { n++, x <<= 1; }
+    return n;
+}
+SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) {
+    x = x - ((x >> 1) & 0x5555555555555555);
+    x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
+    return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F) * 0x0101010101010101) >> 56;
+}
+SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) {
+    x = x - ((x >> 1) & 0x55555555);
+    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+    return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
+}
+#else
+SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return _tzcnt_u64(x); }
+SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return _lzcnt_u64(x); }
+SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __popcnt64(x); }
 SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __popcnt(x); }
+#endif
 SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return _tzcnt_u32(x); }
 SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return _lzcnt_u32(x); }
+SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); }
 SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); }
 #else
 SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); }
+SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); }
 SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); }
 SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); }
-SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); }
-SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); }
 SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0`
 SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0`
+SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); }
 SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); }
 #endif
 
@@ -1231,7 +1413,9 @@ SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
     x |= x >> 4;
     x |= x >> 8;
     x |= x >> 16;
+#if SZ_DETECT_64_BIT
     x |= x >> 32;
+#endif
     x++;
     return x;
 }
@@ -1264,6 +1448,15 @@ SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) {
     *b = t;
 }
 
+/**
+ *  @brief  Helper, that swaps two 64-bit integers representing the order of elements in the sequence.
+ */
+SZ_INTERNAL void sz_pointer_swap(void **a, void **b) {
+    void *t = *a;
+    *a = *b;
+    *b = t;
+}
+
 /**
  *  @brief  Helper structure to simplify work with 16-bit words.
  *  @see    sz_u16_load
@@ -1282,8 +1475,12 @@ SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) {
     result.u8s[0] = ptr[0];
     result.u8s[1] = ptr[1];
     return result;
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform.
+    return *((sz_u16_vec_t *)ptr);
+#else
     return *((__unaligned sz_u16_vec_t *)ptr);
+#endif
 #else
     __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr;
     return *result;
@@ -1311,8 +1508,12 @@ SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) {
     result.u8s[2] = ptr[2];
     result.u8s[3] = ptr[3];
     return result;
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform.
+    return *((sz_u32_vec_t *)ptr);
+#else
     return *((__unaligned sz_u32_vec_t *)ptr);
+#endif
 #else
     __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr;
     return *result;
@@ -1345,8 +1546,12 @@ SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) {
     result.u8s[6] = ptr[6];
     result.u8s[7] = ptr[7];
     return result;
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform.
+    return *((sz_u64_vec_t *)ptr);
+#else
     return *((__unaligned sz_u64_vec_t *)ptr);
+#endif
 #else
     __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr;
     return *result;
@@ -1358,7 +1563,7 @@ SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) {
     sz_size_t capacity;
     sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t));
     sz_size_t consumed_capacity = sizeof(sz_size_t);
-    if (consumed_capacity + length > capacity) return SZ_NULL;
+    if (consumed_capacity + length > capacity) return SZ_NULL_CHAR;
     return (sz_ptr_t)handle + consumed_capacity;
 }
 
@@ -1420,14 +1625,51 @@ SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length,
 
     // Loop through letters to find non-colliding variants.
     if (length > 3 && has_duplicates) {
-        // Pivot the middle point left, until we find a character different from the first one.
-        for (; start[*second] == start[*first] && *second; --(*second)) {}
         // Pivot the middle point right, until we find a character different from the first one.
         for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {}
         // Pivot the third (last) point left, until we find a different character.
         for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1);
              --(*third)) {}
     }
+
+    // TODO: Investigate alternative strategies for long needles.
+    // On very long needles we have the luxury to choose!
+    // Often dealing with UTF8, we will likely benfit from shifting the first and second characters
+    // further to the right, to achieve not only uniqness within the needle, but also avoid common
+    // rune prefixes of 2-, 3-, and 4-byte codes.
+    if (length > 8) {
+        // Pivot the first and second points right, until we find a character, that:
+        // > is different from others.
+        // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info.
+        // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info.
+        // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info.
+        //
+        // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx.
+        // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191.
+        sz_u8_t const *start_u8 = (sz_u8_t const *)start;
+        sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third;
+
+        // Let's begin with the seccond character, as the termination criterea there is more obvious
+        // and we may end up with more variants to check for the first candidate.
+        for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) &&
+               (vibrant_second + 1 < vibrant_third);
+             ++vibrant_second) {}
+
+        // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`.
+        if (start_u8[vibrant_second] < 191) { *second = vibrant_second; }
+        else { vibrant_second = *second; }
+
+        // Now check the first character.
+        for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] ||
+                start_u8[vibrant_first] == start_u8[vibrant_third]) &&
+               (vibrant_first + 1 < vibrant_second);
+             ++vibrant_first) {}
+
+        // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`.
+        // We don't need to shift the third one when dealing with texts as the last byte of the text is
+        // also the last byte of a rune and contains the most information.
+        if (start_u8[vibrant_first] < 191) { *first = vibrant_first; }
+    }
 }
 
 #pragma GCC visibility pop
@@ -1438,14 +1680,16 @@ SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length,
 #if !SZ_AVOID_LIBC
 #include <stdio.h>  // `fprintf`
 #include <stdlib.h> // `malloc`, `EXIT_FAILURE`
-#else
-extern void *malloc(size_t);
-extern void free(void *);
 #endif
 
 SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) {
+#if !SZ_AVOID_LIBC
     alloc->allocate = (sz_memory_allocate_t)malloc;
     alloc->free = (sz_memory_free_t)free;
+#else
+    alloc->allocate = (sz_memory_allocate_t)SZ_NULL;
+    alloc->free = (sz_memory_free_t)SZ_NULL;
+#endif
     alloc->handle = SZ_NULL;
 }
 
@@ -1464,6 +1708,16 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void
  */
 SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
     sz_cptr_t const a_end = a + length;
+#if SZ_USE_MISALIGNED_LOADS
+    if (length >= SZ_SWAR_THRESHOLD) {
+        sz_u64_vec_t a_vec, b_vec;
+        for (; a + 8 <= a_end; a += 8, b += 8) {
+            a_vec = sz_u64_load(a);
+            b_vec = sz_u64_load(b);
+            if (a_vec.u64 != b_vec.u64) return sz_false_k;
+        }
+    }
+#endif
     while (a != a_end && *a == *b) a++, b++;
     return (sz_bool_t)(a_end == a);
 }
@@ -1471,7 +1725,7 @@ SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length)
 SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) {
     for (sz_cptr_t const end = text + length; text != end; ++text)
         if (sz_charset_contains(set, *text)) return text;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) {
@@ -1480,25 +1734,38 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz
     sz_cptr_t const end = text;
     for (text += length; text != end;)
         if (sz_charset_contains(set, *(text -= 1))) return text;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 #pragma GCC diagnostic pop
 }
 
+/**
+ *  One option to avoid branching is to use conditional moves and lookup the comparison result in a table:
+ *       sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k};
+ *       for (; a != min_end; ++a, ++b)
+ *           if (*a != *b) return ordering_lookup[*a < *b];
+ *  That, however, introduces a data-dependency.
+ *  A cleaner option is to perform two comparisons and a subtraction.
+ *  One instruction more, but no data-dependency.
+ */
+#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b)))
+
 SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
-    sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k};
     sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length);
     sz_size_t min_length = a_shorter ? a_length : b_length;
     sz_cptr_t min_end = a + min_length;
-#if SZ_USE_MISALIGNED_LOADS
+#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
     for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) {
-        a_vec.u64 = sz_u64_bytes_reverse(sz_u64_load(a).u64);
-        b_vec.u64 = sz_u64_bytes_reverse(sz_u64_load(b).u64);
-        if (a_vec.u64 != b_vec.u64) return ordering_lookup[a_vec.u64 < b_vec.u64];
+        a_vec = sz_u64_load(a);
+        b_vec = sz_u64_load(b);
+        if (a_vec.u64 != b_vec.u64)
+            return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64));
     }
 #endif
     for (; a != min_end; ++a, ++b)
-        if (*a != *b) return ordering_lookup[*a < *b];
-    return a_length != b_length ? ordering_lookup[a_shorter] : sz_equal_k;
+        if (*a != *b) return _sz_order_scalars(*a, *b);
+
+    // If the strings are equal up to `min_end`, then the shorter string is smaller
+    return _sz_order_scalars(a_length, b_length);
 }
 
 /**
@@ -1522,11 +1789,11 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b)
  */
 SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
 
-    if (!h_length) return SZ_NULL;
+    if (!h_length) return SZ_NULL_CHAR;
     sz_cptr_t const h_end = h + h_length;
 
-#if !SZ_USE_MISALIGNED_LOADS
-    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+#if !SZ_DETECT_BIG_ENDIAN    // Use SWAR only on little-endian platforms for brevety.
+#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((sz_size_t)h & 7ull) && h < h_end; ++h)
         if (*h == *n) return h;
 #endif
@@ -1541,11 +1808,12 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr
         match_vec = _sz_u64_each_byte_equal(h_vec, n_vec);
         if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8;
     }
+#endif
 
     // Handle the misaligned tail.
     for (; h < h_end; ++h)
         if (*h == *n) return h;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1555,14 +1823,14 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr
  */
 sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
 
-    if (!h_length) return SZ_NULL;
+    if (!h_length) return SZ_NULL_CHAR;
     sz_cptr_t const h_start = h;
 
     // Reposition the `h` pointer to the end, as we will be walking backwards.
     h = h + h_length - 1;
 
-#if !SZ_USE_MISALIGNED_LOADS
-    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+#if !SZ_DETECT_BIG_ENDIAN    // Use SWAR only on little-endian platforms for brevety.
+#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h)
         if (*h == *n) return h;
 #endif
@@ -1576,10 +1844,11 @@ sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
         match_vec = _sz_u64_each_byte_equal(h_vec, n_vec);
         if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8;
     }
+#endif
 
     for (; h >= h_start; --h)
         if (*h == *n) return h;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1633,7 +1902,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_
 
     for (; h + 2 <= h_end; ++h)
         if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1697,7 +1966,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_
 
     for (; h + 4 <= h_end; ++h)
         if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1768,7 +2037,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_
 
     for (; h + 3 <= h_end; ++h)
         if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1821,7 +2090,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars,
         if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i;
         i += bad_shift_table.jumps[h[i + n_length - 1]];
     }
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1872,7 +2141,7 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars,
         if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i;
         j += bad_shift_table.jumps[h[i]];
     }
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1885,11 +2154,11 @@ SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_c
     sz_size_t suffix_length = n_length - prefix_length;
     while (1) {
         sz_cptr_t found = find_prefix(h, h_length, n, prefix_length);
-        if (!found) return SZ_NULL;
+        if (!found) return SZ_NULL_CHAR;
 
         // Verify the remaining part of the needle
         sz_size_t remaining = h_length - (found - h);
-        if (remaining < suffix_length) return SZ_NULL;
+        if (remaining < suffix_length) return SZ_NULL_CHAR;
         if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found;
 
         // Adjust the position.
@@ -1898,7 +2167,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_c
     }
 
     // Unreachable, but helps silence compiler warnings:
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -1911,11 +2180,11 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_
     sz_size_t prefix_length = n_length - suffix_length;
     while (1) {
         sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length);
-        if (!found) return SZ_NULL;
+        if (!found) return SZ_NULL_CHAR;
 
         // Verify the remaining part of the needle
         sz_size_t remaining = found - h;
-        if (remaining < prefix_length) return SZ_NULL;
+        if (remaining < prefix_length) return SZ_NULL_CHAR;
         if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length;
 
         // Adjust the position.
@@ -1923,7 +2192,7 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_
     }
 
     // Unreachable, but helps silence compiler warnings:
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
@@ -1943,8 +2212,17 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_si
 SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
+
+#if SZ_DETECT_BIG_ENDIAN
+    sz_find_t backends[] = {
+        (sz_find_t)sz_find_byte_serial,
+        (sz_find_t)_sz_find_horspool_upto_256bytes_serial,
+        (sz_find_t)_sz_find_horspool_over_256bytes_serial,
+    };
 
+    return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length);
+#else
     sz_find_t backends[] = {
         // For very short strings brute-force SWAR makes sense.
         (sz_find_t)sz_find_byte_serial,
@@ -1965,12 +2243,13 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
         (n_length > 4) +
         // For longer needles - use skip tables.
         (n_length > 8) + (n_length > 256)](h, h_length, n, n_length);
+#endif
 }
 
 SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
 
     sz_find_t backends[] = {
         // For very short strings brute-force SWAR makes sense.
@@ -2073,10 +2352,91 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( //
     return result;
 }
 
+/**
+ *  @brief  Describes the length of a UTF8 character / codepoint / rune in bytes.
+ */
+typedef enum {
+    sz_utf8_invalid_k = 0,     //!< Invalid UTF8 character.
+    sz_utf8_rune_1byte_k = 1,  //!< 1-byte UTF8 character.
+    sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character.
+    sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character.
+    sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character.
+} sz_rune_length_t;
+
+typedef sz_u32_t sz_rune_t;
+
+/**
+ *  @brief  Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer.
+ */
+SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) {
+    sz_u8_t const *current = (sz_u8_t const *)utf8;
+    sz_u8_t leading_byte = *current++;
+    sz_rune_t ch;
+    sz_rune_length_t ch_length;
+
+    // TODO: This can be made entirely branchless using 32-bit SWAR.
+    if (leading_byte < 0x80) {
+        // Single-byte rune (0xxxxxxx)
+        ch = leading_byte;
+        ch_length = sz_utf8_rune_1byte_k;
+    }
+    else if ((leading_byte & 0xE0) == 0xC0) {
+        // Two-byte rune (110xxxxx 10xxxxxx)
+        ch = (leading_byte & 0x1F) << 6;
+        ch |= (*current++ & 0x3F);
+        ch_length = sz_utf8_rune_2bytes_k;
+    }
+    else if ((leading_byte & 0xF0) == 0xE0) {
+        // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx)
+        ch = (leading_byte & 0x0F) << 12;
+        ch |= (*current++ & 0x3F) << 6;
+        ch |= (*current++ & 0x3F);
+        ch_length = sz_utf8_rune_3bytes_k;
+    }
+    else if ((leading_byte & 0xF8) == 0xF0) {
+        // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+        ch = (leading_byte & 0x07) << 18;
+        ch |= (*current++ & 0x3F) << 12;
+        ch |= (*current++ & 0x3F) << 6;
+        ch |= (*current++ & 0x3F);
+        ch_length = sz_utf8_rune_4bytes_k;
+    }
+    else {
+        // Invalid UTF8 rune.
+        ch = 0;
+        ch_length = sz_utf8_invalid_k;
+    }
+    *code = ch;
+    *code_length = ch_length;
+}
+
+/**
+ *  @brief  Exports a UTF8 string into a UTF32 buffer.
+ *          ! The result is undefined id the UTF8 string is corrupted.
+ *  @return The length in the number of codepoints.
+ */
+SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) {
+    sz_cptr_t const end = utf8 + utf8_length;
+    sz_size_t count = 0;
+    sz_rune_length_t rune_length;
+    for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length);
+    return count;
+}
+
+/**
+ *  @brief  Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm.
+ *          Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values,
+ *          and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing.
+ *
+ *  ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra:
+ *      + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows.
+ *      + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer.
+ *      = 2400 bytes of memory or @b 12x memory amplification!
+ */
 SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( //
     sz_cptr_t longer, sz_size_t longer_length,                //
     sz_cptr_t shorter, sz_size_t shorter_length,              //
-    sz_size_t bound, sz_memory_allocator_t *alloc) {
+    sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) {
 
     // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
     sz_memory_allocator_t global_alloc;
@@ -2085,71 +2445,127 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( //
         alloc = &global_alloc;
     }
 
+    // A good idea may be to dispatch different kernels for different string lengths.
+    // Like using `uint8_t` counters for strings under 255 characters long.
+    // Good in theory, this results in frequent upcasts and downcasts in serial code.
+    // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time.
+    // So one must be very cautious with such optimizations.
+    typedef sz_size_t _distance_t;
+
+    // Compute the number of columns in our Levenshtein matrix.
+    sz_size_t const n = shorter_length + 1;
+
     // If a buffering memory-allocator is provided, this operation is practically free,
     // and cheaper than allocating even 512 bytes (for small distance matrices) on stack.
-    sz_size_t buffer_length = sizeof(sz_size_t) * ((shorter_length + 1) * 2);
-    sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle);
-    if (!distances) return SZ_SIZE_MAX;
+    sz_size_t buffer_length = sizeof(_distance_t) * (n * 2);
 
-    sz_size_t *previous_distances = distances;
-    sz_size_t *current_distances = previous_distances + shorter_length + 1;
+    // If the strings contain Unicode characters, let's estimate the max character width,
+    // and use it to allocate a larger buffer to decode UTF8.
+    if ((can_be_unicode == sz_true_k) &&
+        (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) {
+        buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t);
+    }
+    else { can_be_unicode = sz_false_k; }
+
+    // If the allocation fails, return the maximum distance.
+    sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle);
+    if (!buffer) return SZ_SIZE_MAX;
+
+    // Let's export the UTF8 sequence into the newly allocated buffer at the end.
+    if (can_be_unicode == sz_true_k) {
+        sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2));
+        sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length;
+        // Export the UTF8 sequences into the newly allocated buffer.
+        longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32);
+        shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32);
+        longer = (sz_cptr_t)longer_utf32;
+        shorter = (sz_cptr_t)shorter_utf32;
+    }
 
-    for (sz_size_t idx_shorter = 0; idx_shorter != (shorter_length + 1); ++idx_shorter)
-        previous_distances[idx_shorter] = idx_shorter;
+    // Let's parameterize the core logic for different character types and distance types.
+#define _wagner_fisher_unbounded(_distance_t, _char_t)                                                                \
+    /* Now let's cast our pointer to avoid it in subsequent sections. */                                              \
+    _char_t const *const longer_chars = (_char_t const *)longer;                                                      \
+    _char_t const *const shorter_chars = (_char_t const *)shorter;                                                    \
+    _distance_t *previous_distances = (_distance_t *)buffer;                                                          \
+    _distance_t *current_distances = previous_distances + n;                                                          \
+    /*  Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */               \
+    for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \
+    /* The main loop of the algorithm with quadratic complexity. */                                                   \
+    for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {                                     \
+        _char_t const longer_char = longer_chars[idx_longer];                                                         \
+        /* Using pure pointer arithmetic is faster than iterating with an index. */                                   \
+        _char_t const *shorter_ptr = shorter_chars;                                                                   \
+        _distance_t const *previous_ptr = previous_distances;                                                         \
+        _distance_t *current_ptr = current_distances;                                                                 \
+        _distance_t *const current_end = current_ptr + shorter_length;                                                \
+        current_ptr[0] = idx_longer + 1;                                                                              \
+        for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) {                            \
+            _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]);           \
+            /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */                          \
+            /* saving one increment operation. */                                                                     \
+            _distance_t cost_deletion = previous_ptr[1];                                                              \
+            _distance_t cost_insertion = current_ptr[0];                                                              \
+            /* ? It might be a good idea to enforce branchless execution here. */                                     \
+            /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */    \
+            current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1);      \
+        }                                                                                                             \
+        /* Swap `previous_distances` and `current_distances` pointers. */                                             \
+        _distance_t *temporary = previous_distances;                                                                  \
+        previous_distances = current_distances;                                                                       \
+        current_distances = temporary;                                                                                \
+    }                                                                                                                 \
+    /* Cache scalar before `free` call. */                                                                            \
+    sz_size_t result = previous_distances[shorter_length];                                                            \
+    alloc->free(buffer, buffer_length, alloc->handle);                                                                \
+    return result;
 
-    // Keeping track of the bound parameter introduces a very noticeable performance penalty.
-    // So if it's not provided, we can skip the check altogether.
+    // Let's define a separate variant for bounded distance computation.
+    // Practically the same as unbounded, but also collecting the running minimum within each row for early exit.
+#define _wagner_fisher_bounded(_distance_t, _char_t)                                                                  \
+    _char_t const *const longer_chars = (_char_t const *)longer;                                                      \
+    _char_t const *const shorter_chars = (_char_t const *)shorter;                                                    \
+    _distance_t *previous_distances = (_distance_t *)buffer;                                                          \
+    _distance_t *current_distances = previous_distances + n;                                                          \
+    for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \
+    for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {                                     \
+        _char_t const longer_char = longer_chars[idx_longer];                                                         \
+        _char_t const *shorter_ptr = shorter_chars;                                                                   \
+        _distance_t const *previous_ptr = previous_distances;                                                         \
+        _distance_t *current_ptr = current_distances;                                                                 \
+        _distance_t *const current_end = current_ptr + shorter_length;                                                \
+        current_ptr[0] = idx_longer + 1;                                                                              \
+        /* Initialize min_distance with a value greater than bound */                                                 \
+        _distance_t min_distance = bound - 1;                                                                         \
+        for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) {                            \
+            _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]);           \
+            _distance_t cost_deletion = previous_ptr[1];                                                              \
+            _distance_t cost_insertion = current_ptr[0];                                                              \
+            current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1);      \
+            /* Keep track of the minimum distance seen so far in this row */                                          \
+            min_distance = sz_min_of_two(current_ptr[1], min_distance);                                               \
+        }                                                                                                             \
+        /* If the minimum distance in this row exceeded the bound, return early */                                    \
+        if (min_distance >= bound) {                                                                                  \
+            alloc->free(buffer, buffer_length, alloc->handle);                                                        \
+            return bound;                                                                                             \
+        }                                                                                                             \
+        _distance_t *temporary = previous_distances;                                                                  \
+        previous_distances = current_distances;                                                                       \
+        current_distances = temporary;                                                                                \
+    }                                                                                                                 \
+    sz_size_t result = previous_distances[shorter_length];                                                            \
+    alloc->free(buffer, buffer_length, alloc->handle);                                                                \
+    return sz_min_of_two(result, bound);
+
+    // Dispatch the actual computation.
     if (!bound) {
-        for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {
-            current_distances[0] = idx_longer + 1;
-            for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
-                sz_size_t cost_deletion = previous_distances[idx_shorter + 1] + 1;
-                sz_size_t cost_insertion = current_distances[idx_shorter] + 1;
-                sz_size_t cost_substitution =
-                    previous_distances[idx_shorter] + (longer[idx_longer] != shorter[idx_shorter]);
-                // ? It might be a good idea to enforce branchless execution here.
-                // ? The caveat being that the benchmarks on longer sequences backfire and more research is needed.
-                current_distances[idx_shorter + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution);
-            }
-            sz_u64_swap((sz_u64_t *)&previous_distances, (sz_u64_t *)&current_distances);
-        }
-        // Cache scalar before `free` call.
-        sz_size_t result = previous_distances[shorter_length];
-        alloc->free(distances, buffer_length, alloc->handle);
-        return result;
+        if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); }
+        else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); }
     }
-    //
     else {
-        for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {
-            current_distances[0] = idx_longer + 1;
-
-            // Initialize min_distance with a value greater than bound
-            sz_size_t min_distance = bound - 1;
-
-            for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
-                sz_size_t cost_deletion = previous_distances[idx_shorter + 1] + 1;
-                sz_size_t cost_insertion = current_distances[idx_shorter] + 1;
-                sz_size_t cost_substitution =
-                    previous_distances[idx_shorter] + (longer[idx_longer] != shorter[idx_shorter]);
-                current_distances[idx_shorter + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution);
-
-                // Keep track of the minimum distance seen so far in this row
-                min_distance = sz_min_of_two(current_distances[idx_shorter + 1], min_distance);
-            }
-
-            // If the minimum distance in this row exceeded the bound, return early
-            if (min_distance >= bound) {
-                alloc->free(distances, buffer_length, alloc->handle);
-                return bound;
-            }
-
-            // Swap previous_distances and current_distances pointers
-            sz_u64_swap((sz_u64_t *)&previous_distances, (sz_u64_t *)&current_distances);
-        }
-        // Cache scalar before `free` call.
-        sz_size_t result = previous_distances[shorter_length] < bound ? previous_distances[shorter_length] : bound;
-        alloc->free(distances, buffer_length, alloc->handle);
-        return result;
+        if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); }
+        else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); }
     }
 }
 
@@ -2161,8 +2577,8 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial(     //
     // Let's make sure that we use the amount proportional to the
     // number of elements in the shorter string, not the larger.
     if (shorter_length > longer_length) {
-        sz_u64_swap((sz_u64_t *)&longer_length, (sz_u64_t *)&shorter_length);
-        sz_u64_swap((sz_u64_t *)&longer, (sz_u64_t *)&shorter);
+        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
+        sz_pointer_swap((void **)&longer, (void **)&shorter);
     }
 
     // Skip the matching prefixes and suffixes, they won't affect the distance.
@@ -2177,8 +2593,8 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial(     //
     // Bounded computations may exit early.
     if (bound) {
         // If one of the strings is empty - the edit distance is equal to the length of the other one.
-        if (longer_length == 0) return shorter_length <= bound ? shorter_length : bound;
-        if (shorter_length == 0) return longer_length <= bound ? longer_length : bound;
+        if (longer_length == 0) return sz_min_of_two(shorter_length, bound);
+        if (shorter_length == 0) return sz_min_of_two(longer_length, bound);
         // If the difference in length is beyond the `bound`, there is no need to check at all.
         if (longer_length - shorter_length > bound) return bound;
     }
@@ -2186,7 +2602,8 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial(     //
     if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero.
     if (shorter_length == longer_length && !bound)
         return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc);
-    return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, alloc);
+    return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k,
+                                                  alloc);
 }
 
 SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(       //
@@ -2202,8 +2619,8 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(       //
     // Let's make sure that we use the amount proportional to the
     // number of elements in the shorter string, not the larger.
     if (shorter_length > longer_length) {
-        sz_u64_swap((sz_u64_t *)&longer_length, (sz_u64_t *)&shorter_length);
-        sz_u64_swap((sz_u64_t *)&longer, (sz_u64_t *)&shorter);
+        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
+        sz_pointer_swap((void **)&longer, (void **)&shorter);
     }
 
     // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
@@ -2237,7 +2654,7 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(       //
         }
 
         // Swap previous_distances and current_distances pointers
-        sz_u64_swap((sz_u64_t *)&previous_distances, (sz_u64_t *)&current_distances);
+        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
     }
 
     // Cache scalar before `free` call.
@@ -2246,6 +2663,74 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(       //
     return result;
 }
 
+SZ_PUBLIC sz_size_t sz_hamming_distance_serial( //
+    sz_cptr_t a, sz_size_t a_length,            //
+    sz_cptr_t b, sz_size_t b_length,            //
+    sz_size_t bound) {
+
+    sz_size_t const min_length = sz_min_of_two(a_length, b_length);
+    sz_size_t const max_length = sz_max_of_two(a_length, b_length);
+    sz_cptr_t const a_end = a + min_length;
+    bound = bound == 0 ? max_length : bound;
+
+    // Walk through both strings using SWAR and counting the number of differing characters.
+    sz_size_t distance = max_length - min_length;
+#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
+    if (min_length >= SZ_SWAR_THRESHOLD) {
+        sz_u64_vec_t a_vec, b_vec, match_vec;
+        for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) {
+            a_vec.u64 = sz_u64_load(a).u64;
+            b_vec.u64 = sz_u64_load(b).u64;
+            match_vec = _sz_u64_each_byte_equal(a_vec, b_vec);
+            distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull);
+        }
+    }
+#endif
+
+    for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); }
+    return sz_min_of_two(distance, bound);
+}
+
+SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( //
+    sz_cptr_t a, sz_size_t a_length,                 //
+    sz_cptr_t b, sz_size_t b_length,                 //
+    sz_size_t bound) {
+
+    sz_cptr_t const a_end = a + a_length;
+    sz_cptr_t const b_end = b + b_length;
+    sz_size_t distance = 0;
+
+    sz_rune_t a_rune, b_rune;
+    sz_rune_length_t a_rune_length, b_rune_length;
+
+    if (bound) {
+        for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) {
+            _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
+            _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
+            distance += (a_rune != b_rune);
+        }
+        // If one string has more runes, we need to go through the tail.
+        if (distance < bound) {
+            for (; a < a_end && distance < bound; a += a_rune_length, ++distance)
+                _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
+
+            for (; b < b_end && distance < bound; b += b_rune_length, ++distance)
+                _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
+        }
+    }
+    else {
+        for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) {
+            _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
+            _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
+            distance += (a_rune != b_rune);
+        }
+        // If one string has more runes, we need to go through the tail.
+        for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
+        for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
+    }
+    return distance;
+}
+
 /**
  *  @brief  Largest prime number that fits into 31 bits.
  *  @see    https://mersenneforum.org/showthread.php?t=3471
@@ -2491,10 +2976,11 @@ SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) {
  *  @brief  Uses two small lookup tables (768 bytes total) to accelerate division by a small
  *          unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations.
  *
- *  @param  divisor Integral value larger than one.
+ *  @param  divisor Integral value @b larger than one.
  *  @param  number  Integral value to divide.
  */
 SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
+    sz_assert(divisor > 1);
     static sz_u16_t const multipliers[256] = {
         0,     0,     0,     21846, 0,     39322, 21846, 9363,  0,     50973, 39322, 29790, 21846, 15124, 9363,  4370,
         0,     57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363,  6780,  4370,  2115,
@@ -2553,18 +3039,50 @@ SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t resu
     for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F;
 }
 
-SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
-                           sz_random_generator_t generator, void *generator_user_data) {
+/**
+ *  @brief  Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
+ */
+SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) {
+
+    if (!length) return sz_true_k;
+    sz_u8_t const *h = (sz_u8_t const *)text;
+    sz_u8_t const *const h_end = h + length;
+
+#if !SZ_USE_MISALIGNED_LOADS
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((sz_size_t)h & 7ull) && h < h_end; ++h)
+        if (*h & 0x80ull) return sz_false_k;
+#endif
+
+    // Validate eight bytes at once using SWAR.
+    sz_u64_vec_t text_vec;
+    for (; h + 8 <= h_end; h += 8) {
+        text_vec.u64 = *(sz_u64_t const *)h;
+        if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k;
+    }
+
+    // Handle the misaligned tail.
+    for (; h < h_end; ++h)
+        if (*h & 0x80ull) return sz_false_k;
+    return sz_true_k;
+}
+
+SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
+                                  sz_random_generator_t generator, void *generator_user_data) {
 
     sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size");
 
-    if (alphabet_size == 1)
-        for (sz_cptr_t end = result + result_length; result != end; ++result) *result = *alphabet;
+    if (alphabet_size == 1) sz_fill(result, result_length, *alphabet);
 
     else {
         sz_assert(generator && "Expects a valid random generator");
-        for (sz_cptr_t end = result + result_length; result != end; ++result)
-            *result = alphabet[sz_u8_divide(generator(generator_user_data) & 0xFF, (sz_u8_t)alphabet_size)];
+        sz_u8_t divisor = (sz_u8_t)alphabet_size;
+        for (sz_cptr_t end = result + result_length; result != end; ++result) {
+            sz_u8_t random = generator(generator_user_data) & 0xFF;
+            sz_u8_t quotient = sz_u8_divide(random, divisor);
+            *result = alphabet[random - quotient * divisor];
+        }
     }
 }
 
@@ -2575,15 +3093,6 @@ SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t
  */
 #pragma region Serial Implementation for the String Class
 
-/**
- *  @brief  Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops.
- *          On very short strings, under 16 bytes long, at most a single word will be processed with SWAR.
- *          Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes.
- */
-#ifndef SZ_SWAR_THRESHOLD
-#define SZ_SWAR_THRESHOLD (24) // bytes
-#endif
-
 SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) {
     // It doesn't matter if it's on stack or heap, the pointer location is the same.
     return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]);
@@ -2650,18 +3159,18 @@ SZ_PUBLIC void sz_string_init(sz_string_t *string) {
     // But for safety let's initialize the entire structure to zeros.
     // string->internal.chars[0] = 0;
     // string->internal.length = 0;
-    string->u64s[1] = 0;
-    string->u64s[2] = 0;
-    string->u64s[3] = 0;
+    string->words[1] = 0;
+    string->words[2] = 0;
+    string->words[3] = 0;
 }
 
 SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) {
     sz_size_t space_needed = length + 1; // space for trailing \0
     sz_assert(string && allocator && "String and allocator can't be SZ_NULL.");
     // Initialize the string to zeros for safety.
-    string->u64s[1] = 0;
-    string->u64s[2] = 0;
-    string->u64s[3] = 0;
+    string->words[1] = 0;
+    string->words[2] = 0;
+    string->words[3] = 0;
     // If we are lucky, no memory allocations will be needed.
     if (space_needed <= SZ_STRING_INTERNAL_SPACE) {
         string->internal.start = &string->internal.chars[0];
@@ -2670,7 +3179,7 @@ SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length,
     else {
         // If we are not lucky, we need to allocate memory.
         string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle);
-        if (!string->external.start) return SZ_NULL;
+        if (!string->external.start) return SZ_NULL_CHAR;
         string->external.length = length;
         string->external.space = space_needed;
     }
@@ -2694,7 +3203,7 @@ SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity
     sz_assert(new_space > string_space && "New space must be larger than current.");
 
     sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle);
-    if (!new_start) return SZ_NULL;
+    if (!new_start) return SZ_NULL_CHAR;
 
     sz_copy(new_start, string_start, string_length);
     string->external.start = new_start;
@@ -2734,7 +3243,7 @@ SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_si
         sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1);
         sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size);
         string_start = sz_string_reserve(string, new_space - 1, allocator);
-        if (!string_start) return SZ_NULL;
+        if (!string_start) return SZ_NULL_CHAR;
 
         // Copy into the new buffer.
         sz_move(string_start + offset + added_length, string_start + offset, string_length - offset);
@@ -2806,7 +3315,7 @@ SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value)
 
 SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
 #if SZ_USE_MISALIGNED_LOADS
-    while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, length -= 8;
+    while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8;
 #endif
     while (length--) *(target++) = *(source++);
 }
@@ -2823,7 +3332,7 @@ SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt
     // but older CPUs may predict and fetch forward-passes better.
     if (target < source || target >= source + length) {
 #if SZ_USE_MISALIGNED_LOADS
-        while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, length -= 8;
+        while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8;
 #endif
         while (length--) *(target++) = *(source++);
     }
@@ -2831,7 +3340,7 @@ SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt
         // Jump to the end and walk backwards.
         target += length, source += length;
 #if SZ_USE_MISALIGNED_LOADS
-        while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t *)(source -= 8), length -= 8;
+        while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8;
 #endif
         while (length--) *(--target) = *(--source);
     }
@@ -3028,7 +3537,7 @@ SZ_PUBLIC void sz_sort_recursion( //
     //        if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split;
     //
     // This, however, doesn't take into account the high relative cost of writes and swaps.
-    // To cercumvent that, we can first count the total number entries to be mapped into either part.
+    // To circumvent that, we can first count the total number entries to be mapped into either part.
     // And then walk through both parts, swapping the entries that are in the wrong part.
     // This would often lead to ~15% performance gain.
     sz_size_t count_with_bit_set = 0;
@@ -3094,20 +3603,33 @@ SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key,
 
 SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) {
 
+#if SZ_DETECT_BIG_ENDIAN
+    // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing.
+    sz_unused(partial_order_length);
+    sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less);
+#else
+
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
         sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]);
         sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
-        length = length > 4ull ? 4ull : length;
+        length = length > 4u ? 4u : length;
         sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
     }
 
     // Perform optionally-parallel radix sort on them
     sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length);
+#endif
 }
 
-SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { sz_sort_partial(sequence, sequence->count); }
+SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) {
+#if SZ_DETECT_BIG_ENDIAN
+    sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less);
+#else
+    sz_sort_partial(sequence, sequence->count);
+#endif
+}
 
 #pragma endregion
 
@@ -3193,7 +3715,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_
 SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
     if (n_length == 1) return sz_find_byte_avx2(h, h_length, n);
 
     // Pick the parts of the needle that are worth comparing.
@@ -3228,7 +3750,7 @@ SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s
 SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
     if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n);
 
     // Pick the parts of the needle that are worth comparing.
@@ -3465,7 +3987,6 @@ SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) {
 }
 
 SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
-    sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k};
     sz_u512_vec_t a_vec, b_vec;
     __mmask64 a_mask, b_mask, mask_not_equal;
 
@@ -3478,7 +3999,7 @@ SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr
             int first_diff = _tzcnt_u64(mask_not_equal);
             char a_char = a[first_diff];
             char b_char = b[first_diff];
-            return ordering_lookup[a_char < b_char];
+            return _sz_order_scalars(a_char, b_char);
         }
         a += 64, b += 64, a_length -= 64, b_length -= 64;
     }
@@ -3497,12 +4018,12 @@ SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr
             int first_diff = _tzcnt_u64(mask_not_equal);
             char a_char = a[first_diff];
             char b_char = b[first_diff];
-            return ordering_lookup[a_char < b_char];
+            return _sz_order_scalars(a_char, b_char);
         }
         else
             // From logic perspective, the hardest cases are "abc\0" and "abc".
             // The result must be `sz_greater_k`, as the latter is shorter.
-            return a_length != b_length ? ordering_lookup[a_length < b_length] : sz_equal_k;
+            return _sz_order_scalars(a_length, b_length);
     }
     else
         return sz_equal_k;
@@ -3584,13 +4105,13 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr
         if (mask) return h + sz_u64_ctz(mask);
     }
 
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
     if (n_length == 1) return sz_find_byte_avx512(h, h_length, n);
 
     // Pick the parts of the needle that are worth comparing.
@@ -3640,7 +4161,7 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
             matches &= matches - 1;
         }
     }
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
@@ -3663,13 +4184,13 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cpt
         if (mask) return h + 64 - sz_u64_clz(mask) - 1;
     }
 
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
     if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n);
 
     // Pick the parts of the needle that are worth comparing.
@@ -3725,7 +4246,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n
         }
     }
 
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
@@ -3986,10 +4507,10 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win
         chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);
 
         // ... and prefetch the next four characters into Level 2 or higher.
-        _mm_prefetch(text_fourth + 1, _MM_HINT_T1);
-        _mm_prefetch(text_third + 1, _MM_HINT_T1);
-        _mm_prefetch(text_second + 1, _MM_HINT_T1);
-        _mm_prefetch(text_first + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1);
 
         // 3. Add the incoming characters.
         hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm);
@@ -4074,7 +4595,7 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_
         else { text += load_length, length -= load_length; }
     }
 
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) {
@@ -4129,7 +4650,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz
         else { length -= load_length; }
     }
 
-    return SZ_NULL;
+    return SZ_NULL_CHAR;
 }
 
 /**
@@ -4157,8 +4678,8 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( //
     // Let's make sure that we use the amount proportional to the
     // number of elements in the shorter string, not the larger.
     if (shorter_length > longer_length) {
-        sz_u64_swap((sz_u64_t *)&longer_length, (sz_u64_t *)&shorter_length);
-        sz_u64_swap((sz_u64_t *)&longer, (sz_u64_t *)&shorter);
+        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
+        sz_pointer_swap((void **)&longer, (void **)&shorter);
     }
 
     // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
@@ -4339,7 +4860,7 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( //
         }
 
         // Swap previous_distances and current_distances pointers
-        sz_u64_swap((sz_u64_t *)&previous_distances, (sz_u64_t *)&current_distances);
+        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
     }
 
     // Cache scalar before `free` call.
@@ -4452,35 +4973,77 @@ SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_
 SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
     if (n_length == 1) return sz_find_byte_neon(h, h_length, n);
 
-    // Pick the parts of the needle that are worth comparing.
-    sz_size_t offset_first, offset_mid, offset_last;
-    _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
-
-    // Broadcast those characters into SIMD registers.
-    sz_u64_t matches;
-    sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
-    n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]);
-    n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]);
-    n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]);
-
     // Scan through the string.
-    for (; h_length >= n_length + 16; h += 16, h_length -= 16) {
-        h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first));
-        h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid));
-        h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last));
-        matches_vec.u8x16 = vandq_u8(                           //
-            vandq_u8(                                           //
-                vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
-                vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
-            vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
-        matches = vreinterpretq_u8_u4(matches_vec.u8x16);
-        while (matches) {
-            int potential_offset = sz_u64_ctz(matches) / 4;
-            if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset;
-            matches &= matches - 1;
+    // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs.
+    // That's why, for smaller needles, we use different loops.
+    if (n_length == 2) {
+        // Broadcast needle characters into SIMD registers.
+        sz_u64_t matches;
+        sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec;
+        // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets
+        // in a single loop iteration.
+        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
+        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
+        for (; h_length >= 17; h += 16, h_length -= 16) {
+            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
+            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
+            matches_vec.u8x16 =
+                vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
+            matches = vreinterpretq_u8_u4(matches_vec.u8x16);
+            if (matches) return h + sz_u64_ctz(matches) / 4;
+        }
+    }
+    else if (n_length == 3) {
+        // Broadcast needle characters into SIMD registers.
+        sz_u64_t matches;
+        sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
+        // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach
+        // as when searching for string over 4 characters long. I only avoid the last comparison.
+        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
+        n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
+        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]);
+        for (; h_length >= 18; h += 16, h_length -= 16) {
+            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
+            h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
+            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2));
+            matches_vec.u8x16 = vandq_u8(                           //
+                vandq_u8(                                           //
+                    vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
+                    vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
+                vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
+            matches = vreinterpretq_u8_u4(matches_vec.u8x16);
+            if (matches) return h + sz_u64_ctz(matches) / 4;
+        }
+    }
+    else {
+        // Pick the parts of the needle that are worth comparing.
+        sz_size_t offset_first, offset_mid, offset_last;
+        _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
+        // Broadcast those characters into SIMD registers.
+        sz_u64_t matches;
+        sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
+        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]);
+        n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]);
+        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]);
+        // Walk through the string.
+        for (; h_length >= n_length + 16; h += 16, h_length -= 16) {
+            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first));
+            h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid));
+            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last));
+            matches_vec.u8x16 = vandq_u8(                           //
+                vandq_u8(                                           //
+                    vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
+                    vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
+                vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
+            matches = vreinterpretq_u8_u4(matches_vec.u8x16);
+            while (matches) {
+                int potential_offset = sz_u64_ctz(matches) / 4;
+                if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset;
+                matches &= matches - 1;
+            }
         }
     }
 
@@ -4490,7 +5053,7 @@ SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s
 SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
 
     // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL;
+    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
     if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n);
 
     // Pick the parts of the needle that are worth comparing.
@@ -4573,6 +5136,7 @@ SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_ser
 SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); }
 SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); }
 SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); }
+SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); }
 
 SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint,
                                      sz_size_t fingerprint_bytes) {
@@ -4580,6 +5144,8 @@ SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_
     sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0);
     sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes};
 
+    // There are several issues related to the fingerprinting algorithm.
+    // First, the memory traversal order is important.
     // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/
 
     // In most cases the fingerprint length will be a power of two.
@@ -4705,6 +5271,20 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_chars
 #endif
 }
 
+SZ_DYNAMIC sz_size_t sz_hamming_distance( //
+    sz_cptr_t a, sz_size_t a_length,      //
+    sz_cptr_t b, sz_size_t b_length,      //
+    sz_size_t bound) {
+    return sz_hamming_distance_serial(a, a_length, b, b_length, bound);
+}
+
+SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( //
+    sz_cptr_t a, sz_size_t a_length,           //
+    sz_cptr_t b, sz_size_t b_length,           //
+    sz_size_t bound) {
+    return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound);
+}
+
 SZ_DYNAMIC sz_size_t sz_edit_distance( //
     sz_cptr_t a, sz_size_t a_length,   //
     sz_cptr_t b, sz_size_t b_length,   //
@@ -4716,6 +5296,13 @@ SZ_DYNAMIC sz_size_t sz_edit_distance( //
 #endif
 }
 
+SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( //
+    sz_cptr_t a, sz_size_t a_length,        //
+    sz_cptr_t b, sz_size_t b_length,        //
+    sz_size_t bound, sz_memory_allocator_t *alloc) {
+    return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc);
+}
+
 SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
                                          sz_error_cost_t const *subs, sz_error_cost_t gap,
                                          sz_memory_allocator_t *alloc) {
@@ -4767,6 +5354,11 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_
     return sz_rfind_charset(h, h_length, &set);
 }
 
+SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
+                            sz_random_generator_t generator, void *generator_user_data) {
+    sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data);
+}
+
 #endif
 #pragma endregion
 
diff --git a/contrib/stringzilla/include/stringzilla/stringzilla.hpp b/contrib/stringzilla/include/stringzilla/stringzilla.hpp
index 47c0e8ec3..e64ebc449 100644
--- a/contrib/stringzilla/include/stringzilla/stringzilla.hpp
+++ b/contrib/stringzilla/include/stringzilla/stringzilla.hpp
@@ -45,8 +45,10 @@
 #endif
 
 #if !SZ_AVOID_STL
+#include <array>
 #include <bitset>
 #include <string>
+#include <vector>
 #if SZ_DETECT_CPP_17 && __cpp_lib_string_view
 #include <string_view>
 #endif
@@ -54,6 +56,7 @@
 
 #include <cassert>   // `assert`
 #include <cstddef>   // `std::size_t`
+#include <cstdint>   // `std::int8_t`
 #include <iosfwd>    // `std::basic_ostream`
 #include <stdexcept> // `std::out_of_range`
 #include <utility>   // `std::swap`
@@ -455,8 +458,9 @@ class range_matches {
             return temp;
         }
 
-        bool operator!=(iterator const &other) const noexcept { return remaining_.begin() != other.remaining_.begin(); }
-        bool operator==(iterator const &other) const noexcept { return remaining_.begin() == other.remaining_.begin(); }
+        // Assumes both iterators point to the same underlying string.
+        bool operator!=(iterator const &other) const noexcept { return remaining_.data() != other.remaining_.data(); }
+        bool operator==(iterator const &other) const noexcept { return remaining_.data() == other.remaining_.data(); }
         bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); }
         bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); }
     };
@@ -547,8 +551,14 @@ class range_rmatches {
             return temp;
         }
 
-        bool operator!=(iterator const &other) const noexcept { return remaining_.end() != other.remaining_.end(); }
-        bool operator==(iterator const &other) const noexcept { return remaining_.end() == other.remaining_.end(); }
+        // Assumes both iterators point to the same underlying string.
+        // This has to be `.data() + .size()`, to be compatible with `std::string_view` on MSVC.
+        bool operator!=(iterator const &other) const noexcept {
+            return remaining_.data() + remaining_.size() != other.remaining_.data() + other.remaining_.size();
+        }
+        bool operator==(iterator const &other) const noexcept {
+            return remaining_.data() + remaining_.size() == other.remaining_.data() + other.remaining_.size();
+        }
         bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); }
         bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); }
     };
@@ -1037,8 +1047,6 @@ class reversed_iterator_for {
 
 /**
  *  @brief  An "expression template" for lazy concatenation of strings using the `operator|`.
- *
- *  TODO: Ensure eqnership passing and move semantics are preserved.
  */
 template <typename first_type, typename second_type>
 struct concatenation {
@@ -1121,9 +1129,11 @@ class basic_string_slice {
     using partition_type = string_partition_result<string_slice>;
 
     /** @brief  Special value for missing matches.
-     *          We take the largest 63-bit unsigned integer.
+     *
+     *  We take the largest 63-bit unsigned integer on 64-bit machines.
+     *  We take the largest 31-bit unsigned integer on 32-bit machines.
      */
-    static constexpr size_type npos = 0x7FFFFFFFFFFFFFFFull;
+    static constexpr size_type npos = SZ_SSIZE_MAX;
 
 #pragma region Constructors and STL Utilities
 
@@ -1858,7 +1868,7 @@ class basic_string_slice {
  *
  *  Functions defined for `basic_string`, but not present in `basic_string_slice`:
  *      * `replace`, `insert`, `erase`, `append`, `push_back`, `pop_back`, `resize`, `shrink_to_fit`... from STL,
- *      * `try_` exception-free "try" operations that returning non-zero values on succces,
+ *      * `try_` exception-free "try" operations that returning non-zero values on success,
  *      * `replace_all` and `erase_all` similar to Boost,
  *      * `edit_distance` - Levenshtein distance computation reusing the allocator,
  *      * `randomize`, `random` - for fast random string generation.
@@ -1872,7 +1882,7 @@ class basic_string_slice {
  *  Default constructor is `constexpr`. Move constructor and move assignment operator are `noexcept`.
  *  Copy constructor and copy assignment operator are not! They may throw `std::bad_alloc` if the memory
  *  allocation fails. Similar to STL `std::out_of_range` if the position argument to some of the functions
- *  is out of bounds. Same as with STL, the bound checks are often assymetric, so pay attention to docs.
+ *  is out of bounds. Same as with STL, the bound checks are often asymmetric, so pay attention to docs.
  *  If exceptions are disabled, on failure, `std::terminate` is called.
  */
 template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
@@ -1956,18 +1966,20 @@ class basic_string {
     using partition_type = string_partition_result<string_view>;
 
     /** @brief  Special value for missing matches.
-     *          We take the largest 63-bit unsigned integer.
+     *
+     *  We take the largest 63-bit unsigned integer on 64-bit machines.
+     *  We take the largest 31-bit unsigned integer on 32-bit machines.
      */
-    static constexpr size_type npos = 0x7FFFFFFFFFFFFFFFull;
+    static constexpr size_type npos = SZ_SSIZE_MAX;
 
 #pragma region Constructors and STL Utilities
 
     sz_constexpr_if_cpp20 basic_string() noexcept {
         // ! Instead of relying on the `sz_string_init`, we have to reimplement it to support `constexpr`.
         string_.internal.start = &string_.internal.chars[0];
-        string_.u64s[1] = 0;
-        string_.u64s[2] = 0;
-        string_.u64s[3] = 0;
+        string_.words[1] = 0;
+        string_.words[2] = 0;
+        string_.words[3] = 0;
     }
 
     ~basic_string() noexcept {
@@ -2787,8 +2799,8 @@ class basic_string {
      *  @throw  `std::length_error` if the string is too long.
      *  @throw  `std::bad_alloc` if the allocation fails.
      */
-    iterator insert(const_iterator it, std::initializer_list<char_type> ilist) noexcept(false) {
-        return insert(it, ilist.begin(), ilist.end());
+    iterator insert(const_iterator it, std::initializer_list<char_type> list) noexcept(false) {
+        return insert(it, list.begin(), list.end());
     }
 
     /**
@@ -2947,8 +2959,8 @@ class basic_string {
      *  @see    `try_replace` for a cleaner exception-less alternative.
      */
     basic_string &replace(const_iterator first, const_iterator last,
-                          std::initializer_list<char_type> ilist) noexcept(false) {
-        return replace(first, last, ilist.begin(), ilist.end());
+                          std::initializer_list<char_type> list) noexcept(false) {
+        return replace(first, last, list.begin(), list.end());
     }
 
     /**
@@ -3027,8 +3039,8 @@ class basic_string {
      *  @throw  `std::bad_alloc` if the allocation fails.
      *  @see    `try_assign` for a cleaner exception-less alternative.
      */
-    basic_string &assign(std::initializer_list<char_type> ilist) noexcept(false) {
-        return assign(ilist.begin(), ilist.end());
+    basic_string &assign(std::initializer_list<char_type> list) noexcept(false) {
+        return assign(list.begin(), list.end());
     }
 
     /**
@@ -3155,7 +3167,8 @@ class basic_string {
      *  @param  alphabet   A string of characters to choose from.
      */
     basic_string &randomize(string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept {
-        return randomize(&std::rand, alphabet);
+        auto generator = []() { return static_cast<sz_u64_t>(std::rand()); };
+        return randomize(generator, alphabet);
     }
 
     /**
@@ -3353,7 +3366,7 @@ bool basic_string<char_type_, allocator_>::try_replace_all_(pattern_type pattern
 
     // 2. The pattern is longer than the replacement. We need to compact the strings.
     else if (matcher.needle_length() > replacement.length()) {
-        // Dealing with shorter replacements, we will avoid memory allocations, but we can also mimnimize the number
+        // Dealing with shorter replacements, we will avoid memory allocations, but we can also minimize the number
         // of `memmove`-s, by keeping one more iterator, pointing to the end of the last compacted area.
         // Having the split-ranges, however, we reuse their logic.
         using splits_type = range_splits<string_view, matcher_type>;
@@ -3513,15 +3526,55 @@ typename concatenation_result<first_type, second_type, following_types...>::type
 }
 
 /**
- *  @brief  Calculates the Levenshtein edit distance between two strings.
+ *  @brief  Calculates the Hamming edit distance in @b bytes between two strings.
+ *  @see    sz_edit_distance
+ */
+template <typename char_type_>
+std::size_t hamming_distance(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
+                             std::size_t bound = 0) noexcept {
+    return sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound);
+}
+
+/**
+ *  @brief  Calculates the Hamming edit distance in @b bytes between two strings.
+ *  @see    sz_edit_distance
+ */
+template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
+std::size_t hamming_distance(basic_string<char_type_, allocator_type_> const &a,
+                             basic_string<char_type_, allocator_type_> const &b, std::size_t bound = 0) noexcept {
+    return ashvardanian::stringzilla::hamming_distance(a.view(), b.view(), bound);
+}
+
+/**
+ *  @brief  Calculates the Hamming edit distance in @b unicode codepoints between two strings.
+ *  @see    sz_hamming_distance_utf8
+ */
+template <typename char_type_>
+std::size_t hamming_distance_utf8(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
+                                  std::size_t bound = 0) noexcept {
+    return sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound);
+}
+
+/**
+ *  @brief  Calculates the Hamming edit distance in @b unicode codepoints between two strings.
+ *  @see    sz_edit_distance
+ */
+template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
+std::size_t hamming_distance_utf8(basic_string<char_type_, allocator_type_> const &a,
+                                  basic_string<char_type_, allocator_type_> const &b, std::size_t bound = 0) noexcept {
+    return ashvardanian::stringzilla::hamming_distance_utf8(a.view(), b.view(), bound);
+}
+
+/**
+ *  @brief  Calculates the Levenshtein edit distance in @b bytes between two strings.
  *  @see    sz_edit_distance
  */
 template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
 std::size_t edit_distance(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
-                          allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
+                          std::size_t bound = 0, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
     std::size_t result;
     if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
-            result = sz_edit_distance(a.data(), a.size(), b.data(), b.size(), SZ_SIZE_MAX, &alloc);
+            result = sz_edit_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc);
             return result != SZ_SIZE_MAX;
         }))
         throw std::bad_alloc();
@@ -3529,13 +3582,41 @@ std::size_t edit_distance(basic_string_slice<char_type_> const &a, basic_string_
 }
 
 /**
- *  @brief  Calculates the Levenshtein edit distance between two strings.
+ *  @brief  Calculates the Levenshtein edit distance in @b bytes between two strings.
  *  @see    sz_edit_distance
  */
 template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
 std::size_t edit_distance(basic_string<char_type_, allocator_type_> const &a,
-                          basic_string<char_type_, allocator_type_> const &b) noexcept(false) {
-    return ashvardanian::stringzilla::edit_distance(a.view(), b.view(), a.get_allocator());
+                          basic_string<char_type_, allocator_type_> const &b, std::size_t bound = 0) noexcept(false) {
+    return ashvardanian::stringzilla::edit_distance(a.view(), b.view(), bound, a.get_allocator());
+}
+
+/**
+ *  @brief  Calculates the Levenshtein edit distance in @b unicode codepoints between two strings.
+ *  @see    sz_edit_distance_utf8
+ */
+template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
+std::size_t edit_distance_utf8(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
+                               std::size_t bound = 0,
+                               allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
+    std::size_t result;
+    if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
+            result = sz_edit_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc);
+            return result != SZ_SIZE_MAX;
+        }))
+        throw std::bad_alloc();
+    return result;
+}
+
+/**
+ *  @brief  Calculates the Levenshtein edit distance in @b unicode codepoints between two strings.
+ *  @see    sz_edit_distance_utf8
+ */
+template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
+std::size_t edit_distance_utf8(basic_string<char_type_, allocator_type_> const &a,
+                               basic_string<char_type_, allocator_type_> const &b,
+                               std::size_t bound = 0) noexcept(false) {
+    return ashvardanian::stringzilla::edit_distance_utf8(a.view(), b.view(), bound, a.get_allocator());
 }
 
 /**
@@ -3598,6 +3679,8 @@ void randomize(basic_string_slice<char_type_> string, string_view alphabet = "ab
     randomize(string, &std::rand, alphabet);
 }
 
+using sorted_idx_t = sz_sorted_idx_t;
+
 /**
  *  @brief  Internal data-structure used to forward the arguments to the `sz_sort` function.
  *  @see    sorted_order
@@ -3606,7 +3689,7 @@ template <typename objects_type_, typename string_extractor_>
 struct _sequence_args {
     objects_type_ const *begin;
     std::size_t count;
-    std::size_t *order;
+    sorted_idx_t *order;
     string_extractor_ extractor;
 };
 
@@ -3639,17 +3722,17 @@ sz_size_t _call_sequence_member_length(struct sz_sequence_t const *sequence, sz_
  *  @see    sz_sort
  */
 template <typename objects_type_, typename string_extractor_>
-void sorted_order(objects_type_ const *begin, objects_type_ const *end, std::size_t *order,
+void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_idx_t *order,
                   string_extractor_ &&extractor) noexcept {
 
     // Pack the arguments into a single structure to reference it from the callback.
     _sequence_args<objects_type_, string_extractor_> args = {begin, static_cast<std::size_t>(end - begin), order,
                                                              std::forward<string_extractor_>(extractor)};
     // Populate the array with `iota`-style order.
-    for (std::size_t i = 0; i != args.count; ++i) order[i] = i;
+    for (std::size_t i = 0; i != args.count; ++i) order[i] = static_cast<sorted_idx_t>(i);
 
     sz_sequence_t array;
-    array.order = reinterpret_cast<sz_u64_t *>(order);
+    array.order = reinterpret_cast<sorted_idx_t *>(order);
     array.count = args.count;
     array.handle = &args;
     array.get_start = _call_sequence_member_start<objects_type_, string_extractor_>;
@@ -3697,9 +3780,9 @@ std::bitset<bitset_bits_> hashes_fingerprint(basic_string<char_type_> const &str
  *  @throw  `std::bad_alloc` if the allocation fails.
  */
 template <typename objects_type_, typename string_extractor_>
-std::vector<std::size_t> sorted_order(objects_type_ const *begin, objects_type_ const *end,
-                                      string_extractor_ &&extractor) noexcept(false) {
-    std::vector<std::size_t> order(end - begin);
+std::vector<sorted_idx_t> sorted_order(objects_type_ const *begin, objects_type_ const *end,
+                                       string_extractor_ &&extractor) noexcept(false) {
+    std::vector<sorted_idx_t> order(end - begin);
     sorted_order(begin, end, order.data(), std::forward<string_extractor_>(extractor));
     return order;
 }
@@ -3710,7 +3793,7 @@ std::vector<std::size_t> sorted_order(objects_type_ const *begin, objects_type_
  *  @throw  `std::bad_alloc` if the allocation fails.
  */
 template <typename string_like_type_>
-std::vector<std::size_t> sorted_order(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) {
+std::vector<sorted_idx_t> sorted_order(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) {
     static_assert(std::is_convertible<string_like_type_, string_view>::value,
                   "The type must be convertible to string_view.");
     return sorted_order(begin, end, [](string_like_type_ const &s) -> string_view { return s; });
@@ -3722,7 +3805,7 @@ std::vector<std::size_t> sorted_order(string_like_type_ const *begin, string_lik
  *  @throw  `std::bad_alloc` if the allocation fails.
  */
 template <typename string_like_type_>
-std::vector<std::size_t> sorted_order(std::vector<string_like_type_> const &array) noexcept(false) {
+std::vector<sorted_idx_t> sorted_order(std::vector<string_like_type_> const &array) noexcept(false) {
     static_assert(std::is_convertible<string_like_type_, string_view>::value,
                   "The type must be convertible to string_view.");
     return sorted_order(array.data(), array.data() + array.size(),
diff --git a/contrib/stringzilla/lib.c b/contrib/stringzilla/lib.c
index 61cfed97c..bcfe7834e 100644
--- a/contrib/stringzilla/lib.c
+++ b/contrib/stringzilla/lib.c
@@ -9,11 +9,6 @@
 #include <windows.h> // `DllMain`
 #endif
 
-// If we don't have the LibC, the `malloc` definition in `stringzilla.h` will be illformed.
-#if SZ_AVOID_LIBC
-typedef __SIZE_TYPE__ size_t;
-#endif
-
 // Overwrite `SZ_DYNAMIC_DISPATCH` before including StringZilla.
 #ifdef SZ_DYNAMIC_DISPATCH
 #undef SZ_DYNAMIC_DISPATCH
@@ -22,6 +17,13 @@ typedef __SIZE_TYPE__ size_t;
 #include <stringzilla/stringzilla.h>
 
 #if SZ_AVOID_LIBC
+// If we don't have the LibC, the `malloc` definition in `stringzilla.h` will be illformed.
+#ifdef _MSC_VER
+typedef sz_size_t size_t; // Reuse the type definition we've inferred from `stringzilla.h`
+#else
+typedef __SIZE_TYPE__ size_t; // For GCC/Clang
+#endif
+int rand(void) { return 0; }
 void free(void *start) { sz_unused(start); }
 void *malloc(size_t length) {
     sz_unused(length);
@@ -116,7 +118,6 @@ typedef struct sz_implementations_t {
     sz_find_set_t find_from_set;
     sz_find_set_t rfind_from_set;
 
-    // TODO: Upcoming vectorization
     sz_edit_distance_t edit_distance;
     sz_alignment_score_t alignment_score;
     sz_hashes_t hashes;
@@ -255,6 +256,20 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_chars
     return sz_dispatch_table.rfind_from_set(text, length, set);
 }
 
+SZ_DYNAMIC sz_size_t sz_hamming_distance( //
+    sz_cptr_t a, sz_size_t a_length,      //
+    sz_cptr_t b, sz_size_t b_length,      //
+    sz_size_t bound) {
+    return sz_hamming_distance_serial(a, a_length, b, b_length, bound);
+}
+
+SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( //
+    sz_cptr_t a, sz_size_t a_length,           //
+    sz_cptr_t b, sz_size_t b_length,           //
+    sz_size_t bound) {
+    return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound);
+}
+
 SZ_DYNAMIC sz_size_t sz_edit_distance( //
     sz_cptr_t a, sz_size_t a_length,   //
     sz_cptr_t b, sz_size_t b_length,   //
@@ -262,6 +277,13 @@ SZ_DYNAMIC sz_size_t sz_edit_distance( //
     return sz_dispatch_table.edit_distance(a, a_length, b, b_length, bound, alloc);
 }
 
+SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( //
+    sz_cptr_t a, sz_size_t a_length,        //
+    sz_cptr_t b, sz_size_t b_length,        //
+    sz_size_t bound, sz_memory_allocator_t *alloc) {
+    return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc);
+}
+
 SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
                                          sz_error_cost_t const *subs, sz_error_cost_t gap,
                                          sz_memory_allocator_t *alloc) {
@@ -302,3 +324,14 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_
     sz_charset_invert(&set);
     return sz_rfind_charset(h, h_length, &set);
 }
+
+sz_u64_t _sz_random_generator(void *empty_state) {
+    sz_unused(empty_state);
+    return (sz_u64_t)rand();
+}
+
+SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
+                            sz_random_generator_t generator, void *generator_user_data) {
+    if (!generator) generator = _sz_random_generator;
+    sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data);
+}
-- 
2.39.5