@@ -1242,6 +1242,7 @@ ADD_SUBDIRECTORY(contrib/librdns) | |||
ADD_SUBDIRECTORY(contrib/aho-corasick) | |||
ADD_SUBDIRECTORY(contrib/lpeg) | |||
ADD_SUBDIRECTORY(contrib/linenoise) | |||
ADD_SUBDIRECTORY(contrib/t1ha) | |||
IF (ENABLE_SNOWBALL MATCHES "ON") | |||
LIST(APPEND RSPAMD_REQUIRED_LIBRARIES stemmer) | |||
@@ -1252,6 +1253,7 @@ ENDIF() | |||
IF(ENABLE_HYPERSCAN MATCHES "OFF") | |||
LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-actrie) | |||
ENDIF() | |||
LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-t1ha) | |||
IF(ENABLE_CLANG_PLUGIN MATCHES "ON") | |||
ADD_SUBDIRECTORY(clang-plugin) |
@@ -0,0 +1,14 @@ | |||
SET(T1HASRC t1ha0.c | |||
t1ha0_ia32aes_noavx.c | |||
t1ha1.c | |||
t1ha2.c) | |||
ADD_LIBRARY(rspamd-t1ha STATIC ${T1HASRC}) | |||
SET_TARGET_PROPERTIES(rspamd-t1ha PROPERTIES VERSION ${RSPAMD_VERSION}) | |||
ADD_DEFINITIONS("-DT1HA_USE_FAST_ONESHOT_READ=1") | |||
IF(ENABLE_FULL_DEBUG MATCHES "OFF")
IF("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
SET_TARGET_PROPERTIES(rspamd-t1ha PROPERTIES COMPILE_FLAGS "-O3")
ENDIF()
ENDIF()
@@ -0,0 +1,21 @@ | |||
Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, | |||
Fast Positive Hash. | |||
Portions Copyright (c) 2010-2013 Leonid Yuriev <leo@yuriev.ru>, | |||
The 1Hippeus project (t1h). | |||
This software is provided 'as-is', without any express or implied | |||
warranty. In no event will the authors be held liable for any damages | |||
arising from the use of this software. | |||
Permission is granted to anyone to use this software for any purpose, | |||
including commercial applications, and to alter it and redistribute it | |||
freely, subject to the following restrictions: | |||
1. The origin of this software must not be misrepresented; you must not | |||
claim that you wrote the original software. If you use this software | |||
in a product, an acknowledgement in the product documentation would be | |||
appreciated but is not required. | |||
2. Altered source versions must be plainly marked as such, and must not be | |||
misrepresented as being the original software. | |||
3. This notice may not be removed or altered from any source distribution. |
@@ -1,8 +1,8 @@ | |||
/* | |||
 * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com,
* Fast Positive Hash. | |||
* | |||
 * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>,
* The 1Hippeus project (t1h). | |||
* | |||
* This software is provided 'as-is', without any express or implied | |||
@@ -23,534 +23,400 @@ | |||
*/ | |||
/* | |||
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
* by [Positive Technologies](https://www.ptsecurity.ru) | |||
* | |||
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *     but portable; it runs without penalty on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash,
 *     metro-hash and all other portable hash functions (which do not use
 *     specific hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will be Positive. Everything will be fine.
* | |||
* ACKNOWLEDGEMENT: | |||
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
* for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! | |||
*/ | |||
#pragma once

#include "config.h"
#include <string.h>
#include <stddef.h>

#ifndef __has_attribute
#define __has_attribute(x) (0)
#endif
#ifndef __has_builtin
#define __has_builtin(x) (0)
#endif
#ifndef __has_include
#define __has_include(x) (0)
#endif
#ifndef __GNUC_PREREQ
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
#define __GNUC_PREREQ(maj, min)                                                \
  ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __GNUC_PREREQ(maj, min) 0
#endif
#endif /* __GNUC_PREREQ */

#ifndef __CLANG_PREREQ
#ifdef __clang__
#define __CLANG_PREREQ(maj, min)                                               \
  ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
#else
#define __CLANG_PREREQ(maj, min) (0)
#endif
#endif /* __CLANG_PREREQ */
/*****************************************************************************/ | |||
#ifdef _MSC_VER | |||
/* Avoid '16' bytes padding added after data member 't1ha_context::total' | |||
* and other warnings from std-headers if warning-level > 3. */ | |||
#pragma warning(push, 3) | |||
#endif | |||
#if defined(__cplusplus) && __cplusplus >= 201103L | |||
#include <climits> | |||
#include <cstddef> | |||
#include <cstdint> | |||
#else | |||
#include <limits.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#endif | |||
/*****************************************************************************/ | |||
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ | |||
defined(i486) || defined(__i486) || defined(__i486__) || \ | |||
    defined(i586) || defined(__i586) || defined(__i586__) || defined(i686) || \
defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ | |||
defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ | |||
defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ | |||
defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ | |||
defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) | |||
#ifndef __ia32__ | |||
/* LY: define neutral __ia32__ for x86 and x86-64 archs */ | |||
#define __ia32__ 1 | |||
#endif /* __ia32__ */ | |||
#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ | |||
defined(__amd64) || defined(_M_X64)) | |||
/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ | |||
#define __amd64__ 1 | |||
#endif /* __amd64__ */ | |||
#endif /* all x86 */ | |||
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ | |||
    !defined(__ORDER_BIG_ENDIAN__)
/* *INDENT-OFF* */ | |||
/* clang-format off */ | |||
#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) || \ | |||
defined(HAVE_ENDIAN_H) || __has_include(<endian.h>) | |||
#include <endian.h> | |||
#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ | |||
defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>) | |||
#include <machine/endian.h> | |||
#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>) | |||
#include <sys/isa_defs.h> | |||
#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \ | |||
(__has_include(<sys/types.h>) && __has_include(<sys/endian.h>)) | |||
#include <sys/endian.h> | |||
#include <sys/types.h> | |||
#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ | |||
defined(__NETBSD__) || defined(__NetBSD__) || \ | |||
defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>) | |||
#include <sys/param.h> | |||
#endif /* OS */ | |||
/* *INDENT-ON* */ | |||
/* clang-format on */ | |||
#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) | |||
#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN | |||
#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN | |||
#define __BYTE_ORDER__ __BYTE_ORDER | |||
#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) | |||
#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN | |||
#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN | |||
#define __BYTE_ORDER__ _BYTE_ORDER | |||
#else | |||
#define __ORDER_LITTLE_ENDIAN__ 1234 | |||
#define __ORDER_BIG_ENDIAN__ 4321 | |||
#if defined(__LITTLE_ENDIAN__) || \ | |||
(defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ | |||
defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ | |||
defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ | |||
defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ | |||
defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ | |||
defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ | |||
defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ | |||
defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ | |||
defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ | |||
defined(__WINDOWS__) | |||
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ | |||
#elif defined(__BIG_ENDIAN__) || \ | |||
(defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ | |||
defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ | |||
defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ | |||
defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ | |||
defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ | |||
defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ | |||
defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) | |||
#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ | |||
#else | |||
#error __BYTE_ORDER__ should be defined. | |||
#endif /* Arch */ | |||
#endif | |||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ | |||
__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ | |||
#error Unsupported byte order. | |||
#endif | |||
#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ | |||
/*****************************************************************************/ | |||
#if !defined(UNALIGNED_OK)
#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) ||                \
    defined(_M_X64) || defined(i386) || defined(_X86_) ||                      \
    defined(__i386__) || defined(_X86_64_)
#define UNALIGNED_OK 1
#else
#define UNALIGNED_OK 0
#endif
#endif /* UNALIGNED_OK */

#ifndef __dll_export
#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__)
#if defined(__GNUC__) || __has_attribute(dllexport)
#define __dll_export __attribute__((dllexport))
#elif defined(_MSC_VER)
#define __dll_export __declspec(dllexport)
#else
#define __dll_export
#endif
#elif defined(__GNUC__) || __has_attribute(visibility)
#define __dll_export __attribute__((visibility("default")))
#else
#define __dll_export
#endif
#endif /* __dll_export */
#ifndef __dll_import
#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__)
#if defined(__GNUC__) || __has_attribute(dllimport)
#define __dll_import __attribute__((dllimport))
#elif defined(_MSC_VER)
#define __dll_import __declspec(dllimport)
#else
#define __dll_import
#endif
#else
#define __dll_import
#endif
#endif /* __dll_import */
#if defined(t1ha_EXPORTS) | |||
#define T1HA_API __dll_export | |||
#elif defined(t1ha_IMPORTS) | |||
#define T1HA_API __dll_import | |||
#else | |||
#define T1HA_API | |||
#endif /* T1HA_API */ | |||
#if __GNUC_PREREQ(4, 4) || defined(__clang__)
#if defined(__i386__) || defined(__x86_64__)
#include <x86intrin.h>
#endif
#define likely(cond) __builtin_expect(!!(cond), 1)
#define unlikely(cond) __builtin_expect(!!(cond), 0)
#if __GNUC_PREREQ(4, 6) || defined(__clang__)
#define unreachable() __builtin_unreachable()
#else
#define unreachable()                                                          \
  do {                                                                         \
    for (;;)                                                                   \
      ;                                                                        \
  } while (0)
#endif
#define bswap64(v) __builtin_bswap64(v)
#define bswap32(v) __builtin_bswap32(v)
#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
#define bswap16(v) __builtin_bswap16(v)
#endif
#if __GNUC_PREREQ(4, 3) || __has_attribute(unused)
#define maybe_unused __attribute__((unused))
#endif
#elif defined(_MSC_VER)
#include <intrin.h>
#include <stdlib.h>
#define likely(cond) (cond)
#define unlikely(cond) (cond)
#define unreachable() __assume(0)
#define bswap64(v) _byteswap_uint64(v)
#define bswap32(v) _byteswap_ulong(v)
#define bswap16(v) _byteswap_ushort(v)
#define rot64(v, s) _rotr64(v, s)
#define rot32(v, s) _rotr(v, s)
#if defined(_M_ARM64) || defined(_M_X64)
#pragma intrinsic(_umul128)
#define mul_64x64_128(a, b, ph) _umul128(a, b, ph)
#pragma intrinsic(__umulh)
#define mul_64x64_high(a, b) __umulh(a, b)
#endif
#if defined(_M_IX86)
#pragma intrinsic(__emulu)
#define mul_32x32_64(a, b) __emulu(a, b)
#elif defined(_M_ARM)
#define mul_32x32_64(a, b) _arm_umull(a, b)
#endif
#else /* Compiler */
#define likely(cond) (cond)
#define unlikely(cond) (cond)
#define unreachable()                                                          \
  do                                                                           \
    for (;;)                                                                   \
      ;                                                                        \
  while (0)
#endif /* Compiler */

#if defined(_MSC_VER) && defined(__ia32__)
#define T1HA_ALIGN_PREFIX __declspec(align(32)) /* required only for SIMD */
#else
#define T1HA_ALIGN_PREFIX
#endif /* _MSC_VER */

#if defined(__GNUC__) && defined(__ia32__)
#define T1HA_ALIGN_SUFFIX                                                      \
  __attribute__((aligned(32))) /* required only for SIMD */
#else
#define T1HA_ALIGN_SUFFIX
#endif /* GCC x86 */

#ifdef __cplusplus
extern "C" {
#endif
typedef union T1HA_ALIGN_PREFIX t1ha_state256 { | |||
uint8_t bytes[32]; | |||
uint32_t u32[8]; | |||
uint64_t u64[4]; | |||
struct { | |||
uint64_t a, b, c, d; | |||
} n; | |||
} t1ha_state256_t T1HA_ALIGN_SUFFIX; | |||
typedef struct t1ha_context { | |||
t1ha_state256_t state; | |||
t1ha_state256_t buffer; | |||
size_t partial; | |||
uint64_t total; | |||
} t1ha_context_t; | |||
#ifdef _MSC_VER | |||
#pragma warning(pop) | |||
#endif | |||
#ifndef bswap64 | |||
static __inline uint64_t bswap64(uint64_t v) { | |||
return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) | | |||
((v << 24) & 0x0000ff0000000000ull) | | |||
((v << 8) & 0x000000ff00000000ull) | | |||
((v >> 8) & 0x00000000ff000000ull) | | |||
((v >> 24) & 0x0000000000ff0000ull) | | |||
((v >> 40) & 0x000000000000ff00ull); | |||
} | |||
#endif /* bswap64 */ | |||
/****************************************************************************** | |||
* | |||
* t1ha2 = 64 and 128-bit, SLIGHTLY MORE ATTENTION FOR QUALITY AND STRENGTH. | |||
* | |||
 *  - The recommended version of "Fast Positive Hash" with good quality
 *    for checksums, hash tables and fingerprinting.
 *  - Portable and extremely efficient on modern 64-bit CPUs.
 *    Designed for 64-bit little-endian platforms;
 *    elsewhere it will run slowly.
 *  - Great hashing quality, and still faster than other non-t1ha hashes.
 *    Provides a streaming mode and a 128-bit result.
 *
 * Note: for performance reasons the 64- and 128-bit results are completely
 * different from each other, i.e. the 64-bit result is NOT any part of
 * the 128-bit one.
*/ | |||
#ifndef bswap32 | |||
static __inline uint32_t bswap32(uint32_t v) { | |||
return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00); | |||
} | |||
#endif /* bswap32 */ | |||
/* The at-once variant with 64-bit result */ | |||
T1HA_API uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed); | |||
/* The at-once variant with 128-bit result.
 * Argument `extra_result` is NOT optional and MUST be valid.
 * The high 64-bit part of the 128-bit hash is always unconditionally
 * stored to the address given by the `extra_result` argument. */
T1HA_API uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, | |||
const void *__restrict data, size_t length, | |||
uint64_t seed); | |||
/* The init/update/final trinity for streaming.
 * Returns a 64- or 128-bit result depending on the `extra_result` argument. */
T1HA_API void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y); | |||
T1HA_API void t1ha2_update(t1ha_context_t *__restrict ctx, | |||
const void *__restrict data, size_t length); | |||
/* Argument `extra_result` is optional and MAY be NULL.
 *  - If `extra_result` is NOT NULL, the 128-bit hash is calculated,
 *    and the high 64-bit part of it is stored to the address given
 *    by the `extra_result` argument.
 *  - Otherwise the 64-bit hash is calculated
 *    and returned from the function directly.
 *
 * Note: for performance reasons the 64- and 128-bit results are completely
 * different from each other, i.e. the 64-bit result is NOT any part of
 * the 128-bit one. */
T1HA_API uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, | |||
uint64_t *__restrict extra_result /* optional */); | |||
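/* Usage sketch (illustrative only, not part of the upstream API):
 * hashing a buffer in arbitrary chunks with the init/update/final trinity.
 * hash_chunks() and its parameters are hypothetical names. */
#if 0
static uint64_t hash_chunks(const uint8_t *p, size_t n, size_t chunk) {
  t1ha_context_t ctx;
  t1ha2_init(&ctx, /* seed_x */ 42, /* seed_y */ 0);
  while (n != 0) {
    size_t step = (n < chunk) ? n : chunk;
    t1ha2_update(&ctx, p, step);
    p += step;
    n -= step;
  }
  /* NULL extra_result => plain 64-bit hash */
  return t1ha2_final(&ctx, NULL);
}
#endif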
#ifndef bswap16 | |||
static __inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } | |||
#endif /* bswap16 */ | |||
/****************************************************************************** | |||
* | |||
* t1ha1 = 64-bit, BASELINE FAST PORTABLE HASH: | |||
* | |||
 *  - Runs fast on 64-bit platforms; in other cases it may run slowly.
 *  - Portable and stable: returns the same 64-bit result
 *    on all architectures and CPUs.
 *  - Unfortunately it fails the "strict avalanche criteria";
 *    see test results at https://github.com/demerphq/smhasher.
 *
 * This flaw is insignificant for t1ha1()'s purposes and imperceptible
 * from a practical point of view.
 * Nevertheless, the issue is resolved in the newer t1ha2(),
 * which was planned from the start to provide somewhat better quality.
*/ | |||
#ifndef rot64 | |||
static __inline uint64_t rot64(uint64_t v, unsigned s) { | |||
return (v >> s) | (v << (64 - s)); | |||
} | |||
#endif /* rot64 */ | |||
/* The little-endian variant. */ | |||
T1HA_API uint64_t t1ha1_le(const void *data, size_t length, uint64_t seed); | |||
#ifndef rot32 | |||
static __inline uint32_t rot32(uint32_t v, unsigned s) { | |||
return (v >> s) | (v << (32 - s)); | |||
} | |||
#endif /* rot32 */ | |||
/* The big-endian variant. */ | |||
T1HA_API uint64_t t1ha1_be(const void *data, size_t length, uint64_t seed); | |||
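/* Usage sketch (illustrative only): one-shot hashing with t1ha1.
 * Each variant yields a stable value for a given input on any CPU;
 * example_key_hash() is a hypothetical helper. */
#if 0
static uint64_t example_key_hash(void) {
  static const char example_key[] = "example-key";
  return t1ha1_le(example_key, sizeof(example_key) - 1, /* seed */ 0);
}
#endif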
#ifndef mul_32x32_64
static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) {
  return a * (uint64_t)b;
}
#endif /* mul_32x32_64 */

/* The historical nickname for the generic little-endian variant. */
static __inline uint64_t t1ha(const void *data, size_t length, uint64_t seed) {
  return t1ha1_le(data, length, seed);
}
/***************************************************************************/ | |||
static __inline uint64_t fetch64(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
return *(const uint64_t *)v; | |||
#else | |||
return bswap64(*(const uint64_t *)v); | |||
#endif | |||
} | |||
/****************************************************************************** | |||
* | |||
 * t1ha0 = 64-bit, JUST ONLY FASTER:
 *
 *  - Provides fast-as-possible hashing for the current CPU, including
 *    32-bit systems, engaging the available hardware acceleration.
 *  - It is a facade that selects the most quick-and-dirty hash
 *    for the current processor.  For instance, on IA32 (x86) the actual
 *    function is selected at runtime, depending on the current CPU's
 *    capabilities.
 *
 * BE CAREFUL!!! THIS MEANS:
 *
 *   1. The quality of the hash is traded off against performance.
 *      So the quality and strength of t1ha0() may be lower than t1ha1(),
 *      especially on 32-bit targets, but it is then much faster.
 *      However, it is guaranteed to pass all SMHasher tests.
 *
 *   2. There is no warranty that the hash result will be the same for a
 *      particular key on another machine or another version of libt1ha.
 *
 *      Briefly: such hash results and their derivatives should be used
 *      only at runtime, and should not be persisted or transferred
 *      over a network.
 */
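/* Usage sketch (illustrative only): since t1ha0() may dispatch to different
 * implementations on different machines, restrict it to in-process data,
 * e.g. hash-table bucketing; bucket_of() is a hypothetical helper. */
#if 0
static size_t bucket_of(const void *key, size_t keylen,
                        uint64_t per_process_seed, size_t nbuckets) {
  /* per_process_seed should come from a per-process random source; the
   * result is only meaningful within this process's lifetime. */
  return (size_t)(t1ha0(key, keylen, per_process_seed) % nbuckets);
}
#endif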
static __inline uint64_t fetch32(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return *(const uint32_t *)v;
#else
  return bswap32(*(const uint32_t *)v);
#endif
}

/* The little-endian variant for 32-bit CPU. */
uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed);
/* The big-endian variant for 32-bit CPU. */
uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed);

/* Define T1HA0_AESNI_AVAILABLE to 0 to disable AES-NI support. */
#ifndef T1HA0_AESNI_AVAILABLE
#if (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800))
#if defined(__GNUC__) &&                                                       \
    ((defined(__clang__) &&                                                    \
      (__clang_major__ >= 4 || (__clang_major__ >= 3 && __clang_minor__ >= 8))) || \
     ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) || (__GNUC__ > 4)))
#define T1HA0_AESNI_AVAILABLE 1
#else
#define T1HA0_AESNI_AVAILABLE 0
#endif
#else
#define T1HA0_AESNI_AVAILABLE 0
#endif
#endif /* T1HA0_AESNI_AVAILABLE */
static __inline uint64_t fetch16(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return *(const uint16_t *)v;
#else
  return bswap16(*(const uint16_t *)v);
#endif
}

/* Define T1HA0_RUNTIME_SELECT to 0 to disable dispatching t1ha0 at runtime. */
#ifndef T1HA0_RUNTIME_SELECT
#if T1HA0_AESNI_AVAILABLE && !defined(__e2k__)
#define T1HA0_RUNTIME_SELECT 1
#else
#define T1HA0_RUNTIME_SELECT 0
#endif
#endif /* T1HA0_RUNTIME_SELECT */
static __inline uint64_t fetch_tail(const void *v, size_t tail) { | |||
const uint8_t *_ = (const uint8_t *)v; | |||
switch (tail & 7) { | |||
case 1: | |||
return _[0]; | |||
case 2: | |||
return fetch16(_); | |||
case 3: | |||
return fetch16(_) | (_[2] << 16); | |||
case 4: | |||
return fetch32(_); | |||
case 5: | |||
return fetch32(_) | ((uint64_t)_[4] << 32); | |||
case 6: | |||
return fetch32(_) | (fetch16(_ + 4) << 32); | |||
case 7: | |||
return fetch32(_) | (fetch16(_ + 4) << 32) | ((uint64_t)_[6] << 48); | |||
case 0: | |||
return fetch64(_); | |||
default: | |||
unreachable(); | |||
} | |||
} | |||
/* xor-mul-xor mixer */ | |||
static __inline uint64_t mix(uint64_t v, uint64_t p) { | |||
static const unsigned s0 = 41; | |||
v *= p; | |||
return v ^ rot64(v, s0); | |||
} | |||
static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) { | |||
*sum += addend; | |||
return *sum < addend; | |||
} | |||
#if T1HA0_AESNI_AVAILABLE | |||
uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); | |||
#endif /* T1HA0_AESNI_AVAILABLE */ | |||
/* xor high and low parts of the full 128-bit product */
static __inline uint64_t mux64(uint64_t v, uint64_t p) {
#ifdef __SIZEOF_INT128__
  __uint128_t r = (__uint128_t)v * (__uint128_t)p;
  /* modern GCC could nicely optimize this */
  return r ^ (r >> 64);
#elif defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128
  __uint128 r = (__uint128)v * (__uint128)p;
  return r ^ (r >> 64);
#elif defined(mul_64x64_128)
  uint64_t l, h;
  l = mul_64x64_128(v, p, &h);
  return l ^ h;
#elif defined(mul_64x64_high)
  uint64_t l, h;
  l = v * p;
  h = mul_64x64_high(v, p);
  return l ^ h;
#else
  /* performs 64x64-to-128-bit multiplication */
  uint64_t ll = mul_32x32_64((uint32_t)v, (uint32_t)p);
  uint64_t lh = mul_32x32_64(v >> 32, (uint32_t)p);
  uint64_t hl = mul_32x32_64(p >> 32, (uint32_t)v);
  uint64_t hh =
      mul_32x32_64(v >> 32, p >> 32) + (lh >> 32) + (hl >> 32) +
      /* A few simplifications are possible here for 32-bit architectures,
       * but they would break compatibility with the original 64-bit
       * version: a 32-bit t1ha would then be both (relatively) slow and
       * incompatible, which is clearly a bad trade. */
      add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32);
  return hh ^ ll;
#endif
}
static uint64_t | |||
t1ha(const void *data, size_t len, uint64_t seed) | |||
{ | |||
/* 'magic' primes */ | |||
static const uint64_t p0 = 17048867929148541611ull; | |||
static const uint64_t p1 = 9386433910765580089ull; | |||
static const uint64_t p2 = 15343884574428479051ull; | |||
static const uint64_t p3 = 13662985319504319857ull; | |||
static const uint64_t p4 = 11242949449147999147ull; | |||
static const uint64_t p5 = 13862205317416547141ull; | |||
static const uint64_t p6 = 14653293970879851569ull; | |||
/* rotations */ | |||
static const unsigned s0 = 41; | |||
static const unsigned s1 = 17; | |||
static const unsigned s2 = 31; | |||
uint64_t a = seed; | |||
uint64_t b = len; | |||
const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
uint64_t align[4]; | |||
if (unlikely(len > 32)) { | |||
uint64_t c = rot64(len, s1) + seed; | |||
uint64_t d = len ^ rot64(seed, s1); | |||
const void *detent = (const uint8_t *)data + len - 31; | |||
do { | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_align)) | |||
v = (const uint64_t *)memcpy(&align, v, 32); | |||
uint64_t w0 = fetch64(v + 0); | |||
uint64_t w1 = fetch64(v + 1); | |||
uint64_t w2 = fetch64(v + 2); | |||
uint64_t w3 = fetch64(v + 3); | |||
uint64_t d02 = w0 ^ rot64(w2 + d, s1); | |||
uint64_t c13 = w1 ^ rot64(w3 + c, s1); | |||
c += a ^ rot64(w0, s0); | |||
d -= b ^ rot64(w1, s2); | |||
a ^= p1 * (d02 + w3); | |||
b ^= p0 * (c13 + w2); | |||
data = (const uint64_t *)data + 4; | |||
} while (likely(data < detent)); | |||
a ^= p6 * (rot64(c, s1) + d); | |||
b ^= p5 * (c + rot64(d, s1)); | |||
len &= 31; | |||
} | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_align) && len > 1) | |||
v = (const uint64_t *)memcpy(&align, v, len); | |||
switch (len) { | |||
default: | |||
b += mux64(fetch64(v++), p4); | |||
case 24: | |||
case 23: | |||
case 22: | |||
case 21: | |||
case 20: | |||
case 19: | |||
case 18: | |||
case 17: | |||
a += mux64(fetch64(v++), p3); | |||
case 16: | |||
case 15: | |||
case 14: | |||
case 13: | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
b += mux64(fetch64(v++), p2); | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
a += mux64(fetch_tail(v, len), p1); | |||
case 0: | |||
return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0); | |||
} | |||
#if T1HA0_RUNTIME_SELECT
#ifdef __ELF__
/* ifunc/gnu_indirect_function will be used on ELF.
 * Please see https://en.wikipedia.org/wiki/Executable_and_Linkable_Format */
T1HA_API uint64_t t1ha0(const void *data, size_t length, uint64_t seed);
#else
/* Otherwise a function pointer will be used.
 * Unfortunately this may cause some calling overhead. */
T1HA_API extern uint64_t (*t1ha0_funcptr)(const void *data, size_t length,
                                          uint64_t seed);
static __inline uint64_t t1ha0(const void *data, size_t length,
                               uint64_t seed) {
  return t1ha0_funcptr(data, length, seed);
}
#endif /* __ELF__ */
#else /* T1HA0_RUNTIME_SELECT */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
static __inline uint64_t t1ha0(const void *data, size_t length,
                               uint64_t seed) {
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
  return t1ha1_be(data, length, seed);
#else
  return t1ha0_32be(data, length, seed);
#endif
}
#else
static __inline uint64_t t1ha0(const void *data, size_t length,
                               uint64_t seed) {
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
  return t1ha1_le(data, length, seed);
#else
  return t1ha0_32le(data, length, seed);
#endif
}
#endif /* __BYTE_ORDER__ */
#endif /* !T1HA0_RUNTIME_SELECT */

static __inline uint32_t tail32_le(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
  uint32_t r = 0;
  switch (tail & 3) {
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* For most CPUs this code is preferable when neither copying for
   * alignment nor byte reordering is needed. */
  case 0:
    return fetch32(p);
  case 3:
    r = (uint32_t)p[2] << 16;
  /* fall through */
  case 2:
    return r + fetch16(p);
  case 1:
    return p[0];
#else
  /* For most CPUs this code is preferable to copying for alignment
   * and/or byte reordering. */
  case 0:
    r += p[3];
    r <<= 8;
  /* fall through */
  case 3:
    r += p[2];
    r <<= 8;
  /* fall through */
  case 2:
    r += p[1];
    r <<= 8;
  /* fall through */
  case 1:
    return r + p[0];
#endif
  }
  unreachable();
}

static __inline uint32_t tail32_be(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
  switch (tail & 3) {
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* For most CPUs this code is preferable when neither copying for
   * alignment nor byte reordering is needed. */
  case 1:
    return p[0];
  case 2:
    return fetch16_be(p);
  case 3:
    return fetch16_be(p) << 8 | p[2];
  case 0:
    return fetch32_be(p);
#else
  /* For most CPUs this code is preferable to copying for alignment
   * and/or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return p[1] | (uint32_t)p[0] << 8;
  case 3:
    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
  case 0:
    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
           (uint32_t)p[0] << 24;
#endif
  }
  unreachable();
}
static __inline uint64_t remix32(uint32_t a, uint32_t b) { | |||
static const uint64_t p0 = 17048867929148541611ull; | |||
a ^= rot32(b, 13); | |||
uint64_t l = a | (uint64_t)b << 32; | |||
l *= p0; | |||
l ^= l >> 41; | |||
return l; | |||
}
static __inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, uint32_t p) { | |||
uint64_t l = mul_32x32_64(*b + v, p); | |||
*a ^= (uint32_t)l; | |||
*b += (uint32_t)(l >> 32); | |||
} | |||
static uint64_t t1ha32(const void *data, size_t len, uint64_t seed) { | |||
/* 32-bit 'magic' primes */ | |||
static const uint32_t q0 = 0x92D78269; | |||
static const uint32_t q1 = 0xCA9B4735; | |||
static const uint32_t q2 = 0xA4ABA1C3; | |||
static const uint32_t q3 = 0xF6499843; | |||
static const uint32_t q4 = 0x86F0FD61; | |||
static const uint32_t q5 = 0xCA2DA6FB; | |||
static const uint32_t q6 = 0xC4BB3575; | |||
/* rotations */ | |||
static const unsigned s1 = 17; | |||
uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed; | |||
uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); | |||
const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; | |||
uint32_t align[4]; | |||
if (unlikely(len > 16)) { | |||
uint32_t c = ~a; | |||
uint32_t d = rot32(b, 5); | |||
const void *detent = (const uint8_t *)data + len - 15; | |||
do { | |||
const uint32_t *v = (const uint32_t *)data; | |||
if (unlikely(need_align)) | |||
v = (const uint32_t *)memcpy(&align, v, 16); | |||
uint32_t w0 = fetch32(v + 0); | |||
uint32_t w1 = fetch32(v + 1); | |||
uint32_t w2 = fetch32(v + 2); | |||
uint32_t w3 = fetch32(v + 3); | |||
uint32_t c02 = w0 ^ rot32(w2 + c, 11); | |||
uint32_t d13 = w1 + rot32(w3 + d, s1); | |||
c ^= rot32(b + w1, 7); | |||
d ^= rot32(a + w0, 3); | |||
b = q1 * (c02 + w3); | |||
a = q0 * (d13 ^ w2); | |||
data = (const uint32_t *)data + 4; | |||
} while (likely(data < detent)); | |||
c += a; | |||
d += b; | |||
a ^= q6 * (rot32(c, 16) + d); | |||
b ^= q5 * (c + rot32(d, 16)); | |||
len &= 15; | |||
} | |||
const uint8_t *v = (const uint8_t *)data; | |||
if (unlikely(need_align) && len > 4) | |||
v = (const uint8_t *)memcpy(&align, v, len); | |||
switch (len) { | |||
default: | |||
mixup32(&a, &b, fetch32(v), q4); | |||
v += 4; | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
mixup32(&b, &a, fetch32(v), q3); | |||
v += 4; | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
mixup32(&a, &b, fetch32(v), q2); | |||
v += 4; | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
mixup32(&b, &a, tail32_le(v, len), q1); | |||
case 0: | |||
return remix32(a, b); | |||
} | |||
} | |||
#ifdef __cplusplus
}
#endif
@@ -0,0 +1,411 @@ | |||
/* | |||
* Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, | |||
* Fast Positive Hash. | |||
* | |||
* Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, | |||
* The 1Hippeus project (t1h). | |||
* | |||
* This software is provided 'as-is', without any express or implied | |||
* warranty. In no event will the authors be held liable for any damages | |||
* arising from the use of this software. | |||
* | |||
* Permission is granted to anyone to use this software for any purpose, | |||
* including commercial applications, and to alter it and redistribute it | |||
* freely, subject to the following restrictions: | |||
* | |||
* 1. The origin of this software must not be misrepresented; you must not | |||
* claim that you wrote the original software. If you use this software | |||
* in a product, an acknowledgement in the product documentation would be | |||
* appreciated but is not required. | |||
* 2. Altered source versions must be plainly marked as such, and must not be | |||
* misrepresented as being the original software. | |||
* 3. This notice may not be removed or altered from any source distribution. | |||
*/ | |||
/* | |||
* t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } | |||
* by [Positive Technologies](https://www.ptsecurity.ru) | |||
* | |||
* Briefly, it is a 64-bit Hash Function: | |||
 *  1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *     but portable; it runs without penalty on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash,
 *     metro-hash and all other portable hash functions (which do not use
 *     specific hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will be Positive. Everything will be fine.
* | |||
* ACKNOWLEDGEMENT: | |||
* The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) | |||
* for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! | |||
*/ | |||
#include "config.h" | |||
#include "t1ha_bits.h" | |||
static __always_inline uint32_t tail32_le(const void *v, size_t tail) { | |||
const uint8_t *p = (const uint8_t *)v; | |||
#ifdef can_read_underside | |||
  /* On some systems (e.g. x86) we can perform a 'oneshot' read, which is
   * a little bit faster: the read may start below the given pointer when
   * the can_read_underside() guard confirms the wider access cannot cross
   * into an unmapped page.  Thanks Marcin Żukowski
   * <marcin.zukowski@gmail.com> for the reminder. */
const unsigned offset = (4 - tail) & 3; | |||
const unsigned shift = offset << 3; | |||
if (likely(can_read_underside(p, 4))) { | |||
p -= offset; | |||
return fetch32_le(p) >> shift; | |||
} | |||
return fetch32_le(p) & ((~UINT32_C(0)) >> shift); | |||
#endif /* 'oneshot' read */ | |||
uint32_t r = 0; | |||
switch (tail & 3) { | |||
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
  /* For most CPUs this code is preferable when neither copying for
   * alignment nor byte reordering is needed. */
case 0: | |||
return fetch32_le(p); | |||
case 3: | |||
r = (uint32_t)p[2] << 16; | |||
/* fall through */ | |||
case 2: | |||
return r + fetch16_le(p); | |||
case 1: | |||
return p[0]; | |||
#else | |||
  /* For most CPUs this code is preferable to copying for alignment
   * and/or byte reordering. */
case 0: | |||
r += p[3]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 3: | |||
r += p[2]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 2: | |||
r += p[1]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 1: | |||
return r + p[0]; | |||
#endif | |||
} | |||
unreachable(); | |||
} | |||
static __always_inline uint32_t tail32_be(const void *v, size_t tail) { | |||
const uint8_t *p = (const uint8_t *)v; | |||
#ifdef can_read_underside | |||
  /* On some systems we can perform a 'oneshot' read, which is a little
   * bit faster.  Thanks Marcin Żukowski <marcin.zukowski@gmail.com> for
   * the reminder. */
const unsigned offset = (4 - tail) & 3; | |||
const unsigned shift = offset << 3; | |||
if (likely(can_read_underside(p, 4))) { | |||
p -= offset; | |||
return fetch32_be(p) & ((~UINT32_C(0)) >> shift); | |||
} | |||
return fetch32_be(p) >> shift; | |||
#endif /* 'oneshot' read */ | |||
switch (tail & 3) { | |||
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
  /* For most CPUs this code is preferable when neither copying for
   * alignment nor byte reordering is needed. */
case 1: | |||
return p[0]; | |||
case 2: | |||
return fetch16_be(p); | |||
case 3: | |||
return fetch16_be(p) << 8 | p[2]; | |||
case 0: | |||
return fetch32_be(p); | |||
#else | |||
  /* For most CPUs this code is preferable to copying for alignment
   * and/or byte reordering. */
case 1: | |||
return p[0]; | |||
case 2: | |||
return p[1] | (uint32_t)p[0] << 8; | |||
case 3: | |||
return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; | |||
case 0: | |||
return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | | |||
(uint32_t)p[0] << 24; | |||
#endif | |||
} | |||
unreachable(); | |||
} | |||
/***************************************************************************/ | |||
#ifndef rot32 | |||
static __maybe_unused __always_inline uint32_t rot32(uint32_t v, unsigned s) { | |||
return (v >> s) | (v << (32 - s)); | |||
} | |||
#endif /* rot32 */ | |||
static __always_inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, | |||
uint32_t prime) { | |||
uint64_t l = mul_32x32_64(*b + v, prime); | |||
*a ^= (uint32_t)l; | |||
*b += (uint32_t)(l >> 32); | |||
} | |||
static __always_inline uint64_t final32(uint32_t a, uint32_t b) { | |||
uint64_t l = (b ^ rot32(a, 13)) | (uint64_t)a << 32; | |||
l *= prime_0; | |||
l ^= l >> 41; | |||
l *= prime_4; | |||
l ^= l >> 47; | |||
l *= prime_6; | |||
return l; | |||
} | |||
/* 32-bit 'magic' primes */ | |||
static const uint32_t prime32_0 = UINT32_C(0x92D78269); | |||
static const uint32_t prime32_1 = UINT32_C(0xCA9B4735); | |||
static const uint32_t prime32_2 = UINT32_C(0xA4ABA1C3); | |||
static const uint32_t prime32_3 = UINT32_C(0xF6499843); | |||
static const uint32_t prime32_4 = UINT32_C(0x86F0FD61); | |||
static const uint32_t prime32_5 = UINT32_C(0xCA2DA6FB); | |||
static const uint32_t prime32_6 = UINT32_C(0xC4BB3575); | |||
uint64_t t1ha0_32le(const void *data, size_t len, uint64_t seed) { | |||
uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed; | |||
uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); | |||
const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; | |||
uint32_t align[4]; | |||
if (unlikely(len > 16)) { | |||
uint32_t c = ~a; | |||
uint32_t d = rot32(b, 5); | |||
const void *detent = (const uint8_t *)data + len - 15; | |||
do { | |||
const uint32_t *v = (const uint32_t *)data; | |||
if (unlikely(need_align)) | |||
v = (const uint32_t *)memcpy(&align, unaligned(v), 16); | |||
uint32_t w0 = fetch32_le(v + 0); | |||
uint32_t w1 = fetch32_le(v + 1); | |||
uint32_t w2 = fetch32_le(v + 2); | |||
uint32_t w3 = fetch32_le(v + 3); | |||
uint32_t c02 = w0 ^ rot32(w2 + c, 11); | |||
uint32_t d13 = w1 + rot32(w3 + d, 17); | |||
c ^= rot32(b + w1, 7); | |||
d ^= rot32(a + w0, 3); | |||
b = prime32_1 * (c02 + w3); | |||
a = prime32_0 * (d13 ^ w2); | |||
data = (const uint32_t *)data + 4; | |||
} while (likely(data < detent)); | |||
c += a; | |||
d += b; | |||
a ^= prime32_6 * (rot32(c, 16) + d); | |||
b ^= prime32_5 * (c + rot32(d, 16)); | |||
len &= 15; | |||
} | |||
const uint8_t *v = (const uint8_t *)data; | |||
if (unlikely(need_align) && len > 4) | |||
v = (const uint8_t *)memcpy(&align, unaligned(v), len); | |||
switch (len) { | |||
default: | |||
mixup32(&a, &b, fetch32_le(v), prime32_4); | |||
v += 4; | |||
/* fall through */ | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
mixup32(&b, &a, fetch32_le(v), prime32_3); | |||
v += 4; | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
mixup32(&a, &b, fetch32_le(v), prime32_2); | |||
v += 4; | |||
/* fall through */ | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
mixup32(&b, &a, tail32_le(v, len), prime32_1); | |||
/* fall through */ | |||
case 0: | |||
return final32(a, b); | |||
} | |||
} | |||
uint64_t t1ha0_32be(const void *data, size_t len, uint64_t seed) { | |||
uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed; | |||
uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); | |||
const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; | |||
uint32_t align[4]; | |||
if (unlikely(len > 16)) { | |||
uint32_t c = ~a; | |||
uint32_t d = rot32(b, 5); | |||
const void *detent = (const uint8_t *)data + len - 15; | |||
do { | |||
const uint32_t *v = (const uint32_t *)data; | |||
if (unlikely(need_align)) | |||
v = (const uint32_t *)memcpy(&align, unaligned(v), 16); | |||
uint32_t w0 = fetch32_be(v + 0); | |||
uint32_t w1 = fetch32_be(v + 1); | |||
uint32_t w2 = fetch32_be(v + 2); | |||
uint32_t w3 = fetch32_be(v + 3); | |||
uint32_t c02 = w0 ^ rot32(w2 + c, 11); | |||
uint32_t d13 = w1 + rot32(w3 + d, 17); | |||
c ^= rot32(b + w1, 7); | |||
d ^= rot32(a + w0, 3); | |||
b = prime32_1 * (c02 + w3); | |||
a = prime32_0 * (d13 ^ w2); | |||
data = (const uint32_t *)data + 4; | |||
} while (likely(data < detent)); | |||
c += a; | |||
d += b; | |||
a ^= prime32_6 * (rot32(c, 16) + d); | |||
b ^= prime32_5 * (c + rot32(d, 16)); | |||
len &= 15; | |||
} | |||
const uint8_t *v = (const uint8_t *)data; | |||
if (unlikely(need_align) && len > 4) | |||
v = (const uint8_t *)memcpy(&align, unaligned(v), len); | |||
switch (len) { | |||
default: | |||
mixup32(&a, &b, fetch32_be(v), prime32_4); | |||
v += 4; | |||
/* fall through */ | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
mixup32(&b, &a, fetch32_be(v), prime32_3); | |||
v += 4; | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
mixup32(&a, &b, fetch32_be(v), prime32_2); | |||
v += 4; | |||
/* fall through */ | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
mixup32(&b, &a, tail32_be(v, len), prime32_1); | |||
/* fall through */ | |||
case 0: | |||
return final32(a, b); | |||
} | |||
} | |||
/***************************************************************************/ | |||
#if T1HA0_RUNTIME_SELECT | |||
#if T1HA0_AESNI_AVAILABLE && defined(__ia32__) | |||
static uint64_t x86_cpu_features(void) { | |||
uint32_t features = 0; | |||
uint32_t extended = 0; | |||
#ifdef __GNUC__ | |||
uint32_t eax, ebx, ecx, edx; | |||
const unsigned cpuid_max = __get_cpuid_max(0, NULL); | |||
if (cpuid_max >= 1) { | |||
__cpuid_count(1, 0, eax, ebx, features, edx); | |||
if (cpuid_max >= 7) | |||
__cpuid_count(7, 0, eax, extended, ecx, edx); | |||
} | |||
#elif defined(_MSC_VER) | |||
int info[4]; | |||
__cpuid(info, 0); | |||
const unsigned cpuid_max = info[0]; | |||
if (cpuid_max >= 1) { | |||
__cpuidex(info, 1, 0); | |||
features = info[2]; | |||
if (cpuid_max >= 7) { | |||
__cpuidex(info, 7, 0); | |||
extended = info[1]; | |||
} | |||
} | |||
#endif | |||
return features | (uint64_t)extended << 32; | |||
} | |||
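/* Note: in the 64-bit word assembled above, the low half is CPUID(1).ECX
 * and the high half is CPUID(7,0).EBX; the resolver below tests ECX bit 25
 * (mask 0x02000000), which reports AES-NI support. */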
#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */ | |||
static | |||
#if __GNUC_PREREQ(4, 0) || __has_attribute(used) | |||
__attribute__((used)) | |||
#endif | |||
uint64_t (*t1ha0_resolve(void))(const void *, size_t, uint64_t) { | |||
#if T1HA0_AESNI_AVAILABLE && defined(__ia32__) | |||
uint64_t features = x86_cpu_features(); | |||
if (features & UINT32_C(0x02000000) /* check for AES-NI */) { | |||
return t1ha0_ia32aes_noavx; | |||
} | |||
#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */ | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul | |||
return t1ha1_be; | |||
#else | |||
return t1ha0_32be; | |||
#endif | |||
#else /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ | |||
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul | |||
return t1ha1_le; | |||
#else | |||
return t1ha0_32le; | |||
#endif | |||
#endif /* __BYTE_ORDER__ */ | |||
} | |||
#ifdef __ELF__ | |||
#if __has_attribute(ifunc) | |||
uint64_t t1ha0(const void *data, size_t len, uint64_t seed) | |||
__attribute__((ifunc("t1ha0_resolve"))); | |||
#else | |||
__asm("\t.globl\tt1ha0\n\t.type\tt1ha0, " | |||
"%gnu_indirect_function\n\t.set\tt1ha0,t1ha0_resolve"); | |||
#endif /* ifunc */ | |||
#elif __GNUC_PREREQ(4, 0) || __has_attribute(constructor) | |||
uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t); | |||
static void __attribute__((constructor)) t1ha0_init(void) { | |||
t1ha0_funcptr = t1ha0_resolve(); | |||
} | |||
#else /* ELF */ | |||
static uint64_t t1ha0_proxy(const void *data, size_t len, uint64_t seed) { | |||
t1ha0_funcptr = t1ha0_resolve(); | |||
return t1ha0_funcptr(data, len, seed); | |||
} | |||
uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t) = t1ha0_proxy; | |||
#endif /* !ELF */ | |||
#endif /* T1HA0_RUNTIME_SELECT */ |
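/* Dispatch note (illustrative): with ifunc the dynamic linker binds t1ha0
 * to the resolved implementation at load time; on the fallback paths the
 * first call resolves once, and later calls pay only the cost of an
 * indirect call.  A hedged usage sketch, hash_via_dispatch() being a
 * hypothetical helper: */
#if 0
static uint64_t hash_via_dispatch(const void *buf, size_t buflen) {
  return t1ha0(buf, buflen, /* seed */ 0);
}
#endif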
@@ -0,0 +1,200 @@ | |||
/* | |||
* Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, | |||
* Fast Positive Hash. | |||
* | |||
* Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, | |||
* The 1Hippeus project (t1h). | |||
* | |||
* This software is provided 'as-is', without any express or implied | |||
* warranty. In no event will the authors be held liable for any damages | |||
* arising from the use of this software. | |||
* | |||
* Permission is granted to anyone to use this software for any purpose, | |||
* including commercial applications, and to alter it and redistribute it | |||
* freely, subject to the following restrictions: | |||
* | |||
* 1. The origin of this software must not be misrepresented; you must not | |||
* claim that you wrote the original software. If you use this software | |||
* in a product, an acknowledgement in the product documentation would be | |||
* appreciated but is not required. | |||
* 2. Altered source versions must be plainly marked as such, and must not be | |||
* misrepresented as being the original software. | |||
* 3. This notice may not be removed or altered from any source distribution. | |||
*/ | |||
/* | |||
* t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } | |||
* by [Positive Technologies](https://www.ptsecurity.ru) | |||
* | |||
* Briefly, it is a 64-bit Hash Function: | |||
 *  1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *     but portable; it runs without penalty on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash,
 *     metro-hash and all other portable hash functions (which do not use
 *     specific hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will be Positive. Everything will be fine.
* | |||
* ACKNOWLEDGEMENT: | |||
* The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) | |||
* for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! | |||
*/ | |||
#include "t1ha_bits.h" | |||
#if T1HA0_AESNI_AVAILABLE | |||
#pragma GCC push_options | |||
#pragma GCC target("aes") | |||
#ifndef __SSE2__ | |||
#define __SSE2__ | |||
#endif | |||
#ifndef __SSE__ | |||
#define __SSE__ | |||
#endif | |||
#ifndef __AES__ | |||
#define __AES__ | |||
#endif | |||
#include <immintrin.h> | |||
uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { | |||
uint64_t a = seed; | |||
uint64_t b = len; | |||
if (unlikely(len > 32)) { | |||
__m128i x = _mm_set_epi64x(a, b); | |||
__m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_5, prime_6)); | |||
const __m128i *__restrict v = (const __m128i *)data; | |||
const __m128i *__restrict const detent = | |||
(const __m128i *)((const uint8_t *)data + len - 127); | |||
while (v < detent) { | |||
__m128i v0 = _mm_loadu_si128(v + 0); | |||
__m128i v1 = _mm_loadu_si128(v + 1); | |||
__m128i v2 = _mm_loadu_si128(v + 2); | |||
__m128i v3 = _mm_loadu_si128(v + 3); | |||
__m128i v4 = _mm_loadu_si128(v + 4); | |||
__m128i v5 = _mm_loadu_si128(v + 5); | |||
__m128i v6 = _mm_loadu_si128(v + 6); | |||
__m128i v7 = _mm_loadu_si128(v + 7); | |||
__m128i v0y = _mm_aesenc_si128(v0, y); | |||
__m128i v2x6 = _mm_aesenc_si128(v2, _mm_xor_si128(x, v6)); | |||
__m128i v45_67 = | |||
_mm_xor_si128(_mm_aesenc_si128(v4, v5), _mm_add_epi64(v6, v7)); | |||
__m128i v0y7_1 = _mm_aesdec_si128(_mm_sub_epi64(v7, v0y), v1); | |||
__m128i v2x6_3 = _mm_aesenc_si128(v2x6, v3); | |||
x = _mm_aesenc_si128(v45_67, _mm_add_epi64(x, y)); | |||
y = _mm_aesenc_si128(v2x6_3, _mm_xor_si128(v0y7_1, v5)); | |||
v += 8; | |||
} | |||
if (len & 64) { | |||
__m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); | |||
__m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); | |||
x = _mm_aesdec_si128(x, v0y); | |||
y = _mm_aesdec_si128(y, v1x); | |||
__m128i v2y = _mm_add_epi64(y, _mm_loadu_si128(v++)); | |||
__m128i v3x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); | |||
x = _mm_aesdec_si128(x, v2y); | |||
y = _mm_aesdec_si128(y, v3x); | |||
} | |||
if (len & 32) { | |||
__m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); | |||
__m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); | |||
x = _mm_aesdec_si128(x, v0y); | |||
y = _mm_aesdec_si128(y, v1x); | |||
} | |||
if (len & 16) { | |||
y = _mm_add_epi64(x, y); | |||
x = _mm_aesdec_si128(x, _mm_loadu_si128(v++)); | |||
} | |||
x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y); | |||
#if defined(__x86_64__) || defined(_M_X64) | |||
#if defined(__SSE4_1__) || defined(__AVX__) | |||
a = _mm_extract_epi64(x, 0); | |||
b = _mm_extract_epi64(x, 1); | |||
#else | |||
a = _mm_cvtsi128_si64(x); | |||
b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x)); | |||
#endif | |||
#else | |||
#if defined(__SSE4_1__) || defined(__AVX__) | |||
a = (uint32_t)_mm_extract_epi32(x, 0) | | |||
(uint64_t)_mm_extract_epi32(x, 1) << 32; | |||
b = (uint32_t)_mm_extract_epi32(x, 2) | | |||
(uint64_t)_mm_extract_epi32(x, 3) << 32; | |||
#else | |||
a = (uint32_t)_mm_cvtsi128_si32(x); | |||
a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; | |||
x = _mm_unpackhi_epi64(x, x); | |||
b = (uint32_t)_mm_cvtsi128_si32(x); | |||
b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; | |||
#endif | |||
#endif | |||
#ifdef __AVX__ | |||
_mm256_zeroall(); | |||
#elif !(defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64)) | |||
_mm_empty(); | |||
#endif | |||
data = v; | |||
len &= 15; | |||
} | |||
const uint64_t *v = (const uint64_t *)data; | |||
#ifdef __e2k__ | |||
const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
uint64_t align[4]; | |||
if (unlikely(need_align) && len > 8) | |||
v = (const uint64_t *)memcpy(&align, unaligned(v), len); | |||
#endif /* __e2k__ */ | |||
switch (len) { | |||
default: | |||
mixup64(&a, &b, *v++, prime_4); | |||
/* fall through */ | |||
case 24: | |||
case 23: | |||
case 22: | |||
case 21: | |||
case 20: | |||
case 19: | |||
case 18: | |||
case 17: | |||
mixup64(&b, &a, *v++, prime_3); | |||
/* fall through */ | |||
case 16: | |||
case 15: | |||
case 14: | |||
case 13: | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
mixup64(&a, &b, *v++, prime_2); | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
mixup64(&b, &a, tail64_le(v, len), prime_1); | |||
/* fall through */ | |||
case 0: | |||
return final64(a, b); | |||
} | |||
} | |||
#endif /* T1HA0_AESNI_AVAILABLE */ | |||
#undef T1HA_IA32AES_NAME |
@@ -0,0 +1,2 @@ | |||
#define T1HA_IA32AES_NAME t1ha0_ia32aes_noavx | |||
#include "t1ha0_ia32aes_a.h" |
@@ -0,0 +1,215 @@ | |||
/* | |||
* Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, | |||
* Fast Positive Hash. | |||
* | |||
* Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, | |||
* The 1Hippeus project (t1h). | |||
* | |||
* This software is provided 'as-is', without any express or implied | |||
* warranty. In no event will the authors be held liable for any damages | |||
* arising from the use of this software. | |||
* | |||
* Permission is granted to anyone to use this software for any purpose, | |||
* including commercial applications, and to alter it and redistribute it | |||
* freely, subject to the following restrictions: | |||
* | |||
* 1. The origin of this software must not be misrepresented; you must not | |||
* claim that you wrote the original software. If you use this software | |||
* in a product, an acknowledgement in the product documentation would be | |||
* appreciated but is not required. | |||
* 2. Altered source versions must be plainly marked as such, and must not be | |||
* misrepresented as being the original software. | |||
* 3. This notice may not be removed or altered from any source distribution. | |||
*/ | |||
/* | |||
* t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } | |||
* by [Positive Technologies](https://www.ptsecurity.ru) | |||
* | |||
* Briefly, it is a 64-bit Hash Function: | |||
 *  1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *     but portable; it runs without penalty on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash,
 *     metro-hash and all other portable hash functions (which do not use
 *     specific hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will be Positive. Everything will be fine.
* | |||
* ACKNOWLEDGEMENT: | |||
* The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) | |||
* for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! | |||
*/ | |||
#include "config.h" | |||
#include "t1ha_bits.h" | |||
/* xor-mul-xor mixer */ | |||
static __inline uint64_t mix64(uint64_t v, uint64_t p) { | |||
v *= p; | |||
return v ^ rot64(v, 41); | |||
} | |||
static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) { | |||
  /* LY: for performance reasons on some not-so-high-end CPUs
   * the second mux64() operation was replaced by mix64().
   * Unfortunately this approach fails the "strict avalanche criteria";
   * see test results at https://github.com/demerphq/smhasher. */
return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0); | |||
} | |||
uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { | |||
uint64_t a = seed; | |||
uint64_t b = len; | |||
const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
uint64_t align[4]; | |||
if (unlikely(len > 32)) { | |||
uint64_t c = rot64(len, 17) + seed; | |||
uint64_t d = len ^ rot64(seed, 17); | |||
const void *detent = (const uint8_t *)data + len - 31; | |||
do { | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_align)) | |||
v = (const uint64_t *)memcpy(&align, unaligned(v), 32); | |||
uint64_t w0 = fetch64_le(v + 0); | |||
uint64_t w1 = fetch64_le(v + 1); | |||
uint64_t w2 = fetch64_le(v + 2); | |||
uint64_t w3 = fetch64_le(v + 3); | |||
uint64_t d02 = w0 ^ rot64(w2 + d, 17); | |||
uint64_t c13 = w1 ^ rot64(w3 + c, 17); | |||
c += a ^ rot64(w0, 41); | |||
d -= b ^ rot64(w1, 31); | |||
a ^= prime_1 * (d02 + w3); | |||
b ^= prime_0 * (c13 + w2); | |||
data = (const uint64_t *)data + 4; | |||
} while (likely(data < detent)); | |||
a ^= prime_6 * (rot64(c, 17) + d); | |||
b ^= prime_5 * (c + rot64(d, 17)); | |||
len &= 31; | |||
} | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_align) && len > 8) | |||
v = (const uint64_t *)memcpy(&align, unaligned(v), len); | |||
switch (len) { | |||
default: | |||
b += mux64(fetch64_le(v++), prime_4); | |||
/* fall through */ | |||
case 24: | |||
case 23: | |||
case 22: | |||
case 21: | |||
case 20: | |||
case 19: | |||
case 18: | |||
case 17: | |||
a += mux64(fetch64_le(v++), prime_3); | |||
/* fall through */ | |||
case 16: | |||
case 15: | |||
case 14: | |||
case 13: | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
b += mux64(fetch64_le(v++), prime_2); | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
a += mux64(tail64_le(v, len), prime_1); | |||
/* fall through */ | |||
case 0: | |||
return final_weak_avalanche(a, b); | |||
} | |||
} | |||
uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { | |||
uint64_t a = seed; | |||
uint64_t b = len; | |||
const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
uint64_t align[4]; | |||
if (unlikely(len > 32)) { | |||
uint64_t c = rot64(len, 17) + seed; | |||
uint64_t d = len ^ rot64(seed, 17); | |||
const void *detent = (const uint8_t *)data + len - 31; | |||
do { | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_align)) | |||
v = (const uint64_t *)memcpy(&align, unaligned(v), 32); | |||
uint64_t w0 = fetch64_be(v + 0); | |||
uint64_t w1 = fetch64_be(v + 1); | |||
uint64_t w2 = fetch64_be(v + 2); | |||
uint64_t w3 = fetch64_be(v + 3); | |||
uint64_t d02 = w0 ^ rot64(w2 + d, 17); | |||
uint64_t c13 = w1 ^ rot64(w3 + c, 17); | |||
c += a ^ rot64(w0, 41); | |||
d -= b ^ rot64(w1, 31); | |||
a ^= prime_1 * (d02 + w3); | |||
b ^= prime_0 * (c13 + w2); | |||
data = (const uint64_t *)data + 4; | |||
} while (likely(data < detent)); | |||
a ^= prime_6 * (rot64(c, 17) + d); | |||
b ^= prime_5 * (c + rot64(d, 17)); | |||
len &= 31; | |||
} | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_align) && len > 8) | |||
v = (const uint64_t *)memcpy(&align, unaligned(v), len); | |||
switch (len) { | |||
default: | |||
b += mux64(fetch64_be(v++), prime_4); | |||
/* fall through */ | |||
case 24: | |||
case 23: | |||
case 22: | |||
case 21: | |||
case 20: | |||
case 19: | |||
case 18: | |||
case 17: | |||
a += mux64(fetch64_be(v++), prime_3); | |||
/* fall through */ | |||
case 16: | |||
case 15: | |||
case 14: | |||
case 13: | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
b += mux64(fetch64_be(v++), prime_2); | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
a += mux64(tail64_be(v, len), prime_1); | |||
/* fall through */ | |||
case 0: | |||
return final_weak_avalanche(a, b); | |||
} | |||
} |
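/* Editor's illustration (not part of the upstream diff): a minimal oneshot
 * usage sketch for the two t1ha1 variants defined above. It assumes only the
 * prototypes as they appear in this file; the seed value is arbitrary. Kept
 * under "#if 0" so it stays purely illustrative. */
#if 0
#include <inttypes.h>
#include <stdio.h>

int main(void) {
  static const char msg[] = "The quick brown fox jumps over the lazy dog";
  const uint64_t seed = UINT64_C(42);
  /* Little-endian flavour: the native, fast path on x86_64. */
  uint64_t h_le = t1ha1_le(msg, sizeof(msg) - 1, seed);
  /* Big-endian flavour: yields a different digest for the same input. */
  uint64_t h_be = t1ha1_be(msg, sizeof(msg) - 1, seed);
  printf("t1ha1_le: %016" PRIx64 "\nt1ha1_be: %016" PRIx64 "\n", h_le, h_be);
  return 0;
}
#endif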
@@ -0,0 +1,297 @@ | |||
/* | |||
* Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, | |||
* Fast Positive Hash. | |||
* | |||
* Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, | |||
* The 1Hippeus project (t1h). | |||
* | |||
* This software is provided 'as-is', without any express or implied | |||
* warranty. In no event will the authors be held liable for any damages | |||
* arising from the use of this software. | |||
* | |||
* Permission is granted to anyone to use this software for any purpose, | |||
* including commercial applications, and to alter it and redistribute it | |||
* freely, subject to the following restrictions: | |||
* | |||
* 1. The origin of this software must not be misrepresented; you must not | |||
* claim that you wrote the original software. If you use this software | |||
* in a product, an acknowledgement in the product documentation would be | |||
* appreciated but is not required. | |||
* 2. Altered source versions must be plainly marked as such, and must not be | |||
* misrepresented as being the original software. | |||
* 3. This notice may not be removed or altered from any source distribution. | |||
*/ | |||
/* | |||
* t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } | |||
* by [Positive Technologies](https://www.ptsecurity.ru) | |||
* | |||
* Briefly, it is a 64-bit Hash Function: | |||
 * 1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *    but portable; it runs on any 64-bit CPU without penalties.
 * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *    and all other portable hash functions (which do not use specific
 *    hardware tricks).
* 3. Not suitable for cryptography. | |||
* | |||
* The Future will Positive. Всё будет хорошо. | |||
* | |||
* ACKNOWLEDGEMENT: | |||
* The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) | |||
* for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! | |||
*/ | |||
#include "config.h" | |||
#include "t1ha_bits.h" | |||
static __always_inline void init_ab(t1ha_state256_t *s, uint64_t x, | |||
uint64_t y) { | |||
s->n.a = x; | |||
s->n.b = y; | |||
} | |||
static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x, | |||
uint64_t y) { | |||
s->n.c = rot64(y, 23) + ~x; | |||
s->n.d = ~y + rot64(x, 19); | |||
} | |||
static __always_inline void update(t1ha_state256_t *__restrict s, | |||
const uint64_t *__restrict v) { | |||
uint64_t w0 = fetch64_le(v + 0); | |||
uint64_t w1 = fetch64_le(v + 1); | |||
uint64_t w2 = fetch64_le(v + 2); | |||
uint64_t w3 = fetch64_le(v + 3); | |||
uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); | |||
uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); | |||
#ifdef __e2k__ | |||
/* FIXME: temporary workaround for lcc's ELBRUS scheduling bug (LY) */ | |||
s->n.c ^= s->n.a + rot64(w0, 57); | |||
s->n.d ^= s->n.b + rot64(w1, 38); | |||
#else | |||
s->n.d ^= s->n.b + rot64(w1, 38); | |||
s->n.c ^= s->n.a + rot64(w0, 57); | |||
#endif | |||
s->n.b ^= prime_6 * (c13 + w2); | |||
s->n.a ^= prime_5 * (d02 + w3); | |||
} | |||
static __always_inline void squash(t1ha_state256_t *s) { | |||
s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23)); | |||
s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d); | |||
} | |||
static __always_inline const void * | |||
loop(bool need_copy4align, uint64_t *__restrict buffer4align, | |||
t1ha_state256_t *__restrict s, const void *__restrict data, size_t len) { | |||
const void *detent = (const uint8_t *)data + len - 31; | |||
do { | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_copy4align)) | |||
v = (const uint64_t *)memcpy(buffer4align, unaligned(v), 32); | |||
update(s, v); | |||
data = (const uint64_t *)data + 4; | |||
} while (likely(data < detent)); | |||
return data; | |||
} | |||
static __always_inline void tail_ab(t1ha_state256_t *__restrict s, | |||
const uint64_t *__restrict v, size_t len) { | |||
switch (len) { | |||
default: | |||
mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_4); | |||
/* fall through */ | |||
case 24: | |||
case 23: | |||
case 22: | |||
case 21: | |||
case 20: | |||
case 19: | |||
case 18: | |||
case 17: | |||
mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); | |||
/* fall through */ | |||
case 16: | |||
case 15: | |||
case 14: | |||
case 13: | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_2); | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
mixup64(&s->n.b, &s->n.a, tail64_le(v, len), prime_1); | |||
/* fall through */ | |||
case 0: | |||
return; | |||
} | |||
} | |||
static __always_inline void tail_abcd(t1ha_state256_t *__restrict s, | |||
const uint64_t *__restrict v, | |||
size_t len) { | |||
switch (len) { | |||
default: | |||
mixup64(&s->n.a, &s->n.d, fetch64_le(v++), prime_4); | |||
/* fall through */ | |||
case 24: | |||
case 23: | |||
case 22: | |||
case 21: | |||
case 20: | |||
case 19: | |||
case 18: | |||
case 17: | |||
mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); | |||
/* fall through */ | |||
case 16: | |||
case 15: | |||
case 14: | |||
case 13: | |||
case 12: | |||
case 11: | |||
case 10: | |||
case 9: | |||
mixup64(&s->n.c, &s->n.b, fetch64_le(v++), prime_2); | |||
/* fall through */ | |||
case 8: | |||
case 7: | |||
case 6: | |||
case 5: | |||
case 4: | |||
case 3: | |||
case 2: | |||
case 1: | |||
mixup64(&s->n.d, &s->n.c, tail64_le(v, len), prime_1); | |||
/* fall through */ | |||
case 0: | |||
return; | |||
} | |||
} | |||
static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c, | |||
uint64_t d, uint64_t *h) { | |||
mixup64(&a, &b, rot64(c, 41) ^ d, prime_0); | |||
mixup64(&b, &c, rot64(d, 23) ^ a, prime_6); | |||
mixup64(&c, &d, rot64(a, 19) ^ b, prime_5); | |||
mixup64(&d, &a, rot64(b, 31) ^ c, prime_4); | |||
*h = c + d; | |||
return a ^ b; | |||
} | |||
//------------------------------------------------------------------------------ | |||
uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) { | |||
t1ha_state256_t state; | |||
init_ab(&state, seed, length); | |||
const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
uint64_t buffer4align[4]; | |||
if (unlikely(length > 32)) { | |||
init_cd(&state, seed, length); | |||
data = loop(need_copy4align, buffer4align, &state, data, length); | |||
squash(&state); | |||
length &= 31; | |||
} | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_copy4align) && length > 8) | |||
v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length); | |||
tail_ab(&state, v, length); | |||
return final64(state.n.a, state.n.b); | |||
} | |||
uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, | |||
const void *__restrict data, size_t length, | |||
uint64_t seed) { | |||
t1ha_state256_t state; | |||
init_ab(&state, seed, length); | |||
init_cd(&state, seed, length); | |||
const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
uint64_t buffer4align[4]; | |||
if (unlikely(length > 32)) { | |||
data = loop(need_copy4align, buffer4align, &state, data, length); | |||
length &= 31; | |||
} | |||
const uint64_t *v = (const uint64_t *)data; | |||
if (unlikely(need_copy4align) && length > 8) | |||
v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length); | |||
tail_abcd(&state, v, length); | |||
return final128(state.n.a, state.n.b, state.n.c, state.n.d, extra_result); | |||
} | |||
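/* Editor's sketch (not upstream code): how the two oneshot entry points
 * above relate. t1ha2_atonce() returns a 64-bit digest; t1ha2_atonce128()
 * additionally writes the other 64 bits of a 128-bit digest through
 * `extra_result`. Illustrative only, hence "#if 0". */
#if 0
#include <inttypes.h>
#include <stdio.h>

int main(void) {
  static const char msg[] = "zerocopy messaging in the spirit of Sparta!";
  uint64_t hi = 0;
  uint64_t h64 = t1ha2_atonce(msg, sizeof(msg) - 1, 0);           /* 64-bit */
  uint64_t lo = t1ha2_atonce128(&hi, msg, sizeof(msg) - 1, 0);    /* 128-bit */
  printf("64-bit:  %016" PRIx64 "\n", h64);
  printf("128-bit: %016" PRIx64 "%016" PRIx64 "\n", hi, lo);
  return 0;
}
#endif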
//------------------------------------------------------------------------------ | |||
void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) { | |||
init_ab(&ctx->state, seed_x, seed_y); | |||
init_cd(&ctx->state, seed_x, seed_y); | |||
ctx->partial = 0; | |||
ctx->total = 0; | |||
} | |||
void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, | |||
size_t length) { | |||
ctx->total += length; | |||
if (ctx->partial) { | |||
const size_t left = 32 - ctx->partial; | |||
const size_t chunk = (length >= left) ? left : length; | |||
memcpy(ctx->buffer.bytes + ctx->partial, unaligned(data), chunk); | |||
ctx->partial += chunk; | |||
if (ctx->partial < 32) { | |||
assert(left >= length); | |||
return; | |||
} | |||
ctx->partial = 0; | |||
data = (const uint8_t *)data + chunk; | |||
length -= chunk; | |||
update(&ctx->state, ctx->buffer.u64); | |||
} | |||
if (length >= 32) { | |||
const bool need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; | |||
if (need_copy4align) | |||
data = loop(true, ctx->buffer.u64, &ctx->state, data, length); | |||
else | |||
data = loop(false, NULL, &ctx->state, data, length); | |||
length &= 31; | |||
} | |||
if (length) | |||
memcpy(ctx->buffer.bytes, unaligned(data), ctx->partial = length); | |||
} | |||
uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, | |||
uint64_t *__restrict extra_result) { | |||
uint64_t bytes = (ctx->total << 3) ^ (UINT64_C(1) << 63); | |||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ | |||
bytes = bswap64(bytes); | |||
#endif | |||
t1ha2_update(ctx, &bytes, 8); | |||
if (likely(!extra_result)) { | |||
squash(&ctx->state); | |||
tail_ab(&ctx->state, ctx->buffer.u64, ctx->partial); | |||
return final64(ctx->state.n.a, ctx->state.n.b); | |||
} | |||
tail_abcd(&ctx->state, ctx->buffer.u64, ctx->partial); | |||
return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c, | |||
ctx->state.n.d, extra_result); | |||
} |
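/* Editor's sketch (not upstream code): the incremental interface above
 * (init/update/final). Thanks to the internal 32-byte buffer, the digest is
 * independent of how the input is split across t1ha2_update() calls; passing
 * a non-NULL `extra_result` to t1ha2_final() yields the 128-bit digest
 * instead. Illustrative only. */
#if 0
#include <assert.h>

int main(void) {
  static const char msg[] = "Fast Positive Hash";
  t1ha_context_t ctx;

  /* Whole buffer in one call... */
  t1ha2_init(&ctx, /* seed_x */ 1, /* seed_y */ 2);
  t1ha2_update(&ctx, msg, sizeof(msg) - 1);
  uint64_t once = t1ha2_final(&ctx, NULL);

  /* ...must match the same bytes fed in two chunks. */
  t1ha2_init(&ctx, 1, 2);
  t1ha2_update(&ctx, msg, 4);
  t1ha2_update(&ctx, msg + 4, sizeof(msg) - 1 - 4);
  assert(once == t1ha2_final(&ctx, NULL));
  return 0;
}
#endif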
@@ -0,0 +1,827 @@ | |||
/* | |||
* Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, | |||
* Fast Positive Hash. | |||
* | |||
* Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, | |||
* The 1Hippeus project (t1h). | |||
* | |||
* This software is provided 'as-is', without any express or implied | |||
* warranty. In no event will the authors be held liable for any damages | |||
* arising from the use of this software. | |||
* | |||
* Permission is granted to anyone to use this software for any purpose, | |||
* including commercial applications, and to alter it and redistribute it | |||
* freely, subject to the following restrictions: | |||
* | |||
* 1. The origin of this software must not be misrepresented; you must not | |||
* claim that you wrote the original software. If you use this software | |||
* in a product, an acknowledgement in the product documentation would be | |||
* appreciated but is not required. | |||
* 2. Altered source versions must be plainly marked as such, and must not be | |||
* misrepresented as being the original software. | |||
* 3. This notice may not be removed or altered from any source distribution. | |||
*/ | |||
/* | |||
* t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } | |||
* by [Positive Technologies](https://www.ptsecurity.ru) | |||
* | |||
* Briefly, it is a 64-bit Hash Function: | |||
 * 1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *    but portable; it runs on any 64-bit CPU without penalties.
 * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *    and all other portable hash functions (which do not use specific
 *    hardware tricks).
* 3. Not suitable for cryptography. | |||
* | |||
* The Future will Positive. Всё будет хорошо. | |||
* | |||
* ACKNOWLEDGEMENT: | |||
* The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) | |||
* for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! | |||
*/ | |||
#pragma once | |||
#if defined(_MSC_VER) | |||
#pragma warning(disable : 4201) /* nameless struct/union */ | |||
#if _MSC_VER > 1800 | |||
#pragma warning(disable : 4464) /* relative include path contains '..' */ | |||
#endif /* 1800 */ | |||
#endif /* MSVC */ | |||
#include "config.h" | |||
#include "t1ha.h" | |||
#ifndef T1HA_USE_FAST_ONESHOT_READ | |||
/* Define it to 1 for slightly faster code.
 * Unfortunately this may trigger false-positive alarms from Valgrind,
 * AddressSanitizer and other similar tools.
 * So, define it to 0 for peace of mind if in doubt. */
#define T1HA_USE_FAST_ONESHOT_READ 1 | |||
#endif /* T1HA_USE_FAST_ONESHOT_READ */ | |||
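/* Editor's note: since the definition above is guarded by #ifndef, the
 * default can be overridden from the build system, e.g.
 *   cc -DT1HA_USE_FAST_ONESHOT_READ=0 ...
 * to keep Valgrind/ASan quiet without editing this header. */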
/*****************************************************************************/ | |||
#include <assert.h> /* for assert() */ | |||
#include <stdbool.h> /* for bool */ | |||
#include <string.h> /* for memcpy() */ | |||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ | |||
__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ | |||
#error Unsupported byte order. | |||
#endif | |||
#if !defined(UNALIGNED_OK) | |||
#if (defined(__ia32__) || defined(__e2k__) || \ | |||
defined(__ARM_FEATURE_UNALIGNED)) && \ | |||
!defined(__ALIGNED__) | |||
#define UNALIGNED_OK 1 | |||
#else | |||
#define UNALIGNED_OK 0 | |||
#endif | |||
#endif /* UNALIGNED_OK */ | |||
#if UNALIGNED_OK && !defined(PAGESIZE) | |||
#define PAGESIZE 4096 | |||
#endif /* PAGESIZE */ | |||
/***************************************************************************/ | |||
#ifndef __has_builtin | |||
#define __has_builtin(x) (0) | |||
#endif | |||
#if __GNUC_PREREQ(4, 4) || defined(__clang__) | |||
#if defined(__ia32__) || defined(__e2k__) | |||
#include <x86intrin.h> | |||
#endif | |||
#if defined(__ia32__) | |||
#include <cpuid.h> | |||
#endif | |||
#if defined(__e2k__) | |||
#include <e2kbuiltin.h> | |||
#endif | |||
#ifndef likely | |||
#define likely(cond) __builtin_expect(!!(cond), 1) | |||
#endif | |||
#ifndef unlikely | |||
#define unlikely(cond) __builtin_expect(!!(cond), 0) | |||
#endif | |||
#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable) | |||
#define unreachable() __builtin_unreachable() | |||
#endif | |||
#define bswap64(v) __builtin_bswap64(v) | |||
#define bswap32(v) __builtin_bswap32(v) | |||
#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) | |||
#define bswap16(v) __builtin_bswap16(v) | |||
#endif | |||
#if !defined(__maybe_unused) && (__GNUC_PREREQ(4, 3) || __has_attribute(unused)) | |||
#define __maybe_unused __attribute__((unused)) | |||
#endif | |||
#if !defined(__always_inline) && \ | |||
(__GNUC_PREREQ(3, 2) || __has_attribute(always_inline)) | |||
#define __always_inline __inline __attribute__((always_inline)) | |||
#endif | |||
#if defined(__e2k__) | |||
#if __iset__ >= 3 | |||
#define mul_64x64_high(a, b) __builtin_e2k_umulhd(a, b) | |||
#endif /* __iset__ >= 3 */ | |||
#if __iset__ >= 5 | |||
static __maybe_unused __always_inline unsigned | |||
e2k_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { | |||
*sum = base + addend; | |||
return (unsigned)__builtin_e2k_addcd_c(base, addend, 0); | |||
} | |||
#define add64carry_first(base, addend, sum) \ | |||
e2k_add64carry_first(base, addend, sum) | |||
static __maybe_unused __always_inline unsigned | |||
e2k_add64carry_next(unsigned carry, uint64_t base, uint64_t addend, | |||
uint64_t *sum) { | |||
*sum = __builtin_e2k_addcd(base, addend, carry); | |||
return (unsigned)__builtin_e2k_addcd_c(base, addend, carry); | |||
} | |||
#define add64carry_next(carry, base, addend, sum) \ | |||
e2k_add64carry_next(carry, base, addend, sum) | |||
static __maybe_unused __always_inline void e2k_add64carry_last(unsigned carry, | |||
uint64_t base, | |||
uint64_t addend, | |||
uint64_t *sum) { | |||
*sum = __builtin_e2k_addcd(base, addend, carry); | |||
} | |||
#define add64carry_last(carry, base, addend, sum) \ | |||
e2k_add64carry_last(carry, base, addend, sum) | |||
#endif /* __iset__ >= 5 */ | |||
#if 0 /* LY: unreasonable, because alignment is required :( */ | |||
#define fetch64_be(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr)) | |||
#define fetch32_be(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr)) | |||
#endif | |||
#endif /* __e2k__ Elbrus */ | |||
#elif defined(_MSC_VER) | |||
#if _MSC_FULL_VER < 190024218 && defined(_M_IX86) | |||
#pragma message( \ | |||
"For AES-NI at least \"Microsoft C/C++ Compiler\" version 19.00.24218 (Visual Studio 2015 Update 5) is required.") | |||
#endif | |||
#if _MSC_FULL_VER < 191025019 | |||
#pragma message( \ | |||
"It is recommended to use \"Microsoft C/C++ Compiler\" version 19.10.25019 (Visual Studio 2017) or newer.") | |||
#endif | |||
#if _MSC_FULL_VER < 180040629 | |||
#error At least "Microsoft C/C++ Compiler" version 18.00.40629 (Visual Studio 2013 Update 5) is required. | |||
#endif | |||
#pragma warning(push, 1) | |||
#include <intrin.h> | |||
#include <stdlib.h> | |||
#define likely(cond) (cond) | |||
#define unlikely(cond) (cond) | |||
#define unreachable() __assume(0) | |||
#define bswap64(v) _byteswap_uint64(v) | |||
#define bswap32(v) _byteswap_ulong(v) | |||
#define bswap16(v) _byteswap_ushort(v) | |||
#define rot64(v, s) _rotr64(v, s) | |||
#define rot32(v, s) _rotr(v, s) | |||
#define __always_inline __forceinline | |||
#if defined(_M_X64) || defined(_M_IA64) | |||
#pragma intrinsic(_umul128) | |||
#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) | |||
#pragma intrinsic(_addcarry_u64) | |||
#define add64carry_first(base, addend, sum) _addcarry_u64(0, base, addend, sum) | |||
#define add64carry_next(carry, base, addend, sum) \ | |||
_addcarry_u64(carry, base, addend, sum) | |||
#define add64carry_last(carry, base, addend, sum) \ | |||
(void)_addcarry_u64(carry, base, addend, sum) | |||
#endif | |||
#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) | |||
#pragma intrinsic(__umulh) | |||
#define mul_64x64_high(a, b) __umulh(a, b) | |||
#endif | |||
#if defined(_M_IX86) | |||
#pragma intrinsic(__emulu) | |||
#define mul_32x32_64(a, b) __emulu(a, b) | |||
#if _MSC_FULL_VER >= 190024231 /* LY: workaround for optimizer bug */ | |||
#pragma intrinsic(_addcarry_u32) | |||
#define add32carry_first(base, addend, sum) _addcarry_u32(0, base, addend, sum) | |||
#define add32carry_next(carry, base, addend, sum) \ | |||
_addcarry_u32(carry, base, addend, sum) | |||
#define add32carry_last(carry, base, addend, sum) \ | |||
(void)_addcarry_u32(carry, base, addend, sum) | |||
static __forceinline char | |||
msvc32_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { | |||
uint32_t *const sum32 = (uint32_t *)sum; | |||
const uint32_t base_32l = (uint32_t)base; | |||
const uint32_t base_32h = (uint32_t)(base >> 32); | |||
const uint32_t addend_32l = (uint32_t)addend; | |||
const uint32_t addend_32h = (uint32_t)(addend >> 32); | |||
return add32carry_next(add32carry_first(base_32l, addend_32l, sum32), | |||
base_32h, addend_32h, sum32 + 1); | |||
} | |||
#define add64carry_first(base, addend, sum) \ | |||
msvc32_add64carry_first(base, addend, sum) | |||
static __forceinline char msvc32_add64carry_next(char carry, uint64_t base, | |||
uint64_t addend, | |||
uint64_t *sum) { | |||
uint32_t *const sum32 = (uint32_t *)sum; | |||
const uint32_t base_32l = (uint32_t)base; | |||
const uint32_t base_32h = (uint32_t)(base >> 32); | |||
const uint32_t addend_32l = (uint32_t)addend; | |||
const uint32_t addend_32h = (uint32_t)(addend >> 32); | |||
return add32carry_next(add32carry_next(carry, base_32l, addend_32l, sum32), | |||
base_32h, addend_32h, sum32 + 1); | |||
} | |||
#define add64carry_next(carry, base, addend, sum) \ | |||
msvc32_add64carry_next(carry, base, addend, sum) | |||
static __forceinline void msvc32_add64carry_last(char carry, uint64_t base, | |||
uint64_t addend, | |||
uint64_t *sum) { | |||
uint32_t *const sum32 = (uint32_t *)sum; | |||
const uint32_t base_32l = (uint32_t)base; | |||
const uint32_t base_32h = (uint32_t)(base >> 32); | |||
const uint32_t addend_32l = (uint32_t)addend; | |||
const uint32_t addend_32h = (uint32_t)(addend >> 32); | |||
add32carry_last(add32carry_next(carry, base_32l, addend_32l, sum32), base_32h, | |||
addend_32h, sum32 + 1); | |||
} | |||
#define add64carry_last(carry, base, addend, sum) \ | |||
msvc32_add64carry_last(carry, base, addend, sum) | |||
#endif /* _MSC_FULL_VER >= 190024231 */ | |||
#elif defined(_M_ARM) | |||
#define mul_32x32_64(a, b) _arm_umull(a, b) | |||
#endif | |||
#pragma warning(pop) | |||
#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ | |||
has been removed */ | |||
#pragma warning(disable : 4710) /* 'xyz': function not inlined */ | |||
#pragma warning(disable : 4711) /* function 'xyz' selected for \ | |||
automatic inline expansion */ | |||
#pragma warning(disable : 4127) /* conditional expression is constant */ | |||
#pragma warning(disable : 4702) /* unreachable code */ | |||
#endif /* Compiler */ | |||
#ifndef likely | |||
#define likely(cond) (cond) | |||
#endif | |||
#ifndef unlikely | |||
#define unlikely(cond) (cond) | |||
#endif | |||
#ifndef __maybe_unused | |||
#define __maybe_unused | |||
#endif | |||
#ifndef __always_inline | |||
#define __always_inline __inline | |||
#endif | |||
#ifndef unreachable | |||
#define unreachable() \ | |||
do { \ | |||
} while (1) | |||
#endif | |||
#ifndef bswap64 | |||
#if defined(bswap_64) | |||
#define bswap64 bswap_64 | |||
#elif defined(__bswap_64) | |||
#define bswap64 __bswap_64 | |||
#else | |||
static __always_inline uint64_t bswap64(uint64_t v) { | |||
return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | | |||
((v << 24) & UINT64_C(0x0000ff0000000000)) | | |||
((v << 8) & UINT64_C(0x000000ff00000000)) | | |||
((v >> 8) & UINT64_C(0x00000000ff000000)) | | |||
((v >> 24) & UINT64_C(0x0000000000ff0000)) | | |||
((v >> 40) & UINT64_C(0x000000000000ff00)); | |||
} | |||
#endif | |||
#endif /* bswap64 */ | |||
#ifndef bswap32 | |||
#if defined(bswap_32) | |||
#define bswap32 bswap_32 | |||
#elif defined(__bswap_32) | |||
#define bswap32 __bswap_32 | |||
#else | |||
static __always_inline uint32_t bswap32(uint32_t v) { | |||
return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | | |||
((v >> 8) & UINT32_C(0x0000ff00)); | |||
} | |||
#endif | |||
#endif /* bswap32 */ | |||
#ifndef bswap16 | |||
#if defined(bswap_16) | |||
#define bswap16 bswap_16 | |||
#elif defined(__bswap_16) | |||
#define bswap16 __bswap_16 | |||
#else | |||
static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } | |||
#endif | |||
#endif /* bswap16 */ | |||
#ifndef unaligned | |||
#if defined(__LCC__) | |||
#pragma diag_suppress wrong_entity_for_attribute | |||
#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) | |||
#elif defined(__clang__) | |||
#pragma clang diagnostic ignored "-Wignored-attributes" | |||
#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) | |||
#elif defined(__GNUC__) | |||
#pragma GCC diagnostic ignored "-Wpacked" | |||
#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) | |||
#elif defined(_MSC_VER) | |||
#pragma warning( \ | |||
disable : 4235) /* nonstandard extension used: '__unaligned' \ | |||
* keyword not supported on this architecture */ | |||
#define unaligned(ptr) ((const char __unaligned *)(ptr)) | |||
#else | |||
#define unaligned(ptr) ((const char *)(ptr)) | |||
#endif | |||
#endif /* unaligned */ | |||
/***************************************************************************/ | |||
#ifndef fetch64_le | |||
static __always_inline uint64_t fetch64_le(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
return *(const uint64_t *)v; | |||
#else | |||
return bswap64(*(const uint64_t *)v); | |||
#endif | |||
} | |||
#endif /* fetch64_le */ | |||
#ifndef fetch32_le | |||
static __always_inline uint32_t fetch32_le(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
return *(const uint32_t *)v; | |||
#else | |||
return bswap32(*(const uint32_t *)v); | |||
#endif | |||
} | |||
#endif /* fetch32_le */ | |||
#ifndef fetch16_le | |||
static __always_inline uint16_t fetch16_le(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
return *(const uint16_t *)v; | |||
#else | |||
return bswap16(*(const uint16_t *)v); | |||
#endif | |||
} | |||
#endif /* fetch16_le */ | |||
#if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \ | |||
PAGESIZE > 0 && !defined(__SANITIZE_ADDRESS__) | |||
#define can_read_underside(ptr, size) \ | |||
((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) | |||
#endif /* can_fast_read */ | |||
static __always_inline uint64_t tail64_le(const void *v, size_t tail) { | |||
const uint8_t *p = (const uint8_t *)v; | |||
#ifdef can_read_underside | |||
  /* On some systems (e.g. x86) we can perform a 'oneshot' read, which
   * is a little bit faster. Thanks to Marcin Żukowski
   * <marcin.zukowski@gmail.com> for the reminder. */
const unsigned offset = (8 - tail) & 7; | |||
const unsigned shift = offset << 3; | |||
if (likely(can_read_underside(p, 8))) { | |||
p -= offset; | |||
return fetch64_le(p) >> shift; | |||
} | |||
return fetch64_le(p) & ((~UINT64_C(0)) >> shift); | |||
#endif /* 'oneshot' read */ | |||
uint64_t r = 0; | |||
switch (tail & 7) { | |||
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
  /* For most CPUs this code is better when copying
   * for alignment or byte reordering is not needed. */
case 0: | |||
return fetch64_le(p); | |||
case 7: | |||
r = (uint64_t)p[6] << 8; | |||
/* fall through */ | |||
case 6: | |||
r += p[5]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 5: | |||
r += p[4]; | |||
r <<= 32; | |||
/* fall through */ | |||
case 4: | |||
return r + fetch32_le(p); | |||
case 3: | |||
r = (uint64_t)p[2] << 16; | |||
/* fall through */ | |||
case 2: | |||
return r + fetch16_le(p); | |||
case 1: | |||
return p[0]; | |||
#else | |||
  /* For most CPUs this code is better than
   * copying for alignment and/or byte reordering. */
case 0: | |||
r = p[7] << 8; | |||
/* fall through */ | |||
case 7: | |||
r += p[6]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 6: | |||
r += p[5]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 5: | |||
r += p[4]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 4: | |||
r += p[3]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 3: | |||
r += p[2]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 2: | |||
r += p[1]; | |||
r <<= 8; | |||
/* fall through */ | |||
case 1: | |||
return r + p[0]; | |||
#endif | |||
} | |||
unreachable(); | |||
} | |||
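/* Editor's worked example for the 'oneshot' path above: with tail == 3,
 * offset = (8 - 3) & 7 = 5 and shift = 40. When can_read_underside() holds,
 * the code reads the 8 bytes ending at p + 2 (i.e. starting at p - 5) and
 * shifts the five low junk bytes out, leaving p[0..2] in the low 24 bits.
 * Otherwise it reads the 8 bytes starting at p (overreading past the tail,
 * but staying within the page) and masks the result down to the low
 * 24 bits. Either way the fetch never crosses a page boundary. */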
#ifndef fetch64_be | |||
static __maybe_unused __always_inline uint64_t fetch64_be(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
return *(const uint64_t *)v; | |||
#else | |||
return bswap64(*(const uint64_t *)v); | |||
#endif | |||
} | |||
#endif /* fetch64_be */ | |||
#ifndef fetch32_be | |||
static __maybe_unused __always_inline uint32_t fetch32_be(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
return *(const uint32_t *)v; | |||
#else | |||
return bswap32(*(const uint32_t *)v); | |||
#endif | |||
} | |||
#endif /* fetch32_be */ | |||
#ifndef fetch16_be | |||
static __maybe_unused __always_inline uint16_t fetch16_be(const void *v) { | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
return *(const uint16_t *)v; | |||
#else | |||
return bswap16(*(const uint16_t *)v); | |||
#endif | |||
} | |||
#endif /* fetch16_be */ | |||
static __maybe_unused __always_inline uint64_t tail64_be(const void *v, | |||
size_t tail) { | |||
const uint8_t *p = (const uint8_t *)v; | |||
#ifdef can_read_underside | |||
  /* On some systems we can perform a 'oneshot' read, which is a little bit
   * faster. Thanks to Marcin Żukowski <marcin.zukowski@gmail.com> for the
   * reminder. */
const unsigned offset = (8 - tail) & 7; | |||
const unsigned shift = offset << 3; | |||
if (likely(can_read_underside(p, 8))) { | |||
p -= offset; | |||
return fetch64_be(p) & ((~UINT64_C(0)) >> shift); | |||
} | |||
return fetch64_be(p) >> shift; | |||
#endif /* 'oneshot' read */ | |||
switch (tail & 7) { | |||
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
  /* For most CPUs this code is better when copying
   * for alignment or byte reordering is not needed. */
case 1: | |||
return p[0]; | |||
case 2: | |||
return fetch16_be(p); | |||
case 3: | |||
return (uint32_t)fetch16_be(p) << 8 | p[2]; | |||
case 4: | |||
return fetch32_be(p); | |||
case 5: | |||
return (uint64_t)fetch32_be(p) << 8 | p[4]; | |||
case 6: | |||
return (uint64_t)fetch32_be(p) << 16 | fetch16_be(p + 4); | |||
case 7: | |||
return (uint64_t)fetch32_be(p) << 24 | (uint32_t)fetch16_be(p + 4) << 8 | | |||
p[6]; | |||
case 0: | |||
return fetch64_be(p); | |||
#else | |||
  /* For most CPUs this code is better than
   * copying for alignment and/or byte reordering. */
case 1: | |||
return p[0]; | |||
case 2: | |||
return p[1] | (uint32_t)p[0] << 8; | |||
case 3: | |||
return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; | |||
case 4: | |||
return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | | |||
(uint32_t)p[0] << 24; | |||
case 5: | |||
return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | | |||
(uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; | |||
case 6: | |||
return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | | |||
(uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; | |||
case 7: | |||
return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | | |||
(uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | | |||
(uint64_t)p[0] << 48; | |||
case 0: | |||
return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | | |||
(uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | | |||
(uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; | |||
#endif | |||
} | |||
unreachable(); | |||
} | |||
/***************************************************************************/ | |||
#ifndef rot64 | |||
static __always_inline uint64_t rot64(uint64_t v, unsigned s) { | |||
return (v >> s) | (v << (64 - s)); | |||
} | |||
#endif /* rot64 */ | |||
#ifndef mul_32x32_64 | |||
static __always_inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { | |||
return a * (uint64_t)b; | |||
} | |||
#endif /* mul_32x32_64 */ | |||
#ifndef add64carry_first | |||
static __maybe_unused __always_inline unsigned | |||
add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { | |||
#if __has_builtin(__builtin_addcll) | |||
unsigned long long carryout; | |||
*sum = __builtin_addcll(base, addend, 0, &carryout); | |||
return (unsigned)carryout; | |||
#else | |||
*sum = base + addend; | |||
return *sum < addend; | |||
#endif /* __has_builtin(__builtin_addcll) */ | |||
} | |||
#endif /* add64carry_first */
#ifndef add64carry_next | |||
static __maybe_unused __always_inline unsigned | |||
add64carry_next(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { | |||
#if __has_builtin(__builtin_addcll) | |||
unsigned long long carryout; | |||
*sum = __builtin_addcll(base, addend, carry, &carryout); | |||
return (unsigned)carryout; | |||
#else | |||
*sum = base + addend + carry; | |||
return *sum < addend || (carry && *sum == addend); | |||
#endif /* __has_builtin(__builtin_addcll) */ | |||
} | |||
#endif /* add64carry_next */ | |||
#ifndef add64carry_last | |||
static __maybe_unused __always_inline void | |||
add64carry_last(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { | |||
#if __has_builtin(__builtin_addcll) | |||
unsigned long long carryout; | |||
*sum = __builtin_addcll(base, addend, carry, &carryout); | |||
(void)carryout; | |||
#else | |||
*sum = base + addend + carry; | |||
#endif /* __has_builtin(__builtin_addcll) */ | |||
} | |||
#endif /* add64carry_last */ | |||
#ifndef mul_64x64_128 | |||
static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a, | |||
uint64_t b, | |||
uint64_t *h) { | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
__uint128_t r = (__uint128_t)a * (__uint128_t)b; | |||
  /* modern GCC can optimize this nicely */
*h = (uint64_t)(r >> 64); | |||
return (uint64_t)r; | |||
#elif defined(mul_64x64_high) | |||
*h = mul_64x64_high(a, b); | |||
return a * b; | |||
#else | |||
/* performs 64x64 to 128 bit multiplication */ | |||
const uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); | |||
const uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); | |||
const uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); | |||
const uint64_t hh = mul_32x32_64(a >> 32, b >> 32); | |||
  /* A few simplifications are possible here for 32-bit architectures,
   * but they would break compatibility with the original 64-bit
   * version. That would be a bad idea: a 32-bit t1ha would still be
   * (relatively) slow, yet no longer compatible. */
uint64_t l; | |||
add64carry_last(add64carry_first(ll, lh << 32, &l), hh, lh >> 32, h); | |||
add64carry_last(add64carry_first(l, hl << 32, &l), *h, hl >> 32, h); | |||
return l; | |||
#endif | |||
} | |||
#endif /* mul_64x64_128() */ | |||
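/* Editor's note on the portable fallback above: it is plain schoolbook
 * multiplication on 32-bit halves. Writing a = ah*2^32 + al and
 * b = bh*2^32 + bl:
 *
 *   a * b = (ah*bh)<<64 + (ah*bl + al*bh)<<32 + al*bl
 *         = hh<<64 + (lh + hl)<<32 + ll
 *
 * The two add64carry_first()/add64carry_last() pairs fold the shifted
 * middle terms into the 128-bit result while propagating carries into *h. */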
#ifndef mul_64x64_high | |||
static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a, | |||
uint64_t b) { | |||
uint64_t h; | |||
mul_64x64_128(a, b, &h); | |||
return h; | |||
} | |||
#endif /* mul_64x64_high */ | |||
/***************************************************************************/ | |||
/* 'magic' primes */ | |||
static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); | |||
static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); | |||
static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); | |||
static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); | |||
static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); | |||
static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); | |||
static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); | |||
/* xor high and low parts of full 128-bit product */ | |||
static __maybe_unused __always_inline uint64_t mux64(uint64_t v, | |||
uint64_t prime) { | |||
uint64_t l, h; | |||
l = mul_64x64_128(v, prime, &h); | |||
return l ^ h; | |||
} | |||
static __always_inline uint64_t final64(uint64_t a, uint64_t b) { | |||
uint64_t x = (a + rot64(b, 41)) * prime_0; | |||
uint64_t y = (rot64(a, 23) + b) * prime_6; | |||
return mux64(x ^ y, prime_5); | |||
} | |||
static __always_inline void mixup64(uint64_t *__restrict a, | |||
uint64_t *__restrict b, uint64_t v, | |||
uint64_t prime) { | |||
uint64_t h; | |||
*a ^= mul_64x64_128(*b + v, prime, &h); | |||
*b += h; | |||
} | |||
/***************************************************************************/ | |||
typedef union t1ha_uint128 { | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
__uint128_t v; | |||
#endif | |||
struct { | |||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |||
uint64_t l, h; | |||
#else | |||
uint64_t h, l; | |||
#endif | |||
}; | |||
} t1ha_uint128_t; | |||
static __always_inline t1ha_uint128_t not128(const t1ha_uint128_t v) { | |||
t1ha_uint128_t r; | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = ~v.v; | |||
#else | |||
r.l = ~v.l; | |||
r.h = ~v.h; | |||
#endif | |||
return r; | |||
} | |||
static __always_inline t1ha_uint128_t left128(const t1ha_uint128_t v, | |||
unsigned s) { | |||
t1ha_uint128_t r; | |||
assert(s < 128); | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = v.v << s; | |||
#else | |||
r.l = (s < 64) ? v.l << s : 0; | |||
r.h = (s < 64) ? (v.h << s) | (s ? v.l >> (64 - s) : 0) : v.l << (s - 64); | |||
#endif | |||
return r; | |||
} | |||
static __always_inline t1ha_uint128_t right128(const t1ha_uint128_t v, | |||
unsigned s) { | |||
t1ha_uint128_t r; | |||
assert(s < 128); | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = v.v >> s; | |||
#else | |||
r.l = (s < 64) ? (s ? v.h << (64 - s) : 0) | (v.l >> s) : v.h >> (s - 64); | |||
r.h = (s < 64) ? v.h >> s : 0; | |||
#endif | |||
return r; | |||
} | |||
static __always_inline t1ha_uint128_t or128(t1ha_uint128_t x, | |||
t1ha_uint128_t y) { | |||
t1ha_uint128_t r; | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = x.v | y.v; | |||
#else | |||
r.l = x.l | y.l; | |||
r.h = x.h | y.h; | |||
#endif | |||
return r; | |||
} | |||
static __always_inline t1ha_uint128_t xor128(t1ha_uint128_t x, | |||
t1ha_uint128_t y) { | |||
t1ha_uint128_t r; | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = x.v ^ y.v; | |||
#else | |||
r.l = x.l ^ y.l; | |||
r.h = x.h ^ y.h; | |||
#endif | |||
return r; | |||
} | |||
static __always_inline t1ha_uint128_t rot128(t1ha_uint128_t v, unsigned s) { | |||
s &= 127; | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
v.v = (v.v << (128 - s)) | (v.v >> s); | |||
return v; | |||
#else | |||
return s ? or128(left128(v, 128 - s), right128(v, s)) : v; | |||
#endif | |||
} | |||
static __always_inline t1ha_uint128_t add128(t1ha_uint128_t x, | |||
t1ha_uint128_t y) { | |||
t1ha_uint128_t r; | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = x.v + y.v; | |||
#else | |||
add64carry_last(add64carry_first(x.l, y.l, &r.l), x.h, y.h, &r.h); | |||
#endif | |||
return r; | |||
} | |||
static __always_inline t1ha_uint128_t mul128(t1ha_uint128_t x, | |||
t1ha_uint128_t y) { | |||
t1ha_uint128_t r; | |||
#if defined(__SIZEOF_INT128__) || \ | |||
(defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |||
r.v = x.v * y.v; | |||
#else | |||
r.l = mul_64x64_128(x.l, y.l, &r.h); | |||
r.h += x.l * y.h + y.l * x.h; | |||
#endif | |||
return r; | |||
} |
@@ -1477,78 +1477,31 @@ void rspamd_cryptobox_hash (guchar *out, | |||
rspamd_cryptobox_hash_final (&st, out); | |||
} | |||
/* MUST be 64 bytes at maximum */ | |||
struct rspamd_cryptobox_fast_hash_state_real { | |||
guint64 h; /* current hash value */ | |||
guint64 pos; /* pos in bytes in the buf */ | |||
guint64 buf; | |||
}; | |||
G_STATIC_ASSERT (sizeof (t1ha_context_t) == | |||
sizeof (rspamd_cryptobox_fast_hash_state_t)); | |||
void | |||
rspamd_cryptobox_fast_hash_init (rspamd_cryptobox_fast_hash_state_t *st, | |||
guint64 seed) | |||
{ | |||
struct rspamd_cryptobox_fast_hash_state_real *rst = | |||
(struct rspamd_cryptobox_fast_hash_state_real *)st; | |||
memset (rst, 0, sizeof (*rst)); | |||
rst->h = seed; | |||
t1ha_context_t *rst = (t1ha_context_t *)st; | |||
t1ha2_init (rst, seed, 0); | |||
} | |||
void | |||
rspamd_cryptobox_fast_hash_update (rspamd_cryptobox_fast_hash_state_t *st, | |||
const void *data, gsize len) | |||
{ | |||
struct rspamd_cryptobox_fast_hash_state_real *rst = | |||
(struct rspamd_cryptobox_fast_hash_state_real *)st; | |||
const guchar *d = data; | |||
guint leftover; | |||
guint64 n; | |||
leftover = rst->pos; | |||
if (leftover > 0 && len + leftover >= 8) { | |||
n = sizeof (rst->buf) - leftover; | |||
memcpy (((guchar *)&rst->buf) + leftover, d, n); | |||
d += n; | |||
len -= n; | |||
rst->h = mum_hash_step (rst->h, rst->buf); | |||
rst->buf = 0; | |||
rst->pos = 0; | |||
} | |||
while (len > 8) { | |||
#ifdef _MUM_UNALIGNED_ACCESS | |||
rst->h = mum_hash_step (rst->h, *(guint64 *)d); | |||
#else | |||
memcpy (&n, d, sizeof (n)); | |||
rst->h = mum_hash_step (rst->h, n); | |||
#endif | |||
len -= 8; | |||
d += 8; | |||
} | |||
if (len > 0 && rst->pos + len <= 8) { | |||
memcpy (((guchar *)&rst->buf) + rst->pos, d, len); | |||
rst->pos += len; | |||
} | |||
t1ha_context_t *rst = (t1ha_context_t *)st; | |||
t1ha2_update (rst, data, len); | |||
} | |||
guint64 | |||
rspamd_cryptobox_fast_hash_final (rspamd_cryptobox_fast_hash_state_t *st) | |||
{ | |||
struct rspamd_cryptobox_fast_hash_state_real *rst = | |||
(struct rspamd_cryptobox_fast_hash_state_real *)st; | |||
guint leftover; | |||
t1ha_context_t *rst = (t1ha_context_t *)st; | |||
leftover = rst->pos; | |||
if (leftover > 0) { | |||
memset (((guchar *)&rst->buf) + leftover, 0, sizeof (rst->buf) - leftover); | |||
rst->h = mum_hash_step (rst->h, rst->buf); | |||
} | |||
return mum_hash_finish (rst->h); | |||
return t1ha2_final (rst, NULL); | |||
} | |||
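/* Editor's sketch (not part of the diff): the IUF ("init/update/final")
 * interface above after the switch to t1ha2, using only functions defined
 * in this file. The seed is arbitrary. */
#if 0
static guint64
example_fast_hash (const void *data, gsize len)
{
	rspamd_cryptobox_fast_hash_state_t st;

	rspamd_cryptobox_fast_hash_init (&st, /* seed */ 0xdeadbeefULL);
	rspamd_cryptobox_fast_hash_update (&st, data, len);

	return rspamd_cryptobox_fast_hash_final (&st);
}
#endif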
/** | |||
@@ -1558,18 +1511,14 @@ static inline guint64 | |||
rspamd_cryptobox_fast_hash_machdep (const void *data, | |||
gsize len, guint64 seed) | |||
{ | |||
#if defined(__LP64__) || defined(_LP64) | |||
return t1ha (data, len, seed); | |||
#else | |||
return t1ha32 (data, len, seed); | |||
#endif | |||
return t1ha0 (data, len, seed); | |||
} | |||
static inline guint64 | |||
rspamd_cryptobox_fast_hash_indep (const void *data, | |||
gsize len, guint64 seed) | |||
{ | |||
return t1ha (data, len, seed); | |||
return t1ha2_atonce (data, len, seed); | |||
} | |||
guint64 | |||
@@ -1593,7 +1542,7 @@ rspamd_cryptobox_fast_hash_specific ( | |||
case RSPAMD_CRYPTOBOX_MUMHASH: | |||
return mum_hash (data, len, seed); | |||
case RSPAMD_CRYPTOBOX_T1HA: | |||
return t1ha (data, len, seed); | |||
return t1ha2_atonce (data, len, seed); | |||
case RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT: | |||
return rspamd_cryptobox_fast_hash_indep (data, len, seed); | |||
case RSPAMD_CRYPTOBOX_HASHFAST: |
@@ -344,7 +344,7 @@ void rspamd_cryptobox_hash (guchar *out, | |||
/* Non crypto hash IUF interface */ | |||
typedef struct RSPAMD_ALIGNED(32) rspamd_cryptobox_fast_hash_state_s { | |||
unsigned char opaque[64]; | |||
unsigned char opaque[64 + sizeof (size_t) + sizeof (uint64_t)]; | |||
} rspamd_cryptobox_fast_hash_state_t; | |||
/** |