author      Vsevolod Stakhov <vsevolod@highsecure.ru>   2018-04-28 13:38:11 +0100
committer   Vsevolod Stakhov <vsevolod@highsecure.ru>   2018-04-28 13:38:11 +0100
commit      12da94dc1d36900e16d336c766bfde1cbbcf90ff (patch)
tree        79dc7bf55c8640d79d2df5dd540b07607a41eb36 /contrib/t1ha
parent      7852bacad4e123ba4309182a0bc77f6b5884c4b4 (diff)
download    rspamd-12da94dc1d36900e16d336c766bfde1cbbcf90ff.tar.gz
            rspamd-12da94dc1d36900e16d336c766bfde1cbbcf90ff.zip
[Feature] Upgrade t1ha distribution
Diffstat (limited to 'contrib/t1ha')
-rw-r--r--   contrib/t1ha/CMakeLists.txt            14
-rw-r--r--   contrib/t1ha/LICENSE                   21
-rw-r--r--   contrib/t1ha/t1ha.h                   788
-rw-r--r--   contrib/t1ha/t1ha0.c                  411
-rw-r--r--   contrib/t1ha/t1ha0_ia32aes_a.h        200
-rw-r--r--   contrib/t1ha/t1ha0_ia32aes_noavx.c      2
-rw-r--r--   contrib/t1ha/t1ha1.c                  215
-rw-r--r--   contrib/t1ha/t1ha2.c                  297
-rw-r--r--   contrib/t1ha/t1ha_bits.h              827
9 files changed, 2314 insertions, 461 deletions
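
The upgraded t1ha.h diffed below exposes the t1ha2 family as the recommended interface: one-shot 64- and 128-bit hashing plus an init/update/final streaming trinity, alongside the older t1ha1 and t1ha0 variants. A minimal usage sketch against that API — the program itself is illustrative; it only assumes the header is included as "t1ha.h" and the rspamd-t1ha static library from the new CMakeLists.txt is linked:

```c
#include <stdio.h>
#include <string.h>
#include "t1ha.h"

int main(void) {
  const char msg[] = "hello, t1ha";
  const uint64_t seed = 42;

  /* One-shot 64-bit hash (t1ha2, the recommended variant). */
  uint64_t h64 = t1ha2_atonce(msg, strlen(msg), seed);

  /* One-shot 128-bit hash: the high half is written through extra_result. */
  uint64_t hi;
  uint64_t lo = t1ha2_atonce128(&hi, msg, strlen(msg), seed);

  /* Streaming mode over arbitrary chunks; a NULL extra_result at final()
   * yields the 64-bit result. */
  t1ha_context_t ctx;
  t1ha2_init(&ctx, seed, 0);
  t1ha2_update(&ctx, msg, 5);
  t1ha2_update(&ctx, msg + 5, strlen(msg) - 5);
  uint64_t h64_stream = t1ha2_final(&ctx, NULL);

  printf("%016llx %016llx%016llx %016llx\n",
         (unsigned long long)h64, (unsigned long long)hi,
         (unsigned long long)lo, (unsigned long long)h64_stream);
  return 0;
}
```

As the header notes, the 64-bit and 128-bit results are unrelated values, and the streaming variant is seeded with two 64-bit words rather than a single seed.
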
diff --git a/contrib/t1ha/CMakeLists.txt b/contrib/t1ha/CMakeLists.txt new file mode 100644 index 000000000..1b54c96d4 --- /dev/null +++ b/contrib/t1ha/CMakeLists.txt @@ -0,0 +1,14 @@ +SET(T1HASRC t1ha0.c + t1ha0_ia32aes_noavx.c + t1ha1.c + t1ha2.c) + +ADD_LIBRARY(rspamd-t1ha STATIC ${T1HASRC}) +SET_TARGET_PROPERTIES(rspamd-t1ha PROPERTIES VERSION ${RSPAMD_VERSION}) +ADD_DEFINITIONS("-DT1HA_USE_FAST_ONESHOT_READ=1") + +IF(ENABLE_FULL_DEBUG MATCHES "OFF") + if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + SET_TARGET_PROPERTIES(rspamd-t1ha PROPERTIES COMPILE_FLAGS "-O3") + endif () +ENDIF() diff --git a/contrib/t1ha/LICENSE b/contrib/t1ha/LICENSE new file mode 100644 index 000000000..d02db65fc --- /dev/null +++ b/contrib/t1ha/LICENSE @@ -0,0 +1,21 @@ + Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + Fast Positive Hash. + + Portions Copyright (c) 2010-2013 Leonid Yuriev <leo@yuriev.ru>, + The 1Hippeus project (t1h). + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgement in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. diff --git a/contrib/t1ha/t1ha.h b/contrib/t1ha/t1ha.h index 1ad763c3a..82e6e6a77 100644 --- a/contrib/t1ha/t1ha.h +++ b/contrib/t1ha/t1ha.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2016 Positive Technologies, https://www.ptsecurity.com, + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, * Fast Positive Hash. * - * Portions Copyright (c) 2010-2016 Leonid Yuriev <leo@yuriev.ru>, + * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, * The 1Hippeus project (t1h). * * This software is provided 'as-is', without any express or implied @@ -23,534 +23,400 @@ */ /* - * t1ha = { Fast Positive Hash} + * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } * by [Positive Technologies](https://www.ptsecurity.ru) * * Briefly, it is a 64-bit Hash Function: * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but without penalties could runs on any 64-bit CPU. + * but portable and without penalties it can run on any 64-bit CPU. * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others which are not use specific hardware tricks. + * and all others portable hash-functions (which do not use specific + * hardware tricks). * 3. Not suitable for cryptography. * + * The Future will Positive. Всё будет хорошо. + * * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev + * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! 
*/ -#ifndef T1HA_INCLUDED -#define T1HA_INCLUDED -#include "config.h" -#include <string.h> -#include <stddef.h> +#pragma once #ifndef __has_attribute #define __has_attribute(x) (0) #endif -#ifndef __has_builtin -#define __has_builtin(x) (0) + +#ifndef __has_include +#define __has_include(x) (0) #endif -#ifdef BYTE_ORDER -#ifndef __ORDER_LITTLE_ENDIAN__ -#define __ORDER_LITTLE_ENDIAN__ LITTLE_ENDIAN +#ifndef __GNUC_PREREQ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +#define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +#else +#define __GNUC_PREREQ(maj, min) 0 #endif -#ifndef __ORDER_BIG_ENDIAN__ -#define __ORDER_BIG_ENDIAN__ BIG_ENDIAN +#endif /* __GNUC_PREREQ */ + +#ifndef __CLANG_PREREQ +#ifdef __clang__ +#define __CLANG_PREREQ(maj, min) \ + ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) +#else +#define __CLANG_PREREQ(maj, min) (0) #endif -#ifndef __BYTE_ORDER__ -#define __BYTE_ORDER__ BYTE_ORDER +#endif /* __CLANG_PREREQ */ + +/*****************************************************************************/ + +#ifdef _MSC_VER +/* Avoid '16' bytes padding added after data member 't1ha_context::total' + * and other warnings from std-headers if warning-level > 3. */ +#pragma warning(push, 3) #endif + +#if defined(__cplusplus) && __cplusplus >= 201103L +#include <climits> +#include <cstddef> +#include <cstdint> #else +#include <limits.h> +#include <stddef.h> +#include <stdint.h> +#endif + +/*****************************************************************************/ + +#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ + defined(i486) || defined(__i486) || defined(__i486__) || \ + defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ + defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ + defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ + defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ + defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) +#ifndef __ia32__ +/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +#define __ia32__ 1 +#endif /* __ia32__ */ +#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ + defined(__amd64) || defined(_M_X64)) +/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ +#define __amd64__ 1 +#endif /* __amd64__ */ +#endif /* all x86 */ + #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ - !defined(__ORDER_BIG_ENDIAN__) + !defined(__ORDER_BIG_ENDIAN__) + +/* *INDENT-OFF* */ +/* clang-format off */ + +#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) || \ + defined(HAVE_ENDIAN_H) || __has_include(<endian.h>) +#include <endian.h> +#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ + defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>) +#include <machine/endian.h> +#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>) +#include <sys/isa_defs.h> +#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \ + (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>)) +#include <sys/endian.h> +#include <sys/types.h> +#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ + defined(__NETBSD__) || defined(__NetBSD__) || \ + defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>) +#include <sys/param.h> +#endif /* OS */ + +/* *INDENT-ON* */ +/* clang-format on */ + 
+#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN +#define __BYTE_ORDER__ __BYTE_ORDER +#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN +#define __BYTE_ORDER__ _BYTE_ORDER +#else #define __ORDER_LITTLE_ENDIAN__ 1234 #define __ORDER_BIG_ENDIAN__ 4321 -#if defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || \ - defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(__MIPSEL__) || \ - defined(_MIPSEL) || defined(__MIPSEL) || defined(__i386) || \ - defined(__x86_64) || defined(_M_IX86) || defined(_M_X64) || \ - defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_) + +#if defined(__LITTLE_ENDIAN__) || \ + (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ + defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ + defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ + defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ + defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ + defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ + defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ + defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ + defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -#elif defined(__BIG_ENDIAN__) || defined(__ARMEB__) || defined(__THUMBEB__) || \ - defined(__AARCH64EB__) || defined(__MIPSEB__) || defined(_MIPSEB) || \ - defined(__MIPSEB) + +#elif defined(__BIG_ENDIAN__) || \ + (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ + defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ + defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ + defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ + defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ + defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ + defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ + #else #error __BYTE_ORDER__ should be defined. +#endif /* Arch */ + #endif -#endif -#endif -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ - __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ -#error Unsupported byte order. 
-#endif +#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ + +/*****************************************************************************/ -#if !defined(UNALIGNED_OK) -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) || \ - defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ - defined(_X86_64_) -#define UNALIGNED_OK 1 +#ifndef __dll_export +#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllexport) +#define __dll_export __attribute__((dllexport)) +#elif defined(_MSC_VER) +#define __dll_export __declspec(dllexport) #else -#define UNALIGNED_OK 0 +#define __dll_export #endif +#elif defined(__GNUC__) || __has_attribute(visibility) +#define __dll_export __attribute__((visibility("default"))) +#else +#define __dll_export #endif +#endif /* __dll_export */ -#ifndef __GNUC_PREREQ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) -#define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +#ifndef __dll_import +#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllimport) +#define __dll_import __attribute__((dllimport)) +#elif defined(_MSC_VER) +#define __dll_import __declspec(dllimport) #else -#define __GNUC_PREREQ(maj, min) 0 +#define __dll_import #endif +#else +#define __dll_import #endif +#endif /* __dll_import */ +#if defined(t1ha_EXPORTS) +#define T1HA_API __dll_export +#elif defined(t1ha_IMPORTS) +#define T1HA_API __dll_import +#else +#define T1HA_API +#endif /* T1HA_API */ -#if __GNUC_PREREQ(4, 4) || defined(__clang__) - -#if defined(__i386__) || defined(__x86_64__) -#include <x86intrin.h> -#endif -#define likely(cond) __builtin_expect(!!(cond), 1) -#define unlikely(cond) __builtin_expect(!!(cond), 0) -# if __GNUC_PREREQ(4, 6) || defined(__clang__) -#define unreachable() __builtin_unreachable() -# else -#define unreachable() \ - do { \ - for (;;) \ - ; \ - } while (0) -# endif -#define bswap64(v) __builtin_bswap64(v) -#define bswap32(v) __builtin_bswap32(v) -#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) -#define bswap16(v) __builtin_bswap16(v) -#endif -#if __GNUC_PREREQ(4, 3) || __has_attribute(unused) -#define maybe_unused __attribute__((unused)) -#endif +#if defined(_MSC_VER) && defined(__ia32__) +#define T1HA_ALIGN_PREFIX __declspec(align(32)) /* required only for SIMD */ +#else +#define T1HA_ALIGN_PREFIX +#endif /* _MSC_VER */ -#elif defined(_MSC_VER) +#if defined(__GNUC__) && defined(__ia32__) +#define T1HA_ALIGN_SUFFIX \ + __attribute__((aligned(32))) /* required only for SIMD */ +#else +#define T1HA_ALIGN_SUFFIX +#endif /* GCC x86 */ -#include <intrin.h> -#include <stdlib.h> -#define likely(cond) (cond) -#define unlikely(cond) (cond) -#define unreachable() __assume(0) -#define bswap64(v) _byteswap_uint64(v) -#define bswap32(v) _byteswap_ulong(v) -#define bswap16(v) _byteswap_ushort(v) -#define rot64(v, s) _rotr64(v, s) -#define rot32(v, s) _rotr(v, s) - -#if defined(_M_ARM64) || defined(_M_X64) -#pragma intrinsic(_umul128) -#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) -#pragma intrinsic(__umulh) -#define mul_64x64_high(a, b) __umulh(a, b) +#ifdef __cplusplus +extern "C" { #endif -#if defined(_M_IX86) -#pragma intrinsic(__emulu) -#define mul_32x32_64(a, b) __emulu(a, b) -#elif defined(_M_ARM) -#define mul_32x32_64(a, b) _arm_umull(a, b) +typedef union T1HA_ALIGN_PREFIX t1ha_state256 { + uint8_t bytes[32]; + uint32_t u32[8]; + uint64_t u64[4]; + struct { 
+ uint64_t a, b, c, d; + } n; +} t1ha_state256_t T1HA_ALIGN_SUFFIX; + +typedef struct t1ha_context { + t1ha_state256_t state; + t1ha_state256_t buffer; + size_t partial; + uint64_t total; +} t1ha_context_t; + +#ifdef _MSC_VER +#pragma warning(pop) #endif -#else /* Compiler */ - -#define likely(cond) (cond) -#define unlikely(cond) (cond) -#define unreachable() \ - do \ - for (;;) \ - ; \ - while (0) -#endif /* Compiler */ - -#ifndef bswap64 -static __inline uint64_t bswap64(uint64_t v) { - return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) | - ((v << 24) & 0x0000ff0000000000ull) | - ((v << 8) & 0x000000ff00000000ull) | - ((v >> 8) & 0x00000000ff000000ull) | - ((v >> 24) & 0x0000000000ff0000ull) | - ((v >> 40) & 0x000000000000ff00ull); -} -#endif /* bswap64 */ +/****************************************************************************** + * + * t1ha2 = 64 and 128-bit, SLIGHTLY MORE ATTENTION FOR QUALITY AND STRENGTH. + * + * - The recommended version of "Fast Positive Hash" with good quality + * for checksum, hash tables and fingerprinting. + * - Portable and extremely efficiency on modern 64-bit CPUs. + * Designed for 64-bit little-endian platforms, + * in other cases will runs slowly. + * - Great quality of hashing and still faster than other non-t1ha hashes. + * Provides streaming mode and 128-bit result. + * + * Note: Due performance reason 64- and 128-bit results are completely + * different each other, i.e. 64-bit result is NOT any part of 128-bit. + */ -#ifndef bswap32 -static __inline uint32_t bswap32(uint32_t v) { - return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00); -} -#endif /* bswap32 */ +/* The at-once variant with 64-bit result */ +T1HA_API uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed); + +/* The at-once variant with 128-bit result. + * Argument `extra_result` is NOT optional and MUST be valid. + * The high 64-bit part of 128-bit hash will be always unconditionally + * stored to the address given by `extra_result` argument. */ +T1HA_API uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, + const void *__restrict data, size_t length, + uint64_t seed); + +/* The init/update/final trinity for streaming. + * Return 64 or 128-bit result depentently from `extra_result` argument. */ +T1HA_API void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y); +T1HA_API void t1ha2_update(t1ha_context_t *__restrict ctx, + const void *__restrict data, size_t length); + +/* Argument `extra_result` is optional and MAY be NULL. + * - If `extra_result` is NOT NULL then the 128-bit hash will be calculated, + * and high 64-bit part of it will be stored to the address given + * by `extra_result` argument. + * - Otherwise the 64-bit hash will be calculated + * and returned from function directly. + * + * Note: Due performance reason 64- and 128-bit results are completely + * different each other, i.e. 64-bit result is NOT any part of 128-bit. */ +T1HA_API uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, + uint64_t *__restrict extra_result /* optional */); -#ifndef bswap16 -static __inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } -#endif /* bswap16 */ +/****************************************************************************** + * + * t1ha1 = 64-bit, BASELINE FAST PORTABLE HASH: + * + * - Runs faster on 64-bit platforms in other cases may runs slowly. + * - Portable and stable, returns same 64-bit result + * on all architectures and CPUs. 
+ * - Unfortunately it fails the "strict avalanche criteria", + * see test results at https://github.com/demerphq/smhasher. + * + * This flaw is insignificant for the t1ha1() purposes and imperceptible + * from a practical point of view. + * However, nowadays this issue has resolved in the next t1ha2(), + * that was initially planned to providing a bit more quality. + */ -#ifndef rot64 -static __inline uint64_t rot64(uint64_t v, unsigned s) { - return (v >> s) | (v << (64 - s)); -} -#endif /* rot64 */ +/* The little-endian variant. */ +T1HA_API uint64_t t1ha1_le(const void *data, size_t length, uint64_t seed); -#ifndef rot32 -static __inline uint32_t rot32(uint32_t v, unsigned s) { - return (v >> s) | (v << (32 - s)); -} -#endif /* rot32 */ +/* The big-endian variant. */ +T1HA_API uint64_t t1ha1_be(const void *data, size_t length, uint64_t seed); -#ifndef mul_32x32_64 -static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { - return a * (uint64_t)b; +/* The historical nicname for generic little-endian variant. */ +static __inline uint64_t t1ha(const void *data, size_t length, uint64_t seed) { + return t1ha1_le(data, length, seed); } -#endif /* mul_32x32_64 */ -/***************************************************************************/ - -static __inline uint64_t fetch64(const void *v) { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return *(const uint64_t *)v; -#else - return bswap64(*(const uint64_t *)v); -#endif -} +/****************************************************************************** + * + * t1ha0 = 64-bit, JUST ONLY FASTER: + * + * - Provides fast-as-possible hashing for current CPU, including + * 32-bit systems and engaging the available hardware acceleration. + * - It is a facade that selects most quick-and-dirty hash + * for the current processor. For instance, on IA32 (x86) actual function + * will be selected in runtime, depending on current CPU capabilities + * + * BE CAREFUL!!! THIS IS MEANS: + * + * 1. The quality of hash is a subject for tradeoffs with performance. + * So, the quality and strength of t1ha0() may be lower than t1ha1(), + * especially on 32-bit targets, but then much faster. + * However, guaranteed that it passes all SMHasher tests. + * + * 2. No warranty that the hash result will be same for particular + * key on another machine or another version of libt1ha. + * + * Briefly, such hash-results and their derivatives, should be + * used only in runtime, but should not be persist or transferred + * over a network. + */ -static __inline uint64_t fetch32(const void *v) { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return *(const uint32_t *)v; +/* The little-endian variant for 32-bit CPU. */ +uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed); +/* The big-endian variant for 32-bit CPU. */ +uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed); + +/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. 
*/ +#ifndef T1HA0_AESNI_AVAILABLE +#if (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800)) + #if defined(__GNUC__) && \ + ((defined(__clang__) && (__clang_major__ >= 4 || (__clang_major__ >= 3 && __clang_minor__ >= 8))) || \ + ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) || (__GNUC__ > 4))) + #define T1HA0_AESNI_AVAILABLE 1 + #else + #define T1HA0_AESNI_AVAILABLE 0 + #endif #else - return bswap32(*(const uint32_t *)v); +#define T1HA0_AESNI_AVAILABLE 0 #endif -} +#endif /* T1HA0_AESNI_AVAILABLE */ -static __inline uint64_t fetch16(const void *v) { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return *(const uint16_t *)v; +/* Define T1HA0_RUNTIME_SELECT to 0 for disable dispatching t1ha0 at runtime. */ +#ifndef T1HA0_RUNTIME_SELECT +#if T1HA0_AESNI_AVAILABLE && !defined(__e2k__) +#define T1HA0_RUNTIME_SELECT 1 #else - return bswap16(*(const uint16_t *)v); +#define T1HA0_RUNTIME_SELECT 0 #endif -} +#endif /* T1HA0_RUNTIME_SELECT */ -static __inline uint64_t fetch_tail(const void *v, size_t tail) { - const uint8_t *_ = (const uint8_t *)v; - switch (tail & 7) { - case 1: - return _[0]; - case 2: - return fetch16(_); - case 3: - return fetch16(_) | (_[2] << 16); - case 4: - return fetch32(_); - case 5: - return fetch32(_) | ((uint64_t)_[4] << 32); - case 6: - return fetch32(_) | (fetch16(_ + 4) << 32); - case 7: - return fetch32(_) | (fetch16(_ + 4) << 32) | ((uint64_t)_[6] << 48); - case 0: - return fetch64(_); - default: - unreachable(); - } -} - -/* xor-mul-xor mixer */ -static __inline uint64_t mix(uint64_t v, uint64_t p) { - static const unsigned s0 = 41; - v *= p; - return v ^ rot64(v, s0); -} - -static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) { - *sum += addend; - return *sum < addend; -} +#if T1HA0_AESNI_AVAILABLE +uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); +#endif /* T1HA0_AESNI_AVAILABLE */ -/* xor high and low parts of full 128-bit product */ -static __inline uint64_t mux64(uint64_t v, uint64_t p) { -#ifdef __SIZEOF_INT128__ - __uint128_t r = (__uint128_t)v * (__uint128_t)p; - /* modern GCC could nicely optimize this */ - return r ^ (r >> 64); -#elif defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128 - __uint128 r = (__uint128)v * (__uint128)p; - return r ^ (r >> 64); -#elif defined(mul_64x64_128) - uint64_t l, h; - l = mul_64x64_128(v, p, &h); - return l ^ h; -#elif defined(mul_64x64_high) - uint64_t l, h; - l = v * p; - h = mul_64x64_high(v, p); - return l ^ h; +#if T1HA0_RUNTIME_SELECT +#ifdef __ELF__ +/* ifunc/gnu_indirect_function will be used on ELF. + * Please see https://en.wikipedia.org/wiki/Executable_and_Linkable_Format */ +T1HA_API uint64_t t1ha0(const void *data, size_t length, uint64_t seed); #else - /* performs 64x64 to 128 bit multiplication */ - uint64_t ll = mul_32x32_64((uint32_t)v, (uint32_t)p); - uint64_t lh = mul_32x32_64(v >> 32, (uint32_t)p); - uint64_t hl = mul_32x32_64(p >> 32, (uint32_t)v); - uint64_t hh = - mul_32x32_64(v >> 32, p >> 32) + (lh >> 32) + (hl >> 32) + - /* Few simplification are possible here for 32-bit architectures, - * but thus we would lost compatibility with the original 64-bit - * version. Think is very bad idea, because then 32-bit t1ha will - * still (relatively) very slowly and well yet not compatible. 
*/ - add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32); - return hh ^ ll; -#endif -} - -static uint64_t -t1ha(const void *data, size_t len, uint64_t seed) -{ - /* 'magic' primes */ - static const uint64_t p0 = 17048867929148541611ull; - static const uint64_t p1 = 9386433910765580089ull; - static const uint64_t p2 = 15343884574428479051ull; - static const uint64_t p3 = 13662985319504319857ull; - static const uint64_t p4 = 11242949449147999147ull; - static const uint64_t p5 = 13862205317416547141ull; - static const uint64_t p6 = 14653293970879851569ull; - /* rotations */ - static const unsigned s0 = 41; - static const unsigned s1 = 17; - static const unsigned s2 = 31; - - uint64_t a = seed; - uint64_t b = len; - - const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; - uint64_t align[4]; - - if (unlikely(len > 32)) { - uint64_t c = rot64(len, s1) + seed; - uint64_t d = len ^ rot64(seed, s1); - const void *detent = (const uint8_t *)data + len - 31; - do { - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_align)) - v = (const uint64_t *)memcpy(&align, v, 32); - - uint64_t w0 = fetch64(v + 0); - uint64_t w1 = fetch64(v + 1); - uint64_t w2 = fetch64(v + 2); - uint64_t w3 = fetch64(v + 3); - - uint64_t d02 = w0 ^ rot64(w2 + d, s1); - uint64_t c13 = w1 ^ rot64(w3 + c, s1); - c += a ^ rot64(w0, s0); - d -= b ^ rot64(w1, s2); - a ^= p1 * (d02 + w3); - b ^= p0 * (c13 + w2); - data = (const uint64_t *)data + 4; - } while (likely(data < detent)); - - a ^= p6 * (rot64(c, s1) + d); - b ^= p5 * (c + rot64(d, s1)); - len &= 31; - } - - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_align) && len > 1) - v = (const uint64_t *)memcpy(&align, v, len); - - switch (len) { - default: - b += mux64(fetch64(v++), p4); - case 24: - case 23: - case 22: - case 21: - case 20: - case 19: - case 18: - case 17: - a += mux64(fetch64(v++), p3); - case 16: - case 15: - case 14: - case 13: - case 12: - case 11: - case 10: - case 9: - b += mux64(fetch64(v++), p2); - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - a += mux64(fetch_tail(v, len), p1); - case 0: - return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0); - } +/* Otherwise function pointer will be used. + * Unfortunately this may cause some overhead calling. */ +T1HA_API extern uint64_t (*t1ha0_funcptr)(const void *data, size_t length, + uint64_t seed); +static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { + return t1ha0_funcptr(data, length, seed); } +#endif /* __ELF__ */ -static __inline uint32_t tail32_le(const void *v, size_t tail) { - const uint8_t *p = (const uint8_t *)v; - uint32_t r = 0; - switch (tail & 3) { -#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ - case 0: - return fetch32(p); - case 3: - r = (uint32_t)p[2] << 16; - case 2: - return r + fetch16(p); - case 1: - return p[0]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_be(data, length, seed); #else - /* For most CPUs this code is better than a - * copying for alignment and/or byte reordering. 
*/ - case 0: - r += p[3]; - r <<= 8; - case 3: - r += p[2]; - r <<= 8; - case 2: - r += p[1]; - r <<= 8; - case 1: - return r + p[0]; + return t1ha0_32be(data, length, seed); #endif - } - unreachable(); } - -static __inline uint32_t tail32_be(const void *v, size_t tail) { - const uint8_t *p = (const uint8_t *)v; - switch (tail & 3) { -#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ - case 1: - return p[0]; - case 2: - return fetch16_be(p); - case 3: - return fetch16_be(p) << 8 | p[2]; - case 0: - return fetch32_be(p); #else - /* For most CPUs this code is better than a - * copying for alignment and/or byte reordering. */ - case 1: - return p[0]; - case 2: - return p[1] | (uint32_t)p[0] << 8; - case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; - case 0: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; +static __inline uint64_t t1ha0(const void *data, size_t length, uint64_t seed) { +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_le(data, length, seed); +#else + return t1ha0_32le(data, length, seed); #endif - } - unreachable(); } +#endif /* !T1HA0_RUNTIME_SELECT */ -static __inline uint64_t remix32(uint32_t a, uint32_t b) { - static const uint64_t p0 = 17048867929148541611ull; - a ^= rot32(b, 13); - uint64_t l = a | (uint64_t)b << 32; - l *= p0; - l ^= l >> 41; - return l; +#ifdef __cplusplus } - -static __inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, uint32_t p) { - uint64_t l = mul_32x32_64(*b + v, p); - *a ^= (uint32_t)l; - *b += (uint32_t)(l >> 32); -} - -static uint64_t t1ha32(const void *data, size_t len, uint64_t seed) { - /* 32-bit 'magic' primes */ - static const uint32_t q0 = 0x92D78269; - static const uint32_t q1 = 0xCA9B4735; - static const uint32_t q2 = 0xA4ABA1C3; - static const uint32_t q3 = 0xF6499843; - static const uint32_t q4 = 0x86F0FD61; - static const uint32_t q5 = 0xCA2DA6FB; - static const uint32_t q6 = 0xC4BB3575; - /* rotations */ - static const unsigned s1 = 17; - - uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed; - uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); - - const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; - uint32_t align[4]; - - if (unlikely(len > 16)) { - uint32_t c = ~a; - uint32_t d = rot32(b, 5); - const void *detent = (const uint8_t *)data + len - 15; - do { - const uint32_t *v = (const uint32_t *)data; - if (unlikely(need_align)) - v = (const uint32_t *)memcpy(&align, v, 16); - - uint32_t w0 = fetch32(v + 0); - uint32_t w1 = fetch32(v + 1); - uint32_t w2 = fetch32(v + 2); - uint32_t w3 = fetch32(v + 3); - - uint32_t c02 = w0 ^ rot32(w2 + c, 11); - uint32_t d13 = w1 + rot32(w3 + d, s1); - c ^= rot32(b + w1, 7); - d ^= rot32(a + w0, 3); - b = q1 * (c02 + w3); - a = q0 * (d13 ^ w2); - - data = (const uint32_t *)data + 4; - } while (likely(data < detent)); - - c += a; - d += b; - a ^= q6 * (rot32(c, 16) + d); - b ^= q5 * (c + rot32(d, 16)); - - len &= 15; - } - - const uint8_t *v = (const uint8_t *)data; - if (unlikely(need_align) && len > 4) - v = (const uint8_t *)memcpy(&align, v, len); - - switch (len) { - default: - mixup32(&a, &b, fetch32(v), q4); - v += 4; - case 12: - case 11: - case 10: - case 9: - mixup32(&b, &a, fetch32(v), q3); - v += 4; - case 8: - case 7: - case 6: - case 5: - mixup32(&a, &b, fetch32(v), q2); - v += 4; - case 4: - case 3: - case 2: - case 1: - mixup32(&b, &a, tail32_le(v, len), q1); - 
case 0: - return remix32(a, b); - } -} - #endif diff --git a/contrib/t1ha/t1ha0.c b/contrib/t1ha/t1ha0.c new file mode 100644 index 000000000..16cbefa5a --- /dev/null +++ b/contrib/t1ha/t1ha0.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* + * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } + * by [Positive Technologies](https://www.ptsecurity.ru) + * + * Briefly, it is a 64-bit Hash Function: + * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, + * but portable and without penalties it can run on any 64-bit CPU. + * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + * and all others portable hash-functions (which do not use specific + * hardware tricks). + * 3. Not suitable for cryptography. + * + * The Future will Positive. Всё будет хорошо. + * + * ACKNOWLEDGEMENT: + * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) + * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! + */ + +#include "config.h" +#include "t1ha_bits.h" + +static __always_inline uint32_t tail32_le(const void *v, size_t tail) { + const uint8_t *p = (const uint8_t *)v; +#ifdef can_read_underside + /* On some systems (e.g. x86) we can perform a 'oneshot' read, which + * is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com> + * for the reminder. */ + const unsigned offset = (4 - tail) & 3; + const unsigned shift = offset << 3; + if (likely(can_read_underside(p, 4))) { + p -= offset; + return fetch32_le(p) >> shift; + } + return fetch32_le(p) & ((~UINT32_C(0)) >> shift); +#endif /* 'oneshot' read */ + + uint32_t r = 0; + switch (tail & 3) { +#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. */ + case 0: + return fetch32_le(p); + case 3: + r = (uint32_t)p[2] << 16; + /* fall through */ + case 2: + return r + fetch16_le(p); + case 1: + return p[0]; +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. 
*/ + case 0: + r += p[3]; + r <<= 8; + /* fall through */ + case 3: + r += p[2]; + r <<= 8; + /* fall through */ + case 2: + r += p[1]; + r <<= 8; + /* fall through */ + case 1: + return r + p[0]; +#endif + } + unreachable(); +} + +static __always_inline uint32_t tail32_be(const void *v, size_t tail) { + const uint8_t *p = (const uint8_t *)v; +#ifdef can_read_underside + /* On some systems we can perform a 'oneshot' read, which is little bit + * faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com> for the + * reminder. */ + const unsigned offset = (4 - tail) & 3; + const unsigned shift = offset << 3; + if (likely(can_read_underside(p, 4))) { + p -= offset; + return fetch32_be(p) & ((~UINT32_C(0)) >> shift); + } + return fetch32_be(p) >> shift; +#endif /* 'oneshot' read */ + + switch (tail & 3) { +#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. */ + case 1: + return p[0]; + case 2: + return fetch16_be(p); + case 3: + return fetch16_be(p) << 8 | p[2]; + case 0: + return fetch32_be(p); +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. */ + case 1: + return p[0]; + case 2: + return p[1] | (uint32_t)p[0] << 8; + case 3: + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + case 0: + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; +#endif + } + unreachable(); +} + +/***************************************************************************/ + +#ifndef rot32 +static __maybe_unused __always_inline uint32_t rot32(uint32_t v, unsigned s) { + return (v >> s) | (v << (32 - s)); +} +#endif /* rot32 */ + +static __always_inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, + uint32_t prime) { + uint64_t l = mul_32x32_64(*b + v, prime); + *a ^= (uint32_t)l; + *b += (uint32_t)(l >> 32); +} + +static __always_inline uint64_t final32(uint32_t a, uint32_t b) { + uint64_t l = (b ^ rot32(a, 13)) | (uint64_t)a << 32; + l *= prime_0; + l ^= l >> 41; + l *= prime_4; + l ^= l >> 47; + l *= prime_6; + return l; +} + +/* 32-bit 'magic' primes */ +static const uint32_t prime32_0 = UINT32_C(0x92D78269); +static const uint32_t prime32_1 = UINT32_C(0xCA9B4735); +static const uint32_t prime32_2 = UINT32_C(0xA4ABA1C3); +static const uint32_t prime32_3 = UINT32_C(0xF6499843); +static const uint32_t prime32_4 = UINT32_C(0x86F0FD61); +static const uint32_t prime32_5 = UINT32_C(0xCA2DA6FB); +static const uint32_t prime32_6 = UINT32_C(0xC4BB3575); + +uint64_t t1ha0_32le(const void *data, size_t len, uint64_t seed) { + uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed; + uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); + + const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; + uint32_t align[4]; + + if (unlikely(len > 16)) { + uint32_t c = ~a; + uint32_t d = rot32(b, 5); + const void *detent = (const uint8_t *)data + len - 15; + do { + const uint32_t *v = (const uint32_t *)data; + if (unlikely(need_align)) + v = (const uint32_t *)memcpy(&align, unaligned(v), 16); + + uint32_t w0 = fetch32_le(v + 0); + uint32_t w1 = fetch32_le(v + 1); + uint32_t w2 = fetch32_le(v + 2); + uint32_t w3 = fetch32_le(v + 3); + + uint32_t c02 = w0 ^ rot32(w2 + c, 11); + uint32_t d13 = w1 + rot32(w3 + d, 17); + c ^= rot32(b + w1, 7); + d ^= rot32(a + w0, 3); + b = prime32_1 * (c02 + w3); + a = prime32_0 * (d13 ^ w2); + + data = (const uint32_t *)data + 4; + } while (likely(data < detent)); + + c += a; + d += b; 
+ a ^= prime32_6 * (rot32(c, 16) + d); + b ^= prime32_5 * (c + rot32(d, 16)); + + len &= 15; + } + + const uint8_t *v = (const uint8_t *)data; + if (unlikely(need_align) && len > 4) + v = (const uint8_t *)memcpy(&align, unaligned(v), len); + + switch (len) { + default: + mixup32(&a, &b, fetch32_le(v), prime32_4); + v += 4; + /* fall through */ + case 12: + case 11: + case 10: + case 9: + mixup32(&b, &a, fetch32_le(v), prime32_3); + v += 4; + /* fall through */ + case 8: + case 7: + case 6: + case 5: + mixup32(&a, &b, fetch32_le(v), prime32_2); + v += 4; + /* fall through */ + case 4: + case 3: + case 2: + case 1: + mixup32(&b, &a, tail32_le(v, len), prime32_1); + /* fall through */ + case 0: + return final32(a, b); + } +} + +uint64_t t1ha0_32be(const void *data, size_t len, uint64_t seed) { + uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed; + uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); + + const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; + uint32_t align[4]; + + if (unlikely(len > 16)) { + uint32_t c = ~a; + uint32_t d = rot32(b, 5); + const void *detent = (const uint8_t *)data + len - 15; + do { + const uint32_t *v = (const uint32_t *)data; + if (unlikely(need_align)) + v = (const uint32_t *)memcpy(&align, unaligned(v), 16); + + uint32_t w0 = fetch32_be(v + 0); + uint32_t w1 = fetch32_be(v + 1); + uint32_t w2 = fetch32_be(v + 2); + uint32_t w3 = fetch32_be(v + 3); + + uint32_t c02 = w0 ^ rot32(w2 + c, 11); + uint32_t d13 = w1 + rot32(w3 + d, 17); + c ^= rot32(b + w1, 7); + d ^= rot32(a + w0, 3); + b = prime32_1 * (c02 + w3); + a = prime32_0 * (d13 ^ w2); + + data = (const uint32_t *)data + 4; + } while (likely(data < detent)); + + c += a; + d += b; + a ^= prime32_6 * (rot32(c, 16) + d); + b ^= prime32_5 * (c + rot32(d, 16)); + + len &= 15; + } + + const uint8_t *v = (const uint8_t *)data; + if (unlikely(need_align) && len > 4) + v = (const uint8_t *)memcpy(&align, unaligned(v), len); + + switch (len) { + default: + mixup32(&a, &b, fetch32_be(v), prime32_4); + v += 4; + /* fall through */ + case 12: + case 11: + case 10: + case 9: + mixup32(&b, &a, fetch32_be(v), prime32_3); + v += 4; + /* fall through */ + case 8: + case 7: + case 6: + case 5: + mixup32(&a, &b, fetch32_be(v), prime32_2); + v += 4; + /* fall through */ + case 4: + case 3: + case 2: + case 1: + mixup32(&b, &a, tail32_be(v, len), prime32_1); + /* fall through */ + case 0: + return final32(a, b); + } +} + +/***************************************************************************/ + +#if T1HA0_RUNTIME_SELECT + +#if T1HA0_AESNI_AVAILABLE && defined(__ia32__) +static uint64_t x86_cpu_features(void) { + uint32_t features = 0; + uint32_t extended = 0; +#ifdef __GNUC__ + uint32_t eax, ebx, ecx, edx; + const unsigned cpuid_max = __get_cpuid_max(0, NULL); + if (cpuid_max >= 1) { + __cpuid_count(1, 0, eax, ebx, features, edx); + if (cpuid_max >= 7) + __cpuid_count(7, 0, eax, extended, ecx, edx); + } +#elif defined(_MSC_VER) + int info[4]; + __cpuid(info, 0); + const unsigned cpuid_max = info[0]; + if (cpuid_max >= 1) { + __cpuidex(info, 1, 0); + features = info[2]; + if (cpuid_max >= 7) { + __cpuidex(info, 7, 0); + extended = info[1]; + } + } +#endif + return features | (uint64_t)extended << 32; +} +#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */ + +static +#if __GNUC_PREREQ(4, 0) || __has_attribute(used) + __attribute__((used)) +#endif + uint64_t (*t1ha0_resolve(void))(const void *, size_t, uint64_t) { + +#if T1HA0_AESNI_AVAILABLE && defined(__ia32__) + uint64_t features = x86_cpu_features(); + 
if (features & UINT32_C(0x02000000) /* check for AES-NI */) { + return t1ha0_ia32aes_noavx; + } +#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */ + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_be; +#else + return t1ha0_32be; +#endif +#else /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul + return t1ha1_le; +#else + return t1ha0_32le; +#endif +#endif /* __BYTE_ORDER__ */ +} + +#ifdef __ELF__ + +#if __has_attribute(ifunc) + +uint64_t t1ha0(const void *data, size_t len, uint64_t seed) + __attribute__((ifunc("t1ha0_resolve"))); +#else +__asm("\t.globl\tt1ha0\n\t.type\tt1ha0, " + "%gnu_indirect_function\n\t.set\tt1ha0,t1ha0_resolve"); +#endif /* ifunc */ + +#elif __GNUC_PREREQ(4, 0) || __has_attribute(constructor) + +uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t); + +static void __attribute__((constructor)) t1ha0_init(void) { + t1ha0_funcptr = t1ha0_resolve(); +} + +#else /* ELF */ +static uint64_t t1ha0_proxy(const void *data, size_t len, uint64_t seed) { + t1ha0_funcptr = t1ha0_resolve(); + return t1ha0_funcptr(data, len, seed); +} + +uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t) = t1ha0_proxy; + +#endif /* !ELF */ +#endif /* T1HA0_RUNTIME_SELECT */ diff --git a/contrib/t1ha/t1ha0_ia32aes_a.h b/contrib/t1ha/t1ha0_ia32aes_a.h new file mode 100644 index 000000000..fa02e419e --- /dev/null +++ b/contrib/t1ha/t1ha0_ia32aes_a.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* + * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } + * by [Positive Technologies](https://www.ptsecurity.ru) + * + * Briefly, it is a 64-bit Hash Function: + * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, + * but portable and without penalties it can run on any 64-bit CPU. + * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + * and all others portable hash-functions (which do not use specific + * hardware tricks). + * 3. Not suitable for cryptography. + * + * The Future will Positive. Всё будет хорошо. + * + * ACKNOWLEDGEMENT: + * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) + * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! 
+ */ + +#include "t1ha_bits.h" + +#if T1HA0_AESNI_AVAILABLE + +#pragma GCC push_options +#pragma GCC target("aes") +#ifndef __SSE2__ +#define __SSE2__ +#endif +#ifndef __SSE__ +#define __SSE__ +#endif +#ifndef __AES__ +#define __AES__ +#endif +#include <immintrin.h> + +uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) { + uint64_t a = seed; + uint64_t b = len; + + if (unlikely(len > 32)) { + __m128i x = _mm_set_epi64x(a, b); + __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_5, prime_6)); + + const __m128i *__restrict v = (const __m128i *)data; + const __m128i *__restrict const detent = + (const __m128i *)((const uint8_t *)data + len - 127); + + while (v < detent) { + __m128i v0 = _mm_loadu_si128(v + 0); + __m128i v1 = _mm_loadu_si128(v + 1); + __m128i v2 = _mm_loadu_si128(v + 2); + __m128i v3 = _mm_loadu_si128(v + 3); + __m128i v4 = _mm_loadu_si128(v + 4); + __m128i v5 = _mm_loadu_si128(v + 5); + __m128i v6 = _mm_loadu_si128(v + 6); + __m128i v7 = _mm_loadu_si128(v + 7); + + __m128i v0y = _mm_aesenc_si128(v0, y); + __m128i v2x6 = _mm_aesenc_si128(v2, _mm_xor_si128(x, v6)); + __m128i v45_67 = + _mm_xor_si128(_mm_aesenc_si128(v4, v5), _mm_add_epi64(v6, v7)); + + __m128i v0y7_1 = _mm_aesdec_si128(_mm_sub_epi64(v7, v0y), v1); + __m128i v2x6_3 = _mm_aesenc_si128(v2x6, v3); + + x = _mm_aesenc_si128(v45_67, _mm_add_epi64(x, y)); + y = _mm_aesenc_si128(v2x6_3, _mm_xor_si128(v0y7_1, v5)); + v += 8; + } + + if (len & 64) { + __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v0y); + y = _mm_aesdec_si128(y, v1x); + + __m128i v2y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v3x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v2y); + y = _mm_aesdec_si128(y, v3x); + } + + if (len & 32) { + __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v0y); + y = _mm_aesdec_si128(y, v1x); + } + + if (len & 16) { + y = _mm_add_epi64(x, y); + x = _mm_aesdec_si128(x, _mm_loadu_si128(v++)); + } + + x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y); +#if defined(__x86_64__) || defined(_M_X64) +#if defined(__SSE4_1__) || defined(__AVX__) + a = _mm_extract_epi64(x, 0); + b = _mm_extract_epi64(x, 1); +#else + a = _mm_cvtsi128_si64(x); + b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x)); +#endif +#else +#if defined(__SSE4_1__) || defined(__AVX__) + a = (uint32_t)_mm_extract_epi32(x, 0) | + (uint64_t)_mm_extract_epi32(x, 1) << 32; + b = (uint32_t)_mm_extract_epi32(x, 2) | + (uint64_t)_mm_extract_epi32(x, 3) << 32; +#else + a = (uint32_t)_mm_cvtsi128_si32(x); + a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; + x = _mm_unpackhi_epi64(x, x); + b = (uint32_t)_mm_cvtsi128_si32(x); + b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; +#endif +#endif +#ifdef __AVX__ + _mm256_zeroall(); +#elif !(defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64)) + _mm_empty(); +#endif + data = v; + len &= 15; + } + + const uint64_t *v = (const uint64_t *)data; +#ifdef __e2k__ + const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t align[4]; + if (unlikely(need_align) && len > 8) + v = (const uint64_t *)memcpy(&align, unaligned(v), len); +#endif /* __e2k__ */ + + switch (len) { + default: + mixup64(&a, &b, *v++, prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + mixup64(&b, &a, 
*v++, prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + mixup64(&a, &b, *v++, prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + mixup64(&b, &a, tail64_le(v, len), prime_1); + /* fall through */ + case 0: + return final64(a, b); + } +} + +#endif /* T1HA0_AESNI_AVAILABLE */ +#undef T1HA_IA32AES_NAME diff --git a/contrib/t1ha/t1ha0_ia32aes_noavx.c b/contrib/t1ha/t1ha0_ia32aes_noavx.c new file mode 100644 index 000000000..ca4588de7 --- /dev/null +++ b/contrib/t1ha/t1ha0_ia32aes_noavx.c @@ -0,0 +1,2 @@ +#define T1HA_IA32AES_NAME t1ha0_ia32aes_noavx +#include "t1ha0_ia32aes_a.h" diff --git a/contrib/t1ha/t1ha1.c b/contrib/t1ha/t1ha1.c new file mode 100644 index 000000000..1c92fd0f6 --- /dev/null +++ b/contrib/t1ha/t1ha1.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* + * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } + * by [Positive Technologies](https://www.ptsecurity.ru) + * + * Briefly, it is a 64-bit Hash Function: + * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, + * but portable and without penalties it can run on any 64-bit CPU. + * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + * and all others portable hash-functions (which do not use specific + * hardware tricks). + * 3. Not suitable for cryptography. + * + * The Future will Positive. Всё будет хорошо. + * + * ACKNOWLEDGEMENT: + * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) + * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! + */ + +#include "config.h" +#include "t1ha_bits.h" + +/* xor-mul-xor mixer */ +static __inline uint64_t mix64(uint64_t v, uint64_t p) { + v *= p; + return v ^ rot64(v, 41); +} + +static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) { + /* LY: for performance reason on a some not high-end CPUs + * I replaced the second mux64() operation by mix64(). + * Unfortunately this approach fails the "strict avalanche criteria", + * see test results at https://github.com/demerphq/smhasher. 
*/ + return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0); +} + +uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { + uint64_t a = seed; + uint64_t b = len; + + const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t align[4]; + + if (unlikely(len > 32)) { + uint64_t c = rot64(len, 17) + seed; + uint64_t d = len ^ rot64(seed, 17); + const void *detent = (const uint8_t *)data + len - 31; + do { + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_align)) + v = (const uint64_t *)memcpy(&align, unaligned(v), 32); + + uint64_t w0 = fetch64_le(v + 0); + uint64_t w1 = fetch64_le(v + 1); + uint64_t w2 = fetch64_le(v + 2); + uint64_t w3 = fetch64_le(v + 3); + + uint64_t d02 = w0 ^ rot64(w2 + d, 17); + uint64_t c13 = w1 ^ rot64(w3 + c, 17); + c += a ^ rot64(w0, 41); + d -= b ^ rot64(w1, 31); + a ^= prime_1 * (d02 + w3); + b ^= prime_0 * (c13 + w2); + data = (const uint64_t *)data + 4; + } while (likely(data < detent)); + + a ^= prime_6 * (rot64(c, 17) + d); + b ^= prime_5 * (c + rot64(d, 17)); + len &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_align) && len > 8) + v = (const uint64_t *)memcpy(&align, unaligned(v), len); + + switch (len) { + default: + b += mux64(fetch64_le(v++), prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + a += mux64(fetch64_le(v++), prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + b += mux64(fetch64_le(v++), prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + a += mux64(tail64_le(v, len), prime_1); + /* fall through */ + case 0: + return final_weak_avalanche(a, b); + } +} + +uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { + uint64_t a = seed; + uint64_t b = len; + + const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t align[4]; + + if (unlikely(len > 32)) { + uint64_t c = rot64(len, 17) + seed; + uint64_t d = len ^ rot64(seed, 17); + const void *detent = (const uint8_t *)data + len - 31; + do { + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_align)) + v = (const uint64_t *)memcpy(&align, unaligned(v), 32); + + uint64_t w0 = fetch64_be(v + 0); + uint64_t w1 = fetch64_be(v + 1); + uint64_t w2 = fetch64_be(v + 2); + uint64_t w3 = fetch64_be(v + 3); + + uint64_t d02 = w0 ^ rot64(w2 + d, 17); + uint64_t c13 = w1 ^ rot64(w3 + c, 17); + c += a ^ rot64(w0, 41); + d -= b ^ rot64(w1, 31); + a ^= prime_1 * (d02 + w3); + b ^= prime_0 * (c13 + w2); + data = (const uint64_t *)data + 4; + } while (likely(data < detent)); + + a ^= prime_6 * (rot64(c, 17) + d); + b ^= prime_5 * (c + rot64(d, 17)); + len &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_align) && len > 8) + v = (const uint64_t *)memcpy(&align, unaligned(v), len); + + switch (len) { + default: + b += mux64(fetch64_be(v++), prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + a += mux64(fetch64_be(v++), prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + b += mux64(fetch64_be(v++), prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + a += mux64(tail64_be(v, len), prime_1); + /* fall through */ + case 0: + return final_weak_avalanche(a, b); + } +} diff 
--git a/contrib/t1ha/t1ha2.c b/contrib/t1ha/t1ha2.c new file mode 100644 index 000000000..f87e8bb82 --- /dev/null +++ b/contrib/t1ha/t1ha2.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* + * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } + * by [Positive Technologies](https://www.ptsecurity.ru) + * + * Briefly, it is a 64-bit Hash Function: + * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, + * but portable and without penalties it can run on any 64-bit CPU. + * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + * and all others portable hash-functions (which do not use specific + * hardware tricks). + * 3. Not suitable for cryptography. + * + * The Future will Positive. Всё будет хорошо. + * + * ACKNOWLEDGEMENT: + * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) + * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! 
+ */ + +#include "config.h" +#include "t1ha_bits.h" + +static __always_inline void init_ab(t1ha_state256_t *s, uint64_t x, + uint64_t y) { + s->n.a = x; + s->n.b = y; +} + +static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x, + uint64_t y) { + s->n.c = rot64(y, 23) + ~x; + s->n.d = ~y + rot64(x, 19); +} + +static __always_inline void update(t1ha_state256_t *__restrict s, + const uint64_t *__restrict v) { + uint64_t w0 = fetch64_le(v + 0); + uint64_t w1 = fetch64_le(v + 1); + uint64_t w2 = fetch64_le(v + 2); + uint64_t w3 = fetch64_le(v + 3); + + uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); + uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); +#ifdef __e2k__ + /* FIXME: temporary workaround for lcc's ELBRUS scheduling bug (LY) */ + s->n.c ^= s->n.a + rot64(w0, 57); + s->n.d ^= s->n.b + rot64(w1, 38); +#else + s->n.d ^= s->n.b + rot64(w1, 38); + s->n.c ^= s->n.a + rot64(w0, 57); +#endif + s->n.b ^= prime_6 * (c13 + w2); + s->n.a ^= prime_5 * (d02 + w3); +} + +static __always_inline void squash(t1ha_state256_t *s) { + s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23)); + s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d); +} + +static __always_inline const void * +loop(bool need_copy4align, uint64_t *__restrict buffer4align, + t1ha_state256_t *__restrict s, const void *__restrict data, size_t len) { + const void *detent = (const uint8_t *)data + len - 31; + do { + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_copy4align)) + v = (const uint64_t *)memcpy(buffer4align, unaligned(v), 32); + update(s, v); + data = (const uint64_t *)data + 4; + } while (likely(data < detent)); + return data; +} + +static __always_inline void tail_ab(t1ha_state256_t *__restrict s, + const uint64_t *__restrict v, size_t len) { + switch (len) { + default: + mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + mixup64(&s->n.b, &s->n.a, tail64_le(v, len), prime_1); + /* fall through */ + case 0: + return; + } +} + +static __always_inline void tail_abcd(t1ha_state256_t *__restrict s, + const uint64_t *__restrict v, + size_t len) { + switch (len) { + default: + mixup64(&s->n.a, &s->n.d, fetch64_le(v++), prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + mixup64(&s->n.c, &s->n.b, fetch64_le(v++), prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + mixup64(&s->n.d, &s->n.c, tail64_le(v, len), prime_1); + /* fall through */ + case 0: + return; + } +} + +static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c, + uint64_t d, uint64_t *h) { + mixup64(&a, &b, rot64(c, 41) ^ d, prime_0); + mixup64(&b, &c, rot64(d, 23) ^ a, prime_6); + mixup64(&c, &d, rot64(a, 19) ^ b, prime_5); + mixup64(&d, &a, rot64(b, 31) ^ c, prime_4); + *h = c + d; + return a ^ b; +} + +//------------------------------------------------------------------------------ + +uint64_t t1ha2_atonce(const void *data, 
size_t length, uint64_t seed) { + t1ha_state256_t state; + init_ab(&state, seed, length); + + const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t buffer4align[4]; + + if (unlikely(length > 32)) { + init_cd(&state, seed, length); + data = loop(need_copy4align, buffer4align, &state, data, length); + squash(&state); + length &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_copy4align) && length > 8) + v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length); + + tail_ab(&state, v, length); + return final64(state.n.a, state.n.b); +} + +uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, + const void *__restrict data, size_t length, + uint64_t seed) { + t1ha_state256_t state; + init_ab(&state, seed, length); + init_cd(&state, seed, length); + + const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t buffer4align[4]; + + if (unlikely(length > 32)) { + data = loop(need_copy4align, buffer4align, &state, data, length); + length &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_copy4align) && length > 8) + v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length); + + tail_abcd(&state, v, length); + return final128(state.n.a, state.n.b, state.n.c, state.n.d, extra_result); +} + +//------------------------------------------------------------------------------ + +void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) { + init_ab(&ctx->state, seed_x, seed_y); + init_cd(&ctx->state, seed_x, seed_y); + ctx->partial = 0; + ctx->total = 0; +} + +void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, + size_t length) { + ctx->total += length; + + if (ctx->partial) { + const size_t left = 32 - ctx->partial; + const size_t chunk = (length >= left) ? left : length; + memcpy(ctx->buffer.bytes + ctx->partial, unaligned(data), chunk); + ctx->partial += chunk; + if (ctx->partial < 32) { + assert(left >= length); + return; + } + ctx->partial = 0; + data = (const uint8_t *)data + chunk; + length -= chunk; + update(&ctx->state, ctx->buffer.u64); + } + + if (length >= 32) { + const bool need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + if (need_copy4align) + data = loop(true, ctx->buffer.u64, &ctx->state, data, length); + else + data = loop(false, NULL, &ctx->state, data, length); + length &= 31; + } + + if (length) + memcpy(ctx->buffer.bytes, unaligned(data), ctx->partial = length); +} + +uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, + uint64_t *__restrict extra_result) { + uint64_t bytes = (ctx->total << 3) ^ (UINT64_C(1) << 63); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + bytes = bswap64(bytes); +#endif + t1ha2_update(ctx, &bytes, 8); + + if (likely(!extra_result)) { + squash(&ctx->state); + tail_ab(&ctx->state, ctx->buffer.u64, ctx->partial); + return final64(ctx->state.n.a, ctx->state.n.b); + } + + tail_abcd(&ctx->state, ctx->buffer.u64, ctx->partial); + return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c, + ctx->state.n.d, extra_result); +} diff --git a/contrib/t1ha/t1ha_bits.h b/contrib/t1ha/t1ha_bits.h new file mode 100644 index 000000000..eb032a69c --- /dev/null +++ b/contrib/t1ha/t1ha_bits.h @@ -0,0 +1,827 @@ +/* + * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>, + * The 1Hippeus project (t1h). 
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgement in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
+ * by [Positive Technologies](https://www.ptsecurity.ru)
+ *
+ * Briefly, it is a 64-bit Hash Function:
+ *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
+ *     but portable and without penalties it can run on any 64-bit CPU.
+ *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
+ *     and all others portable hash-functions (which do not use specific
+ *     hardware tricks).
+ *  3. Not suitable for cryptography.
+ *
+ * The Future will Positive. Всё будет хорошо.
+ *
+ * ACKNOWLEDGEMENT:
+ * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
+ * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
+ */
+
+#pragma once
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4201) /* nameless struct/union */
+#if _MSC_VER > 1800
+#pragma warning(disable : 4464) /* relative include path contains '..' */
+#endif /* 1800 */
+#endif /* MSVC */
+
+#include "config.h"
+#include "t1ha.h"
+
+#ifndef T1HA_USE_FAST_ONESHOT_READ
+/* Define it to 1 for slightly faster code.
+ * Unfortunately this may trigger false-positive alarms from Valgrind,
+ * AddressSanitizer and other similar tools.
+ * So, define it to 0 for calmness if in doubt. */
+#define T1HA_USE_FAST_ONESHOT_READ 1
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
+
+/*****************************************************************************/
+
+#include <assert.h> /* for assert() */
+#include <stdbool.h> /* for bool */
+#include <string.h> /* for memcpy() */
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \
+ __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+#error Unsupported byte order.
+#endif + +#if !defined(UNALIGNED_OK) +#if (defined(__ia32__) || defined(__e2k__) || \ + defined(__ARM_FEATURE_UNALIGNED)) && \ + !defined(__ALIGNED__) +#define UNALIGNED_OK 1 +#else +#define UNALIGNED_OK 0 +#endif +#endif /* UNALIGNED_OK */ + +#if UNALIGNED_OK && !defined(PAGESIZE) +#define PAGESIZE 4096 +#endif /* PAGESIZE */ + +/***************************************************************************/ + +#ifndef __has_builtin +#define __has_builtin(x) (0) +#endif + +#if __GNUC_PREREQ(4, 4) || defined(__clang__) + +#if defined(__ia32__) || defined(__e2k__) +#include <x86intrin.h> +#endif + +#if defined(__ia32__) +#include <cpuid.h> +#endif + +#if defined(__e2k__) +#include <e2kbuiltin.h> +#endif + +#ifndef likely +#define likely(cond) __builtin_expect(!!(cond), 1) +#endif + +#ifndef unlikely +#define unlikely(cond) __builtin_expect(!!(cond), 0) +#endif + +#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable) +#define unreachable() __builtin_unreachable() +#endif + +#define bswap64(v) __builtin_bswap64(v) +#define bswap32(v) __builtin_bswap32(v) +#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#define bswap16(v) __builtin_bswap16(v) +#endif + +#if !defined(__maybe_unused) && (__GNUC_PREREQ(4, 3) || __has_attribute(unused)) +#define __maybe_unused __attribute__((unused)) +#endif + +#if !defined(__always_inline) && \ + (__GNUC_PREREQ(3, 2) || __has_attribute(always_inline)) +#define __always_inline __inline __attribute__((always_inline)) +#endif + +#if defined(__e2k__) + +#if __iset__ >= 3 +#define mul_64x64_high(a, b) __builtin_e2k_umulhd(a, b) +#endif /* __iset__ >= 3 */ + +#if __iset__ >= 5 +static __maybe_unused __always_inline unsigned +e2k_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { + *sum = base + addend; + return (unsigned)__builtin_e2k_addcd_c(base, addend, 0); +} +#define add64carry_first(base, addend, sum) \ + e2k_add64carry_first(base, addend, sum) + +static __maybe_unused __always_inline unsigned +e2k_add64carry_next(unsigned carry, uint64_t base, uint64_t addend, + uint64_t *sum) { + *sum = __builtin_e2k_addcd(base, addend, carry); + return (unsigned)__builtin_e2k_addcd_c(base, addend, carry); +} +#define add64carry_next(carry, base, addend, sum) \ + e2k_add64carry_next(carry, base, addend, sum) + +static __maybe_unused __always_inline void e2k_add64carry_last(unsigned carry, + uint64_t base, + uint64_t addend, + uint64_t *sum) { + *sum = __builtin_e2k_addcd(base, addend, carry); +} +#define add64carry_last(carry, base, addend, sum) \ + e2k_add64carry_last(carry, base, addend, sum) +#endif /* __iset__ >= 5 */ + +#if 0 /* LY: unreasonable, because alignment is required :( */ +#define fetch64_be(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr)) +#define fetch32_be(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr)) +#endif + +#endif /* __e2k__ Elbrus */ + +#elif defined(_MSC_VER) + +#if _MSC_FULL_VER < 190024218 && defined(_M_IX86) +#pragma message( \ + "For AES-NI at least \"Microsoft C/C++ Compiler\" version 19.00.24218 (Visual Studio 2015 Update 5) is required.") +#endif +#if _MSC_FULL_VER < 191025019 +#pragma message( \ + "It is recommended to use \"Microsoft C/C++ Compiler\" version 19.10.25019 (Visual Studio 2017) or newer.") +#endif +#if _MSC_FULL_VER < 180040629 +#error At least "Microsoft C/C++ Compiler" version 18.00.40629 (Visual Studio 2013 Update 5) is required. 
+#endif + +#pragma warning(push, 1) + +#include <intrin.h> +#include <stdlib.h> +#define likely(cond) (cond) +#define unlikely(cond) (cond) +#define unreachable() __assume(0) +#define bswap64(v) _byteswap_uint64(v) +#define bswap32(v) _byteswap_ulong(v) +#define bswap16(v) _byteswap_ushort(v) +#define rot64(v, s) _rotr64(v, s) +#define rot32(v, s) _rotr(v, s) +#define __always_inline __forceinline + +#if defined(_M_X64) || defined(_M_IA64) +#pragma intrinsic(_umul128) +#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) +#pragma intrinsic(_addcarry_u64) +#define add64carry_first(base, addend, sum) _addcarry_u64(0, base, addend, sum) +#define add64carry_next(carry, base, addend, sum) \ + _addcarry_u64(carry, base, addend, sum) +#define add64carry_last(carry, base, addend, sum) \ + (void)_addcarry_u64(carry, base, addend, sum) +#endif + +#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) +#pragma intrinsic(__umulh) +#define mul_64x64_high(a, b) __umulh(a, b) +#endif + +#if defined(_M_IX86) +#pragma intrinsic(__emulu) +#define mul_32x32_64(a, b) __emulu(a, b) + +#if _MSC_FULL_VER >= 190024231 /* LY: workaround for optimizer bug */ +#pragma intrinsic(_addcarry_u32) +#define add32carry_first(base, addend, sum) _addcarry_u32(0, base, addend, sum) +#define add32carry_next(carry, base, addend, sum) \ + _addcarry_u32(carry, base, addend, sum) +#define add32carry_last(carry, base, addend, sum) \ + (void)_addcarry_u32(carry, base, addend, sum) + +static __forceinline char +msvc32_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { + uint32_t *const sum32 = (uint32_t *)sum; + const uint32_t base_32l = (uint32_t)base; + const uint32_t base_32h = (uint32_t)(base >> 32); + const uint32_t addend_32l = (uint32_t)addend; + const uint32_t addend_32h = (uint32_t)(addend >> 32); + return add32carry_next(add32carry_first(base_32l, addend_32l, sum32), + base_32h, addend_32h, sum32 + 1); +} +#define add64carry_first(base, addend, sum) \ + msvc32_add64carry_first(base, addend, sum) + +static __forceinline char msvc32_add64carry_next(char carry, uint64_t base, + uint64_t addend, + uint64_t *sum) { + uint32_t *const sum32 = (uint32_t *)sum; + const uint32_t base_32l = (uint32_t)base; + const uint32_t base_32h = (uint32_t)(base >> 32); + const uint32_t addend_32l = (uint32_t)addend; + const uint32_t addend_32h = (uint32_t)(addend >> 32); + return add32carry_next(add32carry_next(carry, base_32l, addend_32l, sum32), + base_32h, addend_32h, sum32 + 1); +} +#define add64carry_next(carry, base, addend, sum) \ + msvc32_add64carry_next(carry, base, addend, sum) + +static __forceinline void msvc32_add64carry_last(char carry, uint64_t base, + uint64_t addend, + uint64_t *sum) { + uint32_t *const sum32 = (uint32_t *)sum; + const uint32_t base_32l = (uint32_t)base; + const uint32_t base_32h = (uint32_t)(base >> 32); + const uint32_t addend_32l = (uint32_t)addend; + const uint32_t addend_32h = (uint32_t)(addend >> 32); + add32carry_last(add32carry_next(carry, base_32l, addend_32l, sum32), base_32h, + addend_32h, sum32 + 1); +} +#define add64carry_last(carry, base, addend, sum) \ + msvc32_add64carry_last(carry, base, addend, sum) +#endif /* _MSC_FULL_VER >= 190024231 */ + +#elif defined(_M_ARM) +#define mul_32x32_64(a, b) _arm_umull(a, b) +#endif + +#pragma warning(pop) +#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ + has been removed */ +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for \ + automatic 
inline expansion */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#pragma warning(disable : 4702) /* unreachable code */ +#endif /* Compiler */ + +#ifndef likely +#define likely(cond) (cond) +#endif +#ifndef unlikely +#define unlikely(cond) (cond) +#endif +#ifndef __maybe_unused +#define __maybe_unused +#endif +#ifndef __always_inline +#define __always_inline __inline +#endif +#ifndef unreachable +#define unreachable() \ + do { \ + } while (1) +#endif + +#ifndef bswap64 +#if defined(bswap_64) +#define bswap64 bswap_64 +#elif defined(__bswap_64) +#define bswap64 __bswap_64 +#else +static __always_inline uint64_t bswap64(uint64_t v) { + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +} +#endif +#endif /* bswap64 */ + +#ifndef bswap32 +#if defined(bswap_32) +#define bswap32 bswap_32 +#elif defined(__bswap_32) +#define bswap32 __bswap_32 +#else +static __always_inline uint32_t bswap32(uint32_t v) { + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +} +#endif +#endif /* bswap32 */ + +#ifndef bswap16 +#if defined(bswap_16) +#define bswap16 bswap_16 +#elif defined(__bswap_16) +#define bswap16 __bswap_16 +#else +static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } +#endif +#endif /* bswap16 */ + +#ifndef unaligned +#if defined(__LCC__) +#pragma diag_suppress wrong_entity_for_attribute +#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) +#elif defined(__clang__) +#pragma clang diagnostic ignored "-Wignored-attributes" +#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpacked" +#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) +#elif defined(_MSC_VER) +#pragma warning( \ + disable : 4235) /* nonstandard extension used: '__unaligned' \ + * keyword not supported on this architecture */ +#define unaligned(ptr) ((const char __unaligned *)(ptr)) +#else +#define unaligned(ptr) ((const char *)(ptr)) +#endif +#endif /* unaligned */ + +/***************************************************************************/ + +#ifndef fetch64_le +static __always_inline uint64_t fetch64_le(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *(const uint64_t *)v; +#else + return bswap64(*(const uint64_t *)v); +#endif +} +#endif /* fetch64_le */ + +#ifndef fetch32_le +static __always_inline uint32_t fetch32_le(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *(const uint32_t *)v; +#else + return bswap32(*(const uint32_t *)v); +#endif +} +#endif /* fetch32_le */ + +#ifndef fetch16_le +static __always_inline uint16_t fetch16_le(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *(const uint16_t *)v; +#else + return bswap16(*(const uint16_t *)v); +#endif +} +#endif /* fetch16_le */ + +#if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \ + PAGESIZE > 0 && !defined(__SANITIZE_ADDRESS__) +#define can_read_underside(ptr, size) \ + ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) +#endif /* can_fast_read */ + +static __always_inline uint64_t tail64_le(const void *v, size_t tail) { + const uint8_t *p = (const 
uint8_t *)v; +#ifdef can_read_underside + /* On some systems (e.g. x86) we can perform a 'oneshot' read, which + * is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com> + * for the reminder. */ + const unsigned offset = (8 - tail) & 7; + const unsigned shift = offset << 3; + if (likely(can_read_underside(p, 8))) { + p -= offset; + return fetch64_le(p) >> shift; + } + return fetch64_le(p) & ((~UINT64_C(0)) >> shift); +#endif /* 'oneshot' read */ + + uint64_t r = 0; + switch (tail & 7) { +#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. */ + case 0: + return fetch64_le(p); + case 7: + r = (uint64_t)p[6] << 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 32; + /* fall through */ + case 4: + return r + fetch32_le(p); + case 3: + r = (uint64_t)p[2] << 16; + /* fall through */ + case 2: + return r + fetch16_le(p); + case 1: + return p[0]; +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. */ + case 0: + r = p[7] << 8; + /* fall through */ + case 7: + r += p[6]; + r <<= 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 8; + /* fall through */ + case 4: + r += p[3]; + r <<= 8; + /* fall through */ + case 3: + r += p[2]; + r <<= 8; + /* fall through */ + case 2: + r += p[1]; + r <<= 8; + /* fall through */ + case 1: + return r + p[0]; +#endif + } + unreachable(); +} + +#ifndef fetch64_be +static __maybe_unused __always_inline uint64_t fetch64_be(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return *(const uint64_t *)v; +#else + return bswap64(*(const uint64_t *)v); +#endif +} +#endif /* fetch64_be */ + +#ifndef fetch32_be +static __maybe_unused __always_inline uint32_t fetch32_be(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return *(const uint32_t *)v; +#else + return bswap32(*(const uint32_t *)v); +#endif +} +#endif /* fetch32_be */ + +#ifndef fetch16_be +static __maybe_unused __always_inline uint16_t fetch16_be(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return *(const uint16_t *)v; +#else + return bswap16(*(const uint16_t *)v); +#endif +} +#endif /* fetch16_be */ + +static __maybe_unused __always_inline uint64_t tail64_be(const void *v, + size_t tail) { + const uint8_t *p = (const uint8_t *)v; +#ifdef can_read_underside + /* On some systems we can perform a 'oneshot' read, which is little bit + * faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com> for the + * reminder. */ + const unsigned offset = (8 - tail) & 7; + const unsigned shift = offset << 3; + if (likely(can_read_underside(p, 8))) { + p -= offset; + return fetch64_be(p) & ((~UINT64_C(0)) >> shift); + } + return fetch64_be(p) >> shift; +#endif /* 'oneshot' read */ + + switch (tail & 7) { +#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. 
*/ + case 1: + return p[0]; + case 2: + return fetch16_be(p); + case 3: + return (uint32_t)fetch16_be(p) << 8 | p[2]; + case 4: + return fetch32_be(p); + case 5: + return (uint64_t)fetch32_be(p) << 8 | p[4]; + case 6: + return (uint64_t)fetch32_be(p) << 16 | fetch16_be(p + 4); + case 7: + return (uint64_t)fetch32_be(p) << 24 | (uint32_t)fetch16_be(p + 4) << 8 | + p[6]; + case 0: + return fetch64_be(p); +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. */ + case 1: + return p[0]; + case 2: + return p[1] | (uint32_t)p[0] << 8; + case 3: + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + case 4: + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; + case 5: + return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | + (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; + case 6: + return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | + (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; + case 7: + return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | + (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | + (uint64_t)p[0] << 48; + case 0: + return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | + (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | + (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; +#endif + } + unreachable(); +} + +/***************************************************************************/ + +#ifndef rot64 +static __always_inline uint64_t rot64(uint64_t v, unsigned s) { + return (v >> s) | (v << (64 - s)); +} +#endif /* rot64 */ + +#ifndef mul_32x32_64 +static __always_inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { + return a * (uint64_t)b; +} +#endif /* mul_32x32_64 */ + +#ifndef add64carry_first +static __maybe_unused __always_inline unsigned +add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { +#if __has_builtin(__builtin_addcll) + unsigned long long carryout; + *sum = __builtin_addcll(base, addend, 0, &carryout); + return (unsigned)carryout; +#else + *sum = base + addend; + return *sum < addend; +#endif /* __has_builtin(__builtin_addcll) */ +} +#endif /* add64carry_fist */ + +#ifndef add64carry_next +static __maybe_unused __always_inline unsigned +add64carry_next(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { +#if __has_builtin(__builtin_addcll) + unsigned long long carryout; + *sum = __builtin_addcll(base, addend, carry, &carryout); + return (unsigned)carryout; +#else + *sum = base + addend + carry; + return *sum < addend || (carry && *sum == addend); +#endif /* __has_builtin(__builtin_addcll) */ +} +#endif /* add64carry_next */ + +#ifndef add64carry_last +static __maybe_unused __always_inline void +add64carry_last(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { +#if __has_builtin(__builtin_addcll) + unsigned long long carryout; + *sum = __builtin_addcll(base, addend, carry, &carryout); + (void)carryout; +#else + *sum = base + addend + carry; +#endif /* __has_builtin(__builtin_addcll) */ +} +#endif /* add64carry_last */ + +#ifndef mul_64x64_128 +static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a, + uint64_t b, + uint64_t *h) { +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + __uint128_t r = (__uint128_t)a * (__uint128_t)b; + /* modern GCC could nicely optimize this */ + *h = (uint64_t)(r >> 64); + return (uint64_t)r; +#elif defined(mul_64x64_high) + *h = mul_64x64_high(a, b); + return a * b; 
+#else + /* performs 64x64 to 128 bit multiplication */ + const uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); + const uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); + const uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); + const uint64_t hh = mul_32x32_64(a >> 32, b >> 32); + + /* Few simplification are possible here for 32-bit architectures, + * but thus we would lost compatibility with the original 64-bit + * version. Think is very bad idea, because then 32-bit t1ha will + * still (relatively) very slowly and well yet not compatible. */ + uint64_t l; + add64carry_last(add64carry_first(ll, lh << 32, &l), hh, lh >> 32, h); + add64carry_last(add64carry_first(l, hl << 32, &l), *h, hl >> 32, h); + return l; +#endif +} +#endif /* mul_64x64_128() */ + +#ifndef mul_64x64_high +static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a, + uint64_t b) { + uint64_t h; + mul_64x64_128(a, b, &h); + return h; +} +#endif /* mul_64x64_high */ + +/***************************************************************************/ + +/* 'magic' primes */ +static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); +static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); +static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); +static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); +static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); +static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); +static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); + +/* xor high and low parts of full 128-bit product */ +static __maybe_unused __always_inline uint64_t mux64(uint64_t v, + uint64_t prime) { + uint64_t l, h; + l = mul_64x64_128(v, prime, &h); + return l ^ h; +} + +static __always_inline uint64_t final64(uint64_t a, uint64_t b) { + uint64_t x = (a + rot64(b, 41)) * prime_0; + uint64_t y = (rot64(a, 23) + b) * prime_6; + return mux64(x ^ y, prime_5); +} + +static __always_inline void mixup64(uint64_t *__restrict a, + uint64_t *__restrict b, uint64_t v, + uint64_t prime) { + uint64_t h; + *a ^= mul_64x64_128(*b + v, prime, &h); + *b += h; +} + +/***************************************************************************/ + +typedef union t1ha_uint128 { +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + __uint128_t v; +#endif + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint64_t l, h; +#else + uint64_t h, l; +#endif + }; +} t1ha_uint128_t; + +static __always_inline t1ha_uint128_t not128(const t1ha_uint128_t v) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = ~v.v; +#else + r.l = ~v.l; + r.h = ~v.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t left128(const t1ha_uint128_t v, + unsigned s) { + t1ha_uint128_t r; + assert(s < 128); +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = v.v << s; +#else + r.l = (s < 64) ? v.l << s : 0; + r.h = (s < 64) ? (v.h << s) | (s ? v.l >> (64 - s) : 0) : v.l << (s - 64); +#endif + return r; +} + +static __always_inline t1ha_uint128_t right128(const t1ha_uint128_t v, + unsigned s) { + t1ha_uint128_t r; + assert(s < 128); +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = v.v >> s; +#else + r.l = (s < 64) ? (s ? v.h << (64 - s) : 0) | (v.l >> s) : v.h >> (s - 64); + r.h = (s < 64) ? 
v.h >> s : 0; +#endif + return r; +} + +static __always_inline t1ha_uint128_t or128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v | y.v; +#else + r.l = x.l | y.l; + r.h = x.h | y.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t xor128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v ^ y.v; +#else + r.l = x.l ^ y.l; + r.h = x.h ^ y.h; +#endif + return r; +} + +static __always_inline t1ha_uint128_t rot128(t1ha_uint128_t v, unsigned s) { + s &= 127; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + v.v = (v.v << (128 - s)) | (v.v >> s); + return v; +#else + return s ? or128(left128(v, 128 - s), right128(v, s)) : v; +#endif +} + +static __always_inline t1ha_uint128_t add128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v + y.v; +#else + add64carry_last(add64carry_first(x.l, y.l, &r.l), x.h, y.h, &r.h); +#endif + return r; +} + +static __always_inline t1ha_uint128_t mul128(t1ha_uint128_t x, + t1ha_uint128_t y) { + t1ha_uint128_t r; +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + r.v = x.v * y.v; +#else + r.l = mul_64x64_128(x.l, y.l, &r.h); + r.h += x.l * y.h + y.l * x.h; +#endif + return r; +} |
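
The one-shot entry points added above, t1ha2_atonce() and t1ha2_atonce128(), take a buffer, its length and a 64-bit seed; the 128-bit variant returns one 64-bit half of the digest and stores the other through its first argument. A minimal usage sketch, assuming only that the header is reachable as "t1ha.h" (the sample string and seed are arbitrary):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "t1ha.h"

int main(void) {
  const char msg[] = "hello, t1ha2";
  const uint64_t seed = 42;

  /* 64-bit one-shot hash */
  uint64_t h64 = t1ha2_atonce(msg, strlen(msg), seed);

  /* 128-bit one-shot hash: one half is returned, the other half is
   * written through the first argument */
  uint64_t more;
  uint64_t h128 = t1ha2_atonce128(&more, msg, strlen(msg), seed);

  printf("t1ha2_atonce    = %016" PRIx64 "\n", h64);
  printf("t1ha2_atonce128 = %016" PRIx64 " %016" PRIx64 "\n", h128, more);
  return 0;
}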
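
The streaming interface (t1ha2_init(), t1ha2_update(), t1ha2_final()) accepts input in arbitrary chunks and buffers partial 32-byte blocks in the context. Note that it folds the total length in at finalization (the bytes counter in t1ha2_final() above), whereas t1ha2_atonce() mixes the length into the initial state, so the two are not expected to produce identical digests for the same data and seed. A minimal sketch of the streaming flow (seed values are arbitrary):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "t1ha.h"

int main(void) {
  t1ha_context_t ctx;
  const char part1[] = "hello, ";
  const char part2[] = "t1ha2 streaming";

  t1ha2_init(&ctx, /* seed_x */ 1, /* seed_y */ 2);

  /* feed data in arbitrary chunks; partial 32-byte blocks are buffered */
  t1ha2_update(&ctx, part1, strlen(part1));
  t1ha2_update(&ctx, part2, strlen(part2));

  /* pass NULL instead of &more to obtain only the 64-bit result */
  uint64_t more;
  uint64_t hash = t1ha2_final(&ctx, &more);

  printf("t1ha2 streaming = %016" PRIx64 " %016" PRIx64 "\n", hash, more);
  return 0;
}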
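
mul_64x64_128() above prefers __uint128_t or a compiler intrinsic and only falls back to four 32x32 multiplies plus an explicit carry chain. The self-contained sketch below (with a hypothetical helper name, mul_64x64_128_portable, not taken from the source) illustrates the same decomposition a*b = hh*2^64 + (lh + hl)*2^32 + ll and cross-checks it against __uint128_t where that type is available:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical portable helper: 64x64 -> 128-bit multiply without intrinsics */
static uint64_t mul_64x64_128_portable(uint64_t a, uint64_t b, uint64_t *h) {
  const uint64_t ll = (uint64_t)(uint32_t)a * (uint32_t)b;
  const uint64_t lh = (a >> 32) * (uint32_t)b;
  const uint64_t hl = (uint64_t)(uint32_t)a * (b >> 32);
  const uint64_t hh = (a >> 32) * (b >> 32);

  /* low word = ll + (lh << 32) + (hl << 32), carries go into the high word */
  const uint64_t mid = ll + (lh << 32);
  const unsigned c1 = (mid < ll);
  const uint64_t low = mid + (hl << 32);
  const unsigned c2 = (low < mid);

  *h = hh + (lh >> 32) + (hl >> 32) + c1 + c2;
  return low;
}

int main(void) {
  const uint64_t a = UINT64_C(0xDEADBEEFCAFEBABE);
  const uint64_t b = UINT64_C(0x0123456789ABCDEF);
  uint64_t h;
  const uint64_t l = mul_64x64_128_portable(a, b, &h);
#if defined(__SIZEOF_INT128__)
  const __uint128_t r = (__uint128_t)a * b;
  assert(l == (uint64_t)r && h == (uint64_t)(r >> 64));
#endif
  printf("high = %016" PRIx64 ", low = %016" PRIx64 "\n", h, l);
  return 0;
}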
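
tail64_le() and tail64_be() read the trailing 1..8 bytes of the input as a single word, either via the page-boundary-aware 'oneshot' wide read or via the byte-by-byte switch ladders. Ignoring those speed tricks, the little-endian variant is equivalent, on a little-endian host, to copying the remaining bytes into the low end of a zeroed 64-bit word, as in this reference sketch (hypothetical name tail64_le_ref):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* hypothetical reference for tail64_le(): the last 'tail' (1..8) bytes at 'p'
 * interpreted as a little-endian integer, upper bytes zeroed; assumes a
 * little-endian host */
static uint64_t tail64_le_ref(const void *p, size_t tail) {
  uint64_t r = 0;
  assert(tail >= 1 && tail <= 8);
  memcpy(&r, p, tail);
  return r;
}

int main(void) {
  const uint8_t bytes[3] = {0x01, 0x02, 0x03};
  /* three trailing bytes 01 02 03 read little-endian give 0x030201 */
  assert(tail64_le_ref(bytes, 3) == UINT64_C(0x030201));
  return 0;
}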