/*
 * Copyright (c) 2016 Positive Technologies, https://www.ptsecurity.com,
 * Fast Positive Hash.
 *
 * Portions Copyright (c) 2010-2016 Leonid Yuriev <leo@yuriev.ru>,
 * The 1Hippeus project (t1h).
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgement in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit hash function:
 * 1. Created for 64-bit little-endian platforms, predominantly for x86_64,
 *    but it runs without penalties on any 64-bit CPU.
 * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *    and all others that do not use specific hardware tricks.
 * 3. Not suitable for cryptography.
 *
 * ACKNOWLEDGEMENT:
 * t1ha was originally developed by Leonid Yuriev <leo@yuriev.ru>
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA_INCLUDED
#define T1HA_INCLUDED

#include "config.h"

#include <stddef.h>
#include <stdint.h>
#include <string.h> /* for memcpy() */

#ifndef __has_attribute
#define __has_attribute(x) (0)
#endif

#ifndef __has_builtin
#define __has_builtin(x) (0)
#endif

#ifdef BYTE_ORDER
#ifndef __ORDER_LITTLE_ENDIAN__
#define __ORDER_LITTLE_ENDIAN__ LITTLE_ENDIAN
#endif
#ifndef __ORDER_BIG_ENDIAN__
#define __ORDER_BIG_ENDIAN__ BIG_ENDIAN
#endif
#ifndef __BYTE_ORDER__
#define __BYTE_ORDER__ BYTE_ORDER
#endif
#else
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||          \
    !defined(__ORDER_BIG_ENDIAN__)
#define __ORDER_LITTLE_ENDIAN__ 1234
#define __ORDER_BIG_ENDIAN__ 4321
#if defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) ||                       \
    defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(__MIPSEL__) ||  \
    defined(_MIPSEL) || defined(__MIPSEL) || defined(__i386) ||               \
    defined(__x86_64) || defined(_M_IX86) || defined(_M_X64) ||               \
    defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_)
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#elif defined(__BIG_ENDIAN__) || defined(__ARMEB__) ||                        \
    defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(__MIPSEB__) ||  \
    defined(_MIPSEB) || defined(__MIPSEB)
#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__
#else
#error __BYTE_ORDER__ should be defined.
#endif
#endif
#endif
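/* A minimal sanity sketch for the detection above (illustrative only;
 * probe_is_little_endian() is a hypothetical helper, not part of this
 * header). In a test translation unit, a runtime probe should agree with
 * the macro chosen here:
 *
 *   static int probe_is_little_endian(void) {
 *     const uint32_t one = 1;
 *     return *(const unsigned char *)&one == 1;
 *   }
 *   // expected: probe_is_little_endian() ==
 *   //           (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 */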
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ &&                              \
    __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
#error Unsupported byte order.
#endif

#if !defined(UNALIGNED_OK)
#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) ||               \
    defined(_M_X64) || defined(i386) || defined(_X86_) ||                     \
    defined(__i386__) || defined(_X86_64_)
#define UNALIGNED_OK 1
#else
#define UNALIGNED_OK 0
#endif
#endif

#ifndef __GNUC_PREREQ
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
#define __GNUC_PREREQ(maj, min)                                               \
  ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __GNUC_PREREQ(maj, min) 0
#endif
#endif

#if __GNUC_PREREQ(4, 4) || defined(__clang__)

#if defined(__i386__) || defined(__x86_64__)
#include <x86intrin.h>
#endif
#define likely(cond) __builtin_expect(!!(cond), 1)
#define unlikely(cond) __builtin_expect(!!(cond), 0)

#if __GNUC_PREREQ(4, 6) || defined(__clang__)
#define unreachable() __builtin_unreachable()
#else
#define unreachable()                                                         \
  do {                                                                        \
    for (;;)                                                                  \
      ;                                                                       \
  } while (0)
#endif

#define bswap64(v) __builtin_bswap64(v)
#define bswap32(v) __builtin_bswap32(v)
#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
#define bswap16(v) __builtin_bswap16(v)
#endif
#if __GNUC_PREREQ(4, 3) || __has_attribute(unused)
#define maybe_unused __attribute__((unused))
#endif

#elif defined(_MSC_VER)

#include <intrin.h>
#include <stdlib.h>
#define likely(cond) (cond)
#define unlikely(cond) (cond)
#define unreachable() __assume(0)
#define bswap64(v) _byteswap_uint64(v)
#define bswap32(v) _byteswap_ulong(v)
#define bswap16(v) _byteswap_ushort(v)
#define rot64(v, s) _rotr64(v, s)
#define rot32(v, s) _rotr(v, s)

#if defined(_M_ARM64) || defined(_M_X64)
#pragma intrinsic(_umul128)
#define mul_64x64_128(a, b, ph) _umul128(a, b, ph)
#pragma intrinsic(__umulh)
#define mul_64x64_high(a, b) __umulh(a, b)
#endif

#if defined(_M_IX86)
#pragma intrinsic(__emulu)
#define mul_32x32_64(a, b) __emulu(a, b)
#elif defined(_M_ARM)
#define mul_32x32_64(a, b) _arm_umull(a, b)
#endif

#else /* Compiler */

#define likely(cond) (cond)
#define unlikely(cond) (cond)
#define unreachable()                                                         \
  do                                                                          \
    for (;;)                                                                  \
      ;                                                                       \
  while (0)

#endif /* Compiler */

#ifndef bswap64
static __inline uint64_t bswap64(uint64_t v) {
  return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) |
         ((v << 24) & 0x0000ff0000000000ull) |
         ((v << 8) & 0x000000ff00000000ull) |
         ((v >> 8) & 0x00000000ff000000ull) |
         ((v >> 24) & 0x0000000000ff0000ull) |
         ((v >> 40) & 0x000000000000ff00ull);
}
#endif /* bswap64 */

#ifndef bswap32
static __inline uint32_t bswap32(uint32_t v) {
  return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00);
}
#endif /* bswap32 */

#ifndef bswap16
static __inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; }
#endif /* bswap16 */

#ifndef rot64
static __inline uint64_t rot64(uint64_t v, unsigned s) {
  return (v >> s) | (v << (64 - s));
}
#endif /* rot64 */

#ifndef rot32
static __inline uint32_t rot32(uint32_t v, unsigned s) {
  return (v >> s) | (v << (32 - s));
}
#endif /* rot32 */

#ifndef mul_32x32_64
static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) {
  return a * (uint64_t)b;
}
#endif /* mul_32x32_64 */
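/* Worked examples for the helpers above (illustrative values only; the
 * intrinsic-based definitions and the portable C fallbacks compute the same
 * results):
 *
 *   bswap64(0x0123456789abcdefull) == 0xefcdab8967452301ull
 *   rot64(0x0123456789abcdefull, 8) == 0xef0123456789abcdull  // rotate right
 *   mul_32x32_64(0xffffffffu, 0xffffffffu) == 0xfffffffe00000001ull
 */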
/***************************************************************************/

static __inline uint64_t fetch64(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return *(const uint64_t *)v;
#else
  return bswap64(*(const uint64_t *)v);
#endif
}

static __inline uint64_t fetch32(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return *(const uint32_t *)v;
#else
  return bswap32(*(const uint32_t *)v);
#endif
}

static __inline uint64_t fetch16(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return *(const uint16_t *)v;
#else
  return bswap16(*(const uint16_t *)v);
#endif
}

static __inline uint64_t fetch_tail(const void *v, size_t tail) {
  const uint8_t *_ = (const uint8_t *)v;
  switch (tail & 7) {
  case 1:
    return _[0];
  case 2:
    return fetch16(_);
  case 3:
    return fetch16(_) | (_[2] << 16);
  case 4:
    return fetch32(_);
  case 5:
    return fetch32(_) | ((uint64_t)_[4] << 32);
  case 6:
    return fetch32(_) | (fetch16(_ + 4) << 32);
  case 7:
    return fetch32(_) | (fetch16(_ + 4) << 32) | ((uint64_t)_[6] << 48);
  case 0:
    return fetch64(_);
  default:
    unreachable();
  }
}

/* xor-mul-xor mixer */
static __inline uint64_t mix(uint64_t v, uint64_t p) {
  static const unsigned s0 = 41;
  v *= p;
  return v ^ rot64(v, s0);
}

static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) {
  *sum += addend;
  return *sum < addend;
}
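/* Behaviour sketch for the helpers above (illustrative values only):
 *
 *   uint64_t s = UINT64_MAX;
 *   add_with_carry(&s, 1);  // s becomes 0, returns 1 (carry out of 64 bits)
 *   add_with_carry(&s, 7);  // s becomes 7, returns 0 (no carry)
 *
 * fetch_tail() reads a 1..8-byte tail as a little-endian integer, e.g. for
 * the three bytes "abc" it yields 0x636261 ('a' | 'b' << 8 | 'c' << 16),
 * regardless of the host byte order, because fetch16()/fetch32() normalize
 * big-endian loads via bswap.
 */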
/* xor high and low parts of the full 128-bit product */
static __inline uint64_t mux64(uint64_t v, uint64_t p) {
#ifdef __SIZEOF_INT128__
  __uint128_t r = (__uint128_t)v * (__uint128_t)p;
  /* modern GCC can optimize this nicely */
  return r ^ (r >> 64);
#elif defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128
  __uint128 r = (__uint128)v * (__uint128)p;
  return r ^ (r >> 64);
#elif defined(mul_64x64_128)
  uint64_t l, h;
  l = mul_64x64_128(v, p, &h);
  return l ^ h;
#elif defined(mul_64x64_high)
  uint64_t l, h;
  l = v * p;
  h = mul_64x64_high(v, p);
  return l ^ h;
#else
  /* performs the 64x64 to 128 bit multiplication portably */
  uint64_t ll = mul_32x32_64((uint32_t)v, (uint32_t)p);
  uint64_t lh = mul_32x32_64(v >> 32, (uint32_t)p);
  uint64_t hl = mul_32x32_64(p >> 32, (uint32_t)v);
  uint64_t hh = mul_32x32_64(v >> 32, p >> 32) + (lh >> 32) + (hl >> 32) +
                /* A few simplifications are possible here for 32-bit
                 * architectures, but then we would lose compatibility with
                 * the original 64-bit version. That seems like a bad idea,
                 * because a 32-bit t1ha would still be (relatively) slow and
                 * yet no longer compatible. */
                add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32);
  return hh ^ ll;
#endif
}

static uint64_t t1ha(const void *data, size_t len, uint64_t seed) {
  /* 'magic' primes */
  static const uint64_t p0 = 17048867929148541611ull;
  static const uint64_t p1 = 9386433910765580089ull;
  static const uint64_t p2 = 15343884574428479051ull;
  static const uint64_t p3 = 13662985319504319857ull;
  static const uint64_t p4 = 11242949449147999147ull;
  static const uint64_t p5 = 13862205317416547141ull;
  static const uint64_t p6 = 14653293970879851569ull;
  /* rotations */
  static const unsigned s0 = 41;
  static const unsigned s1 = 17;
  static const unsigned s2 = 31;

  uint64_t a = seed;
  uint64_t b = len;

  const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
  uint64_t align[4];

  if (unlikely(len > 32)) {
    uint64_t c = rot64(len, s1) + seed;
    uint64_t d = len ^ rot64(seed, s1);
    const void *detent = (const uint8_t *)data + len - 31;
    do {
      const uint64_t *v = (const uint64_t *)data;
      if (unlikely(need_align))
        v = (const uint64_t *)memcpy(&align, v, 32);

      uint64_t w0 = fetch64(v + 0);
      uint64_t w1 = fetch64(v + 1);
      uint64_t w2 = fetch64(v + 2);
      uint64_t w3 = fetch64(v + 3);

      uint64_t d02 = w0 ^ rot64(w2 + d, s1);
      uint64_t c13 = w1 ^ rot64(w3 + c, s1);
      c += a ^ rot64(w0, s0);
      d -= b ^ rot64(w1, s2);
      a ^= p1 * (d02 + w3);
      b ^= p0 * (c13 + w2);
      data = (const uint64_t *)data + 4;
    } while (likely(data < detent));

    a ^= p6 * (rot64(c, s1) + d);
    b ^= p5 * (c + rot64(d, s1));
    len &= 31;
  }

  const uint64_t *v = (const uint64_t *)data;
  if (unlikely(need_align) && len > 1)
    v = (const uint64_t *)memcpy(&align, v, len);

  /* The cases below fall through intentionally. */
  switch (len) {
  default:
    b += mux64(fetch64(v++), p4);
  case 24:
  case 23:
  case 22:
  case 21:
  case 20:
  case 19:
  case 18:
  case 17:
    a += mux64(fetch64(v++), p3);
  case 16:
  case 15:
  case 14:
  case 13:
  case 12:
  case 11:
  case 10:
  case 9:
    b += mux64(fetch64(v++), p2);
  case 8:
  case 7:
  case 6:
  case 5:
  case 4:
  case 3:
  case 2:
  case 1:
    a += mux64(fetch_tail(v, len), p1);
  case 0:
    return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0);
  }
}

static __inline uint32_t tail32_le(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
  uint32_t r = 0;
  switch (tail & 3) {
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* For most CPUs this code is better when neither copying for alignment
   * nor byte reordering is needed. */
  case 0:
    return fetch32(p);
  case 3:
    r = (uint32_t)p[2] << 16;
  case 2:
    return r + fetch16(p);
  case 1:
    return p[0];
#else
  /* For most CPUs this code is better than copying for alignment
   * and/or byte reordering. */
  case 0:
    r += p[3];
    r <<= 8;
  case 3:
    r += p[2];
    r <<= 8;
  case 2:
    r += p[1];
    r <<= 8;
  case 1:
    return r + p[0];
#endif
  }
  unreachable();
}
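/* Illustrative check for tail32_le() (values worked out by hand): both the
 * unaligned little-endian branch and the byte-by-byte branch build the same
 * little-endian value, e.g. for the bytes "abc" and "abcd":
 *
 *   tail32_le("abc", 3)  == 0x00636261
 *   tail32_le("abcd", 4) == 0x64636261
 */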
static __inline uint32_t tail32_be(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
  switch (tail & 3) {
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* For most CPUs this code is better when neither copying for alignment
   * nor byte reordering is needed. Note that fetch16_be()/fetch32_be() are
   * not defined in this header, so this branch only applies when
   * UNALIGNED_OK is set externally on a big-endian target. */
  case 1:
    return p[0];
  case 2:
    return fetch16_be(p);
  case 3:
    return fetch16_be(p) << 8 | p[2];
  case 0:
    return fetch32_be(p);
#else
  /* For most CPUs this code is better than copying for alignment
   * and/or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return p[1] | (uint32_t)p[0] << 8;
  case 3:
    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
  case 0:
    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
           (uint32_t)p[0] << 24;
#endif
  }
  unreachable();
}

static __inline uint64_t remix32(uint32_t a, uint32_t b) {
  static const uint64_t p0 = 17048867929148541611ull;
  a ^= rot32(b, 13);
  uint64_t l = a | (uint64_t)b << 32;
  l *= p0;
  l ^= l >> 41;
  return l;
}

static __inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v,
                             uint32_t p) {
  uint64_t l = mul_32x32_64(*b + v, p);
  *a ^= (uint32_t)l;
  *b += (uint32_t)(l >> 32);
}

static uint64_t t1ha32(const void *data, size_t len, uint64_t seed) {
  /* 32-bit 'magic' primes */
  static const uint32_t q0 = 0x92D78269;
  static const uint32_t q1 = 0xCA9B4735;
  static const uint32_t q2 = 0xA4ABA1C3;
  static const uint32_t q3 = 0xF6499843;
  static const uint32_t q4 = 0x86F0FD61;
  static const uint32_t q5 = 0xCA2DA6FB;
  static const uint32_t q6 = 0xC4BB3575;
  /* rotations */
  static const unsigned s1 = 17;

  uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed;
  uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32);

  const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK;
  uint32_t align[4];

  if (unlikely(len > 16)) {
    uint32_t c = ~a;
    uint32_t d = rot32(b, 5);
    const void *detent = (const uint8_t *)data + len - 15;
    do {
      const uint32_t *v = (const uint32_t *)data;
      if (unlikely(need_align))
        v = (const uint32_t *)memcpy(&align, v, 16);

      uint32_t w0 = fetch32(v + 0);
      uint32_t w1 = fetch32(v + 1);
      uint32_t w2 = fetch32(v + 2);
      uint32_t w3 = fetch32(v + 3);

      uint32_t c02 = w0 ^ rot32(w2 + c, 11);
      uint32_t d13 = w1 + rot32(w3 + d, s1);
      c ^= rot32(b + w1, 7);
      d ^= rot32(a + w0, 3);
      b = q1 * (c02 + w3);
      a = q0 * (d13 ^ w2);
      data = (const uint32_t *)data + 4;
    } while (likely(data < detent));

    c += a;
    d += b;
    a ^= q6 * (rot32(c, 16) + d);
    b ^= q5 * (c + rot32(d, 16));
    len &= 15;
  }

  const uint8_t *v = (const uint8_t *)data;
  if (unlikely(need_align) && len > 4)
    v = (const uint8_t *)memcpy(&align, v, len);

  /* The cases below fall through intentionally. */
  switch (len) {
  default:
    mixup32(&a, &b, fetch32(v), q4);
    v += 4;
  case 12:
  case 11:
  case 10:
  case 9:
    mixup32(&b, &a, fetch32(v), q3);
    v += 4;
  case 8:
  case 7:
  case 6:
  case 5:
    mixup32(&a, &b, fetch32(v), q2);
    v += 4;
  case 4:
  case 3:
  case 2:
  case 1:
    mixup32(&b, &a, tail32_le(v, len), q1);
  case 0:
    return remix32(a, b);
  }
}

#endif /* T1HA_INCLUDED */
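/* Usage sketch (illustrative only; the key, seed, and variable names below
 * are arbitrary examples, not part of this header):
 *
 *   const char key[] = "example key";
 *   uint64_t h64 = t1ha(key, sizeof(key) - 1, 42);    // 64-bit oriented hash
 *   uint64_t h32 = t1ha32(key, sizeof(key) - 1, 42);  // 32-bit oriented hash
 *
 * Both functions take (data, length in bytes, 64-bit seed) and return a
 * 64-bit hash value; t1ha32() is the variant built from 32-bit operations.
 */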