diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-12-08 13:41:45 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-12-08 13:49:52 +0000 |
commit | 84793e0fe51a1ddd9b3c1be69f50a8251ceb92b7 (patch) | |
tree | 3f9371a72edd2b63e4dc2b116c8c4731108d33f7 /contrib/t1ha | |
parent | 2aff8dd1681d4b8d01e2c4be68243ff5b2b9a0a8 (diff) | |
download | rspamd-84793e0fe51a1ddd9b3c1be69f50a8251ceb92b7.tar.gz rspamd-84793e0fe51a1ddd9b3c1be69f50a8251ceb92b7.zip |
[Feature] Use t1ha instead of metrohash and xxhash32
Diffstat (limited to 'contrib/t1ha')
-rw-r--r-- | contrib/t1ha/t1ha.h | 525 |
1 files changed, 525 insertions, 0 deletions
diff --git a/contrib/t1ha/t1ha.h b/contrib/t1ha/t1ha.h new file mode 100644 index 000000000..7c97fee90 --- /dev/null +++ b/contrib/t1ha/t1ha.h @@ -0,0 +1,525 @@ +/* + * Copyright (c) 2016 Positive Technologies, https://www.ptsecurity.com, + * Fast Positive Hash. + * + * Portions Copyright (c) 2010-2016 Leonid Yuriev <leo@yuriev.ru>, + * The 1Hippeus project (t1h). + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* + * t1ha = { Fast Positive Hash} + * by [Positive Technologies](https://www.ptsecurity.ru) + * + * Briefly, it is a 64-bit Hash Function: + * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, + * but without penalties could runs on any 64-bit CPU. + * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash + * and all others which are not use specific hardware tricks. + * 3. Not suitable for cryptography. + * + * ACKNOWLEDGEMENT: + * The t1ha was originally developed by Leonid Yuriev + * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! + */ + +#ifndef T1HA_INCLUDED +#define T1HA_INCLUDED +#include "config.h" +#include <string.h> + +#ifdef BYTE_ORDER +#ifndef __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ LITTLE_ENDIAN +#endif +#ifndef __ORDER_BIG_ENDIAN__ +#define __ORDER_BIG_ENDIAN__ BIG_ENDIAN +#endif +#ifndef __BYTE_ORDER__ +#define __BYTE_ORDER__ BYTE_ORDER +#endif +#else +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ + !defined(__ORDER_BIG_ENDIAN__) +#define __ORDER_LITTLE_ENDIAN__ 1234 +#define __ORDER_BIG_ENDIAN__ 4321 +#if defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || \ + defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(__MIPSEL__) || \ + defined(_MIPSEL) || defined(__MIPSEL) || defined(__i386) || \ + defined(__x86_64) || defined(_M_IX86) || defined(_M_X64) || \ + defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_) +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#elif defined(__BIG_ENDIAN__) || defined(__ARMEB__) || defined(__THUMBEB__) || \ + defined(__AARCH64EB__) || defined(__MIPSEB__) || defined(_MIPSEB) || \ + defined(__MIPSEB) +#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ +#else +#error __BYTE_ORDER__ should be defined. +#endif +#endif +#endif +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ + __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ +#error Unsupported byte order. +#endif + +#if !defined(UNALIGNED_OK) +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) || \ + defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ + defined(_X86_64_) +#define UNALIGNED_OK 1 +#else +#define UNALIGNED_OK 0 +#endif +#endif + +#if defined(__GNUC__) && (__GNUC__ > 3) + +#if defined(__i386) || defined(__x86_64) +#include <x86intrin.h> +#endif +#define likely(cond) __builtin_expect(!!(cond), 1) +#define unlikely(cond) __builtin_expect(!!(cond), 0) +#define unreachable() __builtin_unreachable() +#define bswap64(v) __builtin_bswap64(v) +#define bswap32(v) __builtin_bswap32(v) +#define bswap16(v) __builtin_bswap16(v) + +#elif defined(_MSC_VER) + +#include <intrin.h> +#include <stdlib.h> +#define likely(cond) (cond) +#define unlikely(cond) (cond) +#define unreachable() __assume(0) +#define bswap64(v) _byteswap_uint64(v) +#define bswap32(v) _byteswap_ulong(v) +#define bswap16(v) _byteswap_ushort(v) +#define rot64(v, s) _rotr64(v, s) +#define rot32(v, s) _rotr(v, s) + +#if defined(_M_ARM64) || defined(_M_X64) +#pragma intrinsic(_umul128) +#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) +#pragma intrinsic(__umulh) +#define mul_64x64_high(a, b) __umulh(a, b) +#endif + +#if defined(_M_IX86) +#pragma intrinsic(__emulu) +#define mul_32x32_64(a, b) __emulu(a, b) +#elif defined(_M_ARM) +#define mul_32x32_64(a, b) _arm_umull(a, b) +#endif + +#else /* Compiler */ + +#define likely(cond) (cond) +#define unlikely(cond) (cond) +#define unreachable() \ + do \ + for (;;) \ + ; \ + while (0) +#endif /* Compiler */ + +#ifndef bswap64 +static __inline uint64_t bswap64(uint64_t v) { + return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) | + ((v << 24) & 0x0000ff0000000000ull) | + ((v << 8) & 0x000000ff00000000ull) | + ((v >> 8) & 0x00000000ff000000ull) | + ((v >> 24) & 0x0000000000ff0000ull) | + ((v >> 40) & 0x000000000000ff00ull); +} +#endif /* bswap64 */ + +#ifndef bswap32 +static __inline uint32_t bswap32(uint32_t v) { + return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00); +} +#endif /* bswap32 */ + +#ifndef bswap16 +static __inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } +#endif /* bswap16 */ + +#ifndef rot64 +static __inline uint64_t rot64(uint64_t v, unsigned s) { + return (v >> s) | (v << (64 - s)); +} +#endif /* rot64 */ + +#ifndef rot32 +static __inline uint32_t rot32(uint32_t v, unsigned s) { + return (v >> s) | (v << (32 - s)); +} +#endif /* rot32 */ + +#ifndef mul_32x32_64 +static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { + return a * (uint64_t)b; +} +#endif /* mul_32x32_64 */ + +/***************************************************************************/ + +static __inline uint64_t fetch64(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *(const uint64_t *)v; +#else + return bswap64(*(const uint64_t *)v); +#endif +} + +static __inline uint64_t fetch32(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *(const uint32_t *)v; +#else + return bswap32(*(const uint32_t *)v); +#endif +} + +static __inline uint64_t fetch16(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return *(const uint16_t *)v; +#else + return bswap16(*(const uint16_t *)v); +#endif +} + +static __inline uint64_t fetch_tail(const void *v, size_t tail) { + const uint8_t *_ = (const uint8_t *)v; + switch (tail & 7) { + case 1: + return _[0]; + case 2: + return fetch16(_); + case 3: + return fetch16(_) | (_[2] << 16); + case 4: + return fetch32(_); + case 5: + return fetch32(_) | ((uint64_t)_[4] << 32); + case 6: + return fetch32(_) | (fetch16(_ + 4) << 32); + case 7: + return fetch32(_) | (fetch16(_ + 4) << 32) | ((uint64_t)_[6] << 48); + case 0: + return fetch64(_); + default: + unreachable(); + } +} + +/* xor-mul-xor mixer */ +static __inline uint64_t mix(uint64_t v, uint64_t p) { + static const unsigned s0 = 41; + v *= p; + return v ^ rot64(v, s0); +} + +static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) { + *sum += addend; + return *sum < addend; +} + +/* xor high and low parts of full 128-bit product */ +static __inline uint64_t mux64(uint64_t v, uint64_t p) { +#ifdef __SIZEOF_INT128__ + __uint128_t r = (__uint128_t)v * (__uint128_t)p; + /* modern GCC could nicely optimize this */ + return r ^ (r >> 64); +#elif defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128 + __uint128 r = (__uint128)v * (__uint128)p; + return r ^ (r >> 64); +#elif defined(mul_64x64_128) + uint64_t l, h; + l = mul_64x64_128(v, p, &h); + return l ^ h; +#elif defined(mul_64x64_high) + uint64_t l, h; + l = v * p; + h = mul_64x64_high(v, p); + return l ^ h; +#else + /* performs 64x64 to 128 bit multiplication */ + uint64_t ll = mul_32x32_64((uint32_t)v, (uint32_t)p); + uint64_t lh = mul_32x32_64(v >> 32, (uint32_t)p); + uint64_t hl = mul_32x32_64(p >> 32, (uint32_t)v); + uint64_t hh = + mul_32x32_64(v >> 32, p >> 32) + (lh >> 32) + (hl >> 32) + + /* Few simplification are possible here for 32-bit architectures, + * but thus we would lost compatibility with the original 64-bit + * version. Think is very bad idea, because then 32-bit t1ha will + * still (relatively) very slowly and well yet not compatible. */ + add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32); + return hh ^ ll; +#endif +} + +static uint64_t +t1ha(const void *data, size_t len, uint64_t seed) +{ + /* 'magic' primes */ + static const uint64_t p0 = 17048867929148541611ull; + static const uint64_t p1 = 9386433910765580089ull; + static const uint64_t p2 = 15343884574428479051ull; + static const uint64_t p3 = 13662985319504319857ull; + static const uint64_t p4 = 11242949449147999147ull; + static const uint64_t p5 = 13862205317416547141ull; + static const uint64_t p6 = 14653293970879851569ull; + /* rotations */ + static const unsigned s0 = 41; + static const unsigned s1 = 17; + static const unsigned s2 = 31; + + uint64_t a = seed; + uint64_t b = len; + + const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + uint64_t align[4]; + + if (unlikely(len > 32)) { + uint64_t c = rot64(len, s1) + seed; + uint64_t d = len ^ rot64(seed, s1); + const void *detent = (const uint8_t *)data + len - 31; + do { + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_align)) + v = (const uint64_t *)memcpy(&align, v, 32); + + uint64_t w0 = fetch64(v + 0); + uint64_t w1 = fetch64(v + 1); + uint64_t w2 = fetch64(v + 2); + uint64_t w3 = fetch64(v + 3); + + uint64_t d02 = w0 ^ rot64(w2 + d, s1); + uint64_t c13 = w1 ^ rot64(w3 + c, s1); + c += a ^ rot64(w0, s0); + d -= b ^ rot64(w1, s2); + a ^= p1 * (d02 + w3); + b ^= p0 * (c13 + w2); + data = (const uint64_t *)data + 4; + } while (likely(data < detent)); + + a ^= p6 * (rot64(c, s1) + d); + b ^= p5 * (c + rot64(d, s1)); + len &= 31; + } + + const uint64_t *v = (const uint64_t *)data; + if (unlikely(need_align) && len > 1) + v = (const uint64_t *)memcpy(&align, v, len); + + switch (len) { + default: + b += mux64(fetch64(v++), p4); + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + a += mux64(fetch64(v++), p3); + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + b += mux64(fetch64(v++), p2); + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + a += mux64(fetch_tail(v, len), p1); + case 0: + return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0); + } +} + +static __inline uint32_t tail32_le(const void *v, size_t tail) { + const uint8_t *p = (const uint8_t *)v; + uint32_t r = 0; + switch (tail & 3) { +#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. */ + case 0: + return fetch32(p); + case 3: + r = (uint32_t)p[2] << 16; + case 2: + return r + fetch16(p); + case 1: + return p[0]; +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. */ + case 0: + r += p[3]; + r <<= 8; + case 3: + r += p[2]; + r <<= 8; + case 2: + r += p[1]; + r <<= 8; + case 1: + return r + p[0]; +#endif + } + unreachable(); +} + +static __inline uint32_t tail32_be(const void *v, size_t tail) { + const uint8_t *p = (const uint8_t *)v; + switch (tail & 3) { +#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. */ + case 1: + return p[0]; + case 2: + return fetch16_be(p); + case 3: + return fetch16_be(p) << 8 | p[2]; + case 0: + return fetch32_be(p); +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. */ + case 1: + return p[0]; + case 2: + return p[1] | (uint32_t)p[0] << 8; + case 3: + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + case 0: + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; +#endif + } + unreachable(); +} + +static __inline uint64_t remix32(uint32_t a, uint32_t b) { + static const uint64_t p0 = 17048867929148541611ull; + a ^= rot32(b, 13); + uint64_t l = a | (uint64_t)b << 32; + l *= p0; + l ^= l >> 41; + return l; +} + +static __inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, uint32_t p) { + uint64_t l = mul_32x32_64(*b + v, p); + *a ^= (uint32_t)l; + *b += (uint32_t)(l >> 32); +} + +static uint64_t t1ha32(const void *data, size_t len, uint64_t seed) { + /* 32-bit 'magic' primes */ + static const uint32_t q0 = 0x92D78269; + static const uint32_t q1 = 0xCA9B4735; + static const uint32_t q2 = 0xA4ABA1C3; + static const uint32_t q3 = 0xF6499843; + static const uint32_t q4 = 0x86F0FD61; + static const uint32_t q5 = 0xCA2DA6FB; + static const uint32_t q6 = 0xC4BB3575; + /* rotations */ + static const unsigned s1 = 17; + + uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed; + uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); + + const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK; + uint32_t align[4]; + + if (unlikely(len > 16)) { + uint32_t c = ~a; + uint32_t d = rot32(b, 5); + const void *detent = (const uint8_t *)data + len - 15; + do { + const uint32_t *v = (const uint32_t *)data; + if (unlikely(need_align)) + v = (const uint32_t *)memcpy(&align, v, 16); + + uint32_t w0 = fetch32(v + 0); + uint32_t w1 = fetch32(v + 1); + uint32_t w2 = fetch32(v + 2); + uint32_t w3 = fetch32(v + 3); + + uint32_t c02 = w0 ^ rot32(w2 + c, 11); + uint32_t d13 = w1 + rot32(w3 + d, s1); + c ^= rot32(b + w1, 7); + d ^= rot32(a + w0, 3); + b = q1 * (c02 + w3); + a = q0 * (d13 ^ w2); + + data = (const uint32_t *)data + 4; + } while (likely(data < detent)); + + c += a; + d += b; + a ^= q6 * (rot32(c, 16) + d); + b ^= q5 * (c + rot32(d, 16)); + + len &= 15; + } + + const uint8_t *v = (const uint8_t *)data; + if (unlikely(need_align) && len > 4) + v = (const uint8_t *)memcpy(&align, v, len); + + switch (len) { + default: + mixup32(&a, &b, fetch32(v), q4); + v += 4; + case 12: + case 11: + case 10: + case 9: + mixup32(&b, &a, fetch32(v), q3); + v += 4; + case 8: + case 7: + case 6: + case 5: + mixup32(&a, &b, fetch32(v), q2); + v += 4; + case 4: + case 3: + case 2: + case 1: + mixup32(&b, &a, tail32_le(v, len), q1); + case 0: + return remix32(a, b); + } +} + +#endif |