diff options
Diffstat (limited to 'contrib/mumhash/mum.h')
-rw-r--r-- | contrib/mumhash/mum.h | 76 |
1 files changed, 61 insertions, 15 deletions
diff --git a/contrib/mumhash/mum.h b/contrib/mumhash/mum.h index 1daebf3de..161c5390e 100644 --- a/contrib/mumhash/mum.h +++ b/contrib/mumhash/mum.h @@ -43,12 +43,13 @@ #ifndef __MUM_HASH__ #define __MUM_HASH__ -#include "config.h" #include <stddef.h> #include <stdlib.h> #include <string.h> #include <limits.h> + #ifdef _MSC_VER +typedef unsigned __int16 uint16_t; typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t; #else @@ -57,8 +58,13 @@ typedef unsigned __int64 uint64_t; #ifdef __GNUC__ #define _MUM_ATTRIBUTE_UNUSED __attribute__((unused)) -#define _MUM_OPTIMIZE(opts) RSPAMD_OPTIMIZE(opts) +#ifndef __clang__ +#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts))) +#define _MUM_TARGET(opts) __attribute__((__target__ (opts))) +#else +#define _MUM_OPTIMIZE(opts) #define _MUM_TARGET(opts) +#endif #else #define _MUM_ATTRIBUTE_UNUSED #define _MUM_OPTIMIZE(opts) @@ -145,14 +151,18 @@ _mum (uint64_t v, uint64_t p) { } #if defined(_MSC_VER) +#define _mum_bswap_32(x) _byteswap_uint32_t (x) #define _mum_bswap_64(x) _byteswap_uint64_t (x) #elif defined(__APPLE__) #include <libkern/OSByteOrder.h> +#define _mum_bswap_32(x) OSSwapInt32 (x) #define _mum_bswap_64(x) OSSwapInt64 (x) #elif defined(__GNUC__) +#define _mum_bswap32(x) __builtin_bswap32 (x) #define _mum_bswap64(x) __builtin_bswap64 (x) #else #include <byteswap.h> +#define _mum_bswap32(x) bswap32 (x) #define _mum_bswap64(x) bswap64 (x) #endif @@ -166,6 +176,18 @@ _mum_le (uint64_t v) { #error "Unknown endianess" #endif } + +static inline uint32_t +_mum_le32 (uint32_t v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH) + return v; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return _mum_bswap32 (v); +#else +#error "Unknown endianess" +#endif +} + /* Macro defining how many times the most nested loop in _mum_hash_aligned will be unrolled by the compiler (although it can make an own decision:). Use only a constant here to help a @@ -196,7 +218,7 @@ static inline uint64_t _MUM_OPTIMIZE("unroll-loops") _mum_hash_aligned (uint64_t start, const void *key, size_t len) { uint64_t result = start; const unsigned char *str = (const unsigned char *) key; - union {uint64_t i; unsigned char s[sizeof (uint64_t)];} p; + uint64_t u64; int i; size_t n; @@ -217,9 +239,37 @@ _mum_hash_aligned (uint64_t start, const void *key, size_t len) { for (i = 0; i < n; i++) result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]); len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t); - if (len) { - p.i = 0; memcpy (p.s, str, len); - result ^= _mum (_mum_le (((uint64_t *) p.s)[0]), _mum_tail_prime); + switch (len) { + case 7: + u64 = _mum_le32 (*(uint32_t *) str); + u64 |= (uint64_t) str[4] << 32; + u64 |= (uint64_t) str[5] << 40; + u64 |= (uint64_t) str[6] << 48; + return result ^ _mum (u64, _mum_tail_prime); + case 6: + u64 = _mum_le32 (*(uint32_t *) str); + u64 |= (uint64_t) str[4] << 32; + u64 |= (uint64_t) str[5] << 40; + return result ^ _mum (u64, _mum_tail_prime); + case 5: + u64 = _mum_le32 (*(uint32_t *) str); + u64 |= (uint64_t) str[4] << 32; + return result ^ _mum (u64, _mum_tail_prime); + case 4: + u64 = _mum_le32 (*(uint32_t *) str); + return result ^ _mum (u64, _mum_tail_prime); + case 3: + u64 = str[0]; + u64 |= (uint64_t) str[1] << 8; + u64 |= (uint64_t) str[2] << 16; + return result ^ _mum (u64, _mum_tail_prime); + case 2: + u64 = str[0]; + u64 |= (uint64_t) str[1] << 8; + return result ^ _mum (u64, _mum_tail_prime); + case 1: + u64 = str[0]; + return result ^ _mum (u64, _mum_tail_prime); } return result; } @@ -232,8 +282,8 @@ _mum_final (uint64_t h) { return h; } -#if defined(__x86_64__) && defined(__GNUC__) -#if 0 +#if defined(__x86_64__) && defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 9) && !defined(__clang__) + /* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where it is possible. Although on modern Intel processors MULQ takes 3-cycles vs. 4 for MULX, MULX permits more freedom in insn @@ -243,7 +293,6 @@ _mum_hash_avx2 (const void * key, size_t len, uint64_t seed) { return _mum_final (_mum_hash_aligned (seed + len, key, len)); } #endif -#endif #ifndef _MUM_UNALIGNED_ACCESS #if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \ @@ -283,7 +332,7 @@ _mum_hash_default (const void *key, size_t len, uint64_t seed) { else { while (len != 0) { block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN; - memcpy (buf, str, block_len); + memmove (buf, str, block_len); result = _mum_hash_aligned (result, buf, block_len); len -= block_len; str += block_len; @@ -351,13 +400,11 @@ mum_hash64 (uint64_t key, uint64_t seed) { target endianess and the unroll factor. */ static inline uint64_t mum_hash (const void *key, size_t len, uint64_t seed) { -#if defined(__x86_64__) && defined(__GNUC__) -#if 0 - static int avx2_support = 0; +#if defined(__x86_64__) && defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 9) && !defined(__clang__) + static int avx2_support = 0; if (avx2_support > 0) return _mum_hash_avx2 (key, len, seed); - else if (! avx2_support) { __builtin_cpu_init (); avx2_support = __builtin_cpu_supports ("avx2") ? 1 : -1; @@ -365,7 +412,6 @@ mum_hash (const void *key, size_t len, uint64_t seed) { return _mum_hash_avx2 (key, len, seed); } #endif -#endif return _mum_hash_default (key, len, seed); } |