From b6156175b6e0e9a1d8063571f81c428f4aac570f Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Tue, 22 May 2018 14:15:47 +0100
Subject: [PATCH] [Minor] Backport fixes from t1ha

---
 contrib/t1ha/t1ha1.c     |  51 +++++----
 contrib/t1ha/t1ha2.c     |  95 ++++++++--------
 contrib/t1ha/t1ha_bits.h | 226 +++++++++++++++++++++++++--------------
 3 files changed, 221 insertions(+), 151 deletions(-)

diff --git a/contrib/t1ha/t1ha1.c b/contrib/t1ha/t1ha1.c
index 956f7e24e..6e25d37f4 100644
--- a/contrib/t1ha/t1ha1.c
+++ b/contrib/t1ha/t1ha1.c
@@ -58,21 +58,21 @@ static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) {
   return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0);
 }
 
-/* TODO C++ template in the next version */
-#define T1HA1_BODY(ENDIANNES, ALIGNESS, DOCOPY) \
+/* TODO: C++ template in the next version */
+#define T1HA1_BODY(ENDIANNES, ALIGNESS) \
+  const uint64_t *v = (const uint64_t *)data; \
   if (unlikely(len > 32)) { \
     uint64_t c = rot64(len, 17) + seed; \
     uint64_t d = len ^ rot64(seed, 17); \
-    const void *detent = (const uint8_t *)data + len - 31; \
+    const uint64_t *detent = \
+        (const uint64_t *)((const uint8_t *)data + len - 31); \
     do { \
-      const uint64_t *v = (const uint64_t *)data; \
-      if (DOCOPY) \
-        memcpy((void *)(v = align), data, 32); \
-      \
       const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \
       const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \
       const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \
       const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \
+      v += 4; \
+      prefetch(v); \
       \
       const uint64_t d02 = w0 ^ rot64(w2 + d, 17); \
       const uint64_t c13 = w1 ^ rot64(w3 + c, 17); \
@@ -80,18 +80,13 @@ static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) {
       d -= b ^ rot64(w1, 31); \
       a ^= prime_1 * (d02 + w3); \
       b ^= prime_0 * (c13 + w2); \
-      data = (const uint64_t *)data + 4; \
-    } while (likely(data < detent)); \
+    } while (likely(v < detent)); \
       \
     a ^= prime_6 * (rot64(c, 17) + d); \
     b ^= prime_5 * (c + rot64(d, 17)); \
     len &= 31; \
   } \
   \
-  const uint64_t *v = (const uint64_t *)data; \
-  if (unlikely(need_copy4align) && len > 8) \
-    memcpy((void *)(v = align), data, len); \
-  \
   switch (len) { \
   default: \
     b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_4); \
@@ -134,26 +129,30 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) {
   uint64_t a = seed;
   uint64_t b = len;
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  uint64_t align[4];
-  if (need_copy4align) {
-    T1HA1_BODY(le, aligned, true);
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  T1HA1_BODY(le, unaligned);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
+    T1HA1_BODY(le, unaligned);
   } else {
-    T1HA1_BODY(le, unaligned, false);
+    T1HA1_BODY(le, aligned);
   }
+#endif
 }
 
 uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) {
   uint64_t a = seed;
   uint64_t b = len;
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  uint64_t align[4];
-  if (need_copy4align) {
-    T1HA1_BODY(be, aligned, true);
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  T1HA1_BODY(be, unaligned);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
+    T1HA1_BODY(be, unaligned);
   } else {
-    T1HA1_BODY(be, unaligned, false);
+    T1HA1_BODY(be, aligned);
   }
-}
\ No newline at end of file
+#endif
+}
diff --git a/contrib/t1ha/t1ha2.c b/contrib/t1ha/t1ha2.c
index 95f646da4..4cb5281ee 100644
--- a/contrib/t1ha/t1ha2.c
+++ b/contrib/t1ha/t1ha2.c
@@ -56,7 +56,7 @@ static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x,
   s->n.d = ~y + rot64(x, 19);
 }
 
-/* TODO C++ template in the next version */
+/* TODO: C++ template in the next version */
 #define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v) \
   do { \
     t1ha_state256_t *const s = state; \
@@ -67,8 +67,8 @@ static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x,
     \
     const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); \
     const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); \
-    s->n.d ^= s->n.b + rot64(w1, 38); \
     s->n.c ^= s->n.a + rot64(w0, 57); \
+    s->n.d ^= s->n.b + rot64(w1, 38); \
     s->n.b ^= prime_6 * (c13 + w2); \
     s->n.a ^= prime_5 * (d02 + w3); \
   } while (0)
@@ -78,26 +78,23 @@ static __always_inline void squash(t1ha_state256_t *s) {
   s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d);
 }
 
-/* TODO C++ template in the next version */
-#define T1HA2_LOOP(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
+/* TODO: C++ template in the next version */
+#define T1HA2_LOOP(ENDIANNES, ALIGNESS, state, data, len) \
   do { \
     const void *detent = (const uint8_t *)data + len - 31; \
     do { \
       const uint64_t *v = (const uint64_t *)data; \
-      if (BUFFER4COPY != NULL) \
-        memcpy((void *)(v = BUFFER4COPY), data, 32); \
-      T1HA2_UPDATE(le, unaligned, state, v); \
       data = (const uint64_t *)data + 4; \
+      prefetch(data); \
+      T1HA2_UPDATE(le, ALIGNESS, state, v); \
    } while (likely(data < detent)); \
  } while (0)
 
-/* TODO C++ template in the next version */
-#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
+/* TODO: C++ template in the next version */
+#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, state, data, len) \
   do { \
     t1ha_state256_t *const s = state; \
     const uint64_t *v = (const uint64_t *)data; \
-    if (BUFFER4COPY != NULL) \
-      memcpy((void *)(v = BUFFER4COPY), data, len); \
     switch (len) { \
     default: \
       mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
@@ -141,13 +138,11 @@ static __always_inline void squash(t1ha_state256_t *s) {
     } \
   } while (0)
 
-/* TODO C++ template in the next version */
-#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
+/* TODO: C++ template in the next version */
+#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, state, data, len) \
   do { \
     t1ha_state256_t *const s = state; \
     const uint64_t *v = (const uint64_t *)data; \
-    if (BUFFER4COPY != NULL) \
-      memcpy((void *)(v = BUFFER4COPY), data, len); \
     switch (len) { \
     default: \
       mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
@@ -207,26 +202,34 @@ uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) {
   t1ha_state256_t state;
   init_ab(&state, seed, length);
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  if (need_copy4align) {
-    uint64_t buffer4align[4];
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  if (unlikely(length > 32)) {
+    init_cd(&state, seed, length);
+    T1HA2_LOOP(le, unaligned, &state, data, length);
+    squash(&state);
+    length &= 31;
+  }
+  T1HA2_TAIL_AB(le, unaligned, &state, data, length);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
     if (unlikely(length > 32)) {
       init_cd(&state, seed, length);
-      T1HA2_LOOP(le, aligned, buffer4align, &state, data, length);
+      T1HA2_LOOP(le, unaligned, &state, data, length);
       squash(&state);
       length &= 31;
     }
-    T1HA2_TAIL_AB(le, aligned, buffer4align, &state, data, length);
+    T1HA2_TAIL_AB(le, unaligned, &state, data, length);
   } else {
     if (unlikely(length > 32)) {
       init_cd(&state, seed, length);
-      T1HA2_LOOP(le, unaligned, NULL, &state, data, length);
+      T1HA2_LOOP(le, aligned, &state, data, length);
       squash(&state);
       length &= 31;
     }
-    T1HA2_TAIL_AB(le, unaligned, NULL, &state, data, length);
+    T1HA2_TAIL_AB(le, aligned, &state, data, length);
   }
+#endif
 }
 
 uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
@@ -236,22 +239,28 @@ uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
                          const void *__restrict data, size_t length, uint64_t seed) {
   t1ha_state256_t state;
   init_ab(&state, seed, length);
   init_cd(&state, seed, length);
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  if (need_copy4align) {
-    uint64_t buffer4align[4];
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  if (unlikely(length > 32)) {
+    T1HA2_LOOP(le, unaligned, &state, data, length);
+    length &= 31;
+  }
+  T1HA2_TAIL_ABCD(le, unaligned, &state, data, length);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
    if (unlikely(length > 32)) {
-      T1HA2_LOOP(le, aligned, buffer4align, &state, data, length);
+      T1HA2_LOOP(le, unaligned, &state, data, length);
      length &= 31;
    }
-    T1HA2_TAIL_ABCD(le, aligned, buffer4align, &state, data, length);
+    T1HA2_TAIL_ABCD(le, unaligned, &state, data, length);
  } else {
    if (unlikely(length > 32)) {
-      T1HA2_LOOP(le, unaligned, NULL, &state, data, length);
+      T1HA2_LOOP(le, aligned, &state, data, length);
      length &= 31;
    }
-    T1HA2_TAIL_ABCD(le, unaligned, NULL, &state, data, length);
+    T1HA2_TAIL_ABCD(le, aligned, &state, data, length);
  }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -283,13 +292,16 @@ void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data,
   }
 
   if (length >= 32) {
-    const bool need_copy4align =
-        (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-    if (need_copy4align) {
-      T1HA2_LOOP(le, aligned, ctx->buffer.u64, &ctx->state, data, length);
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+    T1HA2_LOOP(le, unaligned, &ctx->state, data, length);
+#else
+    const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+    if (misaligned) {
+      T1HA2_LOOP(le, unaligned, &ctx->state, data, length);
    } else {
-      T1HA2_LOOP(le, unaligned, NULL, &ctx->state, data, length);
+      T1HA2_LOOP(le, aligned, &ctx->state, data, length);
    }
+#endif
    length &= 31;
  }
 
@@ -307,13 +319,8 @@ uint64_t t1ha2_final(t1ha_context_t *__restrict ctx,
 
   if (likely(!extra_result)) {
     squash(&ctx->state);
-    T1HA2_TAIL_AB(le, aligned, NULL, &ctx->state, ctx->buffer.u64,
-                  ctx->partial);
-    return final64(ctx->state.n.a, ctx->state.n.b);
+    T1HA2_TAIL_AB(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial);
   }
-  T1HA2_TAIL_ABCD(le, aligned, NULL, &ctx->state, ctx->buffer.u64,
-                  ctx->partial);
-  return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c,
-                  ctx->state.n.d, extra_result);
-}
\ No newline at end of file
+  T1HA2_TAIL_ABCD(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial);
+}
diff --git a/contrib/t1ha/t1ha_bits.h b/contrib/t1ha/t1ha_bits.h
index e3815a4e7..454e43aed 100644
--- a/contrib/t1ha/t1ha_bits.h
+++ b/contrib/t1ha/t1ha_bits.h
@@ -72,19 +72,23 @@
 #error Unsupported byte order.
 #endif
 
-#if !defined(UNALIGNED_OK)
-#if (defined(__ia32__) || defined(__e2k__) || \
-     defined(__ARM_FEATURE_UNALIGNED)) && \
-    !defined(__ALIGNED__)
-#define UNALIGNED_OK 1
+#define T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE 0
+#define T1HA_CONFIG_UNALIGNED_ACCESS__SLOW 1
+#define T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT 2
+
+#ifndef T1HA_CONFIG_UNALIGNED_ACCESS
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+#elif defined(__ia32__)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+#elif defined(__e2k__)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__SLOW
+#elif defined(__ARM_FEATURE_UNALIGNED)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+#else
-#define UNALIGNED_OK 0
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
 #endif
-#endif /* UNALIGNED_OK */
-
-#if UNALIGNED_OK && !defined(PAGESIZE)
-#define PAGESIZE 4096
-#endif /* PAGESIZE */
+#endif /* T1HA_CONFIG_UNALIGNED_ACCESS */
 
 #define ALIGNMENT_16 2
 #define ALIGNMENT_32 4
@@ -94,6 +98,10 @@
 #define ALIGNMENT_64 4
 #endif
 
+#ifndef PAGESIZE
+#define PAGESIZE 4096
+#endif /* PAGESIZE */
+
 /***************************************************************************/
 
 #ifndef __has_builtin
@@ -118,6 +126,13 @@
 #if __GNUC_PREREQ(4, 4) || defined(__clang__)
 
+#if defined(__ia32__) || defined(__e2k__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__ia32__) && !defined(__cpuid_count)
+#include <cpuid.h>
+#endif
 
 #if defined(__e2k__)
 #include <e2kbuiltin.h>
@@ -393,10 +408,10 @@ typedef struct {
 #endif /* read_unaligned */
 
 #ifndef read_aligned
-#if __has_builtin(assume_aligned)
+#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned)
 #define read_aligned(ptr, bits) \
   (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits))
-#elif __has_attribute(aligned) && !defined(__clang__)
+#elif (__GNUC_PREREQ(3, 3) || __has_attribute(aligned)) && !defined(__clang__)
 #define read_aligned(ptr, bits) \
   (*(const uint##bits##_t __attribute__((aligned(ALIGNMENT_##bits))) *)(ptr))
 #elif __has_attribute(assume_aligned)
@@ -427,6 +442,20 @@ static __always_inline const
 #endif
 #endif /* read_aligned */
 
+#ifndef prefetch
+#if (__GNUC_PREREQ(4, 0) || __has_builtin(__builtin_prefetch)) && \
+    !defined(__ia32__)
+#define prefetch(ptr) __builtin_prefetch(ptr)
+#elif defined(_M_ARM64) || defined(_M_ARM)
+#define prefetch(ptr) __prefetch(ptr)
+#else
+#define prefetch(ptr) \
+  do { \
+    (void)(ptr); \
+  } while (0)
+#endif
+#endif /* prefetch */
+
 #if __has_warning("-Wconstant-logical-operand")
 #if defined(__clang__)
 #pragma clang diagnostic ignored "-Wconstant-logical-operand"
@@ -451,28 +480,33 @@ static __always_inline const
 /*---------------------------------------------------------- Little Endian */
 
-#ifndef fetch64_le_aligned
-static __always_inline uint64_t fetch64_le_aligned(const void *v) {
+#ifndef fetch16_le_aligned
+static __always_inline uint16_t fetch16_le_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_16 == 0);
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_aligned(v, 64);
+  return read_aligned(v, 16);
 #else
-  return bswap64(read_aligned(v, 64));
+  return bswap16(read_aligned(v, 16));
 #endif
 }
-#endif /* fetch64_le_aligned */
+#endif /* fetch16_le_aligned */
 
-#ifndef fetch64_le_unaligned
-static __always_inline uint64_t fetch64_le_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_unaligned(v, 64);
+#ifndef fetch16_le_unaligned
+static __always_inline uint16_t fetch16_le_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  const uint8_t *p = (const uint8_t *)v;
+  return p[0] | (uint16_t)p[1] << 8;
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return read_unaligned(v, 16);
 #else
-  return bswap64(read_unaligned(v, 64));
+  return bswap16(read_unaligned(v, 16));
 #endif
 }
-#endif /* fetch64_le_unaligned */
+#endif /* fetch16_le_unaligned */
 
 #ifndef fetch32_le_aligned
 static __always_inline uint32_t fetch32_le_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_32 == 0);
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   return read_aligned(v, 32);
 #else
@@ -483,7 +517,10 @@ static __always_inline uint32_t fetch32_le_aligned(const void *v) {
 
 #ifndef fetch32_le_unaligned
 static __always_inline uint32_t fetch32_le_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return fetch16_le_unaligned(v) |
+         (uint32_t)fetch16_le_unaligned((const uint8_t *)v + 2) << 16;
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   return read_unaligned(v, 32);
 #else
   return bswap32(read_unaligned(v, 32));
@@ -491,25 +528,29 @@ static __always_inline uint32_t fetch32_le_unaligned(const void *v) {
 }
 #endif /* fetch32_le_unaligned */
 
-#ifndef fetch16_le_aligned
-static __always_inline uint16_t fetch16_le_aligned(const void *v) {
+#ifndef fetch64_le_aligned
+static __always_inline uint64_t fetch64_le_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_64 == 0);
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_aligned(v, 16);
+  return read_aligned(v, 64);
 #else
-  return bswap16(read_aligned(v, 16));
+  return bswap64(read_aligned(v, 64));
 #endif
 }
-#endif /* fetch16_le_aligned */
+#endif /* fetch64_le_aligned */
 
-#ifndef fetch16_le_unaligned
-static __always_inline uint16_t fetch16_le_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_unaligned(v, 16);
+#ifndef fetch64_le_unaligned
+static __always_inline uint64_t fetch64_le_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return fetch32_le_unaligned(v) |
+         (uint64_t)fetch32_le_unaligned((const uint8_t *)v + 4) << 32;
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return read_unaligned(v, 64);
 #else
-  return bswap16(read_unaligned(v, 16));
+  return bswap64(read_unaligned(v, 64));
 #endif
 }
-#endif /* fetch16_le_unaligned */
+#endif /* fetch64_le_unaligned */
 
 static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
   const uint8_t *const p = (const uint8_t *)v;
@@ -517,10 +558,12 @@ static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
   /* We can perform a 'oneshot' read, which is little bit faster. */
   const unsigned shift = ((8 - tail) & 7) << 3;
   return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift);
-#endif /* 'oneshot' read */
-
+#else
   uint64_t r = 0;
   switch (tail & 7) {
+  default:
+    unreachable();
+/* fall through */
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   /* For most CPUs this code is better when not needed byte reordering. */
   case 0:
@@ -577,14 +620,15 @@ static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
     return r + p[0];
 #endif
   }
-  unreachable();
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
 }
 
-#if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \
-    !defined(__SANITIZE_ADDRESS__) && !defined(__sun)
+#if T1HA_USE_FAST_ONESHOT_READ && \
+    T1HA_CONFIG_UNALIGNED_ACCESS != T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE && \
+    defined(PAGESIZE) && !defined(__sun) && !defined(__SANITIZE_ADDRESS__)
 #define can_read_underside(ptr, size) \
   ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0)
-#endif /* can_fast_read */
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
 
 static __always_inline uint64_t tail64_le_unaligned(const void *v,
                                                     size_t tail) {
@@ -600,11 +644,14 @@ static __always_inline uint64_t tail64_le_unaligned(const void *v,
     return fetch64_le_unaligned(p) >> shift;
   }
   return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift);
-#endif /* 'oneshot' read */
-
+#else
   uint64_t r = 0;
   switch (tail & 7) {
-#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  default:
+    unreachable();
+/* fall through */
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT && \
+    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   /* For most CPUs this code is better when not needed
    * copying for alignment or byte reordering. */
   case 0:
@@ -663,36 +710,41 @@ static __always_inline uint64_t tail64_le_unaligned(const void *v,
     return r + p[0];
 #endif
   }
-  unreachable();
+#endif /* can_read_underside */
 }
 
 /*------------------------------------------------------------- Big Endian */
 
-#ifndef fetch64_be_aligned
-static __maybe_unused __always_inline uint64_t
-fetch64_be_aligned(const void *v) {
+#ifndef fetch16_be_aligned
+static __maybe_unused __always_inline uint16_t
+fetch16_be_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_16 == 0);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_aligned(v, 64);
+  return read_aligned(v, 16);
 #else
-  return bswap64(read_aligned(v, 64));
+  return bswap16(read_aligned(v, 16));
 #endif
 }
-#endif /* fetch64_be_aligned */
+#endif /* fetch16_be_aligned */
 
-#ifndef fetch64_be_unaligned
-static __maybe_unused __always_inline uint64_t
-fetch64_be_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_unaligned(v, 64);
+#ifndef fetch16_be_unaligned
+static __maybe_unused __always_inline uint16_t
+fetch16_be_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  const uint8_t *p = (const uint8_t *)v;
+  return (uint16_t)p[0] << 8 | p[1];
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return read_unaligned(v, 16);
 #else
-  return bswap64(read_unaligned(v, 64));
+  return bswap16(read_unaligned(v, 16));
 #endif
 }
-#endif /* fetch64_be_unaligned */
+#endif /* fetch16_be_unaligned */
 
 #ifndef fetch32_be_aligned
 static __maybe_unused __always_inline uint32_t
 fetch32_be_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_32 == 0);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   return read_aligned(v, 32);
 #else
@@ -704,7 +756,10 @@ fetch32_be_aligned(const void *v) {
 #ifndef fetch32_be_unaligned
 static __maybe_unused __always_inline uint32_t
 fetch32_be_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return (uint32_t)fetch16_be_unaligned(v) << 16 |
+         fetch16_be_unaligned((const uint8_t *)v + 2);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return read_unaligned(v, 32);
 #else
   return bswap32(read_unaligned(v, 32));
@@ -712,27 +767,31 @@ fetch32_be_unaligned(const void *v) {
 }
 #endif /* fetch32_be_unaligned */
 
-#ifndef fetch16_be_aligned
-static __maybe_unused __always_inline uint16_t
-fetch16_be_aligned(const void *v) {
+#ifndef fetch64_be_aligned
+static __maybe_unused __always_inline uint64_t
+fetch64_be_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_64 == 0);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_aligned(v, 16);
+  return read_aligned(v, 64);
 #else
-  return bswap16(read_aligned(v, 16));
+  return bswap64(read_aligned(v, 64));
 #endif
 }
-#endif /* fetch16_be_aligned */
+#endif /* fetch64_be_aligned */
 
-#ifndef fetch16_be_unaligned
-static __maybe_unused __always_inline uint16_t
-fetch16_be_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_unaligned(v, 16);
+#ifndef fetch64_be_unaligned
+static __maybe_unused __always_inline uint64_t
+fetch64_be_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return (uint64_t)fetch32_be_unaligned(v) << 32 |
+         fetch32_be_unaligned((const uint8_t *)v + 4);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return read_unaligned(v, 64);
 #else
-  return bswap16(read_unaligned(v, 16));
+  return bswap64(read_unaligned(v, 64));
 #endif
 }
-#endif /* fetch16_be_unaligned */
+#endif /* fetch64_be_unaligned */
 
 static __maybe_unused __always_inline uint64_t
 tail64_be_aligned(const void *v, size_t tail) {
@@ -741,9 +800,11 @@ static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
   /* We can perform a 'oneshot' read, which is little bit faster. */
   const unsigned shift = ((8 - tail) & 7) << 3;
   return fetch64_be_aligned(p) >> shift;
-#endif /* 'oneshot' read */
-
+#else
   switch (tail & 7) {
+  default:
+    unreachable();
+/* fall through */
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   /* For most CPUs this code is better when not byte reordering. */
   case 1:
@@ -762,7 +823,7 @@ static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
            (uint64_t)fetch32_be_aligned(p) << 24 |
            (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6];
   case 0:
-    return fetch64_be(p);
+    return fetch64_be_aligned(p);
 #else
   case 1:
     return p[0];
@@ -789,7 +850,7 @@ static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
            (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
 #endif
   }
-  unreachable();
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
 }
 
 static __maybe_unused __always_inline uint64_t
@@ -806,10 +867,13 @@ tail64_be_unaligned(const void *v, size_t tail) {
     return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift);
   }
   return fetch64_be_unaligned(p) >> shift;
-#endif /* 'oneshot' read */
-
+#else
   switch (tail & 7) {
-#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  default:
+    unreachable();
+/* fall through */
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   /* For most CPUs this code is better when not needed
    * copying for alignment or byte reordering. */
   case 1:
@@ -858,7 +922,7 @@ tail64_be_unaligned(const void *v, size_t tail) {
            (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
 #endif
   }
-  unreachable();
+#endif /* can_read_underside */
 }
 
 /***************************************************************************/
-- 
2.39.5
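
Note (not part of the patch above): one recurring technique in this backport is building a wider
little-endian read out of narrower ones when T1HA_CONFIG_UNALIGNED_ACCESS is
T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE, instead of dereferencing a possibly misaligned pointer.
The following standalone C sketch illustrates that composition; the helper names
fetch16_le/fetch32_le/fetch64_le, the test buffer, and main() are illustrative only and are not
the names used in t1ha_bits.h.

    #include <stdint.h>
    #include <stdio.h>

    /* Two byte reads assembled in little-endian order; safe for any alignment. */
    static uint16_t fetch16_le(const void *v) {
      const uint8_t *p = (const uint8_t *)v;
      return p[0] | (uint16_t)p[1] << 8;
    }

    /* 32-bit value composed from two 16-bit fetches, low half first. */
    static uint32_t fetch32_le(const void *v) {
      return fetch16_le(v) |
             (uint32_t)fetch16_le((const uint8_t *)v + 2) << 16;
    }

    /* 64-bit value composed from two 32-bit fetches, mirroring the patch's fallback. */
    static uint64_t fetch64_le(const void *v) {
      return fetch32_le(v) |
             (uint64_t)fetch32_le((const uint8_t *)v + 4) << 32;
    }

    int main(void) {
      /* Read at offset +1 so the source pointer is deliberately misaligned. */
      const uint8_t buf[9] = {0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
      printf("0x%016llx\n", (unsigned long long)fetch64_le(buf + 1)); /* 0x8877665544332211 */
      return 0;
    }

On targets where unaligned loads are efficient, the patch instead dispatches at compile time to
the plain read_unaligned()/read_aligned() paths, so this byte-wise composition is only the
portable fallback.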