From 974d1dbcab92b8380879665f94d688b2475807d9 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 14 May 2018 16:14:58 +0100 Subject: [PATCH] [Minor] Backport unaligned access fixes from t1ha Issue: #2222 --- contrib/t1ha/t1ha1.c | 226 +++++++++------------- contrib/t1ha/t1ha2.c | 334 +++++++++++++++++---------------- contrib/t1ha/t1ha_bits.h | 391 +++++++++++++++++++++++++++++++++------ 3 files changed, 596 insertions(+), 355 deletions(-) diff --git a/contrib/t1ha/t1ha1.c b/contrib/t1ha/t1ha1.c index 1c92fd0f6..5baa5692f 100644 --- a/contrib/t1ha/t1ha1.c +++ b/contrib/t1ha/t1ha1.c @@ -58,81 +58,89 @@ static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) { return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0); } +/* TODO C++ template in the next version */ +#define T1HA1_BODY(ENDIANNES, ALIGNESS, DOCOPY) \ + if (unlikely(len > 32)) { \ + uint64_t c = rot64(len, 17) + seed; \ + uint64_t d = len ^ rot64(seed, 17); \ + const void *detent = (const uint8_t *)data + len - 31; \ + do { \ + const uint64_t *v = (const uint64_t *)data; \ + if (DOCOPY) \ + memcpy((void *)(v = align), data, 32); \ + \ + const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \ + const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \ + const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \ + const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \ + \ + const uint64_t d02 = w0 ^ rot64(w2 + d, 17); \ + const uint64_t c13 = w1 ^ rot64(w3 + c, 17); \ + c += a ^ rot64(w0, 41); \ + d -= b ^ rot64(w1, 31); \ + a ^= prime_1 * (d02 + w3); \ + b ^= prime_0 * (c13 + w2); \ + data = (const uint64_t *)data + 4; \ + } while (likely(data < detent)); \ + \ + a ^= prime_6 * (rot64(c, 17) + d); \ + b ^= prime_5 * (c + rot64(d, 17)); \ + len &= 31; \ + } \ + \ + const uint64_t *v = (const uint64_t *)data; \ + if (unlikely(need_copy4align) && len > 8) \ + memcpy((void *)(v = align), data, len); \ + \ + switch (len) { \ + default: \ + b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_4); \ + /* fall through */ \ + case 24: \ + case 23: \ + case 22: \ + case 21: \ + case 20: \ + case 19: \ + case 18: \ + case 17: \ + a += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_3); \ + /* fall through */ \ + case 16: \ + case 15: \ + case 14: \ + case 13: \ + case 12: \ + case 11: \ + case 10: \ + case 9: \ + b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_2); \ + /* fall through */ \ + case 8: \ + case 7: \ + case 6: \ + case 5: \ + case 4: \ + case 3: \ + case 2: \ + case 1: \ + a += mux64(tail64_##ENDIANNES##_##ALIGNESS(v, len), prime_1); \ + /* fall through */ \ + case 0: \ + return final_weak_avalanche(a, b); \ + } + uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) { uint64_t a = seed; uint64_t b = len; - const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + const bool need_copy4align = + (((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK; uint64_t align[4]; - - if (unlikely(len > 32)) { - uint64_t c = rot64(len, 17) + seed; - uint64_t d = len ^ rot64(seed, 17); - const void *detent = (const uint8_t *)data + len - 31; - do { - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_align)) - v = (const uint64_t *)memcpy(&align, unaligned(v), 32); - - uint64_t w0 = fetch64_le(v + 0); - uint64_t w1 = fetch64_le(v + 1); - uint64_t w2 = fetch64_le(v + 2); - uint64_t w3 = fetch64_le(v + 3); - - uint64_t d02 = w0 ^ rot64(w2 + d, 17); - uint64_t c13 = w1 ^ rot64(w3 + c, 17); - c += a ^ rot64(w0, 41); - d -= b ^ rot64(w1, 
31); - a ^= prime_1 * (d02 + w3); - b ^= prime_0 * (c13 + w2); - data = (const uint64_t *)data + 4; - } while (likely(data < detent)); - - a ^= prime_6 * (rot64(c, 17) + d); - b ^= prime_5 * (c + rot64(d, 17)); - len &= 31; - } - - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_align) && len > 8) - v = (const uint64_t *)memcpy(&align, unaligned(v), len); - - switch (len) { - default: - b += mux64(fetch64_le(v++), prime_4); - /* fall through */ - case 24: - case 23: - case 22: - case 21: - case 20: - case 19: - case 18: - case 17: - a += mux64(fetch64_le(v++), prime_3); - /* fall through */ - case 16: - case 15: - case 14: - case 13: - case 12: - case 11: - case 10: - case 9: - b += mux64(fetch64_le(v++), prime_2); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - a += mux64(tail64_le(v, len), prime_1); - /* fall through */ - case 0: - return final_weak_avalanche(a, b); + if (need_copy4align) { + T1HA1_BODY(le, aligned, true); + } else { + T1HA1_BODY(le, unaligned, false); } } @@ -140,76 +148,12 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) { uint64_t a = seed; uint64_t b = len; - const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; + const bool need_copy4align = + (((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK; uint64_t align[4]; - - if (unlikely(len > 32)) { - uint64_t c = rot64(len, 17) + seed; - uint64_t d = len ^ rot64(seed, 17); - const void *detent = (const uint8_t *)data + len - 31; - do { - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_align)) - v = (const uint64_t *)memcpy(&align, unaligned(v), 32); - - uint64_t w0 = fetch64_be(v + 0); - uint64_t w1 = fetch64_be(v + 1); - uint64_t w2 = fetch64_be(v + 2); - uint64_t w3 = fetch64_be(v + 3); - - uint64_t d02 = w0 ^ rot64(w2 + d, 17); - uint64_t c13 = w1 ^ rot64(w3 + c, 17); - c += a ^ rot64(w0, 41); - d -= b ^ rot64(w1, 31); - a ^= prime_1 * (d02 + w3); - b ^= prime_0 * (c13 + w2); - data = (const uint64_t *)data + 4; - } while (likely(data < detent)); - - a ^= prime_6 * (rot64(c, 17) + d); - b ^= prime_5 * (c + rot64(d, 17)); - len &= 31; + if (need_copy4align) { + T1HA1_BODY(be, aligned, true); + } else { + T1HA1_BODY(be, unaligned, false); } - - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_align) && len > 8) - v = (const uint64_t *)memcpy(&align, unaligned(v), len); - - switch (len) { - default: - b += mux64(fetch64_be(v++), prime_4); - /* fall through */ - case 24: - case 23: - case 22: - case 21: - case 20: - case 19: - case 18: - case 17: - a += mux64(fetch64_be(v++), prime_3); - /* fall through */ - case 16: - case 15: - case 14: - case 13: - case 12: - case 11: - case 10: - case 9: - b += mux64(fetch64_be(v++), prime_2); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - a += mux64(tail64_be(v, len), prime_1); - /* fall through */ - case 0: - return final_weak_avalanche(a, b); - } -} +} \ No newline at end of file diff --git a/contrib/t1ha/t1ha2.c b/contrib/t1ha/t1ha2.c index f87e8bb82..1f5a9d905 100644 --- a/contrib/t1ha/t1ha2.c +++ b/contrib/t1ha/t1ha2.c @@ -56,128 +56,140 @@ static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x, s->n.d = ~y + rot64(x, 19); } -static __always_inline void update(t1ha_state256_t *__restrict s, - const uint64_t *__restrict v) { - uint64_t w0 = fetch64_le(v + 0); - uint64_t w1 = fetch64_le(v + 1); - uint64_t w2 = fetch64_le(v + 2); - uint64_t w3 = fetch64_le(v + 3); - - 
uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); - uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); -#ifdef __e2k__ - /* FIXME: temporary workaround for lcc's ELBRUS scheduling bug (LY) */ - s->n.c ^= s->n.a + rot64(w0, 57); - s->n.d ^= s->n.b + rot64(w1, 38); -#else - s->n.d ^= s->n.b + rot64(w1, 38); - s->n.c ^= s->n.a + rot64(w0, 57); -#endif - s->n.b ^= prime_6 * (c13 + w2); - s->n.a ^= prime_5 * (d02 + w3); -} +/* TODO C++ template in the next version */ +#define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v) \ + do { \ + t1ha_state256_t *const s = state; \ + const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \ + const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \ + const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \ + const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \ + \ + const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); \ + const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); \ + s->n.d ^= s->n.b + rot64(w1, 38); \ + s->n.c ^= s->n.a + rot64(w0, 57); \ + s->n.b ^= prime_6 * (c13 + w2); \ + s->n.a ^= prime_5 * (d02 + w3); \ + } while (0) static __always_inline void squash(t1ha_state256_t *s) { s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23)); s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d); } -static __always_inline const void * -loop(bool need_copy4align, uint64_t *__restrict buffer4align, - t1ha_state256_t *__restrict s, const void *__restrict data, size_t len) { - const void *detent = (const uint8_t *)data + len - 31; - do { - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_copy4align)) - v = (const uint64_t *)memcpy(buffer4align, unaligned(v), 32); - update(s, v); - data = (const uint64_t *)data + 4; - } while (likely(data < detent)); - return data; -} +/* TODO C++ template in the next version */ +#define T1HA2_LOOP(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \ + do { \ + const void *detent = (const uint8_t *)data + len - 31; \ + do { \ + const uint64_t *v = (const uint64_t *)data; \ + if (BUFFER4COPY != NULL) \ + memcpy((void *)(v = BUFFER4COPY), data, 32); \ + T1HA2_UPDATE(le, unaligned, state, v); \ + data = (const uint64_t *)data + 4; \ + } while (likely(data < detent)); \ + } while (0) -static __always_inline void tail_ab(t1ha_state256_t *__restrict s, - const uint64_t *__restrict v, size_t len) { - switch (len) { - default: - mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_4); - /* fall through */ - case 24: - case 23: - case 22: - case 21: - case 20: - case 19: - case 18: - case 17: - mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); - /* fall through */ - case 16: - case 15: - case 14: - case 13: - case 12: - case 11: - case 10: - case 9: - mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_2); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - mixup64(&s->n.b, &s->n.a, tail64_le(v, len), prime_1); - /* fall through */ - case 0: - return; - } -} +/* TODO C++ template in the next version */ +#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \ + do { \ + t1ha_state256_t *const s = state; \ + const uint64_t *v = (const uint64_t *)data; \ + if (BUFFER4COPY != NULL) \ + memcpy((void *)(v = BUFFER4COPY), data, len); \ + switch (len) { \ + default: \ + mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ + prime_4); \ + /* fall through */ \ + case 24: \ + case 23: \ + case 22: \ + case 21: \ + case 20: \ + case 19: \ + case 18: \ + case 17: \ + mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ + prime_3); \ + /* fall 
through */ \ + case 16: \ + case 15: \ + case 14: \ + case 13: \ + case 12: \ + case 11: \ + case 10: \ + case 9: \ + mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ + prime_2); \ + /* fall through */ \ + case 8: \ + case 7: \ + case 6: \ + case 5: \ + case 4: \ + case 3: \ + case 2: \ + case 1: \ + mixup64(&s->n.b, &s->n.a, tail64_##ENDIANNES##_##ALIGNESS(v, len), \ + prime_1); \ + /* fall through */ \ + case 0: \ + return final64(s->n.a, s->n.b); \ + } \ + } while (0) -static __always_inline void tail_abcd(t1ha_state256_t *__restrict s, - const uint64_t *__restrict v, - size_t len) { - switch (len) { - default: - mixup64(&s->n.a, &s->n.d, fetch64_le(v++), prime_4); - /* fall through */ - case 24: - case 23: - case 22: - case 21: - case 20: - case 19: - case 18: - case 17: - mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3); - /* fall through */ - case 16: - case 15: - case 14: - case 13: - case 12: - case 11: - case 10: - case 9: - mixup64(&s->n.c, &s->n.b, fetch64_le(v++), prime_2); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - mixup64(&s->n.d, &s->n.c, tail64_le(v, len), prime_1); - /* fall through */ - case 0: - return; - } -} +/* TODO C++ template in the next version */ +#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \ + do { \ + t1ha_state256_t *const s = state; \ + const uint64_t *v = (const uint64_t *)data; \ + if (BUFFER4COPY != NULL) \ + memcpy((void *)(v = BUFFER4COPY), data, len); \ + switch (len) { \ + default: \ + mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ + prime_4); \ + /* fall through */ \ + case 24: \ + case 23: \ + case 22: \ + case 21: \ + case 20: \ + case 19: \ + case 18: \ + case 17: \ + mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ + prime_3); \ + /* fall through */ \ + case 16: \ + case 15: \ + case 14: \ + case 13: \ + case 12: \ + case 11: \ + case 10: \ + case 9: \ + mixup64(&s->n.c, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ + prime_2); \ + /* fall through */ \ + case 8: \ + case 7: \ + case 6: \ + case 5: \ + case 4: \ + case 3: \ + case 2: \ + case 1: \ + mixup64(&s->n.d, &s->n.c, tail64_##ENDIANNES##_##ALIGNESS(v, len), \ + prime_1); \ + /* fall through */ \ + case 0: \ + return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result); \ + } \ + } while (0) static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t *h) { @@ -195,22 +207,26 @@ uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) { t1ha_state256_t state; init_ab(&state, seed, length); - const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; - uint64_t buffer4align[4]; - - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - data = loop(need_copy4align, buffer4align, &state, data, length); - squash(&state); - length &= 31; + const bool need_copy4align = + (((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK; + if (need_copy4align) { + uint64_t buffer4align[4]; + if (unlikely(length > 32)) { + init_cd(&state, seed, length); + T1HA2_LOOP(le, aligned, buffer4align, &state, data, length); + squash(&state); + length &= 31; + } + T1HA2_TAIL_AB(le, aligned, buffer4align, &state, data, length); + } else { + if (unlikely(length > 32)) { + init_cd(&state, seed, length); + T1HA2_LOOP(le, unaligned, NULL, &state, data, length); + squash(&state); + length &= 31; + } + T1HA2_TAIL_AB(le, unaligned, NULL, &state, data, length); } - - const uint64_t *v = (const 
uint64_t *)data; - if (unlikely(need_copy4align) && length > 8) - v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length); - - tail_ab(&state, v, length); - return final64(state.n.a, state.n.b); } uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, @@ -220,20 +236,22 @@ uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, init_ab(&state, seed, length); init_cd(&state, seed, length); - const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; - uint64_t buffer4align[4]; - - if (unlikely(length > 32)) { - data = loop(need_copy4align, buffer4align, &state, data, length); - length &= 31; + const bool need_copy4align = + (((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK; + if (need_copy4align) { + uint64_t buffer4align[4]; + if (unlikely(length > 32)) { + T1HA2_LOOP(le, aligned, buffer4align, &state, data, length); + length &= 31; + } + T1HA2_TAIL_ABCD(le, aligned, buffer4align, &state, data, length); + } else { + if (unlikely(length > 32)) { + T1HA2_LOOP(le, unaligned, NULL, &state, data, length); + length &= 31; + } + T1HA2_TAIL_ABCD(le, unaligned, NULL, &state, data, length); } - - const uint64_t *v = (const uint64_t *)data; - if (unlikely(need_copy4align) && length > 8) - v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length); - - tail_abcd(&state, v, length); - return final128(state.n.a, state.n.b, state.n.c, state.n.d, extra_result); } //------------------------------------------------------------------------------ @@ -252,7 +270,7 @@ void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, if (ctx->partial) { const size_t left = 32 - ctx->partial; const size_t chunk = (length >= left) ? left : length; - memcpy(ctx->buffer.bytes + ctx->partial, unaligned(data), chunk); + memcpy(ctx->buffer.bytes + ctx->partial, data, chunk); ctx->partial += chunk; if (ctx->partial < 32) { assert(left >= length); @@ -261,37 +279,41 @@ void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, ctx->partial = 0; data = (const uint8_t *)data + chunk; length -= chunk; - update(&ctx->state, ctx->buffer.u64); + T1HA2_UPDATE(le, aligned, &ctx->state, ctx->buffer.u64); } if (length >= 32) { - const bool need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK; - if (need_copy4align) - data = loop(true, ctx->buffer.u64, &ctx->state, data, length); - else - data = loop(false, NULL, &ctx->state, data, length); + const bool need_copy4align = + (((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK; + if (need_copy4align) { + T1HA2_LOOP(le, aligned, ctx->buffer.u64, &ctx->state, data, length); + } else { + T1HA2_LOOP(le, unaligned, NULL, &ctx->state, data, length); + } length &= 31; } if (length) - memcpy(ctx->buffer.bytes, unaligned(data), ctx->partial = length); + memcpy(ctx->buffer.bytes, data, ctx->partial = length); } uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, uint64_t *__restrict extra_result) { - uint64_t bytes = (ctx->total << 3) ^ (UINT64_C(1) << 63); + uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63); #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - bytes = bswap64(bytes); + bits = bswap64(bits); #endif - t1ha2_update(ctx, &bytes, 8); + t1ha2_update(ctx, &bits, 8); if (likely(!extra_result)) { squash(&ctx->state); - tail_ab(&ctx->state, ctx->buffer.u64, ctx->partial); + T1HA2_TAIL_AB(le, aligned, NULL, &ctx->state, ctx->buffer.u64, + ctx->partial); return final64(ctx->state.n.a, ctx->state.n.b); } - tail_abcd(&ctx->state, ctx->buffer.u64, ctx->partial); + T1HA2_TAIL_ABCD(le, 
aligned, NULL, &ctx->state, ctx->buffer.u64, + ctx->partial); return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c, ctx->state.n.d, extra_result); -} +} \ No newline at end of file diff --git a/contrib/t1ha/t1ha_bits.h b/contrib/t1ha/t1ha_bits.h index 3e03c551d..42daf5790 100644 --- a/contrib/t1ha/t1ha_bits.h +++ b/contrib/t1ha/t1ha_bits.h @@ -86,6 +86,14 @@ #define PAGESIZE 4096 #endif /* PAGESIZE */ +#define ALIGMENT_16 2 +#define ALIGMENT_32 4 +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#define ALIGMENT_64 8 +#else +#define ALIGMENT_64 4 +#endif + /***************************************************************************/ #ifndef __has_builtin @@ -160,10 +168,8 @@ static __maybe_unused __always_inline void e2k_add64carry_last(unsigned carry, e2k_add64carry_last(carry, base, addend, sum) #endif /* __iset__ >= 5 */ -#if 0 /* LY: unreasonable, because alignment is required :( */ -#define fetch64_be(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr)) -#define fetch32_be(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr)) -#endif +#define fetch64_be_aligned(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr)) +#define fetch32_be_aligned(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr)) #endif /* __e2k__ Elbrus */ @@ -337,57 +343,228 @@ static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } #endif #endif /* bswap16 */ -#ifndef unaligned +#if defined(__GNUC__) || (__has_attribute(packed) && __has_attribute(aligned)) +typedef struct { + uint8_t unaligned_8; + uint16_t unaligned_16; + uint32_t unaligned_32; + uint64_t unaligned_64; +} __attribute__((packed, aligned(1))) t1ha_unaligned_proxy; +#define read_unaligned(ptr, bits) \ + (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ + t1ha_unaligned_proxy, unaligned_##bits))) \ + ->unaligned_##bits) +#define read_aligned(ptr, bits) \ + (*(const __attribute__((aligned(ALIGMENT_##bits))) uint##bits##_t *)(ptr)) +#elif defined(_MSC_VER) +#pragma warning( \ + disable : 4235) /* nonstandard extension used: '__unaligned' \ + * keyword not supported on this architecture */ +#define read_unaligned(ptr, bits) (*(const __unaligned uint##bits##_t *)(ptr)) +#define read_aligned(ptr, bits) \ + (*(const __declspec(align(ALIGMENT_##bits)) uint##bits##_t *)(ptr)) +#else +#error FIXME +#define read_unaligned(ptr, bits) \ + (*(const uint##bits##_ *)((const char *)(ptr))) +#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr)) +#endif /* read_unaligned */ + +#if 0 +#ifndef DECLARE_UNALIGNED_PTR +#if defined(__LCC__) +#pragma diag_suppress wrong_entity_for_attribute +#define DECLARE_UNALIGNED_PTR(ptr) char __attribute__((packed)) * ptr +#elif defined(__clang__) +#pragma clang diagnostic ignored "-Wignored-attributes" +#define DECLARE_UNALIGNED_PTR(ptr) char __attribute__((packed)) * ptr +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpacked" +#define DECLARE_UNALIGNED_PTR(ptr) char __attribute__((packed)) * ptr +#elif defined(_MSC_VER) +#pragma warning( \ + disable : 4235) /* nonstandard extension used: '__unaligned' \ + * keyword not supported on this architecture */ +#define DECLARE_UNALIGNED_PTR(ptr) char __unaligned *ptr +#else +#define DECLARE_UNALIGNED_PTR(ptr) char *ptr +#endif +#endif /* cast_unaligned */ + +#ifndef cast_unaligned #if defined(__LCC__) #pragma diag_suppress wrong_entity_for_attribute -#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) +#define cast_unaligned(ptr) ((const char __attribute__((packed)) *)(ptr)) #elif defined(__clang__) #pragma clang 
diagnostic ignored "-Wignored-attributes" -#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) +#define cast_unaligned(ptr) ((const char __attribute__((packed)) *)(ptr)) #elif defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpacked" -#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr)) +#define cast_unaligned(ptr) ((const char __attribute__((packed)) *)(ptr)) #elif defined(_MSC_VER) #pragma warning( \ disable : 4235) /* nonstandard extension used: '__unaligned' \ * keyword not supported on this architecture */ -#define unaligned(ptr) ((const char __unaligned *)(ptr)) +#define cast_unaligned(ptr) ((const char __unaligned *)(ptr)) +#else +#define cast_unaligned(ptr) ((const char *)(ptr)) +#endif +#endif /* cast_unaligned */ + +#ifndef cast_aligned +#if defined(__LCC__) +#pragma diag_suppress wrong_entity_for_attribute +#define cast_aligned(type, ptr) \ + ((const type __attribute__((aligned(sizeof(type)))) *)(ptr)) +#elif defined(__clang__) +#pragma clang diagnostic ignored "-Wignored-attributes" +#define cast_aligned(type, ptr) \ + ((const type __attribute__((aligned(sizeof(type)))) *)(ptr)) +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpacked" +#define cast_aligned(type, ptr) \ + ((const type __attribute__((aligned(sizeof(type)))) *)(ptr)) +#elif defined(_MSC_VER) +#define cast_aligned(type, ptr) \ + ((const type __declspec((align(sizeof(type)))) *)(ptr)) #else -#define unaligned(ptr) ((const char *)(ptr)) +#define cast_aligned(type, ptr) ((const type *)(ptr)) #endif -#endif /* unaligned */ +#endif /* cast_aligned */ +#endif /* 0 */ /***************************************************************************/ -#ifndef fetch64_le -static __always_inline uint64_t fetch64_le(const void *v) { +/*---------------------------------------------------------- Little Endian */ + +#ifndef fetch64_le_aligned +static __always_inline uint64_t fetch64_le_aligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_aligned(v, 64); +#else + return bswap64(read_aligned(v, 64)); +#endif +} +#endif /* fetch64_le_aligned */ + +#ifndef fetch64_le_unaligned +static __always_inline uint64_t fetch64_le_unaligned(const void *v) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return *(const uint64_t *)v; + return read_unaligned(v, 64); #else - return bswap64(*(const uint64_t *)v); + return bswap64(read_unaligned(v, 64)); #endif } -#endif /* fetch64_le */ +#endif /* fetch64_le_unaligned */ -#ifndef fetch32_le -static __always_inline uint32_t fetch32_le(const void *v) { +#ifndef fetch32_le_aligned +static __always_inline uint32_t fetch32_le_aligned(const void *v) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return *(const uint32_t *)v; + return read_aligned(v, 32); #else - return bswap32(*(const uint32_t *)v); + return bswap32(read_aligned(v, 32)); #endif } -#endif /* fetch32_le */ +#endif /* fetch32_le_aligned */ -#ifndef fetch16_le -static __always_inline uint16_t fetch16_le(const void *v) { +#ifndef fetch32_le_unaligned +static __always_inline uint32_t fetch32_le_unaligned(const void *v) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return *(const uint16_t *)v; + return read_unaligned(v, 32); #else - return bswap16(*(const uint16_t *)v); + return bswap32(read_unaligned(v, 32)); #endif } -#endif /* fetch16_le */ +#endif /* fetch32_le_unaligned */ + +#ifndef fetch16_le_aligned +static __always_inline uint16_t fetch16_le_aligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return 
read_aligned(v, 16); +#else + return bswap16(read_aligned(v, 16)); +#endif +} +#endif /* fetch16_le_aligned */ + +#ifndef fetch16_le_unaligned +static __always_inline uint16_t fetch16_le_unaligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_unaligned(v, 16); +#else + return bswap16(read_unaligned(v, 16)); +#endif +} +#endif /* fetch16_le_unaligned */ + +static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) { + const uint8_t *const p = (const uint8_t *)v; +#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) + /* We can perform a 'oneshot' read, which is little bit faster. */ + const unsigned shift = ((8 - tail) & 7) << 3; + return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift); +#endif /* 'oneshot' read */ + + uint64_t r = 0; + switch (tail & 7) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* For most CPUs this code is better when not needed byte reordering. */ + case 0: + return fetch64_le_aligned(p); + case 7: + r = (uint64_t)p[6] << 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 32; + /* fall through */ + case 4: + return r + fetch32_le_aligned(p); + case 3: + r = (uint64_t)p[2] << 16; + /* fall through */ + case 2: + return r + fetch16_le_aligned(p); + case 1: + return p[0]; +#else + case 0: + r = p[7] << 8; + /* fall through */ + case 7: + r += p[6]; + r <<= 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 8; + /* fall through */ + case 4: + r += p[3]; + r <<= 8; + /* fall through */ + case 3: + r += p[2]; + r <<= 8; + /* fall through */ + case 2: + r += p[1]; + r <<= 8; + /* fall through */ + case 1: + return r + p[0]; +#endif + } + unreachable(); +} #if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \ !defined(__SANITIZE_ADDRESS__) && !defined(__sun) @@ -395,7 +572,8 @@ static __always_inline uint16_t fetch16_le(const void *v) { ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) #endif /* can_fast_read */ -static __always_inline uint64_t tail64_le(const void *v, size_t tail) { +static __always_inline uint64_t tail64_le_unaligned(const void *v, + size_t tail) { const uint8_t *p = (const uint8_t *)v; #ifdef can_read_underside /* On some systems (e.g. x86) we can perform a 'oneshot' read, which @@ -405,9 +583,9 @@ static __always_inline uint64_t tail64_le(const void *v, size_t tail) { const unsigned shift = offset << 3; if (likely(can_read_underside(p, 8))) { p -= offset; - return fetch64_le(p) >> shift; + return fetch64_le_unaligned(p) >> shift; } - return fetch64_le(p) & ((~UINT64_C(0)) >> shift); + return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift); #endif /* 'oneshot' read */ uint64_t r = 0; @@ -416,7 +594,7 @@ static __always_inline uint64_t tail64_le(const void *v, size_t tail) { /* For most CPUs this code is better when not needed * copying for alignment or byte reordering. 
*/ case 0: - return fetch64_le(p); + return fetch64_le_unaligned(p); case 7: r = (uint64_t)p[6] << 8; /* fall through */ @@ -429,12 +607,12 @@ static __always_inline uint64_t tail64_le(const void *v, size_t tail) { r <<= 32; /* fall through */ case 4: - return r + fetch32_le(p); + return r + fetch32_le_unaligned(p); case 3: r = (uint64_t)p[2] << 16; /* fall through */ case 2: - return r + fetch16_le(p); + return r + fetch16_le_unaligned(p); case 1: return p[0]; #else @@ -474,38 +652,134 @@ static __always_inline uint64_t tail64_le(const void *v, size_t tail) { unreachable(); } -#ifndef fetch64_be -static __maybe_unused __always_inline uint64_t fetch64_be(const void *v) { +/*------------------------------------------------------------- Big Endian */ + +#ifndef fetch64_be_aligned +static __maybe_unused __always_inline uint64_t +fetch64_be_aligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_aligned(v, 64); +#else + return bswap64(read_aligned(v, 64)); +#endif +} +#endif /* fetch64_be_aligned */ + +#ifndef fetch64_be_unaligned +static __maybe_unused __always_inline uint64_t +fetch64_be_unaligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_unaligned(v, 64); +#else + return bswap64(read_unaligned(v, 64)); +#endif +} +#endif /* fetch64_be_unaligned */ + +#ifndef fetch32_be_aligned +static __maybe_unused __always_inline uint32_t +fetch32_be_aligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_aligned(v, 32); +#else + return bswap32(read_aligned(v, 32)); +#endif +} +#endif /* fetch32_be_aligned */ + +#ifndef fetch32_be_unaligned +static __maybe_unused __always_inline uint32_t +fetch32_be_unaligned(const void *v) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_unaligned(v, 32); +#else + return bswap32(read_unaligned(v, 32)); +#endif +} +#endif /* fetch32_be_unaligned */ + +#ifndef fetch16_be_aligned +static __maybe_unused __always_inline uint16_t +fetch16_be_aligned(const void *v) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return *(const uint64_t *)v; + return read_aligned(v, 16); #else - return bswap64(*(const uint64_t *)v); + return bswap16(read_aligned(v, 16)); #endif } -#endif /* fetch64_be */ +#endif /* fetch16_be_aligned */ -#ifndef fetch32_be -static __maybe_unused __always_inline uint32_t fetch32_be(const void *v) { +#ifndef fetch16_be_unaligned +static __maybe_unused __always_inline uint16_t +fetch16_be_unaligned(const void *v) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return *(const uint32_t *)v; + return read_unaligned(v, 16); #else - return bswap32(*(const uint32_t *)v); + return bswap16(read_unaligned(v, 16)); #endif } -#endif /* fetch32_be */ +#endif /* fetch16_be_unaligned */ + +static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v, + size_t tail) { + const uint8_t *const p = (const uint8_t *)v; +#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) + /* We can perform a 'oneshot' read, which is little bit faster. */ + const unsigned shift = ((8 - tail) & 7) << 3; + return fetch64_be_aligned(p) >> shift; +#endif /* 'oneshot' read */ -#ifndef fetch16_be -static __maybe_unused __always_inline uint16_t fetch16_be(const void *v) { + switch (tail & 7) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return *(const uint16_t *)v; + /* For most CPUs this code is better when not byte reordering. 
*/ + case 1: + return p[0]; + case 2: + return fetch16_be_aligned(p); + case 3: + return (uint32_t)fetch16_be_aligned(p) << 8 | p[2]; + case 4: + return fetch32_be_aligned(p); + case 5: + return (uint64_t)fetch32_be_aligned(p) << 8 | p[4]; + case 6: + return (uint64_t)fetch32_be_aligned(p) << 16 | fetch16_be_aligned(p + 4); + case 7: + return (uint64_t)fetch32_be_aligned(p) << 24 | + (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6]; + case 0: + return fetch64_be(p); #else - return bswap16(*(const uint16_t *)v); + case 1: + return p[0]; + case 2: + return p[1] | (uint32_t)p[0] << 8; + case 3: + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + case 4: + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; + case 5: + return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | + (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; + case 6: + return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | + (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; + case 7: + return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | + (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | + (uint64_t)p[0] << 48; + case 0: + return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | + (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | + (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; #endif + } + unreachable(); } -#endif /* fetch16_be */ -static __maybe_unused __always_inline uint64_t tail64_be(const void *v, - size_t tail) { +static __maybe_unused __always_inline uint64_t +tail64_be_unaligned(const void *v, size_t tail) { const uint8_t *p = (const uint8_t *)v; #ifdef can_read_underside /* On some systems we can perform a 'oneshot' read, which is little bit @@ -515,9 +789,9 @@ static __maybe_unused __always_inline uint64_t tail64_be(const void *v, const unsigned shift = offset << 3; if (likely(can_read_underside(p, 8))) { p -= offset; - return fetch64_be(p) & ((~UINT64_C(0)) >> shift); + return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift); } - return fetch64_be(p) >> shift; + return fetch64_be_unaligned(p) >> shift; #endif /* 'oneshot' read */ switch (tail & 7) { @@ -527,20 +801,21 @@ static __maybe_unused __always_inline uint64_t tail64_be(const void *v, case 1: return p[0]; case 2: - return fetch16_be(p); + return fetch16_be_unaligned(p); case 3: - return (uint32_t)fetch16_be(p) << 8 | p[2]; + return (uint32_t)fetch16_be_unaligned(p) << 8 | p[2]; case 4: return fetch32_be(p); case 5: - return (uint64_t)fetch32_be(p) << 8 | p[4]; + return (uint64_t)fetch32_be_unaligned(p) << 8 | p[4]; case 6: - return (uint64_t)fetch32_be(p) << 16 | fetch16_be(p + 4); + return (uint64_t)fetch32_be_unaligned(p) << 16 | + fetch16_be_unaligned(p + 4); case 7: - return (uint64_t)fetch32_be(p) << 24 | (uint32_t)fetch16_be(p + 4) << 8 | - p[6]; + return (uint64_t)fetch32_be_unaligned(p) << 24 | + (uint32_t)fetch16_be_unaligned(p + 4) << 8 | p[6]; case 0: - return fetch64_be(p); + return fetch64_be_unaligned(p); #else /* For most CPUs this code is better than a * copying for alignment and/or byte reordering. */ -- 2.39.5
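
The central change in t1ha_bits.h is replacing the old unaligned(ptr) char-pointer cast with the read_unaligned()/read_aligned() macros; on GCC/Clang the unaligned variant goes through a packed, aligned(1) proxy struct so the compiler emits loads that are safe on strict-alignment targets. Below is a minimal standalone sketch of that proxy-read idea, simplified to a single member (the real macro uses a multi-width proxy struct plus offsetof, and the fetch*_le/be helpers additionally byte-swap on the opposite-endian host); the names load64_le and the demo buffer are illustrative only.

/* Minimal sketch of the packed-proxy unaligned read (GCC/Clang attributes). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
  uint64_t v64;
} __attribute__((packed, aligned(1))) unaligned_proxy;

/* Read a 64-bit word from a possibly misaligned address.  The packed struct
 * tells the compiler the access may be unaligned, so it emits byte-wise or
 * unaligned-tolerant loads instead of ones that can fault. */
static uint64_t load64_le(const void *ptr) {
  return ((const unaligned_proxy *)ptr)->v64; /* assumes little-endian host */
}

int main(void) {
  uint8_t buf[16] = {0};
  uint64_t x = UINT64_C(0x0123456789abcdef);
  memcpy(buf + 3, &x, sizeof(x)); /* place the value at an odd offset */
  printf("%016llx\n", (unsigned long long)load64_le(buf + 3));
  return 0;
}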
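
On the t1ha1.c/t1ha2.c side, each hash function now checks alignment once and instantiates either the _aligned or the _unaligned body; for misaligned input every 32-byte block is memcpy()'d into a local aligned buffer (align[] / buffer4align[]) before being read as uint64_t[4]. The following is a simplified sketch of that dispatch pattern under the same 8-byte alignment assumption; process_block() is a placeholder mixer, not the t1ha rounds.

/* Sketch of the branch-once-on-alignment pattern behind T1HA1_BODY/T1HA2_LOOP. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void process_block(uint64_t acc[2], const uint64_t w[4]) {
  acc[0] += w[0] ^ (w[2] >> 7); /* placeholder mixing only */
  acc[1] ^= w[1] + (w[3] << 9);
}

static uint64_t checksum64(const void *data, size_t len) {
  uint64_t acc[2] = {len, ~len};
  const int misaligned = ((uintptr_t)data & 7) != 0; /* cf. need_copy4align */
  uint64_t scratch[4];                               /* cf. buffer4align[] */

  while (len >= 32) {
    const uint64_t *v;
    if (misaligned) {
      memcpy(scratch, data, 32); /* copy so the block is readable as u64[4] */
      v = scratch;
    } else {
      v = (const uint64_t *)data; /* already 8-byte aligned, read in place */
    }
    process_block(acc, v);
    data = (const uint8_t *)data + 32;
    len -= 32;
  }
  /* tail bytes (len & 31) would be handled here, as in T1HA1_BODY */
  return acc[0] ^ acc[1];
}

int main(void) {
  uint8_t buf[257];
  for (size_t i = 0; i < sizeof(buf); i++)
    buf[i] = (uint8_t)i;
  /* buf + 1 exercises the misaligned path on most platforms */
  printf("%016llx\n", (unsigned long long)checksum64(buf + 1, 256));
  return 0;
}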
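
The failure mode being fixed is easiest to reproduce by hashing from an odd offset. A short caller sketch (assuming the bundled t1ha.h header from contrib/t1ha is on the include path; seed and buffer contents are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "t1ha.h" /* bundled header in contrib/t1ha; include path assumed */

int main(void) {
  char buf[65];
  memset(buf, 0xA5, sizeof(buf));
  /* buf + 1 is normally not 8-byte aligned: exactly the case where the old
   * unaligned() cast could still produce misaligned 64-bit loads. */
  uint64_t h1 = t1ha1_le(buf + 1, 64, 42);
  uint64_t h2 = t1ha2_atonce(buf + 1, 64, 42);
  printf("t1ha1_le     = %016llx\n", (unsigned long long)h1);
  printf("t1ha2_atonce = %016llx\n", (unsigned long long)h2);
  return 0;
}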