From b6156175b6e0e9a1d8063571f81c428f4aac570f Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Tue, 22 May 2018 14:15:47 +0100
Subject: [PATCH] [Minor] Backport fixes from t1ha

---
 contrib/t1ha/t1ha1.c     |  51 +++++----
 contrib/t1ha/t1ha2.c     |  95 ++++++++--------
 contrib/t1ha/t1ha_bits.h | 226 +++++++++++++++++++++++++--------------
 3 files changed, 221 insertions(+), 151 deletions(-)

diff --git a/contrib/t1ha/t1ha1.c b/contrib/t1ha/t1ha1.c
index 956f7e24e..6e25d37f4 100644
--- a/contrib/t1ha/t1ha1.c
+++ b/contrib/t1ha/t1ha1.c
@@ -58,21 +58,21 @@ static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) {
   return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0);
 }
 
-/* TODO C++ template in the next version */
-#define T1HA1_BODY(ENDIANNES, ALIGNESS, DOCOPY) \
+/* TODO: C++ template in the next version */
+#define T1HA1_BODY(ENDIANNES, ALIGNESS) \
+  const uint64_t *v = (const uint64_t *)data; \
   if (unlikely(len > 32)) { \
     uint64_t c = rot64(len, 17) + seed; \
     uint64_t d = len ^ rot64(seed, 17); \
-    const void *detent = (const uint8_t *)data + len - 31; \
+    const uint64_t *detent = \
+        (const uint64_t *)((const uint8_t *)data + len - 31); \
     do { \
-      const uint64_t *v = (const uint64_t *)data; \
-      if (DOCOPY) \
-        memcpy((void *)(v = align), data, 32); \
-      \
       const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \
       const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \
       const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \
       const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \
+      v += 4; \
+      prefetch(v); \
       \
       const uint64_t d02 = w0 ^ rot64(w2 + d, 17); \
       const uint64_t c13 = w1 ^ rot64(w3 + c, 17); \
@@ -80,18 +80,13 @@ static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) {
       d -= b ^ rot64(w1, 31); \
       a ^= prime_1 * (d02 + w3); \
       b ^= prime_0 * (c13 + w2); \
-      data = (const uint64_t *)data + 4; \
-    } while (likely(data < detent)); \
+    } while (likely(v < detent)); \
       \
     a ^= prime_6 * (rot64(c, 17) + d); \
     b ^= prime_5 * (c + rot64(d, 17)); \
     len &= 31; \
   } \
   \
-  const uint64_t *v = (const uint64_t *)data; \
-  if (unlikely(need_copy4align) && len > 8) \
-    memcpy((void *)(v = align), data, len); \
-  \
   switch (len) { \
   default: \
     b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_4); \
@@ -134,26 +129,30 @@ uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) {
   uint64_t a = seed;
   uint64_t b = len;
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  uint64_t align[4];
-  if (need_copy4align) {
-    T1HA1_BODY(le, aligned, true);
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  T1HA1_BODY(le, unaligned);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
+    T1HA1_BODY(le, unaligned);
   } else {
-    T1HA1_BODY(le, unaligned, false);
+    T1HA1_BODY(le, aligned);
   }
+#endif
 }
 
 uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) {
   uint64_t a = seed;
   uint64_t b = len;
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  uint64_t align[4];
-  if (need_copy4align) {
-    T1HA1_BODY(be, aligned, true);
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  T1HA1_BODY(be, unaligned);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
+    T1HA1_BODY(be, unaligned);
   } else {
-    T1HA1_BODY(be, unaligned, false);
+    T1HA1_BODY(be, aligned);
   }
-}
\ No newline at end of file
+#endif
+}
diff --git a/contrib/t1ha/t1ha2.c b/contrib/t1ha/t1ha2.c
index 95f646da4..4cb5281ee 100644
--- a/contrib/t1ha/t1ha2.c
+++ b/contrib/t1ha/t1ha2.c
@@ -56,7 +56,7 @@ static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x,
   s->n.d = ~y + rot64(x, 19);
 }
 
-/* TODO C++ template in the next version */
+/* TODO: C++ template in the next version */
 #define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v) \
   do { \
     t1ha_state256_t *const s = state; \
@@ -67,8 +67,8 @@ static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x,
     \
     const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); \
     const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); \
-    s->n.d ^= s->n.b + rot64(w1, 38); \
     s->n.c ^= s->n.a + rot64(w0, 57); \
+    s->n.d ^= s->n.b + rot64(w1, 38); \
     s->n.b ^= prime_6 * (c13 + w2); \
     s->n.a ^= prime_5 * (d02 + w3); \
   } while (0)
@@ -78,26 +78,23 @@ static __always_inline void squash(t1ha_state256_t *s) {
   s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d);
 }
 
-/* TODO C++ template in the next version */
-#define T1HA2_LOOP(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
+/* TODO: C++ template in the next version */
+#define T1HA2_LOOP(ENDIANNES, ALIGNESS, state, data, len) \
   do { \
     const void *detent = (const uint8_t *)data + len - 31; \
     do { \
       const uint64_t *v = (const uint64_t *)data; \
-      if (BUFFER4COPY != NULL) \
-        memcpy((void *)(v = BUFFER4COPY), data, 32); \
-      T1HA2_UPDATE(le, unaligned, state, v); \
       data = (const uint64_t *)data + 4; \
+      prefetch(data); \
+      T1HA2_UPDATE(le, ALIGNESS, state, v); \
    } while (likely(data < detent)); \
  } while (0)
 
-/* TODO C++ template in the next version */
-#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
+/* TODO: C++ template in the next version */
+#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, state, data, len) \
   do { \
     t1ha_state256_t *const s = state; \
     const uint64_t *v = (const uint64_t *)data; \
-    if (BUFFER4COPY != NULL) \
-      memcpy((void *)(v = BUFFER4COPY), data, len); \
     switch (len) { \
     default: \
       mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
@@ -141,13 +138,11 @@ static __always_inline void squash(t1ha_state256_t *s) {
     } \
   } while (0)
 
-/* TODO C++ template in the next version */
-#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
+/* TODO: C++ template in the next version */
+#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, state, data, len) \
   do { \
     t1ha_state256_t *const s = state; \
     const uint64_t *v = (const uint64_t *)data; \
-    if (BUFFER4COPY != NULL) \
-      memcpy((void *)(v = BUFFER4COPY), data, len); \
     switch (len) { \
     default: \
       mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
@@ -207,26 +202,34 @@ uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) {
   t1ha_state256_t state;
   init_ab(&state, seed, length);
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  if (need_copy4align) {
-    uint64_t buffer4align[4];
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  if (unlikely(length > 32)) {
+    init_cd(&state, seed, length);
+    T1HA2_LOOP(le, unaligned, &state, data, length);
+    squash(&state);
+    length &= 31;
+  }
+  T1HA2_TAIL_AB(le, unaligned, &state, data, length);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
     if (unlikely(length > 32)) {
       init_cd(&state, seed, length);
-      T1HA2_LOOP(le, aligned, buffer4align, &state, data, length);
+      T1HA2_LOOP(le, unaligned, &state, data, length);
       squash(&state);
       length &= 31;
     }
-    T1HA2_TAIL_AB(le, aligned, buffer4align, &state, data, length);
+    T1HA2_TAIL_AB(le, unaligned, &state, data, length);
   } else {
     if (unlikely(length > 32)) {
       init_cd(&state, seed, length);
-      T1HA2_LOOP(le, unaligned, NULL, &state, data, length);
+      T1HA2_LOOP(le, aligned, &state, data, length);
       squash(&state);
       length &= 31;
     }
-    T1HA2_TAIL_AB(le, unaligned, NULL, &state, data, length);
+    T1HA2_TAIL_AB(le, aligned, &state, data, length);
   }
+#endif
 }
 
 uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
@@ -236,22 +239,28 @@ uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
                          const void *__restrict data, size_t length, uint64_t seed) {
   t1ha_state256_t state;
   init_ab(&state, seed, length);
   init_cd(&state, seed, length);
 
-  const bool need_copy4align =
-      (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-  if (need_copy4align) {
-    uint64_t buffer4align[4];
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+  if (unlikely(length > 32)) {
+    T1HA2_LOOP(le, unaligned, &state, data, length);
+    length &= 31;
+  }
+  T1HA2_TAIL_ABCD(le, unaligned, &state, data, length);
+#else
+  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+  if (misaligned) {
    if (unlikely(length > 32)) {
-      T1HA2_LOOP(le, aligned, buffer4align, &state, data, length);
+      T1HA2_LOOP(le, unaligned, &state, data, length);
      length &= 31;
    }
-    T1HA2_TAIL_ABCD(le, aligned, buffer4align, &state, data, length);
+    T1HA2_TAIL_ABCD(le, unaligned, &state, data, length);
  } else {
    if (unlikely(length > 32)) {
-      T1HA2_LOOP(le, unaligned, NULL, &state, data, length);
+      T1HA2_LOOP(le, aligned, &state, data, length);
      length &= 31;
    }
-    T1HA2_TAIL_ABCD(le, unaligned, NULL, &state, data, length);
+    T1HA2_TAIL_ABCD(le, aligned, &state, data, length);
  }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -283,13 +292,16 @@ void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data,
   }
 
   if (length >= 32) {
-    const bool need_copy4align =
-        (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0 && !UNALIGNED_OK;
-    if (need_copy4align) {
-      T1HA2_LOOP(le, aligned, ctx->buffer.u64, &ctx->state, data, length);
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+    T1HA2_LOOP(le, unaligned, &ctx->state, data, length);
+#else
+    const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
+    if (misaligned) {
+      T1HA2_LOOP(le, unaligned, &ctx->state, data, length);
    } else {
-      T1HA2_LOOP(le, unaligned, NULL, &ctx->state, data, length);
+      T1HA2_LOOP(le, aligned, &ctx->state, data, length);
    }
+#endif
    length &= 31;
  }
 
@@ -307,13 +319,8 @@ uint64_t t1ha2_final(t1ha_context_t *__restrict ctx,
 
   if (likely(!extra_result)) {
     squash(&ctx->state);
-    T1HA2_TAIL_AB(le, aligned, NULL, &ctx->state, ctx->buffer.u64,
-                  ctx->partial);
-    return final64(ctx->state.n.a, ctx->state.n.b);
+    T1HA2_TAIL_AB(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial);
   }
-  T1HA2_TAIL_ABCD(le, aligned, NULL, &ctx->state, ctx->buffer.u64,
-                  ctx->partial);
-  return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c,
-                  ctx->state.n.d, extra_result);
-}
\ No newline at end of file
+  T1HA2_TAIL_ABCD(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial);
+}
diff --git a/contrib/t1ha/t1ha_bits.h b/contrib/t1ha/t1ha_bits.h
index e3815a4e7..454e43aed 100644
--- a/contrib/t1ha/t1ha_bits.h
+++ b/contrib/t1ha/t1ha_bits.h
@@ -72,19 +72,23 @@
 #error Unsupported byte order.
 #endif
 
-#if !defined(UNALIGNED_OK)
-#if (defined(__ia32__) || defined(__e2k__) || \
-     defined(__ARM_FEATURE_UNALIGNED)) && \
-    !defined(__ALIGNED__)
-#define UNALIGNED_OK 1
+#define T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE 0
+#define T1HA_CONFIG_UNALIGNED_ACCESS__SLOW 1
+#define T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT 2
+
+#ifndef T1HA_CONFIG_UNALIGNED_ACCESS
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+#elif defined(__ia32__)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+#elif defined(__e2k__)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__SLOW
+#elif defined(__ARM_FEATURE_UNALIGNED)
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT
+#else
-#define UNALIGNED_OK 0
+#define T1HA_CONFIG_UNALIGNED_ACCESS T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
 #endif
-#endif /* UNALIGNED_OK */
-
-#if UNALIGNED_OK && !defined(PAGESIZE)
-#define PAGESIZE 4096
-#endif /* PAGESIZE */
+#endif /* T1HA_CONFIG_UNALIGNED_ACCESS */
 
 #define ALIGNMENT_16 2
 #define ALIGNMENT_32 4
@@ -94,6 +98,10 @@
 #define ALIGNMENT_64 4
 #endif
 
+#ifndef PAGESIZE
+#define PAGESIZE 4096
+#endif /* PAGESIZE */
+
 /***************************************************************************/
 
 #ifndef __has_builtin
@@ -118,6 +126,13 @@
 #if __GNUC_PREREQ(4, 4) || defined(__clang__)
 
+#if defined(__ia32__) || defined(__e2k__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__ia32__) && !defined(__cpuid_count)
+#include <cpuid.h>
+#endif
 
 #if defined(__e2k__)
 #include <e2kbuiltin.h>
@@ -393,10 +408,10 @@ typedef struct {
 #endif /* read_unaligned */
 
 #ifndef read_aligned
-#if __has_builtin(assume_aligned)
+#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned)
 #define read_aligned(ptr, bits) \
   (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits))
-#elif __has_attribute(aligned) && !defined(__clang__)
+#elif (__GNUC_PREREQ(3, 3) || __has_attribute(aligned)) && !defined(__clang__)
 #define read_aligned(ptr, bits) \
   (*(const uint##bits##_t __attribute__((aligned(ALIGNMENT_##bits))) *)(ptr))
 #elif __has_attribute(assume_aligned)
@@ -427,6 +442,20 @@ static __always_inline const
 #endif
 #endif /* read_aligned */
 
+#ifndef prefetch
+#if (__GNUC_PREREQ(4, 0) || __has_builtin(__builtin_prefetch)) && \
+    !defined(__ia32__)
+#define prefetch(ptr) __builtin_prefetch(ptr)
+#elif defined(_M_ARM64) || defined(_M_ARM)
+#define prefetch(ptr) __prefetch(ptr)
+#else
+#define prefetch(ptr) \
+  do { \
+    (void)(ptr); \
+  } while (0)
+#endif
+#endif /* prefetch */
+
 #if __has_warning("-Wconstant-logical-operand")
 #if defined(__clang__)
 #pragma clang diagnostic ignored "-Wconstant-logical-operand"
@@ -451,28 +480,33 @@ static __always_inline const
 /*---------------------------------------------------------- Little Endian */
 
-#ifndef fetch64_le_aligned
-static __always_inline uint64_t fetch64_le_aligned(const void *v) {
+#ifndef fetch16_le_aligned
+static __always_inline uint16_t fetch16_le_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_16 == 0);
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_aligned(v, 64);
+  return read_aligned(v, 16);
 #else
-  return bswap64(read_aligned(v, 64));
+  return bswap16(read_aligned(v, 16));
 #endif
 }
-#endif /* fetch64_le_aligned */
+#endif /* fetch16_le_aligned */
 
-#ifndef fetch64_le_unaligned
-static __always_inline uint64_t fetch64_le_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_unaligned(v, 64);
+#ifndef fetch16_le_unaligned
+static __always_inline uint16_t fetch16_le_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  const uint8_t *p = (const uint8_t *)v;
+  return p[0] | (uint16_t)p[1] << 8;
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return read_unaligned(v, 16);
 #else
-  return bswap64(read_unaligned(v, 64));
+  return bswap16(read_unaligned(v, 16));
 #endif
 }
-#endif /* fetch64_le_unaligned */
+#endif /* fetch16_le_unaligned */
 
 #ifndef fetch32_le_aligned
 static __always_inline uint32_t fetch32_le_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_32 == 0);
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   return read_aligned(v, 32);
 #else
@@ -483,7 +517,10 @@ static __always_inline uint32_t fetch32_le_aligned(const void *v) {
 
 #ifndef fetch32_le_unaligned
 static __always_inline uint32_t fetch32_le_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return fetch16_le_unaligned(v) |
+         (uint32_t)fetch16_le_unaligned((const uint8_t *)v + 2) << 16;
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   return read_unaligned(v, 32);
 #else
   return bswap32(read_unaligned(v, 32));
@@ -491,25 +528,29 @@ static __always_inline uint32_t fetch32_le_unaligned(const void *v) {
 }
 #endif /* fetch32_le_unaligned */
 
-#ifndef fetch16_le_aligned
-static __always_inline uint16_t fetch16_le_aligned(const void *v) {
+#ifndef fetch64_le_aligned
+static __always_inline uint64_t fetch64_le_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_64 == 0);
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_aligned(v, 16);
+  return read_aligned(v, 64);
 #else
-  return bswap16(read_aligned(v, 16));
+  return bswap64(read_aligned(v, 64));
 #endif
 }
-#endif /* fetch16_le_aligned */
+#endif /* fetch64_le_aligned */
 
-#ifndef fetch16_le_unaligned
-static __always_inline uint16_t fetch16_le_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  return read_unaligned(v, 16);
+#ifndef fetch64_le_unaligned
+static __always_inline uint64_t fetch64_le_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return fetch32_le_unaligned(v) |
+         (uint64_t)fetch32_le_unaligned((const uint8_t *)v + 4) << 32;
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return read_unaligned(v, 64);
 #else
-  return bswap16(read_unaligned(v, 16));
+  return bswap64(read_unaligned(v, 64));
 #endif
 }
-#endif /* fetch16_le_unaligned */
+#endif /* fetch64_le_unaligned */
 
 static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
   const uint8_t *const p = (const uint8_t *)v;
@@ -517,10 +558,12 @@ static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
   /* We can perform a 'oneshot' read, which is little bit faster. */
   const unsigned shift = ((8 - tail) & 7) << 3;
   return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift);
-#endif /* 'oneshot' read */
-
+#else
   uint64_t r = 0;
   switch (tail & 7) {
+  default:
+    unreachable();
+/* fall through */
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   /* For most CPUs this code is better when not needed byte reordering. */
   case 0:
@@ -577,14 +620,15 @@ static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
     return r + p[0];
 #endif
   }
-  unreachable();
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
 }
 
-#if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \
-    !defined(__SANITIZE_ADDRESS__) && !defined(__sun)
+#if T1HA_USE_FAST_ONESHOT_READ && \
+    T1HA_CONFIG_UNALIGNED_ACCESS != T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE && \
+    defined(PAGESIZE) && !defined(__sun) && !defined(__SANITIZE_ADDRESS__)
 #define can_read_underside(ptr, size) \
   ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0)
-#endif /* can_fast_read */
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
 
 static __always_inline uint64_t tail64_le_unaligned(const void *v,
                                                     size_t tail) {
@@ -600,11 +644,14 @@ static __always_inline uint64_t tail64_le_unaligned(const void *v,
     return fetch64_le_unaligned(p) >> shift;
   }
   return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift);
-#endif /* 'oneshot' read */
-
+#else
   uint64_t r = 0;
   switch (tail & 7) {
-#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  default:
+    unreachable();
+/* fall through */
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT && \
+    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   /* For most CPUs this code is better when not needed
    * copying for alignment or byte reordering. */
   case 0:
@@ -663,36 +710,41 @@ static __always_inline uint64_t tail64_le_unaligned(const void *v,
     return r + p[0];
 #endif
   }
-  unreachable();
+#endif /* can_read_underside */
 }
 
 /*------------------------------------------------------------- Big Endian */
 
-#ifndef fetch64_be_aligned
-static __maybe_unused __always_inline uint64_t
-fetch64_be_aligned(const void *v) {
+#ifndef fetch16_be_aligned
+static __maybe_unused __always_inline uint16_t
+fetch16_be_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_16 == 0);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_aligned(v, 64);
+  return read_aligned(v, 16);
 #else
-  return bswap64(read_aligned(v, 64));
+  return bswap16(read_aligned(v, 16));
 #endif
 }
-#endif /* fetch64_be_aligned */
+#endif /* fetch16_be_aligned */
 
-#ifndef fetch64_be_unaligned
-static __maybe_unused __always_inline uint64_t
-fetch64_be_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_unaligned(v, 64);
+#ifndef fetch16_be_unaligned
+static __maybe_unused __always_inline uint16_t
+fetch16_be_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  const uint8_t *p = (const uint8_t *)v;
+  return (uint16_t)p[0] << 8 | p[1];
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return read_unaligned(v, 16);
 #else
-  return bswap64(read_unaligned(v, 64));
+  return bswap16(read_unaligned(v, 16));
 #endif
 }
-#endif /* fetch64_be_unaligned */
+#endif /* fetch16_be_unaligned */
 
 #ifndef fetch32_be_aligned
 static __maybe_unused __always_inline uint32_t
 fetch32_be_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_32 == 0);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   return read_aligned(v, 32);
 #else
@@ -704,7 +756,10 @@ fetch32_be_aligned(const void *v) {
 #ifndef fetch32_be_unaligned
 static __maybe_unused __always_inline uint32_t
 fetch32_be_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return (uint32_t)fetch16_be_unaligned(v) << 16 |
+         fetch16_be_unaligned((const uint8_t *)v + 2);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return read_unaligned(v, 32);
 #else
   return bswap32(read_unaligned(v, 32));
@@ -712,27 +767,31 @@ fetch32_be_unaligned(const void *v) {
 }
 #endif /* fetch32_be_unaligned */
 
-#ifndef fetch16_be_aligned
-static __maybe_unused __always_inline uint16_t
-fetch16_be_aligned(const void *v) {
+#ifndef fetch64_be_aligned
+static __maybe_unused __always_inline uint64_t
+fetch64_be_aligned(const void *v) {
+  assert(((uintptr_t)v) % ALIGNMENT_64 == 0);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_aligned(v, 16);
+  return read_aligned(v, 64);
 #else
-  return bswap16(read_aligned(v, 16));
+  return bswap64(read_aligned(v, 64));
 #endif
 }
-#endif /* fetch16_be_aligned */
+#endif /* fetch64_be_aligned */
 
-#ifndef fetch16_be_unaligned
-static __maybe_unused __always_inline uint16_t
-fetch16_be_unaligned(const void *v) {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  return read_unaligned(v, 16);
+#ifndef fetch64_be_unaligned
+static __maybe_unused __always_inline uint64_t
+fetch64_be_unaligned(const void *v) {
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE
+  return (uint64_t)fetch32_be_unaligned(v) << 32 |
+         fetch32_be_unaligned((const uint8_t *)v + 4);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return read_unaligned(v, 64);
 #else
-  return bswap16(read_unaligned(v, 16));
+  return bswap64(read_unaligned(v, 64));
 #endif
 }
-#endif /* fetch16_be_unaligned */
+#endif /* fetch64_be_unaligned */
 
 static __maybe_unused __always_inline uint64_t
 tail64_be_aligned(const void *v, size_t tail) {
@@ -741,9 +800,11 @@ static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
   /* We can perform a 'oneshot' read, which is little bit faster. */
   const unsigned shift = ((8 - tail) & 7) << 3;
   return fetch64_be_aligned(p) >> shift;
-#endif /* 'oneshot' read */
-
+#else
   switch (tail & 7) {
+  default:
+    unreachable();
+/* fall through */
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   /* For most CPUs this code is better when not byte reordering. */
   case 1:
@@ -762,7 +823,7 @@ static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
            (uint64_t)fetch32_be_aligned(p) << 24 |
            (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6];
   case 0:
-    return fetch64_be(p);
+    return fetch64_be_aligned(p);
 #else
   case 1:
     return p[0];
@@ -789,7 +850,7 @@ static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
            (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
 #endif
   }
-  unreachable();
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
 }
 
 static __maybe_unused __always_inline uint64_t
@@ -806,10 +867,13 @@ tail64_be_unaligned(const void *v, size_t tail) {
     return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift);
   }
   return fetch64_be_unaligned(p) >> shift;
-#endif /* 'oneshot' read */
-
+#else
   switch (tail & 7) {
-#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  default:
+    unreachable();
+/* fall through */
+#if T1HA_CONFIG_UNALIGNED_ACCESS == T1HA_CONFIG_UNALIGNED_ACCESS__EFFICIENT && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   /* For most CPUs this code is better when not needed
    * copying for alignment or byte reordering. */
   case 1:
@@ -858,7 +922,7 @@ tail64_be_unaligned(const void *v, size_t tail) {
            (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
 #endif
   }
-  unreachable();
+#endif /* can_read_underside */
 }
 
 /***************************************************************************/
-- 
2.39.5
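
Note (not part of the patch above): one recurring technique in this backport is building a wider
little-endian read out of narrower ones when T1HA_CONFIG_UNALIGNED_ACCESS is
T1HA_CONFIG_UNALIGNED_ACCESS__UNABLE, instead of dereferencing a possibly misaligned pointer.
The following standalone C sketch illustrates that composition; the helper names
fetch16_le/fetch32_le/fetch64_le, the test buffer, and main() are illustrative only and are not
the names used in t1ha_bits.h.

    #include <stdint.h>
    #include <stdio.h>

    /* Two byte reads assembled in little-endian order; safe for any alignment. */
    static uint16_t fetch16_le(const void *v) {
      const uint8_t *p = (const uint8_t *)v;
      return p[0] | (uint16_t)p[1] << 8;
    }

    /* 32-bit value composed from two 16-bit fetches, low half first. */
    static uint32_t fetch32_le(const void *v) {
      return fetch16_le(v) |
             (uint32_t)fetch16_le((const uint8_t *)v + 2) << 16;
    }

    /* 64-bit value composed from two 32-bit fetches, mirroring the patch's fallback. */
    static uint64_t fetch64_le(const void *v) {
      return fetch32_le(v) |
             (uint64_t)fetch32_le((const uint8_t *)v + 4) << 32;
    }

    int main(void) {
      /* Read at offset +1 so the source pointer is deliberately misaligned. */
      const uint8_t buf[9] = {0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
      printf("0x%016llx\n", (unsigned long long)fetch64_le(buf + 1)); /* 0x8877665544332211 */
      return 0;
    }

On targets where unaligned loads are efficient, the patch instead dispatches at compile time to
the plain read_unaligned()/read_aligned() paths, so this byte-wise composition is only the
portable fallback.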