[Minor] Backport unaligned access fixes from t1ha

Issue: #2222
Vsevolod Stakhov 2018-05-14 16:14:58 +01:00
parent 1a21511cf9
commit 974d1dbcab
3 changed files with 620 additions and 379 deletions
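For context: the crux of this backport is that dereferencing a const uint64_t * which is not 8-byte aligned is undefined behaviour in C and faults outright on strict-alignment targets, so the patch routes such reads through dedicated *_unaligned fetchers or a bounce-buffer copy. A minimal sketch of the safe pattern (illustrative only; the names below are not from the patch):

#include <stdint.h>
#include <string.h>

/* Portable unaligned 64-bit load: memcpy is defined for any byte address,
 * and compilers lower it to a single load where the CPU allows it. */
static uint64_t load64_any(const void *p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}
/* By contrast, *(const uint64_t *)p is only valid when p is suitably aligned. */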

View File

@@ -58,81 +58,89 @@ static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) {
return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0);
}
/* TODO C++ template in the next version */
#define T1HA1_BODY(ENDIANNES, ALIGNESS, DOCOPY) \
if (unlikely(len > 32)) { \
uint64_t c = rot64(len, 17) + seed; \
uint64_t d = len ^ rot64(seed, 17); \
const void *detent = (const uint8_t *)data + len - 31; \
do { \
const uint64_t *v = (const uint64_t *)data; \
if (DOCOPY) \
memcpy((void *)(v = align), data, 32); \
\
const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \
const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \
const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \
const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \
\
const uint64_t d02 = w0 ^ rot64(w2 + d, 17); \
const uint64_t c13 = w1 ^ rot64(w3 + c, 17); \
c += a ^ rot64(w0, 41); \
d -= b ^ rot64(w1, 31); \
a ^= prime_1 * (d02 + w3); \
b ^= prime_0 * (c13 + w2); \
data = (const uint64_t *)data + 4; \
} while (likely(data < detent)); \
\
a ^= prime_6 * (rot64(c, 17) + d); \
b ^= prime_5 * (c + rot64(d, 17)); \
len &= 31; \
} \
\
const uint64_t *v = (const uint64_t *)data; \
if (unlikely(need_copy4align) && len > 8) \
memcpy((void *)(v = align), data, len); \
\
switch (len) { \
default: \
b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_4); \
/* fall through */ \
case 24: \
case 23: \
case 22: \
case 21: \
case 20: \
case 19: \
case 18: \
case 17: \
a += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_3); \
/* fall through */ \
case 16: \
case 15: \
case 14: \
case 13: \
case 12: \
case 11: \
case 10: \
case 9: \
b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_2); \
/* fall through */ \
case 8: \
case 7: \
case 6: \
case 5: \
case 4: \
case 3: \
case 2: \
case 1: \
a += mux64(tail64_##ENDIANNES##_##ALIGNESS(v, len), prime_1); \
/* fall through */ \
case 0: \
return final_weak_avalanche(a, b); \
}
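For orientation: the ENDIANNES/ALIGNESS parameters are pasted onto the fetch names with ##, so each instantiation of T1HA1_BODY binds to a concrete reader at compile time. A stand-alone sketch of that dispatch (the helper bodies below are assumptions for illustration, not the patch's code):

#include <stdint.h>
#include <string.h>

static uint64_t fetch64_le_unaligned_sketch(const void *p) {
  uint64_t v;
  memcpy(&v, p, 8);            /* legal at any address */
  return v;                    /* little-endian host assumed for the sketch */
}
static uint64_t fetch64_le_aligned_sketch(const void *p) {
  return *(const uint64_t *)p; /* requires an 8-byte aligned p */
}

#define FETCH64(ENDIANNES, ALIGNESS, p) fetch64_##ENDIANNES##_##ALIGNESS##_sketch(p)

/* FETCH64(le, aligned, p)   expands to fetch64_le_aligned_sketch(p)
 * FETCH64(le, unaligned, p) expands to fetch64_le_unaligned_sketch(p) */
uint64_t fetch_demo(const void *p, int aligned) {
  return aligned ? FETCH64(le, aligned, p) : FETCH64(le, unaligned, p);
}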
uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) {
uint64_t a = seed;
uint64_t b = len;
const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
const bool need_copy4align =
(((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK;
uint64_t align[4];
if (unlikely(len > 32)) {
uint64_t c = rot64(len, 17) + seed;
uint64_t d = len ^ rot64(seed, 17);
const void *detent = (const uint8_t *)data + len - 31;
do {
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_align))
v = (const uint64_t *)memcpy(&align, unaligned(v), 32);
uint64_t w0 = fetch64_le(v + 0);
uint64_t w1 = fetch64_le(v + 1);
uint64_t w2 = fetch64_le(v + 2);
uint64_t w3 = fetch64_le(v + 3);
uint64_t d02 = w0 ^ rot64(w2 + d, 17);
uint64_t c13 = w1 ^ rot64(w3 + c, 17);
c += a ^ rot64(w0, 41);
d -= b ^ rot64(w1, 31);
a ^= prime_1 * (d02 + w3);
b ^= prime_0 * (c13 + w2);
data = (const uint64_t *)data + 4;
} while (likely(data < detent));
a ^= prime_6 * (rot64(c, 17) + d);
b ^= prime_5 * (c + rot64(d, 17));
len &= 31;
}
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_align) && len > 8)
v = (const uint64_t *)memcpy(&align, unaligned(v), len);
switch (len) {
default:
b += mux64(fetch64_le(v++), prime_4);
/* fall through */
case 24:
case 23:
case 22:
case 21:
case 20:
case 19:
case 18:
case 17:
a += mux64(fetch64_le(v++), prime_3);
/* fall through */
case 16:
case 15:
case 14:
case 13:
case 12:
case 11:
case 10:
case 9:
b += mux64(fetch64_le(v++), prime_2);
/* fall through */
case 8:
case 7:
case 6:
case 5:
case 4:
case 3:
case 2:
case 1:
a += mux64(tail64_le(v, len), prime_1);
/* fall through */
case 0:
return final_weak_avalanche(a, b);
if (need_copy4align) {
T1HA1_BODY(le, aligned, true);
} else {
T1HA1_BODY(le, unaligned, false);
}
}
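A caller-side sketch of what the need_copy4align dispatch above protects against, assuming the usual prototype from t1ha.h; the buffer and offset are illustrative:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed);

uint64_t hash_misaligned_example(void) {
  unsigned char storage[64];
  memset(storage, 0x2a, sizeof(storage));
  /* storage + 1 is generally not 8-byte aligned, so on strict-alignment
   * builds the aligned T1HA1_BODY variant copies each 32-byte block into
   * the local align[4] buffer before fetching. */
  return t1ha1_le(storage + 1, 40, 0);
}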
@@ -140,76 +148,12 @@ uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) {
uint64_t a = seed;
uint64_t b = len;
const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
const bool need_copy4align =
(((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK;
uint64_t align[4];
if (unlikely(len > 32)) {
uint64_t c = rot64(len, 17) + seed;
uint64_t d = len ^ rot64(seed, 17);
const void *detent = (const uint8_t *)data + len - 31;
do {
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_align))
v = (const uint64_t *)memcpy(&align, unaligned(v), 32);
uint64_t w0 = fetch64_be(v + 0);
uint64_t w1 = fetch64_be(v + 1);
uint64_t w2 = fetch64_be(v + 2);
uint64_t w3 = fetch64_be(v + 3);
uint64_t d02 = w0 ^ rot64(w2 + d, 17);
uint64_t c13 = w1 ^ rot64(w3 + c, 17);
c += a ^ rot64(w0, 41);
d -= b ^ rot64(w1, 31);
a ^= prime_1 * (d02 + w3);
b ^= prime_0 * (c13 + w2);
data = (const uint64_t *)data + 4;
} while (likely(data < detent));
a ^= prime_6 * (rot64(c, 17) + d);
b ^= prime_5 * (c + rot64(d, 17));
len &= 31;
if (need_copy4align) {
T1HA1_BODY(be, aligned, true);
} else {
T1HA1_BODY(be, unaligned, false);
}
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_align) && len > 8)
v = (const uint64_t *)memcpy(&align, unaligned(v), len);
switch (len) {
default:
b += mux64(fetch64_be(v++), prime_4);
/* fall through */
case 24:
case 23:
case 22:
case 21:
case 20:
case 19:
case 18:
case 17:
a += mux64(fetch64_be(v++), prime_3);
/* fall through */
case 16:
case 15:
case 14:
case 13:
case 12:
case 11:
case 10:
case 9:
b += mux64(fetch64_be(v++), prime_2);
/* fall through */
case 8:
case 7:
case 6:
case 5:
case 4:
case 3:
case 2:
case 1:
a += mux64(tail64_be(v, len), prime_1);
/* fall through */
case 0:
return final_weak_avalanche(a, b);
}
}
}

View File

@@ -56,128 +56,140 @@ static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x,
s->n.d = ~y + rot64(x, 19);
}
static __always_inline void update(t1ha_state256_t *__restrict s,
const uint64_t *__restrict v) {
uint64_t w0 = fetch64_le(v + 0);
uint64_t w1 = fetch64_le(v + 1);
uint64_t w2 = fetch64_le(v + 2);
uint64_t w3 = fetch64_le(v + 3);
uint64_t d02 = w0 + rot64(w2 + s->n.d, 56);
uint64_t c13 = w1 + rot64(w3 + s->n.c, 19);
#ifdef __e2k__
/* FIXME: temporary workaround for lcc's ELBRUS scheduling bug (LY) */
s->n.c ^= s->n.a + rot64(w0, 57);
s->n.d ^= s->n.b + rot64(w1, 38);
#else
s->n.d ^= s->n.b + rot64(w1, 38);
s->n.c ^= s->n.a + rot64(w0, 57);
#endif
s->n.b ^= prime_6 * (c13 + w2);
s->n.a ^= prime_5 * (d02 + w3);
}
/* TODO C++ template in the next version */
#define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v) \
do { \
t1ha_state256_t *const s = state; \
const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \
const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \
const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \
const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \
\
const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); \
const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); \
s->n.d ^= s->n.b + rot64(w1, 38); \
s->n.c ^= s->n.a + rot64(w0, 57); \
s->n.b ^= prime_6 * (c13 + w2); \
s->n.a ^= prime_5 * (d02 + w3); \
} while (0)
static __always_inline void squash(t1ha_state256_t *s) {
s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23));
s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d);
}
static __always_inline const void *
loop(bool need_copy4align, uint64_t *__restrict buffer4align,
t1ha_state256_t *__restrict s, const void *__restrict data, size_t len) {
const void *detent = (const uint8_t *)data + len - 31;
do {
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_copy4align))
v = (const uint64_t *)memcpy(buffer4align, unaligned(v), 32);
update(s, v);
data = (const uint64_t *)data + 4;
} while (likely(data < detent));
return data;
}
/* TODO C++ template in the next version */
#define T1HA2_LOOP(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
do { \
const void *detent = (const uint8_t *)data + len - 31; \
do { \
const uint64_t *v = (const uint64_t *)data; \
if (BUFFER4COPY != NULL) \
memcpy((void *)(v = BUFFER4COPY), data, 32); \
T1HA2_UPDATE(le, unaligned, state, v); \
data = (const uint64_t *)data + 4; \
} while (likely(data < detent)); \
} while (0)
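The BUFFER4COPY argument doubles as the switch between the two bodies: passing a real buffer makes every 32-byte block go through memcpy before the fetches, while passing the literal NULL lets the compiler fold the test away so the unaligned instantiation reads in place. A reduced sketch of that idiom (names invented for illustration):

#include <stdint.h>
#include <string.h>

#define LOAD_BLOCK(BUFFER4COPY, v, data)                 \
  do {                                                   \
    if ((BUFFER4COPY) != NULL)                           \
      memcpy((void *)((v) = (BUFFER4COPY)), (data), 32); \
  } while (0)

void round_input(const void *data, uint64_t bounce[4]) {
  const uint64_t *v = (const uint64_t *)data;
  LOAD_BLOCK(bounce, v, data); /* v now points at the aligned copy of the block */
  /* ... v[0..3] would feed the round function ... */
  (void)v;
}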
static __always_inline void tail_ab(t1ha_state256_t *__restrict s,
const uint64_t *__restrict v, size_t len) {
switch (len) {
default:
mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_4);
/* fall through */
case 24:
case 23:
case 22:
case 21:
case 20:
case 19:
case 18:
case 17:
mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3);
/* fall through */
case 16:
case 15:
case 14:
case 13:
case 12:
case 11:
case 10:
case 9:
mixup64(&s->n.a, &s->n.b, fetch64_le(v++), prime_2);
/* fall through */
case 8:
case 7:
case 6:
case 5:
case 4:
case 3:
case 2:
case 1:
mixup64(&s->n.b, &s->n.a, tail64_le(v, len), prime_1);
/* fall through */
case 0:
return;
}
}
/* TODO C++ template in the next version */
#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
do { \
t1ha_state256_t *const s = state; \
const uint64_t *v = (const uint64_t *)data; \
if (BUFFER4COPY != NULL) \
memcpy((void *)(v = BUFFER4COPY), data, len); \
switch (len) { \
default: \
mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
prime_4); \
/* fall through */ \
case 24: \
case 23: \
case 22: \
case 21: \
case 20: \
case 19: \
case 18: \
case 17: \
mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
prime_3); \
/* fall through */ \
case 16: \
case 15: \
case 14: \
case 13: \
case 12: \
case 11: \
case 10: \
case 9: \
mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
prime_2); \
/* fall through */ \
case 8: \
case 7: \
case 6: \
case 5: \
case 4: \
case 3: \
case 2: \
case 1: \
mixup64(&s->n.b, &s->n.a, tail64_##ENDIANNES##_##ALIGNESS(v, len), \
prime_1); \
/* fall through */ \
case 0: \
return final64(s->n.a, s->n.b); \
} \
} while (0)
static __always_inline void tail_abcd(t1ha_state256_t *__restrict s,
const uint64_t *__restrict v,
size_t len) {
switch (len) {
default:
mixup64(&s->n.a, &s->n.d, fetch64_le(v++), prime_4);
/* fall through */
case 24:
case 23:
case 22:
case 21:
case 20:
case 19:
case 18:
case 17:
mixup64(&s->n.b, &s->n.a, fetch64_le(v++), prime_3);
/* fall through */
case 16:
case 15:
case 14:
case 13:
case 12:
case 11:
case 10:
case 9:
mixup64(&s->n.c, &s->n.b, fetch64_le(v++), prime_2);
/* fall through */
case 8:
case 7:
case 6:
case 5:
case 4:
case 3:
case 2:
case 1:
mixup64(&s->n.d, &s->n.c, tail64_le(v, len), prime_1);
/* fall through */
case 0:
return;
}
}
/* TODO C++ template in the next version */
#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, BUFFER4COPY, state, data, len) \
do { \
t1ha_state256_t *const s = state; \
const uint64_t *v = (const uint64_t *)data; \
if (BUFFER4COPY != NULL) \
memcpy((void *)(v = BUFFER4COPY), data, len); \
switch (len) { \
default: \
mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
prime_4); \
/* fall through */ \
case 24: \
case 23: \
case 22: \
case 21: \
case 20: \
case 19: \
case 18: \
case 17: \
mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
prime_3); \
/* fall through */ \
case 16: \
case 15: \
case 14: \
case 13: \
case 12: \
case 11: \
case 10: \
case 9: \
mixup64(&s->n.c, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \
prime_2); \
/* fall through */ \
case 8: \
case 7: \
case 6: \
case 5: \
case 4: \
case 3: \
case 2: \
case 1: \
mixup64(&s->n.d, &s->n.c, tail64_##ENDIANNES##_##ALIGNESS(v, len), \
prime_1); \
/* fall through */ \
case 0: \
return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result); \
} \
} while (0)
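The TAIL_AB/TAIL_ABCD switches are an unrolled form of "consume whole 8-byte little-endian words, then one zero-padded partial word" over the at-most-31-byte remainder. A plain-loop restatement for intuition (little-endian host assumed; not the patch's code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Gather the 1..31 remaining bytes into up to four words, zero-padded;
 * these are the values the fall-through cases feed into mixup64(). */
static void gather_tail_words(const uint8_t *p, size_t len, uint64_t words[4]) {
  size_t full = len >> 3; /* complete 8-byte words */
  size_t rest = len & 7;  /* bytes in the partial word */
  size_t i;
  memset(words, 0, 4 * sizeof(uint64_t));
  for (i = 0; i < full; i++)
    memcpy(&words[i], p + i * 8, 8);          /* safe at any alignment */
  if (rest)
    memcpy(&words[full], p + full * 8, rest); /* low bytes set, rest stays zero */
}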
static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c,
uint64_t d, uint64_t *h) {
@@ -195,22 +207,26 @@ uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) {
t1ha_state256_t state;
init_ab(&state, seed, length);
const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
uint64_t buffer4align[4];
if (unlikely(length > 32)) {
init_cd(&state, seed, length);
data = loop(need_copy4align, buffer4align, &state, data, length);
squash(&state);
length &= 31;
const bool need_copy4align =
(((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK;
if (need_copy4align) {
uint64_t buffer4align[4];
if (unlikely(length > 32)) {
init_cd(&state, seed, length);
T1HA2_LOOP(le, aligned, buffer4align, &state, data, length);
squash(&state);
length &= 31;
}
T1HA2_TAIL_AB(le, aligned, buffer4align, &state, data, length);
} else {
if (unlikely(length > 32)) {
init_cd(&state, seed, length);
T1HA2_LOOP(le, unaligned, NULL, &state, data, length);
squash(&state);
length &= 31;
}
T1HA2_TAIL_AB(le, unaligned, NULL, &state, data, length);
}
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_copy4align) && length > 8)
v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length);
tail_ab(&state, v, length);
return final64(state.n.a, state.n.b);
}
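A minimal one-shot caller, assuming the public prototype from t1ha.h; the seed value is arbitrary:

#include <stddef.h>
#include <stdint.h>

uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed);

uint64_t digest_key(const char *key, size_t key_len) {
  /* Any key pointer is fine; alignment is handled internally as shown above. */
  return t1ha2_atonce(key, key_len, 0x0123456789abcdefULL);
}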
uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
@@ -220,20 +236,22 @@ uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
init_ab(&state, seed, length);
init_cd(&state, seed, length);
const int need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
uint64_t buffer4align[4];
if (unlikely(length > 32)) {
data = loop(need_copy4align, buffer4align, &state, data, length);
length &= 31;
const bool need_copy4align =
(((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK;
if (need_copy4align) {
uint64_t buffer4align[4];
if (unlikely(length > 32)) {
T1HA2_LOOP(le, aligned, buffer4align, &state, data, length);
length &= 31;
}
T1HA2_TAIL_ABCD(le, aligned, buffer4align, &state, data, length);
} else {
if (unlikely(length > 32)) {
T1HA2_LOOP(le, unaligned, NULL, &state, data, length);
length &= 31;
}
T1HA2_TAIL_ABCD(le, unaligned, NULL, &state, data, length);
}
const uint64_t *v = (const uint64_t *)data;
if (unlikely(need_copy4align) && length > 8)
v = (const uint64_t *)memcpy(&buffer4align, unaligned(v), length);
tail_abcd(&state, v, length);
return final128(state.n.a, state.n.b, state.n.c, state.n.d, extra_result);
}
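The 128-bit variant returns one half of the result and writes the other through extra_result; a usage sketch with the prototype assumed from t1ha.h (the __restrict qualifiers are dropped here for brevity):

#include <stddef.h>
#include <stdint.h>

uint64_t t1ha2_atonce128(uint64_t *extra_result, const void *data,
                         size_t length, uint64_t seed);

void digest128(const void *data, size_t length, uint64_t out[2]) {
  out[0] = t1ha2_atonce128(&out[1], data, length, 0);
}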
//------------------------------------------------------------------------------
@@ -252,7 +270,7 @@ void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data,
if (ctx->partial) {
const size_t left = 32 - ctx->partial;
const size_t chunk = (length >= left) ? left : length;
memcpy(ctx->buffer.bytes + ctx->partial, unaligned(data), chunk);
memcpy(ctx->buffer.bytes + ctx->partial, data, chunk);
ctx->partial += chunk;
if (ctx->partial < 32) {
assert(left >= length);
@@ -261,37 +279,41 @@ void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data,
ctx->partial = 0;
data = (const uint8_t *)data + chunk;
length -= chunk;
update(&ctx->state, ctx->buffer.u64);
T1HA2_UPDATE(le, aligned, &ctx->state, ctx->buffer.u64);
}
if (length >= 32) {
const bool need_copy4align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
if (need_copy4align)
data = loop(true, ctx->buffer.u64, &ctx->state, data, length);
else
data = loop(false, NULL, &ctx->state, data, length);
const bool need_copy4align =
(((uintptr_t)data) & (ALIGMENT_64 - 1)) != 0 && !UNALIGNED_OK;
if (need_copy4align) {
T1HA2_LOOP(le, aligned, ctx->buffer.u64, &ctx->state, data, length);
} else {
T1HA2_LOOP(le, unaligned, NULL, &ctx->state, data, length);
}
length &= 31;
}
if (length)
memcpy(ctx->buffer.bytes, unaligned(data), ctx->partial = length);
memcpy(ctx->buffer.bytes, data, ctx->partial = length);
}
uint64_t t1ha2_final(t1ha_context_t *__restrict ctx,
uint64_t *__restrict extra_result) {
uint64_t bytes = (ctx->total << 3) ^ (UINT64_C(1) << 63);
uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
bytes = bswap64(bytes);
bits = bswap64(bits);
#endif
t1ha2_update(ctx, &bytes, 8);
t1ha2_update(ctx, &bits, 8);
if (likely(!extra_result)) {
squash(&ctx->state);
tail_ab(&ctx->state, ctx->buffer.u64, ctx->partial);
T1HA2_TAIL_AB(le, aligned, NULL, &ctx->state, ctx->buffer.u64,
ctx->partial);
return final64(ctx->state.n.a, ctx->state.n.b);
}
tail_abcd(&ctx->state, ctx->buffer.u64, ctx->partial);
T1HA2_TAIL_ABCD(le, aligned, NULL, &ctx->state, ctx->buffer.u64,
ctx->partial);
return final128(ctx->state.n.a, ctx->state.n.b, ctx->state.n.c,
ctx->state.n.d, extra_result);
}
}
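For completeness, the streaming entry points above are typically used as below; the signatures are assumed from t1ha.h and the include path is illustrative:

#include <stddef.h>
#include <stdint.h>
#include "t1ha.h" /* assumed location of the streaming API declarations */

uint64_t digest_stream(const void *const chunks[], const size_t sizes[], size_t n) {
  t1ha_context_t ctx;
  size_t i;
  t1ha2_init(&ctx, /* seed_x */ 0, /* seed_y */ 0);
  for (i = 0; i < n; i++)
    t1ha2_update(&ctx, chunks[i], sizes[i]); /* partial blocks land in ctx->buffer */
  return t1ha2_final(&ctx, NULL);            /* NULL: 64-bit result, no extra half */
}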

View File

@@ -86,6 +86,14 @@
#define PAGESIZE 4096
#endif /* PAGESIZE */
#define ALIGMENT_16 2
#define ALIGMENT_32 4
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#define ALIGMENT_64 8
#else
#define ALIGMENT_64 4
#endif
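These constants feed the ((uintptr_t)data & (ALIGMENT_64 - 1)) tests used throughout the hashing code; restated on its own (constant copied from the 64-bit branch above):

#include <stdint.h>

#define ALIGMENT_64 8 /* as defined above for 64-bit targets */

static int is_aligned_for_u64(const void *p) {
  /* Power-of-two mask: the low bits are all zero exactly when p is 8-byte aligned. */
  return (((uintptr_t)p) & (ALIGMENT_64 - 1)) == 0;
}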
/***************************************************************************/
#ifndef __has_builtin
@@ -160,10 +168,8 @@ static __maybe_unused __always_inline void e2k_add64carry_last(unsigned carry,
e2k_add64carry_last(carry, base, addend, sum)
#endif /* __iset__ >= 5 */
#if 0 /* LY: unreasonable, because alignment is required :( */
#define fetch64_be(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr))
#define fetch32_be(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr))
#endif
#define fetch64_be_aligned(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr))
#define fetch32_be_aligned(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr))
#endif /* __e2k__ Elbrus */
@@ -337,86 +343,174 @@ static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; }
#endif
#endif /* bswap16 */
#ifndef unaligned
#if defined(__LCC__)
#pragma diag_suppress wrong_entity_for_attribute
#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr))
#elif defined(__clang__)
#pragma clang diagnostic ignored "-Wignored-attributes"
#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr))
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpacked"
#define unaligned(ptr) ((const char __attribute__((packed, aligned(1))) *)(ptr))
#if defined(__GNUC__) || (__has_attribute(packed) && __has_attribute(aligned))
typedef struct {
uint8_t unaligned_8;
uint16_t unaligned_16;
uint32_t unaligned_32;
uint64_t unaligned_64;
} __attribute__((packed, aligned(1))) t1ha_unaligned_proxy;
#define read_unaligned(ptr, bits) \
(((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \
t1ha_unaligned_proxy, unaligned_##bits))) \
->unaligned_##bits)
#define read_aligned(ptr, bits) \
(*(const __attribute__((aligned(ALIGMENT_##bits))) uint##bits##_t *)(ptr))
#elif defined(_MSC_VER)
#pragma warning( \
disable : 4235) /* nonstandard extension used: '__unaligned' \
* keyword not supported on this architecture */
#define unaligned(ptr) ((const char __unaligned *)(ptr))
#define read_unaligned(ptr, bits) (*(const __unaligned uint##bits##_t *)(ptr))
#define read_aligned(ptr, bits) \
(*(const __declspec(align(ALIGMENT_##bits)) uint##bits##_t *)(ptr))
#else
#define unaligned(ptr) ((const char *)(ptr))
#error FIXME
#define read_unaligned(ptr, bits) \
(*(const uint##bits##_t *)((const char *)(ptr)))
#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr))
#endif /* read_unaligned */
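The read_unaligned proxy leans on a packed, alignment-1 struct so the compiler knows the access may be misaligned and emits byte loads (or a plain unaligned load where the ISA allows it) instead of assuming natural alignment. A stripped-down GCC/Clang-style sketch of the same trick:

#include <stdint.h>

typedef struct {
  uint32_t u32;
} __attribute__((packed, aligned(1))) unaligned_u32_proxy;

static uint32_t read_u32_anywhere(const void *p) {
  /* Going through the packed type keeps the misaligned access well-defined
   * from the compiler's point of view. */
  return ((const unaligned_u32_proxy *)p)->u32;
}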
#if 0
#ifndef DECLARE_UNALIGNED_PTR
#if defined(__LCC__)
#pragma diag_suppress wrong_entity_for_attribute
#define DECLARE_UNALIGNED_PTR(ptr) char __attribute__((packed)) * ptr
#elif defined(__clang__)
#pragma clang diagnostic ignored "-Wignored-attributes"
#define DECLARE_UNALIGNED_PTR(ptr) char __attribute__((packed)) * ptr
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpacked"
#define DECLARE_UNALIGNED_PTR(ptr) char __attribute__((packed)) * ptr
#elif defined(_MSC_VER)
#pragma warning( \
disable : 4235) /* nonstandard extension used: '__unaligned' \
* keyword not supported on this architecture */
#define DECLARE_UNALIGNED_PTR(ptr) char __unaligned *ptr
#else
#define DECLARE_UNALIGNED_PTR(ptr) char *ptr
#endif
#endif /* unaligned */
#endif /* cast_unaligned */
#ifndef cast_unaligned
#if defined(__LCC__)
#pragma diag_suppress wrong_entity_for_attribute
#define cast_unaligned(ptr) ((const char __attribute__((packed)) *)(ptr))
#elif defined(__clang__)
#pragma clang diagnostic ignored "-Wignored-attributes"
#define cast_unaligned(ptr) ((const char __attribute__((packed)) *)(ptr))
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpacked"
#define cast_unaligned(ptr) ((const char __attribute__((packed)) *)(ptr))
#elif defined(_MSC_VER)
#pragma warning( \
disable : 4235) /* nonstandard extension used: '__unaligned' \
* keyword not supported on this architecture */
#define cast_unaligned(ptr) ((const char __unaligned *)(ptr))
#else
#define cast_unaligned(ptr) ((const char *)(ptr))
#endif
#endif /* cast_unaligned */
#ifndef cast_aligned
#if defined(__LCC__)
#pragma diag_suppress wrong_entity_for_attribute
#define cast_aligned(type, ptr) \
((const type __attribute__((aligned(sizeof(type)))) *)(ptr))
#elif defined(__clang__)
#pragma clang diagnostic ignored "-Wignored-attributes"
#define cast_aligned(type, ptr) \
((const type __attribute__((aligned(sizeof(type)))) *)(ptr))
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpacked"
#define cast_aligned(type, ptr) \
((const type __attribute__((aligned(sizeof(type)))) *)(ptr))
#elif defined(_MSC_VER)
#define cast_aligned(type, ptr) \
((const type __declspec((align(sizeof(type)))) *)(ptr))
#else
#define cast_aligned(type, ptr) ((const type *)(ptr))
#endif
#endif /* cast_aligned */
#endif /* 0 */
/***************************************************************************/
#ifndef fetch64_le
static __always_inline uint64_t fetch64_le(const void *v) {
/*---------------------------------------------------------- Little Endian */
#ifndef fetch64_le_aligned
static __always_inline uint64_t fetch64_le_aligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return *(const uint64_t *)v;
return read_aligned(v, 64);
#else
return bswap64(*(const uint64_t *)v);
return bswap64(read_aligned(v, 64));
#endif
}
#endif /* fetch64_le */
#endif /* fetch64_le_aligned */
#ifndef fetch32_le
static __always_inline uint32_t fetch32_le(const void *v) {
#ifndef fetch64_le_unaligned
static __always_inline uint64_t fetch64_le_unaligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return *(const uint32_t *)v;
return read_unaligned(v, 64);
#else
return bswap32(*(const uint32_t *)v);
return bswap64(read_unaligned(v, 64));
#endif
}
#endif /* fetch32_le */
#endif /* fetch64_le_unaligned */
#ifndef fetch16_le
static __always_inline uint16_t fetch16_le(const void *v) {
#ifndef fetch32_le_aligned
static __always_inline uint32_t fetch32_le_aligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return *(const uint16_t *)v;
return read_aligned(v, 32);
#else
return bswap16(*(const uint16_t *)v);
return bswap32(read_aligned(v, 32));
#endif
}
#endif /* fetch16_le */
#endif /* fetch32_le_aligned */
#if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \
!defined(__SANITIZE_ADDRESS__) && !defined(__sun)
#define can_read_underside(ptr, size) \
((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0)
#endif /* can_fast_read */
#ifndef fetch32_le_unaligned
static __always_inline uint32_t fetch32_le_unaligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return read_unaligned(v, 32);
#else
return bswap32(read_unaligned(v, 32));
#endif
}
#endif /* fetch32_le_unaligned */
static __always_inline uint64_t tail64_le(const void *v, size_t tail) {
const uint8_t *p = (const uint8_t *)v;
#ifdef can_read_underside
/* On some systems (e.g. x86) we can perform a 'oneshot' read, which
* is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com>
* for the reminder. */
const unsigned offset = (8 - tail) & 7;
const unsigned shift = offset << 3;
if (likely(can_read_underside(p, 8))) {
p -= offset;
return fetch64_le(p) >> shift;
}
return fetch64_le(p) & ((~UINT64_C(0)) >> shift);
#ifndef fetch16_le_aligned
static __always_inline uint16_t fetch16_le_aligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return read_aligned(v, 16);
#else
return bswap16(read_aligned(v, 16));
#endif
}
#endif /* fetch16_le_aligned */
#ifndef fetch16_le_unaligned
static __always_inline uint16_t fetch16_le_unaligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return read_unaligned(v, 16);
#else
return bswap16(read_unaligned(v, 16));
#endif
}
#endif /* fetch16_le_unaligned */
static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) {
const uint8_t *const p = (const uint8_t *)v;
#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__)
/* We can perform a 'oneshot' read, which is little bit faster. */
const unsigned shift = ((8 - tail) & 7) << 3;
return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift);
#endif /* 'oneshot' read */
uint64_t r = 0;
switch (tail & 7) {
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/* For most CPUs this code is better when not needed
* copying for alignment or byte reordering. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/* For most CPUs this code is better when not needed byte reordering. */
case 0:
return fetch64_le(p);
return fetch64_le_aligned(p);
case 7:
r = (uint64_t)p[6] << 8;
/* fall through */
@@ -429,12 +523,96 @@ static __always_inline uint64_t tail64_le(const void *v, size_t tail) {
r <<= 32;
/* fall through */
case 4:
return r + fetch32_le(p);
return r + fetch32_le_aligned(p);
case 3:
r = (uint64_t)p[2] << 16;
/* fall through */
case 2:
return r + fetch16_le(p);
return r + fetch16_le_aligned(p);
case 1:
return p[0];
#else
case 0:
r = p[7] << 8;
/* fall through */
case 7:
r += p[6];
r <<= 8;
/* fall through */
case 6:
r += p[5];
r <<= 8;
/* fall through */
case 5:
r += p[4];
r <<= 8;
/* fall through */
case 4:
r += p[3];
r <<= 8;
/* fall through */
case 3:
r += p[2];
r <<= 8;
/* fall through */
case 2:
r += p[1];
r <<= 8;
/* fall through */
case 1:
return r + p[0];
#endif
}
unreachable();
}
#if T1HA_USE_FAST_ONESHOT_READ && UNALIGNED_OK && defined(PAGESIZE) && \
!defined(__SANITIZE_ADDRESS__) && !defined(__sun)
#define can_read_underside(ptr, size) \
((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0)
#endif /* can_fast_read */
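The can_read_underside test enables the "oneshot" trick: when the pointer's offset inside its page is at least 8, the 8-byte window may start a few bytes below the tail without crossing backwards into an unmapped page. A restated sketch using the PAGESIZE fallback from this header:

#include <stdint.h>

#define PAGESIZE 4096 /* matches the fallback definition above */

/* For the size == 8 case used here: true when ptr sits at least 8 bytes past
 * the start of its page, so reading 8 bytes from (ptr - offset), offset <= 7,
 * stays within the same page. */
static int can_read_underside_sketch(const void *ptr, unsigned size) {
  return size <= sizeof(uintptr_t) &&
         ((PAGESIZE - size) & (uintptr_t)ptr) != 0;
}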
static __always_inline uint64_t tail64_le_unaligned(const void *v,
size_t tail) {
const uint8_t *p = (const uint8_t *)v;
#ifdef can_read_underside
/* On some systems (e.g. x86) we can perform a 'oneshot' read, which
* is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com>
* for the reminder. */
const unsigned offset = (8 - tail) & 7;
const unsigned shift = offset << 3;
if (likely(can_read_underside(p, 8))) {
p -= offset;
return fetch64_le_unaligned(p) >> shift;
}
return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift);
#endif /* 'oneshot' read */
uint64_t r = 0;
switch (tail & 7) {
#if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/* For most CPUs this code is better when not needed
* copying for alignment or byte reordering. */
case 0:
return fetch64_le_unaligned(p);
case 7:
r = (uint64_t)p[6] << 8;
/* fall through */
case 6:
r += p[5];
r <<= 8;
/* fall through */
case 5:
r += p[4];
r <<= 32;
/* fall through */
case 4:
return r + fetch32_le_unaligned(p);
case 3:
r = (uint64_t)p[2] << 16;
/* fall through */
case 2:
return r + fetch16_le_unaligned(p);
case 1:
return p[0];
#else
@@ -474,38 +652,134 @@ static __always_inline uint64_t tail64_le(const void *v, size_t tail) {
unreachable();
}
#ifndef fetch64_be
static __maybe_unused __always_inline uint64_t fetch64_be(const void *v) {
/*------------------------------------------------------------- Big Endian */
#ifndef fetch64_be_aligned
static __maybe_unused __always_inline uint64_t
fetch64_be_aligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return *(const uint64_t *)v;
return read_aligned(v, 64);
#else
return bswap64(*(const uint64_t *)v);
return bswap64(read_aligned(v, 64));
#endif
}
#endif /* fetch64_be */
#endif /* fetch64_be_aligned */
#ifndef fetch32_be
static __maybe_unused __always_inline uint32_t fetch32_be(const void *v) {
#ifndef fetch64_be_unaligned
static __maybe_unused __always_inline uint64_t
fetch64_be_unaligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return *(const uint32_t *)v;
return read_unaligned(v, 64);
#else
return bswap32(*(const uint32_t *)v);
return bswap64(read_unaligned(v, 64));
#endif
}
#endif /* fetch32_be */
#endif /* fetch64_be_unaligned */
#ifndef fetch16_be
static __maybe_unused __always_inline uint16_t fetch16_be(const void *v) {
#ifndef fetch32_be_aligned
static __maybe_unused __always_inline uint32_t
fetch32_be_aligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return *(const uint16_t *)v;
return read_aligned(v, 32);
#else
return bswap16(*(const uint16_t *)v);
return bswap32(read_aligned(v, 32));
#endif
}
#endif /* fetch16_be */
#endif /* fetch32_be_aligned */
static __maybe_unused __always_inline uint64_t tail64_be(const void *v,
size_t tail) {
#ifndef fetch32_be_unaligned
static __maybe_unused __always_inline uint32_t
fetch32_be_unaligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return read_unaligned(v, 32);
#else
return bswap32(read_unaligned(v, 32));
#endif
}
#endif /* fetch32_be_unaligned */
#ifndef fetch16_be_aligned
static __maybe_unused __always_inline uint16_t
fetch16_be_aligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return read_aligned(v, 16);
#else
return bswap16(read_aligned(v, 16));
#endif
}
#endif /* fetch16_be_aligned */
#ifndef fetch16_be_unaligned
static __maybe_unused __always_inline uint16_t
fetch16_be_unaligned(const void *v) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return read_unaligned(v, 16);
#else
return bswap16(read_unaligned(v, 16));
#endif
}
#endif /* fetch16_be_unaligned */
static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
size_t tail) {
const uint8_t *const p = (const uint8_t *)v;
#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__)
/* We can perform a 'oneshot' read, which is little bit faster. */
const unsigned shift = ((8 - tail) & 7) << 3;
return fetch64_be_aligned(p) >> shift;
#endif /* 'oneshot' read */
switch (tail & 7) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* For most CPUs this code is better when not byte reordering. */
case 1:
return p[0];
case 2:
return fetch16_be_aligned(p);
case 3:
return (uint32_t)fetch16_be_aligned(p) << 8 | p[2];
case 4:
return fetch32_be_aligned(p);
case 5:
return (uint64_t)fetch32_be_aligned(p) << 8 | p[4];
case 6:
return (uint64_t)fetch32_be_aligned(p) << 16 | fetch16_be_aligned(p + 4);
case 7:
return (uint64_t)fetch32_be_aligned(p) << 24 |
(uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6];
case 0:
return fetch64_be(p);
#else
case 1:
return p[0];
case 2:
return p[1] | (uint32_t)p[0] << 8;
case 3:
return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
case 4:
return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
(uint32_t)p[0] << 24;
case 5:
return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 |
(uint32_t)p[1] << 24 | (uint64_t)p[0] << 32;
case 6:
return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 |
(uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40;
case 7:
return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 |
(uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 |
(uint64_t)p[0] << 48;
case 0:
return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 |
(uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 |
(uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
#endif
}
unreachable();
}
static __maybe_unused __always_inline uint64_t
tail64_be_unaligned(const void *v, size_t tail) {
const uint8_t *p = (const uint8_t *)v;
#ifdef can_read_underside
/* On some systems we can perform a 'oneshot' read, which is little bit
@@ -515,9 +789,9 @@ static __maybe_unused __always_inline uint64_t tail64_be(const void *v,
const unsigned shift = offset << 3;
if (likely(can_read_underside(p, 8))) {
p -= offset;
return fetch64_be(p) & ((~UINT64_C(0)) >> shift);
return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift);
}
return fetch64_be(p) >> shift;
return fetch64_be_unaligned(p) >> shift;
#endif /* 'oneshot' read */
switch (tail & 7) {
@@ -527,20 +801,21 @@ static __maybe_unused __always_inline uint64_t tail64_be(const void *v,
case 1:
return p[0];
case 2:
return fetch16_be(p);
return fetch16_be_unaligned(p);
case 3:
return (uint32_t)fetch16_be(p) << 8 | p[2];
return (uint32_t)fetch16_be_unaligned(p) << 8 | p[2];
case 4:
return fetch32_be(p);
case 5:
return (uint64_t)fetch32_be(p) << 8 | p[4];
return (uint64_t)fetch32_be_unaligned(p) << 8 | p[4];
case 6:
return (uint64_t)fetch32_be(p) << 16 | fetch16_be(p + 4);
return (uint64_t)fetch32_be_unaligned(p) << 16 |
fetch16_be_unaligned(p + 4);
case 7:
return (uint64_t)fetch32_be(p) << 24 | (uint32_t)fetch16_be(p + 4) << 8 |
p[6];
return (uint64_t)fetch32_be_unaligned(p) << 24 |
(uint32_t)fetch16_be_unaligned(p + 4) << 8 | p[6];
case 0:
return fetch64_be(p);
return fetch64_be_unaligned(p);
#else
/* For most CPUs this code is better than a
* copying for alignment and/or byte reordering. */