TARGET_ARCHITECTURE(ARCH)
SET(CHACHASRC chacha20/ref.c)
+SET(POLYSRC poly1305/poly1305-donna.c)
# For now we support only x86_64 architecture with optimizations
IF(${ARCH} STREQUAL "x86_64")
" "dollar macro convention")
CONFIGURE_FILE(platform_config.h.in platform_config.h)
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
+ SET(CURVESRC curve25519/curve25519-donna-c64.c)
+ELSEIF(${ARCH} STREQUAL "i386")
+ SET(CURVESRC curve25519/curve25519-donna.c)
+ELSE()
+ SET(CURVESRC curve25519/ref.c)
ENDIF()
IF(HAVE_AVX2)
SET(LIBCRYPTOBOXSRC cryptobox.c)
-ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} ${CHACHASRC})
+ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC}
+ ${CHACHASRC} ${POLYSRC} ${CURVESRC})
IF(NOT DEBIAN_BUILD)
SET_TARGET_PROPERTIES(rspamd-cryptobox PROPERTIES VERSION ${RSPAMD_VERSION})
ENDIF(NOT DEBIAN_BUILD)
--- /dev/null
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+* Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+curve25519-donna: Curve25519 elliptic curve, public key function
+
+http://code.google.com/p/curve25519-donna/
+
+Adam Langley <agl@imperialviolet.org>
+
+Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+
+More information about curve25519 can be found here
+ http://cr.yp.to/ecdh.html
+
+djb's sample implementation of curve25519 is written in a special assembly
+language called qhasm and uses the floating point registers.
+
+This is, almost, a clean room reimplementation from the curve25519 paper. It
+uses many of the tricks described therein. Only the crecip function is taken
+from the sample implementation.
--- /dev/null
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Code released into the public domain.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ * http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation.
+ */
+
+#include <string.h>
+#include <stdint.h>
+
+typedef uint8_t u8;
+typedef uint64_t limb;
+typedef limb felem[5];
+// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit
+// platforms only as far as I know.
+typedef unsigned uint128_t __attribute__((mode(TI)));
+
+#undef force_inline
+#define force_inline __attribute__((always_inline))
+
+/* Sum two numbers: output += in
+ *
+ * Plain limb-wise addition with no carry propagation; assumes the caller
+ * keeps limbs small enough (well below 2^63) that the sums cannot overflow. */
+static inline void force_inline
+fsum (limb *output, const limb *in)
+{
+ output[0] += in[0];
+ output[1] += in[1];
+ output[2] += in[2];
+ output[3] += in[3];
+ output[4] += in[4];
+}
+
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!)
+ *
+ * Assumes that out[i] < 2**52
+ * On return, out[i] < 2**55
+ */
+static inline void force_inline
+fdifference_backwards (felem out, const felem in)
+{
+ /* 152 is 19 << 3 */
+ static const limb two54m152 = (((limb) 1) << 54) - 152;
+ static const limb two54m8 = (((limb) 1) << 54) - 8;
+
+ /* The constants are the limbs of 8*p (p = 2^255 - 19): adding them keeps
+ * every limb non-negative while leaving the value unchanged mod p. */
+ out[0] = in[0] + two54m152 - out[0];
+ out[1] = in[1] + two54m8 - out[1];
+ out[2] = in[2] + two54m8 - out[2];
+ out[3] = in[3] + two54m8 - out[3];
+ out[4] = in[4] + two54m8 - out[4];
+}
+
+/* Multiply a number by a scalar: output = in * scalar */
+static inline void force_inline
+fscalar_product (felem output, const felem in, const limb scalar)
+{
+ uint128_t a;
+
+ /* Carry 51 bits at a time up through the limbs; each product fits in
+ * the 128-bit accumulator. */
+ a = ((uint128_t) in[0]) * scalar;
+ output[0] = ((limb) a) & 0x7ffffffffffff;
+
+ a = ((uint128_t) in[1]) * scalar + ((limb) (a >> 51));
+ output[1] = ((limb) a) & 0x7ffffffffffff;
+
+ a = ((uint128_t) in[2]) * scalar + ((limb) (a >> 51));
+ output[2] = ((limb) a) & 0x7ffffffffffff;
+
+ a = ((uint128_t) in[3]) * scalar + ((limb) (a >> 51));
+ output[3] = ((limb) a) & 0x7ffffffffffff;
+
+ a = ((uint128_t) in[4]) * scalar + ((limb) (a >> 51));
+ output[4] = ((limb) a) & 0x7ffffffffffff;
+
+ /* Fold the final carry back into the bottom limb: 2^255 = 19 (mod p). */
+ output[0] += (a >> 51) * 19;
+}
+
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * Assumes that in[i] < 2**55 and likewise for in2.
+ * On return, output[i] < 2**52
+ */
+static inline void force_inline
+fmul (felem output, const felem in2, const felem in)
+{
+ uint128_t t[5];
+ limb r0, r1, r2, r3, r4, s0, s1, s2, s3, s4, c;
+
+ r0 = in[0];
+ r1 = in[1];
+ r2 = in[2];
+ r3 = in[3];
+ r4 = in[4];
+
+ s0 = in2[0];
+ s1 = in2[1];
+ s2 = in2[2];
+ s3 = in2[3];
+ s4 = in2[4];
+
+ /* Schoolbook products contributing to result limbs 0..4 */
+ t[0] = ((uint128_t) r0) * s0;
+ t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0;
+ t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0
+ + ((uint128_t) r1) * s1;
+ t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2
+ + ((uint128_t) r2) * s1;
+ t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1
+ + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2;
+
+ /* Products of degree 5..8 wrap around using 2^255 = 19 (mod p), hence
+ * the pre-multiplication by 19. */
+ r4 *= 19;
+ r1 *= 19;
+ r2 *= 19;
+ r3 *= 19;
+
+ t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4
+ + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2;
+ t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4
+ + ((uint128_t) r3) * s3;
+ t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4;
+ t[3] += ((uint128_t) r4) * s4;
+
+ /* Carry chain: mask each limb to 51 bits, folding the top carry back
+ * into limb 0 with another *19. */
+ r0 = (limb) t[0] & 0x7ffffffffffff;
+ c = (limb) (t[0] >> 51);
+ t[1] += c;
+ r1 = (limb) t[1] & 0x7ffffffffffff;
+ c = (limb) (t[1] >> 51);
+ t[2] += c;
+ r2 = (limb) t[2] & 0x7ffffffffffff;
+ c = (limb) (t[2] >> 51);
+ t[3] += c;
+ r3 = (limb) t[3] & 0x7ffffffffffff;
+ c = (limb) (t[3] >> 51);
+ t[4] += c;
+ r4 = (limb) t[4] & 0x7ffffffffffff;
+ c = (limb) (t[4] >> 51);
+ r0 += c * 19;
+ c = r0 >> 51;
+ r0 = r0 & 0x7ffffffffffff;
+ r1 += c;
+ c = r1 >> 51;
+ r1 = r1 & 0x7ffffffffffff;
+ r2 += c;
+
+ output[0] = r0;
+ output[1] = r1;
+ output[2] = r2;
+ output[3] = r3;
+ output[4] = r4;
+}
+
+/* Repeatedly square a number: output = in^(2^count)
+ *
+ * count must be >= 1 (the do/while runs the body before testing). */
+static inline void force_inline
+fsquare_times (felem output, const felem in, limb count)
+{
+ uint128_t t[5];
+ limb r0, r1, r2, r3, r4, c;
+ limb d0, d1, d2, d4, d419;
+
+ r0 = in[0];
+ r1 = in[1];
+ r2 = in[2];
+ r3 = in[3];
+ r4 = in[4];
+
+ do {
+ /* Doubled / pre-multiplied-by-19 limbs shared by several products below */
+ d0 = r0 * 2;
+ d1 = r1 * 2;
+ d2 = r2 * 2 * 19;
+ d419 = r4 * 19;
+ d4 = d419 * 2;
+
+ t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1
+ + (((uint128_t) d2) * (r3));
+ t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2
+ + (((uint128_t) r3) * (r3 * 19));
+ t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1
+ + (((uint128_t) d4) * (r3));
+ t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2
+ + (((uint128_t) r4) * (d419));
+ t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3
+ + (((uint128_t) r2) * (r2));
+
+ /* Same carry chain as fmul: 51-bit limbs, top carry folded back via *19 */
+ r0 = (limb) t[0] & 0x7ffffffffffff;
+ c = (limb) (t[0] >> 51);
+ t[1] += c;
+ r1 = (limb) t[1] & 0x7ffffffffffff;
+ c = (limb) (t[1] >> 51);
+ t[2] += c;
+ r2 = (limb) t[2] & 0x7ffffffffffff;
+ c = (limb) (t[2] >> 51);
+ t[3] += c;
+ r3 = (limb) t[3] & 0x7ffffffffffff;
+ c = (limb) (t[3] >> 51);
+ t[4] += c;
+ r4 = (limb) t[4] & 0x7ffffffffffff;
+ c = (limb) (t[4] >> 51);
+ r0 += c * 19;
+ c = r0 >> 51;
+ r0 = r0 & 0x7ffffffffffff;
+ r1 += c;
+ c = r1 >> 51;
+ r1 = r1 & 0x7ffffffffffff;
+ r2 += c;
+ } while (--count);
+
+ output[0] = r0;
+ output[1] = r1;
+ output[2] = r2;
+ output[3] = r3;
+ output[4] = r4;
+}
+
+/* Load a little-endian 64-bit number
+ *
+ * Byte-wise, so it works regardless of host endianness or alignment. */
+static limb load_limb (const u8 *in)
+{
+ return ((limb) in[0]) | (((limb) in[1]) << 8) | (((limb) in[2]) << 16)
+ | (((limb) in[3]) << 24) | (((limb) in[4]) << 32)
+ | (((limb) in[5]) << 40) | (((limb) in[6]) << 48)
+ | (((limb) in[7]) << 56);
+}
+
+/* Store a 64-bit number as eight little-endian bytes (endian/alignment safe) */
+static void store_limb (u8 *out, limb in)
+{
+ out[0] = in & 0xff;
+ out[1] = (in >> 8) & 0xff;
+ out[2] = (in >> 16) & 0xff;
+ out[3] = (in >> 24) & 0xff;
+ out[4] = (in >> 32) & 0xff;
+ out[5] = (in >> 40) & 0xff;
+ out[6] = (in >> 48) & 0xff;
+ out[7] = (in >> 56) & 0xff;
+}
+
+/* Take a little-endian, 32-byte number and expand it into polynomial form
+ *
+ * Each limb holds 51 bits; the loads overlap so that the shift aligns each
+ * successive 51-bit window (bit offsets 0, 51, 102, 153, 204). */
+static void fexpand (limb *output, const u8 *in)
+{
+ output[0] = load_limb (in) & 0x7ffffffffffff;
+ output[1] = (load_limb (in + 6) >> 3) & 0x7ffffffffffff;
+ output[2] = (load_limb (in + 12) >> 6) & 0x7ffffffffffff;
+ output[3] = (load_limb (in + 19) >> 1) & 0x7ffffffffffff;
+ output[4] = (load_limb (in + 24) >> 12) & 0x7ffffffffffff;
+}
+
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array
+ *
+ * The whole computation is straight-line and branch-free. */
+static void fcontract (u8 *output, const felem input)
+{
+ uint128_t t[5];
+
+ t[0] = input[0];
+ t[1] = input[1];
+ t[2] = input[2];
+ t[3] = input[3];
+ t[4] = input[4];
+
+ /* Two full carry passes bring every limb below 2^51. */
+ t[1] += t[0] >> 51;
+ t[0] &= 0x7ffffffffffff;
+ t[2] += t[1] >> 51;
+ t[1] &= 0x7ffffffffffff;
+ t[3] += t[2] >> 51;
+ t[2] &= 0x7ffffffffffff;
+ t[4] += t[3] >> 51;
+ t[3] &= 0x7ffffffffffff;
+ t[0] += 19 * (t[4] >> 51);
+ t[4] &= 0x7ffffffffffff;
+
+ t[1] += t[0] >> 51;
+ t[0] &= 0x7ffffffffffff;
+ t[2] += t[1] >> 51;
+ t[1] &= 0x7ffffffffffff;
+ t[3] += t[2] >> 51;
+ t[2] &= 0x7ffffffffffff;
+ t[4] += t[3] >> 51;
+ t[3] &= 0x7ffffffffffff;
+ t[0] += 19 * (t[4] >> 51);
+ t[4] &= 0x7ffffffffffff;
+
+ /* now t is between 0 and 2^255-1, properly carried. */
+ /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
+
+ t[0] += 19;
+
+ t[1] += t[0] >> 51;
+ t[0] &= 0x7ffffffffffff;
+ t[2] += t[1] >> 51;
+ t[1] &= 0x7ffffffffffff;
+ t[3] += t[2] >> 51;
+ t[2] &= 0x7ffffffffffff;
+ t[4] += t[3] >> 51;
+ t[3] &= 0x7ffffffffffff;
+ t[0] += 19 * (t[4] >> 51);
+ t[4] &= 0x7ffffffffffff;
+
+ /* now between 19 and 2^255-1 in both cases, and offset by 19. */
+
+ t[0] += 0x8000000000000 - 19;
+ t[1] += 0x8000000000000 - 1;
+ t[2] += 0x8000000000000 - 1;
+ t[3] += 0x8000000000000 - 1;
+ t[4] += 0x8000000000000 - 1;
+
+ /* now between 2^255 and 2^256-20, and offset by 2^255. */
+
+ t[1] += t[0] >> 51;
+ t[0] &= 0x7ffffffffffff;
+ t[2] += t[1] >> 51;
+ t[1] &= 0x7ffffffffffff;
+ t[3] += t[2] >> 51;
+ t[2] &= 0x7ffffffffffff;
+ t[4] += t[3] >> 51;
+ t[3] &= 0x7ffffffffffff;
+ t[4] &= 0x7ffffffffffff;
+
+ /* Pack five 51-bit limbs into 32 little-endian bytes. */
+ store_limb (output, t[0] | (t[1] << 51));
+ store_limb (output + 8, (t[1] >> 13) | (t[2] << 38));
+ store_limb (output + 16, (t[2] >> 26) | (t[3] << 25));
+ store_limb (output + 24, (t[3] >> 39) | (t[4] << 12));
+}
+
+/* One Montgomery-ladder step: differential doubling and addition.
+ *
+ * Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ * x2 z2: long form
+ * x3 z3: long form
+ * x z: short form, destroyed
+ * xprime zprime: short form, destroyed
+ * qmqp: short form, preserved
+ */
+static void fmonty (limb *x2, limb *z2, /* output 2Q */
+limb *x3, limb *z3, /* output Q + Q' */
+limb *x, limb *z, /* input Q */
+limb *xprime, limb *zprime, /* input Q' */
+const limb *qmqp /* input Q - Q' */)
+{
+ limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], zzprime[5],
+ zzzprime[5];
+
+ memcpy (origx, x, 5 * sizeof(limb));
+ fsum (x, z);
+ fdifference_backwards (z, origx); // does x - z
+
+ memcpy (origxprime, xprime, sizeof(limb) * 5);
+ fsum (xprime, zprime);
+ fdifference_backwards (zprime, origxprime);
+ fmul (xxprime, xprime, z);
+ fmul (zzprime, x, zprime);
+ memcpy (origxprime, xxprime, sizeof(limb) * 5);
+ fsum (xxprime, zzprime);
+ fdifference_backwards (zzprime, origxprime);
+ fsquare_times (x3, xxprime, 1);
+ fsquare_times (zzzprime, zzprime, 1);
+ fmul (z3, zzzprime, qmqp);
+
+ fsquare_times (xx, x, 1);
+ fsquare_times (zz, z, 1);
+ fmul (x2, xx, zz);
+ fdifference_backwards (zz, xx); // does zz = xx - zz
+ /* 121665 is the standard curve25519 constant (A - 2) / 4 */
+ fscalar_product (zzz, zz, 121665);
+ fsum (zzz, xx);
+ fmul (z2, zz, zzz);
+}
+
+// -----------------------------------------------------------------------------
+// Maybe swap the contents of two limb arrays (@a and @b), each @len elements
+// long. Perform the swap iff @swap is non-zero.
+//
+// This function performs the swap without leaking any side-channel
+// information.
+// -----------------------------------------------------------------------------
+static void swap_conditional (limb a[5], limb b[5], limb iswap)
+{
+ unsigned i;
+ const limb swap = -iswap;
+
+ for (i = 0; i < 5; ++i) {
+ const limb x = swap & (a[i] ^ b[i]);
+ a[i] ^= x;
+ b[i] ^= x;
+ }
+}
+
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ * resultx/resultz: the x coordinate of the resulting curve point (short form)
+ * n: a little endian, 32-byte number
+ * q: a point of the curve (short form)
+ *
+ * Constant-time Montgomery ladder: every scalar bit performs the same
+ * fmonty step; conditional swaps select which operand plays which role.
+ */
+static void cmult (limb *resultx, limb *resultz, const u8 *n, const limb *q)
+{
+ limb a[5] = { 0 }, b[5] = { 1 }, c[5] = { 1 }, d[5] = { 0 };
+ limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+ limb e[5] = { 0 }, f[5] = { 1 }, g[5] = { 0 }, h[5] = { 1 };
+ limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+
+ unsigned i, j;
+
+ memcpy (nqpqx, q, sizeof(limb) * 5);
+
+ /* Consume the scalar most-significant byte and bit first. */
+ for (i = 0; i < 32; ++i) {
+ u8 byte = n[31 - i];
+ for (j = 0; j < 8; ++j) {
+ const limb bit = byte >> 7;
+
+ swap_conditional (nqx, nqpqx, bit);
+ swap_conditional (nqz, nqpqz, bit);
+ fmonty (nqx2, nqz2, nqpqx2, nqpqz2, nqx, nqz, nqpqx, nqpqz, q);
+ swap_conditional (nqx2, nqpqx2, bit);
+ swap_conditional (nqz2, nqpqz2, bit);
+
+ /* Ping-pong between the two buffer sets for the next iteration. */
+ t = nqx;
+ nqx = nqx2;
+ nqx2 = t;
+ t = nqz;
+ nqz = nqz2;
+ nqz2 = t;
+ t = nqpqx;
+ nqpqx = nqpqx2;
+ nqpqx2 = t;
+ t = nqpqz;
+ nqpqz = nqpqz2;
+ nqpqz2 = t;
+
+ byte <<= 1;
+ }
+ }
+
+ memcpy (resultx, nqx, sizeof(limb) * 5);
+ memcpy (resultz, nqz, sizeof(limb) * 5);
+}
+
+// -----------------------------------------------------------------------------
+// Shamelessly copied from djb's code, tightened a little
+// -----------------------------------------------------------------------------
+static void crecip (felem out, const felem z)
+{
+ felem a, t0, b, c;
+
+ /* 2 */fsquare_times (a, z, 1); // a = 2
+ /* 8 */fsquare_times (t0, a, 2);
+ /* 9 */fmul (b, t0, z); // b = 9
+ /* 11 */fmul (a, b, a); // a = 11
+ /* 22 */fsquare_times (t0, a, 1);
+ /* 2^5 - 2^0 = 31 */fmul (b, t0, b);
+ /* 2^10 - 2^5 */fsquare_times (t0, b, 5);
+ /* 2^10 - 2^0 */fmul (b, t0, b);
+ /* 2^20 - 2^10 */fsquare_times (t0, b, 10);
+ /* 2^20 - 2^0 */fmul (c, t0, b);
+ /* 2^40 - 2^20 */fsquare_times (t0, c, 20);
+ /* 2^40 - 2^0 */fmul (t0, t0, c);
+ /* 2^50 - 2^10 */fsquare_times (t0, t0, 10);
+ /* 2^50 - 2^0 */fmul (b, t0, b);
+ /* 2^100 - 2^50 */fsquare_times (t0, b, 50);
+ /* 2^100 - 2^0 */fmul (c, t0, b);
+ /* 2^200 - 2^100 */fsquare_times (t0, c, 100);
+ /* 2^200 - 2^0 */fmul (t0, t0, c);
+ /* 2^250 - 2^50 */fsquare_times (t0, t0, 50);
+ /* 2^250 - 2^0 */fmul (t0, t0, b);
+ /* 2^255 - 2^5 */fsquare_times (t0, t0, 5);
+ /* 2^255 - 21 */fmul (out, t0, a);
+}
+
+int curve25519 (u8 *, const u8 *, const u8 *);
+
+/* Scalar multiplication on curve25519: mypublic = secret * basepoint.
+ *
+ * mypublic: 32-byte output (little-endian x coordinate)
+ * secret: 32-byte scalar; a clamped copy is used, the input is not modified
+ * basepoint: 32-byte little-endian x coordinate
+ *
+ * Always returns 0. */
+int curve25519 (u8 *mypublic, const u8 *secret, const u8 *basepoint)
+{
+ limb bp[5], x[5], z[5], zmone[5];
+ uint8_t e[32];
+ int i;
+
+ for (i = 0; i < 32; ++i)
+ e[i] = secret[i];
+ /* Standard curve25519 clamping: clear the low 3 bits and the top bit,
+ * set bit 254. */
+ e[0] &= 248;
+ e[31] &= 127;
+ e[31] |= 64;
+
+ fexpand (bp, basepoint);
+ cmult (x, z, e, bp);
+ crecip (zmone, z);
+ /* Convert projective (x:z) to affine x = x * z^-1, then serialize. */
+ fmul (z, x, zmone);
+ fcontract (mypublic, z);
+ return 0;
+}
--- /dev/null
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ * http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation. */
+
+#include <string.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef uint8_t u8;
+typedef int32_t s32;
+typedef int64_t limb;
+
+/* Field element representation:
+ *
+ * Field elements are written as an array of signed, 64-bit limbs, least
+ * significant first. The value of the field element is:
+ * x[0] + 2^26·x[1] + 2^51·x[2] + 2^102·x[3] + ...
+ *
+ * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
+
+/* Sum two numbers: output += in
+ *
+ * Limb-wise signed addition, two limbs per iteration; no carry handling. */
+static void fsum (limb *output, const limb *in)
+{
+ unsigned i;
+ for (i = 0; i < 10; i += 2) {
+ output[0 + i] = output[0 + i] + in[0 + i];
+ output[1 + i] = output[1 + i] + in[1 + i];
+ }
+}
+
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!).
+ *
+ * Limbs are signed here, so the result may go negative; no bias or
+ * reduction is applied. */
+static void fdifference (limb *output, const limb *in)
+{
+ unsigned i;
+ for (i = 0; i < 10; ++i) {
+ output[i] = in[i] - output[i];
+ }
+}
+
+/* Multiply a number by a scalar: output = in * scalar
+ *
+ * No carry or reduction is performed; the caller must keep products in range. */
+static void fscalar_product (limb *output, const limb *in, const limb scalar)
+{
+ unsigned i;
+ for (i = 0; i < 10; ++i) {
+ output[i] = in[i] * scalar;
+ }
+}
+
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * output[x] <= 14 * the largest product of the input limbs.
+ *
+ * The (s32) casts make every multiply a 32x32->64 bit operation; the
+ * factors of 2 compensate for the alternating 26/25-bit limb widths. */
+static void fproduct (limb *output, const limb *in2, const limb *in)
+{
+ output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
+ output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1])
+ + ((limb) ((s32) in2[1])) * ((s32) in[0]);
+ output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1])
+ + ((limb) ((s32) in2[0])) * ((s32) in[2])
+ + ((limb) ((s32) in2[2])) * ((s32) in[0]);
+ output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2])
+ + ((limb) ((s32) in2[2])) * ((s32) in[1])
+ + ((limb) ((s32) in2[0])) * ((s32) in[3])
+ + ((limb) ((s32) in2[3])) * ((s32) in[0]);
+ output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2])
+ + 2
+ * (((limb) ((s32) in2[1])) * ((s32) in[3])
+ + ((limb) ((s32) in2[3])) * ((s32) in[1]))
+ + ((limb) ((s32) in2[0])) * ((s32) in[4])
+ + ((limb) ((s32) in2[4])) * ((s32) in[0]);
+ output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3])
+ + ((limb) ((s32) in2[3])) * ((s32) in[2])
+ + ((limb) ((s32) in2[1])) * ((s32) in[4])
+ + ((limb) ((s32) in2[4])) * ((s32) in[1])
+ + ((limb) ((s32) in2[0])) * ((s32) in[5])
+ + ((limb) ((s32) in2[5])) * ((s32) in[0]);
+ output[6] = 2
+ * (((limb) ((s32) in2[3])) * ((s32) in[3])
+ + ((limb) ((s32) in2[1])) * ((s32) in[5])
+ + ((limb) ((s32) in2[5])) * ((s32) in[1]))
+ + ((limb) ((s32) in2[2])) * ((s32) in[4])
+ + ((limb) ((s32) in2[4])) * ((s32) in[2])
+ + ((limb) ((s32) in2[0])) * ((s32) in[6])
+ + ((limb) ((s32) in2[6])) * ((s32) in[0]);
+ output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4])
+ + ((limb) ((s32) in2[4])) * ((s32) in[3])
+ + ((limb) ((s32) in2[2])) * ((s32) in[5])
+ + ((limb) ((s32) in2[5])) * ((s32) in[2])
+ + ((limb) ((s32) in2[1])) * ((s32) in[6])
+ + ((limb) ((s32) in2[6])) * ((s32) in[1])
+ + ((limb) ((s32) in2[0])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[0]);
+ output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4])
+ + 2
+ * (((limb) ((s32) in2[3])) * ((s32) in[5])
+ + ((limb) ((s32) in2[5])) * ((s32) in[3])
+ + ((limb) ((s32) in2[1])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[1]))
+ + ((limb) ((s32) in2[2])) * ((s32) in[6])
+ + ((limb) ((s32) in2[6])) * ((s32) in[2])
+ + ((limb) ((s32) in2[0])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[0]);
+ output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5])
+ + ((limb) ((s32) in2[5])) * ((s32) in[4])
+ + ((limb) ((s32) in2[3])) * ((s32) in[6])
+ + ((limb) ((s32) in2[6])) * ((s32) in[3])
+ + ((limb) ((s32) in2[2])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[2])
+ + ((limb) ((s32) in2[1])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[1])
+ + ((limb) ((s32) in2[0])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[0]);
+ output[10] = 2
+ * (((limb) ((s32) in2[5])) * ((s32) in[5])
+ + ((limb) ((s32) in2[3])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[3])
+ + ((limb) ((s32) in2[1])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[1]))
+ + ((limb) ((s32) in2[4])) * ((s32) in[6])
+ + ((limb) ((s32) in2[6])) * ((s32) in[4])
+ + ((limb) ((s32) in2[2])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[2]);
+ output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6])
+ + ((limb) ((s32) in2[6])) * ((s32) in[5])
+ + ((limb) ((s32) in2[4])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[4])
+ + ((limb) ((s32) in2[3])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[3])
+ + ((limb) ((s32) in2[2])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[2]);
+ output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6])
+ + 2
+ * (((limb) ((s32) in2[5])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[5])
+ + ((limb) ((s32) in2[3])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[3]))
+ + ((limb) ((s32) in2[4])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[4]);
+ output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7])
+ + ((limb) ((s32) in2[7])) * ((s32) in[6])
+ + ((limb) ((s32) in2[5])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[5])
+ + ((limb) ((s32) in2[4])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[4]);
+ output[14] = 2
+ * (((limb) ((s32) in2[7])) * ((s32) in[7])
+ + ((limb) ((s32) in2[5])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[5]))
+ + ((limb) ((s32) in2[6])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[6]);
+ output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8])
+ + ((limb) ((s32) in2[8])) * ((s32) in[7])
+ + ((limb) ((s32) in2[6])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[6]);
+ output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8])
+ + 2
+ * (((limb) ((s32) in2[7])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[7]));
+ output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9])
+ + ((limb) ((s32) in2[9])) * ((s32) in[8]);
+ output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
+}
+
+/* Reduce a long form to a short form by taking the input mod 2^255 - 19.
+ *
+ * On entry: |output[i]| < 14*2^54
+ * On exit: |output[0..8]| < 280*2^54 */
+static void freduce_degree (limb *output)
+{
+ /* Each of these shifts and adds ends up multiplying the value by 19.
+ *
+ * For output[0..8], the absolute entry value is < 14*2^54 and we add, at
+ * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
+ /* 19x is computed as 16x + 2x + x (two shifts plus an add). */
+ output[8] += output[18] << 4;
+ output[8] += output[18] << 1;
+ output[8] += output[18];
+ output[7] += output[17] << 4;
+ output[7] += output[17] << 1;
+ output[7] += output[17];
+ output[6] += output[16] << 4;
+ output[6] += output[16] << 1;
+ output[6] += output[16];
+ output[5] += output[15] << 4;
+ output[5] += output[15] << 1;
+ output[5] += output[15];
+ output[4] += output[14] << 4;
+ output[4] += output[14] << 1;
+ output[4] += output[14];
+ output[3] += output[13] << 4;
+ output[3] += output[13] << 1;
+ output[3] += output[13];
+ output[2] += output[12] << 4;
+ output[2] += output[12] << 1;
+ output[2] += output[12];
+ output[1] += output[11] << 4;
+ output[1] += output[11] << 1;
+ output[1] += output[11];
+ output[0] += output[10] << 4;
+ output[0] += output[10] << 1;
+ output[0] += output[10];
+}
+
+#if (-1 & 3) != 3
+#error "This code only works on a two's complement system"
+#endif
+
+/* return v / 2^26, using only shifts and adds.
+ *
+ * On entry: v can take any value.
+ *
+ * Branch-free: the roundoff term makes the arithmetic shift round toward
+ * zero for negative v, matching true division. */
+static inline limb div_by_2_26 (const limb v)
+{
+ /* High word of v; no shift needed. */
+ const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+ /* Set to all 1s if v was negative; else set to 0s. */
+ const int32_t sign = ((int32_t) highword) >> 31;
+ /* Set to 0x3ffffff if v was negative; else set to 0. */
+ const int32_t roundoff = ((uint32_t) sign) >> 6;
+ /* Should return v / (1<<26) */
+ return (v + roundoff) >> 26;
+}
+
+/* return v / (2^25), using only shifts and adds.
+ *
+ * On entry: v can take any value.
+ *
+ * Branch-free twin of div_by_2_26 for the odd (25-bit) limbs. */
+static inline limb div_by_2_25 (const limb v)
+{
+ /* High word of v; no shift needed*/
+ const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+ /* Set to all 1s if v was negative; else set to 0s. */
+ const int32_t sign = ((int32_t) highword) >> 31;
+ /* Set to 0x1ffffff if v was negative; else set to 0. */
+ const int32_t roundoff = ((uint32_t) sign) >> 7;
+ /* Should return v / (1<<25) */
+ return (v + roundoff) >> 25;
+}
+
+/* Reduce all coefficients of the short form input so that |x| < 2^26.
+ *
+ * On entry: |output[i]| < 280*2^54 */
+static void freduce_coefficients (limb *output)
+{
+ unsigned i;
+
+ output[10] = 0;
+
+ for (i = 0; i < 10; i += 2) {
+ limb over = div_by_2_26 (output[i]);
+ /* The entry condition (that |output[i]| < 280*2^54) means that over is, at
+ * most, 280*2^28 in the first iteration of this loop. This is added to the
+ * next limb and we can approximate the resulting bound of that limb by
+ * 281*2^54. */
+ output[i] -= over << 26;
+ output[i + 1] += over;
+
+ /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
+ * 281*2^29. When this is added to the next limb, the resulting bound can
+ * be approximated as 281*2^54.
+ *
+ * For subsequent iterations of the loop, 281*2^54 remains a conservative
+ * bound and no overflow occurs. */
+ over = div_by_2_25 (output[i + 1]);
+ output[i + 1] -= over << 25;
+ output[i + 2] += over;
+ }
+ /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
+ /* Fold output[10] back into output[0]: multiply by 19 as 16x + 2x + x. */
+ output[0] += output[10] << 4;
+ output[0] += output[10] << 1;
+ output[0] += output[10];
+
+ output[10] = 0;
+
+ /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
+ * So |over| will be no more than 2^16. */
+ {
+ limb over = div_by_2_26 (output[0]);
+ output[0] -= over << 26;
+ output[1] += over;
+ }
+
+ /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
+ * bound on |output[1]| is sufficient to meet our needs. */
+}
+
+/* A helpful wrapper around fproduct: output = in * in2.
+ *
+ * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
+ *
+ * output must be distinct to both inputs. The output is reduced degree
+ * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
+static void fmul (limb *output, const limb *in, const limb *in2)
+{
+ limb t[19];
+ fproduct (t, in, in2);
+ /* |t[i]| < 14*2^54 */
+ freduce_degree (t);
+ freduce_coefficients (t);
+ /* |t[i]| < 2^26 */
+ memcpy (output, t, sizeof(limb) * 10);
+}
+
+/* Square a number: output = in**2
+ *
+ * output must be distinct from the input. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * output[x] <= 14 * the largest product of the input limbs.
+ *
+ * Like fproduct, each multiply is 32x32->64 via the (s32) casts, and the
+ * factors of 2/4 account for symmetric terms and the 26/25-bit limb widths. */
+static void fsquare_inner (limb *output, const limb *in)
+{
+ output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
+ output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
+ output[2] = 2
+ * (((limb) ((s32) in[1])) * ((s32) in[1])
+ + ((limb) ((s32) in[0])) * ((s32) in[2]));
+ output[3] = 2
+ * (((limb) ((s32) in[1])) * ((s32) in[2])
+ + ((limb) ((s32) in[0])) * ((s32) in[3]));
+ output[4] = ((limb) ((s32) in[2])) * ((s32) in[2])
+ + 4 * ((limb) ((s32) in[1])) * ((s32) in[3])
+ + 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
+ output[5] = 2
+ * (((limb) ((s32) in[2])) * ((s32) in[3])
+ + ((limb) ((s32) in[1])) * ((s32) in[4])
+ + ((limb) ((s32) in[0])) * ((s32) in[5]));
+ output[6] = 2
+ * (((limb) ((s32) in[3])) * ((s32) in[3])
+ + ((limb) ((s32) in[2])) * ((s32) in[4])
+ + ((limb) ((s32) in[0])) * ((s32) in[6])
+ + 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
+ output[7] = 2
+ * (((limb) ((s32) in[3])) * ((s32) in[4])
+ + ((limb) ((s32) in[2])) * ((s32) in[5])
+ + ((limb) ((s32) in[1])) * ((s32) in[6])
+ + ((limb) ((s32) in[0])) * ((s32) in[7]));
+ output[8] = ((limb) ((s32) in[4])) * ((s32) in[4])
+ + 2
+ * (((limb) ((s32) in[2])) * ((s32) in[6])
+ + ((limb) ((s32) in[0])) * ((s32) in[8])
+ + 2
+ * (((limb) ((s32) in[1])) * ((s32) in[7])
+ + ((limb) ((s32) in[3]))
+ * ((s32) in[5])));
+ output[9] = 2
+ * (((limb) ((s32) in[4])) * ((s32) in[5])
+ + ((limb) ((s32) in[3])) * ((s32) in[6])
+ + ((limb) ((s32) in[2])) * ((s32) in[7])
+ + ((limb) ((s32) in[1])) * ((s32) in[8])
+ + ((limb) ((s32) in[0])) * ((s32) in[9]));
+ output[10] = 2
+ * (((limb) ((s32) in[5])) * ((s32) in[5])
+ + ((limb) ((s32) in[4])) * ((s32) in[6])
+ + ((limb) ((s32) in[2])) * ((s32) in[8])
+ + 2
+ * (((limb) ((s32) in[3])) * ((s32) in[7])
+ + ((limb) ((s32) in[1])) * ((s32) in[9])));
+ output[11] = 2
+ * (((limb) ((s32) in[5])) * ((s32) in[6])
+ + ((limb) ((s32) in[4])) * ((s32) in[7])
+ + ((limb) ((s32) in[3])) * ((s32) in[8])
+ + ((limb) ((s32) in[2])) * ((s32) in[9]));
+ output[12] = ((limb) ((s32) in[6])) * ((s32) in[6])
+ + 2
+ * (((limb) ((s32) in[4])) * ((s32) in[8])
+ + 2
+ * (((limb) ((s32) in[5])) * ((s32) in[7])
+ + ((limb) ((s32) in[3]))
+ * ((s32) in[9])));
+ output[13] = 2
+ * (((limb) ((s32) in[6])) * ((s32) in[7])
+ + ((limb) ((s32) in[5])) * ((s32) in[8])
+ + ((limb) ((s32) in[4])) * ((s32) in[9]));
+ output[14] = 2
+ * (((limb) ((s32) in[7])) * ((s32) in[7])
+ + ((limb) ((s32) in[6])) * ((s32) in[8])
+ + 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
+ output[15] = 2
+ * (((limb) ((s32) in[7])) * ((s32) in[8])
+ + ((limb) ((s32) in[6])) * ((s32) in[9]));
+ output[16] = ((limb) ((s32) in[8])) * ((s32) in[8])
+ + 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
+ output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
+ output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
+}
+
+/* fsquare sets output = in^2.
+ *
+ * On entry: The |in| argument is in reduced coefficients form and |in[i]| <
+ * 2^27.
+ *
+ * On exit: The |output| argument is in reduced coefficients form (indeed, one
+ * need only provide storage for 10 limbs) and |out[i]| < 2^26. */
+static void fsquare (limb *output, const limb *in)
+{
+ /* 19 limbs: the unreduced square of a degree-9 polynomial has degree 18. */
+ limb t[19];
+ fsquare_inner (t, in);
+ /* |t[i]| < 14*2^54 because the largest product of two limbs will be <
+ * 2^(27+27) and fsquare_inner adds together, at most, 14 of those
+ * products. */
+ freduce_degree (t);
+ freduce_coefficients (t);
+ /* |t[i]| < 2^26 */
+ memcpy (output, t, sizeof(limb) * 10);
+}
+
+/* Take a little-endian, 32-byte number and expand it into polynomial form:
+ * ten limbs that alternately carry 26 and 25 bits of the input. */
+static void fexpand (limb *output, const u8 *input)
+{
+ /* Byte offset each limb's 32-bit window starts at, and how many already
+ * consumed bits must be shifted away before masking. */
+ static const unsigned char start[10] = { 0, 3, 6, 9, 12, 16, 19, 22, 25, 28 };
+ static const unsigned char shift[10] = { 0, 2, 3, 5, 6, 0, 1, 3, 4, 6 };
+ unsigned i;
+
+ for (i = 0; i < 10; i++) {
+ const limb word = ((limb) input[start[i]])
+ | ((limb) input[start[i] + 1]) << 8
+ | ((limb) input[start[i] + 2]) << 16
+ | ((limb) input[start[i] + 3]) << 24;
+ /* Even limbs keep 26 bits, odd limbs keep 25. */
+ output[i] = (word >> shift[i]) & ((i & 1) ? 0x1ffffff : 0x3ffffff);
+ }
+}
+
+#if (-32 >> 1) != -16
+#error "This code only works when >> does sign-extension on negative numbers"
+#endif
+
+/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */
+static s32 s32_eq (s32 a, s32 b)
+{
+ s32 x = ~(a ^ b);
+ int shift;
+
+ /* AND-fold the word onto itself: after the loop, bit 31 of x is set iff
+ * every bit of ~(a ^ b) was set, i.e. iff a == b. Constant time. */
+ for (shift = 16; shift >= 1; shift >>= 1) {
+ x &= x << shift;
+ }
+ /* Arithmetic shift smears the top bit across the whole word. */
+ return x >> 31;
+}
+
+/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
+ * both non-negative. */
+static s32 s32_gte (s32 a, s32 b)
+{
+ const s32 diff = a - b;
+ /* diff is non-negative exactly when a >= b, so its (smeared) sign bit is
+ * the inverse of the answer. */
+ return ~(diff >> 31);
+}
+
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array.
+ *
+ * On entry: |input_limbs[i]| < 2^26 */
+static void fcontract (u8 *output, limb *input_limbs)
+{
+ int i;
+ int j;
+ s32 input[10];
+ s32 mask;
+
+ /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */
+ for (i = 0; i < 10; i++) {
+ input[i] = input_limbs[i];
+ }
+
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < 9; ++i) {
+ if ((i & 1) == 1) {
+ /* This calculation is a time-invariant way to make input[i]
+ * non-negative by borrowing from the next-larger limb. */
+ const s32 mask = input[i] >> 31;
+ const s32 carry = -((input[i] & mask) >> 25);
+ input[i] = input[i] + (carry << 25);
+ input[i + 1] = input[i + 1] - carry;
+ }
+ else {
+ /* Same borrow, but even limbs are 26 bits wide. */
+ const s32 mask = input[i] >> 31;
+ const s32 carry = -((input[i] & mask) >> 26);
+ input[i] = input[i] + (carry << 26);
+ input[i + 1] = input[i + 1] - carry;
+ }
+ }
+
+ /* There's no greater limb for input[9] to borrow from, but we can multiply
+ * by 19 and borrow from input[0], which is valid mod 2^255-19. */
+ {
+ const s32 mask = input[9] >> 31;
+ const s32 carry = -((input[9] & mask) >> 25);
+ input[9] = input[9] + (carry << 25);
+ input[0] = input[0] - (carry * 19);
+ }
+
+ /* After the first iteration, input[1..9] are non-negative and fit within
+ * 25 or 26 bits, depending on position. However, input[0] may be
+ * negative. */
+ }
+
+ /* The first borrow-propagation pass above ended with every limb
+ except (possibly) input[0] non-negative.
+
+ If input[0] was negative after the first pass, then it was because of a
+ carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
+ one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
+
+ In the second pass, each limb is decreased by at most one. Thus the second
+ borrow-propagation pass could only have wrapped around to decrease
+ input[0] again if the first pass left input[0] negative *and* input[1]
+ through input[9] were all zero. In that case, input[1] is now 2^25 - 1,
+ and this last borrow-propagation step will leave input[1] non-negative. */
+ {
+ const s32 mask = input[0] >> 31;
+ const s32 carry = -((input[0] & mask) >> 26);
+ input[0] = input[0] + (carry << 26);
+ input[1] = input[1] - carry;
+ }
+
+ /* All input[i] are now non-negative. However, there might be values between
+ * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
+ for (j = 0; j < 2; j++) {
+ for (i = 0; i < 9; i++) {
+ if ((i & 1) == 1) {
+ const s32 carry = input[i] >> 25;
+ input[i] &= 0x1ffffff;
+ input[i + 1] += carry;
+ }
+ else {
+ const s32 carry = input[i] >> 26;
+ input[i] &= 0x3ffffff;
+ input[i + 1] += carry;
+ }
+ }
+
+ {
+ /* 2^255 = 19 mod p, so the top carry folds back in times 19. */
+ const s32 carry = input[9] >> 25;
+ input[9] &= 0x1ffffff;
+ input[0] += 19 * carry;
+ }
+ }
+
+ /* If the first carry-chain pass, just above, ended up with a carry from
+ * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
+ * < 2^26 + 2*19, because the carry was, at most, two.
+ *
+ * If the second pass carried from input[9] again then input[0] is < 2*19 and
+ * the input[9] -> input[0] carry didn't push input[0] out of bounds. */
+
+ /* It still remains the case that input might be between 2^255-19 and 2^255.
+ * In this case, input[1..9] must take their maximum value and input[0] must
+ * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
+ mask = s32_gte (input[0], 0x3ffffed);
+ for (i = 1; i < 10; i++) {
+ if ((i & 1) == 1) {
+ mask &= s32_eq (input[i], 0x1ffffff);
+ }
+ else {
+ mask &= s32_eq (input[i], 0x3ffffff);
+ }
+ }
+
+ /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
+ * this conditionally subtracts 2^255-19. */
+ input[0] -= mask & 0x3ffffed;
+
+ for (i = 1; i < 10; i++) {
+ if ((i & 1) == 1) {
+ input[i] -= mask & 0x1ffffff;
+ }
+ else {
+ input[i] -= mask & 0x3ffffff;
+ }
+ }
+
+ /* Pre-shift each limb so that, OR-ed into the output below, the ten limbs
+ * pack seamlessly into 255 little-endian bits. */
+ input[1] <<= 2;
+ input[2] <<= 3;
+ input[3] <<= 5;
+ input[4] <<= 6;
+ input[6] <<= 1;
+ input[7] <<= 3;
+ input[8] <<= 4;
+ input[9] <<= 6;
+#define F(i, s) \
+ output[s+0] |= input[i] & 0xff; \
+ output[s+1] = (input[i] >> 8) & 0xff; \
+ output[s+2] = (input[i] >> 16) & 0xff; \
+ output[s+3] = (input[i] >> 24) & 0xff;
+ /* output[0] and output[16] are only ever OR-ed into, never assigned,
+ * so they must be cleared first. */
+ output[0] = 0;
+ output[16] = 0;
+ F(0, 0);
+ F(1, 3);
+ F(2, 6);
+ F(3, 9);
+ F(4, 12);
+ F(5, 16);
+ F(6, 19);
+ F(7, 22);
+ F(8, 25);
+ F(9, 28);
+#undef F
+}
+
+/* Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ * x2 z2: long form
+ * x3 z3: long form
+ * x z: short form, destroyed
+ * xprime zprime: short form, destroyed
+ * qmqp: short form, preserved
+ *
+ * On entry and exit, the absolute value of the limbs of all inputs and outputs
+ * are < 2^26. */
+static void fmonty (limb *x2, limb *z2, /* output 2Q */
+limb *x3, limb *z3, /* output Q + Q' */
+limb *x, limb *z, /* input Q */
+limb *xprime, limb *zprime, /* input Q' */
+const limb *qmqp /* input Q - Q' */)
+{
+ limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
+ zzprime[19], zzzprime[19], xxxprime[19];
+
+ memcpy (origx, x, 10 * sizeof(limb));
+ fsum (x, z);
+ /* |x[i]| < 2^27 */
+ fdifference (z, origx); /* does x - z */
+ /* |z[i]| < 2^27 */
+
+ memcpy (origxprime, xprime, sizeof(limb) * 10);
+ fsum (xprime, zprime);
+ /* |xprime[i]| < 2^27 */
+ fdifference (zprime, origxprime);
+ /* |zprime[i]| < 2^27 */
+ fproduct (xxprime, xprime, z);
+ /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
+ * 2^(27+27) and fproduct adds together, at most, 14 of those products.
+ * (Approximating that to 2^58 doesn't work out.) */
+ fproduct (zzprime, x, zprime);
+ /* |zzprime[i]| < 14*2^54 */
+ freduce_degree (xxprime);
+ freduce_coefficients (xxprime);
+ /* |xxprime[i]| < 2^26 */
+ freduce_degree (zzprime);
+ freduce_coefficients (zzprime);
+ /* |zzprime[i]| < 2^26 */
+ memcpy (origxprime, xxprime, sizeof(limb) * 10);
+ fsum (xxprime, zzprime);
+ /* |xxprime[i]| < 2^27 */
+ fdifference (zzprime, origxprime);
+ /* |zzprime[i]| < 2^27 */
+ fsquare (xxxprime, xxprime);
+ /* |xxxprime[i]| < 2^26 */
+ fsquare (zzzprime, zzprime);
+ /* |zzzprime[i]| < 2^26 */
+ fproduct (zzprime, zzzprime, qmqp);
+ /* |zzprime[i]| < 14*2^52 */
+ freduce_degree (zzprime);
+ freduce_coefficients (zzprime);
+ /* |zzprime[i]| < 2^26 */
+ memcpy (x3, xxxprime, sizeof(limb) * 10);
+ memcpy (z3, zzprime, sizeof(limb) * 10);
+
+ /* Doubling half: 2Q from Q. */
+ fsquare (xx, x);
+ /* |xx[i]| < 2^26 */
+ fsquare (zz, z);
+ /* |zz[i]| < 2^26 */
+ fproduct (x2, xx, zz);
+ /* |x2[i]| < 14*2^52 */
+ freduce_degree (x2);
+ freduce_coefficients (x2);
+ /* |x2[i]| < 2^26 */
+ fdifference (zz, xx); /* does zz = xx - zz */
+ /* |zz[i]| < 2^27 */
+ memset (zzz + 10, 0, sizeof(limb) * 9);
+ fscalar_product (zzz, zz, 121665);
+ /* |zzz[i]| < 2^(27+17) */
+ /* No need to call freduce_degree here:
+ fscalar_product doesn't increase the degree of its input. */
+ freduce_coefficients (zzz);
+ /* |zzz[i]| < 2^26 */
+ fsum (zzz, xx);
+ /* |zzz[i]| < 2^27 */
+ fproduct (z2, zz, zzz);
+ /* |z2[i]| < 14*2^(26+27) */
+ freduce_degree (z2);
+ freduce_coefficients (z2);
+ /* |z2[i]| < 2^26 */
+}
+
+/* Conditionally swap the first 10 limbs of a and b: when 'iswap' is 1 they
+ * are exchanged, when 0 both are left untouched. The selection runs in
+ * data-invariant time to avoid side-channel attacks.
+ *
+ * NOTE that this function requires 'iswap' to be exactly 1 or 0; other values
+ * give wrong results. Also, the two limb arrays must be in
+ * reduced-coefficient, reduced-degree form: the values in a[10..19] or
+ * b[10..19] aren't swapped, and all values in a[0..9], b[0..9] must have
+ * magnitude less than INT32_MAX. */
+static void swap_conditional (limb a[19], limb b[19], limb iswap)
+{
+ const s32 mask = (s32) -iswap; /* 0 or 0xffffffff */
+ unsigned idx;
+
+ for (idx = 0; idx < 10; ++idx) {
+ const s32 delta = mask & (((s32) a[idx]) ^ ((s32) b[idx]));
+ a[idx] = ((s32) a[idx]) ^ delta;
+ b[idx] = ((s32) b[idx]) ^ delta;
+ }
+}
+
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ * resultx/resultz: the x coordinate of the resulting curve point (short form)
+ * n: a little endian, 32-byte number
+ * q: a point of the curve (short form) */
+static void cmult (limb *resultx, limb *resultz, const u8 *n, const limb *q)
+{
+ /* Two sets of ladder state: (nqx, nqz) = kQ, (nqpqx, nqpqz) = (k+1)Q,
+ * plus a second buffer set that receives each iteration's output. */
+ limb a[19] = { 0 }, b[19] = { 1 }, c[19] = { 1 }, d[19] = { 0 };
+ limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+ limb e[19] = { 0 }, f[19] = { 1 }, g[19] = { 0 }, h[19] = { 1 };
+ limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+
+ unsigned i, j;
+
+ memcpy (nqpqx, q, sizeof(limb) * 10);
+
+ /* Montgomery ladder: scan the scalar from the most significant byte and
+ * bit downwards, one combined double-and-add per bit. The
+ * swap_conditional calls keep the operation schedule independent of the
+ * bit value, so the loop runs in constant time. */
+ for (i = 0; i < 32; ++i) {
+ u8 byte = n[31 - i];
+ for (j = 0; j < 8; ++j) {
+ const limb bit = byte >> 7;
+
+ swap_conditional (nqx, nqpqx, bit);
+ swap_conditional (nqz, nqpqz, bit);
+ fmonty (nqx2, nqz2, nqpqx2, nqpqz2, nqx, nqz, nqpqx, nqpqz, q);
+ swap_conditional (nqx2, nqpqx2, bit);
+ swap_conditional (nqz2, nqpqz2, bit);
+
+ /* Swap working and output buffers for the next iteration. */
+ t = nqx;
+ nqx = nqx2;
+ nqx2 = t;
+ t = nqz;
+ nqz = nqz2;
+ nqz2 = t;
+ t = nqpqx;
+ nqpqx = nqpqx2;
+ nqpqx2 = t;
+ t = nqpqz;
+ nqpqz = nqpqz2;
+ nqpqz2 = t;
+
+ byte <<= 1;
+ }
+ }
+
+ memcpy (resultx, nqx, sizeof(limb) * 10);
+ memcpy (resultz, nqz, sizeof(limb) * 10);
+}
+
+// -----------------------------------------------------------------------------
+// Shamelessly copied from djb's code
+// -----------------------------------------------------------------------------
+/* crecip computes out = z^-1 mod 2^255-19 via Fermat's little theorem:
+ * z^-1 = z^(p-2) = z^(2^255-21), evaluated with the addition chain below. */
+static void crecip (limb *out, const limb *z)
+{
+ limb z2[10];
+ limb z9[10];
+ limb z11[10];
+ limb z2_5_0[10];
+ limb z2_10_0[10];
+ limb z2_20_0[10];
+ limb z2_50_0[10];
+ limb z2_100_0[10];
+ limb t0[10];
+ limb t1[10];
+ int i;
+
+ /* 2 */fsquare (z2, z);
+ /* 4 */fsquare (t1, z2);
+ /* 8 */fsquare (t0, t1);
+ /* 9 */fmul (z9, t0, z);
+ /* 11 */fmul (z11, z9, z2);
+ /* 22 */fsquare (t0, z11);
+ /* 2^5 - 2^0 = 31 */fmul (z2_5_0, t0, z9);
+
+ /* 2^6 - 2^1 */fsquare (t0, z2_5_0);
+ /* 2^7 - 2^2 */fsquare (t1, t0);
+ /* 2^8 - 2^3 */fsquare (t0, t1);
+ /* 2^9 - 2^4 */fsquare (t1, t0);
+ /* 2^10 - 2^5 */fsquare (t0, t1);
+ /* 2^10 - 2^0 */fmul (z2_10_0, t0, z2_5_0);
+
+ /* 2^11 - 2^1 */fsquare (t0, z2_10_0);
+ /* 2^12 - 2^2 */fsquare (t1, t0);
+ /* 2^20 - 2^10 */for (i = 2; i < 10; i += 2) {
+ fsquare (t0, t1);
+ fsquare (t1, t0);
+ }
+ /* 2^20 - 2^0 */fmul (z2_20_0, t1, z2_10_0);
+
+ /* 2^21 - 2^1 */fsquare (t0, z2_20_0);
+ /* 2^22 - 2^2 */fsquare (t1, t0);
+ /* 2^40 - 2^20 */for (i = 2; i < 20; i += 2) {
+ fsquare (t0, t1);
+ fsquare (t1, t0);
+ }
+ /* 2^40 - 2^0 */fmul (t0, t1, z2_20_0);
+
+ /* 2^41 - 2^1 */fsquare (t1, t0);
+ /* 2^42 - 2^2 */fsquare (t0, t1);
+ /* 2^50 - 2^10 */for (i = 2; i < 10; i += 2) {
+ fsquare (t1, t0);
+ fsquare (t0, t1);
+ }
+ /* 2^50 - 2^0 */fmul (z2_50_0, t0, z2_10_0);
+
+ /* 2^51 - 2^1 */fsquare (t0, z2_50_0);
+ /* 2^52 - 2^2 */fsquare (t1, t0);
+ /* 2^100 - 2^50 */for (i = 2; i < 50; i += 2) {
+ fsquare (t0, t1);
+ fsquare (t1, t0);
+ }
+ /* 2^100 - 2^0 */fmul (z2_100_0, t1, z2_50_0);
+
+ /* 2^101 - 2^1 */fsquare (t1, z2_100_0);
+ /* 2^102 - 2^2 */fsquare (t0, t1);
+ /* 2^200 - 2^100 */for (i = 2; i < 100; i += 2) {
+ fsquare (t1, t0);
+ fsquare (t0, t1);
+ }
+ /* 2^200 - 2^0 */fmul (t1, t0, z2_100_0);
+
+ /* 2^201 - 2^1 */fsquare (t0, t1);
+ /* 2^202 - 2^2 */fsquare (t1, t0);
+ /* 2^250 - 2^50 */for (i = 2; i < 50; i += 2) {
+ fsquare (t0, t1);
+ fsquare (t1, t0);
+ }
+ /* 2^250 - 2^0 */fmul (t0, t1, z2_50_0);
+
+ /* 2^251 - 2^1 */fsquare (t1, t0);
+ /* 2^252 - 2^2 */fsquare (t0, t1);
+ /* 2^253 - 2^3 */fsquare (t1, t0);
+ /* 2^254 - 2^4 */fsquare (t0, t1);
+ /* 2^255 - 2^5 */fsquare (t1, t0);
+ /* 2^255 - 21 */fmul (out, t1, z11);
+}
+
+/* curve25519 scalar multiplication: mypublic = secret * basepoint.
+ * All three arguments are 32-byte little-endian buffers. Returns 0. */
+int curve25519 (u8 *mypublic, const u8 *secret, const u8 *basepoint)
+{
+ limb bp[10], x[10], z[11], zmone[10];
+ uint8_t e[32];
+
+ /* Clamp the scalar as required by the curve25519 specification. */
+ memcpy (e, secret, 32);
+ e[0] &= 248;
+ e[31] &= 127;
+ e[31] |= 64;
+
+ fexpand (bp, basepoint);
+ cmult (x, z, e, bp);
+ /* Convert the projective result (x, z) to affine: x * z^-1. */
+ crecip (zmone, z);
+ fmul (z, x, zmone);
+ fcontract (mypublic, z);
+
+ return 0;
+}
--- /dev/null
+#ifndef CURVE25519_H
+#define CURVE25519_H
+
+#include "config.h"
+
+/* The standard curve25519 base point: x = 9, little endian. */
+static const guchar curve25519_basepoint[32] = {9};
+
+/* Computes mypublic = secret * basepoint on curve25519; all buffers are
+ * 32 bytes, little endian. Returns 0 on success. */
+int curve25519 (guchar *mypublic, const guchar *secret, const guchar *basepoint);
+
+#endif
--- /dev/null
+/*
+version 20081011
+Matthew Dempsky
+Public domain.
+Derived from public domain code by D. J. Bernstein.
+20140216 tweak: Mask top bit of point input.
+*/
+
+/* out = a + b: schoolbook carry propagation over 32 radix-2^8 limbs.
+ * Any carry out of the top is kept in out[31]; squeeze() reduces later. */
+static void add (unsigned int out[32], const unsigned int a[32],
+ const unsigned int b[32])
+{
+ unsigned int idx;
+ unsigned int acc = 0;
+
+ for (idx = 0; idx < 31; ++idx) {
+ acc += a[idx] + b[idx];
+ out[idx] = acc & 255;
+ acc >>= 8;
+ }
+ out[31] = acc + a[31] + b[31];
+}
+
+/* out = a - b. To keep every intermediate value non-negative, the constant
+ * 2*p = 2^256 - 38 is added in as well (218 = 256 - 38 in the lowest byte;
+ * 65280 = 0xff00 contributes 255 to each higher byte); since 2*p = 0 (mod p)
+ * the result is unchanged modulo p. squeeze() reduces it later. */
+static void sub (unsigned int out[32], const unsigned int a[32],
+ const unsigned int b[32])
+{
+ unsigned int j;
+ unsigned int u;
+ u = 218;
+ for (j = 0; j < 31; ++j) {
+ u += a[j] + 65280 - b[j];
+ out[j] = u & 255;
+ u >>= 8;
+ }
+ u += a[31] - b[31];
+ out[31] = u;
+}
+
+/* Carry-propagate a and partially reduce it modulo p = 2^255 - 19: anything
+ * above bit 255 is folded back in multiplied by 19, then carries are
+ * propagated once more. freeze() does the final conditional subtraction. */
+static void squeeze (unsigned int a[32])
+{
+ unsigned int j;
+ unsigned int u;
+ u = 0;
+ for (j = 0; j < 31; ++j) {
+ u += a[j];
+ a[j] = u & 255;
+ u >>= 8;
+ }
+ u += a[31];
+ a[31] = u & 127;
+ /* 2^255 = 19 (mod p) */
+ u = 19 * (u >> 7);
+ for (j = 0; j < 31; ++j) {
+ u += a[j];
+ a[j] = u & 255;
+ u >>= 8;
+ }
+ u += a[31];
+ a[31] = u;
+}
+
+/* minusp = 2^255 + 19 = 2^256 - p, i.e. -p modulo 2^256. */
+static const unsigned int minusp[32] = { 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
+
+/* Fully reduce a modulo p in constant time: compute a - p and keep it if it
+ * did not go negative, otherwise restore the original value of a. */
+static void freeze (unsigned int a[32])
+{
+ unsigned int aorig[32];
+ unsigned int j;
+ unsigned int negative;
+
+ for (j = 0; j < 32; ++j)
+ aorig[j] = a[j];
+ add (a, a, minusp);
+ /* Bit 7 of a[31] is set iff a - p wrapped below zero. */
+ negative = -((a[31] >> 7) & 1);
+ /* Constant-time select between a - p and the saved original. */
+ for (j = 0; j < 32; ++j)
+ a[j] ^= negative & (aorig[j] ^ a[j]);
+}
+
+/* out = a * b modulo p, schoolbook multiplication in radix 2^8. Partial
+ * products that land at positions >= 32 are folded back multiplied by 38,
+ * since 2^256 = 2 * 19 = 38 (mod 2^255 - 19). */
+static void mult (unsigned int out[32], const unsigned int a[32],
+ const unsigned int b[32])
+{
+ unsigned int i;
+ unsigned int j;
+ unsigned int u;
+
+ for (i = 0; i < 32; ++i) {
+ u = 0;
+ for (j = 0; j <= i; ++j)
+ u += a[j] * b[i - j];
+ for (j = i + 1; j < 32; ++j)
+ u += 38 * a[j] * b[i + 32 - j];
+ out[i] = u;
+ }
+ squeeze (out);
+}
+
+/* out = 121665 * a modulo p; 121665 is the (A-2)/4 constant of the
+ * curve25519 Montgomery ladder. The top-bit overflow is folded back in as
+ * 19, as in squeeze(). */
+static void mult121665 (unsigned int out[32], const unsigned int a[32])
+{
+ unsigned int j;
+ unsigned int u;
+
+ u = 0;
+ for (j = 0; j < 31; ++j) {
+ u += 121665 * a[j];
+ out[j] = u & 255;
+ u >>= 8;
+ }
+ u += 121665 * a[31];
+ out[31] = u & 127;
+ /* 2^255 = 19 (mod p) */
+ u = 19 * (u >> 7);
+ for (j = 0; j < 31; ++j) {
+ u += out[j];
+ out[j] = u & 255;
+ u >>= 8;
+ }
+ u += out[j];
+ out[j] = u;
+}
+
+/* out = a * a modulo p. Same radix-2^8 folding as mult(), but exploits the
+ * symmetry of squaring: only pairs with j < i - j are summed and then
+ * doubled, with the two square terms added separately for even i. */
+static void square (unsigned int out[32], const unsigned int a[32])
+{
+ unsigned int i;
+ unsigned int j;
+ unsigned int u;
+
+ for (i = 0; i < 32; ++i) {
+ u = 0;
+ /* Unsigned compare: when i == 0 the bound is 0 and nothing runs. */
+ for (j = 0; j < i - j; ++j)
+ u += a[j] * a[i - j];
+ for (j = i + 1; j < i + 32 - j; ++j)
+ u += 38 * a[j] * a[i + 32 - j];
+ u *= 2;
+ if ((i & 1) == 0) {
+ u += a[i / 2] * a[i / 2];
+ u += 38 * a[i / 2 + 16] * a[i / 2 + 16];
+ }
+ out[i] = u;
+ }
+ squeeze (out);
+}
+
+/* Constant-time select: if b == 1 then (p, q) = (s, r); if b == 0 then
+ * (p, q) = (r, s). b MUST be exactly 0 or 1. */
+static void select (unsigned int p[64], unsigned int q[64],
+ const unsigned int r[64], const unsigned int s[64], unsigned int b)
+{
+ const unsigned int mask = b - 1; /* all ones if b == 0, zero if b == 1 */
+ unsigned int idx;
+ unsigned int diff;
+
+ for (idx = 0; idx < 64; ++idx) {
+ diff = mask & (r[idx] ^ s[idx]);
+ p[idx] = s[idx] ^ diff;
+ q[idx] = r[idx] ^ diff;
+ }
+}
+
+/* Montgomery ladder over the 255 scalar bits of e. On entry work[0..31] is
+ * the x coordinate of the input point; on exit work[0..31] and work[32..63]
+ * hold the projective (X, Z) of the result. Constant time via select(). */
+static void mainloop (unsigned int work[64], const unsigned char e[32])
+{
+ unsigned int xzm1[64];
+ unsigned int xzm[64];
+ unsigned int xzmb[64];
+ unsigned int xzm1b[64];
+ unsigned int xznb[64];
+ unsigned int xzn1b[64];
+ unsigned int a0[64];
+ unsigned int a1[64];
+ unsigned int b0[64];
+ unsigned int b1[64];
+ unsigned int c1[64];
+ unsigned int r[32];
+ unsigned int s[32];
+ unsigned int t[32];
+ unsigned int u[32];
+ unsigned int i;
+ unsigned int j;
+ unsigned int b;
+ int pos;
+
+ /* xzm1 = (X, Z) of the input point; xzm = (1, 0), the neutral element. */
+ for (j = 0; j < 32; ++j)
+ xzm1[j] = work[j];
+ xzm1[32] = 1;
+ for (j = 33; j < 64; ++j)
+ xzm1[j] = 0;
+
+ xzm[0] = 1;
+ for (j = 1; j < 64; ++j)
+ xzm[j] = 0;
+
+ for (pos = 254; pos >= 0; --pos) {
+ b = e[pos / 8] >> (pos & 7);
+ b &= 1;
+ /* One combined double-and-add step per scalar bit. */
+ select (xzmb, xzm1b, xzm, xzm1, b);
+ add (a0, xzmb, xzmb + 32);
+ sub (a0 + 32, xzmb, xzmb + 32);
+ add (a1, xzm1b, xzm1b + 32);
+ sub (a1 + 32, xzm1b, xzm1b + 32);
+ square (b0, a0);
+ square (b0 + 32, a0 + 32);
+ mult (b1, a1, a0 + 32);
+ mult (b1 + 32, a1 + 32, a0);
+ add (c1, b1, b1 + 32);
+ sub (c1 + 32, b1, b1 + 32);
+ square (r, c1 + 32);
+ sub (s, b0, b0 + 32);
+ mult121665 (t, s);
+ add (u, t, b0);
+ mult (xznb, b0, b0 + 32);
+ mult (xznb + 32, s, u);
+ square (xzn1b, c1);
+ mult (xzn1b + 32, r, work);
+ select (xzm, xzm1, xznb, xzn1b, b);
+ }
+
+ for (j = 0; j < 64; ++j)
+ work[j] = xzm[j];
+}
+
+/* recip computes out = z^-1 mod 2^255-19 via Fermat's little theorem:
+ * z^-1 = z^(p-2) = z^(2^255-21), evaluated with the addition chain below. */
+static void recip (unsigned int out[32], const unsigned int z[32])
+{
+ unsigned int z2[32];
+ unsigned int z9[32];
+ unsigned int z11[32];
+ unsigned int z2_5_0[32];
+ unsigned int z2_10_0[32];
+ unsigned int z2_20_0[32];
+ unsigned int z2_50_0[32];
+ unsigned int z2_100_0[32];
+ unsigned int t0[32];
+ unsigned int t1[32];
+ int i;
+
+ /* 2 */square (z2, z);
+ /* 4 */square (t1, z2);
+ /* 8 */square (t0, t1);
+ /* 9 */mult (z9, t0, z);
+ /* 11 */mult (z11, z9, z2);
+ /* 22 */square (t0, z11);
+ /* 2^5 - 2^0 = 31 */mult (z2_5_0, t0, z9);
+
+ /* 2^6 - 2^1 */square (t0, z2_5_0);
+ /* 2^7 - 2^2 */square (t1, t0);
+ /* 2^8 - 2^3 */square (t0, t1);
+ /* 2^9 - 2^4 */square (t1, t0);
+ /* 2^10 - 2^5 */square (t0, t1);
+ /* 2^10 - 2^0 */mult (z2_10_0, t0, z2_5_0);
+
+ /* 2^11 - 2^1 */square (t0, z2_10_0);
+ /* 2^12 - 2^2 */square (t1, t0);
+ /* 2^20 - 2^10 */for (i = 2; i < 10; i += 2) {
+ square (t0, t1);
+ square (t1, t0);
+ }
+ /* 2^20 - 2^0 */mult (z2_20_0, t1, z2_10_0);
+
+ /* 2^21 - 2^1 */square (t0, z2_20_0);
+ /* 2^22 - 2^2 */square (t1, t0);
+ /* 2^40 - 2^20 */for (i = 2; i < 20; i += 2) {
+ square (t0, t1);
+ square (t1, t0);
+ }
+ /* 2^40 - 2^0 */mult (t0, t1, z2_20_0);
+
+ /* 2^41 - 2^1 */square (t1, t0);
+ /* 2^42 - 2^2 */square (t0, t1);
+ /* 2^50 - 2^10 */for (i = 2; i < 10; i += 2) {
+ square (t1, t0);
+ square (t0, t1);
+ }
+ /* 2^50 - 2^0 */mult (z2_50_0, t0, z2_10_0);
+
+ /* 2^51 - 2^1 */square (t0, z2_50_0);
+ /* 2^52 - 2^2 */square (t1, t0);
+ /* 2^100 - 2^50 */for (i = 2; i < 50; i += 2) {
+ square (t0, t1);
+ square (t1, t0);
+ }
+ /* 2^100 - 2^0 */mult (z2_100_0, t1, z2_50_0);
+
+ /* 2^101 - 2^1 */square (t1, z2_100_0);
+ /* 2^102 - 2^2 */square (t0, t1);
+ /* 2^200 - 2^100 */for (i = 2; i < 100; i += 2) {
+ square (t1, t0);
+ square (t0, t1);
+ }
+ /* 2^200 - 2^0 */mult (t1, t0, z2_100_0);
+
+ /* 2^201 - 2^1 */square (t0, t1);
+ /* 2^202 - 2^2 */square (t1, t0);
+ /* 2^250 - 2^50 */for (i = 2; i < 50; i += 2) {
+ square (t0, t1);
+ square (t1, t0);
+ }
+ /* 2^250 - 2^0 */mult (t0, t1, z2_50_0);
+
+ /* 2^251 - 2^1 */square (t1, t0);
+ /* 2^252 - 2^2 */square (t0, t1);
+ /* 2^253 - 2^3 */square (t1, t0);
+ /* 2^254 - 2^4 */square (t0, t1);
+ /* 2^255 - 2^5 */square (t1, t0);
+ /* 2^255 - 21 */mult (out, t1, z11);
+}
+
+/* curve25519 scalar multiplication: q = n * p, where all three arguments
+ * are 32-byte little-endian buffers. Returns 0. */
+int curve25519 (unsigned char *q, const unsigned char *n,
+ const unsigned char *p)
+{
+ unsigned int work[96];
+ unsigned char e[32];
+ unsigned int i;
+ for (i = 0; i < 32; ++i)
+ e[i] = n[i];
+ /* Clamp the scalar as required by the curve25519 specification. */
+ e[0] &= 248;
+ e[31] &= 127;
+ e[31] |= 64;
+ for (i = 0; i < 32; ++i)
+ work[i] = p[i];
+ /* 20140216 tweak (see header comment): mask the top bit of the point. */
+ work[31] &= 127;
+ mainloop (work, e);
+ /* Convert projective (X, Z) to affine: result = X * Z^-1, fully reduced. */
+ recip (work + 32, work + 32);
+ mult (work + 64, work, work + 32);
+ freeze (work + 64);
+ for (i = 0; i < 32; ++i)
+ q[i] = work[64 + i];
+ return 0;
+}
--- /dev/null
+"A state-of-the-art message-authentication code"\r
+\r
+# ABOUT\r
+\r
+See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf)\r
+\r
+These are quite portable implementations of increasing efficiency depending on the size of the multiplier available.\r
+Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt)\r
+\r
+# BUILDING\r
+\r
+## Default\r
+\r
+If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based\r
+on what it can tell the compiler supports\r
+\r
+ gcc poly1305-donna.c -O3 -o poly1305.o\r
+\r
+## Selecting a specific version\r
+\r
+ gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS\r
+\r
+Where `-DPOLY1305_XXBITS` is one of\r
+\r
+ * `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions\r
+ * `-DPOLY1305_16BITS`, 16->32 bit multiplies, 32 bit additions\r
+ * `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions\r
+ * `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions\r
+\r
+8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due\r
+to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which \r
+is not the intention of this project.\r
+\r
+# USAGE\r
+\r
+See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html), in specific, slightly plagiarized:\r
+\r
+The poly1305_auth function, viewed as a function of the message for a uniform random key, is \r
+designed to meet the standard notion of unforgeability after a single message. After the sender \r
+authenticates one message, an attacker cannot find authenticators for any other messages.\r
+\r
+The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key.\r
+Authenticators for two messages under the same key should be expected to reveal enough information \r
+to allow forgeries of authenticators on other messages. \r
+\r
+## Functions\r
+\r
+`poly1305_context` is declared in [poly1305.h](poly1305.h) and is an opaque structure large enough to support \r
+every underlying platform specific implementation. It should be size_t aligned, which should be handled already\r
+with the size_t member `aligner`.\r
+\r
+`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);`\r
+\r
+where\r
+\r
+`key` is the 32 byte key that is **only used for this message and is discarded immediately after**\r
+\r
+`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);`\r
+\r
+where `m` is a pointer to the message fragment to be processed, and\r
+\r
+`bytes` is the length of the message fragment\r
+\r
+`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);`\r
+\r
+where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying\r
+implementation will zero out `ctx`.\r
+\r
+`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);`\r
+\r
+where `mac` is the buffer which receives the 16 byte authenticator,\r
+\r
+`m` is a pointer to the message to be processed,\r
+\r
+`bytes` is the number of bytes in the message, and\r
+\r
+`key` is the 32 byte key that is **only used for this message and is discarded immediately after**.\r
+\r
+`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);`\r
+\r
+where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not\r
+\r
+`int poly1305_power_on_self_test(void);`\r
+\r
+tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if \r
+any tests fail.\r
+\r
+## Example\r
+\r
+### Simple\r
+\r
+ #include "poly1305-donna.h"\r
+\r
+ unsigned char key[32] = {...}, mac[16];\r
+ unsigned char msg[] = {...};\r
+\r
+ poly1305_auth(mac, msg, msglen, key);\r
+\r
+### Full\r
+\r
+[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing\r
+the correct results, compute an authenticator, and test it against an expected value.\r
+\r
+# LICENSE\r
+\r
+[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN\r
+\r
+\r
+# NAMESAKE\r
+\r
+I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence\r
+the name.
\ No newline at end of file
--- /dev/null
+/*
+ poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition
+*/
+
+#if defined(_MSC_VER)
+ #define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+ #define POLY1305_NOINLINE __attribute__((noinline))
+#else
+ #define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 28*sizeof(unsigned short) bytes: buffer[16] plus
+ * final, leftover, and the 10+10+8 shorts of r, h and pad. */
+typedef struct poly1305_state_internal_t {
+ unsigned char buffer[poly1305_block_size]; /* partial input block */
+ size_t leftover; /* valid bytes in buffer */
+ unsigned short r[10]; /* clamped key half, 13 bits per limb */
+ unsigned short h[10]; /* accumulator, 13 bits per limb */
+ unsigned short pad[8]; /* "s" key half, added in finish */
+ unsigned char final; /* set once the last block is being processed */
+} poly1305_state_internal_t;
+
+/* interpret two 8 bit unsigned integers as a 16 bit unsigned integer in little endian */
+static unsigned short
+U8TO16(const unsigned char *p) {
+ const unsigned short lo = (unsigned short)(p[0] & 0xff);
+ const unsigned short hi = (unsigned short)(p[1] & 0xff);
+ return (unsigned short)(lo | (hi << 8));
+}
+
+/* store a 16 bit unsigned integer as two 8 bit unsigned integers in little endian */
+static void
+U16TO8(unsigned char *p, unsigned short v) {
+ p[0] = (unsigned char)(v & 0xff);
+ p[1] = (unsigned char)((v >> 8) & 0xff);
+}
+
+/* Initialise the 16-bit poly1305 state: clamp and load r, clear the
+ * accumulator, and stash the "s" half of the key for the final addition. */
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
+ size_t i;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ /* The masks below both slice the key into 13-bit limbs and apply the
+ * clamp at the same time. */
+ t0 = U8TO16(&key[ 0]); st->r[0] = ( t0 ) & 0x1fff;
+ t1 = U8TO16(&key[ 2]); st->r[1] = ((t0 >> 13) | (t1 << 3)) & 0x1fff;
+ t2 = U8TO16(&key[ 4]); st->r[2] = ((t1 >> 10) | (t2 << 6)) & 0x1f03;
+ t3 = U8TO16(&key[ 6]); st->r[3] = ((t2 >> 7) | (t3 << 9)) & 0x1fff;
+ t4 = U8TO16(&key[ 8]); st->r[4] = ((t3 >> 4) | (t4 << 12)) & 0x00ff;
+ st->r[5] = ((t4 >> 1) ) & 0x1ffe;
+ t5 = U8TO16(&key[10]); st->r[6] = ((t4 >> 14) | (t5 << 2)) & 0x1fff;
+ t6 = U8TO16(&key[12]); st->r[7] = ((t5 >> 11) | (t6 << 5)) & 0x1f81;
+ t7 = U8TO16(&key[14]); st->r[8] = ((t6 >> 8) | (t7 << 8)) & 0x1fff;
+ st->r[9] = ((t7 >> 5) ) & 0x007f;
+
+ /* h = 0 */
+ for (i = 0; i < 10; i++)
+ st->h[i] = 0;
+
+ /* save pad for later */
+ for (i = 0; i < 8; i++)
+ st->pad[i] = U8TO16(&key[16 + (2 * i)]);
+
+ st->leftover = 0;
+ st->final = 0;
+}
+
+/* Absorb full 16-byte blocks into the accumulator: h = (h + m) * r mod p,
+ * with p = 2^130 - 5, in radix 2^13. */
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+ /* The implicit 2^128 bit of each block; omitted for the final, already
+ * padded block. */
+ const unsigned short hibit = (st->final) ? 0 : (1 << 11); /* 1 << 128 */
+ unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
+ unsigned long d[10];
+ unsigned long c;
+
+ while (bytes >= poly1305_block_size) {
+ size_t i, j;
+
+ /* h += m[i] */
+ t0 = U8TO16(&m[ 0]); st->h[0] += ( t0 ) & 0x1fff;
+ t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 << 3)) & 0x1fff;
+ t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 << 6)) & 0x1fff;
+ t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >> 7) | (t3 << 9)) & 0x1fff;
+ t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >> 4) | (t4 << 12)) & 0x1fff;
+ st->h[5] += ((t4 >> 1) ) & 0x1fff;
+ t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 << 2)) & 0x1fff;
+ t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 << 5)) & 0x1fff;
+ t7 = U8TO16(&m[14]); st->h[8] += ((t6 >> 8) | (t7 << 8)) & 0x1fff;
+ st->h[9] += ((t7 >> 5) ) | hibit;
+
+ /* h *= r, (partial) h %= p */
+ for (i = 0, c = 0; i < 10; i++) {
+ d[i] = c;
+ for (j = 0; j < 10; j++) {
+ /* Terms past position i wrap around and pick up the factor 5
+ * from 2^130 = 5 (mod p). */
+ d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j]));
+ /* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */
+ if (j == 4) {
+ c = (d[i] >> 13);
+ d[i] &= 0x1fff;
+ }
+ }
+ c += (d[i] >> 13);
+ d[i] &= 0x1fff;
+ }
+ c = ((c << 2) + c); /* c *= 5 */
+ c += d[0];
+ d[0] = ((unsigned short)c & 0x1fff);
+ c = (c >> 13);
+ d[1] += c;
+
+ for (i = 0; i < 10; i++)
+ st->h[i] = (unsigned short)d[i];
+
+ m += poly1305_block_size;
+ bytes -= poly1305_block_size;
+ }
+}
+
+/* Finish the MAC: absorb any buffered partial block, fully reduce h modulo
+ * 2^130 - 5, add the "s" key half and write the 16-byte tag to mac. The
+ * state is cleared before returning. */
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ unsigned short c;
+ unsigned short g[10];
+ unsigned short mask;
+ unsigned long f;
+ size_t i;
+
+ /* process the remaining block: pad with a 1 bit then zeros. */
+ if (st->leftover) {
+ size_t i = st->leftover;
+ st->buffer[i++] = 1;
+ for (; i < poly1305_block_size; i++)
+ st->buffer[i] = 0;
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ }
+
+ /* fully carry h */
+ c = st->h[1] >> 13;
+ st->h[1] &= 0x1fff;
+ for (i = 2; i < 10; i++) {
+ st->h[i] += c;
+ c = st->h[i] >> 13;
+ st->h[i] &= 0x1fff;
+ }
+ st->h[0] += (c * 5);
+ c = st->h[0] >> 13;
+ st->h[0] &= 0x1fff;
+ st->h[1] += c;
+ c = st->h[1] >> 13;
+ st->h[1] &= 0x1fff;
+ st->h[2] += c;
+
+ /* compute h + -p */
+ g[0] = st->h[0] + 5;
+ c = g[0] >> 13;
+ g[0] &= 0x1fff;
+ for (i = 1; i < 10; i++) {
+ g[i] = st->h[i] + c;
+ c = g[i] >> 13;
+ g[i] &= 0x1fff;
+ }
+ g[9] -= (1 << 13);
+
+ /* select h if h < p, or h + -p if h >= p, in constant time. */
+ mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1;
+ for (i = 0; i < 10; i++)
+ g[i] &= mask;
+ mask = ~mask;
+ for (i = 0; i < 10; i++)
+ st->h[i] = (st->h[i] & mask) | g[i];
+
+ /* h = h % (2^128): repack the 13-bit limbs into 16-bit words. */
+ st->h[0] = ((st->h[0] ) | (st->h[1] << 13) ) & 0xffff;
+ st->h[1] = ((st->h[1] >> 3) | (st->h[2] << 10) ) & 0xffff;
+ st->h[2] = ((st->h[2] >> 6) | (st->h[3] << 7) ) & 0xffff;
+ st->h[3] = ((st->h[3] >> 9) | (st->h[4] << 4) ) & 0xffff;
+ st->h[4] = ((st->h[4] >> 12) | (st->h[5] << 1) | (st->h[6] << 14)) & 0xffff;
+ st->h[5] = ((st->h[6] >> 2) | (st->h[7] << 11) ) & 0xffff;
+ st->h[6] = ((st->h[7] >> 5) | (st->h[8] << 8) ) & 0xffff;
+ st->h[7] = ((st->h[8] >> 8) | (st->h[9] << 5) ) & 0xffff;
+
+ /* mac = (h + pad) % (2^128) */
+ f = (unsigned long)st->h[0] + st->pad[0];
+ st->h[0] = (unsigned short)f;
+ for (i = 1; i < 8; i++) {
+ f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16);
+ st->h[i] = (unsigned short)f;
+ }
+
+ for (i = 0; i < 8; i++)
+ U16TO8(mac + (i * 2), st->h[i]);
+
+ /* zero out the state */
+ /* NOTE(review): plain stores of a dead object may be elided by an
+ * optimizing compiler; consider a secure memzero if that matters here. */
+ for (i = 0; i < 10; i++)
+ st->h[i] = 0;
+ for (i = 0; i < 10; i++)
+ st->r[i] = 0;
+ for (i = 0; i < 8; i++)
+ st->pad[i] = 0;
+}
--- /dev/null
+/*
+ poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
+*/
+
+#if defined(_MSC_VER)
+ #define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+ #define POLY1305_NOINLINE __attribute__((noinline))
+#else
+ #define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */
+typedef struct poly1305_state_internal_t {
+ unsigned long r[5]; /* clamped key r, split into 5 26-bit limbs */
+ unsigned long h[5]; /* accumulator h, 5 26-bit limbs (may hold partial carries) */
+ unsigned long pad[4]; /* key s (final addend), 4 little-endian 32-bit words */
+ size_t leftover; /* number of bytes buffered from an incomplete block */
+ unsigned char buffer[poly1305_block_size]; /* staging area for a partial block */
+ unsigned char final; /* nonzero once the zero-padded last block is being processed */
+} poly1305_state_internal_t;
+
+/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
+static unsigned long
+U8TO32(const unsigned char *p) {
+ /* the & 0xff guards platforms where unsigned char is wider than 8 bits */
+ return
+ (((unsigned long)(p[0] & 0xff) ) |
+ ((unsigned long)(p[1] & 0xff) << 8) |
+ ((unsigned long)(p[2] & 0xff) << 16) |
+ ((unsigned long)(p[3] & 0xff) << 24));
+}
+
+/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
+static void
+U32TO8(unsigned char *p, unsigned long v) {
+ p[0] = (v ) & 0xff;
+ p[1] = (v >> 8) & 0xff;
+ p[2] = (v >> 16) & 0xff;
+ p[3] = (v >> 24) & 0xff;
+}
+
+/* initialize the 32-bit state from the 32 byte one-time key:
+   bytes 0..15 become the clamped multiplier r, bytes 16..31 the final addend s */
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ /* overlapping 32-bit loads are shifted so each limb is 26 bits;
+    the masks below combine the 26-bit limb extraction with the clamp */
+ st->r[0] = (U8TO32(&key[ 0]) ) & 0x3ffffff;
+ st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03;
+ st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff;
+ st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff;
+ st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
+ /* save pad for later */
+ st->pad[0] = U8TO32(&key[16]);
+ st->pad[1] = U8TO32(&key[20]);
+ st->pad[2] = U8TO32(&key[24]);
+ st->pad[3] = U8TO32(&key[28]);
+
+ st->leftover = 0;
+ st->final = 0;
+}
+
+/* absorb `bytes` bytes of message (multiple of 16, except implicitly via the
+   padded final block): for each block, h = (h + block) * r mod p, keeping h
+   in 5 partially-reduced 26-bit limbs */
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+ /* each full block gets bit 2^128 appended; the padded final block does not */
+ const unsigned long hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
+ unsigned long r0,r1,r2,r3,r4;
+ unsigned long s1,s2,s3,s4;
+ unsigned long h0,h1,h2,h3,h4;
+ unsigned long long d0,d1,d2,d3,d4;
+ unsigned long c;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+ r4 = st->r[4];
+
+ /* s[i] = r[i] * 5: folds the 2^130 wraparound (2^130 = 5 mod p) into the multiply */
+ s1 = r1 * 5;
+ s2 = r2 * 5;
+ s3 = r3 * 5;
+ s4 = r4 * 5;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (bytes >= poly1305_block_size) {
+ /* h += m[i] */
+ h0 += (U8TO32(m+ 0) ) & 0x3ffffff;
+ h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
+ h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
+ h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
+ h4 += (U8TO32(m+12) >> 8) | hibit;
+
+ /* h *= r */
+ d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1);
+ d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2);
+ d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3);
+ d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4);
+ d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0);
+
+ /* (partial) h %= p */
+ /* one carry sweep; h stays only partially reduced between blocks */
+ c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff;
+ d1 += c; c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff;
+ d2 += c; c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff;
+ d3 += c; c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff;
+ d4 += c; c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff;
+ h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ m += poly1305_block_size;
+ bytes -= poly1305_block_size;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+}
+
+/* pad and absorb any buffered bytes, fully reduce h mod p, add the key's s
+   half, and write the 16 byte tag; zeroizes the per-message key material */
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ unsigned long h0,h1,h2,h3,h4,c;
+ unsigned long g0,g1,g2,g3,g4;
+ unsigned long long f;
+ unsigned long mask;
+
+ /* process the remaining block */
+ if (st->leftover) {
+ size_t i = st->leftover;
+ /* append the 1 byte, zero-fill, and mark final so hibit is suppressed */
+ st->buffer[i++] = 1;
+ for (; i < poly1305_block_size; i++)
+ st->buffer[i] = 0;
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ }
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ c = h1 >> 26; h1 = h1 & 0x3ffffff;
+ h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
+ h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
+ h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
+ h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
+ g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
+ g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
+ g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
+ g4 = h4 + c - (1 << 26);
+
+ /* select h if h < p, or h + -p if h >= p */
+ /* branch-free select: mask is all-ones when g underflowed (h < p), else zero */
+ mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1;
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ g4 &= mask;
+ mask = ~mask;
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+ h4 = (h4 & mask) | g4;
+
+ /* h = h % (2^128) */
+ /* repack the 26-bit limbs into four 32-bit words */
+ h0 = ((h0 ) | (h1 << 26)) & 0xffffffff;
+ h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+ h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+ h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+ /* mac = (h + pad) % (2^128) */
+ f = (unsigned long long)h0 + st->pad[0] ; h0 = (unsigned long)f;
+ f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f;
+ f = (unsigned long long)h2 + st->pad[2] + (f >> 32); h2 = (unsigned long)f;
+ f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f;
+
+ U32TO8(mac + 0, h0);
+ U32TO8(mac + 4, h1);
+ U32TO8(mac + 8, h2);
+ U32TO8(mac + 12, h3);
+
+ /* zero out the state */
+ /* NOTE(review): plain stores may be elided by an optimizer since the state is
+    dead afterwards — consider a secure-wipe primitive; verify with the build flags */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+ st->r[0] = 0;
+ st->r[1] = 0;
+ st->r[2] = 0;
+ st->r[3] = 0;
+ st->r[4] = 0;
+ st->pad[0] = 0;
+ st->pad[1] = 0;
+ st->pad[2] = 0;
+ st->pad[3] = 0;
+}
+
--- /dev/null
+/*
+ poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition
+*/
+
+/* 128-bit arithmetic shim: MSVC emulates uint128 with a lo/hi struct and the
+   _umul128/__shiftright128 intrinsics; GCC/Clang use the native __int128.
+   NOTE(review): there is no #else branch — other compilers will fail to build
+   this header; the 32-bit implementation is the intended fallback for them. */
+#if defined(_MSC_VER)
+ #include <intrin.h>
+
+ typedef struct uint128_t {
+ unsigned long long lo;
+ unsigned long long hi;
+ } uint128_t;
+
+ #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
+ #define ADD(out, in) { unsigned long long t = out.lo; out.lo += in.lo; out.hi += (out.lo < t) + in.hi; }
+ #define ADDLO(out, in) { unsigned long long t = out.lo; out.lo += in; out.hi += (out.lo < t); }
+ #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
+ #define LO(in) (in.lo)
+
+ #define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+ #if defined(__SIZEOF_INT128__)
+ typedef unsigned __int128 uint128_t;
+ #else
+ typedef unsigned uint128_t __attribute__((mode(TI)));
+ #endif
+
+ #define MUL(out, x, y) out = ((uint128_t)x * y)
+ #define ADD(out, in) out += in
+ #define ADDLO(out, in) out += in
+ #define SHR(in, shift) (unsigned long long)(in >> (shift))
+ #define LO(in) (unsigned long long)(in)
+
+ #define POLY1305_NOINLINE __attribute__((noinline))
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */
+typedef struct poly1305_state_internal_t {
+ unsigned long long r[3]; /* clamped key r, split into 44/44/42-bit limbs */
+ unsigned long long h[3]; /* accumulator h, same limb layout (partially reduced) */
+ unsigned long long pad[2]; /* key s (final addend), 2 little-endian 64-bit words */
+ size_t leftover; /* bytes buffered from an incomplete block */
+ unsigned char buffer[poly1305_block_size]; /* staging area for a partial block */
+ unsigned char final; /* nonzero once the padded last block is being processed */
+} poly1305_state_internal_t;
+
+/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */
+static unsigned long long
+U8TO64(const unsigned char *p) {
+ /* byte-by-byte load: endian-agnostic and alignment-safe */
+ return
+ (((unsigned long long)(p[0] & 0xff) ) |
+ ((unsigned long long)(p[1] & 0xff) << 8) |
+ ((unsigned long long)(p[2] & 0xff) << 16) |
+ ((unsigned long long)(p[3] & 0xff) << 24) |
+ ((unsigned long long)(p[4] & 0xff) << 32) |
+ ((unsigned long long)(p[5] & 0xff) << 40) |
+ ((unsigned long long)(p[6] & 0xff) << 48) |
+ ((unsigned long long)(p[7] & 0xff) << 56));
+}
+
+/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */
+static void
+U64TO8(unsigned char *p, unsigned long long v) {
+ p[0] = (v ) & 0xff;
+ p[1] = (v >> 8) & 0xff;
+ p[2] = (v >> 16) & 0xff;
+ p[3] = (v >> 24) & 0xff;
+ p[4] = (v >> 32) & 0xff;
+ p[5] = (v >> 40) & 0xff;
+ p[6] = (v >> 48) & 0xff;
+ p[7] = (v >> 56) & 0xff;
+}
+
+/* initialize the 64-bit state from the 32 byte one-time key:
+   bytes 0..15 become the clamped multiplier r, bytes 16..31 the final addend s */
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ unsigned long long t0,t1;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ t0 = U8TO64(&key[0]);
+ t1 = U8TO64(&key[8]);
+
+ /* the masks combine the 44/44/42-bit limb split with the clamp bits */
+ st->r[0] = ( t0 ) & 0xffc0fffffff;
+ st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
+ st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+
+ /* save pad for later */
+ st->pad[0] = U8TO64(&key[16]);
+ st->pad[1] = U8TO64(&key[24]);
+
+ st->leftover = 0;
+ st->final = 0;
+}
+
+/* absorb full 16 byte blocks: h = (h + block) * r mod p, with h kept in three
+   partially-reduced 44/44/42-bit limbs and products accumulated in 128 bits */
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+ /* each full block gets bit 2^128 appended; suppressed for the padded final block */
+ const unsigned long long hibit = (st->final) ? 0 : ((unsigned long long)1 << 40); /* 1 << 128 */
+ unsigned long long r0,r1,r2;
+ unsigned long long s1,s2;
+ unsigned long long h0,h1,h2;
+ unsigned long long c;
+ uint128_t d0,d1,d2,d;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ /* s[i] = r[i] * 20 = r[i] * 5 * 4: folds 2^130 = 5 mod p plus the 2-bit
+    offset of the top limb into the multiply */
+ s1 = r1 * (5 << 2);
+ s2 = r2 * (5 << 2);
+
+ while (bytes >= poly1305_block_size) {
+ unsigned long long t0,t1;
+
+ /* h += m[i] */
+ t0 = U8TO64(&m[0]);
+ t1 = U8TO64(&m[8]);
+
+ h0 += (( t0 ) & 0xfffffffffff);
+ h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
+ h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;
+
+ /* h *= r */
+ MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
+ MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
+ MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
+
+ /* (partial) h %= p */
+ /* one carry sweep; h stays only partially reduced between blocks */
+ c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
+ ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
+ ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
+ h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
+ h1 += c;
+
+ m += poly1305_block_size;
+ bytes -= poly1305_block_size;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+}
+
+
+/* pad and absorb any buffered bytes, fully reduce h mod p, add the key's s
+   half, and write the 16 byte tag; zeroizes the per-message key material */
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ unsigned long long h0,h1,h2,c;
+ unsigned long long g0,g1,g2;
+ unsigned long long t0,t1;
+
+ /* process the remaining block */
+ if (st->leftover) {
+ size_t i = st->leftover;
+ /* append the 1 byte, zero-fill, and mark final so hibit is suppressed */
+ st->buffer[i] = 1;
+ for (i = i + 1; i < poly1305_block_size; i++)
+ st->buffer[i] = 0;
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ }
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ /* two carry sweeps guarantee every limb is within range */
+ c = (h1 >> 44); h1 &= 0xfffffffffff;
+ h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
+ h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+ h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
+ h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
+ h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
+ g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
+ g2 = h2 + c - ((unsigned long long)1 << 42);
+
+ /* select h if h < p, or h + -p if h >= p */
+ /* branch-free select: c is all-ones when g underflowed (h < p), else zero */
+ c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
+ g0 &= c;
+ g1 &= c;
+ g2 &= c;
+ c = ~c;
+ h0 = (h0 & c) | g0;
+ h1 = (h1 & c) | g1;
+ h2 = (h2 & c) | g2;
+
+ /* h = (h + pad) */
+ /* add s limb-wise, discarding any carry out of 2^128 */
+ t0 = st->pad[0];
+ t1 = st->pad[1];
+
+ h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff;
+ h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
+ h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff;
+
+ /* mac = h % (2^128) */
+ /* repack the 44/44/42-bit limbs into two 64-bit words */
+ h0 = ((h0 ) | (h1 << 44));
+ h1 = ((h1 >> 20) | (h2 << 24));
+
+ U64TO8(&mac[0], h0);
+ U64TO8(&mac[8], h1);
+
+ /* zero out the state */
+ /* NOTE(review): plain stores may be elided by an optimizer since the state is
+    dead afterwards — consider a secure-wipe primitive; verify with the build flags */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->r[0] = 0;
+ st->r[1] = 0;
+ st->r[2] = 0;
+ st->pad[0] = 0;
+ st->pad[1] = 0;
+}
+
--- /dev/null
+/*
+ poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition
+
+ based on the public domain reference version in supercop by djb
+*/
+
+#if defined(_MSC_VER)
+ #define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+ #define POLY1305_NOINLINE __attribute__((noinline))
+#else
+ #define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */
+typedef struct poly1305_state_internal_t {
+ unsigned char buffer[poly1305_block_size]; /* staging area for a partial block */
+ size_t leftover; /* bytes buffered from an incomplete block */
+ unsigned char h[17]; /* accumulator h, 17 base-256 digits (130+ bits) */
+ unsigned char r[17]; /* clamped key r, 17 base-256 digits */
+ unsigned char pad[17]; /* key s (final addend); pad[16] is always 0 */
+ unsigned char final; /* nonzero once the padded last block is being processed */
+} poly1305_state_internal_t;
+
+/* initialize the byte-oriented state from the 32 byte one-time key:
+   bytes 0..15 become the clamped multiplier r, bytes 16..31 the final addend s */
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ size_t i;
+
+ st->leftover = 0;
+
+ /* h = 0 */
+ for (i = 0; i < 17; i++)
+ st->h[i] = 0;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ /* clamp applied byte-wise: top 4 bits of bytes 3,7,11,15 cleared,
+    bottom 2 bits of bytes 4,8,12 cleared */
+ st->r[ 0] = key[ 0] & 0xff;
+ st->r[ 1] = key[ 1] & 0xff;
+ st->r[ 2] = key[ 2] & 0xff;
+ st->r[ 3] = key[ 3] & 0x0f;
+ st->r[ 4] = key[ 4] & 0xfc;
+ st->r[ 5] = key[ 5] & 0xff;
+ st->r[ 6] = key[ 6] & 0xff;
+ st->r[ 7] = key[ 7] & 0x0f;
+ st->r[ 8] = key[ 8] & 0xfc;
+ st->r[ 9] = key[ 9] & 0xff;
+ st->r[10] = key[10] & 0xff;
+ st->r[11] = key[11] & 0x0f;
+ st->r[12] = key[12] & 0xfc;
+ st->r[13] = key[13] & 0xff;
+ st->r[14] = key[14] & 0xff;
+ st->r[15] = key[15] & 0x0f;
+ st->r[16] = 0;
+
+ /* save pad for later */
+ for (i = 0; i < 16; i++)
+ st->pad[i] = key[i + 16];
+ st->pad[16] = 0;
+
+ st->final = 0;
+}
+
+/* h += c over 17 base-256 digits, propagating the carry byte by byte;
+   any carry out of digit 16 is discarded */
+static void
+poly1305_add(unsigned char h[17], const unsigned char c[17]) {
+ unsigned short u;
+ unsigned int i;
+ for (u = 0, i = 0; i < 17; i++) {
+ u += (unsigned short)h[i] + (unsigned short)c[i];
+ h[i] = (unsigned char)u & 0xff;
+ u >>= 8;
+ }
+}
+
+/* partially reduce the wide per-digit sums hr[] mod p = 2^130 - 5 back into
+   the 17 byte digits of h: carry-propagate, then fold the overflow above
+   2^130 back in multiplied by 5 */
+static void
+poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) {
+ unsigned long u;
+ unsigned int i;
+ u = 0;
+ for (i = 0; i < 16; i++) {
+ u += hr[i];
+ h[i] = (unsigned char)u & 0xff;
+ u >>= 8;
+ }
+ u += hr[16];
+ /* keep only the low 2 bits of the top digit (h < 2^130 afterwards) */
+ h[16] = (unsigned char)u & 0x03;
+ u >>= 2;
+ u += (u << 2); /* u *= 5; */
+ for (i = 0; i < 16; i++) {
+ u += h[i];
+ h[i] = (unsigned char)u & 0xff;
+ u >>= 8;
+ }
+ h[16] += (unsigned char)u;
+}
+
+/* fully reduce h mod p: add -p (i.e. 2^130 - p's complement) and select the
+   original or reduced value in constant time based on the borrow */
+static void
+poly1305_freeze(unsigned char h[17]) {
+ /* minusp = 2^136 - p in little-endian bytes: adding it is subtracting p */
+ static const unsigned char minusp[17] = {
+ 0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0xfc
+ };
+ unsigned char horig[17], negative;
+ unsigned int i;
+
+ /* compute h + -p */
+ for (i = 0; i < 17; i++)
+ horig[i] = h[i];
+ poly1305_add(h, minusp);
+
+ /* select h if h < p, or h + -p if h >= p */
+ /* negative is 0xff when the subtraction borrowed (h < p), 0x00 otherwise */
+ negative = -(h[16] >> 7);
+ for (i = 0; i < 17; i++)
+ h[i] ^= negative & (horig[i] ^ h[i]);
+}
+
+/* absorb full 16 byte blocks: h = (h + block) * r mod p, with h and r held
+   as 17 base-256 digits and the multiply done schoolbook-style */
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+ /* digit 16 of each block is 1 (the 2^128 bit) unless this is the padded final block */
+ const unsigned char hibit = st->final ^ 1; /* 1 << 128 */
+
+ while (bytes >= poly1305_block_size) {
+ unsigned long hr[17], u;
+ unsigned char c[17];
+ unsigned int i, j;
+
+ /* h += m */
+ for (i = 0; i < 16; i++)
+ c[i] = m[i];
+ c[16] = hibit;
+ poly1305_add(st->h, c);
+
+ /* h *= r */
+ /* schoolbook multiply mod p: digits that would land at position >= 17
+    wrap around multiplied by 5*64 (since 2^136 = 5*64 mod p) */
+ for (i = 0; i < 17; i++) {
+ u = 0;
+ for (j = 0; j <= i ; j++) {
+ u += (unsigned short)st->h[j] * st->r[i - j];
+ }
+ for (j = i + 1; j < 17; j++) {
+ unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j];
+ v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */
+ u += v;
+ }
+ hr[i] = u;
+ }
+
+ /* (partial) h %= p */
+ poly1305_squeeze(st->h, hr);
+
+ m += poly1305_block_size;
+ bytes -= poly1305_block_size;
+ }
+}
+
+/* pad and absorb any buffered bytes, fully reduce h mod p, add the key's s
+   half, and write the 16 byte tag; zeroizes the per-message key material */
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ size_t i;
+
+ /* process the remaining block */
+ if (st->leftover) {
+ size_t i = st->leftover;
+ /* append the 1 byte, zero-fill, and mark final so hibit is suppressed */
+ st->buffer[i++] = 1;
+ for (; i < poly1305_block_size; i++)
+ st->buffer[i] = 0;
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ }
+
+ /* fully reduce h */
+ poly1305_freeze(st->h);
+
+ /* h = (h + pad) % (1 << 128) */
+ /* pad[16] is 0, so only the low 16 digits matter for the tag */
+ poly1305_add(st->h, st->pad);
+ for (i = 0; i < 16; i++)
+ mac[i] = st->h[i];
+
+ /* zero out the state */
+ /* NOTE(review): plain stores may be elided by an optimizer since the state is
+    dead afterwards — consider a secure-wipe primitive; verify with the build flags */
+ for (i = 0; i < 17; i++)
+ st->h[i] = 0;
+ for (i = 0; i < 17; i++)
+ st->r[i] = 0;
+ for (i = 0; i < 17; i++)
+ st->pad[i] = 0;
+}
--- /dev/null
+#include "poly1305-donna.h"
+
+#if defined(POLY1305_8BIT)
+#include "poly1305-donna-8.h"
+#elif defined(POLY1305_16BIT)
+#include "poly1305-donna-16.h"
+#elif defined(POLY1305_32BIT)
+#include "poly1305-donna-32.h"
+#elif defined(POLY1305_64BIT)
+#include "poly1305-donna-64.h"
+#else
+
+/* auto detect between 32bit / 64bit */
+#define HAS_SIZEOF_INT128_64BIT (defined(__SIZEOF_INT128__) && defined(__LP64__))
+#define HAS_MSVC_64BIT (defined(_MSC_VER) && defined(_M_X64))
+#define HAS_GCC_4_4_64BIT (defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4))))
+
+#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT)
+#include "poly1305-donna-64.h"
+#else
+#include "poly1305-donna-32.h"
+#endif
+
+#endif
+
+/* incremental absorb: completes any buffered partial block first, then
+   processes as many whole 16 byte blocks as possible, and buffers the rest */
+void
+poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) {
+ poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+ size_t i;
+
+ /* handle leftover */
+ if (st->leftover) {
+ size_t want = (poly1305_block_size - st->leftover);
+ if (want > bytes)
+ want = bytes;
+ for (i = 0; i < want; i++)
+ st->buffer[st->leftover + i] = m[i];
+ bytes -= want;
+ m += want;
+ st->leftover += want;
+ /* still not a full block: wait for more input */
+ if (st->leftover < poly1305_block_size)
+ return;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ st->leftover = 0;
+ }
+
+ /* process full blocks */
+ if (bytes >= poly1305_block_size) {
+ /* round bytes down to a multiple of the (power-of-two) block size */
+ size_t want = (bytes & ~(poly1305_block_size - 1));
+ poly1305_blocks(st, m, want);
+ m += want;
+ bytes -= want;
+ }
+
+ /* store leftover */
+ if (bytes) {
+ for (i = 0; i < bytes; i++)
+ st->buffer[st->leftover + i] = m[i];
+ st->leftover += bytes;
+ }
+}
+
+/* one-shot convenience wrapper: compute the 16 byte tag of m under key */
+void
+poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) {
+ poly1305_context ctx;
+ poly1305_init(&ctx, key);
+ poly1305_update(&ctx, m, bytes);
+ poly1305_finish(&ctx, mac);
+}
+
+/* constant-time tag comparison: returns 1 when the two macs are equal, 0
+   otherwise, without an early exit that would leak the mismatch position */
+int
+poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) {
+ size_t i;
+ unsigned int dif = 0;
+ for (i = 0; i < 16; i++)
+ dif |= (mac1[i] ^ mac2[i]);
+ /* dif == 0 -> underflow sets the sign bit -> result 1; else result 0 */
+ dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1);
+ return (dif & 1);
+}
+
+
+/* test a few basic operations */
+/* run the known-answer tests (nacl example, 2^130-5 wraparound case, and a
+   combinatorial mac-of-macs test); returns 1 when all pass, 0 otherwise */
+int
+poly1305_power_on_self_test(void) {
+ /* example from nacl */
+ static const unsigned char nacl_key[32] = {
+ 0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
+ 0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
+ 0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
+ 0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
+ };
+
+ static const unsigned char nacl_msg[131] = {
+ 0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
+ 0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
+ 0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
+ 0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
+ 0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
+ 0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
+ 0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
+ 0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
+ 0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
+ 0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
+ 0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
+ 0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
+ 0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
+ 0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
+ 0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
+ 0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
+ 0xe3,0x55,0xa5
+ };
+
+ static const unsigned char nacl_mac[16] = {
+ 0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
+ 0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9
+ };
+
+ /* generates a final value of (2^130 - 2) == 3 */
+ static const unsigned char wrap_key[32] = {
+ 0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ };
+
+ static const unsigned char wrap_msg[16] = {
+ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ };
+
+ static const unsigned char wrap_mac[16] = {
+ 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ };
+
+ /*
+ mac of the macs of messages of length 0 to 256, where the key and messages
+ have all their values set to the length
+ */
+ static const unsigned char total_key[32] = {
+ 0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+ 0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,
+ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+ 0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ };
+
+ static const unsigned char total_mac[16] = {
+ 0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd,
+ 0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39
+ };
+
+ poly1305_context ctx;
+ poly1305_context total_ctx;
+ unsigned char all_key[32];
+ unsigned char all_msg[256];
+ unsigned char mac[16];
+ size_t i, j;
+ int result = 1;
+
+ /* one-shot api against the nacl known-answer vector */
+ for (i = 0; i < sizeof(mac); i++)
+ mac[i] = 0;
+ poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key);
+ result &= poly1305_verify(nacl_mac, mac);
+
+ /* incremental api: same vector fed in uneven chunks to exercise buffering */
+ for (i = 0; i < sizeof(mac); i++)
+ mac[i] = 0;
+ poly1305_init(&ctx, nacl_key);
+ poly1305_update(&ctx, nacl_msg + 0, 32);
+ poly1305_update(&ctx, nacl_msg + 32, 64);
+ poly1305_update(&ctx, nacl_msg + 96, 16);
+ poly1305_update(&ctx, nacl_msg + 112, 8);
+ poly1305_update(&ctx, nacl_msg + 120, 4);
+ poly1305_update(&ctx, nacl_msg + 124, 2);
+ poly1305_update(&ctx, nacl_msg + 126, 1);
+ poly1305_update(&ctx, nacl_msg + 127, 1);
+ poly1305_update(&ctx, nacl_msg + 128, 1);
+ poly1305_update(&ctx, nacl_msg + 129, 1);
+ poly1305_update(&ctx, nacl_msg + 130, 1);
+ poly1305_finish(&ctx, mac);
+ result &= poly1305_verify(nacl_mac, mac);
+
+ /* edge case: accumulator wraps the 2^130 - 5 modulus */
+ for (i = 0; i < sizeof(mac); i++)
+ mac[i] = 0;
+ poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key);
+ result &= poly1305_verify(wrap_mac, mac);
+
+ /* mac-of-macs over message lengths 0..255 */
+ poly1305_init(&total_ctx, total_key);
+ for (i = 0; i < 256; i++) {
+ /* set key and message to 'i,i,i..' */
+ for (j = 0; j < sizeof(all_key); j++)
+ all_key[j] = i;
+ for (j = 0; j < i; j++)
+ all_msg[j] = i;
+ poly1305_auth(mac, all_msg, i, all_key);
+ poly1305_update(&total_ctx, mac, 16);
+ }
+ poly1305_finish(&total_ctx, mac);
+ result &= poly1305_verify(total_mac, mac);
+
+ return result;
+}
--- /dev/null
+#ifndef POLY1305_DONNA_H
+#define POLY1305_DONNA_H
+
+#include <stddef.h>
+
+/* opaque state blob: large and aligned enough to hold any of the internal
+   state structs (8/16/32/64-bit variants); the aligner forces size_t alignment */
+typedef struct poly1305_context {
+ size_t aligner;
+ unsigned char opaque[136];
+} poly1305_context;
+
+/* incremental api: init with a 32 byte one-time key, feed message bytes,
+   then finish to produce the 16 byte tag (finish also wipes the key state) */
+void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);
+void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);
+void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);
+/* one-shot tag computation over m */
+void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);
+
+/* constant-time comparison (1 = equal) and built-in known-answer tests (1 = pass) */
+int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);
+int poly1305_power_on_self_test(void);
+
+#endif /* POLY1305_DONNA_H */
+